1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33#if ENABLE(MHTML)
34
35#include "MHTMLArchive.h"
36
37#include "Document.h"
38#include "Frame.h"
39#include "MHTMLParser.h"
40#include "MIMETypeRegistry.h"
41#include "Page.h"
42#include "PageSerializer.h"
43#include "QuotedPrintable.h"
44#include "SchemeRegistry.h"
45#include "SharedBuffer.h"
46#include <time.h>
47#include <wtf/CryptographicallyRandomNumber.h>
48#include <wtf/DateMath.h>
49#include <wtf/GregorianDateTime.h>
50#include <wtf/StdLibExtras.h>
51#include <wtf/text/Base64.h>
52#include <wtf/text/StringBuilder.h>
53
54#if HAVE(SYS_TIME_H)
55#include <sys/time.h>
56#endif
57
58namespace WebCore {
59
// Content-Transfer-Encoding tokens written into each part's header by generateMHTMLData().
constexpr const char* quotedPrintable = "quoted-printable";
constexpr const char* base64 = "base64";
62
63static String generateRandomBoundary()
64{
65 // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0).
66 const size_t randomValuesLength = 10;
67 char randomValues[randomValuesLength];
68 cryptographicallyRandomValues(&randomValues, randomValuesLength);
69 StringBuilder stringBuilder;
70 stringBuilder.append("----=_NextPart_000_");
71 for (size_t i = 0; i < randomValuesLength; ++i) {
72 if (i == 2)
73 stringBuilder.append('_');
74 else if (i == 6)
75 stringBuilder.append('.');
76 stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i]));
77 stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i]));
78 }
79 return stringBuilder.toString();
80}
81
82static String replaceNonPrintableCharacters(const String& text)
83{
84 StringBuilder stringBuilder;
85 for (size_t i = 0; i < text.length(); ++i) {
86 if (isASCIIPrintable(text[i]))
87 stringBuilder.append(text[i]);
88 else
89 stringBuilder.append('?');
90 }
91 return stringBuilder.toString();
92}
93
94MHTMLArchive::MHTMLArchive()
95{
96}
97
98MHTMLArchive::~MHTMLArchive()
99{
100 // Because all frames know about each other we need to perform a deep clearing of the archives graph.
101 clearAllSubframeArchives();
102}
103
104Ref<MHTMLArchive> MHTMLArchive::create()
105{
106 return adoptRef(*new MHTMLArchive);
107}
108
109RefPtr<MHTMLArchive> MHTMLArchive::create(const URL& url, SharedBuffer& data)
110{
111 // For security reasons we only load MHTML pages from local URLs.
112 if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol().toString()))
113 return nullptr;
114
115 MHTMLParser parser(&data);
116 RefPtr<MHTMLArchive> mainArchive = parser.parseArchive();
117 if (!mainArchive)
118 return nullptr; // Invalid MHTML file.
119
120 // Since MHTML is a flat format, we need to make all frames aware of all resources.
121 for (size_t i = 0; i < parser.frameCount(); ++i) {
122 RefPtr<MHTMLArchive> archive = parser.frameAt(i);
123 for (size_t j = 1; j < parser.frameCount(); ++j) {
124 if (i != j)
125 archive->addSubframeArchive(*parser.frameAt(j));
126 }
127 for (size_t j = 0; j < parser.subResourceCount(); ++j)
128 archive->addSubresource(*parser.subResourceAt(j));
129 }
130 return mainArchive;
131}
132
133Ref<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page)
134{
135 Vector<PageSerializer::Resource> resources;
136 PageSerializer pageSerializer(resources);
137 pageSerializer.serialize(*page);
138
139 String boundary = generateRandomBoundary();
140 String endOfResourceBoundary = makeString("--", boundary, "\r\n");
141
142 GregorianDateTime now;
143 now.setToCurrentLocalTime();
144 String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60);
145
146 StringBuilder stringBuilder;
147 stringBuilder.append("From: <Saved by WebKit>\r\n");
148 stringBuilder.append("Subject: ");
149 // We replace non ASCII characters with '?' characters to match IE's behavior.
150 stringBuilder.append(replaceNonPrintableCharacters(page->mainFrame().document()->title()));
151 stringBuilder.append("\r\nDate: ");
152 stringBuilder.append(dateString);
153 stringBuilder.append("\r\nMIME-Version: 1.0\r\n");
154 stringBuilder.append("Content-Type: multipart/related;\r\n");
155 stringBuilder.append("\ttype=\"");
156 stringBuilder.append(page->mainFrame().document()->suggestedMIMEType());
157 stringBuilder.append("\";\r\n");
158 stringBuilder.append("\tboundary=\"");
159 stringBuilder.append(boundary);
160 stringBuilder.append("\"\r\n\r\n");
161
162 // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it).
163 ASSERT(stringBuilder.toString().isAllASCII());
164 CString asciiString = stringBuilder.toString().utf8();
165 auto mhtmlData = SharedBuffer::create();
166 mhtmlData->append(asciiString.data(), asciiString.length());
167
168 for (auto& resource : resources) {
169 stringBuilder.clear();
170 stringBuilder.append(endOfResourceBoundary);
171 stringBuilder.append("Content-Type: ");
172 stringBuilder.append(resource.mimeType);
173
174 const char* contentEncoding = nullptr;
175 if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType))
176 contentEncoding = quotedPrintable;
177 else
178 contentEncoding = base64;
179
180 stringBuilder.append("\r\nContent-Transfer-Encoding: ");
181 stringBuilder.append(contentEncoding);
182 stringBuilder.append("\r\nContent-Location: ");
183 stringBuilder.append(resource.url);
184 stringBuilder.append("\r\n\r\n");
185
186 asciiString = stringBuilder.toString().utf8();
187 mhtmlData->append(asciiString.data(), asciiString.length());
188
189 // FIXME: ideally we would encode the content as a stream without having to fetch it all.
190 const char* data = resource.data->data();
191 size_t dataLength = resource.data->size();
192 Vector<char> encodedData;
193 if (!strcmp(contentEncoding, quotedPrintable)) {
194 quotedPrintableEncode(data, dataLength, encodedData);
195 mhtmlData->append(encodedData.data(), encodedData.size());
196 mhtmlData->append("\r\n", 2);
197 } else {
198 ASSERT(!strcmp(contentEncoding, base64));
199 // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs.
200 base64Encode(data, dataLength, encodedData);
201 const size_t maximumLineLength = 76;
202 size_t index = 0;
203 size_t encodedDataLength = encodedData.size();
204 do {
205 size_t lineLength = std::min(encodedDataLength - index, maximumLineLength);
206 mhtmlData->append(encodedData.data() + index, lineLength);
207 mhtmlData->append("\r\n", 2);
208 index += maximumLineLength;
209 } while (index < encodedDataLength);
210 }
211 }
212
213 asciiString = makeString("--", boundary, "--\r\n").utf8();
214 mhtmlData->append(asciiString.data(), asciiString.length());
215
216 return mhtmlData;
217}
218
219}
220
221#endif
222