1 | /* |
2 | * Copyright (C) 2011 Google Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions are |
6 | * met: |
7 | * |
8 | * * Redistributions of source code must retain the above copyright |
9 | * notice, this list of conditions and the following disclaimer. |
10 | * * Redistributions in binary form must reproduce the above |
11 | * copyright notice, this list of conditions and the following disclaimer |
12 | * in the documentation and/or other materials provided with the |
13 | * distribution. |
14 | * * Neither the name of Google Inc. nor the names of its |
15 | * contributors may be used to endorse or promote products derived from |
16 | * this software without specific prior written permission. |
17 | * |
18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | */ |
30 | |
31 | #include "config.h" |
32 | |
33 | #if ENABLE(MHTML) |
34 | |
35 | #include "MHTMLArchive.h" |
36 | |
37 | #include "Document.h" |
38 | #include "Frame.h" |
39 | #include "MHTMLParser.h" |
40 | #include "MIMETypeRegistry.h" |
41 | #include "Page.h" |
42 | #include "PageSerializer.h" |
43 | #include "QuotedPrintable.h" |
44 | #include "SchemeRegistry.h" |
45 | #include "SharedBuffer.h" |
46 | #include <time.h> |
47 | #include <wtf/CryptographicallyRandomNumber.h> |
48 | #include <wtf/DateMath.h> |
49 | #include <wtf/GregorianDateTime.h> |
50 | #include <wtf/StdLibExtras.h> |
51 | #include <wtf/text/Base64.h> |
52 | #include <wtf/text/StringBuilder.h> |
53 | |
54 | #if HAVE(SYS_TIME_H) |
55 | #include <sys/time.h> |
56 | #endif |
57 | |
58 | namespace WebCore { |
59 | |
// Content-Transfer-Encoding tokens written into each MIME part header by
// MHTMLArchive::generateMHTMLData().
const char* const quotedPrintable = "quoted-printable";
const char* const base64 = "base64";
62 | |
63 | static String generateRandomBoundary() |
64 | { |
65 | // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0). |
66 | const size_t randomValuesLength = 10; |
67 | char randomValues[randomValuesLength]; |
68 | cryptographicallyRandomValues(&randomValues, randomValuesLength); |
69 | StringBuilder stringBuilder; |
70 | stringBuilder.append("----=_NextPart_000_" ); |
71 | for (size_t i = 0; i < randomValuesLength; ++i) { |
72 | if (i == 2) |
73 | stringBuilder.append('_'); |
74 | else if (i == 6) |
75 | stringBuilder.append('.'); |
76 | stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i])); |
77 | stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i])); |
78 | } |
79 | return stringBuilder.toString(); |
80 | } |
81 | |
82 | static String replaceNonPrintableCharacters(const String& text) |
83 | { |
84 | StringBuilder stringBuilder; |
85 | for (size_t i = 0; i < text.length(); ++i) { |
86 | if (isASCIIPrintable(text[i])) |
87 | stringBuilder.append(text[i]); |
88 | else |
89 | stringBuilder.append('?'); |
90 | } |
91 | return stringBuilder.toString(); |
92 | } |
93 | |
94 | MHTMLArchive::MHTMLArchive() |
95 | { |
96 | } |
97 | |
98 | MHTMLArchive::~MHTMLArchive() |
99 | { |
100 | // Because all frames know about each other we need to perform a deep clearing of the archives graph. |
101 | clearAllSubframeArchives(); |
102 | } |
103 | |
104 | Ref<MHTMLArchive> MHTMLArchive::create() |
105 | { |
106 | return adoptRef(*new MHTMLArchive); |
107 | } |
108 | |
109 | RefPtr<MHTMLArchive> MHTMLArchive::create(const URL& url, SharedBuffer& data) |
110 | { |
111 | // For security reasons we only load MHTML pages from local URLs. |
112 | if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol().toString())) |
113 | return nullptr; |
114 | |
115 | MHTMLParser parser(&data); |
116 | RefPtr<MHTMLArchive> mainArchive = parser.parseArchive(); |
117 | if (!mainArchive) |
118 | return nullptr; // Invalid MHTML file. |
119 | |
120 | // Since MHTML is a flat format, we need to make all frames aware of all resources. |
121 | for (size_t i = 0; i < parser.frameCount(); ++i) { |
122 | RefPtr<MHTMLArchive> archive = parser.frameAt(i); |
123 | for (size_t j = 1; j < parser.frameCount(); ++j) { |
124 | if (i != j) |
125 | archive->addSubframeArchive(*parser.frameAt(j)); |
126 | } |
127 | for (size_t j = 0; j < parser.subResourceCount(); ++j) |
128 | archive->addSubresource(*parser.subResourceAt(j)); |
129 | } |
130 | return mainArchive; |
131 | } |
132 | |
133 | Ref<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page) |
134 | { |
135 | Vector<PageSerializer::Resource> resources; |
136 | PageSerializer pageSerializer(resources); |
137 | pageSerializer.serialize(*page); |
138 | |
139 | String boundary = generateRandomBoundary(); |
140 | String endOfResourceBoundary = makeString("--" , boundary, "\r\n" ); |
141 | |
142 | GregorianDateTime now; |
143 | now.setToCurrentLocalTime(); |
144 | String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60); |
145 | |
146 | StringBuilder stringBuilder; |
147 | stringBuilder.append("From: <Saved by WebKit>\r\n" ); |
148 | stringBuilder.append("Subject: " ); |
149 | // We replace non ASCII characters with '?' characters to match IE's behavior. |
150 | stringBuilder.append(replaceNonPrintableCharacters(page->mainFrame().document()->title())); |
151 | stringBuilder.append("\r\nDate: " ); |
152 | stringBuilder.append(dateString); |
153 | stringBuilder.append("\r\nMIME-Version: 1.0\r\n" ); |
154 | stringBuilder.append("Content-Type: multipart/related;\r\n" ); |
155 | stringBuilder.append("\ttype=\"" ); |
156 | stringBuilder.append(page->mainFrame().document()->suggestedMIMEType()); |
157 | stringBuilder.append("\";\r\n" ); |
158 | stringBuilder.append("\tboundary=\"" ); |
159 | stringBuilder.append(boundary); |
160 | stringBuilder.append("\"\r\n\r\n" ); |
161 | |
162 | // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it). |
163 | ASSERT(stringBuilder.toString().isAllASCII()); |
164 | CString asciiString = stringBuilder.toString().utf8(); |
165 | auto mhtmlData = SharedBuffer::create(); |
166 | mhtmlData->append(asciiString.data(), asciiString.length()); |
167 | |
168 | for (auto& resource : resources) { |
169 | stringBuilder.clear(); |
170 | stringBuilder.append(endOfResourceBoundary); |
171 | stringBuilder.append("Content-Type: " ); |
172 | stringBuilder.append(resource.mimeType); |
173 | |
174 | const char* contentEncoding = nullptr; |
175 | if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType)) |
176 | contentEncoding = quotedPrintable; |
177 | else |
178 | contentEncoding = base64; |
179 | |
180 | stringBuilder.append("\r\nContent-Transfer-Encoding: " ); |
181 | stringBuilder.append(contentEncoding); |
182 | stringBuilder.append("\r\nContent-Location: " ); |
183 | stringBuilder.append(resource.url); |
184 | stringBuilder.append("\r\n\r\n" ); |
185 | |
186 | asciiString = stringBuilder.toString().utf8(); |
187 | mhtmlData->append(asciiString.data(), asciiString.length()); |
188 | |
189 | // FIXME: ideally we would encode the content as a stream without having to fetch it all. |
190 | const char* data = resource.data->data(); |
191 | size_t dataLength = resource.data->size(); |
192 | Vector<char> encodedData; |
193 | if (!strcmp(contentEncoding, quotedPrintable)) { |
194 | quotedPrintableEncode(data, dataLength, encodedData); |
195 | mhtmlData->append(encodedData.data(), encodedData.size()); |
196 | mhtmlData->append("\r\n" , 2); |
197 | } else { |
198 | ASSERT(!strcmp(contentEncoding, base64)); |
199 | // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs. |
200 | base64Encode(data, dataLength, encodedData); |
201 | const size_t maximumLineLength = 76; |
202 | size_t index = 0; |
203 | size_t encodedDataLength = encodedData.size(); |
204 | do { |
205 | size_t lineLength = std::min(encodedDataLength - index, maximumLineLength); |
206 | mhtmlData->append(encodedData.data() + index, lineLength); |
207 | mhtmlData->append("\r\n" , 2); |
208 | index += maximumLineLength; |
209 | } while (index < encodedDataLength); |
210 | } |
211 | } |
212 | |
213 | asciiString = makeString("--" , boundary, "--\r\n" ).utf8(); |
214 | mhtmlData->append(asciiString.data(), asciiString.length()); |
215 | |
216 | return mhtmlData; |
217 | } |
218 | |
219 | } |
220 | |
221 | #endif |
222 | |