1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "PageSerializer.h"
33
34#include "CSSFontFaceRule.h"
35#include "CSSImageValue.h"
36#include "CSSImportRule.h"
37#include "CSSStyleRule.h"
38#include "CachedImage.h"
39#include "Document.h"
40#include "Element.h"
41#include "Frame.h"
42#include "HTMLFrameOwnerElement.h"
43#include "HTMLHeadElement.h"
44#include "HTMLImageElement.h"
45#include "HTMLLinkElement.h"
46#include "HTMLMetaCharsetParser.h"
47#include "HTMLNames.h"
48#include "HTMLObjectElement.h"
49#include "HTMLStyleElement.h"
50#include "HTTPParsers.h"
51#include "Image.h"
52#include "MarkupAccumulator.h"
53#include "Page.h"
54#include "RenderElement.h"
55#include "StyleCachedImage.h"
56#include "StyleImage.h"
57#include "StyleProperties.h"
58#include "StyleRule.h"
59#include "StyleSheetContents.h"
60#include "Text.h"
61#include "TextEncoding.h"
62#include <wtf/text/CString.h>
63#include <wtf/text/StringBuilder.h>
64#include <wtf/text/WTFString.h>
65
66namespace WebCore {
67
68static bool isCharsetSpecifyingNode(const Node& node)
69{
70 if (!is<HTMLElement>(node))
71 return false;
72
73 const HTMLElement& element = downcast<HTMLElement>(node);
74 if (!element.hasTagName(HTMLNames::metaTag))
75 return false;
76 HTMLMetaCharsetParser::AttributeList attributes;
77 if (element.hasAttributes()) {
78 for (const Attribute& attribute : element.attributesIterator()) {
79 // FIXME: We should deal appropriately with the attribute if they have a namespace.
80 attributes.append(std::make_pair(attribute.name().toString(), attribute.value().string()));
81 }
82 }
83 return HTMLMetaCharsetParser::encodingFromMetaAttributes(attributes).isValid();
84}
85
86static bool shouldIgnoreElement(const Element& element)
87{
88 return element.hasTagName(HTMLNames::scriptTag) || element.hasTagName(HTMLNames::noscriptTag) || isCharsetSpecifyingNode(element);
89}
90
91static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner)
92{
93 // FIXME: We should support all frame owners including applets.
94 return is<HTMLObjectElement>(frameOwner) ? HTMLNames::dataAttr : HTMLNames::srcAttr;
95}
96
97class PageSerializer::SerializerMarkupAccumulator final : public MarkupAccumulator {
98public:
99 SerializerMarkupAccumulator(PageSerializer&, Document&, Vector<Node*>*);
100
101private:
102 PageSerializer& m_serializer;
103 Document& m_document;
104
105 void appendText(StringBuilder&, const Text&) override;
106 void appendStartTag(StringBuilder&, const Element&, Namespaces*) override;
107 void appendCustomAttributes(StringBuilder&, const Element&, Namespaces*) override;
108 void appendEndTag(StringBuilder&, const Element&) override;
109};
110
111PageSerializer::SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer& serializer, Document& document, Vector<Node*>* nodes)
112 : MarkupAccumulator(nodes, ResolveURLs::Yes)
113 , m_serializer(serializer)
114 , m_document(document)
115{
116 // MarkupAccumulator does not serialize the <?xml ... line, so we add it explicitely to ensure the right encoding is specified.
117 if (m_document.isXMLDocument() || m_document.xmlStandalone())
118 appendString("<?xml version=\"" + m_document.xmlVersion() + "\" encoding=\"" + m_document.charset() + "\"?>");
119}
120
121void PageSerializer::SerializerMarkupAccumulator::appendText(StringBuilder& out, const Text& text)
122{
123 Element* parent = text.parentElement();
124 if (parent && !shouldIgnoreElement(*parent))
125 MarkupAccumulator::appendText(out, text);
126}
127
128void PageSerializer::SerializerMarkupAccumulator::appendStartTag(StringBuilder& out, const Element& element, Namespaces* namespaces)
129{
130 if (!shouldIgnoreElement(element))
131 MarkupAccumulator::appendStartTag(out, element, namespaces);
132
133 if (element.hasTagName(HTMLNames::headTag)) {
134 out.appendLiteral("<meta charset=\"");
135 out.append(m_document.charset());
136 out.appendLiteral("\">");
137 }
138
139 // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents.
140}
141
142void PageSerializer::SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, const Element& element, Namespaces* namespaces)
143{
144 if (!is<HTMLFrameOwnerElement>(element))
145 return;
146
147 const HTMLFrameOwnerElement& frameOwner = downcast<HTMLFrameOwnerElement>(element);
148 Frame* frame = frameOwner.contentFrame();
149 if (!frame)
150 return;
151
152 URL url = frame->document()->url();
153 if (url.isValid() && !url.protocolIsAbout())
154 return;
155
156 // We need to give a fake location to blank frames so they can be referenced by the serialized frame.
157 url = m_serializer.urlForBlankFrame(frame);
158 appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(frameOwner), url.string()), namespaces);
159}
160
161void PageSerializer::SerializerMarkupAccumulator::appendEndTag(StringBuilder& out, const Element& element)
162{
163 if (!shouldIgnoreElement(element))
164 MarkupAccumulator::appendEndTag(out, element);
165}
166
167PageSerializer::PageSerializer(Vector<PageSerializer::Resource>& resources)
168 : m_resources(resources)
169{
170}
171
172void PageSerializer::serialize(Page& page)
173{
174 serializeFrame(&page.mainFrame());
175}
176
177void PageSerializer::serializeFrame(Frame* frame)
178{
179 Document* document = frame->document();
180 URL url = document->url();
181 if (!url.isValid() || url.protocolIsAbout()) {
182 // For blank frames we generate a fake URL so they can be referenced by their containing frame.
183 url = urlForBlankFrame(frame);
184 }
185
186 if (m_resourceURLs.contains(url)) {
187 // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now
188 // different content. So we should serialize both and somehow rename the frame src in the containing
189 // frame. Arg!
190 return;
191 }
192
193 Vector<Node*> nodes;
194 SerializerMarkupAccumulator accumulator(*this, *document, &nodes);
195 TextEncoding textEncoding(document->charset());
196 CString data;
197 if (!textEncoding.isValid()) {
198 // FIXME: iframes used as images trigger this. We should deal with them correctly.
199 return;
200 }
201 String text = accumulator.serializeNodes(*document->documentElement(), SerializedNodes::SubtreeIncludingNode);
202 m_resources.append({ url, document->suggestedMIMEType(), SharedBuffer::create(textEncoding.encode(text, UnencodableHandling::Entities)) });
203 m_resourceURLs.add(url);
204
205 for (auto& node : nodes) {
206 if (!is<Element>(*node))
207 continue;
208
209 Element& element = downcast<Element>(*node);
210 // We have to process in-line style as it might contain some resources (typically background images).
211 if (is<StyledElement>(element))
212 retrieveResourcesForProperties(downcast<StyledElement>(element).inlineStyle(), document);
213
214 if (is<HTMLImageElement>(element)) {
215 HTMLImageElement& imageElement = downcast<HTMLImageElement>(element);
216 URL url = document->completeURL(imageElement.attributeWithoutSynchronization(HTMLNames::srcAttr));
217 CachedImage* cachedImage = imageElement.cachedImage();
218 addImageToResources(cachedImage, imageElement.renderer(), url);
219 } else if (is<HTMLLinkElement>(element)) {
220 HTMLLinkElement& linkElement = downcast<HTMLLinkElement>(element);
221 if (CSSStyleSheet* sheet = linkElement.sheet()) {
222 URL url = document->completeURL(linkElement.attributeWithoutSynchronization(HTMLNames::hrefAttr));
223 serializeCSSStyleSheet(sheet, url);
224 ASSERT(m_resourceURLs.contains(url));
225 }
226 } else if (is<HTMLStyleElement>(element)) {
227 if (CSSStyleSheet* sheet = downcast<HTMLStyleElement>(element).sheet())
228 serializeCSSStyleSheet(sheet, URL());
229 }
230 }
231
232 for (Frame* childFrame = frame->tree().firstChild(); childFrame; childFrame = childFrame->tree().nextSibling())
233 serializeFrame(childFrame);
234}
235
236void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet* styleSheet, const URL& url)
237{
238 StringBuilder cssText;
239 for (unsigned i = 0; i < styleSheet->length(); ++i) {
240 CSSRule* rule = styleSheet->item(i);
241 String itemText = rule->cssText();
242 if (!itemText.isEmpty()) {
243 cssText.append(itemText);
244 if (i < styleSheet->length() - 1)
245 cssText.appendLiteral("\n\n");
246 }
247 Document* document = styleSheet->ownerDocument();
248 // Some rules have resources associated with them that we need to retrieve.
249 if (is<CSSImportRule>(*rule)) {
250 CSSImportRule& importRule = downcast<CSSImportRule>(*rule);
251 URL importURL = document->completeURL(importRule.href());
252 if (m_resourceURLs.contains(importURL))
253 continue;
254 serializeCSSStyleSheet(importRule.styleSheet(), importURL);
255 } else if (is<CSSFontFaceRule>(*rule)) {
256 // FIXME: Add support for font face rule. It is not clear to me at this point if the actual otf/eot file can
257 // be retrieved from the CSSFontFaceRule object.
258 } else if (is<CSSStyleRule>(*rule))
259 retrieveResourcesForRule(downcast<CSSStyleRule>(*rule).styleRule(), document);
260 }
261
262 if (url.isValid() && !m_resourceURLs.contains(url)) {
263 // FIXME: We should check whether a charset has been specified and if none was found add one.
264 TextEncoding textEncoding(styleSheet->contents().charset());
265 ASSERT(textEncoding.isValid());
266 m_resources.append({ url, "text/css"_s, SharedBuffer::create(textEncoding.encode(cssText.toString(), UnencodableHandling::Entities)) });
267 m_resourceURLs.add(url);
268 }
269}
270
271void PageSerializer::addImageToResources(CachedImage* image, RenderElement* imageRenderer, const URL& url)
272{
273 if (!url.isValid() || m_resourceURLs.contains(url))
274 return;
275
276 if (!image || image->image() == &Image::nullImage())
277 return;
278
279 RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0;
280 if (!data)
281 data = image->image()->data();
282
283 if (!data) {
284 LOG_ERROR("No data for image %s", url.string().utf8().data());
285 return;
286 }
287
288 m_resources.append({ url, image->response().mimeType(), WTFMove(data) });
289 m_resourceURLs.add(url);
290}
291
292void PageSerializer::retrieveResourcesForRule(StyleRule& rule, Document* document)
293{
294 retrieveResourcesForProperties(&rule.properties(), document);
295}
296
297void PageSerializer::retrieveResourcesForProperties(const StyleProperties* styleDeclaration, Document* document)
298{
299 if (!styleDeclaration)
300 return;
301
302 // The background-image and list-style-image (for ul or ol) are the CSS properties
303 // that make use of images. We iterate to make sure we include any other
304 // image properties there might be.
305 unsigned propertyCount = styleDeclaration->propertyCount();
306 for (unsigned i = 0; i < propertyCount; ++i) {
307 RefPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value();
308 if (!is<CSSImageValue>(*cssValue))
309 continue;
310
311 auto* image = downcast<CSSImageValue>(*cssValue).cachedImage();
312 if (!image)
313 continue;
314
315 addImageToResources(image, nullptr, document->completeURL(image->url()));
316 }
317}
318
319URL PageSerializer::urlForBlankFrame(Frame* frame)
320{
321 auto iter = m_blankFrameURLs.find(frame);
322 if (iter != m_blankFrameURLs.end())
323 return iter->value;
324 String url = makeString("wyciwyg://frame/", m_blankFrameCounter++);
325 URL fakeURL({ }, url);
326 m_blankFrameURLs.add(frame, fakeURL);
327 return fakeURL;
328}
329
330}
331