1/*
2 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
3 * Copyright (C) 2015-2017 Apple Inc. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "HTMLDocumentParser.h"
29
30#include "CustomHeaderFields.h"
31#include "CustomElementReactionQueue.h"
32#include "DocumentFragment.h"
33#include "DocumentLoader.h"
34#include "Frame.h"
35#include "HTMLDocument.h"
36#include "HTMLParserScheduler.h"
37#include "HTMLPreloadScanner.h"
38#include "HTMLScriptRunner.h"
39#include "HTMLTreeBuilder.h"
40#include "HTMLUnknownElement.h"
41#include "JSCustomElementInterface.h"
42#include "LinkLoader.h"
43#include "Microtasks.h"
44#include "NavigationScheduler.h"
45#include "ScriptElement.h"
46#include "ThrowOnDynamicMarkupInsertionCountIncrementer.h"
47
48namespace WebCore {
49
50using namespace HTMLNames;
51
52HTMLDocumentParser::HTMLDocumentParser(HTMLDocument& document)
53 : ScriptableDocumentParser(document)
54 , m_options(document)
55 , m_tokenizer(m_options)
56 , m_scriptRunner(std::make_unique<HTMLScriptRunner>(document, static_cast<HTMLScriptRunnerHost&>(*this)))
57 , m_treeBuilder(std::make_unique<HTMLTreeBuilder>(*this, document, parserContentPolicy(), m_options))
58 , m_parserScheduler(std::make_unique<HTMLParserScheduler>(*this))
59 , m_xssAuditorDelegate(document)
60 , m_preloader(std::make_unique<HTMLResourcePreloader>(document))
61{
62}
63
64Ref<HTMLDocumentParser> HTMLDocumentParser::create(HTMLDocument& document)
65{
66 return adoptRef(*new HTMLDocumentParser(document));
67}
68
69inline HTMLDocumentParser::HTMLDocumentParser(DocumentFragment& fragment, Element& contextElement, ParserContentPolicy rawPolicy)
70 : ScriptableDocumentParser(fragment.document(), rawPolicy)
71 , m_options(fragment.document())
72 , m_tokenizer(m_options)
73 , m_treeBuilder(std::make_unique<HTMLTreeBuilder>(*this, fragment, contextElement, parserContentPolicy(), m_options))
74 , m_xssAuditorDelegate(fragment.document())
75{
76 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
77 if (contextElement.isHTMLElement())
78 m_tokenizer.updateStateFor(contextElement.tagQName().localName());
79 m_xssAuditor.initForFragment();
80}
81
82inline Ref<HTMLDocumentParser> HTMLDocumentParser::create(DocumentFragment& fragment, Element& contextElement, ParserContentPolicy parserContentPolicy)
83{
84 return adoptRef(*new HTMLDocumentParser(fragment, contextElement, parserContentPolicy));
85}
86
87HTMLDocumentParser::~HTMLDocumentParser()
88{
89 ASSERT(!m_parserScheduler);
90 ASSERT(!m_pumpSessionNestingLevel);
91 ASSERT(!m_preloadScanner);
92 ASSERT(!m_insertionPreloadScanner);
93}
94
95void HTMLDocumentParser::detach()
96{
97 ScriptableDocumentParser::detach();
98
99 if (m_scriptRunner)
100 m_scriptRunner->detach();
101 // FIXME: It seems wrong that we would have a preload scanner here.
102 // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do.
103 m_preloadScanner = nullptr;
104 m_insertionPreloadScanner = nullptr;
105 m_parserScheduler = nullptr; // Deleting the scheduler will clear any timers.
106}
107
108void HTMLDocumentParser::stopParsing()
109{
110 DocumentParser::stopParsing();
111 m_parserScheduler = nullptr; // Deleting the scheduler will clear any timers.
112}
113
114// This kicks off "Once the user agent stops parsing" as described by:
115// https://html.spec.whatwg.org/multipage/syntax.html#the-end
116void HTMLDocumentParser::prepareToStopParsing()
117{
118 ASSERT(!hasInsertionPoint());
119
120 // pumpTokenizer can cause this parser to be detached from the Document,
121 // but we need to ensure it isn't deleted yet.
122 Ref<HTMLDocumentParser> protectedThis(*this);
123
124 // NOTE: This pump should only ever emit buffered character tokens,
125 // so ForceSynchronous vs. AllowYield should be meaningless.
126 pumpTokenizerIfPossible(ForceSynchronous);
127
128 if (isStopped())
129 return;
130
131 DocumentParser::prepareToStopParsing();
132
133 // We will not have a scriptRunner when parsing a DocumentFragment.
134 if (m_scriptRunner)
135 document()->setReadyState(Document::Interactive);
136
137 // Setting the ready state above can fire mutation event and detach us
138 // from underneath. In that case, just bail out.
139 if (isDetached())
140 return;
141
142 attemptToRunDeferredScriptsAndEnd();
143}
144
145inline bool HTMLDocumentParser::inPumpSession() const
146{
147 return m_pumpSessionNestingLevel > 0;
148}
149
150inline bool HTMLDocumentParser::shouldDelayEnd() const
151{
152 return inPumpSession() || isWaitingForScripts() || isScheduledForResume() || isExecutingScript();
153}
154
155void HTMLDocumentParser::didBeginYieldingParser()
156{
157 m_parserScheduler->didBeginYieldingParser();
158}
159
160void HTMLDocumentParser::didEndYieldingParser()
161{
162 m_parserScheduler->didEndYieldingParser();
163}
164
165bool HTMLDocumentParser::isParsingFragment() const
166{
167 return m_treeBuilder->isParsingFragment();
168}
169
170bool HTMLDocumentParser::processingData() const
171{
172 return isScheduledForResume() || inPumpSession();
173}
174
175void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode)
176{
177 if (isStopped() || isWaitingForScripts())
178 return;
179
180 // Once a resume is scheduled, HTMLParserScheduler controls when we next pump.
181 if (isScheduledForResume()) {
182 ASSERT(mode == AllowYield);
183 return;
184 }
185
186 pumpTokenizer(mode);
187}
188
189bool HTMLDocumentParser::isScheduledForResume() const
190{
191 return m_parserScheduler && m_parserScheduler->isScheduledForResume();
192}
193
194// Used by HTMLParserScheduler
195void HTMLDocumentParser::resumeParsingAfterYield()
196{
197 // pumpTokenizer can cause this parser to be detached from the Document,
198 // but we need to ensure it isn't deleted yet.
199 Ref<HTMLDocumentParser> protectedThis(*this);
200
201 // We should never be here unless we can pump immediately.
202 // Call pumpTokenizer() directly so that ASSERTS will fire if we're wrong.
203 pumpTokenizer(AllowYield);
204 endIfDelayed();
205}
206
207void HTMLDocumentParser::runScriptsForPausedTreeBuilder()
208{
209 ASSERT(scriptingContentIsAllowed(parserContentPolicy()));
210
211 if (std::unique_ptr<CustomElementConstructionData> constructionData = m_treeBuilder->takeCustomElementConstructionData()) {
212 ASSERT(!m_treeBuilder->hasParserBlockingScriptWork());
213
214 // https://html.spec.whatwg.org/#create-an-element-for-the-token
215 {
216 // Prevent document.open/write during reactions by allocating the incrementer before the reactions stack.
217 ThrowOnDynamicMarkupInsertionCountIncrementer incrementer(*document());
218
219 MicrotaskQueue::mainThreadQueue().performMicrotaskCheckpoint();
220
221 CustomElementReactionStack reactionStack(document()->execState());
222 auto& elementInterface = constructionData->elementInterface.get();
223 auto newElement = elementInterface.constructElementWithFallback(*document(), constructionData->name);
224 m_treeBuilder->didCreateCustomOrFallbackElement(WTFMove(newElement), *constructionData);
225 }
226 return;
227 }
228
229 TextPosition scriptStartPosition = TextPosition::belowRangePosition();
230 if (auto scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition)) {
231 ASSERT(!m_treeBuilder->hasParserBlockingScriptWork());
232 // We will not have a scriptRunner when parsing a DocumentFragment.
233 if (m_scriptRunner)
234 m_scriptRunner->execute(scriptElement.releaseNonNull(), scriptStartPosition);
235 }
236}
237
238Document* HTMLDocumentParser::contextForParsingSession()
239{
240 // The parsing session should interact with the document only when parsing
241 // non-fragments. Otherwise, we might delay the load event mistakenly.
242 if (isParsingFragment())
243 return nullptr;
244 return document();
245}
246
247bool HTMLDocumentParser::pumpTokenizerLoop(SynchronousMode mode, bool parsingFragment, PumpSession& session)
248{
249 do {
250 if (UNLIKELY(isWaitingForScripts())) {
251 if (mode == AllowYield && m_parserScheduler->shouldYieldBeforeExecutingScript(session))
252 return true;
253 runScriptsForPausedTreeBuilder();
254 // If we're paused waiting for a script, we try to execute scripts before continuing.
255 if (isWaitingForScripts() || isStopped())
256 return false;
257 }
258
259 // FIXME: It's wrong for the HTMLDocumentParser to reach back to the Frame, but this approach is
260 // how the parser has always handled stopping when the page assigns window.location. What should
261 // happen instead is that assigning window.location causes the parser to stop parsing cleanly.
262 // The problem is we're not prepared to do that at every point where we run JavaScript.
263 if (UNLIKELY(!parsingFragment && document()->frame() && document()->frame()->navigationScheduler().locationChangePending()))
264 return false;
265
266 if (UNLIKELY(mode == AllowYield && m_parserScheduler->shouldYieldBeforeToken(session)))
267 return true;
268
269 if (!parsingFragment)
270 m_sourceTracker.startToken(m_input.current(), m_tokenizer);
271
272 auto token = m_tokenizer.nextToken(m_input.current());
273 if (!token)
274 return false;
275
276 if (!parsingFragment) {
277 m_sourceTracker.endToken(m_input.current(), m_tokenizer);
278
279 // We do not XSS filter innerHTML, which means we (intentionally) fail
280 // http/tests/security/xssAuditor/dom-write-innerHTML.html
281 if (auto xssInfo = m_xssAuditor.filterToken(FilterTokenRequest(*token, m_sourceTracker, m_tokenizer.shouldAllowCDATA())))
282 m_xssAuditorDelegate.didBlockScript(*xssInfo);
283 }
284
285 constructTreeFromHTMLToken(token);
286 } while (!isStopped());
287
288 return false;
289}
290
291void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
292{
293 ASSERT(!isStopped());
294 ASSERT(!isScheduledForResume());
295
296 // This is an attempt to check that this object is both attached to the Document and protected by something.
297 ASSERT(refCount() >= 2);
298
299 PumpSession session(m_pumpSessionNestingLevel, contextForParsingSession());
300
301 m_xssAuditor.init(document(), &m_xssAuditorDelegate);
302
303 bool shouldResume = pumpTokenizerLoop(mode, isParsingFragment(), session);
304
305 // Ensure we haven't been totally deref'ed after pumping. Any caller of this
306 // function should be holding a RefPtr to this to ensure we weren't deleted.
307 ASSERT(refCount() >= 1);
308
309 if (isStopped())
310 return;
311
312 if (shouldResume)
313 m_parserScheduler->scheduleForResume();
314
315 if (isWaitingForScripts()) {
316 ASSERT(m_tokenizer.isInDataState());
317 if (!m_preloadScanner) {
318 m_preloadScanner = std::make_unique<HTMLPreloadScanner>(m_options, document()->url(), document()->deviceScaleFactor());
319 m_preloadScanner->appendToEnd(m_input.current());
320 }
321 m_preloadScanner->scan(*m_preloader, *document());
322 }
323 // The viewport definition is known here, so we can load link preloads with media attributes.
324 if (document()->loader())
325 LinkLoader::loadLinksFromHeader(document()->loader()->response().httpHeaderField(HTTPHeaderName::Link), document()->url(), *document(), LinkLoader::MediaAttributeCheck::MediaAttributeNotEmpty);
326}
327
328void HTMLDocumentParser::constructTreeFromHTMLToken(HTMLTokenizer::TokenPtr& rawToken)
329{
330 AtomicHTMLToken token(*rawToken);
331
332 // We clear the rawToken in case constructTreeFromAtomicToken
333 // synchronously re-enters the parser. We don't clear the token immedately
334 // for Character tokens because the AtomicHTMLToken avoids copying the
335 // characters by keeping a pointer to the underlying buffer in the
336 // HTMLToken. Fortunately, Character tokens can't cause us to re-enter
337 // the parser.
338 //
339 // FIXME: Stop clearing the rawToken once we start running the parser off
340 // the main thread or once we stop allowing synchronous JavaScript
341 // execution from parseAttribute.
342 if (rawToken->type() != HTMLToken::Character) {
343 // Clearing the TokenPtr makes sure we don't clear the HTMLToken a second time
344 // later when the TokenPtr is destroyed.
345 rawToken.clear();
346 }
347
348 m_treeBuilder->constructTree(WTFMove(token));
349}
350
351bool HTMLDocumentParser::hasInsertionPoint()
352{
353 // FIXME: The wasCreatedByScript() branch here might not be fully correct.
354 // Our model of the EOF character differs slightly from the one in the spec
355 // because our treatment is uniform between network-sourced and script-sourced
356 // input streams whereas the spec treats them differently.
357 return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile());
358}
359
360void HTMLDocumentParser::insert(SegmentedString&& source)
361{
362 if (isStopped())
363 return;
364
365 // pumpTokenizer can cause this parser to be detached from the Document,
366 // but we need to ensure it isn't deleted yet.
367 Ref<HTMLDocumentParser> protectedThis(*this);
368
369 source.setExcludeLineNumbers();
370 m_input.insertAtCurrentInsertionPoint(WTFMove(source));
371 pumpTokenizerIfPossible(ForceSynchronous);
372
373 if (isWaitingForScripts()) {
374 // Check the document.write() output with a separate preload scanner as
375 // the main scanner can't deal with insertions.
376 if (!m_insertionPreloadScanner)
377 m_insertionPreloadScanner = std::make_unique<HTMLPreloadScanner>(m_options, document()->url(), document()->deviceScaleFactor());
378 m_insertionPreloadScanner->appendToEnd(source);
379 m_insertionPreloadScanner->scan(*m_preloader, *document());
380 }
381
382 endIfDelayed();
383}
384
385void HTMLDocumentParser::append(RefPtr<StringImpl>&& inputSource)
386{
387 if (isStopped())
388 return;
389
390 // pumpTokenizer can cause this parser to be detached from the Document,
391 // but we need to ensure it isn't deleted yet.
392 Ref<HTMLDocumentParser> protectedThis(*this);
393
394 String source { WTFMove(inputSource) };
395
396 if (m_preloadScanner) {
397 if (m_input.current().isEmpty() && !isWaitingForScripts()) {
398 // We have parsed until the end of the current input and so are now moving ahead of the preload scanner.
399 // Clear the scanner so we know to scan starting from the current input point if we block again.
400 m_preloadScanner = nullptr;
401 } else {
402 m_preloadScanner->appendToEnd(source);
403 if (isWaitingForScripts())
404 m_preloadScanner->scan(*m_preloader, *document());
405 }
406 }
407
408 m_input.appendToEnd(source);
409
410 if (inPumpSession()) {
411 // We've gotten data off the network in a nested write.
412 // We don't want to consume any more of the input stream now. Do
413 // not worry. We'll consume this data in a less-nested write().
414 return;
415 }
416
417 pumpTokenizerIfPossible(AllowYield);
418
419 endIfDelayed();
420}
421
422void HTMLDocumentParser::end()
423{
424 ASSERT(!isDetached());
425 ASSERT(!isScheduledForResume());
426
427 // Informs the rest of WebCore that parsing is really finished (and deletes this).
428 m_treeBuilder->finished();
429}
430
431void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd()
432{
433 ASSERT(isStopping());
434 ASSERT(!hasInsertionPoint());
435 if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing())
436 return;
437 end();
438}
439
440void HTMLDocumentParser::attemptToEnd()
441{
442 // finish() indicates we will not receive any more data. If we are waiting on
443 // an external script to load, we can't finish parsing quite yet.
444
445 if (shouldDelayEnd()) {
446 m_endWasDelayed = true;
447 return;
448 }
449 prepareToStopParsing();
450}
451
452void HTMLDocumentParser::endIfDelayed()
453{
454 // If we've already been detached, don't bother ending.
455 if (isDetached())
456 return;
457
458 if (!m_endWasDelayed || shouldDelayEnd())
459 return;
460
461 m_endWasDelayed = false;
462 prepareToStopParsing();
463}
464
465void HTMLDocumentParser::finish()
466{
467 // FIXME: We should ASSERT(!m_parserStopped) here, since it does not
468 // makes sense to call any methods on DocumentParser once it's been stopped.
469 // However, FrameLoader::stop calls DocumentParser::finish unconditionally.
470
471 // We're not going to get any more data off the network, so we tell the
472 // input stream we've reached the end of file. finish() can be called more
473 // than once, if the first time does not call end().
474 if (!m_input.haveSeenEndOfFile())
475 m_input.markEndOfFile();
476
477 attemptToEnd();
478}
479
480bool HTMLDocumentParser::isExecutingScript() const
481{
482 return m_scriptRunner && m_scriptRunner->isExecutingScript();
483}
484
485TextPosition HTMLDocumentParser::textPosition() const
486{
487 auto& currentString = m_input.current();
488 return TextPosition(currentString.currentLine(), currentString.currentColumn());
489}
490
491bool HTMLDocumentParser::shouldAssociateConsoleMessagesWithTextPosition() const
492{
493 return inPumpSession() && !isExecutingScript();
494}
495
496bool HTMLDocumentParser::isWaitingForScripts() const
497{
498 // When the TreeBuilder encounters a </script> tag, it returns to the HTMLDocumentParser
499 // where the script is transfered from the treebuilder to the script runner.
500 // The script runner will hold the script until its loaded and run. During
501 // any of this time, we want to count ourselves as "waiting for a script" and thus
502 // run the preload scanner, as well as delay completion of parsing.
503 bool treeBuilderHasBlockingScript = m_treeBuilder->hasParserBlockingScriptWork();
504 bool scriptRunnerHasBlockingScript = m_scriptRunner && m_scriptRunner->hasParserBlockingScript();
505 // Since the parser is paused while a script runner has a blocking script, it should
506 // never be possible to end up with both objects holding a blocking script.
507 ASSERT(!(treeBuilderHasBlockingScript && scriptRunnerHasBlockingScript));
508 // If either object has a blocking script, the parser should be paused.
509 return treeBuilderHasBlockingScript || scriptRunnerHasBlockingScript;
510}
511
512void HTMLDocumentParser::resumeParsingAfterScriptExecution()
513{
514 ASSERT(!isExecutingScript());
515 ASSERT(!isWaitingForScripts());
516
517 // pumpTokenizer can cause this parser to be detached from the Document,
518 // but we need to ensure it isn't deleted yet.
519 Ref<HTMLDocumentParser> protectedThis(*this);
520
521 m_insertionPreloadScanner = nullptr;
522 pumpTokenizerIfPossible(AllowYield);
523 endIfDelayed();
524}
525
526void HTMLDocumentParser::watchForLoad(PendingScript& pendingScript)
527{
528 ASSERT(!pendingScript.isLoaded());
529 // setClient would call notifyFinished if the load were complete.
530 // Callers do not expect to be re-entered from this call, so they should
531 // not an already-loaded PendingScript.
532 pendingScript.setClient(*this);
533}
534
535void HTMLDocumentParser::stopWatchingForLoad(PendingScript& pendingScript)
536{
537 pendingScript.clearClient();
538}
539
540void HTMLDocumentParser::appendCurrentInputStreamToPreloadScannerAndScan()
541{
542 ASSERT(m_preloadScanner);
543 m_preloadScanner->appendToEnd(m_input.current());
544 m_preloadScanner->scan(*m_preloader, *document());
545}
546
547void HTMLDocumentParser::notifyFinished(PendingScript& pendingScript)
548{
549 // pumpTokenizer can cause this parser to be detached from the Document,
550 // but we need to ensure it isn't deleted yet.
551 Ref<HTMLDocumentParser> protectedThis(*this);
552
553 // After Document parser is stopped or detached, the parser-inserted deferred script execution should be ignored.
554 if (isStopped())
555 return;
556
557 ASSERT(m_scriptRunner);
558 ASSERT(!isExecutingScript());
559 if (isStopping()) {
560 attemptToRunDeferredScriptsAndEnd();
561 return;
562 }
563
564 m_scriptRunner->executeScriptsWaitingForLoad(pendingScript);
565 if (!isWaitingForScripts())
566 resumeParsingAfterScriptExecution();
567}
568
569bool HTMLDocumentParser::hasScriptsWaitingForStylesheets() const
570{
571 return m_scriptRunner && m_scriptRunner->hasScriptsWaitingForStylesheets();
572}
573
574void HTMLDocumentParser::executeScriptsWaitingForStylesheets()
575{
576 // Document only calls this when the Document owns the DocumentParser
577 // so this will not be called in the DocumentFragment case.
578 ASSERT(m_scriptRunner);
579 // Ignore calls unless we have a script blocking the parser waiting on a
580 // stylesheet load. Otherwise we are currently parsing and this
581 // is a re-entrant call from encountering a </ style> tag.
582 if (!m_scriptRunner->hasScriptsWaitingForStylesheets())
583 return;
584
585 // pumpTokenizer can cause this parser to be detached from the Document,
586 // but we need to ensure it isn't deleted yet.
587 Ref<HTMLDocumentParser> protectedThis(*this);
588 m_scriptRunner->executeScriptsWaitingForStylesheets();
589 if (!isWaitingForScripts())
590 resumeParsingAfterScriptExecution();
591}
592
593void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment& fragment, Element& contextElement, ParserContentPolicy parserContentPolicy)
594{
595 auto parser = create(fragment, contextElement, parserContentPolicy);
596 parser->insert(source); // Use insert() so that the parser will not yield.
597 parser->finish();
598 ASSERT(!parser->processingData());
599 parser->detach();
600}
601
602void HTMLDocumentParser::suspendScheduledTasks()
603{
604 if (m_parserScheduler)
605 m_parserScheduler->suspend();
606}
607
608void HTMLDocumentParser::resumeScheduledTasks()
609{
610 if (m_parserScheduler)
611 m_parserScheduler->resume();
612}
613
614}
615