1/*
2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrInterpreter.h"
29
30#include "Options.h"
31#include "SuperSampler.h"
32#include "Yarr.h"
33#include "YarrCanonicalize.h"
34#include <wtf/BumpPointerAllocator.h>
35#include <wtf/CheckedArithmetic.h>
36#include <wtf/DataLog.h>
37#include <wtf/StackCheck.h>
38#include <wtf/text/CString.h>
39#include <wtf/text/WTFString.h>
40
41namespace JSC { namespace Yarr {
42
43template<typename CharType>
44class Interpreter {
45public:
46 struct ParenthesesDisjunctionContext;
47
48 struct BackTrackInfoParentheses {
49 uintptr_t matchAmount;
50 ParenthesesDisjunctionContext* lastContext;
51 };
52
53 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
54 {
55 context->next = backTrack->lastContext;
56 backTrack->lastContext = context;
57 ++backTrack->matchAmount;
58 }
59
60 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
61 {
62 RELEASE_ASSERT(backTrack->matchAmount);
63 RELEASE_ASSERT(backTrack->lastContext);
64 backTrack->lastContext = backTrack->lastContext->next;
65 --backTrack->matchAmount;
66 }
67
68 struct DisjunctionContext
69 {
70 DisjunctionContext() = default;
71
72 void* operator new(size_t, void* where)
73 {
74 return where;
75 }
76
77 static size_t allocationSize(unsigned numberOfFrames)
78 {
79 static_assert(alignof(DisjunctionContext) <= sizeof(void*), "");
80 size_t rawSize = (sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t)).unsafeGet();
81 size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
82 RELEASE_ASSERT(roundedSize >= rawSize);
83 return roundedSize;
84 }
85
86 int term { 0 };
87 unsigned matchBegin;
88 unsigned matchEnd;
89 uintptr_t frame[1];
90 };
91
92 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
93 {
94 size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize);
95 allocatorPool = allocatorPool->ensureCapacity(size);
96 RELEASE_ASSERT(allocatorPool);
97 return new (allocatorPool->alloc(size)) DisjunctionContext();
98 }
99
100 void freeDisjunctionContext(DisjunctionContext* context)
101 {
102 allocatorPool = allocatorPool->dealloc(context);
103 }
104
105 struct ParenthesesDisjunctionContext
106 {
107 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
108 {
109 unsigned firstSubpatternId = term.atom.subpatternId;
110 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
111
112 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
113 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
114 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
115 }
116
117 new (getDisjunctionContext(term)) DisjunctionContext();
118 }
119
120 void* operator new(size_t, void* where)
121 {
122 return where;
123 }
124
125 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
126 {
127 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
128 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
129 }
130
131 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
132 {
133 return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns));
134 }
135
136 static size_t allocationSize(unsigned numberOfSubpatterns)
137 {
138 static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*), "");
139 size_t rawSize = (sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (Checked<size_t>(numberOfSubpatterns) * 2U) * sizeof(unsigned)).unsafeGet();
140 size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize);
141 RELEASE_ASSERT(roundedSize >= rawSize);
142 return roundedSize;
143 }
144
145 ParenthesesDisjunctionContext* next { nullptr };
146 unsigned subpatternBackup[1];
147 };
148
149 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
150 {
151 size_t size = (Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)) + DisjunctionContext::allocationSize(disjunction->m_frameSize)).unsafeGet();
152 allocatorPool = allocatorPool->ensureCapacity(size);
153 RELEASE_ASSERT(allocatorPool);
154 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
155 }
156
157 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
158 {
159 allocatorPool = allocatorPool->dealloc(context);
160 }
161
162 class InputStream {
163 public:
164 InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs)
165 : input(input)
166 , pos(start)
167 , length(length)
168 , decodeSurrogatePairs(decodeSurrogatePairs)
169 {
170 }
171
172 void next()
173 {
174 ++pos;
175 }
176
177 void rewind(unsigned amount)
178 {
179 ASSERT(pos >= amount);
180 pos -= amount;
181 }
182
183 int read()
184 {
185 ASSERT(pos < length);
186 if (pos < length)
187 return input[pos];
188 return -1;
189 }
190
191 int readPair()
192 {
193 ASSERT(pos + 1 < length);
194 return input[pos] | input[pos + 1] << 16;
195 }
196
197 int readChecked(unsigned negativePositionOffest)
198 {
199 RELEASE_ASSERT(pos >= negativePositionOffest);
200 unsigned p = pos - negativePositionOffest;
201 ASSERT(p < length);
202 int result = input[p];
203 if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) {
204 if (atEnd())
205 return -1;
206
207 result = U16_GET_SUPPLEMENTARY(result, input[p + 1]);
208 next();
209 }
210 return result;
211 }
212
213 int readSurrogatePairChecked(unsigned negativePositionOffset)
214 {
215 RELEASE_ASSERT(pos >= negativePositionOffset);
216 unsigned p = pos - negativePositionOffset;
217 ASSERT(p < length);
218 if (p + 1 >= length)
219 return -1;
220
221 int first = input[p];
222 int second = input[p + 1];
223 if (U16_IS_LEAD(first) && U16_IS_TRAIL(second))
224 return U16_GET_SUPPLEMENTARY(first, second);
225
226 return -1;
227 }
228
229 int reread(unsigned from)
230 {
231 ASSERT(from < length);
232 int result = input[from];
233 if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1]))
234 result = U16_GET_SUPPLEMENTARY(result, input[from + 1]);
235 return result;
236 }
237
238 int prev()
239 {
240 ASSERT(!(pos > length));
241 if (pos && length)
242 return input[pos - 1];
243 return -1;
244 }
245
246 unsigned getPos()
247 {
248 return pos;
249 }
250
251 void setPos(unsigned p)
252 {
253 pos = p;
254 }
255
256 bool atStart()
257 {
258 return pos == 0;
259 }
260
261 bool atEnd()
262 {
263 return pos == length;
264 }
265
266 unsigned end()
267 {
268 return length;
269 }
270
271 bool checkInput(unsigned count)
272 {
273 if (((pos + count) <= length) && ((pos + count) >= pos)) {
274 pos += count;
275 return true;
276 }
277 return false;
278 }
279
280 void uncheckInput(unsigned count)
281 {
282 RELEASE_ASSERT(pos >= count);
283 pos -= count;
284 }
285
286 bool atStart(unsigned negativePositionOffset)
287 {
288 return pos == negativePositionOffset;
289 }
290
291 bool atEnd(unsigned negativePositionOffest)
292 {
293 RELEASE_ASSERT(pos >= negativePositionOffest);
294 return (pos - negativePositionOffest) == length;
295 }
296
297 bool isAvailableInput(unsigned offset)
298 {
299 return (((pos + offset) <= length) && ((pos + offset) >= pos));
300 }
301
302 private:
303 const CharType* input;
304 unsigned pos;
305 unsigned length;
306 bool decodeSurrogatePairs;
307 };
308
309 bool testCharacterClass(CharacterClass* characterClass, int ch)
310 {
311 auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) {
312 for (unsigned i = 0; i < matches.size(); ++i) {
313 if (ch == matches[i])
314 return true;
315 }
316
317 return false;
318 };
319
320 auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) {
321 size_t low = 0;
322 size_t high = matches.size() - 1;
323
324 while (low <= high) {
325 size_t mid = low + (high - low) / 2;
326 int diff = ch - matches[mid];
327 if (!diff)
328 return true;
329
330 if (diff < 0) {
331 if (mid == low)
332 return false;
333 high = mid - 1;
334 } else
335 low = mid + 1;
336 }
337 return false;
338 };
339
340 auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
341 for (unsigned i = 0; i < ranges.size(); ++i) {
342 if ((ch >= ranges[i].begin) && (ch <= ranges[i].end))
343 return true;
344 }
345
346 return false;
347 };
348
349 auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
350 size_t low = 0;
351 size_t high = ranges.size() - 1;
352
353 while (low <= high) {
354 size_t mid = low + (high - low) / 2;
355 int rangeBeginDiff = ch - ranges[mid].begin;
356 if (rangeBeginDiff >= 0 && ch <= ranges[mid].end)
357 return true;
358
359 if (rangeBeginDiff < 0) {
360 if (mid == low)
361 return false;
362 high = mid - 1;
363 } else
364 low = mid + 1;
365 }
366 return false;
367 };
368
369 if (characterClass->m_anyCharacter)
370 return true;
371
372 const size_t thresholdForBinarySearch = 6;
373
374 if (!isASCII(ch)) {
375 if (characterClass->m_matchesUnicode.size()) {
376 if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) {
377 if (binarySearchMatches(characterClass->m_matchesUnicode))
378 return true;
379 } else if (linearSearchMatches(characterClass->m_matchesUnicode))
380 return true;
381 }
382
383 if (characterClass->m_rangesUnicode.size()) {
384 if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) {
385 if (binarySearchRanges(characterClass->m_rangesUnicode))
386 return true;
387 } else if (linearSearchRanges(characterClass->m_rangesUnicode))
388 return true;
389 }
390 } else {
391 if (characterClass->m_matches.size()) {
392 if (characterClass->m_matches.size() > thresholdForBinarySearch) {
393 if (binarySearchMatches(characterClass->m_matches))
394 return true;
395 } else if (linearSearchMatches(characterClass->m_matches))
396 return true;
397 }
398
399 if (characterClass->m_ranges.size()) {
400 if (characterClass->m_ranges.size() > thresholdForBinarySearch) {
401 if (binarySearchRanges(characterClass->m_ranges))
402 return true;
403 } else if (linearSearchRanges(characterClass->m_ranges))
404 return true;
405 }
406 }
407
408 return false;
409 }
410
411 bool checkCharacter(int testChar, unsigned negativeInputOffset)
412 {
413 return testChar == input.readChecked(negativeInputOffset);
414 }
415
416 bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset)
417 {
418 return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset);
419 }
420
421 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
422 {
423 int ch = input.readChecked(negativeInputOffset);
424 return (loChar == ch) || (hiChar == ch);
425 }
426
427 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
428 {
429 bool match = testCharacterClass(characterClass, input.readChecked(negativeInputOffset));
430 return invert ? !match : match;
431 }
432
433 bool checkCharacterClassDontAdvanceInputForNonBMP(CharacterClass* characterClass, unsigned negativeInputOffset)
434 {
435 int readCharacter = characterClass->hasOnlyNonBMPCharacters() ? input.readSurrogatePairChecked(negativeInputOffset) : input.readChecked(negativeInputOffset);
436 return testCharacterClass(characterClass, readCharacter);
437 }
438
439 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
440 {
441 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
442
443 if (!input.checkInput(matchSize))
444 return false;
445
446 for (unsigned i = 0; i < matchSize; ++i) {
447 int oldCh = input.reread(matchBegin + i);
448 int ch;
449 if (!U_IS_BMP(oldCh)) {
450 ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i);
451 ++i;
452 } else
453 ch = input.readChecked(negativeInputOffset + matchSize - i);
454
455 if (oldCh == ch)
456 continue;
457
458 if (pattern->ignoreCase()) {
459 // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode
460 // patterns, Unicode values are never allowed to match against ASCII ones.
461 // For Unicode, we need to check all canonical equivalents of a character.
462 if (!unicode && (isASCII(oldCh) || isASCII(ch))) {
463 if (toASCIIUpper(oldCh) == toASCIIUpper(ch))
464 continue;
465 } else if (areCanonicallyEquivalent(oldCh, ch, unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2))
466 continue;
467 }
468
469 input.uncheckInput(matchSize);
470 return false;
471 }
472
473 return true;
474 }
475
476 bool matchAssertionBOL(ByteTerm& term)
477 {
478 return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition + 1)));
479 }
480
481 bool matchAssertionEOL(ByteTerm& term)
482 {
483 if (term.inputPosition)
484 return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readChecked(term.inputPosition)));
485
486 return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read()));
487 }
488
489 bool matchAssertionWordBoundary(ByteTerm& term)
490 {
491 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition + 1));
492 bool readIsWordchar;
493 if (term.inputPosition)
494 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(term.inputPosition));
495 else
496 readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read());
497
498 bool wordBoundary = prevIsWordchar != readIsWordchar;
499 return term.invert() ? !wordBoundary : wordBoundary;
500 }
501
502 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
503 {
504 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
505
506 switch (term.atom.quantityType) {
507 case QuantifierFixedCount:
508 break;
509
510 case QuantifierGreedy:
511 if (backTrack->matchAmount) {
512 --backTrack->matchAmount;
513 input.uncheckInput(U16_LENGTH(term.atom.patternCharacter));
514 return true;
515 }
516 break;
517
518 case QuantifierNonGreedy:
519 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
520 ++backTrack->matchAmount;
521 if (checkCharacter(term.atom.patternCharacter, term.inputPosition + 1))
522 return true;
523 }
524 input.setPos(backTrack->begin);
525 break;
526 }
527
528 return false;
529 }
530
531 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
532 {
533 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
534
535 switch (term.atom.quantityType) {
536 case QuantifierFixedCount:
537 break;
538
539 case QuantifierGreedy:
540 if (backTrack->matchAmount) {
541 --backTrack->matchAmount;
542 input.uncheckInput(1);
543 return true;
544 }
545 break;
546
547 case QuantifierNonGreedy:
548 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
549 ++backTrack->matchAmount;
550 if (checkCasedCharacter(term.atom.casedCharacter.lo, term.atom.casedCharacter.hi, term.inputPosition + 1))
551 return true;
552 }
553 input.uncheckInput(backTrack->matchAmount);
554 break;
555 }
556
557 return false;
558 }
559
560 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
561 {
562 ASSERT(term.type == ByteTerm::TypeCharacterClass);
563 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
564
565 switch (term.atom.quantityType) {
566 case QuantifierFixedCount: {
567 if (unicode) {
568 CharacterClass* charClass = term.atom.characterClass;
569 backTrack->begin = input.getPos();
570 unsigned matchAmount = 0;
571 for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
572 if (term.invert()) {
573 if (!checkCharacterClass(charClass, term.invert(), term.inputPosition - matchAmount)) {
574 input.setPos(backTrack->begin);
575 return false;
576 }
577 } else {
578 unsigned matchOffset = matchAmount * (charClass->hasOnlyNonBMPCharacters() ? 2 : 1);
579 if (!checkCharacterClassDontAdvanceInputForNonBMP(charClass, term.inputPosition - matchOffset)) {
580 input.setPos(backTrack->begin);
581 return false;
582 }
583 }
584 }
585
586 return true;
587 }
588
589 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
590 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition - matchAmount))
591 return false;
592 }
593 return true;
594 }
595
596 case QuantifierGreedy: {
597 unsigned position = input.getPos();
598 backTrack->begin = position;
599 unsigned matchAmount = 0;
600 while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
601 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
602 input.setPos(position);
603 break;
604 }
605 ++matchAmount;
606 position = input.getPos();
607 }
608 backTrack->matchAmount = matchAmount;
609
610 return true;
611 }
612
613 case QuantifierNonGreedy:
614 backTrack->begin = input.getPos();
615 backTrack->matchAmount = 0;
616 return true;
617 }
618
619 RELEASE_ASSERT_NOT_REACHED();
620 return false;
621 }
622
623 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
624 {
625 ASSERT(term.type == ByteTerm::TypeCharacterClass);
626 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
627
628 switch (term.atom.quantityType) {
629 case QuantifierFixedCount:
630 if (unicode)
631 input.setPos(backTrack->begin);
632 break;
633
634 case QuantifierGreedy:
635 if (backTrack->matchAmount) {
636 if (unicode) {
637 // Rematch one less match
638 input.setPos(backTrack->begin);
639 --backTrack->matchAmount;
640 for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) {
641 if (!checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1)) {
642 input.uncheckInput(1);
643 break;
644 }
645 }
646 return true;
647 }
648 --backTrack->matchAmount;
649 input.uncheckInput(1);
650 return true;
651 }
652 break;
653
654 case QuantifierNonGreedy:
655 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
656 ++backTrack->matchAmount;
657 if (checkCharacterClass(term.atom.characterClass, term.invert(), term.inputPosition + 1))
658 return true;
659 }
660 input.setPos(backTrack->begin);
661 break;
662 }
663
664 return false;
665 }
666
667 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
668 {
669 ASSERT(term.type == ByteTerm::TypeBackReference);
670 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
671
672 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
673 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
674
675 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
676 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
677 // Eg.: /(a\1)/
678 if (matchEnd == offsetNoMatch)
679 return true;
680
681 if (matchBegin == offsetNoMatch)
682 return true;
683
684 ASSERT(matchBegin <= matchEnd);
685
686 if (matchBegin == matchEnd)
687 return true;
688
689 switch (term.atom.quantityType) {
690 case QuantifierFixedCount: {
691 backTrack->begin = input.getPos();
692 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
693 if (!tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
694 input.setPos(backTrack->begin);
695 return false;
696 }
697 }
698 return true;
699 }
700
701 case QuantifierGreedy: {
702 unsigned matchAmount = 0;
703 while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition))
704 ++matchAmount;
705 backTrack->matchAmount = matchAmount;
706 return true;
707 }
708
709 case QuantifierNonGreedy:
710 backTrack->begin = input.getPos();
711 backTrack->matchAmount = 0;
712 return true;
713 }
714
715 RELEASE_ASSERT_NOT_REACHED();
716 return false;
717 }
718
719 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
720 {
721 ASSERT(term.type == ByteTerm::TypeBackReference);
722 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
723
724 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
725 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
726
727 if (matchBegin == offsetNoMatch)
728 return false;
729
730 ASSERT(matchBegin <= matchEnd);
731
732 if (matchBegin == matchEnd)
733 return false;
734
735 switch (term.atom.quantityType) {
736 case QuantifierFixedCount:
737 // for quantityMaxCount == 1, could rewind.
738 input.setPos(backTrack->begin);
739 break;
740
741 case QuantifierGreedy:
742 if (backTrack->matchAmount) {
743 --backTrack->matchAmount;
744 input.rewind(matchEnd - matchBegin);
745 return true;
746 }
747 break;
748
749 case QuantifierNonGreedy:
750 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term.inputPosition)) {
751 ++backTrack->matchAmount;
752 return true;
753 }
754 input.setPos(backTrack->begin);
755 break;
756 }
757
758 return false;
759 }
760
761 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
762 {
763 if (term.capture()) {
764 unsigned subpatternId = term.atom.subpatternId;
765 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition;
766 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition;
767 }
768 }
769 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
770 {
771 unsigned firstSubpatternId = term.atom.subpatternId;
772 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
773 context->restoreOutput(output, firstSubpatternId, count);
774 }
775 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
776 {
777 while (backTrack->matchAmount) {
778 ParenthesesDisjunctionContext* context = backTrack->lastContext;
779
780 JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(term), true);
781 if (result == JSRegExpMatch)
782 return JSRegExpMatch;
783
784 resetMatches(term, context);
785 popParenthesesDisjunctionContext(backTrack);
786 freeParenthesesDisjunctionContext(context);
787
788 if (result != JSRegExpNoMatch)
789 return result;
790 }
791
792 return JSRegExpNoMatch;
793 }
794
795 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
796 {
797 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
798 ASSERT(term.atom.quantityMaxCount == 1);
799
800 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
801
802 switch (term.atom.quantityType) {
803 case QuantifierGreedy: {
804 // set this speculatively; if we get to the parens end this will be true.
805 backTrack->begin = input.getPos();
806 break;
807 }
808 case QuantifierNonGreedy: {
809 backTrack->begin = notFound;
810 context->term += term.atom.parenthesesWidth;
811 return true;
812 }
813 case QuantifierFixedCount:
814 break;
815 }
816
817 if (term.capture()) {
818 unsigned subpatternId = term.atom.subpatternId;
819 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
820 }
821
822 return true;
823 }
824
825 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
826 {
827 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
828 ASSERT(term.atom.quantityMaxCount == 1);
829
830 if (term.capture()) {
831 unsigned subpatternId = term.atom.subpatternId;
832 output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition;
833 }
834
835 if (term.atom.quantityType == QuantifierFixedCount)
836 return true;
837
838 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
839 return backTrack->begin != input.getPos();
840 }
841
842 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
843 {
844 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
845 ASSERT(term.atom.quantityMaxCount == 1);
846
847 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
848
849 if (term.capture()) {
850 unsigned subpatternId = term.atom.subpatternId;
851 output[(subpatternId << 1)] = offsetNoMatch;
852 output[(subpatternId << 1) + 1] = offsetNoMatch;
853 }
854
855 switch (term.atom.quantityType) {
856 case QuantifierGreedy:
857 // if we backtrack to this point, there is another chance - try matching nothing.
858 ASSERT(backTrack->begin != notFound);
859 backTrack->begin = notFound;
860 context->term += term.atom.parenthesesWidth;
861 return true;
862 case QuantifierNonGreedy:
863 ASSERT(backTrack->begin != notFound);
864 FALLTHROUGH;
865 case QuantifierFixedCount:
866 break;
867 }
868
869 return false;
870 }
871
872 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
873 {
874 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
875 ASSERT(term.atom.quantityMaxCount == 1);
876
877 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
878
879 switch (term.atom.quantityType) {
880 case QuantifierGreedy:
881 if (backTrack->begin == notFound) {
882 context->term -= term.atom.parenthesesWidth;
883 return false;
884 }
885 FALLTHROUGH;
886 case QuantifierNonGreedy:
887 if (backTrack->begin == notFound) {
888 backTrack->begin = input.getPos();
889 if (term.capture()) {
890 // Technically this access to inputPosition should be accessing the begin term's
891 // inputPosition, but for repeats other than fixed these values should be
892 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
893 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
894 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
895 unsigned subpatternId = term.atom.subpatternId;
896 output[subpatternId << 1] = input.getPos() - term.inputPosition;
897 }
898 context->term -= term.atom.parenthesesWidth;
899 return true;
900 }
901 FALLTHROUGH;
902 case QuantifierFixedCount:
903 break;
904 }
905
906 return false;
907 }
908
909 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
910 {
911 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
912 ASSERT(term.atom.quantityType == QuantifierGreedy);
913 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
914 ASSERT(!term.capture());
915
916 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
917 backTrack->begin = input.getPos();
918 return true;
919 }
920
921 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
922 {
923 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
924
925 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
926 // Empty match is a failed match.
927 if (backTrack->begin == input.getPos())
928 return false;
929
930 // Successful match! Okay, what's next? - loop around and try to match more!
931 context->term -= (term.atom.parenthesesWidth + 1);
932 return true;
933 }
934
935 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
936 {
937 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
938 ASSERT(term.atom.quantityType == QuantifierGreedy);
939 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
940 ASSERT(!term.capture());
941
942 // If we backtrack to this point, we have failed to match this iteration of the parens.
943 // Since this is greedy / zero minimum a failed is also accepted as a match!
944 context->term += term.atom.parenthesesWidth;
945 return true;
946 }
947
948 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
949 {
950 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
951 // should always be returned as a successful match - we should never backtrack to here.
952 RELEASE_ASSERT_NOT_REACHED();
953 return false;
954 }
955
956 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
957 {
958 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
959 ASSERT(term.atom.quantityMaxCount == 1);
960
961 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
962
963 backTrack->begin = input.getPos();
964 return true;
965 }
966
967 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
968 {
969 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
970 ASSERT(term.atom.quantityMaxCount == 1);
971
972 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
973
974 input.setPos(backTrack->begin);
975
976 // We've reached the end of the parens; if they are inverted, this is failure.
977 if (term.invert()) {
978 context->term -= term.atom.parenthesesWidth;
979 return false;
980 }
981
982 return true;
983 }
984
985 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
986 {
987 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
988 ASSERT(term.atom.quantityMaxCount == 1);
989
990 // We've failed to match parens; if they are inverted, this is win!
991 if (term.invert()) {
992 context->term += term.atom.parenthesesWidth;
993 return true;
994 }
995
996 return false;
997 }
998
999 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
1000 {
1001 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
1002 ASSERT(term.atom.quantityMaxCount == 1);
1003
1004 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
1005
1006 input.setPos(backTrack->begin);
1007
1008 context->term -= term.atom.parenthesesWidth;
1009 return false;
1010 }
1011
1012 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
1013 {
1014 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1015
1016 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1017 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1018
1019 backTrack->matchAmount = 0;
1020 backTrack->lastContext = 0;
1021
1022 ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount);
1023
1024 unsigned minimumMatchCount = term.atom.quantityMinCount;
1025 JSRegExpResult fixedMatchResult;
1026
1027 // Handle fixed matches and the minimum part of a variable length match.
1028 if (minimumMatchCount) {
1029 // While we haven't yet reached our fixed limit,
1030 while (backTrack->matchAmount < minimumMatchCount) {
1031 // Try to do a match, and it it succeeds, add it to the list.
1032 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1033 fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1034 if (fixedMatchResult == JSRegExpMatch)
1035 appendParenthesesDisjunctionContext(backTrack, context);
1036 else {
1037 // The match failed; try to find an alternate point to carry on from.
1038 resetMatches(term, context);
1039 freeParenthesesDisjunctionContext(context);
1040
1041 if (fixedMatchResult != JSRegExpNoMatch)
1042 return fixedMatchResult;
1043 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1044 if (backtrackResult != JSRegExpMatch)
1045 return backtrackResult;
1046 }
1047 }
1048
1049 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1050 recordParenthesesMatch(term, context);
1051 }
1052
1053 switch (term.atom.quantityType) {
1054 case QuantifierFixedCount: {
1055 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1056 return JSRegExpMatch;
1057 }
1058
1059 case QuantifierGreedy: {
1060 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1061 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1062 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1063 if (result == JSRegExpMatch)
1064 appendParenthesesDisjunctionContext(backTrack, context);
1065 else {
1066 resetMatches(term, context);
1067 freeParenthesesDisjunctionContext(context);
1068
1069 if (result != JSRegExpNoMatch)
1070 return result;
1071
1072 break;
1073 }
1074 }
1075
1076 if (backTrack->matchAmount) {
1077 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1078 recordParenthesesMatch(term, context);
1079 }
1080 return JSRegExpMatch;
1081 }
1082
1083 case QuantifierNonGreedy:
1084 return JSRegExpMatch;
1085 }
1086
1087 RELEASE_ASSERT_NOT_REACHED();
1088 return JSRegExpErrorNoMatch;
1089 }
1090
1091 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
1092 //
1093 // Greedy matches never should try just adding more - you should already have done
1094 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
1095 // you backtrack an item off the list needs checking, since we'll never have matched
1096 // the one less case. Tracking forwards, still add as much as possible.
1097 //
1098 // Non-greedy, we've already done the one less case, so don't match on popping.
1099 // We haven't done the one more case, so always try to add that.
1100 //
1101 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
1102 {
1103 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1104
1105 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1106 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1107
1108 switch (term.atom.quantityType) {
1109 case QuantifierFixedCount: {
1110 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1111
1112 ParenthesesDisjunctionContext* context = 0;
1113 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
1114
1115 if (result != JSRegExpMatch)
1116 return result;
1117
1118 // While we haven't yet reached our fixed limit,
1119 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1120 // Try to do a match, and it it succeeds, add it to the list.
1121 context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1122 result = matchDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1123
1124 if (result == JSRegExpMatch)
1125 appendParenthesesDisjunctionContext(backTrack, context);
1126 else {
1127 // The match failed; try to find an alternate point to carry on from.
1128 resetMatches(term, context);
1129 freeParenthesesDisjunctionContext(context);
1130
1131 if (result != JSRegExpNoMatch)
1132 return result;
1133 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1134 if (backtrackResult != JSRegExpMatch)
1135 return backtrackResult;
1136 }
1137 }
1138
1139 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1140 context = backTrack->lastContext;
1141 recordParenthesesMatch(term, context);
1142 return JSRegExpMatch;
1143 }
1144
1145 case QuantifierGreedy: {
1146 if (!backTrack->matchAmount)
1147 return JSRegExpNoMatch;
1148
1149 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1150 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1151 if (result == JSRegExpMatch) {
1152 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1153 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1154 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1155 if (parenthesesResult == JSRegExpMatch)
1156 appendParenthesesDisjunctionContext(backTrack, context);
1157 else {
1158 resetMatches(term, context);
1159 freeParenthesesDisjunctionContext(context);
1160
1161 if (parenthesesResult != JSRegExpNoMatch)
1162 return parenthesesResult;
1163
1164 break;
1165 }
1166 }
1167 } else {
1168 resetMatches(term, context);
1169 popParenthesesDisjunctionContext(backTrack);
1170 freeParenthesesDisjunctionContext(context);
1171
1172 if (result != JSRegExpNoMatch || backTrack->matchAmount < term.atom.quantityMinCount)
1173 return result;
1174 }
1175
1176 if (backTrack->matchAmount) {
1177 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1178 recordParenthesesMatch(term, context);
1179 }
1180 return JSRegExpMatch;
1181 }
1182
1183 case QuantifierNonGreedy: {
1184 // If we've not reached the limit, try to add one more match.
1185 if (backTrack->matchAmount < term.atom.quantityMaxCount) {
1186 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term);
1187 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term));
1188 if (result == JSRegExpMatch) {
1189 appendParenthesesDisjunctionContext(backTrack, context);
1190 recordParenthesesMatch(term, context);
1191 return JSRegExpMatch;
1192 }
1193
1194 resetMatches(term, context);
1195 freeParenthesesDisjunctionContext(context);
1196
1197 if (result != JSRegExpNoMatch)
1198 return result;
1199 }
1200
1201 // Nope - okay backtrack looking for an alternative.
1202 while (backTrack->matchAmount) {
1203 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1204 JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(term), true);
1205 if (result == JSRegExpMatch) {
1206 // successful backtrack! we're back in the game!
1207 if (backTrack->matchAmount) {
1208 context = backTrack->lastContext;
1209 recordParenthesesMatch(term, context);
1210 }
1211 return JSRegExpMatch;
1212 }
1213
1214 // pop a match off the stack
1215 resetMatches(term, context);
1216 popParenthesesDisjunctionContext(backTrack);
1217 freeParenthesesDisjunctionContext(context);
1218
1219 if (result != JSRegExpNoMatch)
1220 return result;
1221 }
1222
1223 return JSRegExpNoMatch;
1224 }
1225 }
1226
1227 RELEASE_ASSERT_NOT_REACHED();
1228 return JSRegExpErrorNoMatch;
1229 }
1230
1231 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1232 {
1233 UNUSED_PARAM(term);
1234
1235 if (pattern->dotAll()) {
1236 context->matchBegin = startOffset;
1237 context->matchEnd = input.end();
1238 return true;
1239 }
1240
1241 unsigned matchBegin = context->matchBegin;
1242
1243 if (matchBegin > startOffset) {
1244 for (matchBegin--; true; matchBegin--) {
1245 if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) {
1246 ++matchBegin;
1247 break;
1248 }
1249
1250 if (matchBegin == startOffset)
1251 break;
1252 }
1253 }
1254
1255 unsigned matchEnd = input.getPos();
1256
1257 for (; (matchEnd != input.end())
1258 && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { }
1259
1260 if (((matchBegin && term.anchors.m_bol)
1261 || ((matchEnd != input.end()) && term.anchors.m_eol))
1262 && !pattern->multiline())
1263 return false;
1264
1265 context->matchBegin = matchBegin;
1266 context->matchEnd = matchEnd;
1267 return true;
1268 }
1269
1270#define MATCH_NEXT() { ++context->term; goto matchAgain; }
1271#define BACKTRACK() { --context->term; goto backtrack; }
1272#define currentTerm() (disjunction->terms[context->term])
1273 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1274 {
1275 if (!--remainingMatchCount)
1276 return JSRegExpErrorHitLimit;
1277
1278 if (btrack)
1279 BACKTRACK();
1280
1281 context->matchBegin = input.getPos();
1282 context->term = 0;
1283
1284 matchAgain:
1285 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1286
1287 switch (currentTerm().type) {
1288 case ByteTerm::TypeSubpatternBegin:
1289 MATCH_NEXT();
1290 case ByteTerm::TypeSubpatternEnd:
1291 context->matchEnd = input.getPos();
1292 return JSRegExpMatch;
1293
1294 case ByteTerm::TypeBodyAlternativeBegin:
1295 MATCH_NEXT();
1296 case ByteTerm::TypeBodyAlternativeDisjunction:
1297 case ByteTerm::TypeBodyAlternativeEnd:
1298 context->matchEnd = input.getPos();
1299 return JSRegExpMatch;
1300
1301 case ByteTerm::TypeAlternativeBegin:
1302 MATCH_NEXT();
1303 case ByteTerm::TypeAlternativeDisjunction:
1304 case ByteTerm::TypeAlternativeEnd: {
1305 int offset = currentTerm().alternative.end;
1306 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1307 backTrack->offset = offset;
1308 context->term += offset;
1309 MATCH_NEXT();
1310 }
1311
1312 case ByteTerm::TypeAssertionBOL:
1313 if (matchAssertionBOL(currentTerm()))
1314 MATCH_NEXT();
1315 BACKTRACK();
1316 case ByteTerm::TypeAssertionEOL:
1317 if (matchAssertionEOL(currentTerm()))
1318 MATCH_NEXT();
1319 BACKTRACK();
1320 case ByteTerm::TypeAssertionWordBoundary:
1321 if (matchAssertionWordBoundary(currentTerm()))
1322 MATCH_NEXT();
1323 BACKTRACK();
1324
1325 case ByteTerm::TypePatternCharacterOnce:
1326 case ByteTerm::TypePatternCharacterFixed: {
1327 if (unicode) {
1328 if (!U_IS_BMP(currentTerm().atom.patternCharacter)) {
1329 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1330 if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) {
1331 BACKTRACK();
1332 }
1333 }
1334 MATCH_NEXT();
1335 }
1336 }
1337 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1338
1339 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1340 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) {
1341 input.setPos(position);
1342 BACKTRACK();
1343 }
1344 }
1345 MATCH_NEXT();
1346 }
1347 case ByteTerm::TypePatternCharacterGreedy: {
1348 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1349 unsigned matchAmount = 0;
1350 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1351 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1352 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1353 input.setPos(position);
1354 break;
1355 }
1356 ++matchAmount;
1357 position = input.getPos();
1358 }
1359 backTrack->matchAmount = matchAmount;
1360
1361 MATCH_NEXT();
1362 }
1363 case ByteTerm::TypePatternCharacterNonGreedy: {
1364 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1365 backTrack->begin = input.getPos();
1366 backTrack->matchAmount = 0;
1367 MATCH_NEXT();
1368 }
1369
1370 case ByteTerm::TypePatternCasedCharacterOnce:
1371 case ByteTerm::TypePatternCasedCharacterFixed: {
1372 if (unicode) {
1373 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1374 ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter));
1375
1376 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1377
1378 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1379 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) {
1380 input.setPos(position);
1381 BACKTRACK();
1382 }
1383 }
1384 MATCH_NEXT();
1385 }
1386
1387 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1388 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1389 BACKTRACK();
1390 }
1391 MATCH_NEXT();
1392 }
1393 case ByteTerm::TypePatternCasedCharacterGreedy: {
1394 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1395
1396 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1397 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1398
1399 unsigned matchAmount = 0;
1400 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1401 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1402 input.uncheckInput(1);
1403 break;
1404 }
1405 ++matchAmount;
1406 }
1407 backTrack->matchAmount = matchAmount;
1408
1409 MATCH_NEXT();
1410 }
1411 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1412 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1413
1414 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1415 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1416
1417 backTrack->matchAmount = 0;
1418 MATCH_NEXT();
1419 }
1420
1421 case ByteTerm::TypeCharacterClass:
1422 if (matchCharacterClass(currentTerm(), context))
1423 MATCH_NEXT();
1424 BACKTRACK();
1425 case ByteTerm::TypeBackReference:
1426 if (matchBackReference(currentTerm(), context))
1427 MATCH_NEXT();
1428 BACKTRACK();
1429 case ByteTerm::TypeParenthesesSubpattern: {
1430 JSRegExpResult result = matchParentheses(currentTerm(), context);
1431
1432 if (result == JSRegExpMatch) {
1433 MATCH_NEXT();
1434 } else if (result != JSRegExpNoMatch)
1435 return result;
1436
1437 BACKTRACK();
1438 }
1439 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1440 if (matchParenthesesOnceBegin(currentTerm(), context))
1441 MATCH_NEXT();
1442 BACKTRACK();
1443 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1444 if (matchParenthesesOnceEnd(currentTerm(), context))
1445 MATCH_NEXT();
1446 BACKTRACK();
1447 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1448 if (matchParenthesesTerminalBegin(currentTerm(), context))
1449 MATCH_NEXT();
1450 BACKTRACK();
1451 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1452 if (matchParenthesesTerminalEnd(currentTerm(), context))
1453 MATCH_NEXT();
1454 BACKTRACK();
1455 case ByteTerm::TypeParentheticalAssertionBegin:
1456 if (matchParentheticalAssertionBegin(currentTerm(), context))
1457 MATCH_NEXT();
1458 BACKTRACK();
1459 case ByteTerm::TypeParentheticalAssertionEnd:
1460 if (matchParentheticalAssertionEnd(currentTerm(), context))
1461 MATCH_NEXT();
1462 BACKTRACK();
1463
1464 case ByteTerm::TypeCheckInput:
1465 if (input.checkInput(currentTerm().checkInputCount))
1466 MATCH_NEXT();
1467 BACKTRACK();
1468
1469 case ByteTerm::TypeUncheckInput:
1470 input.uncheckInput(currentTerm().checkInputCount);
1471 MATCH_NEXT();
1472
1473 case ByteTerm::TypeDotStarEnclosure:
1474 if (matchDotStarEnclosure(currentTerm(), context))
1475 return JSRegExpMatch;
1476 BACKTRACK();
1477 }
1478
1479 // We should never fall-through to here.
1480 RELEASE_ASSERT_NOT_REACHED();
1481
1482 backtrack:
1483 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1484
1485 switch (currentTerm().type) {
1486 case ByteTerm::TypeSubpatternBegin:
1487 return JSRegExpNoMatch;
1488 case ByteTerm::TypeSubpatternEnd:
1489 RELEASE_ASSERT_NOT_REACHED();
1490
1491 case ByteTerm::TypeBodyAlternativeBegin:
1492 case ByteTerm::TypeBodyAlternativeDisjunction: {
1493 int offset = currentTerm().alternative.next;
1494 context->term += offset;
1495 if (offset > 0)
1496 MATCH_NEXT();
1497
1498 if (input.atEnd() || pattern->sticky())
1499 return JSRegExpNoMatch;
1500
1501 input.next();
1502
1503 context->matchBegin = input.getPos();
1504
1505 if (currentTerm().alternative.onceThrough)
1506 context->term += currentTerm().alternative.next;
1507
1508 MATCH_NEXT();
1509 }
1510 case ByteTerm::TypeBodyAlternativeEnd:
1511 RELEASE_ASSERT_NOT_REACHED();
1512
1513 case ByteTerm::TypeAlternativeBegin:
1514 case ByteTerm::TypeAlternativeDisjunction: {
1515 int offset = currentTerm().alternative.next;
1516 context->term += offset;
1517 if (offset > 0)
1518 MATCH_NEXT();
1519 BACKTRACK();
1520 }
1521 case ByteTerm::TypeAlternativeEnd: {
1522 // We should never backtrack back into an alternative of the main body of the regex.
1523 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1524 unsigned offset = backTrack->offset;
1525 context->term -= offset;
1526 BACKTRACK();
1527 }
1528
1529 case ByteTerm::TypeAssertionBOL:
1530 case ByteTerm::TypeAssertionEOL:
1531 case ByteTerm::TypeAssertionWordBoundary:
1532 BACKTRACK();
1533
1534 case ByteTerm::TypePatternCharacterOnce:
1535 case ByteTerm::TypePatternCharacterFixed:
1536 case ByteTerm::TypePatternCharacterGreedy:
1537 case ByteTerm::TypePatternCharacterNonGreedy:
1538 if (backtrackPatternCharacter(currentTerm(), context))
1539 MATCH_NEXT();
1540 BACKTRACK();
1541 case ByteTerm::TypePatternCasedCharacterOnce:
1542 case ByteTerm::TypePatternCasedCharacterFixed:
1543 case ByteTerm::TypePatternCasedCharacterGreedy:
1544 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1545 if (backtrackPatternCasedCharacter(currentTerm(), context))
1546 MATCH_NEXT();
1547 BACKTRACK();
1548 case ByteTerm::TypeCharacterClass:
1549 if (backtrackCharacterClass(currentTerm(), context))
1550 MATCH_NEXT();
1551 BACKTRACK();
1552 case ByteTerm::TypeBackReference:
1553 if (backtrackBackReference(currentTerm(), context))
1554 MATCH_NEXT();
1555 BACKTRACK();
1556 case ByteTerm::TypeParenthesesSubpattern: {
1557 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1558
1559 if (result == JSRegExpMatch) {
1560 MATCH_NEXT();
1561 } else if (result != JSRegExpNoMatch)
1562 return result;
1563
1564 BACKTRACK();
1565 }
1566 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1567 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1568 MATCH_NEXT();
1569 BACKTRACK();
1570 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1571 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1572 MATCH_NEXT();
1573 BACKTRACK();
1574 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1575 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1576 MATCH_NEXT();
1577 BACKTRACK();
1578 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1579 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1580 MATCH_NEXT();
1581 BACKTRACK();
1582 case ByteTerm::TypeParentheticalAssertionBegin:
1583 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1584 MATCH_NEXT();
1585 BACKTRACK();
1586 case ByteTerm::TypeParentheticalAssertionEnd:
1587 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1588 MATCH_NEXT();
1589 BACKTRACK();
1590
1591 case ByteTerm::TypeCheckInput:
1592 input.uncheckInput(currentTerm().checkInputCount);
1593 BACKTRACK();
1594
1595 case ByteTerm::TypeUncheckInput:
1596 input.checkInput(currentTerm().checkInputCount);
1597 BACKTRACK();
1598
1599 case ByteTerm::TypeDotStarEnclosure:
1600 RELEASE_ASSERT_NOT_REACHED();
1601 }
1602
1603 RELEASE_ASSERT_NOT_REACHED();
1604 return JSRegExpErrorNoMatch;
1605 }
1606
1607 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1608 {
1609 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1610
1611 if (result == JSRegExpMatch) {
1612 while (context->matchBegin == context->matchEnd) {
1613 result = matchDisjunction(disjunction, context, true);
1614 if (result != JSRegExpMatch)
1615 return result;
1616 }
1617 return JSRegExpMatch;
1618 }
1619
1620 return result;
1621 }
1622
1623 unsigned interpret()
1624 {
1625 // FIXME: https://bugs.webkit.org/show_bug.cgi?id=195970
1626 // [Yarr Interpreter] The interpreter doesn't have checks for stack overflow due to deep recursion
1627 if (!input.isAvailableInput(0))
1628 return offsetNoMatch;
1629
1630 if (pattern->m_lock)
1631 pattern->m_lock->lock();
1632
1633 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1634 output[i << 1] = offsetNoMatch;
1635
1636 allocatorPool = pattern->m_allocator->startAllocator();
1637 RELEASE_ASSERT(allocatorPool);
1638
1639 DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
1640
1641 JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
1642 if (result == JSRegExpMatch) {
1643 output[0] = context->matchBegin;
1644 output[1] = context->matchEnd;
1645 }
1646
1647 freeDisjunctionContext(context);
1648
1649 pattern->m_allocator->stopAllocator();
1650
1651 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1652
1653 if (pattern->m_lock)
1654 pattern->m_lock->unlock();
1655
1656 return output[0];
1657 }
1658
1659 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1660 : pattern(pattern)
1661 , unicode(pattern->unicode())
1662 , output(output)
1663 , input(input, start, length, pattern->unicode())
1664 , startOffset(start)
1665 , remainingMatchCount(matchLimit)
1666 {
1667 }
1668
1669private:
1670 BytecodePattern* pattern;
1671 bool unicode;
1672 unsigned* output;
1673 InputStream input;
1674 WTF::BumpPointerPool* allocatorPool { nullptr };
1675 unsigned startOffset;
1676 unsigned remainingMatchCount;
1677};
1678
1679class ByteCompiler {
1680 struct ParenthesesStackEntry {
1681 unsigned beginTerm;
1682 unsigned savedAlternativeIndex;
1683 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1684 : beginTerm(beginTerm)
1685 , savedAlternativeIndex(savedAlternativeIndex)
1686 {
1687 }
1688 };
1689
1690public:
1691 ByteCompiler(YarrPattern& pattern)
1692 : m_pattern(pattern)
1693 {
1694 }
1695
1696 std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock, ErrorCode& errorCode)
1697 {
1698 if (UNLIKELY(!isSafeToRecurse())) {
1699 errorCode = ErrorCode::TooManyDisjunctions;
1700 return nullptr;
1701 }
1702
1703 regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough());
1704 if (auto error = emitDisjunction(m_pattern.m_body, 0, 0)) {
1705 errorCode = error.value();
1706 return nullptr;
1707 }
1708 regexEnd();
1709
1710#ifndef NDEBUG
1711 if (Options::dumpCompiledRegExpPatterns())
1712 dumpDisjunction(m_bodyDisjunction.get());
1713#endif
1714
1715 return makeUnique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock);
1716 }
1717
1718 void checkInput(unsigned count)
1719 {
1720 m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count));
1721 }
1722
1723 void uncheckInput(unsigned count)
1724 {
1725 m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count));
1726 }
1727
1728 void assertionBOL(unsigned inputPosition)
1729 {
1730 m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition));
1731 }
1732
1733 void assertionEOL(unsigned inputPosition)
1734 {
1735 m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition));
1736 }
1737
1738 void assertionWordBoundary(bool invert, unsigned inputPosition)
1739 {
1740 m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, inputPosition));
1741 }
1742
1743 void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1744 {
1745 if (m_pattern.ignoreCase()) {
1746 UChar32 lo = u_tolower(ch);
1747 UChar32 hi = u_toupper(ch);
1748
1749 if (lo != hi) {
1750 m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType));
1751 return;
1752 }
1753 }
1754
1755 m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType));
1756 }
1757
1758 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1759 {
1760 m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition));
1761
1762 m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1763 m_bodyDisjunction->terms.last().atom.quantityType = quantityType;
1764 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1765 }
1766
1767 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1768 {
1769 ASSERT(subpatternId);
1770
1771 m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, inputPosition));
1772
1773 m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1774 m_bodyDisjunction->terms.last().atom.quantityType = quantityType;
1775 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1776 }
1777
1778 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1779 {
1780 unsigned beginTerm = m_bodyDisjunction->terms.size();
1781
1782 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1783 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1784 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1785 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1786
1787 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1788 m_currentAlternativeIndex = beginTerm + 1;
1789 }
1790
1791 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1792 {
1793 unsigned beginTerm = m_bodyDisjunction->terms.size();
1794
1795 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1796 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1797 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1798 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1799
1800 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1801 m_currentAlternativeIndex = beginTerm + 1;
1802 }
1803
1804 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1805 {
1806 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1807 // then fix this up at the end! - simplifying this should make it much clearer.
1808 // https://bugs.webkit.org/show_bug.cgi?id=50136
1809
1810 unsigned beginTerm = m_bodyDisjunction->terms.size();
1811
1812 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1813 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1814 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1815 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1816
1817 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1818 m_currentAlternativeIndex = beginTerm + 1;
1819 }
1820
1821 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1822 {
1823 unsigned beginTerm = m_bodyDisjunction->terms.size();
1824
1825 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1826 m_bodyDisjunction->terms.last().frameLocation = frameLocation;
1827 m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin());
1828 m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation;
1829
1830 m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1831 m_currentAlternativeIndex = beginTerm + 1;
1832 }
1833
1834 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1835 {
1836 unsigned beginTerm = popParenthesesStack();
1837 closeAlternative(beginTerm + 1);
1838 unsigned endTerm = m_bodyDisjunction->terms.size();
1839
1840 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1841
1842 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1843 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1844
1845 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1846 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1847 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1848 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1849
1850 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1851 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1852 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1853 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1854 }
1855
1856 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1857 {
1858 m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored));
1859 }
1860
1861 unsigned popParenthesesStack()
1862 {
1863 ASSERT(m_parenthesesStack.size());
1864 unsigned beginTerm = m_parenthesesStack.last().beginTerm;
1865 m_currentAlternativeIndex = m_parenthesesStack.last().savedAlternativeIndex;
1866 m_parenthesesStack.removeLast();
1867
1868 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1869 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1870
1871 return beginTerm;
1872 }
1873
1874 void closeAlternative(unsigned beginTerm)
1875 {
1876 unsigned origBeginTerm = beginTerm;
1877 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1878 unsigned endIndex = m_bodyDisjunction->terms.size();
1879
1880 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1881
1882 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1883 m_bodyDisjunction->terms.remove(beginTerm);
1884 else {
1885 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1886 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1887 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1888 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1889 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1890 }
1891
1892 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1893
1894 m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd());
1895 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1896 }
1897 }
1898
1899 void closeBodyAlternative()
1900 {
1901 unsigned beginTerm = 0;
1902 unsigned origBeginTerm = 0;
1903 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1904 unsigned endIndex = m_bodyDisjunction->terms.size();
1905
1906 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1907
1908 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1909 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1910 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1911 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1912 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1913 }
1914
1915 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1916
1917 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd());
1918 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1919 }
1920
1921 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1922 {
1923 unsigned beginTerm = popParenthesesStack();
1924 closeAlternative(beginTerm + 1);
1925 unsigned endTerm = m_bodyDisjunction->terms.size();
1926
1927 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1928
1929 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1930
1931 bool capture = parenthesesBegin.capture();
1932 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1933
1934 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1935 auto parenthesesDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize);
1936
1937 unsigned firstTermInParentheses = beginTerm + 1;
1938 parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2);
1939
1940 parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin());
1941 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
1942 parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]);
1943 parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd());
1944
1945 m_bodyDisjunction->terms.shrink(beginTerm);
1946
1947 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
1948 m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction));
1949
1950 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1951 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1952 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1953 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1954 }
1955
1956 void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1957 {
1958 unsigned beginTerm = popParenthesesStack();
1959 closeAlternative(beginTerm + 1);
1960 unsigned endTerm = m_bodyDisjunction->terms.size();
1961
1962 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1963
1964 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1965 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1966
1967 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1968 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1969 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1970 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1971
1972 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1973 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1974 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1975 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1976 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1977 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1978 }
1979
1980 void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1981 {
1982 unsigned beginTerm = popParenthesesStack();
1983 closeAlternative(beginTerm + 1);
1984 unsigned endTerm = m_bodyDisjunction->terms.size();
1985
1986 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1987
1988 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1989 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1990
1991 m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1992 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1993 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1994 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1995
1996 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1997 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1998 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1999 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
2000 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
2001 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
2002 }
2003
2004 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
2005 {
2006 m_bodyDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize);
2007 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough));
2008 m_bodyDisjunction->terms[0].frameLocation = 0;
2009 m_currentAlternativeIndex = 0;
2010 }
2011
2012 void regexEnd()
2013 {
2014 closeBodyAlternative();
2015 }
2016
2017 void alternativeBodyDisjunction(bool onceThrough)
2018 {
2019 unsigned newAlternativeIndex = m_bodyDisjunction->terms.size();
2020 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
2021 m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough));
2022
2023 m_currentAlternativeIndex = newAlternativeIndex;
2024 }
2025
2026 void alternativeDisjunction()
2027 {
2028 unsigned newAlternativeIndex = m_bodyDisjunction->terms.size();
2029 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
2030 m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction());
2031
2032 m_currentAlternativeIndex = newAlternativeIndex;
2033 }
2034
2035 Optional<ErrorCode> WARN_UNUSED_RETURN emitDisjunction(PatternDisjunction* disjunction, Checked<unsigned, RecordOverflow> inputCountAlreadyChecked, unsigned parenthesesInputCountAlreadyChecked)
2036 {
2037 if (UNLIKELY(!isSafeToRecurse()))
2038 return ErrorCode::TooManyDisjunctions;
2039
2040 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
2041 auto currentCountAlreadyChecked = inputCountAlreadyChecked;
2042
2043 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
2044
2045 if (alt) {
2046 if (disjunction == m_pattern.m_body)
2047 alternativeBodyDisjunction(alternative->onceThrough());
2048 else
2049 alternativeDisjunction();
2050 }
2051
2052 unsigned minimumSize = alternative->m_minimumSize;
2053 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
2054 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
2055
2056 if (countToCheck) {
2057 checkInput(countToCheck);
2058 currentCountAlreadyChecked += countToCheck;
2059 if (currentCountAlreadyChecked.hasOverflowed())
2060 return ErrorCode::OffsetTooLarge;
2061 }
2062
2063 for (auto& term : alternative->m_terms) {
2064 switch (term.type) {
2065 case PatternTerm::TypeAssertionBOL:
2066 assertionBOL((currentCountAlreadyChecked - term.inputPosition).unsafeGet());
2067 break;
2068
2069 case PatternTerm::TypeAssertionEOL:
2070 assertionEOL((currentCountAlreadyChecked - term.inputPosition).unsafeGet());
2071 break;
2072
2073 case PatternTerm::TypeAssertionWordBoundary:
2074 assertionWordBoundary(term.invert(), (currentCountAlreadyChecked - term.inputPosition).unsafeGet());
2075 break;
2076
2077 case PatternTerm::TypePatternCharacter:
2078 atomPatternCharacter(term.patternCharacter, (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
2079 break;
2080
2081 case PatternTerm::TypeCharacterClass:
2082 atomCharacterClass(term.characterClass, term.invert(), (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
2083 break;
2084
2085 case PatternTerm::TypeBackReference:
2086 atomBackReference(term.backReferenceSubpatternId, (currentCountAlreadyChecked - term.inputPosition).unsafeGet(), term.frameLocation, term.quantityMaxCount, term.quantityType);
2087 break;
2088
2089 case PatternTerm::TypeForwardReference:
2090 break;
2091
2092 case PatternTerm::TypeParenthesesSubpattern: {
2093 unsigned disjunctionAlreadyCheckedCount = 0;
2094 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
2095 unsigned alternativeFrameLocation = term.frameLocation;
2096 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
2097 if (term.quantityType == QuantifierFixedCount)
2098 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
2099 else
2100 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
2101 unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2102 atomParenthesesOnceBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation);
2103 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount))
2104 return error;
2105 atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2106 } else if (term.parentheses.isTerminal) {
2107 unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2108 atomParenthesesTerminalBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal);
2109 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount))
2110 return error;
2111 atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType);
2112 } else {
2113 unsigned delegateEndInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2114 atomParenthesesSubpatternBegin(term.parentheses.subpatternId, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0);
2115 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, 0))
2116 return error;
2117 atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize);
2118 }
2119 break;
2120 }
2121
2122 case PatternTerm::TypeParentheticalAssertion: {
2123 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
2124 unsigned positiveInputOffset = (currentCountAlreadyChecked - term.inputPosition).unsafeGet();
2125 unsigned uncheckAmount = 0;
2126 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
2127 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
2128 uncheckInput(uncheckAmount);
2129 currentCountAlreadyChecked -= uncheckAmount;
2130 if (currentCountAlreadyChecked.hasOverflowed())
2131 return ErrorCode::OffsetTooLarge;
2132 }
2133
2134 atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.frameLocation, alternativeFrameLocation);
2135 if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount))
2136 return error;
2137 atomParentheticalAssertionEnd(0, term.frameLocation, term.quantityMaxCount, term.quantityType);
2138 if (uncheckAmount) {
2139 checkInput(uncheckAmount);
2140 currentCountAlreadyChecked += uncheckAmount;
2141 if (currentCountAlreadyChecked.hasOverflowed())
2142 return ErrorCode::OffsetTooLarge;
2143 }
2144 break;
2145 }
2146
2147 case PatternTerm::TypeDotStarEnclosure:
2148 assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor);
2149 break;
2150 }
2151 }
2152 }
2153 return WTF::nullopt;
2154 }
2155#ifndef NDEBUG
2156 void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0)
2157 {
2158 PrintStream& out = WTF::dataFile();
2159
2160 unsigned termIndexNest = 0;
2161
2162 if (!nesting) {
2163 out.printf("ByteDisjunction(%p):\n", disjunction);
2164 nesting = 1;
2165 } else {
2166 termIndexNest = nesting - 1;
2167 nesting = 2;
2168 }
2169
2170 auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) {
2171 for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++)
2172 out.print(" ");
2173 out.printf("%4zu", index);
2174 for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++)
2175 out.print(" ");
2176 };
2177
2178 auto dumpQuantity = [&](ByteTerm& term) {
2179 if (term.atom.quantityType == QuantifierFixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1)
2180 return;
2181
2182 out.print(" {", term.atom.quantityMinCount);
2183 if (term.atom.quantityMinCount != term.atom.quantityMaxCount) {
2184 if (term.atom.quantityMaxCount == UINT_MAX)
2185 out.print(",inf");
2186 else
2187 out.print(",", term.atom.quantityMaxCount);
2188 }
2189 out.print("}");
2190 if (term.atom.quantityType == QuantifierGreedy)
2191 out.print(" greedy");
2192 else if (term.atom.quantityType == QuantifierNonGreedy)
2193 out.print(" non-greedy");
2194 };
2195
2196 auto dumpCaptured = [&](ByteTerm& term) {
2197 if (term.capture())
2198 out.print(" captured (#", term.atom.subpatternId, ")");
2199 };
2200
2201 auto dumpInverted = [&](ByteTerm& term) {
2202 if (term.invert())
2203 out.print(" inverted");
2204 };
2205
2206 auto dumpInputPosition = [&](ByteTerm& term) {
2207 out.printf(" inputPosition %u", term.inputPosition);
2208 };
2209
2210 auto dumpFrameLocation = [&](ByteTerm& term) {
2211 out.printf(" frameLocation %u", term.frameLocation);
2212 };
2213
2214 auto dumpCharacter = [&](ByteTerm& term) {
2215 out.print(" ");
2216 dumpUChar32(out, term.atom.patternCharacter);
2217 };
2218
2219 auto dumpCharClass = [&](ByteTerm& term) {
2220 out.print(" ");
2221 dumpCharacterClass(out, &m_pattern, term.atom.characterClass);
2222 };
2223
2224 for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) {
2225 ByteTerm term = disjunction->terms[idx];
2226
2227 bool outputNewline = true;
2228
2229 switch (term.type) {
2230 case ByteTerm::TypeBodyAlternativeBegin:
2231 outputTermIndexAndNest(idx, nesting++);
2232 out.print("BodyAlternativeBegin");
2233 if (term.alternative.onceThrough)
2234 out.print(" onceThrough");
2235 dumpFrameLocation(term);
2236 break;
2237 case ByteTerm::TypeBodyAlternativeDisjunction:
2238 outputTermIndexAndNest(idx, nesting - 1);
2239 out.print("BodyAlternativeDisjunction");
2240 dumpFrameLocation(term);
2241 break;
2242 case ByteTerm::TypeBodyAlternativeEnd:
2243 outputTermIndexAndNest(idx, --nesting);
2244 out.print("BodyAlternativeEnd");
2245 dumpFrameLocation(term);
2246 break;
2247 case ByteTerm::TypeAlternativeBegin:
2248 outputTermIndexAndNest(idx, nesting++);
2249 out.print("AlternativeBegin");
2250 dumpFrameLocation(term);
2251 break;
2252 case ByteTerm::TypeAlternativeDisjunction:
2253 outputTermIndexAndNest(idx, nesting - 1);
2254 out.print("AlternativeDisjunction");
2255 dumpFrameLocation(term);
2256 break;
2257 case ByteTerm::TypeAlternativeEnd:
2258 outputTermIndexAndNest(idx, --nesting);
2259 out.print("AlternativeEnd");
2260 dumpFrameLocation(term);
2261 break;
2262 case ByteTerm::TypeSubpatternBegin:
2263 outputTermIndexAndNest(idx, nesting++);
2264 out.print("SubpatternBegin");
2265 break;
2266 case ByteTerm::TypeSubpatternEnd:
2267 outputTermIndexAndNest(idx, --nesting);
2268 out.print("SubpatternEnd");
2269 break;
2270 case ByteTerm::TypeAssertionBOL:
2271 outputTermIndexAndNest(idx, nesting);
2272 out.print("AssertionBOL");
2273 break;
2274 case ByteTerm::TypeAssertionEOL:
2275 outputTermIndexAndNest(idx, nesting);
2276 out.print("AssertionEOL");
2277 break;
2278 case ByteTerm::TypeAssertionWordBoundary:
2279 outputTermIndexAndNest(idx, nesting);
2280 out.print("AssertionWordBoundary");
2281 break;
2282 case ByteTerm::TypePatternCharacterOnce:
2283 outputTermIndexAndNest(idx, nesting);
2284 out.print("PatternCharacterOnce");
2285 dumpInverted(term);
2286 dumpInputPosition(term);
2287 dumpFrameLocation(term);
2288 dumpCharacter(term);
2289 dumpQuantity(term);
2290 break;
2291 case ByteTerm::TypePatternCharacterFixed:
2292 outputTermIndexAndNest(idx, nesting);
2293 out.print("PatternCharacterFixed");
2294 dumpInverted(term);
2295 dumpInputPosition(term);
2296 dumpFrameLocation(term);
2297 dumpCharacter(term);
2298 out.print(" {", term.atom.quantityMinCount, "}");
2299 break;
2300 case ByteTerm::TypePatternCharacterGreedy:
2301 outputTermIndexAndNest(idx, nesting);
2302 out.print("PatternCharacterGreedy");
2303 dumpInverted(term);
2304 dumpInputPosition(term);
2305 dumpFrameLocation(term);
2306 dumpCharacter(term);
2307 dumpQuantity(term);
2308 break;
2309 case ByteTerm::TypePatternCharacterNonGreedy:
2310 outputTermIndexAndNest(idx, nesting);
2311 out.print("PatternCharacterNonGreedy");
2312 dumpInverted(term);
2313 dumpInputPosition(term);
2314 dumpFrameLocation(term);
2315 dumpCharacter(term);
2316 dumpQuantity(term);
2317 break;
2318 case ByteTerm::TypePatternCasedCharacterOnce:
2319 outputTermIndexAndNest(idx, nesting);
2320 out.print("PatternCasedCharacterOnce");
2321 break;
2322 case ByteTerm::TypePatternCasedCharacterFixed:
2323 outputTermIndexAndNest(idx, nesting);
2324 out.print("PatternCasedCharacterFixed");
2325 break;
2326 case ByteTerm::TypePatternCasedCharacterGreedy:
2327 outputTermIndexAndNest(idx, nesting);
2328 out.print("PatternCasedCharacterGreedy");
2329 break;
2330 case ByteTerm::TypePatternCasedCharacterNonGreedy:
2331 outputTermIndexAndNest(idx, nesting);
2332 out.print("PatternCasedCharacterNonGreedy");
2333 break;
2334 case ByteTerm::TypeCharacterClass:
2335 outputTermIndexAndNest(idx, nesting);
2336 out.print("CharacterClass");
2337 dumpInverted(term);
2338 dumpInputPosition(term);
2339 dumpFrameLocation(term);
2340 dumpCharClass(term);
2341 dumpQuantity(term);
2342 break;
2343 case ByteTerm::TypeBackReference:
2344 outputTermIndexAndNest(idx, nesting);
2345 out.print("BackReference #", term.atom.subpatternId);
2346 dumpQuantity(term);
2347 break;
2348 case ByteTerm::TypeParenthesesSubpattern:
2349 outputTermIndexAndNest(idx, nesting);
2350 out.print("ParenthesesSubpattern");
2351 dumpCaptured(term);
2352 dumpInverted(term);
2353 dumpInputPosition(term);
2354 dumpFrameLocation(term);
2355 dumpQuantity(term);
2356 out.print("\n");
2357 outputNewline = false;
2358 dumpDisjunction(term.atom.parenthesesDisjunction, nesting);
2359 break;
2360 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
2361 outputTermIndexAndNest(idx, nesting++);
2362 out.print("ParenthesesSubpatternOnceBegin");
2363 dumpCaptured(term);
2364 dumpInverted(term);
2365 dumpInputPosition(term);
2366 dumpFrameLocation(term);
2367 break;
2368 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
2369 outputTermIndexAndNest(idx, --nesting);
2370 out.print("ParenthesesSubpatternOnceEnd");
2371 dumpFrameLocation(term);
2372 break;
2373 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
2374 outputTermIndexAndNest(idx, nesting++);
2375 out.print("ParenthesesSubpatternTerminalBegin");
2376 dumpInverted(term);
2377 dumpInputPosition(term);
2378 dumpFrameLocation(term);
2379 break;
2380 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
2381 outputTermIndexAndNest(idx, --nesting);
2382 out.print("ParenthesesSubpatternTerminalEnd");
2383 dumpFrameLocation(term);
2384 break;
2385 case ByteTerm::TypeParentheticalAssertionBegin:
2386 outputTermIndexAndNest(idx, nesting++);
2387 out.print("ParentheticalAssertionBegin");
2388 dumpInverted(term);
2389 dumpInputPosition(term);
2390 dumpFrameLocation(term);
2391 break;
2392 case ByteTerm::TypeParentheticalAssertionEnd:
2393 outputTermIndexAndNest(idx, --nesting);
2394 out.print("ParentheticalAssertionEnd");
2395 dumpFrameLocation(term);
2396 break;
2397 case ByteTerm::TypeCheckInput:
2398 outputTermIndexAndNest(idx, nesting);
2399 out.print("CheckInput ", term.checkInputCount);
2400 break;
2401 case ByteTerm::TypeUncheckInput:
2402 outputTermIndexAndNest(idx, nesting);
2403 out.print("UncheckInput ", term.checkInputCount);
2404 break;
2405 case ByteTerm::TypeDotStarEnclosure:
2406 outputTermIndexAndNest(idx, nesting);
2407 out.print("DotStarEnclosure");
2408 break;
2409 }
2410 if (outputNewline)
2411 out.print("\n");
2412 }
2413 }
2414#endif
2415
2416private:
2417 inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); }
2418
2419 YarrPattern& m_pattern;
2420 std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
2421 StackCheck m_stackCheck;
2422 unsigned m_currentAlternativeIndex { 0 };
2423 Vector<ParenthesesStackEntry> m_parenthesesStack;
2424 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
2425};
2426
2427std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ErrorCode& errorCode, ConcurrentJSLock* lock)
2428{
2429 return ByteCompiler(pattern).compile(allocator, lock, errorCode);
2430}
2431
2432unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
2433{
2434 SuperSamplerScope superSamplerScope(false);
2435 if (input.is8Bit())
2436 return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret();
2437 return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret();
2438}
2439
2440unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
2441{
2442 SuperSamplerScope superSamplerScope(false);
2443 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
2444}
2445
2446unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
2447{
2448 SuperSamplerScope superSamplerScope(false);
2449 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
2450}
2451
2452// These should be the same for both UChar & LChar.
2453COMPILE_ASSERT(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
2454COMPILE_ASSERT(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
2455COMPILE_ASSERT(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
2456COMPILE_ASSERT(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
2457COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
2458COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
2459COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
2460
2461
2462} }
2463