1 | // Copyright 2016 the V8 project authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be |
3 | // found in the LICENSE file. |
4 | |
5 | #ifndef V8_REGEXP_REGEXP_PARSER_H_ |
6 | #define V8_REGEXP_REGEXP_PARSER_H_ |
7 | |
8 | #include "src/objects.h" |
9 | #include "src/objects/js-regexp.h" |
10 | #include "src/regexp/regexp-ast.h" |
11 | #include "src/zone/zone.h" |
12 | |
13 | namespace v8 { |
14 | namespace internal { |
15 | |
16 | struct RegExpCompileData; |
17 | |
18 | // A BufferedZoneList is an automatically growing list, just like (and backed |
19 | // by) a ZoneList, that is optimized for the case of adding and removing |
20 | // a single element. The last element added is stored outside the backing list, |
21 | // and if no more than one element is ever added, the ZoneList isn't even |
22 | // allocated. |
23 | // Elements must not be nullptr pointers. |
24 | template <typename T, int initial_size> |
25 | class BufferedZoneList { |
26 | public: |
27 | BufferedZoneList() : list_(nullptr), last_(nullptr) {} |
28 | |
29 | // Adds element at end of list. This element is buffered and can |
30 | // be read using last() or removed using RemoveLast until a new Add or until |
31 | // RemoveLast or GetList has been called. |
32 | void Add(T* value, Zone* zone) { |
33 | if (last_ != nullptr) { |
34 | if (list_ == nullptr) { |
35 | list_ = new (zone) ZoneList<T*>(initial_size, zone); |
36 | } |
37 | list_->Add(last_, zone); |
38 | } |
39 | last_ = value; |
40 | } |
41 | |
42 | T* last() { |
43 | DCHECK(last_ != nullptr); |
44 | return last_; |
45 | } |
46 | |
47 | T* RemoveLast() { |
48 | DCHECK(last_ != nullptr); |
49 | T* result = last_; |
50 | if ((list_ != nullptr) && (list_->length() > 0)) |
51 | last_ = list_->RemoveLast(); |
52 | else |
53 | last_ = nullptr; |
54 | return result; |
55 | } |
56 | |
57 | T* Get(int i) { |
58 | DCHECK((0 <= i) && (i < length())); |
59 | if (list_ == nullptr) { |
60 | DCHECK_EQ(0, i); |
61 | return last_; |
62 | } else { |
63 | if (i == list_->length()) { |
64 | DCHECK(last_ != nullptr); |
65 | return last_; |
66 | } else { |
67 | return list_->at(i); |
68 | } |
69 | } |
70 | } |
71 | |
72 | void Clear() { |
73 | list_ = nullptr; |
74 | last_ = nullptr; |
75 | } |
76 | |
77 | int length() { |
78 | int length = (list_ == nullptr) ? 0 : list_->length(); |
79 | return length + ((last_ == nullptr) ? 0 : 1); |
80 | } |
81 | |
82 | ZoneList<T*>* GetList(Zone* zone) { |
83 | if (list_ == nullptr) { |
84 | list_ = new (zone) ZoneList<T*>(initial_size, zone); |
85 | } |
86 | if (last_ != nullptr) { |
87 | list_->Add(last_, zone); |
88 | last_ = nullptr; |
89 | } |
90 | return list_; |
91 | } |
92 | |
93 | private: |
94 | ZoneList<T*>* list_; |
95 | T* last_; |
96 | }; |
97 | |
98 | |
99 | // Accumulates RegExp atoms and assertions into lists of terms and alternatives. |
100 | class RegExpBuilder : public ZoneObject { |
101 | public: |
102 | RegExpBuilder(Zone* zone, JSRegExp::Flags flags); |
103 | void AddCharacter(uc16 character); |
104 | void AddUnicodeCharacter(uc32 character); |
105 | void AddEscapedUnicodeCharacter(uc32 character); |
106 | // "Adds" an empty expression. Does nothing except consume a |
107 | // following quantifier |
108 | void AddEmpty(); |
109 | void AddCharacterClass(RegExpCharacterClass* cc); |
110 | void AddCharacterClassForDesugaring(uc32 c); |
111 | void AddAtom(RegExpTree* tree); |
112 | void AddTerm(RegExpTree* tree); |
113 | void AddAssertion(RegExpTree* tree); |
114 | void NewAlternative(); // '|' |
115 | bool AddQuantifierToAtom(int min, int max, |
116 | RegExpQuantifier::QuantifierType type); |
117 | void FlushText(); |
118 | RegExpTree* ToRegExp(); |
119 | JSRegExp::Flags flags() const { return flags_; } |
120 | void set_flags(JSRegExp::Flags flags) { flags_ = flags; } |
121 | |
122 | bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } |
123 | bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } |
124 | bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } |
125 | |
126 | private: |
127 | static const uc16 kNoPendingSurrogate = 0; |
128 | void AddLeadSurrogate(uc16 lead_surrogate); |
129 | void AddTrailSurrogate(uc16 trail_surrogate); |
130 | void FlushPendingSurrogate(); |
131 | void FlushCharacters(); |
132 | void FlushTerms(); |
133 | bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); |
134 | bool NeedsDesugaringForIgnoreCase(uc32 c); |
135 | Zone* zone() const { return zone_; } |
136 | bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } |
137 | |
138 | Zone* zone_; |
139 | bool pending_empty_; |
140 | JSRegExp::Flags flags_; |
141 | ZoneList<uc16>* characters_; |
142 | uc16 pending_surrogate_; |
143 | BufferedZoneList<RegExpTree, 2> terms_; |
144 | BufferedZoneList<RegExpTree, 2> text_; |
145 | BufferedZoneList<RegExpTree, 2> alternatives_; |
146 | #ifdef DEBUG |
147 | enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; |
148 | #define LAST(x) last_added_ = x; |
149 | #else |
150 | #define LAST(x) |
151 | #endif |
152 | }; |
153 | |
154 | class V8_EXPORT_PRIVATE RegExpParser { |
155 | public: |
156 | RegExpParser(FlatStringReader* in, Handle<String>* error, |
157 | JSRegExp::Flags flags, Isolate* isolate, Zone* zone); |
158 | |
159 | static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, |
160 | JSRegExp::Flags flags, RegExpCompileData* result); |
161 | |
162 | RegExpTree* ParsePattern(); |
163 | RegExpTree* ParseDisjunction(); |
164 | RegExpTree* ParseGroup(); |
165 | |
166 | // Parses a {...,...} quantifier and stores the range in the given |
167 | // out parameters. |
168 | bool ParseIntervalQuantifier(int* min_out, int* max_out); |
169 | |
170 | // Parses and returns a single escaped character. The character |
171 | // must not be 'b' or 'B' since they are usually handle specially. |
172 | uc32 ParseClassCharacterEscape(); |
173 | |
174 | // Checks whether the following is a length-digit hexadecimal number, |
175 | // and sets the value if it is. |
176 | bool ParseHexEscape(int length, uc32* value); |
177 | bool ParseUnicodeEscape(uc32* value); |
178 | bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); |
179 | |
180 | bool ParsePropertyClassName(std::vector<char>* name_1, |
181 | std::vector<char>* name_2); |
182 | bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, |
183 | const std::vector<char>& name_1, |
184 | const std::vector<char>& name_2); |
185 | |
186 | RegExpTree* GetPropertySequence(const std::vector<char>& name_1); |
187 | RegExpTree* ParseCharacterClass(const RegExpBuilder* state); |
188 | |
189 | uc32 ParseOctalLiteral(); |
190 | |
191 | // Tries to parse the input as a back reference. If successful it |
192 | // stores the result in the output parameter and returns true. If |
193 | // it fails it will push back the characters read so the same characters |
194 | // can be reparsed. |
195 | bool ParseBackReferenceIndex(int* index_out); |
196 | |
197 | // Parse inside a class. Either add escaped class to the range, or return |
198 | // false and pass parsed single character through |char_out|. |
199 | void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, |
200 | bool add_unicode_case_equivalents, uc32* char_out, |
201 | bool* is_class_escape); |
202 | |
203 | char ParseClassEscape(); |
204 | |
205 | RegExpTree* ReportError(Vector<const char> message); |
206 | void Advance(); |
207 | void Advance(int dist); |
208 | void Reset(int pos); |
209 | |
210 | // Reports whether the pattern might be used as a literal search string. |
211 | // Only use if the result of the parse is a single atom node. |
212 | bool simple(); |
213 | bool contains_anchor() { return contains_anchor_; } |
214 | void set_contains_anchor() { contains_anchor_ = true; } |
215 | int captures_started() { return captures_started_; } |
216 | int position() { return next_pos_ - 1; } |
217 | bool failed() { return failed_; } |
218 | // The Unicode flag can't be changed using in-regexp syntax, so it's OK to |
219 | // just read the initial flag value here. |
220 | bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } |
221 | |
222 | static bool IsSyntaxCharacterOrSlash(uc32 c); |
223 | |
224 | static const int kMaxCaptures = 1 << 16; |
225 | static const uc32 kEndMarker = (1 << 21); |
226 | |
227 | private: |
228 | enum SubexpressionType { |
229 | INITIAL, |
230 | CAPTURE, // All positive values represent captures. |
231 | POSITIVE_LOOKAROUND, |
232 | NEGATIVE_LOOKAROUND, |
233 | GROUPING |
234 | }; |
235 | |
236 | class RegExpParserState : public ZoneObject { |
237 | public: |
238 | // Push a state on the stack. |
239 | RegExpParserState(RegExpParserState* previous_state, |
240 | SubexpressionType group_type, |
241 | RegExpLookaround::Type lookaround_type, |
242 | int disjunction_capture_index, |
243 | const ZoneVector<uc16>* capture_name, |
244 | JSRegExp::Flags flags, Zone* zone) |
245 | : previous_state_(previous_state), |
246 | builder_(new (zone) RegExpBuilder(zone, flags)), |
247 | group_type_(group_type), |
248 | lookaround_type_(lookaround_type), |
249 | disjunction_capture_index_(disjunction_capture_index), |
250 | capture_name_(capture_name) {} |
251 | // Parser state of containing expression, if any. |
252 | RegExpParserState* previous_state() const { return previous_state_; } |
253 | bool IsSubexpression() { return previous_state_ != nullptr; } |
254 | // RegExpBuilder building this regexp's AST. |
255 | RegExpBuilder* builder() const { return builder_; } |
256 | // Type of regexp being parsed (parenthesized group or entire regexp). |
257 | SubexpressionType group_type() const { return group_type_; } |
258 | // Lookahead or Lookbehind. |
259 | RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } |
260 | // Index in captures array of first capture in this sub-expression, if any. |
261 | // Also the capture index of this sub-expression itself, if group_type |
262 | // is CAPTURE. |
263 | int capture_index() const { return disjunction_capture_index_; } |
264 | // The name of the current sub-expression, if group_type is CAPTURE. Only |
265 | // used for named captures. |
266 | const ZoneVector<uc16>* capture_name() const { return capture_name_; } |
267 | |
268 | bool IsNamedCapture() const { return capture_name_ != nullptr; } |
269 | |
270 | // Check whether the parser is inside a capture group with the given index. |
271 | bool IsInsideCaptureGroup(int index); |
272 | // Check whether the parser is inside a capture group with the given name. |
273 | bool IsInsideCaptureGroup(const ZoneVector<uc16>* name); |
274 | |
275 | private: |
276 | // Linked list implementation of stack of states. |
277 | RegExpParserState* const previous_state_; |
278 | // Builder for the stored disjunction. |
279 | RegExpBuilder* const builder_; |
280 | // Stored disjunction type (capture, look-ahead or grouping), if any. |
281 | const SubexpressionType group_type_; |
282 | // Stored read direction. |
283 | const RegExpLookaround::Type lookaround_type_; |
284 | // Stored disjunction's capture index (if any). |
285 | const int disjunction_capture_index_; |
286 | // Stored capture name (if any). |
287 | const ZoneVector<uc16>* const capture_name_; |
288 | }; |
289 | |
290 | // Return the 1-indexed RegExpCapture object, allocate if necessary. |
291 | RegExpCapture* GetCapture(int index); |
292 | |
293 | // Creates a new named capture at the specified index. Must be called exactly |
294 | // once for each named capture. Fails if a capture with the same name is |
295 | // encountered. |
296 | bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index); |
297 | |
298 | // Parses the name of a capture group (?<name>pattern). The name must adhere |
299 | // to IdentifierName in the ECMAScript standard. |
300 | const ZoneVector<uc16>* ParseCaptureGroupName(); |
301 | |
302 | bool ParseNamedBackReference(RegExpBuilder* builder, |
303 | RegExpParserState* state); |
304 | RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); |
305 | |
306 | // After the initial parsing pass, patch corresponding RegExpCapture objects |
307 | // into all RegExpBackReferences. This is done after initial parsing in order |
308 | // to avoid complicating cases in which references comes before the capture. |
309 | void PatchNamedBackReferences(); |
310 | |
311 | Handle<FixedArray> CreateCaptureNameMap(); |
312 | |
313 | // Returns true iff the pattern contains named captures. May call |
314 | // ScanForCaptures to look ahead at the remaining pattern. |
315 | bool HasNamedCaptures(); |
316 | |
317 | Isolate* isolate() { return isolate_; } |
318 | Zone* zone() const { return zone_; } |
319 | |
320 | uc32 current() { return current_; } |
321 | bool has_more() { return has_more_; } |
322 | bool has_next() { return next_pos_ < in()->length(); } |
323 | uc32 Next(); |
324 | template <bool update_position> |
325 | uc32 ReadNext(); |
326 | FlatStringReader* in() { return in_; } |
327 | void ScanForCaptures(); |
328 | |
329 | Isolate* isolate_; |
330 | Zone* zone_; |
331 | Handle<String>* error_; |
332 | ZoneList<RegExpCapture*>* captures_; |
333 | ZoneList<RegExpCapture*>* named_captures_; |
334 | ZoneList<RegExpBackReference*>* named_back_references_; |
335 | FlatStringReader* in_; |
336 | uc32 current_; |
337 | // These are the flags specified outside the regexp syntax ie after the |
338 | // terminating '/' or in the second argument to the constructor. The current |
339 | // flags are stored on the RegExpBuilder. |
340 | JSRegExp::Flags top_level_flags_; |
341 | int next_pos_; |
342 | int captures_started_; |
343 | int capture_count_; // Only valid after we have scanned for captures. |
344 | bool has_more_; |
345 | bool simple_; |
346 | bool contains_anchor_; |
347 | bool is_scanned_for_captures_; |
348 | bool has_named_captures_; // Only valid after we have scanned for captures. |
349 | bool failed_; |
350 | }; |
351 | |
352 | } // namespace internal |
353 | } // namespace v8 |
354 | |
355 | #endif // V8_REGEXP_REGEXP_PARSER_H_ |
356 | |