1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_PARSER_H_
6#define V8_REGEXP_REGEXP_PARSER_H_
7
8#include "src/objects.h"
9#include "src/objects/js-regexp.h"
10#include "src/regexp/regexp-ast.h"
11#include "src/zone/zone.h"
12
13namespace v8 {
14namespace internal {
15
16struct RegExpCompileData;
17
18// A BufferedZoneList is an automatically growing list, just like (and backed
19// by) a ZoneList, that is optimized for the case of adding and removing
20// a single element. The last element added is stored outside the backing list,
21// and if no more than one element is ever added, the ZoneList isn't even
22// allocated.
23// Elements must not be nullptr pointers.
24template <typename T, int initial_size>
25class BufferedZoneList {
26 public:
27 BufferedZoneList() : list_(nullptr), last_(nullptr) {}
28
29 // Adds element at end of list. This element is buffered and can
30 // be read using last() or removed using RemoveLast until a new Add or until
31 // RemoveLast or GetList has been called.
32 void Add(T* value, Zone* zone) {
33 if (last_ != nullptr) {
34 if (list_ == nullptr) {
35 list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 }
37 list_->Add(last_, zone);
38 }
39 last_ = value;
40 }
41
42 T* last() {
43 DCHECK(last_ != nullptr);
44 return last_;
45 }
46
47 T* RemoveLast() {
48 DCHECK(last_ != nullptr);
49 T* result = last_;
50 if ((list_ != nullptr) && (list_->length() > 0))
51 last_ = list_->RemoveLast();
52 else
53 last_ = nullptr;
54 return result;
55 }
56
57 T* Get(int i) {
58 DCHECK((0 <= i) && (i < length()));
59 if (list_ == nullptr) {
60 DCHECK_EQ(0, i);
61 return last_;
62 } else {
63 if (i == list_->length()) {
64 DCHECK(last_ != nullptr);
65 return last_;
66 } else {
67 return list_->at(i);
68 }
69 }
70 }
71
72 void Clear() {
73 list_ = nullptr;
74 last_ = nullptr;
75 }
76
77 int length() {
78 int length = (list_ == nullptr) ? 0 : list_->length();
79 return length + ((last_ == nullptr) ? 0 : 1);
80 }
81
82 ZoneList<T*>* GetList(Zone* zone) {
83 if (list_ == nullptr) {
84 list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 }
86 if (last_ != nullptr) {
87 list_->Add(last_, zone);
88 last_ = nullptr;
89 }
90 return list_;
91 }
92
93 private:
94 ZoneList<T*>* list_;
95 T* last_;
96};
97
98
99// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100class RegExpBuilder : public ZoneObject {
101 public:
102 RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
103 void AddCharacter(uc16 character);
104 void AddUnicodeCharacter(uc32 character);
105 void AddEscapedUnicodeCharacter(uc32 character);
106 // "Adds" an empty expression. Does nothing except consume a
107 // following quantifier
108 void AddEmpty();
109 void AddCharacterClass(RegExpCharacterClass* cc);
110 void AddCharacterClassForDesugaring(uc32 c);
111 void AddAtom(RegExpTree* tree);
112 void AddTerm(RegExpTree* tree);
113 void AddAssertion(RegExpTree* tree);
114 void NewAlternative(); // '|'
115 bool AddQuantifierToAtom(int min, int max,
116 RegExpQuantifier::QuantifierType type);
117 void FlushText();
118 RegExpTree* ToRegExp();
119 JSRegExp::Flags flags() const { return flags_; }
120 void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
121
122 bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
123 bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
124 bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
125
126 private:
127 static const uc16 kNoPendingSurrogate = 0;
128 void AddLeadSurrogate(uc16 lead_surrogate);
129 void AddTrailSurrogate(uc16 trail_surrogate);
130 void FlushPendingSurrogate();
131 void FlushCharacters();
132 void FlushTerms();
133 bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
134 bool NeedsDesugaringForIgnoreCase(uc32 c);
135 Zone* zone() const { return zone_; }
136 bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
137
138 Zone* zone_;
139 bool pending_empty_;
140 JSRegExp::Flags flags_;
141 ZoneList<uc16>* characters_;
142 uc16 pending_surrogate_;
143 BufferedZoneList<RegExpTree, 2> terms_;
144 BufferedZoneList<RegExpTree, 2> text_;
145 BufferedZoneList<RegExpTree, 2> alternatives_;
146#ifdef DEBUG
147 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
148#define LAST(x) last_added_ = x;
149#else
150#define LAST(x)
151#endif
152};
153
154class V8_EXPORT_PRIVATE RegExpParser {
155 public:
156 RegExpParser(FlatStringReader* in, Handle<String>* error,
157 JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
158
159 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
160 JSRegExp::Flags flags, RegExpCompileData* result);
161
162 RegExpTree* ParsePattern();
163 RegExpTree* ParseDisjunction();
164 RegExpTree* ParseGroup();
165
166 // Parses a {...,...} quantifier and stores the range in the given
167 // out parameters.
168 bool ParseIntervalQuantifier(int* min_out, int* max_out);
169
170 // Parses and returns a single escaped character. The character
171 // must not be 'b' or 'B' since they are usually handle specially.
172 uc32 ParseClassCharacterEscape();
173
174 // Checks whether the following is a length-digit hexadecimal number,
175 // and sets the value if it is.
176 bool ParseHexEscape(int length, uc32* value);
177 bool ParseUnicodeEscape(uc32* value);
178 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
179
180 bool ParsePropertyClassName(std::vector<char>* name_1,
181 std::vector<char>* name_2);
182 bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
183 const std::vector<char>& name_1,
184 const std::vector<char>& name_2);
185
186 RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
187 RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
188
189 uc32 ParseOctalLiteral();
190
191 // Tries to parse the input as a back reference. If successful it
192 // stores the result in the output parameter and returns true. If
193 // it fails it will push back the characters read so the same characters
194 // can be reparsed.
195 bool ParseBackReferenceIndex(int* index_out);
196
197 // Parse inside a class. Either add escaped class to the range, or return
198 // false and pass parsed single character through |char_out|.
199 void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
200 bool add_unicode_case_equivalents, uc32* char_out,
201 bool* is_class_escape);
202
203 char ParseClassEscape();
204
205 RegExpTree* ReportError(Vector<const char> message);
206 void Advance();
207 void Advance(int dist);
208 void Reset(int pos);
209
210 // Reports whether the pattern might be used as a literal search string.
211 // Only use if the result of the parse is a single atom node.
212 bool simple();
213 bool contains_anchor() { return contains_anchor_; }
214 void set_contains_anchor() { contains_anchor_ = true; }
215 int captures_started() { return captures_started_; }
216 int position() { return next_pos_ - 1; }
217 bool failed() { return failed_; }
218 // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
219 // just read the initial flag value here.
220 bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
221
222 static bool IsSyntaxCharacterOrSlash(uc32 c);
223
224 static const int kMaxCaptures = 1 << 16;
225 static const uc32 kEndMarker = (1 << 21);
226
227 private:
228 enum SubexpressionType {
229 INITIAL,
230 CAPTURE, // All positive values represent captures.
231 POSITIVE_LOOKAROUND,
232 NEGATIVE_LOOKAROUND,
233 GROUPING
234 };
235
236 class RegExpParserState : public ZoneObject {
237 public:
238 // Push a state on the stack.
239 RegExpParserState(RegExpParserState* previous_state,
240 SubexpressionType group_type,
241 RegExpLookaround::Type lookaround_type,
242 int disjunction_capture_index,
243 const ZoneVector<uc16>* capture_name,
244 JSRegExp::Flags flags, Zone* zone)
245 : previous_state_(previous_state),
246 builder_(new (zone) RegExpBuilder(zone, flags)),
247 group_type_(group_type),
248 lookaround_type_(lookaround_type),
249 disjunction_capture_index_(disjunction_capture_index),
250 capture_name_(capture_name) {}
251 // Parser state of containing expression, if any.
252 RegExpParserState* previous_state() const { return previous_state_; }
253 bool IsSubexpression() { return previous_state_ != nullptr; }
254 // RegExpBuilder building this regexp's AST.
255 RegExpBuilder* builder() const { return builder_; }
256 // Type of regexp being parsed (parenthesized group or entire regexp).
257 SubexpressionType group_type() const { return group_type_; }
258 // Lookahead or Lookbehind.
259 RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
260 // Index in captures array of first capture in this sub-expression, if any.
261 // Also the capture index of this sub-expression itself, if group_type
262 // is CAPTURE.
263 int capture_index() const { return disjunction_capture_index_; }
264 // The name of the current sub-expression, if group_type is CAPTURE. Only
265 // used for named captures.
266 const ZoneVector<uc16>* capture_name() const { return capture_name_; }
267
268 bool IsNamedCapture() const { return capture_name_ != nullptr; }
269
270 // Check whether the parser is inside a capture group with the given index.
271 bool IsInsideCaptureGroup(int index);
272 // Check whether the parser is inside a capture group with the given name.
273 bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
274
275 private:
276 // Linked list implementation of stack of states.
277 RegExpParserState* const previous_state_;
278 // Builder for the stored disjunction.
279 RegExpBuilder* const builder_;
280 // Stored disjunction type (capture, look-ahead or grouping), if any.
281 const SubexpressionType group_type_;
282 // Stored read direction.
283 const RegExpLookaround::Type lookaround_type_;
284 // Stored disjunction's capture index (if any).
285 const int disjunction_capture_index_;
286 // Stored capture name (if any).
287 const ZoneVector<uc16>* const capture_name_;
288 };
289
290 // Return the 1-indexed RegExpCapture object, allocate if necessary.
291 RegExpCapture* GetCapture(int index);
292
293 // Creates a new named capture at the specified index. Must be called exactly
294 // once for each named capture. Fails if a capture with the same name is
295 // encountered.
296 bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
297
298 // Parses the name of a capture group (?<name>pattern). The name must adhere
299 // to IdentifierName in the ECMAScript standard.
300 const ZoneVector<uc16>* ParseCaptureGroupName();
301
302 bool ParseNamedBackReference(RegExpBuilder* builder,
303 RegExpParserState* state);
304 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
305
306 // After the initial parsing pass, patch corresponding RegExpCapture objects
307 // into all RegExpBackReferences. This is done after initial parsing in order
308 // to avoid complicating cases in which references comes before the capture.
309 void PatchNamedBackReferences();
310
311 Handle<FixedArray> CreateCaptureNameMap();
312
313 // Returns true iff the pattern contains named captures. May call
314 // ScanForCaptures to look ahead at the remaining pattern.
315 bool HasNamedCaptures();
316
317 Isolate* isolate() { return isolate_; }
318 Zone* zone() const { return zone_; }
319
320 uc32 current() { return current_; }
321 bool has_more() { return has_more_; }
322 bool has_next() { return next_pos_ < in()->length(); }
323 uc32 Next();
324 template <bool update_position>
325 uc32 ReadNext();
326 FlatStringReader* in() { return in_; }
327 void ScanForCaptures();
328
329 Isolate* isolate_;
330 Zone* zone_;
331 Handle<String>* error_;
332 ZoneList<RegExpCapture*>* captures_;
333 ZoneList<RegExpCapture*>* named_captures_;
334 ZoneList<RegExpBackReference*>* named_back_references_;
335 FlatStringReader* in_;
336 uc32 current_;
337 // These are the flags specified outside the regexp syntax ie after the
338 // terminating '/' or in the second argument to the constructor. The current
339 // flags are stored on the RegExpBuilder.
340 JSRegExp::Flags top_level_flags_;
341 int next_pos_;
342 int captures_started_;
343 int capture_count_; // Only valid after we have scanned for captures.
344 bool has_more_;
345 bool simple_;
346 bool contains_anchor_;
347 bool is_scanned_for_captures_;
348 bool has_named_captures_; // Only valid after we have scanned for captures.
349 bool failed_;
350};
351
352} // namespace internal
353} // namespace v8
354
355#endif // V8_REGEXP_REGEXP_PARSER_H_
356