regexp-parser.h source code [v8/src/regexp/regexp-parser.h]

1	// Copyright 2016 the V8 project authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style license that can be
3	// found in the LICENSE file.
4
5	#ifndef V8_REGEXP_REGEXP_PARSER_H_
6	#define V8_REGEXP_REGEXP_PARSER_H_
7
8	#include "src/objects.h"
9	#include "src/objects/js-regexp.h"
10	#include "src/regexp/regexp-ast.h"
11	#include "src/zone/zone.h"
12
13	namespace v8 {
14	namespace internal {
15
16	struct RegExpCompileData;
17
18	// A BufferedZoneList is an automatically growing list, just like (and backed
19	// by) a ZoneList, that is optimized for the case of adding and removing
20	// a single element. The last element added is stored outside the backing list,
21	// and if no more than one element is ever added, the ZoneList isn't even
22	// allocated.
23	// Elements must not be nullptr pointers.
24	template <typename T, int initial_size>
25	class BufferedZoneList {
26	public:
27	BufferedZoneList() : list_(nullptr), last_(nullptr) {}
28
29	// Adds element at end of list. This element is buffered and can
30	// be read using last() or removed using RemoveLast until a new Add or until
31	// RemoveLast or GetList has been called.
32	void Add(T* value, Zone* zone) {
33	if (last_ != nullptr) {
34	if (list_ == nullptr) {
35	list_ = new (zone) ZoneList<T*>(initial_size, zone);
36	}
37	list_->Add(last_, zone);
38	}
39	last_ = value;
40	}
41
42	T* last() {
43	DCHECK(last_ != nullptr);
44	return last_;
45	}
46
47	T* RemoveLast() {
48	DCHECK(last_ != nullptr);
49	T* result = last_;
50	if ((list_ != nullptr) && (list_->length() > `0`))
51	last_ = list_->RemoveLast();
52	else
53	last_ = nullptr;
54	return result;
55	}
56
57	T* Get(int i) {
58	DCHECK((`0` <= i) && (i < length()));
59	if (list_ == nullptr) {
60	DCHECK_EQ(`0`, i);
61	return last_;
62	} else {
63	if (i == list_->length()) {
64	DCHECK(last_ != nullptr);
65	return last_;
66	} else {
67	return list_->at(i);
68	}
69	}
70	}
71
72	void Clear() {
73	list_ = nullptr;
74	last_ = nullptr;
75	}
76
77	int length() {
78	int length = (list_ == nullptr) ? `0` : list_->length();
79	return length + ((last_ == nullptr) ? `0` : `1`);
80	}
81
82	ZoneList<T> GetList(Zone* zone) {
83	if (list_ == nullptr) {
84	list_ = new (zone) ZoneList<T*>(initial_size, zone);
85	}
86	if (last_ != nullptr) {
87	list_->Add(last_, zone);
88	last_ = nullptr;
89	}
90	return list_;
91	}
92
93	private:
94	ZoneList<T> list_;
95	T* last_;
96	};
97
98
99	// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100	class RegExpBuilder : public ZoneObject {
101	public:
102	RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
103	void AddCharacter(uc16 character);
104	void AddUnicodeCharacter(uc32 character);
105	void AddEscapedUnicodeCharacter(uc32 character);
106	// "Adds" an empty expression. Does nothing except consume a
107	// following quantifier
108	void AddEmpty();
109	void AddCharacterClass(RegExpCharacterClass* cc);
110	void AddCharacterClassForDesugaring(uc32 c);
111	void AddAtom(RegExpTree* tree);
112	void AddTerm(RegExpTree* tree);
113	void AddAssertion(RegExpTree* tree);
114	void NewAlternative(); // '\|'
115	bool AddQuantifierToAtom(int min, int max,
116	RegExpQuantifier::QuantifierType type);
117	void FlushText();
118	RegExpTree* ToRegExp();
119	JSRegExp::Flags flags() const { return flags_; }
120	void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
121
122	bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != `0`; }
123	bool multiline() const { return (flags_ & JSRegExp::kMultiline) != `0`; }
124	bool dotall() const { return (flags_ & JSRegExp::kDotAll) != `0`; }
125
126	private:
127	static const uc16 kNoPendingSurrogate = `0`;
128	void AddLeadSurrogate(uc16 lead_surrogate);
129	void AddTrailSurrogate(uc16 trail_surrogate);
130	void FlushPendingSurrogate();
131	void FlushCharacters();
132	void FlushTerms();
133	bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
134	bool NeedsDesugaringForIgnoreCase(uc32 c);
135	Zone* zone() const { return zone_; }
136	bool unicode() const { return (flags_ & JSRegExp::kUnicode) != `0`; }
137
138	Zone* zone_;
139	bool pending_empty_;
140	JSRegExp::Flags flags_;
141	ZoneList<uc16>* characters_;
142	uc16 pending_surrogate_;
143	BufferedZoneList<RegExpTree, `2`> terms_;
144	BufferedZoneList<RegExpTree, `2`> text_;
145	BufferedZoneList<RegExpTree, `2`> alternatives_;
146	#ifdef DEBUG
147	enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
148	#define LAST(x) last_added_ = x;
149	#else
150	#define LAST(x)
151	#endif
152	};
153
154	class V8_EXPORT_PRIVATE RegExpParser {
155	public:
156	RegExpParser(FlatStringReader* in, Handle<String>* error,
157	JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
158
159	static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
160	JSRegExp::Flags flags, RegExpCompileData* result);
161
162	RegExpTree* ParsePattern();
163	RegExpTree* ParseDisjunction();
164	RegExpTree* ParseGroup();
165
166	// Parses a {...,...} quantifier and stores the range in the given
167	// out parameters.
168	bool ParseIntervalQuantifier(int* min_out, int* max_out);
169
170	// Parses and returns a single escaped character. The character
171	// must not be 'b' or 'B' since they are usually handle specially.
172	uc32 ParseClassCharacterEscape();
173
174	// Checks whether the following is a length-digit hexadecimal number,
175	// and sets the value if it is.
176	bool ParseHexEscape(int length, uc32* value);
177	bool ParseUnicodeEscape(uc32* value);
178	bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
179
180	bool ParsePropertyClassName(std::vector<char>* name_1,
181	std::vector<char>* name_2);
182	bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
183	const std::vector<char>& name_1,
184	const std::vector<char>& name_2);
185
186	RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
187	RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
188
189	uc32 ParseOctalLiteral();
190
191	// Tries to parse the input as a back reference. If successful it
192	// stores the result in the output parameter and returns true. If
193	// it fails it will push back the characters read so the same characters
194	// can be reparsed.
195	bool ParseBackReferenceIndex(int* index_out);
196
197	// Parse inside a class. Either add escaped class to the range, or return
198	// false and pass parsed single character through \|char_out\|.
199	void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
200	bool add_unicode_case_equivalents, uc32* char_out,
201	bool* is_class_escape);
202
203	char ParseClassEscape();
204
205	RegExpTree* ReportError(Vector<const char> message);
206	void Advance();
207	void Advance(int dist);
208	void Reset(int pos);
209
210	// Reports whether the pattern might be used as a literal search string.
211	// Only use if the result of the parse is a single atom node.
212	bool simple();
213	bool contains_anchor() { return contains_anchor_; }
214	void set_contains_anchor() { contains_anchor_ = true; }
215	int captures_started() { return captures_started_; }
216	int position() { return next_pos_ - `1`; }
217	bool failed() { return failed_; }
218	// The Unicode flag can't be changed using in-regexp syntax, so it's OK to
219	// just read the initial flag value here.
220	bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != `0`; }
221
222	static bool IsSyntaxCharacterOrSlash(uc32 c);
223
224	static const int kMaxCaptures = `1` << `16`;
225	static const uc32 kEndMarker = (`1` << `21`);
226
227	private:
228	enum SubexpressionType {
229	INITIAL,
230	CAPTURE, // All positive values represent captures.
231	POSITIVE_LOOKAROUND,
232	NEGATIVE_LOOKAROUND,
233	GROUPING
234	};
235
236	class RegExpParserState : public ZoneObject {
237	public:
238	// Push a state on the stack.
239	RegExpParserState(RegExpParserState* previous_state,
240	SubexpressionType group_type,
241	RegExpLookaround::Type lookaround_type,
242	int disjunction_capture_index,
243	const ZoneVector<uc16>* capture_name,
244	JSRegExp::Flags flags, Zone* zone)
245	: previous_state_(previous_state),
246	builder_(new (zone) RegExpBuilder (zone, flags)),
247	group_type_(group_type),
248	lookaround_type_(lookaround_type),
249	disjunction_capture_index_(disjunction_capture_index),
250	capture_name_(capture_name) {}
251	// Parser state of containing expression, if any.
252	RegExpParserState* previous_state() const { return previous_state_; }
253	bool IsSubexpression() { return previous_state_ != nullptr; }
254	// RegExpBuilder building this regexp's AST.
255	RegExpBuilder* builder() const { return builder_; }
256	// Type of regexp being parsed (parenthesized group or entire regexp).
257	SubexpressionType group_type() const { return group_type_; }
258	// Lookahead or Lookbehind.
259	RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
260	// Index in captures array of first capture in this sub-expression, if any.
261	// Also the capture index of this sub-expression itself, if group_type
262	// is CAPTURE.
263	int capture_index() const { return disjunction_capture_index_; }
264	// The name of the current sub-expression, if group_type is CAPTURE. Only
265	// used for named captures.
266	const ZoneVector<uc16>* capture_name() const { return capture_name_; }
267
268	bool IsNamedCapture() const { return capture_name_ != nullptr; }
269
270	// Check whether the parser is inside a capture group with the given index.
271	bool IsInsideCaptureGroup(int index);
272	// Check whether the parser is inside a capture group with the given name.
273	bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
274
275	private:
276	// Linked list implementation of stack of states.
277	RegExpParserState* const previous_state_;
278	// Builder for the stored disjunction.
279	RegExpBuilder* const builder_;
280	// Stored disjunction type (capture, look-ahead or grouping), if any.
281	const SubexpressionType group_type_;
282	// Stored read direction.
283	const RegExpLookaround::Type lookaround_type_;
284	// Stored disjunction's capture index (if any).
285	const int disjunction_capture_index_;
286	// Stored capture name (if any).
287	const ZoneVector<uc16>* const capture_name_;
288	};
289
290	// Return the 1-indexed RegExpCapture object, allocate if necessary.
291	RegExpCapture* GetCapture(int index);
292
293	// Creates a new named capture at the specified index. Must be called exactly
294	// once for each named capture. Fails if a capture with the same name is
295	// encountered.
296	bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
297
298	// Parses the name of a capture group (?<name>pattern). The name must adhere
299	// to IdentifierName in the ECMAScript standard.
300	const ZoneVector<uc16>* ParseCaptureGroupName();
301
302	bool ParseNamedBackReference(RegExpBuilder* builder,
303	RegExpParserState* state);
304	RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
305
306	// After the initial parsing pass, patch corresponding RegExpCapture objects
307	// into all RegExpBackReferences. This is done after initial parsing in order
308	// to avoid complicating cases in which references comes before the capture.
309	void PatchNamedBackReferences();
310
311	Handle<FixedArray> CreateCaptureNameMap();
312
313	// Returns true iff the pattern contains named captures. May call
314	// ScanForCaptures to look ahead at the remaining pattern.
315	bool HasNamedCaptures();
316
317	Isolate* isolate() { return isolate_; }
318	Zone* zone() const { return zone_; }
319
320	uc32 current() { return current_; }
321	bool has_more() { return has_more_; }
322	bool has_next() { return next_pos_ < in()->length(); }
323	uc32 Next();
324	template <bool update_position>
325	uc32 ReadNext();
326	FlatStringReader* in() { return in_; }
327	void ScanForCaptures();
328
329	Isolate* isolate_;
330	Zone* zone_;
331	Handle<String>* error_;
332	ZoneList<RegExpCapture> captures_;
333	ZoneList<RegExpCapture> named_captures_;
334	ZoneList<RegExpBackReference> named_back_references_;
335	FlatStringReader* in_;
336	uc32 current_;
337	// These are the flags specified outside the regexp syntax ie after the
338	// terminating '/' or in the second argument to the constructor. The current
339	// flags are stored on the RegExpBuilder.
340	JSRegExp::Flags top_level_flags_;
341	int next_pos_;
342	int captures_started_;
343	int capture_count_; // Only valid after we have scanned for captures.
344	bool has_more_;
345	bool simple_;
346	bool contains_anchor_;
347	bool is_scanned_for_captures_;
348	bool has_named_captures_; // Only valid after we have scanned for captures.
349	bool failed_;
350	};
351
352	} // namespace internal
353	} // namespace v8
354
355	#endif // V8_REGEXP_REGEXP_PARSER_H_
356

Browse the source code of v8/src/regexp/regexp-parser.h