Lexer.h source code [jsc/Source/JavaScriptCore/parser/Lexer.h]

1	/*
2	* Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3	* Copyright (C) 2002-2019 Apple Inc. All rights reserved.
4	* Copyright (C) 2010 Zoltan Herczeg (zherczeg@inf.u-szeged.hu)
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Library General Public
8	* License as published by the Free Software Foundation; either
9	* version 2 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Library General Public License for more details.
15	*
16	* You should have received a copy of the GNU Library General Public License
17	* along with this library; see the file COPYING.LIB. If not, write to
18	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19	* Boston, MA 02110-1301, USA.
20	*
21	*/
22
23	#pragma once
24
25	#include "Lookup.h"
26	#include "ParserArena.h"
27	#include "ParserModes.h"
28	#include "ParserTokens.h"
29	#include "SourceCode.h"
30	#include <wtf/ASCIICType.h>
31	#include <wtf/Vector.h>
32
33	namespace JSC {
34
// Bit flags passed to the lexing entry points as an OptionSet<LexerFlags>.
// Each enumerator occupies its own bit so that flags can be combined.
enum class LexerFlags : uint8_t {
    IgnoreReservedWords = 1 << 0,
    DontBuildStrings = 1 << 1,
    DontBuildKeywords = 1 << 2
};
40
// Selects the context in which an escape sequence is parsed; used as a template
// argument of Lexer<T>::parseComplexEscape().
enum class LexerEscapeParseMode {
    Template,
    String
};
42
43	struct ParsedUnicodeEscapeValue;
44
45	bool isLexerKeyword(const Identifier&);
46
47	template <typename T>
48	class Lexer {
49	WTF_MAKE_NONCOPYABLE(Lexer);
50	WTF_MAKE_FAST_ALLOCATED;
51
52	public:
53	Lexer(VM&, JSParserBuiltinMode, JSParserScriptMode);
54	~Lexer();
55
56	// Character manipulation functions.
57	static bool isWhiteSpace(T character);
58	static bool isLineTerminator(T character);
59	static unsigned char convertHex(int c1, int c2);
60	static UChar convertUnicode(int c1, int c2, int c3, int c4);
61
62	// Functions to set up parsing.
63	void setCode(const SourceCode&, ParserArena*);
64	void setIsReparsingFunction() { m_isReparsingFunction = true; }
65	bool isReparsingFunction() const { return m_isReparsingFunction; }
66
67	JSTokenType lex(JSToken, OptionSet<LexerFlags>, bool* strictMode);
68	JSTokenType lexWithoutClearingLineTerminator(JSToken, OptionSet<LexerFlags>, bool* strictMode);
69	bool nextTokenIsColon();
70	int lineNumber() const { return m_lineNumber; }
71	ALWAYS_INLINE int currentOffset() const { return offsetFromSourcePtr(m_code); }
72	ALWAYS_INLINE int currentLineStartOffset() const { return offsetFromSourcePtr(m_lineStart); }
73	ALWAYS_INLINE JSTextPosition currentPosition() const
74	{
75	return JSTextPosition(m_lineNumber, currentOffset(), currentLineStartOffset());
76	}
77	JSTextPosition positionBeforeLastNewline() const { return m_positionBeforeLastNewline; }
78	JSTokenLocation lastTokenLocation() const { return m_lastTokenLocation; }
79	void setLastLineNumber(int lastLineNumber) { m_lastLineNumber = lastLineNumber; }
80	int lastLineNumber() const { return m_lastLineNumber; }
81	bool hasLineTerminatorBeforeToken() const { return m_hasLineTerminatorBeforeToken; }
82	JSTokenType scanRegExp(JSToken*, UChar patternPrefix = `0`);
83	enum class RawStringsBuildMode { BuildRawStrings, DontBuildRawStrings };
84	JSTokenType scanTemplateString(JSToken*, RawStringsBuildMode);
85
86	// Functions for use after parsing.
87	bool sawError() const { return m_error; }
88	void setSawError(bool sawError) { m_error = sawError; }
89	String getErrorMessage() const { return m_lexErrorMessage; }
90	void setErrorMessage(const String& errorMessage) { m_lexErrorMessage = errorMessage; }
91	String sourceURLDirective() const { return m_sourceURLDirective; }
92	String sourceMappingURLDirective() const { return m_sourceMappingURLDirective; }
93	void clear();
94	void setOffset(int offset, int lineStartOffset)
95	{
96	m_error = `0`;
97	m_lexErrorMessage = String ();
98
99	m_code = sourcePtrFromOffset(offset);
100	m_lineStart = sourcePtrFromOffset(lineStartOffset);
101	ASSERT(currentOffset() >= currentLineStartOffset());
102
103	m_buffer8.shrink(`0`);
104	m_buffer16.shrink(`0`);
105	if (LIKELY(m_code < m_codeEnd))
106	m_current = *m_code;
107	else
108	m_current = `0`;
109	}
110	void setLineNumber(int line)
111	{
112	m_lineNumber = line;
113	}
114	void setHasLineTerminatorBeforeToken(bool terminator)
115	{
116	m_hasLineTerminatorBeforeToken = terminator;
117	}
118
119	JSTokenType lexExpectIdentifier(JSToken, OptionSet<LexerFlags>, bool* strictMode);
120
121	ALWAYS_INLINE StringView getToken(const JSToken& token)
122	{
123	SourceProvider* sourceProvider = m_source->provider();
124	ASSERT_WITH_MESSAGE(token.m_location.startOffset <= token.m_location.endOffset, "Calling this function with the baked token.");
125	return sourceProvider->getRange(token.m_location.startOffset, token.m_location.endOffset);
126	}
127
128	private:
129	void record8(int);
130	void append8(const T*, size_t);
131	void record16(int);
132	void record16(T);
133	void recordUnicodeCodePoint(UChar32);
134	void append16(const LChar*, size_t);
135	void append16(const UChar* characters, size_t length) { m_buffer16.append(characters, length); }
136
137	ALWAYS_INLINE void shift();
138	ALWAYS_INLINE bool atEnd() const;
139	ALWAYS_INLINE T peek(int offset) const;
140
141	ParsedUnicodeEscapeValue parseUnicodeEscape();
142	void shiftLineTerminator();
143
144	ALWAYS_INLINE int offsetFromSourcePtr(const T* ptr) const { return ptr - m_codeStart; }
145	ALWAYS_INLINE const T* sourcePtrFromOffset(int offset) const { return m_codeStart + offset; }
146
147	String invalidCharacterMessage() const;
148	ALWAYS_INLINE const T* currentSourcePtr() const;
149	ALWAYS_INLINE void setOffsetFromSourcePtr(const T* sourcePtr, unsigned lineStartOffset) { setOffset(offsetFromSourcePtr(sourcePtr), lineStartOffset); }
150
151	ALWAYS_INLINE void setCodeStart(const StringView&);
152
153	ALWAYS_INLINE const Identifier* makeIdentifier(const LChar* characters, size_t length);
154	ALWAYS_INLINE const Identifier* makeIdentifier(const UChar* characters, size_t length);
155	ALWAYS_INLINE const Identifier* makeLCharIdentifier(const LChar* characters, size_t length);
156	ALWAYS_INLINE const Identifier* makeLCharIdentifier(const UChar* characters, size_t length);
157	ALWAYS_INLINE const Identifier* makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars);
158	ALWAYS_INLINE const Identifier* makeIdentifierLCharFromUChar(const UChar* characters, size_t length);
159	ALWAYS_INLINE const Identifier* makeEmptyIdentifier();
160
161	ALWAYS_INLINE bool lastTokenWasRestrKeyword() const;
162
163	ALWAYS_INLINE void skipWhitespace();
164
165	template <int shiftAmount> void internalShift();
166	template <bool shouldCreateIdentifier> ALWAYS_INLINE JSTokenType parseKeyword(JSTokenData*);
167	template <bool shouldBuildIdentifiers> ALWAYS_INLINE JSTokenType parseIdentifier(JSTokenData, OptionSet<LexerFlags>, bool* strictMode);
168	template <bool shouldBuildIdentifiers> NEVER_INLINE JSTokenType parseIdentifierSlowCase(JSTokenData, OptionSet<LexerFlags>, bool* strictMode);
169	enum StringParseResult {
170	StringParsedSuccessfully,
171	StringUnterminated,
172	StringCannotBeParsed
173	};
174	template <bool shouldBuildStrings> ALWAYS_INLINE StringParseResult parseString(JSTokenData, bool* strictMode);
175	template <bool shouldBuildStrings> NEVER_INLINE StringParseResult parseStringSlowCase(JSTokenData, bool* strictMode);
176
177
178	template <bool shouldBuildStrings, LexerEscapeParseMode escapeParseMode> ALWAYS_INLINE StringParseResult parseComplexEscape(bool strictMode, T stringQuoteCharacter);
179	ALWAYS_INLINE StringParseResult parseTemplateLiteral(JSTokenData*, RawStringsBuildMode);
180
181	using NumberParseResult = Variant<double, const Identifier*>;
182	ALWAYS_INLINE Optional<NumberParseResult> parseHex();
183	ALWAYS_INLINE Optional<NumberParseResult> parseBinary();
184	ALWAYS_INLINE Optional<NumberParseResult> parseOctal();
185	ALWAYS_INLINE Optional<NumberParseResult> parseDecimal();
186	ALWAYS_INLINE bool parseNumberAfterDecimalPoint();
187	ALWAYS_INLINE bool parseNumberAfterExponentIndicator();
188	ALWAYS_INLINE bool parseMultilineComment();
189
190	ALWAYS_INLINE void parseCommentDirective();
191	ALWAYS_INLINE String parseCommentDirectiveValue();
192
193	template <unsigned length>
194	ALWAYS_INLINE bool consume(const char (&input)[length]);
195
196	void fillTokenInfo(JSToken, JSTokenType, int* lineNumber, int endOffset, int lineStartOffset, JSTextPosition endPosition);
197
198	static constexpr size_t initialReadBufferCapacity = `32`;
199
200	int m_lineNumber;
201	int m_lastLineNumber;
202
203	Vector<LChar> m_buffer8;
204	Vector<UChar> m_buffer16;
205	Vector<UChar> m_bufferForRawTemplateString16;
206	bool m_hasLineTerminatorBeforeToken;
207	int m_lastToken;
208
209	const SourceCode* m_source;
210	unsigned m_sourceOffset;
211	const T* m_code;
212	const T* m_codeStart;
213	const T* m_codeEnd;
214	const T* m_codeStartPlusOffset;
215	const T* m_lineStart;
216	JSTextPosition m_positionBeforeLastNewline;
217	JSTokenLocation m_lastTokenLocation;
218	bool m_isReparsingFunction;
219	bool m_atLineStart;
220	bool m_error;
221	String m_lexErrorMessage;
222
223	String m_sourceURLDirective;
224	String m_sourceMappingURLDirective;
225
226	T m_current;
227
228	IdentifierArena* m_arena;
229
230	VM& m_vm;
231	bool m_parsingBuiltinFunction;
232	JSParserScriptMode m_scriptMode;
233	};
234
235	template <>
236	ALWAYS_INLINE bool Lexer<LChar>::isWhiteSpace(LChar ch)
237	{
238	return ch == `' '` \|\| ch == `'\t'` \|\| ch == `0xB` \|\| ch == `0xC` \|\| ch == `0xA0`;
239	}
240
241	template <>
242	ALWAYS_INLINE bool Lexer<UChar>::isWhiteSpace(UChar ch)
243	{
244	return isLatin1(ch) ? Lexer<LChar>::isWhiteSpace(static_cast<LChar>(ch)) : (u_charType(ch) == U_SPACE_SEPARATOR \|\| ch == `0xFEFF`);
245	}
246
247	template <>
248	ALWAYS_INLINE bool Lexer<LChar>::isLineTerminator(LChar ch)
249	{
250	return ch == `'\r'` \|\| ch == `'\n'`;
251	}
252
253	template <>
254	ALWAYS_INLINE bool Lexer<UChar>::isLineTerminator(UChar ch)
255	{
256	return ch == `'\r'` \|\| ch == `'\n'` \|\| (ch & ~`1`) == `0x2028`;
257	}
258
259	template <typename T>
260	inline unsigned char Lexer<T>::convertHex(int c1, int c2)
261	{
262	return (toASCIIHexValue(c1) << `4`) \| toASCIIHexValue(c2);
263	}
264
265	template <typename T>
266	inline UChar Lexer<T>::convertUnicode(int c1, int c2, int c3, int c4)
267	{
268	return (convertHex(c1, c2) << `8`) \| convertHex(c3, c4);
269	}
270
271	template <typename T>
272	ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const LChar* characters, size_t length)
273	{
274	return &m_arena->makeIdentifier(m_vm, characters, length);
275	}
276
277	template <typename T>
278	ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifier(const UChar* characters, size_t length)
279	{
280	return &m_arena->makeIdentifier(m_vm, characters, length);
281	}
282
283	template <>
284	ALWAYS_INLINE const Identifier* Lexer<LChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar)
285	{
286	return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
287	}
288
289	template <>
290	ALWAYS_INLINE const Identifier* Lexer<UChar>::makeRightSizedIdentifier(const UChar* characters, size_t length, UChar orAllChars)
291	{
292	if (!(orAllChars & ~`0xff`))
293	return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
294
295	return &m_arena->makeIdentifier(m_vm, characters, length);
296	}
297
298	template <typename T>
299	ALWAYS_INLINE const Identifier* Lexer<T>::makeEmptyIdentifier()
300	{
301	return &m_arena->makeEmptyIdentifier(m_vm);
302	}
303
304	template <>
305	ALWAYS_INLINE void Lexer<LChar>::setCodeStart(const StringView& sourceString)
306	{
307	ASSERT(sourceString.is8Bit());
308	m_codeStart = sourceString.characters8();
309	}
310
311	template <>
312	ALWAYS_INLINE void Lexer<UChar>::setCodeStart(const StringView& sourceString)
313	{
314	ASSERT(!sourceString.is8Bit());
315	m_codeStart = sourceString.characters16();
316	}
317
318	template <typename T>
319	ALWAYS_INLINE const Identifier* Lexer<T>::makeIdentifierLCharFromUChar(const UChar* characters, size_t length)
320	{
321	return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
322	}
323
324	template <typename T>
325	ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const LChar* characters, size_t length)
326	{
327	return &m_arena->makeIdentifier(m_vm, characters, length);
328	}
329
330	template <typename T>
331	ALWAYS_INLINE const Identifier* Lexer<T>::makeLCharIdentifier(const UChar* characters, size_t length)
332	{
333	return &m_arena->makeIdentifierLCharFromUChar(m_vm, characters, length);
334	}
335
336	#if ASSERT_DISABLED
337	ALWAYS_INLINE bool isSafeBuiltinIdentifier(VM&, const Identifier) { return* true; }
338	#else
339	bool isSafeBuiltinIdentifier(VM&, const Identifier*);
340	#endif
341
342	template <typename T>
343	ALWAYS_INLINE JSTokenType Lexer<T>::lexExpectIdentifier(JSToken* tokenRecord, OptionSet<LexerFlags> lexerFlags, bool strictMode)
344	{
345	JSTokenData* tokenData = &tokenRecord->m_data;
346	JSTokenLocation* tokenLocation = &tokenRecord->m_location;
347	ASSERT(lexerFlags.contains(LexerFlags::IgnoreReservedWords));
348	const T* start = m_code;
349	const T* ptr = start;
350	const T* end = m_codeEnd;
351	JSTextPosition startPosition = currentPosition();
352	if (ptr >= end) {
353	ASSERT(ptr == end);
354	goto slowCase;
355	}
356	if (!WTF::isASCIIAlpha(*ptr))
357	goto slowCase;
358	++ptr;
359	while (ptr < end) {
360	if (!WTF::isASCIIAlphanumeric(*ptr))
361	break;
362	++ptr;
363	}
364
365	// Here's the shift
366	if (ptr < end) {
367	if ((!WTF::isASCII(ptr)) \|\| (ptr == `'\\'`) \|\| (ptr == `'_'`) \|\| (ptr == `'$'`))
368	goto slowCase;
369	m_current = *ptr;
370	} else
371	m_current = `0`;
372
373	m_code = ptr;
374	ASSERT(currentOffset() >= currentLineStartOffset());
375
376	// Create the identifier if needed
377	if (lexerFlags.contains(LexerFlags::DontBuildKeywords)
378	#if !ASSERT_DISABLED
379	&& !m_parsingBuiltinFunction
380	#endif
381	)
382	tokenData->ident = `0`;
383	else
384	tokenData->ident = makeLCharIdentifier(start, ptr - start);
385
386	tokenLocation->line = m_lineNumber;
387	tokenLocation->lineStartOffset = currentLineStartOffset();
388	tokenLocation->startOffset = offsetFromSourcePtr(start);
389	tokenLocation->endOffset = currentOffset();
390	ASSERT(tokenLocation->startOffset >= tokenLocation->lineStartOffset);
391	tokenRecord->m_startPosition = startPosition;
392	tokenRecord->m_endPosition = currentPosition();
393	#if !ASSERT_DISABLED
394	if (m_parsingBuiltinFunction) {
395	if (!isSafeBuiltinIdentifier(m_vm, tokenData->ident))
396	return ERRORTOK;
397	}
398	#endif
399
400	m_lastToken = IDENT;
401	return IDENT;
402
403	slowCase:
404	return lex(tokenRecord, lexerFlags, strictMode);
405	}
406
407	template <typename T>
408	ALWAYS_INLINE JSTokenType Lexer<T>::lex(JSToken* tokenRecord, OptionSet<LexerFlags> lexerFlags, bool strictMode)
409	{
410	m_hasLineTerminatorBeforeToken = false;
411	return lexWithoutClearingLineTerminator(tokenRecord, lexerFlags, strictMode);
412	}
413
414	} // namespace JSC
415

Browse the source code of jsc/Source/JavaScriptCore/parser/Lexer.h