1/*
2 * Copyright (C) 1999-2001, 2004 Harri Porten (porten@kde.org)
3 * Copyright (c) 2007, 2008, 2016-2017 Apple Inc. All rights reserved.
4 * Copyright (C) 2009 Torch Mobile, Inc.
5 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23#include "config.h"
24#include "RegExp.h"
25
26#include "ExceptionHelpers.h"
27#include "Lexer.h"
28#include "JSCInlines.h"
29#include "RegExpCache.h"
30#include "RegExpInlines.h"
31#include "YarrJIT.h"
32#include <wtf/Assertions.h>
33
34namespace JSC {
35
36const ClassInfo RegExp::s_info = { "RegExp", nullptr, nullptr, nullptr, CREATE_METHOD_TABLE(RegExp) };
37
38#if REGEXP_FUNC_TEST_DATA_GEN
39const char* const RegExpFunctionalTestCollector::s_fileName = "/tmp/RegExpTestsData";
40RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::s_instance = 0;
41
42RegExpFunctionalTestCollector* RegExpFunctionalTestCollector::get()
43{
44 if (!s_instance)
45 s_instance = new RegExpFunctionalTestCollector();
46
47 return s_instance;
48}
49
50void RegExpFunctionalTestCollector::outputOneTest(RegExp* regExp, const String& s, int startOffset, int* ovector, int result)
51{
52 if ((!m_lastRegExp) || (m_lastRegExp != regExp)) {
53 m_lastRegExp = regExp;
54 fputc('/', m_file);
55 outputEscapedString(regExp->pattern(), true);
56 fputc('/', m_file);
57 if (regExp->global())
58 fputc('g', m_file);
59 if (regExp->ignoreCase())
60 fputc('i', m_file);
61 if (regExp->multiline())
62 fputc('m', m_file);
63 if (regExp->dotAll())
64 fputc('s', m_file);
65 if (regExp->unicode())
66 fputc('u', m_file);
67 if (regExp->sticky())
68 fputc('y', m_file);
69 fprintf(m_file, "\n");
70 }
71
72 fprintf(m_file, " \"");
73 outputEscapedString(s);
74 fprintf(m_file, "\", %d, %d, (", startOffset, result);
75 for (unsigned i = 0; i <= regExp->numSubpatterns(); i++) {
76 int subpatternBegin = ovector[i * 2];
77 int subpatternEnd = ovector[i * 2 + 1];
78 if (subpatternBegin == -1)
79 subpatternEnd = -1;
80 fprintf(m_file, "%d, %d", subpatternBegin, subpatternEnd);
81 if (i < regExp->numSubpatterns())
82 fputs(", ", m_file);
83 }
84
85 fprintf(m_file, ")\n");
86 fflush(m_file);
87}
88
89RegExpFunctionalTestCollector::RegExpFunctionalTestCollector()
90{
91 m_file = fopen(s_fileName, "r+");
92 if (!m_file)
93 m_file = fopen(s_fileName, "w+");
94
95 fseek(m_file, 0L, SEEK_END);
96}
97
98RegExpFunctionalTestCollector::~RegExpFunctionalTestCollector()
99{
100 fclose(m_file);
101 s_instance = 0;
102}
103
104void RegExpFunctionalTestCollector::outputEscapedString(const String& s, bool escapeSlash)
105{
106 int len = s.length();
107
108 for (int i = 0; i < len; ++i) {
109 UChar c = s[i];
110
111 switch (c) {
112 case '\0':
113 fputs("\\0", m_file);
114 break;
115 case '\a':
116 fputs("\\a", m_file);
117 break;
118 case '\b':
119 fputs("\\b", m_file);
120 break;
121 case '\f':
122 fputs("\\f", m_file);
123 break;
124 case '\n':
125 fputs("\\n", m_file);
126 break;
127 case '\r':
128 fputs("\\r", m_file);
129 break;
130 case '\t':
131 fputs("\\t", m_file);
132 break;
133 case '\v':
134 fputs("\\v", m_file);
135 break;
136 case '/':
137 if (escapeSlash)
138 fputs("\\/", m_file);
139 else
140 fputs("/", m_file);
141 break;
142 case '\"':
143 fputs("\\\"", m_file);
144 break;
145 case '\\':
146 fputs("\\\\", m_file);
147 break;
148 case '\?':
149 fputs("\?", m_file);
150 break;
151 default:
152 if (c > 0x7f)
153 fprintf(m_file, "\\u%04x", c);
154 else
155 fputc(c, m_file);
156 break;
157 }
158 }
159}
160#endif
161
162RegExp::RegExp(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags)
163 : JSCell(vm, vm.regExpStructure.get())
164 , m_patternString(patternString)
165 , m_flags(flags)
166{
167 ASSERT(m_flags != Yarr::Flags::DeletedValue);
168}
169
170void RegExp::finishCreation(VM& vm)
171{
172 Base::finishCreation(vm);
173 Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm.stackLimit());
174 if (!isValid()) {
175 m_state = ParseError;
176 return;
177 }
178
179 m_numSubpatterns = pattern.m_numSubpatterns;
180 if (!pattern.m_captureGroupNames.isEmpty() || !pattern.m_namedGroupToParenIndex.isEmpty()) {
181 m_rareData = makeUnique<RareData>();
182 m_rareData->m_captureGroupNames.swap(pattern.m_captureGroupNames);
183 m_rareData->m_namedGroupToParenIndex.swap(pattern.m_namedGroupToParenIndex);
184 }
185}
186
187void RegExp::destroy(JSCell* cell)
188{
189 RegExp* thisObject = static_cast<RegExp*>(cell);
190#if REGEXP_FUNC_TEST_DATA_GEN
191 RegExpFunctionalTestCollector::get()->clearRegExp(this);
192#endif
193 thisObject->RegExp::~RegExp();
194}
195
196size_t RegExp::estimatedSize(JSCell* cell, VM& vm)
197{
198 RegExp* thisObject = static_cast<RegExp*>(cell);
199 size_t regexDataSize = thisObject->m_regExpBytecode ? thisObject->m_regExpBytecode->estimatedSizeInBytes() : 0;
200#if ENABLE(YARR_JIT)
201 if (auto* jitCode = thisObject->m_regExpJITCode.get())
202 regexDataSize += jitCode->size();
203#endif
204 return Base::estimatedSize(cell, vm) + regexDataSize;
205}
206
207RegExp* RegExp::createWithoutCaching(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags)
208{
209 RegExp* regExp = new (NotNull, allocateCell<RegExp>(vm.heap)) RegExp(vm, patternString, flags);
210 regExp->finishCreation(vm);
211 return regExp;
212}
213
214RegExp* RegExp::create(VM& vm, const String& patternString, OptionSet<Yarr::Flags> flags)
215{
216 return vm.regExpCache()->lookupOrCreate(patternString, flags);
217}
218
219
220static std::unique_ptr<Yarr::BytecodePattern> byteCodeCompilePattern(VM* vm, Yarr::YarrPattern& pattern, Yarr::ErrorCode& errorCode)
221{
222 return Yarr::byteCompile(pattern, &vm->m_regExpAllocator, errorCode, &vm->m_regExpAllocatorLock);
223}
224
225void RegExp::byteCodeCompileIfNecessary(VM* vm)
226{
227 if (m_regExpBytecode)
228 return;
229
230 Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit());
231 if (hasError(m_constructionErrorCode)) {
232 RELEASE_ASSERT_NOT_REACHED();
233#if COMPILER_QUIRK(CONSIDERS_UNREACHABLE_CODE)
234 m_state = ParseError;
235 return;
236#endif
237 }
238 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
239
240 m_regExpBytecode = byteCodeCompilePattern(vm, pattern, m_constructionErrorCode);
241 if (!m_regExpBytecode) {
242 m_state = ParseError;
243 return;
244 }
245}
246
247void RegExp::compile(VM* vm, Yarr::YarrCharSize charSize)
248{
249 auto locker = holdLock(cellLock());
250
251 Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit());
252 if (hasError(m_constructionErrorCode)) {
253 m_state = ParseError;
254 return;
255 }
256 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
257
258 if (!hasCode()) {
259 ASSERT(m_state == NotCompiled);
260 vm->regExpCache()->addToStrongCache(this);
261 m_state = ByteCode;
262 }
263
264#if ENABLE(YARR_JIT)
265 if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT()
266#if !ENABLE(YARR_JIT_BACKREFERENCES)
267 && !pattern.m_containsBackreferences
268#endif
269 ) {
270 auto& jitCode = ensureRegExpJITCode();
271 Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode);
272 if (!jitCode.failureReason()) {
273 m_state = JITCode;
274 return;
275 }
276 }
277#else
278 UNUSED_PARAM(charSize);
279#endif
280
281 if (Options::dumpCompiledRegExpPatterns())
282 dataLog("Can't JIT this regular expression: \"", m_patternString, "\"\n");
283
284 m_state = ByteCode;
285 m_regExpBytecode = byteCodeCompilePattern(vm, pattern, m_constructionErrorCode);
286 if (!m_regExpBytecode) {
287 m_state = ParseError;
288 return;
289 }
290}
291
292int RegExp::match(VM& vm, const String& s, unsigned startOffset, Vector<int>& ovector)
293{
294 return matchInline(vm, s, startOffset, ovector);
295}
296
297bool RegExp::matchConcurrently(
298 VM& vm, const String& s, unsigned startOffset, int& position, Vector<int>& ovector)
299{
300 auto locker = holdLock(cellLock());
301
302 if (!hasCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16))
303 return false;
304
305 position = match(vm, s, startOffset, ovector);
306 return true;
307}
308
309void RegExp::compileMatchOnly(VM* vm, Yarr::YarrCharSize charSize)
310{
311 auto locker = holdLock(cellLock());
312
313 Yarr::YarrPattern pattern(m_patternString, m_flags, m_constructionErrorCode, vm->stackLimit());
314 if (hasError(m_constructionErrorCode)) {
315 m_state = ParseError;
316 return;
317 }
318 ASSERT(m_numSubpatterns == pattern.m_numSubpatterns);
319
320 if (!hasCode()) {
321 ASSERT(m_state == NotCompiled);
322 vm->regExpCache()->addToStrongCache(this);
323 m_state = ByteCode;
324 }
325
326#if ENABLE(YARR_JIT)
327 if (!pattern.containsUnsignedLengthPattern() && VM::canUseJIT() && Options::useRegExpJIT()
328#if !ENABLE(YARR_JIT_BACKREFERENCES)
329 && !pattern.m_containsBackreferences
330#endif
331 ) {
332 auto& jitCode = ensureRegExpJITCode();
333 Yarr::jitCompile(pattern, m_patternString, charSize, vm, jitCode, Yarr::MatchOnly);
334 if (!jitCode.failureReason()) {
335 m_state = JITCode;
336 return;
337 }
338 }
339#else
340 UNUSED_PARAM(charSize);
341#endif
342
343 if (Options::dumpCompiledRegExpPatterns())
344 dataLog("Can't JIT this regular expression: \"", m_patternString, "\"\n");
345
346 m_state = ByteCode;
347 m_regExpBytecode = byteCodeCompilePattern(vm, pattern, m_constructionErrorCode);
348 if (!m_regExpBytecode) {
349 m_state = ParseError;
350 return;
351 }
352}
353
354MatchResult RegExp::match(VM& vm, const String& s, unsigned startOffset)
355{
356 return matchInline(vm, s, startOffset);
357}
358
359bool RegExp::matchConcurrently(VM& vm, const String& s, unsigned startOffset, MatchResult& result)
360{
361 auto locker = holdLock(cellLock());
362
363 if (!hasMatchOnlyCodeFor(s.is8Bit() ? Yarr::Char8 : Yarr::Char16))
364 return false;
365
366 result = match(vm, s, startOffset);
367 return true;
368}
369
370void RegExp::deleteCode()
371{
372 auto locker = holdLock(cellLock());
373
374 if (!hasCode())
375 return;
376 m_state = NotCompiled;
377#if ENABLE(YARR_JIT)
378 if (m_regExpJITCode)
379 m_regExpJITCode->clear();
380#endif
381 m_regExpBytecode = nullptr;
382}
383
384#if ENABLE(YARR_JIT_DEBUG)
385void RegExp::matchCompareWithInterpreter(const String& s, int startOffset, int* offsetVector, int jitResult)
386{
387 int offsetVectorSize = (m_numSubpatterns + 1) * 2;
388 Vector<int> interpreterOvector;
389 interpreterOvector.resize(offsetVectorSize);
390 int* interpreterOffsetVector = interpreterOvector.data();
391 int interpreterResult = 0;
392 int differences = 0;
393
394 // Initialize interpreterOffsetVector with the return value (index 0) and the
395 // first subpattern start indicies (even index values) set to -1.
396 // No need to init the subpattern end indicies.
397 for (unsigned j = 0, i = 0; i < m_numSubpatterns + 1; j += 2, i++)
398 interpreterOffsetVector[j] = -1;
399
400 interpreterResult = Yarr::interpret(m_regExpBytecode.get(), s, startOffset, reinterpret_cast<unsigned*>(interpreterOffsetVector));
401
402 if (jitResult != interpreterResult)
403 differences++;
404
405 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++)
406 if ((offsetVector[j] != interpreterOffsetVector[j])
407 || ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1])))
408 differences++;
409
410 if (differences) {
411 dataLogF("RegExp Discrepency for /%s/\n string input ", pattern().utf8().data());
412 unsigned segmentLen = s.length() - static_cast<unsigned>(startOffset);
413
414 dataLogF((segmentLen < 150) ? "\"%s\"\n" : "\"%148s...\"\n", s.utf8().data() + startOffset);
415
416 if (jitResult != interpreterResult) {
417 dataLogF(" JIT result = %d, interpreted result = %d\n", jitResult, interpreterResult);
418 differences--;
419 } else {
420 dataLogF(" Correct result = %d\n", jitResult);
421 }
422
423 if (differences) {
424 for (unsigned j = 2, i = 0; i < m_numSubpatterns; j +=2, i++) {
425 if (offsetVector[j] != interpreterOffsetVector[j])
426 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j, offsetVector[j], j, interpreterOffsetVector[j]);
427 if ((offsetVector[j] >= 0) && (offsetVector[j+1] != interpreterOffsetVector[j+1]))
428 dataLogF(" JIT offset[%d] = %d, interpreted offset[%d] = %d\n", j+1, offsetVector[j+1], j+1, interpreterOffsetVector[j+1]);
429 }
430 }
431 }
432}
433#endif
434
435#if ENABLE(REGEXP_TRACING)
436 void RegExp::printTraceData()
437 {
438 char formattedPattern[41];
439 char rawPattern[41];
440
441 strncpy(rawPattern, pattern().utf8().data(), 40);
442 rawPattern[40]= '\0';
443
444 int pattLen = strlen(rawPattern);
445
446 snprintf(formattedPattern, 41, (pattLen <= 38) ? "/%.38s/" : "/%.36s...", rawPattern);
447
448#if ENABLE(YARR_JIT)
449 const size_t jitAddrSize = 20;
450 char jit8BitMatchOnlyAddr[jitAddrSize] { };
451 char jit16BitMatchOnlyAddr[jitAddrSize] { };
452 char jit8BitMatchAddr[jitAddrSize] { };
453 char jit16BitMatchAddr[jitAddrSize] { };
454 switch (m_state) {
455 case ParseError:
456 case NotCompiled:
457 break;
458 case ByteCode:
459 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "fallback ");
460 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "---- ");
461 snprintf(jit8BitMatchAddr, jitAddrSize, "fallback ");
462 snprintf(jit16BitMatchAddr, jitAddrSize, "---- ");
463 break;
464 case JITCode: {
465 Yarr::YarrCodeBlock& codeBlock = *m_regExpJITCode.get();
466 snprintf(jit8BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchOnlyAddr()));
467 snprintf(jit16BitMatchOnlyAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchOnlyAddr()));
468 snprintf(jit8BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get8BitMatchAddr()));
469 snprintf(jit16BitMatchAddr, jitAddrSize, "0x%014lx", reinterpret_cast<uintptr_t>(codeBlock.get16BitMatchAddr()));
470 break;
471 }
472 }
473#else
474 const char* jit8BitMatchOnlyAddr = "JIT Off";
475 const char* jit16BitMatchOnlyAddr = "";
476 const char* jit8BitMatchAddr = "JIT Off";
477 const char* jit16BitMatchAddr = "";
478#endif
479 unsigned averageMatchOnlyStringLen = (unsigned)(m_rtMatchOnlyTotalSubjectStringLen / m_rtMatchOnlyCallCount);
480 unsigned averageMatchStringLen = (unsigned)(m_rtMatchTotalSubjectStringLen / m_rtMatchCallCount);
481
482 printf("%-40.40s %16.16s %16.16s %10d %10d %10u\n", formattedPattern, jit8BitMatchOnlyAddr, jit16BitMatchOnlyAddr, m_rtMatchOnlyCallCount, m_rtMatchOnlyFoundCount, averageMatchOnlyStringLen);
483 printf(" %16.16s %16.16s %10d %10d %10u\n", jit8BitMatchAddr, jit16BitMatchAddr, m_rtMatchCallCount, m_rtMatchFoundCount, averageMatchStringLen);
484 }
485#endif
486
487static CString regexpToSourceString(const RegExp* regExp)
488{
489 char postfix[7] = { '/', 0, 0, 0, 0, 0, 0 };
490 int index = 1;
491 if (regExp->global())
492 postfix[index++] = 'g';
493 if (regExp->ignoreCase())
494 postfix[index++] = 'i';
495 if (regExp->multiline())
496 postfix[index] = 'm';
497 if (regExp->dotAll())
498 postfix[index++] = 's';
499 if (regExp->unicode())
500 postfix[index++] = 'u';
501 if (regExp->sticky())
502 postfix[index++] = 'y';
503
504 return toCString("/", regExp->pattern().impl(), postfix);
505}
506
507void RegExp::dumpToStream(const JSCell* cell, PrintStream& out)
508{
509 out.print(regexpToSourceString(jsCast<const RegExp*>(cell)));
510}
511
512} // namespace JSC
513