1 | // Copyright 2011 the V8 project authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be |
3 | // found in the LICENSE file. |
4 | |
5 | #ifndef V8_STRING_SEARCH_H_ |
6 | #define V8_STRING_SEARCH_H_ |
7 | |
8 | #include "src/isolate.h" |
9 | #include "src/vector.h" |
10 | |
11 | namespace v8 { |
12 | namespace internal { |
13 | |
14 | |
15 | //--------------------------------------------------------------------- |
16 | // String Search object. |
17 | //--------------------------------------------------------------------- |
18 | |
19 | // Class holding constants and methods that apply to all string search variants, |
20 | // independently of subject and pattern char size. |
21 | class StringSearchBase { |
22 | protected: |
23 | // Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
24 | // limit, we can fix the size of tables. For a needle longer than this limit, |
25 | // search will not be optimal, since we only build tables for a suffix |
26 | // of the string, but it is a safe approximation. |
27 | static const int kBMMaxShift = Isolate::kBMMaxShift; |
28 | |
29 | // Reduce alphabet to this size. |
30 | // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
31 | // proportional to the input alphabet. We reduce the alphabet size by |
32 | // equating input characters modulo a smaller alphabet size. This gives |
33 | // a potentially less efficient searching, but is a safe approximation. |
34 | // For needles using only characters in the same Unicode 256-code point page, |
35 | // there is no search speed degradation. |
36 | static const int kLatin1AlphabetSize = 256; |
37 | static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; |
38 | |
39 | // Bad-char shift table stored in the state. It's length is the alphabet size. |
40 | // For patterns below this length, the skip length of Boyer-Moore is too short |
41 | // to compensate for the algorithmic overhead compared to simple brute force. |
42 | static const int kBMMinPatternLength = 7; |
43 | |
44 | static inline bool IsOneByteString(Vector<const uint8_t> string) { |
45 | return true; |
46 | } |
47 | |
48 | static inline bool IsOneByteString(Vector<const uc16> string) { |
49 | return String::IsOneByte(string.start(), string.length()); |
50 | } |
51 | |
52 | friend class Isolate; |
53 | }; |
54 | |
55 | |
56 | template <typename PatternChar, typename SubjectChar> |
57 | class StringSearch : private StringSearchBase { |
58 | public: |
59 | StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) |
60 | : isolate_(isolate), |
61 | pattern_(pattern), |
62 | start_(Max(0, pattern.length() - kBMMaxShift)) { |
63 | if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
64 | if (!IsOneByteString(pattern_)) { |
65 | strategy_ = &FailSearch; |
66 | return; |
67 | } |
68 | } |
69 | int pattern_length = pattern_.length(); |
70 | if (pattern_length < kBMMinPatternLength) { |
71 | if (pattern_length == 1) { |
72 | strategy_ = &SingleCharSearch; |
73 | return; |
74 | } |
75 | strategy_ = &LinearSearch; |
76 | return; |
77 | } |
78 | strategy_ = &InitialSearch; |
79 | } |
80 | |
81 | int Search(Vector<const SubjectChar> subject, int index) { |
82 | return strategy_(this, subject, index); |
83 | } |
84 | |
85 | static inline int AlphabetSize() { |
86 | if (sizeof(PatternChar) == 1) { |
87 | // Latin1 needle. |
88 | return kLatin1AlphabetSize; |
89 | } else { |
90 | DCHECK_EQ(sizeof(PatternChar), 2); |
91 | // UC16 needle. |
92 | return kUC16AlphabetSize; |
93 | } |
94 | } |
95 | |
96 | private: |
97 | typedef int (*SearchFunction)( // NOLINT - it's not a cast! |
98 | StringSearch<PatternChar, SubjectChar>*, |
99 | Vector<const SubjectChar>, |
100 | int); |
101 | |
102 | static int FailSearch(StringSearch<PatternChar, SubjectChar>*, |
103 | Vector<const SubjectChar>, |
104 | int) { |
105 | return -1; |
106 | } |
107 | |
108 | static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, |
109 | Vector<const SubjectChar> subject, |
110 | int start_index); |
111 | |
112 | static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, |
113 | Vector<const SubjectChar> subject, |
114 | int start_index); |
115 | |
116 | static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, |
117 | Vector<const SubjectChar> subject, |
118 | int start_index); |
119 | |
120 | static int BoyerMooreHorspoolSearch( |
121 | StringSearch<PatternChar, SubjectChar>* search, |
122 | Vector<const SubjectChar> subject, |
123 | int start_index); |
124 | |
125 | static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, |
126 | Vector<const SubjectChar> subject, |
127 | int start_index); |
128 | |
129 | void PopulateBoyerMooreHorspoolTable(); |
130 | |
131 | void PopulateBoyerMooreTable(); |
132 | |
133 | static inline bool exceedsOneByte(uint8_t c) { |
134 | return false; |
135 | } |
136 | |
137 | static inline bool exceedsOneByte(uint16_t c) { |
138 | return c > String::kMaxOneByteCharCodeU; |
139 | } |
140 | |
141 | static inline int CharOccurrence(int* bad_char_occurrence, |
142 | SubjectChar char_code) { |
143 | if (sizeof(SubjectChar) == 1) { |
144 | return bad_char_occurrence[static_cast<int>(char_code)]; |
145 | } |
146 | if (sizeof(PatternChar) == 1) { |
147 | if (exceedsOneByte(char_code)) { |
148 | return -1; |
149 | } |
150 | return bad_char_occurrence[static_cast<unsigned int>(char_code)]; |
151 | } |
152 | // Both pattern and subject are UC16. Reduce character to equivalence class. |
153 | int equiv_class = char_code % kUC16AlphabetSize; |
154 | return bad_char_occurrence[equiv_class]; |
155 | } |
156 | |
157 | // The following tables are shared by all searches. |
158 | // TODO(lrn): Introduce a way for a pattern to keep its tables |
159 | // between searches (e.g., for an Atom RegExp). |
160 | |
161 | // Store for the BoyerMoore(Horspool) bad char shift table. |
162 | // Return a table covering the last kBMMaxShift+1 positions of |
163 | // pattern. |
164 | int* bad_char_table() { |
165 | return isolate_->bad_char_shift_table(); |
166 | } |
167 | |
168 | // Store for the BoyerMoore good suffix shift table. |
169 | int* good_suffix_shift_table() { |
170 | // Return biased pointer that maps the range [start_..pattern_.length() |
171 | // to the kGoodSuffixShiftTable array. |
172 | return isolate_->good_suffix_shift_table() - start_; |
173 | } |
174 | |
175 | // Table used temporarily while building the BoyerMoore good suffix |
176 | // shift table. |
177 | int* suffix_table() { |
178 | // Return biased pointer that maps the range [start_..pattern_.length() |
179 | // to the kSuffixTable array. |
180 | return isolate_->suffix_table() - start_; |
181 | } |
182 | |
183 | Isolate* isolate_; |
184 | // The pattern to search for. |
185 | Vector<const PatternChar> pattern_; |
186 | // Pointer to implementation of the search. |
187 | SearchFunction strategy_; |
188 | // Cache value of Max(0, pattern_length() - kBMMaxShift) |
189 | int start_; |
190 | }; |
191 | |
192 | |
193 | template <typename T, typename U> |
194 | inline T AlignDown(T value, U alignment) { |
195 | return reinterpret_cast<T>( |
196 | (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); |
197 | } |
198 | |
199 | |
200 | inline uint8_t GetHighestValueByte(uc16 character) { |
201 | return Max(static_cast<uint8_t>(character & 0xFF), |
202 | static_cast<uint8_t>(character >> 8)); |
203 | } |
204 | |
205 | |
206 | inline uint8_t GetHighestValueByte(uint8_t character) { return character; } |
207 | |
208 | |
209 | template <typename PatternChar, typename SubjectChar> |
210 | inline int FindFirstCharacter(Vector<const PatternChar> pattern, |
211 | Vector<const SubjectChar> subject, int index) { |
212 | const PatternChar pattern_first_char = pattern[0]; |
213 | const int max_n = (subject.length() - pattern.length() + 1); |
214 | |
215 | const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
216 | const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); |
217 | int pos = index; |
218 | do { |
219 | DCHECK_GE(max_n - pos, 0); |
220 | const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( |
221 | memchr(subject.start() + pos, search_byte, |
222 | (max_n - pos) * sizeof(SubjectChar))); |
223 | if (char_pos == nullptr) return -1; |
224 | char_pos = AlignDown(char_pos, sizeof(SubjectChar)); |
225 | pos = static_cast<int>(char_pos - subject.start()); |
226 | if (subject[pos] == search_char) return pos; |
227 | } while (++pos < max_n); |
228 | |
229 | return -1; |
230 | } |
231 | |
232 | |
233 | //--------------------------------------------------------------------- |
234 | // Single Character Pattern Search Strategy |
235 | //--------------------------------------------------------------------- |
236 | |
237 | template <typename PatternChar, typename SubjectChar> |
238 | int StringSearch<PatternChar, SubjectChar>::SingleCharSearch( |
239 | StringSearch<PatternChar, SubjectChar>* search, |
240 | Vector<const SubjectChar> subject, |
241 | int index) { |
242 | DCHECK_EQ(1, search->pattern_.length()); |
243 | PatternChar pattern_first_char = search->pattern_[0]; |
244 | if (sizeof(PatternChar) > sizeof(SubjectChar)) { |
245 | if (exceedsOneByte(pattern_first_char)) { |
246 | return -1; |
247 | } |
248 | } |
249 | return FindFirstCharacter(search->pattern_, subject, index); |
250 | } |
251 | |
252 | //--------------------------------------------------------------------- |
253 | // Linear Search Strategy |
254 | //--------------------------------------------------------------------- |
255 | |
256 | |
257 | template <typename PatternChar, typename SubjectChar> |
258 | inline bool CharCompare(const PatternChar* pattern, |
259 | const SubjectChar* subject, |
260 | int length) { |
261 | DCHECK_GT(length, 0); |
262 | int pos = 0; |
263 | do { |
264 | if (pattern[pos] != subject[pos]) { |
265 | return false; |
266 | } |
267 | pos++; |
268 | } while (pos < length); |
269 | return true; |
270 | } |
271 | |
272 | |
273 | // Simple linear search for short patterns. Never bails out. |
274 | template <typename PatternChar, typename SubjectChar> |
275 | int StringSearch<PatternChar, SubjectChar>::LinearSearch( |
276 | StringSearch<PatternChar, SubjectChar>* search, |
277 | Vector<const SubjectChar> subject, |
278 | int index) { |
279 | Vector<const PatternChar> pattern = search->pattern_; |
280 | DCHECK_GT(pattern.length(), 1); |
281 | int pattern_length = pattern.length(); |
282 | int i = index; |
283 | int n = subject.length() - pattern_length; |
284 | while (i <= n) { |
285 | i = FindFirstCharacter(pattern, subject, i); |
286 | if (i == -1) return -1; |
287 | DCHECK_LE(i, n); |
288 | i++; |
289 | // Loop extracted to separate function to allow using return to do |
290 | // a deeper break. |
291 | if (CharCompare(pattern.start() + 1, |
292 | subject.start() + i, |
293 | pattern_length - 1)) { |
294 | return i - 1; |
295 | } |
296 | } |
297 | return -1; |
298 | } |
299 | |
300 | //--------------------------------------------------------------------- |
301 | // Boyer-Moore string search |
302 | //--------------------------------------------------------------------- |
303 | |
304 | template <typename PatternChar, typename SubjectChar> |
305 | int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( |
306 | StringSearch<PatternChar, SubjectChar>* search, |
307 | Vector<const SubjectChar> subject, |
308 | int start_index) { |
309 | Vector<const PatternChar> pattern = search->pattern_; |
310 | int subject_length = subject.length(); |
311 | int pattern_length = pattern.length(); |
312 | // Only preprocess at most kBMMaxShift last characters of pattern. |
313 | int start = search->start_; |
314 | |
315 | int* bad_char_occurence = search->bad_char_table(); |
316 | int* good_suffix_shift = search->good_suffix_shift_table(); |
317 | |
318 | PatternChar last_char = pattern[pattern_length - 1]; |
319 | int index = start_index; |
320 | // Continue search from i. |
321 | while (index <= subject_length - pattern_length) { |
322 | int j = pattern_length - 1; |
323 | int c; |
324 | while (last_char != (c = subject[index + j])) { |
325 | int shift = |
326 | j - CharOccurrence(bad_char_occurence, c); |
327 | index += shift; |
328 | if (index > subject_length - pattern_length) { |
329 | return -1; |
330 | } |
331 | } |
332 | while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; |
333 | if (j < 0) { |
334 | return index; |
335 | } else if (j < start) { |
336 | // we have matched more than our tables allow us to be smart about. |
337 | // Fall back on BMH shift. |
338 | index += pattern_length - 1 |
339 | - CharOccurrence(bad_char_occurence, |
340 | static_cast<SubjectChar>(last_char)); |
341 | } else { |
342 | int gs_shift = good_suffix_shift[j + 1]; |
343 | int bc_occ = |
344 | CharOccurrence(bad_char_occurence, c); |
345 | int shift = j - bc_occ; |
346 | if (gs_shift > shift) { |
347 | shift = gs_shift; |
348 | } |
349 | index += shift; |
350 | } |
351 | } |
352 | |
353 | return -1; |
354 | } |
355 | |
356 | |
357 | template <typename PatternChar, typename SubjectChar> |
358 | void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { |
359 | int pattern_length = pattern_.length(); |
360 | const PatternChar* pattern = pattern_.start(); |
361 | // Only look at the last kBMMaxShift characters of pattern (from start_ |
362 | // to pattern_length). |
363 | int start = start_; |
364 | int length = pattern_length - start; |
365 | |
366 | // Biased tables so that we can use pattern indices as table indices, |
367 | // even if we only cover the part of the pattern from offset start. |
368 | int* shift_table = good_suffix_shift_table(); |
369 | int* suffix_table = this->suffix_table(); |
370 | |
371 | // Initialize table. |
372 | for (int i = start; i < pattern_length; i++) { |
373 | shift_table[i] = length; |
374 | } |
375 | shift_table[pattern_length] = 1; |
376 | suffix_table[pattern_length] = pattern_length + 1; |
377 | |
378 | if (pattern_length <= start) { |
379 | return; |
380 | } |
381 | |
382 | // Find suffixes. |
383 | PatternChar last_char = pattern[pattern_length - 1]; |
384 | int suffix = pattern_length + 1; |
385 | { |
386 | int i = pattern_length; |
387 | while (i > start) { |
388 | PatternChar c = pattern[i - 1]; |
389 | while (suffix <= pattern_length && c != pattern[suffix - 1]) { |
390 | if (shift_table[suffix] == length) { |
391 | shift_table[suffix] = suffix - i; |
392 | } |
393 | suffix = suffix_table[suffix]; |
394 | } |
395 | suffix_table[--i] = --suffix; |
396 | if (suffix == pattern_length) { |
397 | // No suffix to extend, so we check against last_char only. |
398 | while ((i > start) && (pattern[i - 1] != last_char)) { |
399 | if (shift_table[pattern_length] == length) { |
400 | shift_table[pattern_length] = pattern_length - i; |
401 | } |
402 | suffix_table[--i] = pattern_length; |
403 | } |
404 | if (i > start) { |
405 | suffix_table[--i] = --suffix; |
406 | } |
407 | } |
408 | } |
409 | } |
410 | // Build shift table using suffixes. |
411 | if (suffix < pattern_length) { |
412 | for (int i = start; i <= pattern_length; i++) { |
413 | if (shift_table[i] == length) { |
414 | shift_table[i] = suffix - start; |
415 | } |
416 | if (i == suffix) { |
417 | suffix = suffix_table[suffix]; |
418 | } |
419 | } |
420 | } |
421 | } |
422 | |
423 | //--------------------------------------------------------------------- |
424 | // Boyer-Moore-Horspool string search. |
425 | //--------------------------------------------------------------------- |
426 | |
427 | template <typename PatternChar, typename SubjectChar> |
428 | int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( |
429 | StringSearch<PatternChar, SubjectChar>* search, |
430 | Vector<const SubjectChar> subject, |
431 | int start_index) { |
432 | Vector<const PatternChar> pattern = search->pattern_; |
433 | int subject_length = subject.length(); |
434 | int pattern_length = pattern.length(); |
435 | int* char_occurrences = search->bad_char_table(); |
436 | int badness = -pattern_length; |
437 | |
438 | // How bad we are doing without a good-suffix table. |
439 | PatternChar last_char = pattern[pattern_length - 1]; |
440 | int last_char_shift = pattern_length - 1 - |
441 | CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); |
442 | // Perform search |
443 | int index = start_index; // No matches found prior to this index. |
444 | while (index <= subject_length - pattern_length) { |
445 | int j = pattern_length - 1; |
446 | int subject_char; |
447 | while (last_char != (subject_char = subject[index + j])) { |
448 | int bc_occ = CharOccurrence(char_occurrences, subject_char); |
449 | int shift = j - bc_occ; |
450 | index += shift; |
451 | badness += 1 - shift; // at most zero, so badness cannot increase. |
452 | if (index > subject_length - pattern_length) { |
453 | return -1; |
454 | } |
455 | } |
456 | j--; |
457 | while (j >= 0 && pattern[j] == (subject[index + j])) j--; |
458 | if (j < 0) { |
459 | return index; |
460 | } else { |
461 | index += last_char_shift; |
462 | // Badness increases by the number of characters we have |
463 | // checked, and decreases by the number of characters we |
464 | // can skip by shifting. It's a measure of how we are doing |
465 | // compared to reading each character exactly once. |
466 | badness += (pattern_length - j) - last_char_shift; |
467 | if (badness > 0) { |
468 | search->PopulateBoyerMooreTable(); |
469 | search->strategy_ = &BoyerMooreSearch; |
470 | return BoyerMooreSearch(search, subject, index); |
471 | } |
472 | } |
473 | } |
474 | return -1; |
475 | } |
476 | |
477 | |
478 | template <typename PatternChar, typename SubjectChar> |
479 | void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { |
480 | int pattern_length = pattern_.length(); |
481 | |
482 | int* bad_char_occurrence = bad_char_table(); |
483 | |
484 | // Only preprocess at most kBMMaxShift last characters of pattern. |
485 | int start = start_; |
486 | // Run forwards to populate bad_char_table, so that *last* instance |
487 | // of character equivalence class is the one registered. |
488 | // Notice: Doesn't include the last character. |
489 | int table_size = AlphabetSize(); |
490 | if (start == 0) { // All patterns less than kBMMaxShift in length. |
491 | memset(bad_char_occurrence, |
492 | -1, |
493 | table_size * sizeof(*bad_char_occurrence)); |
494 | } else { |
495 | for (int i = 0; i < table_size; i++) { |
496 | bad_char_occurrence[i] = start - 1; |
497 | } |
498 | } |
499 | for (int i = start; i < pattern_length - 1; i++) { |
500 | PatternChar c = pattern_[i]; |
501 | int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); |
502 | bad_char_occurrence[bucket] = i; |
503 | } |
504 | } |
505 | |
506 | //--------------------------------------------------------------------- |
507 | // Linear string search with bailout to BMH. |
508 | //--------------------------------------------------------------------- |
509 | |
510 | // Simple linear search for short patterns, which bails out if the string |
511 | // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
512 | template <typename PatternChar, typename SubjectChar> |
513 | int StringSearch<PatternChar, SubjectChar>::InitialSearch( |
514 | StringSearch<PatternChar, SubjectChar>* search, |
515 | Vector<const SubjectChar> subject, |
516 | int index) { |
517 | Vector<const PatternChar> pattern = search->pattern_; |
518 | int pattern_length = pattern.length(); |
519 | // Badness is a count of how much work we have done. When we have |
520 | // done enough work we decide it's probably worth switching to a better |
521 | // algorithm. |
522 | int badness = -10 - (pattern_length << 2); |
523 | |
524 | // We know our pattern is at least 2 characters, we cache the first so |
525 | // the common case of the first character not matching is faster. |
526 | for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { |
527 | badness++; |
528 | if (badness <= 0) { |
529 | i = FindFirstCharacter(pattern, subject, i); |
530 | if (i == -1) return -1; |
531 | DCHECK_LE(i, n); |
532 | int j = 1; |
533 | do { |
534 | if (pattern[j] != subject[i + j]) { |
535 | break; |
536 | } |
537 | j++; |
538 | } while (j < pattern_length); |
539 | if (j == pattern_length) { |
540 | return i; |
541 | } |
542 | badness += j; |
543 | } else { |
544 | search->PopulateBoyerMooreHorspoolTable(); |
545 | search->strategy_ = &BoyerMooreHorspoolSearch; |
546 | return BoyerMooreHorspoolSearch(search, subject, i); |
547 | } |
548 | } |
549 | return -1; |
550 | } |
551 | |
552 | |
553 | // Perform a a single stand-alone search. |
554 | // If searching multiple times for the same pattern, a search |
555 | // object should be constructed once and the Search function then called |
556 | // for each search. |
557 | template <typename SubjectChar, typename PatternChar> |
558 | int SearchString(Isolate* isolate, |
559 | Vector<const SubjectChar> subject, |
560 | Vector<const PatternChar> pattern, |
561 | int start_index) { |
562 | StringSearch<PatternChar, SubjectChar> search(isolate, pattern); |
563 | return search.Search(subject, start_index); |
564 | } |
565 | |
566 | // A wrapper function around SearchString that wraps raw pointers to the subject |
567 | // and pattern as vectors before calling SearchString. Used from the |
568 | // StringIndexOf builtin. |
569 | template <typename SubjectChar, typename PatternChar> |
570 | intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr, |
571 | int subject_length, const PatternChar* pattern_ptr, |
572 | int pattern_length, int start_index) { |
573 | DisallowHeapAllocation no_gc; |
574 | Vector<const SubjectChar> subject(subject_ptr, subject_length); |
575 | Vector<const PatternChar> pattern(pattern_ptr, pattern_length); |
576 | return SearchString(isolate, subject, pattern, start_index); |
577 | } |
578 | |
579 | } // namespace internal |
580 | } // namespace v8 |
581 | |
582 | #endif // V8_STRING_SEARCH_H_ |
583 | |