1 | // Copyright 2017 the V8 project authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be |
3 | // found in the LICENSE file. |
4 | |
5 | #ifndef V8_OBJECTS_STRING_H_ |
6 | #define V8_OBJECTS_STRING_H_ |
7 | |
8 | #include "src/base/bits.h" |
9 | #include "src/base/export-template.h" |
10 | #include "src/objects/instance-type.h" |
11 | #include "src/objects/name.h" |
12 | #include "src/objects/smi.h" |
13 | #include "src/unicode-decoder.h" |
14 | |
15 | // Has to be the last include (doesn't have include guards): |
16 | #include "src/objects/object-macros.h" |
17 | |
18 | namespace v8 { |
19 | namespace internal { |
20 | |
// Forward declaration of the instance-type enumeration, which has a fixed
// uint16_t underlying type.
enum InstanceType : uint16_t;

// Flag taken by String::ToCString. The ToCString documentation notes that the
// result is null terminated but "may optionally contain nulls".
// NOTE(review): the exact treatment of embedded nulls under DISALLOW_NULLS is
// decided by the ToCString implementation, which is not visible here.
enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS };
// Flag taken by String::ToCString. ROBUST_STRING_TRAVERSAL requests a
// traversal that handles unexpected data without assert failures and without
// heap allocation (useful when printing stack traces).
enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL };
25 | |
26 | // The characteristics of a string are stored in its map. Retrieving these |
27 | // few bits of information is moderately expensive, involving two memory |
28 | // loads where the second is dependent on the first. To improve efficiency |
29 | // the shape of the string is given its own class so that it can be retrieved |
30 | // once and used for several string operations. A StringShape is small enough |
31 | // to be passed by value and is immutable, but be aware that flattening a |
32 | // string can potentially alter its shape. Also be aware that a GC caused by |
33 | // something else can alter the shape of a string due to ConsString |
34 | // shortcutting. Keeping these restrictions in mind has proven to be error- |
35 | // prone and so we no longer put StringShapes in variables unless there is a |
36 | // concrete performance benefit at that particular point in the code. |
class StringShape {
 public:
  // A shape can be built from a string, from a map, or from an instance type.
  // The constructors are defined inline outside this header.
  inline explicit StringShape(const String s);
  inline explicit StringShape(Map s);
  inline explicit StringShape(InstanceType t);

  // Predicates on the captured shape (defined inline elsewhere).
  inline bool IsSequential();
  inline bool IsExternal();
  inline bool IsCons();
  inline bool IsSliced();
  inline bool IsThin();
  inline bool IsIndirect();
  inline bool IsExternalOneByte();
  inline bool IsExternalTwoByte();
  inline bool IsSequentialOneByte();
  inline bool IsSequentialTwoByte();
  inline bool IsInternalized();

  // Tag accessors (defined inline elsewhere).
  inline StringRepresentationTag representation_tag();
  inline uint32_t encoding_tag();
  inline uint32_t full_representation_tag();
#ifdef DEBUG
  // Debug builds expose the raw type bits and carry a validity flag that
  // invalidate() clears and valid() reports.
  inline uint32_t type() { return type_; }
  inline void invalidate() { valid_ = false; }
  inline bool valid() { return valid_; }
#else
  // Release builds have no validity flag, so invalidate() does nothing.
  inline void invalidate() {}
#endif

 private:
  // Type bits captured by the constructors.
  // NOTE(review): presumably the instance type of the string's map; confirm
  // against the inline constructor definitions.
  uint32_t type_;
#ifdef DEBUG
  // Marks the shape as valid; the flag only exists in debug builds.
  inline void set_valid() { valid_ = true; }
  bool valid_;
#else
  // No-op counterpart so callers need no #ifdef of their own.
  inline void set_valid() {}
#endif
};
73 | |
74 | // The String abstract class captures JavaScript string values: |
75 | // |
76 | // Ecma-262: |
77 | // 4.3.16 String Value |
78 | // A string value is a member of the type String and is a finite |
79 | // ordered sequence of zero or more 16-bit unsigned integer values. |
80 | // |
81 | // All string values have a length field. |
82 | class String : public Name { |
83 | public: |
// The two character encodings a string can use: one byte or two bytes per
// character.
enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING };
85 | |
86 | // Representation of the flat content of a String. |
87 | // A non-flat string doesn't have flat content. |
88 | // A flat string has content that's encoded as a sequence of either |
89 | // one-byte chars or two-byte UC16. |
90 | // Returned by String::GetFlatContent(). |
91 | class FlatContent { |
92 | public: |
93 | // Returns true if the string is flat and this structure contains content. |
94 | bool IsFlat() const { return state_ != NON_FLAT; } |
95 | // Returns true if the structure contains one-byte content. |
96 | bool IsOneByte() const { return state_ == ONE_BYTE; } |
97 | // Returns true if the structure contains two-byte content. |
98 | bool IsTwoByte() const { return state_ == TWO_BYTE; } |
99 | |
100 | // Return the one byte content of the string. Only use if IsOneByte() |
101 | // returns true. |
102 | Vector<const uint8_t> ToOneByteVector() const { |
103 | DCHECK_EQ(ONE_BYTE, state_); |
104 | return Vector<const uint8_t>(onebyte_start, length_); |
105 | } |
106 | // Return the two-byte content of the string. Only use if IsTwoByte() |
107 | // returns true. |
108 | Vector<const uc16> ToUC16Vector() const { |
109 | DCHECK_EQ(TWO_BYTE, state_); |
110 | return Vector<const uc16>(twobyte_start, length_); |
111 | } |
112 | |
113 | uc16 Get(int i) const { |
114 | DCHECK(i < length_); |
115 | DCHECK(state_ != NON_FLAT); |
116 | if (state_ == ONE_BYTE) return onebyte_start[i]; |
117 | return twobyte_start[i]; |
118 | } |
119 | |
120 | bool UsesSameString(const FlatContent& other) const { |
121 | return onebyte_start == other.onebyte_start; |
122 | } |
123 | |
124 | private: |
125 | enum State { NON_FLAT, ONE_BYTE, TWO_BYTE }; |
126 | |
127 | // Constructors only used by String::GetFlatContent(). |
128 | explicit FlatContent(const uint8_t* start, int length) |
129 | : onebyte_start(start), length_(length), state_(ONE_BYTE) {} |
130 | explicit FlatContent(const uc16* start, int length) |
131 | : twobyte_start(start), length_(length), state_(TWO_BYTE) {} |
132 | FlatContent() : onebyte_start(nullptr), length_(0), state_(NON_FLAT) {} |
133 | |
134 | union { |
135 | const uint8_t* onebyte_start; |
136 | const uc16* twobyte_start; |
137 | }; |
138 | int length_; |
139 | State state_; |
140 | |
141 | friend class String; |
142 | friend class IterableSubString; |
143 | }; |
144 | |
// Returns the string's characters as a vector of Char.
// NOTE(review): presumably requires a flat string whose encoding matches
// Char, and the result is presumably only usable while the
// DisallowHeapAllocation scope passed in is alive; confirm against the inline
// definition.
template <typename Char>
V8_INLINE Vector<const Char> GetCharVector(
    const DisallowHeapAllocation& no_gc);

// Get and set the length of the string.
inline int length() const;
inline void set_length(int value);

// Get and set the length of the string using acquire loads and release
// stores.
inline int synchronized_length() const;
inline void synchronized_set_length(int value);

// Returns whether this string has only one-byte chars, i.e. all of them can
// be one-byte encoded. This might be the case even if the string is
// two-byte. Such strings may appear when the embedder prefers
// two-byte external representations even for one-byte data.
inline bool IsOneByteRepresentation() const;
inline bool IsTwoByteRepresentation() const;

// Cons and slices have an encoding flag that may not represent the actual
// encoding of the underlying string. This is taken into account here.
// This function is static because that helps it get inlined.
// Requires: string.IsFlat()
static inline bool IsOneByteRepresentationUnderneath(String string);

// Set an individual two-byte char in the string.
inline void Set(int index, uint16_t value);
// Get an individual two-byte char in the string. Repeated calls
// to this method are not efficient unless the string is flat.
V8_INLINE uint16_t Get(int index);

// ES6 section 7.1.3.1 ToNumber Applied to the String Type
static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject);
179 | |
180 | // Flattens the string. Checks first inline to see if it is |
181 | // necessary. Does nothing if the string is not a cons string. |
182 | // Flattening allocates a sequential string with the same data as |
183 | // the given string and mutates the cons string to a degenerate |
184 | // form, where the first component is the new sequential string and |
185 | // the second component is the empty string. If allocation fails, |
186 | // this function returns a failure. If flattening succeeds, this |
187 | // function returns the sequential string that is now the first |
188 | // component of the cons string. |
189 | // |
190 | // Degenerate cons strings are handled specially by the garbage |
191 | // collector (see IsShortcutCandidate). |
192 | |
// |allocation| defaults to AllocationType::kYoung.
// NOTE(review): presumably selects where the flattened sequential copy is
// allocated; confirm against the inline definition.
static inline Handle<String> Flatten(
    Isolate* isolate, Handle<String> string,
    AllocationType allocation = AllocationType::kYoung);

// Tries to return the content of a flat string as a structure holding either
// a flat vector of char or of uc16.
// If the string isn't flat, and therefore doesn't have flat content, the
// returned structure will report so, and can't provide a vector of either
// kind.
V8_EXPORT_PRIVATE FlatContent
GetFlatContent(const DisallowHeapAllocation& no_gc);

// Returns the parent of a sliced string or first part of a flat cons string.
// Requires: StringShape(this).IsIndirect() && this->IsFlat()
inline String GetUnderlying();

// String relational comparison, implemented according to ES6 section 7.2.11
// Abstract Relational Comparison (step 5): The comparison of Strings uses a
// simple lexicographic ordering on sequences of code unit values. There is no
// attempt to use the more complex, semantically oriented definitions of
// character or string equality and collating order defined in the Unicode
// specification. Therefore String values that are canonically equal according
// to the Unicode standard could test as unequal. In effect this algorithm
// assumes that both Strings are already in normalized form. Also, note that
// for strings containing supplementary characters, lexicographic ordering on
// sequences of UTF-16 code unit values differs from that on sequences of code
// point values.
V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate,
                                                      Handle<String> x,
                                                      Handle<String> y);

// Perform ES6 21.1.3.8, including checking arguments.
static Object IndexOf(Isolate* isolate, Handle<Object> receiver,
                      Handle<Object> search, Handle<Object> position);
// Perform string match of pattern on subject, starting at start index.
// Caller must ensure that 0 <= start_index <= sub->length(), as this does not
// check any arguments.
// NOTE(review): "sub" presumably refers to |receiver|; confirm against the
// implementation.
static int IndexOf(Isolate* isolate, Handle<String> receiver,
                   Handle<String> search, int start_index);

// Same parameter shape as the Object-based IndexOf above.
// NOTE(review): presumably implements String.prototype.lastIndexOf including
// argument checking; confirm against the implementation.
static Object LastIndexOf(Isolate* isolate, Handle<Object> receiver,
                          Handle<Object> search, Handle<Object> position);
236 | // Encapsulates logic related to a match and its capture groups as required |
237 | // by GetSubstitution. |
// Abstract interface: every accessor is pure virtual and must be supplied by
// the concrete match implementation handed to GetSubstitution.
class Match {
 public:
  // The matched text, the text before it, and the text after it.
  // NOTE(review): prefix/suffix semantics inferred from the names and from
  // the GetSubstitution use case; confirm against implementers.
  virtual Handle<String> GetMatch() = 0;
  virtual Handle<String> GetPrefix() = 0;
  virtual Handle<String> GetSuffix() = 0;

  // A named capture can be invalid (if it is not specified in the pattern),
  // unmatched (specified but not matched in the current string), and matched.
  enum CaptureState { INVALID, UNMATCHED, MATCHED };

  virtual int CaptureCount() = 0;
  virtual bool HasNamedCaptures() = 0;
  // |capture_exists| is an out-parameter.
  // NOTE(review): presumably set to whether capture |i| participated in the
  // match; confirm against implementers.
  virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0;
  // |state| is an out-parameter that receives one of the CaptureState values.
  virtual MaybeHandle<String> GetNamedCapture(Handle<String> name,
                                              CaptureState* state) = 0;

  // Virtual so that implementations can be destroyed through a Match*.
  virtual ~Match() = default;
};
256 | |
257 | // ES#sec-getsubstitution |
258 | // GetSubstitution(matched, str, position, captures, replacement) |
259 | // Expand the $-expressions in the string and return a new string with |
260 | // the result. |
261 | // A {start_index} can be passed to specify where to start scanning the |
262 | // replacement string. |
V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution(
    Isolate* isolate, Match* match, Handle<String> replacement,
    int start_index = 0);

// String equality operations.
inline bool Equals(String other);
inline static bool Equals(Isolate* isolate, Handle<String> one,
                          Handle<String> two);
// |allow_prefix_match| defaults to false.
// NOTE(review): presumably lets |str| match as a prefix of this string;
// confirm against the implementation.
V8_EXPORT_PRIVATE bool IsUtf8EqualTo(Vector<const char> str,
                                     bool allow_prefix_match = false);

// Dispatches to Is{One,Two}ByteEqualTo.
template <typename Char>
bool IsEqualTo(Vector<const Char> str);

V8_EXPORT_PRIVATE bool IsOneByteEqualTo(Vector<const uint8_t> str);
bool IsTwoByteEqualTo(Vector<const uc16> str);

// Return a UTF8 representation of the string. The string is null
// terminated but may optionally contain nulls. Length is returned
// in length_output if length_output is not a null pointer. The string
// should be nearly flat, otherwise the performance of this method may
// be very slow (quadratic in the length). Setting robustness_flag to
// ROBUST_STRING_TRAVERSAL invokes behaviour that is robust. This means it
// handles unexpected data without causing assert failures and it does not
// do any heap allocations. This is useful when printing stack traces.
std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls,
                                  RobustnessFlag robustness_flag, int offset,
                                  int length, int* length_output = nullptr);
// Overload without |offset| and |length|; every argument has a default.
// NOTE(review): presumably converts the whole string; confirm against the
// implementation.
V8_EXPORT_PRIVATE std::unique_ptr<char[]> ToCString(
    AllowNullsFlag allow_nulls = DISALLOW_NULLS,
    RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL,
    int* length_output = nullptr);

// |index| is an out-parameter.
// NOTE(review): presumably returns whether the string denotes an array index
// and stores it in |index|; confirm against the implementation.
bool ComputeArrayIndex(uint32_t* index);

// Externalization.
V8_EXPORT_PRIVATE bool MakeExternal(
    v8::String::ExternalStringResource* resource);
V8_EXPORT_PRIVATE bool MakeExternal(
    v8::String::ExternalOneByteStringResource* resource);
bool SupportsExternalization();

// Conversion.
inline bool AsArrayIndex(uint32_t* index);
uint32_t inline ToValidIndex(Object number);

// Trimming.
enum TrimMode { kTrim, kTrimStart, kTrimEnd };
static Handle<String> Trim(Isolate* isolate, Handle<String> string,
                           TrimMode mode);
314 | |
DECL_CAST(String)

// Prints the string to the given C stream.
V8_EXPORT_PRIVATE void PrintOn(FILE* out);

// For use during stack traces. Performs rudimentary sanity check.
bool LooksValid();

// Dispatched behavior.
void StringShortPrint(StringStream* accumulator, bool show_details = true);
// |start| defaults to 0 and |end| to -1.
// NOTE(review): -1 presumably means "up to the end of the string"; confirm
// against the implementation.
void PrintUC16(std::ostream& os, int start = 0, int end = -1);  // NOLINT
#if defined(DEBUG) || defined(OBJECT_PRINT)
// Only available in debug or object-printing builds.
char* ToAsciiArray();
#endif
DECL_PRINTER(String)
DECL_VERIFIER(String)

// Defined inline elsewhere. Several members above name it as a precondition.
inline bool IsFlat();
332 | |
333 | DEFINE_FIELD_OFFSET_CONSTANTS(Name::kHeaderSize, |
334 | TORQUE_GENERATED_STRING_FIELDS) |
335 | |
336 | static const int = kSize; |
337 | |
338 | // Max char codes. |
// The *U constants are the unsigned counterparts of the signed ones, for use
// in unsigned comparisons.
static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar;
static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar;
static const int kMaxUtf16CodeUnit = 0xffff;
static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit;
static const uc32 kMaxCodePoint = 0x10ffff;

// Maximal string length.
// The max length is different on 32 and 64 bit platforms. Max length for a
// 32-bit platform is ~268.4M chars. On 64-bit platforms, max length is
// ~1.073B chars. The limit on 64-bit is so that SeqTwoByteString::kMaxSize
// can fit in a 32bit int: 2^31 - 1 is the max positive int, minus one bit as
// each char needs two bytes, subtract 24 bytes for the string header size.

// See include/v8.h for the definition.
static const int kMaxLength = v8::String::kMaxLength;
// Guards that twice the length plus the header still fits in Smi range.
static_assert(kMaxLength <= (Smi::kMaxValue / 2 - kHeaderSize),
              "Unexpected max String length");

// Max length for computing hash. For strings longer than this limit the
// string length is used as the hash value.
static const int kMaxHashCalcLength = 16383;

// Limit for truncation in short printing.
static const int kMaxShortPrintLength = 1024;

// Helper function for flattening strings.
// NOTE(review): presumably copies the characters of |source| in the range
// from |from| to |to| into |sink|; confirm the range convention against the
// implementation.
template <typename sinkchar>
EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
static void WriteToFlat(String source, sinkchar* sink, int from, int to);
368 | |
369 | // The return value may point to the first aligned word containing the first |
370 | // non-one-byte character, rather than directly to the non-one-byte character. |
371 | // If the return value is >= the passed length, the entire string was |
372 | // one-byte. |
373 | static inline int NonAsciiStart(const char* chars, int length) { |
374 | const char* start = chars; |
375 | const char* limit = chars + length; |
376 | |
377 | if (length >= kIntptrSize) { |
378 | // Check unaligned bytes. |
379 | while (!IsAligned(reinterpret_cast<intptr_t>(chars), sizeof(uintptr_t))) { |
380 | if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) { |
381 | return static_cast<int>(chars - start); |
382 | } |
383 | ++chars; |
384 | } |
385 | // Check aligned words. |
386 | DCHECK_EQ(unibrow::Utf8::kMaxOneByteChar, 0x7F); |
387 | const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFF * 0x80; |
388 | while (chars + sizeof(uintptr_t) <= limit) { |
389 | if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) { |
390 | return static_cast<int>(chars - start); |
391 | } |
392 | chars += sizeof(uintptr_t); |
393 | } |
394 | } |
395 | // Check remaining unaligned bytes. |
396 | while (chars < limit) { |
397 | if (static_cast<uint8_t>(*chars) > unibrow::Utf8::kMaxOneByteChar) { |
398 | return static_cast<int>(chars - start); |
399 | } |
400 | ++chars; |
401 | } |
402 | |
403 | return static_cast<int>(chars - start); |
404 | } |
405 | |
406 | static inline bool IsAscii(const char* chars, int length) { |
407 | return NonAsciiStart(chars, length) >= length; |
408 | } |
409 | |
410 | static inline bool IsAscii(const uint8_t* chars, int length) { |
411 | return NonAsciiStart(reinterpret_cast<const char*>(chars), length) >= |
412 | length; |
413 | } |
414 | |
415 | static inline int NonOneByteStart(const uc16* chars, int length) { |
416 | const uc16* limit = chars + length; |
417 | const uc16* start = chars; |
418 | while (chars < limit) { |
419 | if (*chars > kMaxOneByteCharCodeU) return static_cast<int>(chars - start); |
420 | ++chars; |
421 | } |
422 | return static_cast<int>(chars - start); |
423 | } |
424 | |
425 | static inline bool IsOneByte(const uc16* chars, int length) { |
426 | return NonOneByteStart(chars, length) >= length; |
427 | } |
428 | |
// Defined inline elsewhere; |offset| defaults to 0.
// NOTE(review): presumably feeds the flat content of |string| to |visitor|
// and returns a ConsString when the string is not flat; confirm against the
// inline definition.
template <class Visitor>
static inline ConsString VisitFlat(Visitor* visitor, String string,
                                   int offset = 0);

// NOTE(review): presumably returns the positions of line terminators in
// |string|, with |include_ending_line| controlling whether the final line is
// counted; confirm against the implementation.
static Handle<FixedArray> CalculateLineEnds(Isolate* isolate,
                                            Handle<String> string,
                                            bool include_ending_line);
436 | |
private:
// These classes need access to the private slow paths and hash computation
// declared below.
friend class Name;
friend class StringTableInsertionKey;
friend class InternalizedStringKey;

// Out-of-line flattening of a cons string.
// NOTE(review): presumably the slow path behind the inline Flatten(); confirm
// against the inline definition.
V8_EXPORT_PRIVATE static Handle<String> SlowFlatten(
    Isolate* isolate, Handle<ConsString> cons, AllocationType allocation);

// Slow case of String::Equals. This implementation works on any strings
// but it is most efficient on strings that are almost flat.
V8_EXPORT_PRIVATE bool SlowEquals(String other);

// Handle-based counterpart of the SlowEquals above.
V8_EXPORT_PRIVATE static bool SlowEquals(Isolate* isolate, Handle<String> one,
                                         Handle<String> two);

// Slow case of AsArrayIndex.
V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index);

// Compute and set the hash code.
V8_EXPORT_PRIVATE uint32_t ComputeAndSetHash();

OBJECT_CONSTRUCTORS(String, Name);
459 | }; |
460 | |
461 | // clang-format off |
// Explicit instantiation declaration: the uint16_t-sink instantiation of
// String::WriteToFlat is provided by another translation unit, so including
// this header does not instantiate it.
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint16_t* sink, int from, int to);
464 | // clang-format on |
465 | |
// Iterable view over part of a string, exposing begin()/end() iterators.
// NOTE(review): presumably usable in range-based for loops over the
// characters starting at |first|; confirm against the inline definitions.
class SubStringRange {
 public:
  // |first| defaults to 0 and |length| to -1.
  // NOTE(review): -1 presumably means "through the end of the string";
  // confirm against the inline constructor.
  inline SubStringRange(String string, const DisallowHeapAllocation& no_gc,
                        int first = 0, int length = -1);
  class iterator;
  inline iterator begin();
  inline iterator end();

 private:
  String string_;
  int first_;
  int length_;
  // Held by reference: the DisallowHeapAllocation scope passed to the
  // constructor must outlive this range.
  const DisallowHeapAllocation& no_gc_;
};
480 | |
481 | // The SeqString abstract class captures sequential string values. |
// Common base of the sequential string classes below (SeqOneByteString and
// SeqTwoByteString derive from it).
class SeqString : public String {
 public:
  DECL_CAST(SeqString)

  // Truncate the string in-place if possible and return the result.
  // In case of new_length == 0, the empty string is returned without
  // truncating the original string.
  // The result must not be ignored (V8_WARN_UNUSED_RESULT).
  V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
                                                       int new_length);

  OBJECT_CONSTRUCTORS(SeqString, String);
};
494 | |
// String subclass that currently adds nothing beyond the cast and the
// constructors; see the TODO below.
class InternalizedString : public String {
 public:
  DECL_CAST(InternalizedString)
  // TODO(neis): Possibly move some stuff from String here.

  OBJECT_CONSTRUCTORS(InternalizedString, String);
};
502 | |
503 | // The OneByteString class captures sequential one-byte string objects. |
// Each character in the OneByteString is a one-byte character.
505 | class SeqOneByteString : public SeqString { |
506 | public: |
507 | static const bool kHasOneByteEncoding = true; |
508 | |
509 | // Dispatched behavior. |
510 | inline uint16_t SeqOneByteStringGet(int index); |
511 | inline void SeqOneByteStringSet(int index, uint16_t value); |
512 | |
513 | // Get the address of the characters in this string. |
514 | inline Address GetCharsAddress(); |
515 | |
516 | inline uint8_t* GetChars(const DisallowHeapAllocation& no_gc); |
517 | |
518 | // Clear uninitialized padding space. This ensures that the snapshot content |
519 | // is deterministic. |
520 | void clear_padding(); |
521 | |
522 | DECL_CAST(SeqOneByteString) |
523 | |
524 | // Garbage collection support. This method is called by the |
525 | // garbage collector to compute the actual size of an OneByteString |
526 | // instance. |
527 | inline int SeqOneByteStringSize(InstanceType instance_type); |
528 | |
529 | // Computes the size for an OneByteString instance of a given length. |
530 | static int SizeFor(int length) { |
531 | return OBJECT_POINTER_ALIGN(kHeaderSize + length * kCharSize); |
532 | } |
533 | |
534 | // Maximal memory usage for a single sequential one-byte string. |
535 | static const int = kMaxLength; |
536 | static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize); |
537 | STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength); |
538 | |
539 | class BodyDescriptor; |
540 | |
541 | OBJECT_CONSTRUCTORS(SeqOneByteString, SeqString); |
542 | }; |
543 | |
544 | // The TwoByteString class captures sequential unicode string objects. |
545 | // Each character in the TwoByteString is a two-byte uint16_t. |
546 | class SeqTwoByteString : public SeqString { |
547 | public: |
548 | static const bool kHasOneByteEncoding = false; |
549 | |
550 | // Dispatched behavior. |
551 | inline uint16_t SeqTwoByteStringGet(int index); |
552 | inline void SeqTwoByteStringSet(int index, uint16_t value); |
553 | |
554 | // Get the address of the characters in this string. |
555 | inline Address GetCharsAddress(); |
556 | |
557 | inline uc16* GetChars(const DisallowHeapAllocation& no_gc); |
558 | |
559 | // Clear uninitialized padding space. This ensures that the snapshot content |
560 | // is deterministic. |
561 | void clear_padding(); |
562 | |
563 | DECL_CAST(SeqTwoByteString) |
564 | |
565 | // Garbage collection support. This method is called by the |
566 | // garbage collector to compute the actual size of a TwoByteString |
567 | // instance. |
568 | inline int SeqTwoByteStringSize(InstanceType instance_type); |
569 | |
570 | // Computes the size for a TwoByteString instance of a given length. |
571 | static int SizeFor(int length) { |
572 | return OBJECT_POINTER_ALIGN(kHeaderSize + length * kShortSize); |
573 | } |
574 | |
575 | // Maximal memory usage for a single sequential two-byte string. |
576 | static const int = kMaxLength * 2; |
577 | static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize); |
578 | STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >= |
579 | String::kMaxLength); |
580 | |
581 | class BodyDescriptor; |
582 | |
583 | OBJECT_CONSTRUCTORS(SeqTwoByteString, SeqString); |
584 | }; |
585 | |
586 | // The ConsString class describes string values built by using the |
587 | // addition operator on strings. A ConsString is a pair where the |
588 | // first and second components are pointers to other string values. |
589 | // One or both components of a ConsString can be pointers to other |
590 | // ConsStrings, creating a binary tree of ConsStrings where the leaves |
591 | // are non-ConsString string values. The string value represented by |
592 | // a ConsString can be obtained by concatenating the leaf string |
593 | // values in a left-to-right depth-first traversal of the tree. |
class ConsString : public String {
 public:
  // First string of the cons cell.
  inline String first();
  // Doesn't check that the result is a string, even in debug mode.  This is
  // useful during GC where the mark bits confuse the checks.
  inline Object unchecked_first();
  // |mode| defaults to UPDATE_WRITE_BARRIER.
  inline void set_first(Isolate* isolate, String first,
                        WriteBarrierMode mode = UPDATE_WRITE_BARRIER);

  // Second string of the cons cell.
  inline String second();
  // Doesn't check that the result is a string, even in debug mode.  This is
  // useful during GC where the mark bits confuse the checks.
  inline Object unchecked_second();
  // |mode| defaults to UPDATE_WRITE_BARRIER.
  inline void set_second(Isolate* isolate, String second,
                         WriteBarrierMode mode = UPDATE_WRITE_BARRIER);

  // Dispatched behavior.
  V8_EXPORT_PRIVATE uint16_t ConsStringGet(int index);

  DECL_CAST(ConsString)

  // Field offsets generated from the Torque layout, starting after the
  // String header.
  DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
                                TORQUE_GENERATED_CONS_STRING_FIELDS)

  // Minimum length for a cons string.
  static const int kMinLength = 13;

  // Fixed-size body spanning kFirstOffset up to kSize.
  // NOTE(review): presumably tells the GC which part of the object holds
  // tagged pointers; confirm against FixedBodyDescriptor.
  using BodyDescriptor = FixedBodyDescriptor<kFirstOffset, kSize, kSize>;

  DECL_VERIFIER(ConsString)

  OBJECT_CONSTRUCTORS(ConsString, String);
};
629 | |
630 | // The ThinString class describes string objects that are just references |
631 | // to another string object. They are used for in-place internalization when |
632 | // the original string cannot actually be internalized in-place: in these |
633 | // cases, the original string is converted to a ThinString pointing at its |
634 | // internalized version (which is allocated as a new object). |
635 | // In terms of memory layout and most algorithms operating on strings, |
636 | // ThinStrings can be thought of as "one-part cons strings". |
class ThinString : public String {
 public:
  // Actual string that this ThinString refers to.
  inline String actual() const;
  // Returns the referenced object typed as HeapObject rather than String.
  // NOTE(review): presumably skips the String type check, like the
  // unchecked_* accessors on ConsString; confirm against the inline
  // definition.
  inline HeapObject unchecked_actual() const;
  // |mode| defaults to UPDATE_WRITE_BARRIER.
  inline void set_actual(String s,
                         WriteBarrierMode mode = UPDATE_WRITE_BARRIER);

  // Dispatched behavior.
  V8_EXPORT_PRIVATE uint16_t ThinStringGet(int index);

  DECL_CAST(ThinString)
  DECL_VERIFIER(ThinString)

  // Field offsets generated from the Torque layout, starting after the
  // String header.
  DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
                                TORQUE_GENERATED_THIN_STRING_FIELDS)

  // Fixed-size body spanning kActualOffset up to kSize.
  using BodyDescriptor = FixedBodyDescriptor<kActualOffset, kSize, kSize>;

  OBJECT_CONSTRUCTORS(ThinString, String);
};
657 | |
658 | // The Sliced String class describes strings that are substrings of another |
659 | // sequential string. The motivation is to save time and memory when creating |
660 | // a substring. A Sliced String is described as a pointer to the parent, |
661 | // the offset from the start of the parent string and the length. Using |
662 | // a Sliced String therefore requires unpacking of the parent string and |
663 | // adding the offset to the start address. A substring of a Sliced String |
664 | // are not nested since the double indirection is simplified when creating |
665 | // such a substring. |
666 | // Currently missing features are: |
667 | // - handling externalized parent strings |
668 | // - external strings as parent |
669 | // - truncating sliced string to enable otherwise unneeded parent to be GC'ed. |
class SlicedString : public String {
 public:
  // The string this slice is a substring of.
  inline String parent();
  // |mode| defaults to UPDATE_WRITE_BARRIER.
  inline void set_parent(Isolate* isolate, String parent,
                         WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
  // Offset of the slice from the start of the parent string.
  inline int offset() const;
  inline void set_offset(int offset);

  // Dispatched behavior.
  V8_EXPORT_PRIVATE uint16_t SlicedStringGet(int index);

  DECL_CAST(SlicedString)

  // Field offsets generated from the Torque layout, starting after the
  // String header.
  DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
                                TORQUE_GENERATED_SLICED_STRING_FIELDS)

  // Minimum length for a sliced string.
  static const int kMinLength = 13;

  // Fixed-size body spanning kParentOffset up to kSize.
  using BodyDescriptor = FixedBodyDescriptor<kParentOffset, kSize, kSize>;

  DECL_VERIFIER(SlicedString)

  OBJECT_CONSTRUCTORS(SlicedString, String);
};
695 | |
696 | // The ExternalString class describes string values that are backed by |
697 | // a string resource that lies outside the V8 heap. ExternalStrings |
698 | // consist of the length field common to all strings, a pointer to the |
699 | // external resource. It is important to ensure (externally) that the |
700 | // resource is not deallocated while the ExternalString is live in the |
701 | // V8 heap. |
702 | // |
703 | // The API expects that all ExternalStrings are created through the |
704 | // API. Therefore, ExternalStrings should not be used internally. |
class ExternalString : public String {
 public:
  DECL_CAST(ExternalString)

  // Field offsets generated from the Torque layout, starting after the
  // String header.
  DEFINE_FIELD_OFFSET_CONSTANTS(String::kHeaderSize,
                                TORQUE_GENERATED_EXTERNAL_STRING_FIELDS)

  // Size of uncached external strings: the object ends right after the
  // resource field.
  static const int kUncachedSize =
      kResourceOffset + FIELD_SIZE(kResourceOffset);

  // Return whether the external string data pointer is not cached.
  inline bool is_uncached() const;
  // Size in bytes of the external payload.
  int ExternalPayloadSize() const;

  // Used in the serializer/deserializer.
  // These read and write the resource field as a raw address or as a 32-bit
  // value.
  inline Address resource_as_address();
  inline void set_address_as_resource(Address address);
  inline uint32_t resource_as_uint32();
  inline void set_uint32_as_resource(uint32_t value);

  // Disposes string's resource object if it has not already been disposed.
  inline void DisposeResource();

  // The public API (Internals) reads the resource at a hard-coded offset, so
  // the two must agree.
  STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset);

  OBJECT_CONSTRUCTORS(ExternalString, String);
};
734 | |
// The ExternalOneByteString class is an external string backed by a
// one-byte string.
class ExternalOneByteString : public ExternalString {
 public:
  // Compile-time encoding marker; ExternalTwoByteString defines the same
  // constant as false.
  static const bool kHasOneByteEncoding = true;

  using Resource = v8::String::ExternalOneByteStringResource;

  // The underlying resource.
  inline const Resource* resource();

  // It is assumed that the previous resource is null. If it is not null, then
  // it is the responsibility of the caller to handle the previous resource.
  inline void SetResource(Isolate* isolate, const Resource* buffer);
  // Used only during serialization.
  inline void set_resource(const Resource* buffer);

  // Update the pointer cache to the external character array.
  // The cached pointer is always valid, as the external character array does
  // not move during lifetime.  Deserialization is the only exception, after
  // which the pointer cache has to be refreshed.
  inline void update_data_cache();

  // Returns a pointer to the one-byte character data.
  inline const uint8_t* GetChars();

  // Dispatched behavior.
  inline uint16_t ExternalOneByteStringGet(int index);

  DECL_CAST(ExternalOneByteString)

  class BodyDescriptor;

  OBJECT_CONSTRUCTORS(ExternalOneByteString, ExternalString);
};
769 | |
// The ExternalTwoByteString class is an external string backed by a UTF-16
// encoded string.
class ExternalTwoByteString : public ExternalString {
 public:
  // Compile-time encoding tag; the one-byte sibling class sets it to true.
  static const bool kHasOneByteEncoding = false;

  // Embedder-facing resource type that backs strings of this class.
  using Resource = v8::String::ExternalStringResource;

  // The underlying string resource.
  inline const Resource* resource();

  // It is assumed that the previous resource is null. If it is not null, then
  // it is the responsibility of the caller to handle the previous resource.
  inline void SetResource(Isolate* isolate, const Resource* buffer);
  // Used only during serialization.
  inline void set_resource(const Resource* buffer);

  // Update the pointer cache to the external character array.
  // The cached pointer is always valid, as the external character array does
  // not move during lifetime. Deserialization is the only exception, after
  // which the pointer cache has to be refreshed.
  inline void update_data_cache();

  // Returns a pointer to the string's characters as two-byte (uint16_t) units.
  // NOTE(review): presumably points into the external resource's data --
  // confirm in the inline definition.
  inline const uint16_t* GetChars();

  // Dispatched behavior.
  // Returns the character at |index|.
  inline uint16_t ExternalTwoByteStringGet(int index);

  // For regexp code.
  // NOTE(review): presumably returns the character data advanced by |start|
  // code units -- confirm in the inline definition.
  inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);

  DECL_CAST(ExternalTwoByteString)

  // Defined elsewhere; only forward-declared here.
  class BodyDescriptor;

  OBJECT_CONSTRUCTORS(ExternalTwoByteString, ExternalString);
};
807 | |
808 | // A flat string reader provides random access to the contents of a |
809 | // string independent of the character width of the string. The handle |
810 | // must be valid as long as the reader is being used. |
811 | class V8_EXPORT_PRIVATE FlatStringReader : public Relocatable { |
812 | public: |
813 | FlatStringReader(Isolate* isolate, Handle<String> str); |
814 | FlatStringReader(Isolate* isolate, Vector<const char> input); |
815 | void PostGarbageCollection() override; |
816 | inline uc32 Get(int index); |
817 | template <typename Char> |
818 | inline Char Get(int index); |
819 | int length() { return length_; } |
820 | |
821 | private: |
822 | Address* str_; |
823 | bool is_one_byte_; |
824 | int length_; |
825 | const void* start_; |
826 | }; |
827 | |
828 | // This maintains an off-stack representation of the stack frames required |
829 | // to traverse a ConsString, allowing an entirely iterative and restartable |
830 | // traversal of the entire string |
831 | class ConsStringIterator { |
832 | public: |
833 | inline ConsStringIterator() = default; |
834 | inline explicit ConsStringIterator(ConsString cons_string, int offset = 0) { |
835 | Reset(cons_string, offset); |
836 | } |
837 | inline void Reset(ConsString cons_string, int offset = 0) { |
838 | depth_ = 0; |
839 | // Next will always return nullptr. |
840 | if (cons_string.is_null()) return; |
841 | Initialize(cons_string, offset); |
842 | } |
843 | // Returns nullptr when complete. |
844 | inline String Next(int* offset_out) { |
845 | *offset_out = 0; |
846 | if (depth_ == 0) return String(); |
847 | return Continue(offset_out); |
848 | } |
849 | |
850 | private: |
851 | static const int kStackSize = 32; |
852 | // Use a mask instead of doing modulo operations for stack wrapping. |
853 | static const int kDepthMask = kStackSize - 1; |
854 | static_assert(base::bits::IsPowerOfTwo(kStackSize), |
855 | "kStackSize must be power of two" ); |
856 | static inline int OffsetForDepth(int depth); |
857 | |
858 | inline void PushLeft(ConsString string); |
859 | inline void PushRight(ConsString string); |
860 | inline void AdjustMaximumDepth(); |
861 | inline void Pop(); |
862 | inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; } |
863 | V8_EXPORT_PRIVATE void Initialize(ConsString cons_string, int offset); |
864 | V8_EXPORT_PRIVATE String Continue(int* offset_out); |
865 | String NextLeaf(bool* blew_stack); |
866 | String Search(int* offset_out); |
867 | |
868 | // Stack must always contain only frames for which right traversal |
869 | // has not yet been performed. |
870 | ConsString frames_[kStackSize]; |
871 | ConsString root_; |
872 | int depth_; |
873 | int maximum_depth_; |
874 | int consumed_; |
875 | DISALLOW_COPY_AND_ASSIGN(ConsStringIterator); |
876 | }; |
877 | |
// Sequential reader that hands out the characters of a String one uint16_t
// at a time, starting at a given offset.
// NOTE(review): the ConsStringIterator member suggests cons strings are
// walked segment by segment -- confirm in the inline definitions.
class StringCharacterStream {
 public:
  // Creates a stream over |string| that starts at character |offset|.
  inline explicit StringCharacterStream(String string, int offset = 0);
  // Returns the next character.
  // NOTE(review): presumably requires HasMore() to be true -- confirm.
  inline uint16_t GetNext();
  // Returns whether there are characters left to read.
  inline bool HasMore();
  // Re-targets the stream at |string|, starting at character |offset|.
  inline void Reset(String string, int offset = 0);
  // Receive a contiguous run of |length| one-byte / two-byte characters.
  // NOTE(review): presumably invoked as visitor callbacks while flattening
  // the string's representation -- confirm against the caller.
  inline void VisitOneByteString(const uint8_t* chars, int length);
  inline void VisitTwoByteString(const uint16_t* chars, int length);

 private:
  ConsStringIterator iter_;
  // Selects which member of the union below is active.
  bool is_one_byte_;
  // Current read position, typed according to |is_one_byte_|.
  union {
    const uint8_t* buffer8_;
    const uint16_t* buffer16_;
  };
  // End of the current character run, kept as a byte pointer for both widths.
  const uint8_t* end_;
  DISALLOW_COPY_AND_ASSIGN(StringCharacterStream);
};
897 | |
898 | } // namespace internal |
899 | } // namespace v8 |
900 | |
901 | #include "src/objects/object-macros-undef.h" |
902 | |
903 | #endif // V8_OBJECTS_STRING_H_ |
904 | |