1 | // Copyright 2007-2010 the V8 project authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be |
3 | // found in the LICENSE file. |
4 | |
5 | #ifndef V8_UNICODE_INL_H_ |
6 | #define V8_UNICODE_INL_H_ |
7 | |
8 | #include "src/unicode.h" |
9 | #include "src/base/logging.h" |
10 | #include "src/utils.h" |
11 | |
12 | namespace unibrow { |
13 | |
14 | #ifndef V8_INTL_SUPPORT |
15 | template <class T, int s> bool Predicate<T, s>::get(uchar code_point) { |
16 | CacheEntry entry = entries_[code_point & kMask]; |
17 | if (entry.code_point() == code_point) return entry.value(); |
18 | return CalculateValue(code_point); |
19 | } |
20 | |
21 | template <class T, int s> bool Predicate<T, s>::CalculateValue( |
22 | uchar code_point) { |
23 | bool result = T::Is(code_point); |
24 | entries_[code_point & kMask] = CacheEntry(code_point, result); |
25 | return result; |
26 | } |
27 | |
28 | template <class T, int s> int Mapping<T, s>::get(uchar c, uchar n, |
29 | uchar* result) { |
30 | CacheEntry entry = entries_[c & kMask]; |
31 | if (entry.code_point_ == c) { |
32 | if (entry.offset_ == 0) { |
33 | return 0; |
34 | } else { |
35 | result[0] = c + entry.offset_; |
36 | return 1; |
37 | } |
38 | } else { |
39 | return CalculateValue(c, n, result); |
40 | } |
41 | } |
42 | |
43 | template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n, |
44 | uchar* result) { |
45 | bool allow_caching = true; |
46 | int length = T::Convert(c, n, result, &allow_caching); |
47 | if (allow_caching) { |
48 | if (length == 1) { |
49 | entries_[c & kMask] = CacheEntry(c, result[0] - c); |
50 | return 1; |
51 | } else { |
52 | entries_[c & kMask] = CacheEntry(c, 0); |
53 | return 0; |
54 | } |
55 | } else { |
56 | return length; |
57 | } |
58 | } |
59 | #endif // !V8_INTL_SUPPORT |
60 | |
61 | // Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they |
62 | // stream in. This **must** be followed by a call to ValueOfIncrementalFinish |
63 | // when the stream is complete, to ensure incomplete sequences are handled. |
64 | uchar Utf8::ValueOfIncremental(const byte** cursor, State* state, |
65 | Utf8IncrementalBuffer* buffer) { |
66 | DCHECK_NOT_NULL(buffer); |
67 | State old_state = *state; |
68 | byte next = **cursor; |
69 | *cursor += 1; |
70 | |
71 | if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) { |
72 | DCHECK_EQ(0u, *buffer); |
73 | return static_cast<uchar>(next); |
74 | } |
75 | |
76 | // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation |
77 | // char in that sequence. |
78 | Utf8DfaDecoder::Decode(next, state, buffer); |
79 | |
80 | switch (*state) { |
81 | case State::kAccept: { |
82 | uchar t = *buffer; |
83 | *buffer = 0; |
84 | return t; |
85 | } |
86 | |
87 | case State::kReject: |
88 | *state = State::kAccept; |
89 | *buffer = 0; |
90 | |
91 | // If we hit a bad byte, we need to determine if we were trying to start |
92 | // a sequence or continue one. If we were trying to start a sequence, |
93 | // that means it's just an invalid lead byte and we need to continue to |
94 | // the next (which we already did above). If we were already in a |
95 | // sequence, we need to reprocess this same byte after resetting to the |
96 | // initial state. |
97 | if (old_state != State::kAccept) { |
98 | // We were trying to continue a sequence, so let's reprocess this byte |
99 | // next time. |
100 | *cursor -= 1; |
101 | } |
102 | return kBadChar; |
103 | |
104 | default: |
105 | return kIncomplete; |
106 | } |
107 | } |
108 | |
109 | unsigned Utf8::EncodeOneByte(char* str, uint8_t c) { |
110 | static const int kMask = ~(1 << 6); |
111 | if (c <= kMaxOneByteChar) { |
112 | str[0] = c; |
113 | return 1; |
114 | } |
115 | str[0] = 0xC0 | (c >> 6); |
116 | str[1] = 0x80 | (c & kMask); |
117 | return 2; |
118 | } |
119 | |
120 | // Encode encodes the UTF-16 code units c and previous into the given str |
121 | // buffer, and combines surrogate code units into single code points. If |
122 | // replace_invalid is set to true, orphan surrogate code units will be replaced |
123 | // with kBadChar. |
124 | unsigned Utf8::Encode(char* str, |
125 | uchar c, |
126 | int previous, |
127 | bool replace_invalid) { |
128 | static const int kMask = ~(1 << 6); |
129 | if (c <= kMaxOneByteChar) { |
130 | str[0] = c; |
131 | return 1; |
132 | } else if (c <= kMaxTwoByteChar) { |
133 | str[0] = 0xC0 | (c >> 6); |
134 | str[1] = 0x80 | (c & kMask); |
135 | return 2; |
136 | } else if (c <= kMaxThreeByteChar) { |
137 | DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter)); |
138 | if (Utf16::IsSurrogatePair(previous, c)) { |
139 | const int kUnmatchedSize = kSizeOfUnmatchedSurrogate; |
140 | return Encode(str - kUnmatchedSize, |
141 | Utf16::CombineSurrogatePair(previous, c), |
142 | Utf16::kNoPreviousCharacter, |
143 | replace_invalid) - kUnmatchedSize; |
144 | } else if (replace_invalid && |
145 | (Utf16::IsLeadSurrogate(c) || |
146 | Utf16::IsTrailSurrogate(c))) { |
147 | c = kBadChar; |
148 | } |
149 | str[0] = 0xE0 | (c >> 12); |
150 | str[1] = 0x80 | ((c >> 6) & kMask); |
151 | str[2] = 0x80 | (c & kMask); |
152 | return 3; |
153 | } else { |
154 | str[0] = 0xF0 | (c >> 18); |
155 | str[1] = 0x80 | ((c >> 12) & kMask); |
156 | str[2] = 0x80 | ((c >> 6) & kMask); |
157 | str[3] = 0x80 | (c & kMask); |
158 | return 4; |
159 | } |
160 | } |
161 | |
162 | |
163 | uchar Utf8::ValueOf(const byte* bytes, size_t length, size_t* cursor) { |
164 | if (length <= 0) return kBadChar; |
165 | byte first = bytes[0]; |
166 | // Characters between 0000 and 007F are encoded as a single character |
167 | if (V8_LIKELY(first <= kMaxOneByteChar)) { |
168 | *cursor += 1; |
169 | return first; |
170 | } |
171 | return CalculateValue(bytes, length, cursor); |
172 | } |
173 | |
174 | unsigned Utf8::Length(uchar c, int previous) { |
175 | if (c <= kMaxOneByteChar) { |
176 | return 1; |
177 | } else if (c <= kMaxTwoByteChar) { |
178 | return 2; |
179 | } else if (c <= kMaxThreeByteChar) { |
180 | DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter)); |
181 | if (Utf16::IsSurrogatePair(previous, c)) { |
182 | return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates; |
183 | } |
184 | return 3; |
185 | } else { |
186 | return 4; |
187 | } |
188 | } |
189 | |
190 | bool Utf8::IsValidCharacter(uchar c) { |
191 | return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) || |
192 | (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu && |
193 | c != kBadChar); |
194 | } |
195 | |
196 | } // namespace unibrow |
197 | |
198 | #endif // V8_UNICODE_INL_H_ |
199 | |