1 | // Copyright 2014 the V8 project authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be |
3 | // found in the LICENSE file. |
4 | |
5 | #ifndef V8_UNICODE_DECODER_H_ |
6 | #define V8_UNICODE_DECODER_H_ |
7 | |
8 | #include <sys/types.h> |
9 | #include <algorithm> |
10 | #include "src/globals.h" |
11 | #include "src/memcopy.h" |
12 | #include "src/unicode.h" |
13 | #include "src/vector.h" |
14 | |
15 | namespace unibrow { |
16 | |
17 | class Utf8Iterator { |
18 | public: |
19 | explicit Utf8Iterator(const v8::internal::Vector<const char>& stream) |
20 | : Utf8Iterator(stream, 0, false) {} |
21 | Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset, |
22 | bool trailing) |
23 | : stream_(stream), |
24 | cursor_(offset), |
25 | offset_(0), |
26 | char_(0), |
27 | trailing_(false) { |
28 | DCHECK_LE(offset, stream.length()); |
29 | // Read the first char, setting offset_ to offset in the process. |
30 | ++*this; |
31 | |
32 | // This must be set after reading the first char, since the offset marks |
33 | // the start of the octet sequence that the trailing char is part of. |
34 | trailing_ = trailing; |
35 | if (trailing) { |
36 | DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode); |
37 | } |
38 | } |
39 | |
40 | uint16_t operator*(); |
41 | Utf8Iterator& operator++(); |
42 | Utf8Iterator operator++(int); |
43 | bool Done(); |
44 | bool Trailing() { return trailing_; } |
45 | size_t Offset() { return offset_; } |
46 | |
47 | private: |
48 | const v8::internal::Vector<const char>& stream_; |
49 | size_t cursor_; |
50 | size_t offset_; |
51 | uint32_t char_; |
52 | bool trailing_; |
53 | }; |
54 | |
55 | class V8_EXPORT_PRIVATE Utf8DecoderBase { |
56 | public: |
57 | // Initialization done in subclass. |
58 | inline Utf8DecoderBase(); |
59 | inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length, |
60 | const v8::internal::Vector<const char>& stream); |
61 | inline size_t Utf16Length() const { return utf16_length_; } |
62 | |
63 | protected: |
64 | // This reads all characters and sets the utf16_length_. |
65 | // The first buffer_length utf16 chars are cached in the buffer. |
66 | void Reset(uint16_t* buffer, size_t buffer_length, |
67 | const v8::internal::Vector<const char>& vector); |
68 | static void WriteUtf16Slow(uint16_t* data, size_t length, |
69 | const v8::internal::Vector<const char>& stream, |
70 | size_t offset, bool trailing); |
71 | |
72 | size_t bytes_read_; |
73 | size_t chars_written_; |
74 | size_t utf16_length_; |
75 | bool trailing_; |
76 | |
77 | private: |
78 | DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase); |
79 | }; |
80 | |
81 | template <size_t kBufferSize> |
82 | class Utf8Decoder : public Utf8DecoderBase { |
83 | public: |
84 | inline Utf8Decoder() = default; |
85 | explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream); |
86 | inline void Reset(const v8::internal::Vector<const char>& stream); |
87 | inline size_t WriteUtf16( |
88 | uint16_t* data, size_t length, |
89 | const v8::internal::Vector<const char>& stream) const; |
90 | |
91 | private: |
92 | uint16_t buffer_[kBufferSize]; |
93 | }; |
94 | |
95 | Utf8DecoderBase::Utf8DecoderBase() |
96 | : bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {} |
97 | |
98 | Utf8DecoderBase::Utf8DecoderBase( |
99 | uint16_t* buffer, size_t buffer_length, |
100 | const v8::internal::Vector<const char>& stream) { |
101 | Reset(buffer, buffer_length, stream); |
102 | } |
103 | |
104 | template <size_t kBufferSize> |
105 | Utf8Decoder<kBufferSize>::Utf8Decoder( |
106 | const v8::internal::Vector<const char>& stream) |
107 | : Utf8DecoderBase(buffer_, kBufferSize, stream) {} |
108 | |
109 | template <size_t kBufferSize> |
110 | void Utf8Decoder<kBufferSize>::Reset( |
111 | const v8::internal::Vector<const char>& stream) { |
112 | Utf8DecoderBase::Reset(buffer_, kBufferSize, stream); |
113 | } |
114 | |
115 | template <size_t kBufferSize> |
116 | size_t Utf8Decoder<kBufferSize>::WriteUtf16( |
117 | uint16_t* data, size_t data_length, |
118 | const v8::internal::Vector<const char>& stream) const { |
119 | DCHECK_GT(data_length, 0); |
120 | data_length = std::min(data_length, utf16_length_); |
121 | |
122 | // memcpy everything in buffer. |
123 | size_t memcpy_length = std::min(data_length, chars_written_); |
124 | v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t)); |
125 | |
126 | if (data_length <= chars_written_) return data_length; |
127 | |
128 | // Copy the rest the slow way. |
129 | WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream, |
130 | bytes_read_, trailing_); |
131 | return data_length; |
132 | } |
133 | |
// Helpers for the Latin-1 (one byte per character) range.
class Latin1 {
 public:
  // Largest code point that is representable in Latin-1.
  static const unsigned kMaxChar = 0xff;
  // Convert the character to Latin-1 case equivalent if possible.
  // Characters without such an equivalent are returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t);
};

uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
  // These are equivalent characters in Unicode: GREEK CAPITAL LETTER MU
  // (U+039C) and GREEK SMALL LETTER MU (U+03BC) both map to MICRO SIGN
  // (U+00B5).
  if (c == 0x39c || c == 0x3bc) return 0xb5;
  // U+0178 is the uppercase form of the Latin-1 character U+00FF, but lies
  // outside of Latin-1 itself.
  if (c == 0x178) return 0xff;
  // Everything else is passed through untouched.
  return c;
}
154 | |
155 | |
156 | } // namespace unibrow |
157 | |
158 | #endif // V8_UNICODE_DECODER_H_ |
159 | |