1// Copyright 2014 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_UNICODE_DECODER_H_
6#define V8_UNICODE_DECODER_H_
7
8#include <sys/types.h>
9#include <algorithm>
10#include "src/globals.h"
11#include "src/memcopy.h"
12#include "src/unicode.h"
13#include "src/vector.h"
14
15namespace unibrow {
16
17class Utf8Iterator {
18 public:
19 explicit Utf8Iterator(const v8::internal::Vector<const char>& stream)
20 : Utf8Iterator(stream, 0, false) {}
21 Utf8Iterator(const v8::internal::Vector<const char>& stream, size_t offset,
22 bool trailing)
23 : stream_(stream),
24 cursor_(offset),
25 offset_(0),
26 char_(0),
27 trailing_(false) {
28 DCHECK_LE(offset, stream.length());
29 // Read the first char, setting offset_ to offset in the process.
30 ++*this;
31
32 // This must be set after reading the first char, since the offset marks
33 // the start of the octet sequence that the trailing char is part of.
34 trailing_ = trailing;
35 if (trailing) {
36 DCHECK_GT(char_, Utf16::kMaxNonSurrogateCharCode);
37 }
38 }
39
40 uint16_t operator*();
41 Utf8Iterator& operator++();
42 Utf8Iterator operator++(int);
43 bool Done();
44 bool Trailing() { return trailing_; }
45 size_t Offset() { return offset_; }
46
47 private:
48 const v8::internal::Vector<const char>& stream_;
49 size_t cursor_;
50 size_t offset_;
51 uint32_t char_;
52 bool trailing_;
53};
54
55class V8_EXPORT_PRIVATE Utf8DecoderBase {
56 public:
57 // Initialization done in subclass.
58 inline Utf8DecoderBase();
59 inline Utf8DecoderBase(uint16_t* buffer, size_t buffer_length,
60 const v8::internal::Vector<const char>& stream);
61 inline size_t Utf16Length() const { return utf16_length_; }
62
63 protected:
64 // This reads all characters and sets the utf16_length_.
65 // The first buffer_length utf16 chars are cached in the buffer.
66 void Reset(uint16_t* buffer, size_t buffer_length,
67 const v8::internal::Vector<const char>& vector);
68 static void WriteUtf16Slow(uint16_t* data, size_t length,
69 const v8::internal::Vector<const char>& stream,
70 size_t offset, bool trailing);
71
72 size_t bytes_read_;
73 size_t chars_written_;
74 size_t utf16_length_;
75 bool trailing_;
76
77 private:
78 DISALLOW_COPY_AND_ASSIGN(Utf8DecoderBase);
79};
80
81template <size_t kBufferSize>
82class Utf8Decoder : public Utf8DecoderBase {
83 public:
84 inline Utf8Decoder() = default;
85 explicit inline Utf8Decoder(const v8::internal::Vector<const char>& stream);
86 inline void Reset(const v8::internal::Vector<const char>& stream);
87 inline size_t WriteUtf16(
88 uint16_t* data, size_t length,
89 const v8::internal::Vector<const char>& stream) const;
90
91 private:
92 uint16_t buffer_[kBufferSize];
93};
94
95Utf8DecoderBase::Utf8DecoderBase()
96 : bytes_read_(0), chars_written_(0), utf16_length_(0), trailing_(false) {}
97
98Utf8DecoderBase::Utf8DecoderBase(
99 uint16_t* buffer, size_t buffer_length,
100 const v8::internal::Vector<const char>& stream) {
101 Reset(buffer, buffer_length, stream);
102}
103
104template <size_t kBufferSize>
105Utf8Decoder<kBufferSize>::Utf8Decoder(
106 const v8::internal::Vector<const char>& stream)
107 : Utf8DecoderBase(buffer_, kBufferSize, stream) {}
108
109template <size_t kBufferSize>
110void Utf8Decoder<kBufferSize>::Reset(
111 const v8::internal::Vector<const char>& stream) {
112 Utf8DecoderBase::Reset(buffer_, kBufferSize, stream);
113}
114
115template <size_t kBufferSize>
116size_t Utf8Decoder<kBufferSize>::WriteUtf16(
117 uint16_t* data, size_t data_length,
118 const v8::internal::Vector<const char>& stream) const {
119 DCHECK_GT(data_length, 0);
120 data_length = std::min(data_length, utf16_length_);
121
122 // memcpy everything in buffer.
123 size_t memcpy_length = std::min(data_length, chars_written_);
124 v8::internal::MemCopy(data, buffer_, memcpy_length * sizeof(uint16_t));
125
126 if (data_length <= chars_written_) return data_length;
127
128 // Copy the rest the slow way.
129 WriteUtf16Slow(data + chars_written_, data_length - chars_written_, stream,
130 bytes_read_, trailing_);
131 return data_length;
132}
133
// Helpers for the Latin-1 character range.
class Latin1 {
 public:
  // Upper bound of the Latin-1 range.
  static const unsigned kMaxChar = 0xff;

  // Maps |c| to its Latin-1 case equivalent when one exists; every other
  // value is returned unchanged.
  static inline uint16_t TryConvertToLatin1(uint16_t c);
};

uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
  // 0x39c and 0x3bc are equivalent characters in Unicode; both map to 0xb5.
  if (c == 0x39c || c == 0x3bc) return 0xb5;

  // 0x178 is the uppercase form of the Latin-1 character 0xff, but lies
  // outside of Latin-1 itself.
  if (c == 0x178) return 0xff;

  return c;
}
154
155
156} // namespace unibrow
157
158#endif // V8_UNICODE_DECODER_H_
159