UTF8Conversion.cpp source code [jsc/Source/WTF/wtf/unicode/UTF8Conversion.cpp]

1	/*
2	* Copyright (C) 2007, 2014 Apple Inc. All rights reserved.
3	* Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com>
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#include "config.h"
28	#include <wtf/unicode/UTF8Conversion.h>
29
30	#include <wtf/ASCIICType.h>
31	#include <wtf/text/StringHasher.h>
32	#include <wtf/unicode/CharacterNames.h>
33
34	namespace WTF {
35	namespace Unicode {
36
// Returns the total length in bytes (2, 3 or 4) of the UTF-8 sequence announced
// by the lead byte b0, judged only from its high marker bits. Returns 0 when b0
// does not have both top bits set (which includes every ASCII byte and every
// continuation byte 10xxxxxx) or when it announces more than four bytes.
// No other validation is performed here.
inline int inlineUTF8SequenceLengthNonASCII(char b0)
{
    if ((b0 & 0xC0) != 0xC0)
        return 0;
    if ((b0 & 0xE0) == 0xC0)
        return 2;
    if ((b0 & 0xF0) == 0xE0)
        return 3;
    if ((b0 & 0xF8) == 0xF0)
        return 4;
    return 0;
}
49
50	inline int inlineUTF8SequenceLength(char b0)
51	{
52	return isASCII(b0) ? `1` : inlineUTF8SequenceLengthNonASCII(b0);
53	}
54
55	int UTF8SequenceLength(char b0)
56	{
57	return isASCII(b0) ? `1` : inlineUTF8SequenceLengthNonASCII(b0);
58	}
59
60	int decodeUTF8Sequence(const char* sequence)
61	{
62	// Handle 0-byte sequences (never valid).
63	const unsigned char b0 = sequence[`0`];
64	const int length = inlineUTF8SequenceLength(b0);
65	if (length == `0`)
66	return -`1`;
67
68	// Handle 1-byte sequences (plain ASCII).
69	const unsigned char b1 = sequence[`1`];
70	if (length == `1`) {
71	if (b1)
72	return -`1`;
73	return b0;
74	}
75
76	// Handle 2-byte sequences.
77	if ((b1 & `0xC0`) != `0x80`)
78	return -`1`;
79	const unsigned char b2 = sequence[`2`];
80	if (length == `2`) {
81	if (b2)
82	return -`1`;
83	const int c = ((b0 & `0x1F`) << `6`) \| (b1 & `0x3F`);
84	if (c < `0x80`)
85	return -`1`;
86	return c;
87	}
88
89	// Handle 3-byte sequences.
90	if ((b2 & `0xC0`) != `0x80`)
91	return -`1`;
92	const unsigned char b3 = sequence[`3`];
93	if (length == `3`) {
94	if (b3)
95	return -`1`;
96	const int c = ((b0 & `0xF`) << `12`) \| ((b1 & `0x3F`) << `6`) \| (b2 & `0x3F`);
97	if (c < `0x800`)
98	return -`1`;
99	// UTF-16 surrogates should never appear in UTF-8 data.
100	if (c >= `0xD800` && c <= `0xDFFF`)
101	return -`1`;
102	return c;
103	}
104
105	// Handle 4-byte sequences.
106	if ((b3 & `0xC0`) != `0x80`)
107	return -`1`;
108	const unsigned char b4 = sequence[`4`];
109	if (length == `4`) {
110	if (b4)
111	return -`1`;
112	const int c = ((b0 & `0x7`) << `18`) \| ((b1 & `0x3F`) << `12`) \| ((b2 & `0x3F`) << `6`) \| (b3 & `0x3F`);
113	if (c < `0x10000` \|\| c > `0x10FFFF`)
114	return -`1`;
115	return c;
116	}
117
118	return -`1`;
119	}
120
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow. The table is
// indexed by the total sequence length in bytes (index 0 is unused), so
// there is one entry per UTF-8 sequence type (one-byte sequence, two-byte
// sequence, and so on). Remember that sequences for *legal* UTF-8 will be
// 4 or fewer bytes total.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
127
128	ConversionResult convertLatin1ToUTF8(
129	const LChar** sourceStart, const LChar* sourceEnd,
130	char** targetStart, char* targetEnd)
131	{
132	ConversionResult result = conversionOK;
133	const LChar* source = *sourceStart;
134	char* target = *targetStart;
135	while (source < sourceEnd) {
136	UChar32 ch;
137	unsigned short bytesToWrite = `0`;
138	const UChar32 byteMask = `0xBF`;
139	const UChar32 byteMark = `0x80`;
140	const LChar* oldSource = source; // In case we have to back up because of target overflow.
141	ch = static_cast<unsigned short>(*source++);
142
143	// Figure out how many bytes the result will require
144	if (ch < (UChar32)`0x80`)
145	bytesToWrite = `1`;
146	else
147	bytesToWrite = `2`;
148
149	target += bytesToWrite;
150	if (target > targetEnd) {
151	source = oldSource; // Back up source pointer!
152	target -= bytesToWrite;
153	result = targetExhausted;
154	break;
155	}
156	switch (bytesToWrite) { // note: everything falls through.
157	case `2`:
158	--target = (char*)((ch \| byteMark) & byteMask);
159	ch >>= `6`;
160	FALLTHROUGH;
161	case `1`:
162	--target = (char*)(ch \| firstByteMark[bytesToWrite]);
163	}
164	target += bytesToWrite;
165	}
166	*sourceStart = source;
167	*targetStart = target;
168	return result;
169	}
170
171	ConversionResult convertUTF16ToUTF8(
172	const UChar** sourceStart, const UChar* sourceEnd,
173	char** targetStart, char* targetEnd, bool strict)
174	{
175	ConversionResult result = conversionOK;
176	const UChar* source = *sourceStart;
177	char* target = *targetStart;
178	while (source < sourceEnd) {
179	UChar32 ch;
180	unsigned short bytesToWrite = `0`;
181	const UChar32 byteMask = `0xBF`;
182	const UChar32 byteMark = `0x80`;
183	const UChar* oldSource = source; // In case we have to back up because of target overflow.
184	ch = static_cast<unsigned short>(*source++);
185	// If we have a surrogate pair, convert to UChar32 first.
186	if (ch >= `0xD800` && ch <= `0xDBFF`) {
187	// If the 16 bits following the high surrogate are in the source buffer...
188	if (source < sourceEnd) {
189	UChar32 ch2 = static_cast<unsigned short>(*source);
190	// If it's a low surrogate, convert to UChar32.
191	if (ch2 >= `0xDC00` && ch2 <= `0xDFFF`) {
192	ch = ((ch - `0xD800`) << `10`) + (ch2 - `0xDC00`) + `0x0010000`;
193	++source;
194	} else if (strict) { // it's an unpaired high surrogate
195	--source; // return to the illegal value itself
196	result = sourceIllegal;
197	break;
198	}
199	} else { // We don't have the 16 bits following the high surrogate.
200	--source; // return to the high surrogate
201	result = sourceExhausted;
202	break;
203	}
204	} else if (strict) {
205	// UTF-16 surrogate values are illegal in UTF-32
206	if (ch >= `0xDC00` && ch <= `0xDFFF`) {
207	--source; // return to the illegal value itself
208	result = sourceIllegal;
209	break;
210	}
211	}
212	// Figure out how many bytes the result will require
213	if (ch < (UChar32)`0x80`) {
214	bytesToWrite = `1`;
215	} else if (ch < (UChar32)`0x800`) {
216	bytesToWrite = `2`;
217	} else if (ch < (UChar32)`0x10000`) {
218	bytesToWrite = `3`;
219	} else if (ch < (UChar32)`0x110000`) {
220	bytesToWrite = `4`;
221	} else {
222	bytesToWrite = `3`;
223	ch = replacementCharacter;
224	}
225
226	target += bytesToWrite;
227	if (target > targetEnd) {
228	source = oldSource; // Back up source pointer!
229	target -= bytesToWrite;
230	result = targetExhausted;
231	break;
232	}
233	switch (bytesToWrite) { // note: everything falls through.
234	case `4`: --target = (char*)((ch \| byteMark) & byteMask); ch >>= `6`; FALLTHROUGH;
235	case `3`: --target = (char*)((ch \| byteMark) & byteMask); ch >>= `6`; FALLTHROUGH;
236	case `2`: --target = (char*)((ch \| byteMark) & byteMask); ch >>= `6`; FALLTHROUGH;
237	case `1`: --target = (char*)(ch \| firstByteMark[bytesToWrite]);
238	}
239	target += bytesToWrite;
240	}
241	*sourceStart = source;
242	*targetStart = target;
243	return result;
244	}
245
// Reports whether the `length` bytes starting at `source` form one well-formed
// UTF-8 sequence. This must be called with the length pre-determined by the
// first byte. Any length outside 1..4 returns false: the Unicode definition of
// UTF-8 goes up to 4-byte sequences.
//
// Checks performed:
//   - every byte after the first is a continuation byte (0x80..0xBF);
//   - the second byte is further restricted for lead bytes 0xE0 and 0xF0
//     (no overlong forms), 0xED (no surrogates) and 0xF4 (nothing above U+10FFFF);
//   - the lead byte is not a continuation byte, not 0xC0/0xC1, and not above 0xF4.
static bool isLegalUTF8(const unsigned char* source, int length)
{
    if (length < 1 || length > 4)
        return false;

    // Third and fourth bytes, when present, must be plain continuation bytes.
    for (int index = length - 1; index >= 2; --index) {
        const unsigned char trail = source[index];
        if (trail < 0x80 || trail > 0xBF)
            return false;
    }

    if (length >= 2) {
        // The lower bound applies to every lead byte, including 0xED and 0xF4,
        // whose special cases below only tighten the upper bound.
        const unsigned char second = source[1];
        if (second < 0x80 || second > 0xBF)
            return false;

        switch (source[0]) {
        case 0xE0:
            if (second < 0xA0)
                return false;
            break;
        case 0xED:
            if (second > 0x9F)
                return false;
            break;
        case 0xF0:
            if (second < 0x90)
                return false;
            break;
        case 0xF4:
            if (second > 0x8F)
                return false;
            break;
        default:
            break;
        }
    }

    // A continuation byte or an overlong two-byte lead (0xC0, 0xC1) cannot start a sequence.
    if (source[0] >= 0x80 && source[0] < 0xC2)
        return false;
    return source[0] <= 0xF4;
}
276
277	// Magic values subtracted from a buffer value during UTF8 conversion.
278	// This table contains as many values as there might be trailing bytes
279	// in a UTF-8 sequence.
280	static const UChar32 offsetsFromUTF8[`6`] = { `0x00000000UL`, `0x00003080UL`, `0x000E2080UL`, `0x03C82080UL`, static_cast<UChar32>(`0xFA082080UL`), static_cast<UChar32>(`0x82082080UL`) };
281
282	static inline UChar32 readUTF8Sequence(const char& sequence, unsigned* length)
283	{
284	UChar32 character = `0`;
285
286	// The cases all fall through.
287	switch (length) {
288	case `6`: character += static_cast<unsigned char>(*sequence++); character <<= `6`; FALLTHROUGH;
289	case `5`: character += static_cast<unsigned char>(*sequence++); character <<= `6`; FALLTHROUGH;
290	case `4`: character += static_cast<unsigned char>(*sequence++); character <<= `6`; FALLTHROUGH;
291	case `3`: character += static_cast<unsigned char>(*sequence++); character <<= `6`; FALLTHROUGH;
292	case `2`: character += static_cast<unsigned char>(*sequence++); character <<= `6`; FALLTHROUGH;
293	case `1`: character += static_cast<unsigned char>(*sequence++);
294	}
295
296	return character - offsetsFromUTF8[length - `1`];
297	}
298
299	ConversionResult convertUTF8ToUTF16(
300	const char** sourceStart, const char* sourceEnd,
301	UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
302	{
303	ConversionResult result = conversionOK;
304	const char* source = *sourceStart;
305	UChar* target = *targetStart;
306	UChar orAllData = `0`;
307	while (source < sourceEnd) {
308	int utf8SequenceLength = inlineUTF8SequenceLength(*source);
309	if (sourceEnd - source < utf8SequenceLength) {
310	result = sourceExhausted;
311	break;
312	}
313	// Do this check whether lenient or strict
314	if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
315	result = sourceIllegal;
316	break;
317	}
318
319	UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
320
321	if (target >= targetEnd) {
322	source -= utf8SequenceLength; // Back up source pointer!
323	result = targetExhausted;
324	break;
325	}
326
327	if (U_IS_BMP(character)) {
328	// UTF-16 surrogate values are illegal in UTF-32
329	if (U_IS_SURROGATE(character)) {
330	if (strict) {
331	source -= utf8SequenceLength; // return to the illegal value itself
332	result = sourceIllegal;
333	break;
334	} else {
335	*target++ = replacementCharacter;
336	orAllData \|= replacementCharacter;
337	}
338	} else {
339	target++ = character; // normal case*
340	orAllData \|= character;
341	}
342	} else if (U_IS_SUPPLEMENTARY(character)) {
343	// target is a character in range 0xFFFF - 0x10FFFF
344	if (target + `1` >= targetEnd) {
345	source -= utf8SequenceLength; // Back up source pointer!
346	result = targetExhausted;
347	break;
348	}
349	*target++ = U16_LEAD(character);
350	*target++ = U16_TRAIL(character);
351	orAllData = `0xffff`;
352	} else {
353	if (strict) {
354	source -= utf8SequenceLength; // return to the start
355	result = sourceIllegal;
356	break; // Bail out; shouldn't continue
357	} else {
358	*target++ = replacementCharacter;
359	orAllData \|= replacementCharacter;
360	}
361	}
362	}
363	*sourceStart = source;
364	*targetStart = target;
365
366	if (sourceAllASCII)
367	*sourceAllASCII = !(orAllData & ~`0x7f`);
368
369	return result;
370	}
371
372	unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
373	{
374	if (!data)
375	return `0`;
376
377	StringHasher stringHasher;
378	dataLength = `0`;
379	utf16Length = `0`;
380
381	while (data < dataEnd \|\| (!dataEnd && *data)) {
382	if (isASCII(*data)) {
383	stringHasher.addCharacter(*data++);
384	dataLength++;
385	utf16Length++;
386	continue;
387	}
388
389	int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
390	dataLength += utf8SequenceLength;
391
392	if (!dataEnd) {
393	for (int i = `1`; i < utf8SequenceLength; ++i) {
394	if (!data[i])
395	return `0`;
396	}
397	} else if (dataEnd - data < utf8SequenceLength)
398	return `0`;
399
400	if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
401	return `0`;
402
403	UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
404	ASSERT(!isASCII(character));
405
406	if (U_IS_BMP(character)) {
407	// UTF-16 surrogate values are illegal in UTF-32
408	if (U_IS_SURROGATE(character))
409	return `0`;
410	stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
411	utf16Length++;
412	} else if (U_IS_SUPPLEMENTARY(character)) {
413	stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
414	static_cast<UChar>(U16_TRAIL(character)));
415	utf16Length += `2`;
416	} else
417	return `0`;
418	}
419
420	return stringHasher.hashWithTop8BitsMasked();
421	}
422
423	bool equalUTF16WithUTF8(const UChar* a, const char* b, const char* bEnd)
424	{
425	while (b < bEnd) {
426	if (isASCII(a) \|\| isASCII(b)) {
427	if (a++ != b++)
428	return false;
429	continue;
430	}
431
432	int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
433
434	if (bEnd - b < utf8SequenceLength)
435	return false;
436
437	if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
438	return false;
439
440	UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
441	ASSERT(!isASCII(character));
442
443	if (U_IS_BMP(character)) {
444	// UTF-16 surrogate values are illegal in UTF-32
445	if (U_IS_SURROGATE(character))
446	return false;
447	if (*a++ != character)
448	return false;
449	} else if (U_IS_SUPPLEMENTARY(character)) {
450	if (*a++ != U16_LEAD(character))
451	return false;
452	if (*a++ != U16_TRAIL(character))
453	return false;
454	} else
455	return false;
456	}
457
458	return true;
459	}
460
461	bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
462	{
463	while (b < bEnd) {
464	if (isASCII(a) \|\| isASCII(b)) {
465	if (a++ != b++)
466	return false;
467	continue;
468	}
469
470	if (b + `1` == bEnd)
471	return false;
472
473	if ((b[`0`] & `0xE0`) != `0xC0` \|\| (b[`1`] & `0xC0`) != `0x80`)
474	return false;
475
476	LChar character = ((b[`0`] & `0x1F`) << `6`) \| (b[`1`] & `0x3F`);
477
478	b += `2`;
479
480	if (*a++ != character)
481	return false;
482	}
483
484	return true;
485	}
486
487	} // namespace Unicode
488	} // namespace WTF
489

Browse the source code of jsc/Source/WTF/wtf/unicode/UTF8Conversion.cpp