YarrCanonicalize.h source code [jsc/Source/JavaScriptCore/yarr/YarrCanonicalize.h]

1	/*
2	* Copyright (C) 2012-2016 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#pragma once
27
28	#include <stdint.h>
29	#include <unicode/utypes.h>
30
31	namespace JSC { namespace Yarr {
32
33	// This set of data provides information for each UCS2 code point as to the set of code points
34	// that it should match under the ES6 case insensitive RegExp matching rules, specified in 21.2.2.8.2.
35	// The non-Unicode tables are autogenerated using YarrCanonicalize.js into YarrCanonicalize.cpp.
36	// The Unicode tables are autogenerated using the python script generateYarrCanonicalizeUnicode
37	// which creates YarrCanonicalizeUnicode.cpp.
// Classifies a CanonicalizationRange: tells how the range's `value` field relates the
// code points in the range to their case-insensitive matches.
enum UCS2CanonicalizationType {
    // No canonically equal values, e.g. 0x0.
    CanonicalizeUnique,
    // Value indicates a set in characterSetInfo.
    CanonicalizeSet,
    // Value is positive delta to pair, e.g. 0x41 has value 0x20, -> 0x61.
    CanonicalizeRangeLo,
    // Value is positive delta to pair, e.g. 0x61 has value 0x20, -> 0x41.
    CanonicalizeRangeHi,
    // Aligned consecutive pair, e.g. 0x1f4,0x1f5.
    CanonicalizeAlternatingAligned,
    // Unaligned consecutive pair, e.g. 0x241,0x242.
    CanonicalizeAlternatingUnaligned,
};
46	struct CanonicalizationRange {
47	UChar32 begin;
48	UChar32 end;
49	UChar32 value;
50	UCS2CanonicalizationType type;
51	};
52
53	extern const size_t UCS2_CANONICALIZATION_RANGES;
54	extern const UChar32* const ucs2CharacterSetInfo[];
55	extern const CanonicalizationRange ucs2RangeInfo[];
56	extern const uint16_t canonicalTableLChar[`256`];
57
58	extern const size_t UNICODE_CANONICALIZATION_RANGES;
59	extern const UChar32* const unicodeCharacterSetInfo[];
60	extern const CanonicalizationRange unicodeRangeInfo[];
61
// Selects which set of canonicalization tables a lookup uses.
enum class CanonicalMode {
    // Use the ucs2* tables.
    UCS2,
    // Use the unicode* tables.
    Unicode,
};
63
64	inline const UChar32* canonicalCharacterSetInfo(unsigned index, CanonicalMode canonicalMode)
65	{
66	const UChar32* const* rangeInfo = canonicalMode == CanonicalMode::UCS2 ? ucs2CharacterSetInfo : unicodeCharacterSetInfo;
67	return rangeInfo[index];
68	}
69
70	// This searches in log2 time over ~400-600 entries, so should typically result in 9 compares.
71	inline const CanonicalizationRange* canonicalRangeInfoFor(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
72	{
73	const CanonicalizationRange* info = canonicalMode == CanonicalMode::UCS2 ? ucs2RangeInfo : unicodeRangeInfo;
74	size_t entries = canonicalMode == CanonicalMode::UCS2 ? UCS2_CANONICALIZATION_RANGES : UNICODE_CANONICALIZATION_RANGES;
75
76	while (true) {
77	size_t candidate = entries >> `1`;
78	const CanonicalizationRange* candidateInfo = info + candidate;
79	if (ch < candidateInfo->begin)
80	entries = candidate;
81	else if (ch <= candidateInfo->end)
82	return candidateInfo;
83	else {
84	info = candidateInfo + `1`;
85	entries -= (candidate + `1`);
86	}
87	}
88	}
89
90	// Should only be called for characters that have one canonically matching value.
91	inline UChar32 getCanonicalPair(const CanonicalizationRange* info, UChar32 ch)
92	{
93	ASSERT(ch >= info->begin && ch <= info->end);
94	switch (info->type) {
95	case CanonicalizeRangeLo:
96	return ch + info->value;
97	case CanonicalizeRangeHi:
98	return ch - info->value;
99	case CanonicalizeAlternatingAligned:
100	return ch ^ `1`;
101	case CanonicalizeAlternatingUnaligned:
102	return ((ch - `1`) ^ `1`) + `1`;
103	default:
104	RELEASE_ASSERT_NOT_REACHED();
105	}
106	RELEASE_ASSERT_NOT_REACHED();
107	return `0`;
108	}
109
110	// Returns true if no other UCS2 codepoint can match this value.
111	inline bool isCanonicallyUnique(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
112	{
113	return canonicalRangeInfoFor(ch, canonicalMode)->type == CanonicalizeUnique;
114	}
115
116	// Returns true if values are equal, under the canonicalization rules.
117	inline bool areCanonicallyEquivalent(UChar32 a, UChar32 b, CanonicalMode canonicalMode = CanonicalMode::UCS2)
118	{
119	const CanonicalizationRange* info = canonicalRangeInfoFor(a, canonicalMode);
120	switch (info->type) {
121	case CanonicalizeUnique:
122	return a == b;
123	case CanonicalizeSet: {
124	for (const UChar32* set = canonicalCharacterSetInfo(info->value, canonicalMode); (a = *set); ++set) {
125	if (a == b)
126	return true;
127	}
128	return false;
129	}
130	case CanonicalizeRangeLo:
131	return (a == b) \|\| (a + info->value == b);
132	case CanonicalizeRangeHi:
133	return (a == b) \|\| (a - info->value == b);
134	case CanonicalizeAlternatingAligned:
135	return (a \| `1`) == (b \| `1`);
136	case CanonicalizeAlternatingUnaligned:
137	return ((a - `1`) \| `1`) == ((b - `1`) \| `1`);
138	}
139
140	RELEASE_ASSERT_NOT_REACHED();
141	return false;
142	}
143
144	} } // JSC::Yarr
145

Browse the source code of jsc/Source/JavaScriptCore/yarr/YarrCanonicalize.h