URLHelpers.cpp source code [jsc/Source/WTF/wtf/URLHelpers.cpp]

1	/*
2	* Copyright (C) 2005-2019 Apple Inc. All rights reserved.
3	* Copyright (C) 2018 Igalia S.L.
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	*
9	* 1. Redistributions of source code must retain the above copyright
10	* notice, this list of conditions and the following disclaimer.
11	* 2. Redistributions in binary form must reproduce the above copyright
12	* notice, this list of conditions and the following disclaimer in the
13	* documentation and/or other materials provided with the distribution.
14	* 3. Neither the name of Apple Inc. ("Apple") nor the names of
15	* its contributors may be used to endorse or promote products derived
16	* from this software without specific prior written permission.
17	*
18	* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20	* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21	* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22	* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23	* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27	* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28	*/
29
30	#include "config.h"
31	#include "URLHelpers.h"
32
33	#include "URLParser.h"
34	#include <mutex>
35	#include <unicode/uidna.h>
36	#include <unicode/uscript.h>
37	#include <wtf/Optional.h>
38	#include <wtf/text/WTFString.h>
39
40	namespace WTF {
41	namespace URLHelpers {
42
43	// Needs to be big enough to hold an IDN-encoded name.
44	// For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
45	const unsigned hostNameBufferLength = `2048`;
46	const unsigned urlBytesBufferLength = `2048`;
47
48	static uint32_t IDNScriptWhiteList[(USCRIPT_CODE_LIMIT + `31`) / `32`];
49
50	#if !PLATFORM(COCOA)
51
52	// Cocoa has an implementation that uses a whitelist in /Library or ~/Library,
53	// if it exists.
54	void loadIDNScriptWhiteList()
55	{
56	static std::once_flag flag;
57	std::call_once(flag, initializeDefaultIDNScriptWhiteList);
58	}
59
60	#endif // !PLATFORM(COCOA)
61
62	static bool isArmenianLookalikeCharacter(UChar32 codePoint)
63	{
64	return codePoint == `0x0548` \|\| codePoint == `0x054D` \|\| codePoint == `0x0578` \|\| codePoint == `0x057D`;
65	}
66
67	static bool isArmenianScriptCharacter(UChar32 codePoint)
68	{
69	UErrorCode error = U_ZERO_ERROR;
70	UScriptCode script = uscript_getScript(codePoint, &error);
71	if (error != U_ZERO_ERROR) {
72	LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
73	return false;
74	}
75
76	return script == USCRIPT_ARMENIAN;
77	}
78
// Returns true when charCode is an ASCII digit or punctuation character
// that is not one of the delimiters the URL parser rejects inside a host.
template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
{
    if (!isASCIIDigitOrPunctuation(charCode))
        return false;

    // Things the URL Parser rejects:
    const bool rejectedByURLParser = charCode == '#'
        || charCode == '%'
        || charCode == '/'
        || charCode == ':'
        || charCode == '?'
        || charCode == '@'
        || charCode == '['
        || charCode == '\\'
        || charCode == ']';
    return !rejectedByURLParser;
}
100
101	static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode)
102	{
103	// This function treats the following as unsafe, lookalike characters:
104	// any non-printable character, any character considered as whitespace,
105	// any ignorable character, and emoji characters related to locks.
106
107	// We also considered the characters in Mozilla's blacklist <http://kb.mozillazine.org/Network.IDN.blacklist_chars>.
108
109	// Some of the characters here will never appear once ICU has encoded.
110	// For example, ICU transforms most spaces into an ASCII space and most
111	// slashes into an ASCII solidus. But one of the two callers uses this
112	// on characters that have not been processed by ICU, so they are needed here.
113
114	if (!u_isprint(charCode) \|\| u_isUWhiteSpace(charCode) \|\| u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
115	return true;
116
117	switch (charCode) {
118	case `0x00BC`: / VULGAR FRACTION ONE QUARTER /
119	case `0x00BD`: / VULGAR FRACTION ONE HALF /
120	case `0x00BE`: / VULGAR FRACTION THREE QUARTERS /
121	case `0x00ED`: / LATIN SMALL LETTER I WITH ACUTE /
122	/ 0x0131 LATIN SMALL LETTER DOTLESS I is intentionally not considered a lookalike character because it is visually distinguishable from i and it has legitimate use in the Turkish language. /
123	case `0x01C3`: / LATIN LETTER RETROFLEX CLICK /
124	case `0x0251`: / LATIN SMALL LETTER ALPHA /
125	case `0x0261`: / LATIN SMALL LETTER SCRIPT G /
126	case `0x027E`: / LATIN SMALL LETTER R WITH FISHHOOK /
127	case `0x02D0`: / MODIFIER LETTER TRIANGULAR COLON /
128	case `0x0335`: / COMBINING SHORT STROKE OVERLAY /
129	case `0x0337`: / COMBINING SHORT SOLIDUS OVERLAY /
130	case `0x0338`: / COMBINING LONG SOLIDUS OVERLAY /
131	case `0x0589`: / ARMENIAN FULL STOP /
132	case `0x05B4`: / HEBREW POINT HIRIQ /
133	case `0x05BC`: / HEBREW POINT DAGESH OR MAPIQ /
134	case `0x05C3`: / HEBREW PUNCTUATION SOF PASUQ /
135	case `0x05F4`: / HEBREW PUNCTUATION GERSHAYIM /
136	case `0x0609`: / ARABIC-INDIC PER MILLE SIGN /
137	case `0x060A`: / ARABIC-INDIC PER TEN THOUSAND SIGN /
138	case `0x0650`: / ARABIC KASRA /
139	case `0x0660`: / ARABIC INDIC DIGIT ZERO /
140	case `0x066A`: / ARABIC PERCENT SIGN /
141	case `0x06D4`: / ARABIC FULL STOP /
142	case `0x06F0`: / EXTENDED ARABIC INDIC DIGIT ZERO /
143	case `0x0701`: / SYRIAC SUPRALINEAR FULL STOP /
144	case `0x0702`: / SYRIAC SUBLINEAR FULL STOP /
145	case `0x0703`: / SYRIAC SUPRALINEAR COLON /
146	case `0x0704`: / SYRIAC SUBLINEAR COLON /
147	case `0x1735`: / PHILIPPINE SINGLE PUNCTUATION /
148	case `0x1D04`: / LATIN LETTER SMALL CAPITAL C /
149	case `0x1D0F`: / LATIN LETTER SMALL CAPITAL O /
150	case `0x1D1C`: / LATIN LETTER SMALL CAPITAL U /
151	case `0x1D20`: / LATIN LETTER SMALL CAPITAL V /
152	case `0x1D21`: / LATIN LETTER SMALL CAPITAL W /
153	case `0x1D22`: / LATIN LETTER SMALL CAPITAL Z /
154	case `0x1ECD`: / LATIN SMALL LETTER O WITH DOT BELOW /
155	case `0x2010`: / HYPHEN /
156	case `0x2011`: / NON-BREAKING HYPHEN /
157	case `0x2024`: / ONE DOT LEADER /
158	case `0x2027`: / HYPHENATION POINT /
159	case `0x2039`: / SINGLE LEFT-POINTING ANGLE QUOTATION MARK /
160	case `0x203A`: / SINGLE RIGHT-POINTING ANGLE QUOTATION MARK /
161	case `0x2041`: / CARET INSERTION POINT /
162	case `0x2044`: / FRACTION SLASH /
163	case `0x2052`: / COMMERCIAL MINUS SIGN /
164	case `0x2153`: / VULGAR FRACTION ONE THIRD /
165	case `0x2154`: / VULGAR FRACTION TWO THIRDS /
166	case `0x2155`: / VULGAR FRACTION ONE FIFTH /
167	case `0x2156`: / VULGAR FRACTION TWO FIFTHS /
168	case `0x2157`: / VULGAR FRACTION THREE FIFTHS /
169	case `0x2158`: / VULGAR FRACTION FOUR FIFTHS /
170	case `0x2159`: / VULGAR FRACTION ONE SIXTH /
171	case `0x215A`: / VULGAR FRACTION FIVE SIXTHS /
172	case `0x215B`: / VULGAR FRACTION ONE EIGHT /
173	case `0x215C`: / VULGAR FRACTION THREE EIGHTHS /
174	case `0x215D`: / VULGAR FRACTION FIVE EIGHTHS /
175	case `0x215E`: / VULGAR FRACTION SEVEN EIGHTHS /
176	case `0x215F`: / FRACTION NUMERATOR ONE /
177	case `0x2212`: / MINUS SIGN /
178	case `0x2215`: / DIVISION SLASH /
179	case `0x2216`: / SET MINUS /
180	case `0x2236`: / RATIO /
181	case `0x233F`: / APL FUNCTIONAL SYMBOL SLASH BAR /
182	case `0x23AE`: / INTEGRAL EXTENSION /
183	case `0x244A`: / OCR DOUBLE BACKSLASH /
184	case `0x2571`: / BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT /
185	case `0x2572`: / BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT /
186	case `0x29F6`: / SOLIDUS WITH OVERBAR /
187	case `0x29F8`: / BIG SOLIDUS /
188	case `0x2AFB`: / TRIPLE SOLIDUS BINARY RELATION /
189	case `0x2AFD`: / DOUBLE SOLIDUS OPERATOR /
190	case `0x2FF0`: / IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT /
191	case `0x2FF1`: / IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW /
192	case `0x2FF2`: / IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT /
193	case `0x2FF3`: / IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW /
194	case `0x2FF4`: / IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND /
195	case `0x2FF5`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE /
196	case `0x2FF6`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW /
197	case `0x2FF7`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT /
198	case `0x2FF8`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT /
199	case `0x2FF9`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT /
200	case `0x2FFA`: / IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT /
201	case `0x2FFB`: / IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID /
202	case `0x3002`: / IDEOGRAPHIC FULL STOP /
203	case `0x3008`: / LEFT ANGLE BRACKET /
204	case `0x3014`: / LEFT TORTOISE SHELL BRACKET /
205	case `0x3015`: / RIGHT TORTOISE SHELL BRACKET /
206	case `0x3033`: / VERTICAL KANA REPEAT MARK UPPER HALF /
207	case `0x3035`: / VERTICAL KANA REPEAT MARK LOWER HALF /
208	case `0x321D`: / PARENTHESIZED KOREAN CHARACTER OJEON /
209	case `0x321E`: / PARENTHESIZED KOREAN CHARACTER O HU /
210	case `0x33AE`: / SQUARE RAD OVER S /
211	case `0x33AF`: / SQUARE RAD OVER S SQUARED /
212	case `0x33C6`: / SQUARE C OVER KG /
213	case `0x33DF`: / SQUARE A OVER M /
214	case `0x05B9`: / HEBREW POINT HOLAM /
215	case `0x05BA`: / HEBREW POINT HOLAM HASER FOR VAV /
216	case `0x05C1`: / HEBREW POINT SHIN DOT /
217	case `0x05C2`: / HEBREW POINT SIN DOT /
218	case `0x05C4`: / HEBREW MARK UPPER DOT /
219	case `0xA731`: / LATIN LETTER SMALL CAPITAL S /
220	case `0xA771`: / LATIN SMALL LETTER DUM /
221	case `0xA789`: / MODIFIER LETTER COLON /
222	case `0xFE14`: / PRESENTATION FORM FOR VERTICAL SEMICOLON /
223	case `0xFE15`: / PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK /
224	case `0xFE3F`: / PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET /
225	case `0xFE5D`: / SMALL LEFT TORTOISE SHELL BRACKET /
226	case `0xFE5E`: / SMALL RIGHT TORTOISE SHELL BRACKET /
227	case `0xFF0E`: / FULLWIDTH FULL STOP /
228	case `0xFF0F`: / FULL WIDTH SOLIDUS /
229	case `0xFF61`: / HALFWIDTH IDEOGRAPHIC FULL STOP /
230	case `0xFFFC`: / OBJECT REPLACEMENT CHARACTER /
231	case `0xFFFD`: / REPLACEMENT CHARACTER /
232	case `0x1F50F`: / LOCK WITH INK PEN /
233	case `0x1F510`: / CLOSED LOCK WITH KEY /
234	case `0x1F511`: / KEY /
235	case `0x1F512`: / LOCK /
236	case `0x1F513`: / OPEN LOCK /
237	return true;
238	case `0x0307`: / COMBINING DOT ABOVE /
239	return previousCodePoint == `0x0237` / LATIN SMALL LETTER DOTLESS J /
240	\|\| previousCodePoint == `0x0131` / LATIN SMALL LETTER DOTLESS I /
241	\|\| previousCodePoint == `0x05D5`; / HEBREW LETTER VAV /
242	case `0x0548`: / ARMENIAN CAPITAL LETTER VO /
243	case `0x054D`: / ARMENIAN CAPITAL LETTER SEH /
244	case `0x0578`: / ARMENIAN SMALL LETTER VO /
245	case `0x057D`: / ARMENIAN SMALL LETTER SEH /
246	return previousCodePoint
247	&& !isASCIIDigitOrValidHostCharacter(previousCodePoint.value())
248	&& !isArmenianScriptCharacter(previousCodePoint.value());
249	case `'.'`:
250	return false;
251	default:
252	return previousCodePoint
253	&& isArmenianLookalikeCharacter(previousCodePoint.value())
254	&& !(isArmenianScriptCharacter(charCode) \|\| isASCIIDigitOrValidHostCharacter(charCode));
255	}
256	}
257
258	void whiteListIDNScript(const char* scriptName)
259	{
260	int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName);
261	if (script >= `0` && script < USCRIPT_CODE_LIMIT) {
262	size_t index = script / `32`;
263	uint32_t mask = `1` << (script % `32`);
264	IDNScriptWhiteList[index] \|= mask;
265	}
266	}
267
268	void initializeDefaultIDNScriptWhiteList()
269	{
270	const char* defaultIDNScriptWhiteList[`20`] = {
271	"Common",
272	"Inherited",
273	"Arabic",
274	"Armenian",
275	"Bopomofo",
276	"Canadian_Aboriginal",
277	"Devanagari",
278	"Deseret",
279	"Gujarati",
280	"Gurmukhi",
281	"Hangul",
282	"Han",
283	"Hebrew",
284	"Hiragana",
285	"Katakana_Or_Hiragana",
286	"Katakana",
287	"Latin",
288	"Tamil",
289	"Thai",
290	"Yi",
291	};
292	for (const char* scriptName : defaultIDNScriptWhiteList)
293	whiteListIDNScript(scriptName);
294	}
295
296	static bool allCharactersInIDNScriptWhiteList(const UChar* buffer, int32_t length)
297	{
298	loadIDNScriptWhiteList();
299	int32_t i = `0`;
300	Optional<UChar32> previousCodePoint;
301	while (i < length) {
302	UChar32 c;
303	U16_NEXT(buffer, i, length, c)
304	UErrorCode error = U_ZERO_ERROR;
305	UScriptCode script = uscript_getScript(c, &error);
306	if (error != U_ZERO_ERROR) {
307	LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
308	return false;
309	}
310	if (script < `0`) {
311	LOG_ERROR("got negative number for script code from ICU: %d", script);
312	return false;
313	}
314	if (script >= USCRIPT_CODE_LIMIT)
315	return false;
316
317	size_t index = script / `32`;
318	uint32_t mask = `1` << (script % `32`);
319	if (!(IDNScriptWhiteList[index] & mask))
320	return false;
321
322	if (isLookalikeCharacter(previousCodePoint, c))
323	return false;
324	previousCodePoint = c;
325	}
326	return true;
327	}
328
329	static bool isSecondLevelDomainNameAllowedByTLDRules(const UChar* buffer, int32_t length, const WTF::Function<bool(UChar)>& characterIsAllowed)
330	{
331	ASSERT(length > `0`);
332
333	for (int32_t i = length - `1`; i >= `0`; --i) {
334	UChar ch = buffer[i];
335
336	if (characterIsAllowed (ch))
337	continue;
338
339	// Only check the second level domain. Lower level registrars may have different rules.
340	if (ch == `'.'`)
341	break;
342
343	return false;
344	}
345	return true;
346	}
347
// If the text in the enclosing function's `buffer`/`length` ends with the UChar array `suffix`
// (and is longer than it), return from the enclosing function with the result of checking
// the preceding label against `function`. Otherwise fall through to the next statement.
#define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, function) \
    do { \
        static const int32_t suffixLength = sizeof(suffix) / sizeof(suffix[0]); \
        if (length > suffixLength && !memcmp(buffer + length - suffixLength, suffix, sizeof(suffix))) \
            return isSecondLevelDomainNameAllowedByTLDRules(buffer, length - suffixLength, function); \
    } while (false)
354
355	static bool isRussianDomainNameCharacter(UChar ch)
356	{
357	// Only modern Russian letters, digits and dashes are allowed.
358	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
359	}
360
361	static bool allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
362	{
363	// Skip trailing dot for root domain.
364	if (buffer[length - `1`] == `'.'`)
365	length--;
366
367	// http://cctld.ru/files/pdf/docs/rules_ru-rf.pdf
368	static const UChar cyrillicRF[] = {
369	`'.'`,
370	`0x0440`, // CYRILLIC SMALL LETTER ER
371	`0x0444`, // CYRILLIC SMALL LETTER EF
372	};
373	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRF, isRussianDomainNameCharacter);
374
375	// http://rusnames.ru/rules.pl
376	static const UChar cyrillicRUS[] = {
377	`'.'`,
378	`0x0440`, // CYRILLIC SMALL LETTER ER
379	`0x0443`, // CYRILLIC SMALL LETTER U
380	`0x0441`, // CYRILLIC SMALL LETTER ES
381	};
382	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRUS, isRussianDomainNameCharacter);
383
384	// http://ru.faitid.org/projects/moscow/documents/moskva/idn
385	static const UChar cyrillicMOSKVA[] = {
386	`'.'`,
387	`0x043C`, // CYRILLIC SMALL LETTER EM
388	`0x043E`, // CYRILLIC SMALL LETTER O
389	`0x0441`, // CYRILLIC SMALL LETTER ES
390	`0x043A`, // CYRILLIC SMALL LETTER KA
391	`0x0432`, // CYRILLIC SMALL LETTER VE
392	`0x0430`, // CYRILLIC SMALL LETTER A
393	};
394	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMOSKVA, isRussianDomainNameCharacter);
395
396	// http://www.dotdeti.ru/foruser/docs/regrules.php
397	static const UChar cyrillicDETI[] = {
398	`'.'`,
399	`0x0434`, // CYRILLIC SMALL LETTER DE
400	`0x0435`, // CYRILLIC SMALL LETTER IE
401	`0x0442`, // CYRILLIC SMALL LETTER TE
402	`0x0438`, // CYRILLIC SMALL LETTER I
403	};
404	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicDETI, isRussianDomainNameCharacter);
405
406	// http://corenic.org - rules not published. The word is Russian, so only allowing Russian at this time,
407	// although we may need to revise the checks if this ends up being used with other languages spoken in Russia.
408	static const UChar cyrillicONLAYN[] = {
409	`'.'`,
410	`0x043E`, // CYRILLIC SMALL LETTER O
411	`0x043D`, // CYRILLIC SMALL LETTER EN
412	`0x043B`, // CYRILLIC SMALL LETTER EL
413	`0x0430`, // CYRILLIC SMALL LETTER A
414	`0x0439`, // CYRILLIC SMALL LETTER SHORT I
415	`0x043D`, // CYRILLIC SMALL LETTER EN
416	};
417	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicONLAYN, isRussianDomainNameCharacter);
418
419	// http://corenic.org - same as above.
420	static const UChar cyrillicSAYT[] = {
421	`'.'`,
422	`0x0441`, // CYRILLIC SMALL LETTER ES
423	`0x0430`, // CYRILLIC SMALL LETTER A
424	`0x0439`, // CYRILLIC SMALL LETTER SHORT I
425	`0x0442`, // CYRILLIC SMALL LETTER TE
426	};
427	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSAYT, isRussianDomainNameCharacter);
428
429	// http://pir.org/products/opr-domain/ - rules not published. According to the registry site,
430	// the intended audience is "Russian and other Slavic-speaking markets".
431	// Chrome appears to only allow Russian, so sticking with that for now.
432	static const UChar cyrillicORG[] = {
433	`'.'`,
434	`0x043E`, // CYRILLIC SMALL LETTER O
435	`0x0440`, // CYRILLIC SMALL LETTER ER
436	`0x0433`, // CYRILLIC SMALL LETTER GHE
437	};
438	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicORG, isRussianDomainNameCharacter);
439
440	// http://cctld.by/rules.html
441	static const UChar cyrillicBEL[] = {
442	`'.'`,
443	`0x0431`, // CYRILLIC SMALL LETTER BE
444	`0x0435`, // CYRILLIC SMALL LETTER IE
445	`0x043B`, // CYRILLIC SMALL LETTER EL
446	};
447	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBEL, [](UChar ch) {
448	// Russian and Byelorussian letters, digits and dashes are allowed.
449	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x0456` \|\| ch == `0x045E` \|\| ch == `0x2019` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
450	});
451
452	// http://www.nic.kz/docs/poryadok_vnedreniya_kaz_ru.pdf
453	static const UChar cyrillicKAZ[] = {
454	`'.'`,
455	`0x049B`, // CYRILLIC SMALL LETTER KA WITH DESCENDER
456	`0x0430`, // CYRILLIC SMALL LETTER A
457	`0x0437`, // CYRILLIC SMALL LETTER ZE
458	};
459	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicKAZ, [](UChar ch) {
460	// Kazakh letters, digits and dashes are allowed.
461	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x04D9` \|\| ch == `0x0493` \|\| ch == `0x049B` \|\| ch == `0x04A3` \|\| ch == `0x04E9` \|\| ch == `0x04B1` \|\| ch == `0x04AF` \|\| ch == `0x04BB` \|\| ch == `0x0456` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
462	});
463
464	// http://uanic.net/docs/documents-ukr/Rules%20of%20UKR_v4.0.pdf
465	static const UChar cyrillicUKR[] = {
466	`'.'`,
467	`0x0443`, // CYRILLIC SMALL LETTER U
468	`0x043A`, // CYRILLIC SMALL LETTER KA
469	`0x0440`, // CYRILLIC SMALL LETTER ER
470	};
471	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicUKR, [](UChar ch) {
472	// Russian and Ukrainian letters, digits and dashes are allowed.
473	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x0491` \|\| ch == `0x0404` \|\| ch == `0x0456` \|\| ch == `0x0457` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
474	});
475
476	// http://www.rnids.rs/data/DOKUMENTI/idn-srb-policy-termsofuse-v1.4-eng.pdf
477	static const UChar cyrillicSRB[] = {
478	`'.'`,
479	`0x0441`, // CYRILLIC SMALL LETTER ES
480	`0x0440`, // CYRILLIC SMALL LETTER ER
481	`0x0431`, // CYRILLIC SMALL LETTER BE
482	};
483	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSRB, [](UChar ch) {
484	// Serbian letters, digits and dashes are allowed.
485	return (ch >= `0x0430` && ch <= `0x0438`) \|\| (ch >= `0x043A` && ch <= `0x0448`) \|\| ch == `0x0452` \|\| ch == `0x0458` \|\| ch == `0x0459` \|\| ch == `0x045A` \|\| ch == `0x045B` \|\| ch == `0x045F` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
486	});
487
488	// http://marnet.mk/doc/pravilnik-mk-mkd.pdf
489	static const UChar cyrillicMKD[] = {
490	`'.'`,
491	`0x043C`, // CYRILLIC SMALL LETTER EM
492	`0x043A`, // CYRILLIC SMALL LETTER KA
493	`0x0434`, // CYRILLIC SMALL LETTER DE
494	};
495	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMKD, [](UChar ch) {
496	// Macedonian letters, digits and dashes are allowed.
497	return (ch >= `0x0430` && ch <= `0x0438`) \|\| (ch >= `0x043A` && ch <= `0x0448`) \|\| ch == `0x0453` \|\| ch == `0x0455` \|\| ch == `0x0458` \|\| ch == `0x0459` \|\| ch == `0x045A` \|\| ch == `0x045C` \|\| ch == `0x045F` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
498	});
499
500	// https://www.mon.mn/cs/
501	static const UChar cyrillicMON[] = {
502	`'.'`,
503	`0x043C`, // CYRILLIC SMALL LETTER EM
504	`0x043E`, // CYRILLIC SMALL LETTER O
505	`0x043D`, // CYRILLIC SMALL LETTER EN
506	};
507	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMON, [](UChar ch) {
508	// Mongolian letters, digits and dashes are allowed.
509	return (ch >= `0x0430` && ch <= `0x044f`) \|\| ch == `0x0451` \|\| ch == `0x04E9` \|\| ch == `0x04AF` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
510	});
511
512	// https://www.icann.org/sites/default/files/packages/lgr/lgr-second-level-bulgarian-30aug16-en.html
513	static const UChar cyrillicBG[] = {
514	`'.'`,
515	`0x0431`, // CYRILLIC SMALL LETTER BE
516	`0x0433` // CYRILLIC SMALL LETTER GHE
517	};
518	CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBG, [](UChar ch) {
519	return (ch >= `0x0430` && ch <= `0x044A`) \|\| ch == `0x044C` \|\| (ch >= `0x044E` && ch <= `0x0450`) \|\| ch == `0x045D` \|\| isASCIIDigit(ch) \|\| ch == `'-'`;
520	});
521
522	// Not a known top level domain with special rules.
523	return false;
524	}
525
526	// Return value of null means no mapping is necessary.
527	Optional<String> mapHostName(const String& hostName, const Optional<URLDecodeFunction>& decodeFunction)
528	{
529	if (hostName.length() > hostNameBufferLength)
530	return String ();
531
532	if (!hostName.length())
533	return String ();
534
535	String string;
536	if (decodeFunction && string.contains(`'%'`))
537	string = (*decodeFunction)(hostName);
538	else
539	string = hostName;
540
541	unsigned length = string.length();
542
543	auto sourceBuffer = string.charactersWithNullTermination();
544
545	UChar destinationBuffer[hostNameBufferLength];
546	UErrorCode uerror = U_ZERO_ERROR;
547	UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
548	int32_t numCharactersConverted = (decodeFunction ? uidna_nameToASCII : uidna_nameToUnicode)(&URLParser::internationalDomainNameTranscoder(), sourceBuffer.data(), length, destinationBuffer, hostNameBufferLength, &processingDetails, &uerror);
549	if (length && (U_FAILURE(uerror) \|\| processingDetails.errors))
550	return nullopt;
551
552	if (numCharactersConverted == static_cast<int32_t>(length) && !memcmp(sourceBuffer.data(), destinationBuffer, length * sizeof(UChar)))
553	return String ();
554
555	if (!decodeFunction && !allCharactersInIDNScriptWhiteList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted))
556	return String ();
557
558	return String (destinationBuffer, numCharactersConverted);
559	}
560
561	using MappingRangesVector = Optional<Vector<std::tuple<unsigned, unsigned, String>>>;
562
563	static void collectRangesThatNeedMapping(const String& string, unsigned location, unsigned length, MappingRangesVector& array, const Optional<URLDecodeFunction>& decodeFunction)
564	{
565	// Generally, we want to optimize for the case where there is one host name that does not need mapping.
566	// Therefore, we use null to indicate no mapping here and an empty array to indicate error.
567
568	String substring = string.substringSharingImpl(location, length);
569	Optional<String> host = mapHostName(substring, decodeFunction);
570
571	if (host && !*host)
572	return;
573
574	if (!array)
575	array = Vector<std::tuple<unsigned, unsigned, String>>();
576
577	if (host)
578	array ->constructAndAppend(location, length, *host);
579	}
580
581	static void applyHostNameFunctionToMailToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
582	{
583	// In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' character.
584	// Skip quoted strings so that characters in them don't confuse us.
585	// When we find a '?' character, we are past the part of the URL that contains host names.
586
587	unsigned stringLength = string.length();
588	unsigned current = `0`;
589
590	while (`1`) {
591	// Find start of host name or of quoted string.
592	auto hostNameOrStringStart = string.find([](UChar ch) {
593	return ch == `'"'` \|\| ch == `'@'` \|\| ch == `'?'`;
594	}, current);
595	if (hostNameOrStringStart == notFound)
596	return;
597
598	UChar c = string [hostNameOrStringStart];
599	current = hostNameOrStringStart + `1`;
600
601	if (c == `'?'`)
602	return;
603
604	if (c == `'@'`) {
605	// Find end of host name.
606	unsigned hostNameStart = current;
607	auto hostNameEnd = string.find([](UChar ch) {
608	return ch == `'>'` \|\| ch == `','` \|\| ch == `'?'`;
609	}, current);
610
611	bool done;
612	if (hostNameEnd == notFound) {
613	hostNameEnd = stringLength;
614	done = true;
615	} else {
616	current = hostNameEnd;
617	done = false;
618	}
619
620	// Process host name range.
621	collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
622
623	if (done)
624	return;
625	} else {
626	// Skip quoted string.
627	ASSERT(c == `'"'`);
628	while (`1`) {
629	auto escapedCharacterOrStringEnd = string.find([](UChar ch) {
630	return ch == `'"'` \|\| ch == `'\\'`;
631	}, current);
632	if (escapedCharacterOrStringEnd == notFound)
633	return;
634
635	c = string [escapedCharacterOrStringEnd];
636	current = escapedCharacterOrStringEnd + `1`;
637
638	// If we are the end of the string, then break from the string loop back to the host name loop.
639	if (c == `'"'`)
640	break;
641
642	// Skip escaped character.
643	ASSERT(c == `'\\'`);
644	if (current == stringLength)
645	return;
646
647	++current;
648	}
649	}
650	}
651	}
652
653	static void applyHostNameFunctionToURLString(const String& string, const Optional<URLDecodeFunction>& decodeFunction, MappingRangesVector& array)
654	{
655	// Find hostnames. Too bad we can't use any real URL-parsing code to do this,
656	// but we have to do it before doing all the %-escaping, and this is the only
657	// code we have that parses mailto URLs anyway.
658
659	// Maybe we should implement this using a character buffer instead?
660
661	if (protocolIs(string, "mailto")) {
662	applyHostNameFunctionToMailToURLString(string, decodeFunction, array);
663	return;
664	}
665
666	// Find the host name in a hierarchical URL.
667	// It comes after a "://" sequence, with scheme characters preceding.
668	// If ends with the end of the string or a ":", "/", or a "?".
669	// If there is a "@" character, the host part is just the part after the "@".
670	static const char* separator = "://";
671	auto separatorIndex = string.find(separator);
672	if (separatorIndex == notFound)
673	return;
674
675	unsigned authorityStart = separatorIndex + strlen(separator);
676
677	// Check that all characters before the :// are valid scheme characters.
678	auto invalidSchemeCharacter = string.substringSharingImpl(`0`, separatorIndex).find([](UChar ch) {
679	static const char* allowedCharacters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-.";
680	static size_t length = strlen(allowedCharacters);
681	for (size_t i = `0`; i < length; ++i) {
682	if (allowedCharacters[i] == ch)
683	return false;
684	}
685	return true;
686	});
687
688	if (invalidSchemeCharacter != notFound)
689	return;
690
691	unsigned stringLength = string.length();
692
693	// Find terminating character.
694	auto hostNameTerminator = string.find([](UChar ch) {
695	static const char* terminatingCharacters = ":/?#";
696	static size_t length = strlen(terminatingCharacters);
697	for (size_t i = `0`; i < length; ++i) {
698	if (terminatingCharacters[i] == ch)
699	return true;
700	}
701	return false;
702	}, authorityStart);
703	unsigned hostNameEnd = hostNameTerminator == notFound ? stringLength : hostNameTerminator;
704
705	// Find "@" for the start of the host name.
706	auto userInfoTerminator = string.substringSharingImpl(`0`, hostNameEnd).find(`'@'`, authorityStart);
707	unsigned hostNameStart = userInfoTerminator == notFound ? authorityStart : userInfoTerminator + `1`;
708
709	collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
710	}
711
712	String mapHostNames(const String& string, const Optional<URLDecodeFunction>& decodeFunction)
713	{
714	// Generally, we want to optimize for the case where there is one host name that does not need mapping.
715
716	if (decodeFunction && string.isAllASCII())
717	return string;
718
719	// Make a list of ranges that actually need mapping.
720	MappingRangesVector hostNameRanges;
721	applyHostNameFunctionToURLString(string, decodeFunction, hostNameRanges);
722	if (!hostNameRanges)
723	return string;
724
725	if (hostNameRanges ->isEmpty())
726	return { };
727
728	// Do the mapping.
729	String result = string;
730	while (!hostNameRanges ->isEmpty()) {
731	unsigned location, length;
732	String mappedHostName;
733	std::tie(location, length, mappedHostName) = hostNameRanges ->takeLast();
734	result = result.replace(location, length, mappedHostName);
735	}
736	return result;
737	}
738
739	static String escapeUnsafeCharacters(const String& sourceBuffer)
740	{
741	unsigned length = sourceBuffer.length();
742
743	Optional<UChar32> previousCodePoint;
744
745	unsigned i;
746	for (i = `0`; i < length; ) {
747	UChar32 c = sourceBuffer.characterStartingAt(i);
748	if (isLookalikeCharacter(previousCodePoint, sourceBuffer.characterStartingAt(i)))
749	break;
750	previousCodePoint = c;
751	i += U16_LENGTH(c);
752	}
753
754	if (i == length)
755	return sourceBuffer;
756
757	Vector<UChar, urlBytesBufferLength> outBuffer;
758
759	outBuffer.grow(i);
760	if (sourceBuffer.is8Bit())
761	StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters8(), i);
762	else
763	StringImpl::copyCharacters(outBuffer.data(), sourceBuffer.characters16(), i);
764
765	for (; i < length; ) {
766	UChar32 c = sourceBuffer.characterStartingAt(i);
767	unsigned characterLength = U16_LENGTH(c);
768	if (isLookalikeCharacter(previousCodePoint, c)) {
769	uint8_t utf8Buffer[`4`];
770	size_t offset = `0`;
771	UBool failure = false;
772	U8_APPEND(utf8Buffer, offset, `4`, c, failure)
773	ASSERT(!failure);
774
775	for (size_t j = `0`; j < offset; ++j) {
776	outBuffer.append(`'%'`);
777	outBuffer.append(upperNibbleToASCIIHexDigit(utf8Buffer[j]));
778	outBuffer.append(lowerNibbleToASCIIHexDigit(utf8Buffer[j]));
779	}
780	} else {
781	for (unsigned j = `0`; j < characterLength; ++j)
782	outBuffer.append(sourceBuffer [i + j]);
783	}
784	previousCodePoint = c;
785	i += characterLength;
786	}
787
788	return String::adopt(WTFMove(outBuffer));
789	}
790
791	String userVisibleURL(const CString& url)
792	{
793	auto* before = reinterpret_cast<const unsigned char*>(url.data());
794	int length = url.length();
795
796	if (!length)
797	return { };
798
799	bool mayNeedHostNameDecoding = false;
800
801	Checked<int, RecordOverflow> bufferLength = length;
802	bufferLength = bufferLength * `3` + `1`; // The buffer should be large enough to %-escape every character.
803	if (bufferLength.hasOverflowed())
804	return { };
805	Vector<char, urlBytesBufferLength> after(bufferLength.unsafeGet());
806
807	char* q = after.data();
808	{
809	const unsigned char* p = before;
810	for (int i = `0`; i < length; i++) {
811	unsigned char c = p[i];
812	// unescape escape sequences that indicate bytes greater than 0x7f
813	if (c == `'%'` && i + `2` < length && isASCIIHexDigit(p[i + `1`]) && isASCIIHexDigit(p[i + `2`])) {
814	auto u = toASCIIHexValue(p[i + `1`], p[i + `2`]);
815	if (u > `0x7f`) {
816	// unescape
817	*q++ = u;
818	} else {
819	// do not unescape
820	*q++ = p[i];
821	*q++ = p[i + `1`];
822	*q++ = p[i + `2`];
823	}
824	i += `2`;
825	} else {
826	*q++ = c;
827
828	// Check for "xn--" in an efficient, non-case-sensitive, way.
829	if (c == `'-'` && i >= `3` && !mayNeedHostNameDecoding && (q[-`4`] \| `0x20`) == `'x'` && (q[-`3`] \| `0x20`) == `'n'` && q[-`2`] == `'-'`)
830	mayNeedHostNameDecoding = true;
831	}
832	}
833	*q = `'\0'`;
834	}
835
836	// Check string to see if it can be converted to display using UTF-8
837	String result = String::fromUTF8(after.data());
838	if (!result) {
839	// Could not convert to UTF-8.
840	// Convert characters greater than 0x7f to escape sequences.
841	// Shift current string to the end of the buffer
842	// then we will copy back bytes to the start of the buffer
843	// as we convert.
844	int afterlength = q - after.data();
845	char* p = after.data() + bufferLength.unsafeGet() - afterlength - `1`;
846	memmove(p, after.data(), afterlength + `1`); // copies trailing '\0'
847	char* q = after.data();
848	while (*p) {
849	unsigned char c = *p;
850	if (c > `0x7f`) {
851	*q++ = `'%'`;
852	*q++ = upperNibbleToASCIIHexDigit(c);
853	*q++ = lowerNibbleToASCIIHexDigit(c);
854	} else
855	q++ = p;
856	p++;
857	}
858	*q = `'\0'`;
859	// Note: after.data() points to a null-terminated, pure ASCII string.
860	result = String::fromUTF8(after.data());
861	ASSERT(!!result);
862	}
863
864	// Note: result is UTF–16 string, created from either a valid UTF-8 string,
865	// or a pure ASCII string (where all bytes with the high bit set are
866	// percent-encoded).
867
868	if (mayNeedHostNameDecoding) {
869	// FIXME: Is it good to ignore the failure of mapHostNames and keep result intact?
870	auto mappedResult = mapHostNames(result, nullopt);
871	if (!!mappedResult)
872	result = mappedResult;
873	}
874
875	return escapeUnsafeCharacters(normalizedNFC(result));
876	}
877
878	} // namespace URLHelpers
879	} // namespace WTF
880

Browse the source code of jsc/Source/WTF/wtf/URLHelpers.cpp