1 | // Copyright 2016 the V8 project authors. All rights reserved. |
2 | // Use of this source code is governed by a BSD-style license that can be |
3 | // found in the LICENSE file. |
4 | |
5 | #include "src/uri.h" |
6 | |
7 | #include <vector> |
8 | |
9 | #include "src/char-predicates-inl.h" |
10 | #include "src/isolate-inl.h" |
11 | #include "src/string-search.h" |
12 | #include "src/unicode-inl.h" |
13 | |
14 | namespace v8 { |
15 | namespace internal { |
16 | |
17 | namespace { // anonymous namespace for DecodeURI helper functions |
18 | bool IsReservedPredicate(uc16 c) { |
19 | switch (c) { |
20 | case '#': |
21 | case '$': |
22 | case '&': |
23 | case '+': |
24 | case ',': |
25 | case '/': |
26 | case ':': |
27 | case ';': |
28 | case '=': |
29 | case '?': |
30 | case '@': |
31 | return true; |
32 | default: |
33 | return false; |
34 | } |
35 | } |
36 | |
// Returns true iff |octets| holds exactly the three-byte UTF-8 encoding
// (0xEF 0xBF 0xBD) of the replacement character U+FFFD.
// The length test comes first so that no octet is read from a shorter buffer.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xEF && octets[1] == 0xBF &&
         octets[2] == 0xBD;
}
46 | |
47 | bool DecodeOctets(const uint8_t* octets, int length, |
48 | std::vector<uc16>* buffer) { |
49 | size_t cursor = 0; |
50 | uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor); |
51 | if (value == unibrow::Utf8::kBadChar && |
52 | !IsReplacementCharacter(octets, length)) { |
53 | return false; |
54 | } |
55 | |
56 | if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { |
57 | buffer->push_back(value); |
58 | } else { |
59 | buffer->push_back(unibrow::Utf16::LeadSurrogate(value)); |
60 | buffer->push_back(unibrow::Utf16::TrailSurrogate(value)); |
61 | } |
62 | return true; |
63 | } |
64 | |
65 | int TwoDigitHex(uc16 character1, uc16 character2) { |
66 | if (character1 > 'f') return -1; |
67 | int high = HexValue(character1); |
68 | if (high == -1) return -1; |
69 | if (character2 > 'f') return -1; |
70 | int low = HexValue(character2); |
71 | if (low == -1) return -1; |
72 | return (high << 4) + low; |
73 | } |
74 | |
75 | template <typename T> |
76 | void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index, |
77 | bool is_uri, std::vector<T>* buffer) { |
78 | if (is_uri && IsReservedPredicate(decoded)) { |
79 | buffer->push_back('%'); |
80 | uc16 first = uri_content->Get(index + 1); |
81 | uc16 second = uri_content->Get(index + 2); |
82 | DCHECK_GT(std::numeric_limits<T>::max(), first); |
83 | DCHECK_GT(std::numeric_limits<T>::max(), second); |
84 | |
85 | buffer->push_back(first); |
86 | buffer->push_back(second); |
87 | } else { |
88 | buffer->push_back(decoded); |
89 | } |
90 | } |
91 | |
// Decodes |uri_content| from position |index| up to |uri_length| and appends
// the resulting UTF-16 code units to |buffer|. Returns false as soon as a
// malformed escape sequence is found; |buffer| may then hold a partial result.
// |is_uri| is forwarded to AddToBuffer for escapes at or below
// unibrow::Utf8::kMaxOneByteChar.
bool IntoTwoByte(int index, bool is_uri, int uri_length,
                 String::FlatContent* uri_content, std::vector<uc16>* buffer) {
  for (int k = index; k < uri_length; k++) {
    uc16 code = uri_content->Get(k);
    if (code == '%') {
      int two_digits;
      // A '%' must be followed by two hex digits that lie inside the string.
      if (k + 2 >= uri_length ||
          (two_digits = TwoDigitHex(uri_content->Get(k + 1),
                                    uri_content->Get(k + 2))) < 0) {
        return false;
      }
      // k now points at the second hex digit of the escape.
      k += 2;
      uc16 decoded = static_cast<uc16>(two_digits);
      if (decoded > unibrow::Utf8::kMaxOneByteChar) {
        // The escape is the first octet of a longer sequence; collect the
        // remaining octets, each of which must itself be written as "%XX".
        uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
        octets[0] = decoded;

        // Bits 6, 5, 4, ... of the first octet are tested in turn; every set
        // bit demands one more escaped octet. Despite its name, the counter
        // equals the total number of octets stored once the loop exits.
        int number_of_continuation_bytes = 0;
        while ((decoded << ++number_of_continuation_bytes) & 0x80) {
          // At most three further octets are accepted, and the next "%XX"
          // (positions k + 1 .. k + 3) must lie inside the string.
          if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
            return false;
          }
          // ++k moves onto the expected '%'; the digits follow at k + 1, k + 2.
          if (uri_content->Get(++k) != '%' ||
              (two_digits = TwoDigitHex(uri_content->Get(k + 1),
                                        uri_content->Get(k + 2))) < 0) {
            return false;
          }
          k += 2;
          uc16 continuation_byte = static_cast<uc16>(two_digits);
          octets[number_of_continuation_bytes] = continuation_byte;
        }

        // Validation of the collected octets is left to DecodeOctets.
        if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
          return false;
        }
      } else {
        // k - 2 is the index of the '%' that started this escape.
        AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
      }
    } else {
      // Unescaped characters are copied through unchanged.
      buffer->push_back(code);
    }
  }
  return true;
}
136 | |
137 | bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri, |
138 | std::vector<uint8_t>* one_byte_buffer, |
139 | std::vector<uc16>* two_byte_buffer) { |
140 | DisallowHeapAllocation no_gc; |
141 | String::FlatContent uri_content = uri->GetFlatContent(no_gc); |
142 | |
143 | int uri_length = uri->length(); |
144 | for (int k = 0; k < uri_length; k++) { |
145 | uc16 code = uri_content.Get(k); |
146 | if (code == '%') { |
147 | int two_digits; |
148 | if (k + 2 >= uri_length || |
149 | (two_digits = TwoDigitHex(uri_content.Get(k + 1), |
150 | uri_content.Get(k + 2))) < 0) { |
151 | return false; |
152 | } |
153 | |
154 | uc16 decoded = static_cast<uc16>(two_digits); |
155 | if (decoded > unibrow::Utf8::kMaxOneByteChar) { |
156 | return IntoTwoByte(k, is_uri, uri_length, &uri_content, |
157 | two_byte_buffer); |
158 | } |
159 | |
160 | AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer); |
161 | k += 2; |
162 | } else { |
163 | if (code > unibrow::Utf8::kMaxOneByteChar) { |
164 | return IntoTwoByte(k, is_uri, uri_length, &uri_content, |
165 | two_byte_buffer); |
166 | } |
167 | one_byte_buffer->push_back(code); |
168 | } |
169 | } |
170 | return true; |
171 | } |
172 | |
173 | } // anonymous namespace |
174 | |
175 | MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri, |
176 | bool is_uri) { |
177 | uri = String::Flatten(isolate, uri); |
178 | std::vector<uint8_t> one_byte_buffer; |
179 | std::vector<uc16> two_byte_buffer; |
180 | |
181 | if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) { |
182 | THROW_NEW_ERROR(isolate, NewURIError(), String); |
183 | } |
184 | |
185 | if (two_byte_buffer.empty()) { |
186 | return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>( |
187 | one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size()))); |
188 | } |
189 | |
190 | Handle<SeqTwoByteString> result; |
191 | int result_length = |
192 | static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size()); |
193 | ASSIGN_RETURN_ON_EXCEPTION( |
194 | isolate, result, isolate->factory()->NewRawTwoByteString(result_length), |
195 | String); |
196 | |
197 | DisallowHeapAllocation no_gc; |
198 | CopyChars(result->GetChars(no_gc), one_byte_buffer.data(), |
199 | one_byte_buffer.size()); |
200 | CopyChars(result->GetChars(no_gc) + one_byte_buffer.size(), |
201 | two_byte_buffer.data(), two_byte_buffer.size()); |
202 | |
203 | return result; |
204 | } |
205 | |
206 | namespace { // anonymous namespace for EncodeURI helper functions |
207 | bool IsUnescapePredicateInUriComponent(uc16 c) { |
208 | if (IsAlphaNumeric(c)) { |
209 | return true; |
210 | } |
211 | |
212 | switch (c) { |
213 | case '!': |
214 | case '\'': |
215 | case '(': |
216 | case ')': |
217 | case '*': |
218 | case '-': |
219 | case '.': |
220 | case '_': |
221 | case '~': |
222 | return true; |
223 | default: |
224 | return false; |
225 | } |
226 | } |
227 | |
228 | bool IsUriSeparator(uc16 c) { |
229 | switch (c) { |
230 | case '#': |
231 | case ':': |
232 | case ';': |
233 | case '/': |
234 | case '?': |
235 | case '$': |
236 | case '&': |
237 | case '+': |
238 | case ',': |
239 | case '@': |
240 | case '=': |
241 | return true; |
242 | default: |
243 | return false; |
244 | } |
245 | } |
246 | |
247 | void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) { |
248 | buffer->push_back('%'); |
249 | buffer->push_back(HexCharOfValue(octet >> 4)); |
250 | buffer->push_back(HexCharOfValue(octet & 0x0F)); |
251 | } |
252 | |
253 | void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) { |
254 | char s[4] = {}; |
255 | int number_of_bytes; |
256 | number_of_bytes = |
257 | unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false); |
258 | for (int k = 0; k < number_of_bytes; k++) { |
259 | AddEncodedOctetToBuffer(s[k], buffer); |
260 | } |
261 | } |
262 | |
263 | void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) { |
264 | char s[4] = {}; |
265 | int number_of_bytes = |
266 | unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2), |
267 | unibrow::Utf16::kNoPreviousCharacter, false); |
268 | for (int k = 0; k < number_of_bytes; k++) { |
269 | AddEncodedOctetToBuffer(s[k], buffer); |
270 | } |
271 | } |
272 | |
273 | } // anonymous namespace |
274 | |
275 | MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri, |
276 | bool is_uri) { |
277 | uri = String::Flatten(isolate, uri); |
278 | int uri_length = uri->length(); |
279 | std::vector<uint8_t> buffer; |
280 | buffer.reserve(uri_length); |
281 | |
282 | { |
283 | DisallowHeapAllocation no_gc; |
284 | String::FlatContent uri_content = uri->GetFlatContent(no_gc); |
285 | |
286 | for (int k = 0; k < uri_length; k++) { |
287 | uc16 cc1 = uri_content.Get(k); |
288 | if (unibrow::Utf16::IsLeadSurrogate(cc1)) { |
289 | k++; |
290 | if (k < uri_length) { |
291 | uc16 cc2 = uri->Get(k); |
292 | if (unibrow::Utf16::IsTrailSurrogate(cc2)) { |
293 | EncodePair(cc1, cc2, &buffer); |
294 | continue; |
295 | } |
296 | } |
297 | } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) { |
298 | if (IsUnescapePredicateInUriComponent(cc1) || |
299 | (is_uri && IsUriSeparator(cc1))) { |
300 | buffer.push_back(cc1); |
301 | } else { |
302 | EncodeSingle(cc1, &buffer); |
303 | } |
304 | continue; |
305 | } |
306 | |
307 | AllowHeapAllocation allocate_error_and_return; |
308 | THROW_NEW_ERROR(isolate, NewURIError(), String); |
309 | } |
310 | } |
311 | |
312 | return isolate->factory()->NewStringFromOneByte(VectorOf(buffer)); |
313 | } |
314 | |
315 | namespace { // Anonymous namespace for Escape and Unescape |
316 | |
317 | template <typename Char> |
318 | int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) { |
319 | uint16_t character = vector[i]; |
320 | int32_t hi = 0; |
321 | int32_t lo = 0; |
322 | if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' && |
323 | (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 && |
324 | (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) { |
325 | *step = 6; |
326 | return (hi << 8) + lo; |
327 | } else if (character == '%' && i <= length - 3 && |
328 | (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) { |
329 | *step = 3; |
330 | return lo; |
331 | } else { |
332 | *step = 1; |
333 | return character; |
334 | } |
335 | } |
336 | |
337 | template <typename Char> |
338 | MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string, |
339 | int start_index) { |
340 | bool one_byte = true; |
341 | int length = string->length(); |
342 | |
343 | int unescaped_length = 0; |
344 | { |
345 | DisallowHeapAllocation no_allocation; |
346 | Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); |
347 | for (int i = start_index; i < length; unescaped_length++) { |
348 | int step; |
349 | if (UnescapeChar(vector, i, length, &step) > |
350 | String::kMaxOneByteCharCode) { |
351 | one_byte = false; |
352 | } |
353 | i += step; |
354 | } |
355 | } |
356 | |
357 | DCHECK(start_index < length); |
358 | Handle<String> first_part = |
359 | isolate->factory()->NewProperSubString(string, 0, start_index); |
360 | |
361 | int dest_position = 0; |
362 | Handle<String> second_part; |
363 | DCHECK_LE(unescaped_length, String::kMaxLength); |
364 | if (one_byte) { |
365 | Handle<SeqOneByteString> dest = isolate->factory() |
366 | ->NewRawOneByteString(unescaped_length) |
367 | .ToHandleChecked(); |
368 | DisallowHeapAllocation no_allocation; |
369 | Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); |
370 | for (int i = start_index; i < length; dest_position++) { |
371 | int step; |
372 | dest->SeqOneByteStringSet(dest_position, |
373 | UnescapeChar(vector, i, length, &step)); |
374 | i += step; |
375 | } |
376 | second_part = dest; |
377 | } else { |
378 | Handle<SeqTwoByteString> dest = isolate->factory() |
379 | ->NewRawTwoByteString(unescaped_length) |
380 | .ToHandleChecked(); |
381 | DisallowHeapAllocation no_allocation; |
382 | Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); |
383 | for (int i = start_index; i < length; dest_position++) { |
384 | int step; |
385 | dest->SeqTwoByteStringSet(dest_position, |
386 | UnescapeChar(vector, i, length, &step)); |
387 | i += step; |
388 | } |
389 | second_part = dest; |
390 | } |
391 | return isolate->factory()->NewConsString(first_part, second_part); |
392 | } |
393 | |
394 | bool IsNotEscaped(uint16_t c) { |
395 | if (IsAlphaNumeric(c)) { |
396 | return true; |
397 | } |
398 | // @*_+-./ |
399 | switch (c) { |
400 | case '@': |
401 | case '*': |
402 | case '_': |
403 | case '+': |
404 | case '-': |
405 | case '.': |
406 | case '/': |
407 | return true; |
408 | default: |
409 | return false; |
410 | } |
411 | } |
412 | |
413 | template <typename Char> |
414 | static MaybeHandle<String> UnescapePrivate(Isolate* isolate, |
415 | Handle<String> source) { |
416 | int index; |
417 | { |
418 | DisallowHeapAllocation no_allocation; |
419 | StringSearch<uint8_t, Char> search(isolate, StaticCharVector("%" )); |
420 | index = search.Search(source->GetCharVector<Char>(no_allocation), 0); |
421 | if (index < 0) return source; |
422 | } |
423 | return UnescapeSlow<Char>(isolate, source, index); |
424 | } |
425 | |
426 | template <typename Char> |
427 | static MaybeHandle<String> EscapePrivate(Isolate* isolate, |
428 | Handle<String> string) { |
429 | DCHECK(string->IsFlat()); |
430 | int escaped_length = 0; |
431 | int length = string->length(); |
432 | |
433 | { |
434 | DisallowHeapAllocation no_allocation; |
435 | Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); |
436 | for (int i = 0; i < length; i++) { |
437 | uint16_t c = vector[i]; |
438 | if (c >= 256) { |
439 | escaped_length += 6; |
440 | } else if (IsNotEscaped(c)) { |
441 | escaped_length++; |
442 | } else { |
443 | escaped_length += 3; |
444 | } |
445 | |
446 | // We don't allow strings that are longer than a maximal length. |
447 | DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow. |
448 | if (escaped_length > String::kMaxLength) break; // Provoke exception. |
449 | } |
450 | } |
451 | |
452 | // No length change implies no change. Return original string if no change. |
453 | if (escaped_length == length) return string; |
454 | |
455 | Handle<SeqOneByteString> dest; |
456 | ASSIGN_RETURN_ON_EXCEPTION( |
457 | isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length), |
458 | String); |
459 | int dest_position = 0; |
460 | |
461 | { |
462 | DisallowHeapAllocation no_allocation; |
463 | Vector<const Char> vector = string->GetCharVector<Char>(no_allocation); |
464 | for (int i = 0; i < length; i++) { |
465 | uint16_t c = vector[i]; |
466 | if (c >= 256) { |
467 | dest->SeqOneByteStringSet(dest_position, '%'); |
468 | dest->SeqOneByteStringSet(dest_position + 1, 'u'); |
469 | dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12)); |
470 | dest->SeqOneByteStringSet(dest_position + 3, |
471 | HexCharOfValue((c >> 8) & 0xF)); |
472 | dest->SeqOneByteStringSet(dest_position + 4, |
473 | HexCharOfValue((c >> 4) & 0xF)); |
474 | dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xF)); |
475 | dest_position += 6; |
476 | } else if (IsNotEscaped(c)) { |
477 | dest->SeqOneByteStringSet(dest_position, c); |
478 | dest_position++; |
479 | } else { |
480 | dest->SeqOneByteStringSet(dest_position, '%'); |
481 | dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4)); |
482 | dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xF)); |
483 | dest_position += 3; |
484 | } |
485 | } |
486 | } |
487 | |
488 | return dest; |
489 | } |
490 | |
491 | } // Anonymous namespace |
492 | |
493 | MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) { |
494 | Handle<String> result; |
495 | string = String::Flatten(isolate, string); |
496 | return String::IsOneByteRepresentationUnderneath(*string) |
497 | ? EscapePrivate<uint8_t>(isolate, string) |
498 | : EscapePrivate<uc16>(isolate, string); |
499 | } |
500 | |
501 | MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) { |
502 | Handle<String> result; |
503 | string = String::Flatten(isolate, string); |
504 | return String::IsOneByteRepresentationUnderneath(*string) |
505 | ? UnescapePrivate<uint8_t>(isolate, string) |
506 | : UnescapePrivate<uc16>(isolate, string); |
507 | } |
508 | |
509 | } // namespace internal |
510 | } // namespace v8 |
511 | |