1 /* 2 * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013 Apple Inc. All rights reserved. 3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "URL.h" 29 30 #include "DecodeEscapeSequences.h" 31 #include "MIMETypeRegistry.h" 32 #include "TextEncoding.h" 33 #include <stdio.h> 34 #include <unicode/uidna.h> 35 #include <wtf/HashMap.h> 36 #include <wtf/HexNumber.h> 37 #include <wtf/StdLibExtras.h> 38 #include <wtf/text/CString.h> 39 #include <wtf/text/StringBuilder.h> 40 #include <wtf/text/StringHash.h> 41 42 // FIXME: This file makes too much use of the + operator on String. 
// We either have to optimize that operator so it doesn't involve
// so many allocations, or change this to use StringBuffer instead.

using namespace WTF;

namespace WebCore {

// Character buffers used while assembling URL strings: char for encoded bytes,
// UChar for 16-bit text.
typedef Vector<char, 512> CharBuffer;
typedef Vector<UChar, 512> UCharBuffer;

// URL::port() reports invalidPortNumber when the port text fails to parse or the
// parsed value is greater than maximumValidPortNumber.
static const unsigned maximumValidPortNumber = 0xFFFE;
static const unsigned invalidPortNumber = 0xFFFF;

// Returns true when |character| equals |lowercaseLetter| once bit 0x20 has been set.
// For ASCII letters, setting that bit maps upper case onto lower case, which is why
// |lowercaseLetter| is asserted to be an ASCII lowercase letter.
static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
{
    ASSERT(isASCIILower(lowercaseLetter));
    return (character | 0x20) == lowercaseLetter;
}

// Scheme names and port digits stored as plain character arrays. Note that these are
// NOT null-terminated, so they must always be used together with their array size.
// NOTE(review): presumably consumed by default-port checks later in this file - confirm.
static const char wsScheme[] = {'w', 's'};
static const char ftpScheme[] = {'f', 't', 'p'};
static const char ftpPort[] = {'2', '1'};
static const char wssScheme[] = {'w', 's', 's'};
static const char fileScheme[] = {'f', 'i', 'l', 'e'};
static const char httpScheme[] = {'h', 't', 't', 'p'};
static const char httpPort[] = {'8', '0'};
static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
static const char httpsPort[] = {'4', '4', '3'};
static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
static const char gopherPort[] = {'7', '0'};

// 8-bit overload of the helper above; same contract.
static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
{
    ASSERT(isASCIILower(lowercaseLetter));
    return (character | 0x20) == lowercaseLetter;
}

// Bit flags describing which URL components a character may appear in. Each entry of
// characterClassTable below is a bitwise OR of these flags.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 1 << 0,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 1 << 1,

    // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 1 << 2,

    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 1 << 3,

    // hexdigit | ":" | "%"
    IPv6Char = 1 << 4,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 1 << 5,

    // not allowed in path
    BadChar = 1 << 6
};

// Lookup table indexed by an 8-bit character value (0-255). The comment in front of
// each entry gives the index and the character it stands for. Use the is*Char()
// helpers below rather than reading the table directly.
static const unsigned char characterClassTable[256] = {
    /* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar,
    /* 2 stx */ BadChar, /* 3 etx */ BadChar,
    /* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar,
    /* 8 bs */ BadChar, /* 9 ht */ BadChar, /* 10 nl */ BadChar, /* 11 vt */ BadChar,
    /* 12 np */ BadChar, /* 13 cr */ BadChar, /* 14 so */ BadChar, /* 15 si */ BadChar,
    /* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar,
    /* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar,
    /* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar,
    /* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar,
    /* 32 sp */ BadChar, /* 33 ! */ UserInfoChar,
    /* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar,
    /* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
    /* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar,
    /* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar,
    /* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar,
    /* 44 , */ UserInfoChar,
    /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
    /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 47 / */ PathSegmentEndChar,
    /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar,
    /* 60 < */ BadChar, /* 61 = */ UserInfoChar,
    /* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar,
    /* 64 @ */ 0,
    /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 91 [ */ 0,
    /* 92 \ */ 0, /* 93 ] */ 0,
    /* 94 ^ */ 0,
    /* 95 _ */ UserInfoChar | HostnameChar,
    /* 96 ` */ 0,
    /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 123 { */ 0,
    /* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar,
    /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
    /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
    /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
    /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
    /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
    /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
    /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
    /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
    /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
    /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
    /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
    /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
    /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
    /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
    /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
    /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
    /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
    /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
    /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
    /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
    /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
    /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
    /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
    /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
    /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
    /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
    /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
    /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
    /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
    /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
    /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
    /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};

// Forward declarations of helpers defined further down in this file.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
static String substituteBackslashes(const String&);

// Character-class predicates backed by characterClassTable.
// - The char overloads cast to unsigned char first so that a negative char value
//   cannot produce a negative table index.
// - The UChar overloads reject anything above 0xff before indexing, because the
//   table only has 256 entries.
static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }

// Compares one character of a URL scheme against one character of an expected scheme,
// ignoring ASCII case. |schemeCharacter| must already be in its "lowered" form: the
// assertions require bit 0x20 to be set and forbid uppercase letters, so that
// (character | 0x20) can be compared against it directly.
static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
{
    ASSERT(isSchemeChar(character));
    ASSERT(schemeCharacter & 0x20);
    ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
    return (character | 0x20) == schemeCharacter;
}

// Copies the source to the destination, assuming all the source characters are
// ASCII. The destination buffer must be large enough. Null characters are allowed
// in the source string, and no attempt is made to null-terminate the result.
261 static void copyASCII(const String& string, char* dest) 262 { 263 if (string.isEmpty()) 264 return; 265 266 if (string.is8Bit()) 267 memcpy(dest, string.characters8(), string.length()); 268 else { 269 const UChar* src = string.characters16(); 270 size_t length = string.length(); 271 for (size_t i = 0; i < length; i++) 272 dest[i] = static_cast<char>(src[i]); 273 } 274 } 275 276 static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer) 277 { 278 buffer.resize(base.length() + len + 1); 279 copyASCII(base, buffer.data()); 280 memcpy(buffer.data() + base.length(), rel, len); 281 buffer[buffer.size() - 1] = '\0'; 282 } 283 284 // FIXME: Move to WTFString.h eventually. 285 // Returns the index of the first index in string |s| of any of the characters 286 // in |toFind|. |toFind| should be a null-terminated string, all characters up 287 // to the null will be searched. Returns int if not found. 288 static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind) 289 { 290 for (int i = startPos; i < sLen; i++) { 291 const char* cur = toFind; 292 while (*cur) { 293 if (s[i] == *(cur++)) 294 return i; 295 } 296 } 297 return -1; 298 } 299 300 static inline void checkEncodedString(const String& url) 301 { 302 ASSERT_UNUSED(url, url.containsOnlyASCII()); 303 ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0])); 304 } 305 306 inline bool URL::protocolIs(const String& string, const char* protocol) 307 { 308 return WebCore::protocolIs(string, protocol); 309 } 310 311 void URL::invalidate() 312 { 313 m_isValid = false; 314 m_protocolIsInHTTPFamily = false; 315 #if PLATFORM(JAVA) 316 m_protocolIsInJar = false; 317 #endif 318 m_schemeEnd = 0; 319 m_userStart = 0; 320 m_userEnd = 0; 321 m_passwordEnd = 0; 322 m_hostEnd = 0; 323 m_portEnd = 0; 324 m_pathEnd = 0; 325 m_pathAfterLastSlash = 0; 326 m_queryEnd = 0; 327 m_fragmentEnd = 0; 328 } 329 330 URL::URL(ParsedURLStringTag, const String& url) 331 { 332 parse(url); 333 
ASSERT(url == m_string); 334 } 335 336 URL::URL(const URL& base, const String& relative) 337 { 338 init(base, relative, UTF8Encoding()); 339 } 340 341 URL::URL(const URL& base, const String& relative, const TextEncoding& encoding) 342 { 343 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as 344 // we do when submitting a form. A form with GET method 345 // has its contents added to a URL as query params and it makes sense 346 // to be consistent. 347 init(base, relative, encoding.encodingForFormSubmission()); 348 } 349 350 static bool shouldTrimFromURL(unsigned char c) 351 { 352 // Browsers ignore leading/trailing whitespace and control 353 // characters from URLs. Note that c is an *unsigned* char here 354 // so this comparison should only catch control characters. 355 return c <= ' '; 356 } 357 358 void URL::init(const URL& base, const String& relative, const TextEncoding& encoding) 359 { 360 // Allow resolutions with a null or empty base URL, but not with any other invalid one. 361 // FIXME: Is this a good rule? 362 if (!base.m_isValid && !base.isEmpty()) { 363 m_string = relative; 364 invalidate(); 365 return; 366 } 367 368 // For compatibility with Win IE, treat backslashes as if they were slashes, 369 // as long as we're not dealing with javascript: or data: URLs. 370 String rel = relative; 371 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data"))) 372 rel = substituteBackslashes(rel); 373 374 bool allASCII = rel.containsOnlyASCII(); 375 CharBuffer strBuffer; 376 char* str; 377 size_t len; 378 if (allASCII) { 379 len = rel.length(); 380 strBuffer.resize(len + 1); 381 copyASCII(rel, strBuffer.data()); 382 strBuffer[len] = 0; 383 str = strBuffer.data(); 384 } else { 385 encodeRelativeString(rel, encoding, strBuffer); 386 str = strBuffer.data(); 387 len = strlen(str); 388 } 389 390 // Get rid of leading whitespace and control characters. 
391 while (len && shouldTrimFromURL(*str)) { 392 str++; 393 --len; 394 } 395 396 // Get rid of trailing whitespace and control characters. 397 while (len && shouldTrimFromURL(str[len - 1])) 398 str[--len] = '\0'; 399 400 // According to the RFC, the reference should be interpreted as an 401 // absolute URI if possible, using the "leftmost, longest" 402 // algorithm. If the URI reference is absolute it will have a 403 // scheme, meaning that it will have a colon before the first 404 // non-scheme element. 405 bool absolute = false; 406 char* p = str; 407 if (isSchemeFirstChar(*p)) { 408 ++p; 409 while (isSchemeChar(*p)) { 410 ++p; 411 } 412 if (*p == ':') { 413 if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical()) 414 str = p + 1; 415 else 416 absolute = true; 417 } 418 } 419 420 CharBuffer parseBuffer; 421 422 if (absolute) { 423 parse(str, &relative); 424 } else { 425 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid 426 // unless the relative URL is a single fragment. 427 if (!base.isHierarchical()) { 428 if (str[0] == '#') { 429 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 430 parse(parseBuffer.data(), &relative); 431 #if PLATFORM(JAVA) 432 } else if(base.isJarFile()) { 433 appendASCII(base.m_string.left(base.m_pathAfterLastSlash), str, len, parseBuffer); 434 parse(parseBuffer.data(), &relative); 435 #endif 436 } else { 437 m_string = relative; 438 invalidate(); 439 } 440 return; 441 } 442 443 switch (str[0]) { 444 case '\0': 445 // The reference is empty, so this is a reference to the same document with any fragment identifier removed. 
446 *this = base; 447 removeFragmentIdentifier(); 448 break; 449 case '#': { 450 // must be fragment-only reference 451 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 452 parse(parseBuffer.data(), &relative); 453 break; 454 } 455 case '?': { 456 // query-only reference, special case needed for non-URL results 457 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer); 458 parse(parseBuffer.data(), &relative); 459 break; 460 } 461 case '/': 462 // must be net-path or absolute-path reference 463 if (str[1] == '/') { 464 // net-path 465 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer); 466 parse(parseBuffer.data(), &relative); 467 } else { 468 // abs-path 469 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer); 470 parse(parseBuffer.data(), &relative); 471 } 472 break; 473 default: 474 { 475 // must be relative-path reference 476 477 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte. 
478 const size_t bufferSize = base.m_pathEnd + 1 + len + 1; 479 parseBuffer.resize(bufferSize); 480 481 char* bufferPos = parseBuffer.data(); 482 char* bufferStart = bufferPos; 483 484 // first copy everything before the path from the base 485 CharBuffer baseStringBuffer(base.m_string.length()); 486 copyASCII(base.m_string, baseStringBuffer.data()); 487 const char* baseString = baseStringBuffer.data(); 488 const char* baseStringStart = baseString; 489 const char* pathStart = baseStringStart + base.m_portEnd; 490 while (baseStringStart < pathStart) 491 *bufferPos++ = *baseStringStart++; 492 char* bufferPathStart = bufferPos; 493 494 // now copy the base path 495 const char* baseStringEnd = baseString + base.m_pathEnd; 496 497 // go back to the last slash 498 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/') 499 baseStringEnd--; 500 501 if (baseStringEnd == baseStringStart) { 502 // no path in base, add a path separator if necessary 503 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#') 504 *bufferPos++ = '/'; 505 } else { 506 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart); 507 } 508 509 const char* relStringStart = str; 510 const char* relStringPos = relStringStart; 511 512 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') { 513 if (relStringPos[0] == '.' && bufferPos[-1] == '/') { 514 if (isPathSegmentEndChar(relStringPos[1])) { 515 // skip over "." segment 516 relStringPos += 1; 517 if (relStringPos[0] == '/') 518 relStringPos++; 519 continue; 520 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) { 521 // skip over ".." segment and rewind the last segment 522 // the RFC leaves it up to the app to decide what to do with excess 523 // ".." segments - we choose to drop them since some web content 524 // relies on this. 
525 relStringPos += 2; 526 if (relStringPos[0] == '/') 527 relStringPos++; 528 if (bufferPos > bufferPathStart + 1) 529 bufferPos--; 530 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/') 531 bufferPos--; 532 continue; 533 } 534 } 535 536 *bufferPos = *relStringPos; 537 relStringPos++; 538 bufferPos++; 539 } 540 541 // all done with the path work, now copy any remainder 542 // of the relative reference; this will also add a null terminator 543 strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart)); 544 545 parse(parseBuffer.data(), &relative); 546 547 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size()); 548 break; 549 } 550 } 551 } 552 } 553 554 URL URL::copy() const 555 { 556 URL result = *this; 557 result.m_string = result.m_string.isolatedCopy(); 558 return result; 559 } 560 561 String URL::lastPathComponent() const 562 { 563 if (!hasPath()) 564 return String(); 565 566 unsigned end = m_pathEnd - 1; 567 if (m_string[end] == '/') 568 --end; 569 570 size_t start = m_string.reverseFind('/', end); 571 if (start < static_cast<unsigned>(m_portEnd)) 572 return String(); 573 ++start; 574 575 return m_string.substring(start, end - start + 1); 576 } 577 578 String URL::protocol() const 579 { 580 return m_string.left(m_schemeEnd); 581 } 582 583 String URL::host() const 584 { 585 int start = hostStart(); 586 return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start)); 587 } 588 589 unsigned short URL::port() const 590 { 591 // We return a port of 0 if there is no port specified. This can happen in two situations: 592 // 1) The URL contains no colon after the host name and before the path component of the URL. 593 // 2) The URL contains a colon but there's no port number before the path component of the URL begins. 
594 if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1) 595 return 0; 596 597 bool ok = false; 598 unsigned number = charactersToUIntStrict(m_string.deprecatedCharacters() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok); 599 if (!ok || number > maximumValidPortNumber) 600 return invalidPortNumber; 601 return number; 602 } 603 604 String URL::pass() const 605 { 606 if (m_passwordEnd == m_userEnd) 607 return String(); 608 609 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1)); 610 } 611 612 String URL::user() const 613 { 614 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart)); 615 } 616 617 String URL::fragmentIdentifier() const 618 { 619 if (m_fragmentEnd == m_queryEnd) 620 return String(); 621 622 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1)); 623 } 624 625 bool URL::hasFragmentIdentifier() const 626 { 627 return m_fragmentEnd != m_queryEnd; 628 } 629 630 String URL::baseAsString() const 631 { 632 return m_string.left(m_pathAfterLastSlash); 633 } 634 635 #if !USE(CF) 636 String URL::fileSystemPath() const 637 { 638 if (!isValid() || !isLocalFile()) 639 return String(); 640 641 return decodeURLEscapeSequences(path()); 642 } 643 #endif 644 645 #ifdef NDEBUG 646 647 static inline void assertProtocolIsGood(const char*) 648 { 649 } 650 651 #else 652 653 static void assertProtocolIsGood(const char* protocol) 654 { 655 const char* p = protocol; 656 while (*p) { 657 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); 658 ++p; 659 } 660 } 661 662 #endif 663 664 bool URL::protocolIs(const char* protocol) const 665 { 666 assertProtocolIsGood(protocol); 667 668 // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid. 669 // The free function protocolIsJavaScript() should be used instead. 
670 ASSERT(!equalIgnoringCase(protocol, String("javascript"))); 671 672 if (!m_isValid) 673 return false; 674 675 // Do the comparison without making a new string object. 676 for (int i = 0; i < m_schemeEnd; ++i) { 677 if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i])) 678 return false; 679 } 680 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument. 681 } 682 683 String URL::query() const 684 { 685 if (m_queryEnd == m_pathEnd) 686 return String(); 687 688 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1)); 689 } 690 691 String URL::path() const 692 { 693 return m_string.substring(m_portEnd, m_pathEnd - m_portEnd); 694 } 695 696 bool URL::setProtocol(const String& s) 697 { 698 // Firefox and IE remove everything after the first ':'. 699 size_t separatorPosition = s.find(':'); 700 String newProtocol = s.substring(0, separatorPosition); 701 702 if (!isValidProtocol(newProtocol)) 703 return false; 704 705 if (!m_isValid) { 706 parse(newProtocol + ':' + m_string); 707 return true; 708 } 709 710 parse(newProtocol + m_string.substring(m_schemeEnd)); 711 return true; 712 } 713 714 void URL::setHost(const String& s) 715 { 716 if (!m_isValid) 717 return; 718 719 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 720 // and to avoid changing more than just the host. 721 722 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; 723 724 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd)); 725 } 726 727 void URL::removePort() 728 { 729 if (m_hostEnd == m_portEnd) 730 return; 731 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd)); 732 } 733 734 void URL::setPort(unsigned short i) 735 { 736 if (!m_isValid) 737 return; 738 739 bool colonNeeded = m_portEnd == m_hostEnd; 740 int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1); 741 742 parse(m_string.left(portStart) + (colonNeeded ? 
":" : "") + String::number(i) + m_string.substring(m_portEnd));
}

// Replaces the host and port portion of the URL with |hostAndPort| and re-parses.
// Does nothing when the URL is invalid.
void URL::setHostAndPort(const String& hostAndPort)
{
    if (!m_isValid)
        return;

    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
    // and to avoid changing more than just host and port.

    // When the user info starts right after the scheme's ':', the current string has
    // no "//" authority marker yet, so one has to be inserted ahead of the new host.
    bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;

    parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
}

// Replaces (or, for an empty |user|, removes) the user name and re-parses.
// Does nothing when the URL is invalid.
void URL::setUser(const String& user)
{
    if (!m_isValid)
        return;

    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
    // and to avoid changing more than just the user login.

    // |end| is the offset up to which the old text is discarded.
    int end = m_userEnd;
    if (!user.isEmpty()) {
        String u = user;
        // No "//" in the current string: add it so the user lands in an authority.
        if (m_userStart == m_schemeEnd + 1)
            u = "//" + u;
        // Add '@' if we didn't have one before.
        if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
            u.append('@');
        parse(m_string.left(m_userStart) + u + m_string.substring(end));
    } else {
        // Remove '@' if we now have neither user nor password.
        if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
            end += 1;
        // We don't want to parse in the extremely common case where we are not going to make a change.
        if (m_userStart != end)
            parse(m_string.left(m_userStart) + m_string.substring(end));
    }
}

// Replaces (or, for an empty |password|, removes) the password and re-parses.
// Does nothing when the URL is invalid.
void URL::setPass(const String& password)
{
    if (!m_isValid)
        return;

    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
    // and to avoid changing more than just the user password.

    // |end| is the offset up to which the old text is discarded.
    int end = m_passwordEnd;
    if (!password.isEmpty()) {
        // The replacement always carries its own ':' prefix and '@' suffix.
        String p = ":" + password + "@";
        // No "//" in the current string: add it so the password lands in an authority.
        if (m_userEnd == m_schemeEnd + 1)
            p = "//" + p;
        // Eat the existing '@' since we are going to add our own.
        if (end != m_hostEnd && m_string[end] == '@')
            end += 1;
        parse(m_string.left(m_userEnd) + p + m_string.substring(end));
    } else {
        // Remove '@' if we now have neither user nor password.
        if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
            end += 1;
        // We don't want to parse in the extremely common case where we are not going to make a change.
        if (m_userEnd != end)
            parse(m_string.left(m_userEnd) + m_string.substring(end));
    }
}

// Replaces everything after the query with "#" + |s| and re-parses.
// Does nothing when the URL is invalid.
void URL::setFragmentIdentifier(const String& s)
{
    if (!m_isValid)
        return;

    // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
    parse(m_string.left(m_queryEnd) + "#" + s);
}

// Drops everything after the query (the '#' and the fragment) and re-parses.
// Does nothing when the URL is invalid.
void URL::removeFragmentIdentifier()
{
    if (!m_isValid)
        return;
    parse(m_string.left(m_queryEnd));
}

// Replaces the query component and re-parses. A '?' is prepended unless |query|
// already starts with one; a null |query| removes the query entirely.
// Does nothing when the URL is invalid.
void URL::setQuery(const String& query)
{
    if (!m_isValid)
        return;

    // FIXME: '#' and non-ASCII characters must be encoded and escaped.
    // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
    // access to the document in this function.
    if ((query.isEmpty() || query[0] != '?') && !query.isNull())
        parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
    else
        parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));

}

// Replaces the path component with the escaped form of |s| (a leading '/' is
// added when missing) and re-parses. Does nothing when the URL is invalid.
void URL::setPath(const String& s)
{
    if (!m_isValid)
        return;

    // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
    // may be inadvertently affected.
    String path = s;
    if (path.isEmpty() || path[0] != '/')
        path = "/" + path;

    parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
}

#if PLATFORM(JAVA)
// Rebuilds a string form of the URL from its components. The password is not
// emitted, and a file URL without authority still gets "//" after the scheme.
// An invalid URL returns its stored string unchanged.
String URL::deprecatedString() const
{
    if (!m_isValid)
        return m_string;

    StringBuilder result;

    result.append(protocol());
    result.append(':');

    StringBuilder authority;

    // Only build an authority when there is text between the end of the
    // credentials and the end of the host.
    if (m_hostEnd != m_passwordEnd) {
        if (m_userEnd != m_userStart) {
            authority.append(user());
            authority.append('@');
        }
        authority.append(host());
        if (hasPort()) {
            authority.append(':');
            authority.append(String::number(port()));
        }
    }

    if (!authority.isEmpty()) {
        result.append('/');
        result.append('/');
        result.append(authority.deprecatedCharacters(), authority.length());
    } else if (protocolIs("file")) {
        result.append('/');
        result.append('/');
    }

    result.append(path());

    if (m_pathEnd != m_queryEnd) {
        result.append('?');
        result.append(query());
    }

    if (m_fragmentEnd != m_queryEnd) {
        result.append('#');
        result.append(fragmentIdentifier());
    }

    return result.toString();
}
#endif

// Decodes URL escape sequences in |string|, interpreting the decoded bytes as UTF-8.
String decodeURLEscapeSequences(const String& string)
{
    return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
}

// Decodes URL escape sequences in |string|, interpreting the decoded bytes with |encoding|.
String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
{
    return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
}

// Caution: This function does not bounds check.
918 static void appendEscapedChar(char*& buffer, unsigned char c) 919 { 920 *buffer++ = '%'; 921 placeByteAsHex(c, buffer); 922 } 923 924 static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length) 925 { 926 char* p = buffer; 927 928 const char* str = strStart; 929 const char* strEnd = strStart + length; 930 while (str < strEnd) { 931 unsigned char c = *str++; 932 if (isBadChar(c)) { 933 if (c == '%' || c == '?') 934 *p++ = c; 935 else if (c != 0x09 && c != 0x0a && c != 0x0d) 936 appendEscapedChar(p, c); 937 } else 938 *p++ = c; 939 } 940 941 buffer = p; 942 } 943 944 static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length) 945 { 946 char* p = buffer; 947 948 const char* str = strStart; 949 const char* strEnd = strStart + length; 950 while (str < strEnd) { 951 unsigned char c = *str++; 952 // Strip CR, LF and Tab from fragments, per: 953 // https://bugs.webkit.org/show_bug.cgi?id=8770 954 if (c == 0x09 || c == 0x0a || c == 0x0d) 955 continue; 956 957 // Chrome and IE allow non-ascii characters in fragments, however doing 958 // so would hit an ASSERT in checkEncodedString, so for now we don't. 959 if (c < 0x20 || c >= 127) { 960 appendEscapedChar(p, c); 961 continue; 962 } 963 *p++ = c; 964 } 965 966 buffer = p; 967 } 968 969 // copy a path, accounting for "." and ".." segments 970 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd) 971 { 972 char* bufferPathStart = dst; 973 974 // empty path is a special case, and need not have a leading slash 975 if (srcStart != srcEnd) { 976 const char* baseStringStart = src + srcStart; 977 const char* baseStringEnd = src + srcEnd; 978 const char* baseStringPos = baseStringStart; 979 980 // this code is unprepared for paths that do not begin with a 981 // slash and we should always have one in the source string 982 #if !PLATFORM(JAVA) 983 //in JAVA the complex protocols like "jar:file" are available. 
984 ASSERT(baseStringPos[0] == '/'); 985 #endif 986 987 // copy the leading slash into the destination 988 *dst = *baseStringPos; 989 baseStringPos++; 990 dst++; 991 992 while (baseStringPos < baseStringEnd) { 993 if (baseStringPos[0] == '.' && dst[-1] == '/') { 994 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) { 995 // skip over "." segment 996 baseStringPos += 2; 997 continue; 998 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' || 999 baseStringPos + 2 == baseStringEnd)) { 1000 // skip over ".." segment and rewind the last segment 1001 // the RFC leaves it up to the app to decide what to do with excess 1002 // ".." segments - we choose to drop them since some web content 1003 // relies on this. 1004 baseStringPos += 3; 1005 if (dst > bufferPathStart + 1) 1006 dst--; 1007 while (dst > bufferPathStart && dst[-1] != '/') 1008 dst--; 1009 continue; 1010 } 1011 } 1012 1013 *dst = *baseStringPos; 1014 baseStringPos++; 1015 dst++; 1016 } 1017 } 1018 *dst = '\0'; 1019 return dst - bufferPathStart; 1020 } 1021 1022 static inline bool hasSlashDotOrDotDot(const char* str) 1023 { 1024 const unsigned char* p = reinterpret_cast<const unsigned char*>(str); 1025 if (!*p) 1026 return false; 1027 unsigned char pc = *p; 1028 while (unsigned char c = *++p) { 1029 if (c == '.' 
&& (pc == '/' || pc == '.')) 1030 return true; 1031 pc = c; 1032 } 1033 return false; 1034 } 1035 1036 void URL::parse(const String& string) 1037 { 1038 checkEncodedString(string); 1039 1040 CharBuffer buffer(string.length() + 1); 1041 copyASCII(string, buffer.data()); 1042 buffer[string.length()] = '\0'; 1043 parse(buffer.data(), &string); 1044 } 1045 1046 #if PLATFORM(IOS) 1047 static bool shouldCanonicalizeScheme = true; 1048 1049 void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization) 1050 { 1051 shouldCanonicalizeScheme = enableSchemeCanonicalization; 1052 } 1053 #endif 1054 1055 template<size_t length> 1056 static inline bool equal(const char* a, const char (&b)[length]) 1057 { 1058 #if PLATFORM(IOS) 1059 if (!shouldCanonicalizeScheme) { 1060 for (size_t i = 0; i < length; ++i) { 1061 if (toASCIILower(a[i]) != b[i]) 1062 return false; 1063 } 1064 return true; 1065 } 1066 #endif 1067 for (size_t i = 0; i < length; ++i) { 1068 if (a[i] != b[i]) 1069 return false; 1070 } 1071 return true; 1072 } 1073 1074 template<size_t lengthB> 1075 static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB]) 1076 { 1077 return lengthA == lengthB && equal(stringA, stringB); 1078 } 1079 1080 // List of default schemes is taken from google-url: 1081 // http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120 1082 static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength) 1083 { 1084 // This switch is theoretically a performance optimization. It came over when 1085 // the code was moved from google-url, but may be removed later. 
1086 switch (schemeLength) { 1087 case 2: 1088 return equal(scheme, wsScheme) && equal(port, portLength, httpPort); 1089 case 3: 1090 if (equal(scheme, ftpScheme)) 1091 return equal(port, portLength, ftpPort); 1092 if (equal(scheme, wssScheme)) 1093 return equal(port, portLength, httpsPort); 1094 break; 1095 case 4: 1096 return equal(scheme, httpScheme) && equal(port, portLength, httpPort); 1097 case 5: 1098 return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort); 1099 case 6: 1100 return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort); 1101 } 1102 return false; 1103 } 1104 1105 static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar) 1106 { 1107 return userinfoEndChar == '@' && hostStart == portEnd; 1108 } 1109 1110 static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength) 1111 { 1112 switch (schemeLength) { 1113 case 2: 1114 return equal(scheme, wsScheme); 1115 case 3: 1116 return equal(scheme, ftpScheme) || equal(scheme, wssScheme); 1117 case 4: 1118 return equal(scheme, httpScheme); 1119 case 5: 1120 return equal(scheme, httpsScheme); 1121 case 6: 1122 return equal(scheme, gopherScheme); 1123 } 1124 return false; 1125 } 1126 1127 static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength) 1128 { 1129 switch (schemeLength) { 1130 case 2: 1131 return equal(scheme, wsScheme); 1132 case 3: 1133 return equal(scheme, ftpScheme) || equal(scheme, wssScheme); 1134 case 4: 1135 return equal(scheme, httpScheme) || equal(scheme, fileScheme); 1136 case 5: 1137 return equal(scheme, httpsScheme); 1138 case 6: 1139 return equal(scheme, gopherScheme); 1140 } 1141 return false; 1142 } 1143 1144 void URL::parse(const char* url, const String* originalString) 1145 { 1146 if (!url || url[0] == '\0') { 1147 // valid URL must be non-empty 1148 m_string = originalString ? 
*originalString : url; 1149 invalidate(); 1150 return; 1151 } 1152 1153 if (!isSchemeFirstChar(url[0])) { 1154 // scheme must start with an alphabetic character 1155 m_string = originalString ? *originalString : url; 1156 invalidate(); 1157 return; 1158 } 1159 1160 int schemeEnd = 0; 1161 while (isSchemeChar(url[schemeEnd])) 1162 schemeEnd++; 1163 1164 if (url[schemeEnd] != ':') { 1165 m_string = originalString ? *originalString : url; 1166 invalidate(); 1167 return; 1168 } 1169 1170 int userStart = schemeEnd + 1; 1171 int userEnd; 1172 int passwordStart; 1173 int passwordEnd; 1174 int hostStart; 1175 int hostEnd; 1176 int portStart; 1177 int portEnd; 1178 1179 bool hierarchical = url[schemeEnd + 1] == '/'; 1180 bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/'; 1181 1182 bool isFile = schemeEnd == 4 1183 && isLetterMatchIgnoringCase(url[0], 'f') 1184 && isLetterMatchIgnoringCase(url[1], 'i') 1185 && isLetterMatchIgnoringCase(url[2], 'l') 1186 && isLetterMatchIgnoringCase(url[3], 'e'); 1187 1188 #if PLATFORM(JAVA) 1189 m_protocolIsInJar = schemeEnd == 3 1190 && isLetterMatchIgnoringCase(url[0], 'j') 1191 && isLetterMatchIgnoringCase(url[1], 'a') 1192 && isLetterMatchIgnoringCase(url[2], 'r'); 1193 #endif 1194 1195 m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h') 1196 && isLetterMatchIgnoringCase(url[1], 't') 1197 && isLetterMatchIgnoringCase(url[2], 't') 1198 && isLetterMatchIgnoringCase(url[3], 'p') 1199 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':')); 1200 1201 if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) { 1202 // The part after the scheme is either a net_path or an abs_path whose first path segment is empty. 1203 // Attempt to find an authority. 1204 // FIXME: Authority characters may be scanned twice, and it would be nice to be faster. 
1205 1206 if (hierarchical) 1207 userStart++; 1208 if (hasSecondSlash) 1209 userStart++; 1210 userEnd = userStart; 1211 1212 int colonPos = 0; 1213 while (isUserInfoChar(url[userEnd])) { 1214 if (url[userEnd] == ':' && colonPos == 0) 1215 colonPos = userEnd; 1216 userEnd++; 1217 } 1218 1219 if (url[userEnd] == '@') { 1220 // actual end of the userinfo, start on the host 1221 if (colonPos != 0) { 1222 passwordEnd = userEnd; 1223 userEnd = colonPos; 1224 passwordStart = colonPos + 1; 1225 } else 1226 passwordStart = passwordEnd = userEnd; 1227 1228 hostStart = passwordEnd + 1; 1229 } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) { 1230 // hit the end of the authority, must have been no user 1231 // or looks like an IPv6 hostname 1232 // either way, try to parse it as a hostname 1233 userEnd = userStart; 1234 passwordStart = passwordEnd = userEnd; 1235 hostStart = userStart; 1236 } else { 1237 // invalid character 1238 m_string = originalString ? *originalString : url; 1239 invalidate(); 1240 return; 1241 } 1242 1243 hostEnd = hostStart; 1244 1245 // IPV6 IP address 1246 if (url[hostEnd] == '[') { 1247 hostEnd++; 1248 while (isIPv6Char(url[hostEnd])) 1249 hostEnd++; 1250 if (url[hostEnd] == ']') 1251 hostEnd++; 1252 else { 1253 // invalid character 1254 m_string = originalString ? *originalString : url; 1255 invalidate(); 1256 return; 1257 } 1258 } else { 1259 while (isHostnameChar(url[hostEnd])) 1260 hostEnd++; 1261 } 1262 1263 if (url[hostEnd] == ':') { 1264 portStart = portEnd = hostEnd + 1; 1265 1266 // possible start of port 1267 portEnd = portStart; 1268 while (isASCIIDigit(url[portEnd])) 1269 portEnd++; 1270 } else 1271 portStart = portEnd = hostEnd; 1272 1273 if (!isPathSegmentEndChar(url[portEnd])) { 1274 // invalid character 1275 m_string = originalString ? 
*originalString : url; 1276 invalidate(); 1277 return; 1278 } 1279 1280 if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) { 1281 m_string = originalString ? *originalString : url; 1282 invalidate(); 1283 return; 1284 } 1285 1286 if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) { 1287 // No authority found, which means that this is not a net_path, but rather an abs_path whose first two 1288 // path segments are empty. For file, http and https only, an empty authority is allowed. 1289 userStart -= 2; 1290 userEnd = userStart; 1291 passwordStart = userEnd; 1292 passwordEnd = passwordStart; 1293 hostStart = passwordEnd; 1294 hostEnd = hostStart; 1295 portStart = hostEnd; 1296 portEnd = hostEnd; 1297 } 1298 } else { 1299 // the part after the scheme must be an opaque_part or an abs_path 1300 userEnd = userStart; 1301 passwordStart = passwordEnd = userEnd; 1302 hostStart = hostEnd = passwordEnd; 1303 portStart = portEnd = hostEnd; 1304 } 1305 1306 int pathStart = portEnd; 1307 int pathEnd = pathStart; 1308 while (url[pathEnd] && url[pathEnd] != '?' 
&& url[pathEnd] != '#') 1309 pathEnd++; 1310 1311 int queryStart = pathEnd; 1312 int queryEnd = queryStart; 1313 if (url[queryStart] == '?') { 1314 while (url[queryEnd] && url[queryEnd] != '#') 1315 queryEnd++; 1316 } 1317 1318 int fragmentStart = queryEnd; 1319 int fragmentEnd = fragmentStart; 1320 if (url[fragmentStart] == '#') { 1321 fragmentStart++; 1322 fragmentEnd = fragmentStart; 1323 while (url[fragmentEnd]) 1324 fragmentEnd++; 1325 } 1326 1327 // assemble it all, remembering the real ranges 1328 1329 Vector<char, 4096> buffer(fragmentEnd * 3 + 1); 1330 1331 char *p = buffer.data(); 1332 const char *strPtr = url; 1333 1334 // copy in the scheme 1335 const char *schemeEndPtr = url + schemeEnd; 1336 #if PLATFORM(IOS) 1337 if (shouldCanonicalizeScheme || m_protocolIsInHTTPFamily) { 1338 while (strPtr < schemeEndPtr) 1339 *p++ = toASCIILower(*strPtr++); 1340 } else { 1341 while (strPtr < schemeEndPtr) 1342 *p++ = *strPtr++; 1343 } 1344 #else 1345 while (strPtr < schemeEndPtr) 1346 *p++ = toASCIILower(*strPtr++); 1347 #endif 1348 m_schemeEnd = p - buffer.data(); 1349 1350 bool hostIsLocalHost = portEnd - userStart == 9 1351 && isLetterMatchIgnoringCase(url[userStart], 'l') 1352 && isLetterMatchIgnoringCase(url[userStart+1], 'o') 1353 && isLetterMatchIgnoringCase(url[userStart+2], 'c') 1354 && isLetterMatchIgnoringCase(url[userStart+3], 'a') 1355 && isLetterMatchIgnoringCase(url[userStart+4], 'l') 1356 && isLetterMatchIgnoringCase(url[userStart+5], 'h') 1357 && isLetterMatchIgnoringCase(url[userStart+6], 'o') 1358 && isLetterMatchIgnoringCase(url[userStart+7], 's') 1359 && isLetterMatchIgnoringCase(url[userStart+8], 't'); 1360 1361 // File URLs need a host part unless it is just file:// or file://localhost 1362 bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost); 1363 1364 // We drop empty credentials, but keep a colon in an empty host/port pair. 
1365 // Removing hostname completely would change the structure of the URL on re-parsing. 1366 bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd; 1367 1368 // add ":" after scheme 1369 *p++ = ':'; 1370 1371 // if we have at least one authority part or a file URL - add "//" and authority 1372 if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) { 1373 *p++ = '/'; 1374 *p++ = '/'; 1375 1376 m_userStart = p - buffer.data(); 1377 1378 // copy in the user 1379 strPtr = url + userStart; 1380 const char* userEndPtr = url + userEnd; 1381 while (strPtr < userEndPtr) { 1382 char c = *strPtr++; 1383 ASSERT(isUserInfoChar(c)); 1384 *p++ = c; 1385 } 1386 m_userEnd = p - buffer.data(); 1387 1388 // copy in the password 1389 if (passwordEnd != passwordStart) { 1390 *p++ = ':'; 1391 strPtr = url + passwordStart; 1392 const char* passwordEndPtr = url + passwordEnd; 1393 while (strPtr < passwordEndPtr) { 1394 char c = *strPtr++; 1395 ASSERT(isUserInfoChar(c)); 1396 *p++ = c; 1397 } 1398 } 1399 m_passwordEnd = p - buffer.data(); 1400 1401 // If we had any user info, add "@" 1402 if (p - buffer.data() != m_userStart) 1403 *p++ = '@'; 1404 1405 // copy in the host, except in the case of a file URL with authority="localhost" 1406 if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) { 1407 strPtr = url + hostStart; 1408 const char* hostEndPtr = url + hostEnd; 1409 if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) { 1410 while (strPtr < hostEndPtr) { 1411 char c = toASCIILower(*strPtr++); 1412 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':'); 1413 *p++ = c; 1414 } 1415 } else { 1416 while (strPtr < hostEndPtr) { 1417 char c = *strPtr++; 1418 ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':'); 1419 *p++ = c; 1420 } 1421 } 1422 } 1423 m_hostEnd = p - buffer.data(); 1424 1425 // Copy in the port if the URL has one (and it's not default). 
Also, copy it if there was no hostname, so that there is still something in authority component. 1426 if (hostEnd != portStart) { 1427 const char* portStr = url + portStart; 1428 size_t portLength = portEnd - portStart; 1429 if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd)) 1430 || (hostStart == hostEnd && hostEnd != portStart)) { 1431 *p++ = ':'; 1432 const char* portEndPtr = url + portEnd; 1433 while (portStr < portEndPtr) 1434 *p++ = *portStr++; 1435 } 1436 } 1437 m_portEnd = p - buffer.data(); 1438 } else { 1439 if (isFile) { 1440 ASSERT(degenerateFilePath); 1441 *p++ = '/'; 1442 *p++ = '/'; 1443 } 1444 m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data(); 1445 } 1446 1447 // For canonicalization, ensure we have a '/' for no path. 1448 // Do this only for URL with protocol file, http or https. 1449 if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart) 1450 *p++ = '/'; 1451 1452 // add path, escaping bad characters 1453 if (!hierarchical) 1454 escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart); 1455 else if (!hasSlashDotOrDotDot(url)) 1456 appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart); 1457 else { 1458 CharBuffer pathBuffer(pathEnd - pathStart + 1); 1459 size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd); 1460 appendEscapingBadChars(p, pathBuffer.data(), length); 1461 } 1462 1463 m_pathEnd = p - buffer.data(); 1464 1465 // Find the position after the last slash in the path, or 1466 // the position before the path if there are no slashes in it. 
1467 int i; 1468 for (i = m_pathEnd; i > m_portEnd; --i) { 1469 if (buffer[i - 1] == '/') 1470 break; 1471 } 1472 m_pathAfterLastSlash = i; 1473 1474 // add query, escaping bad characters 1475 appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart); 1476 m_queryEnd = p - buffer.data(); 1477 1478 // add fragment, escaping bad characters 1479 if (fragmentEnd != queryEnd) { 1480 *p++ = '#'; 1481 escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart); 1482 } 1483 m_fragmentEnd = p - buffer.data(); 1484 1485 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size())); 1486 ASSERT(buffer.size() > 0); 1487 1488 // If we didn't end up actually changing the original string and 1489 // it was already in a String, reuse it to avoid extra allocation. 1490 if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd)) 1491 m_string = *originalString; 1492 else 1493 m_string = String(buffer.data(), m_fragmentEnd); 1494 1495 m_isValid = true; 1496 } 1497 1498 bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b) 1499 { 1500 if (a.m_queryEnd != b.m_queryEnd) 1501 return false; 1502 unsigned queryLength = a.m_queryEnd; 1503 for (unsigned i = 0; i < queryLength; ++i) 1504 if (a.string()[i] != b.string()[i]) 1505 return false; 1506 return true; 1507 } 1508 1509 bool protocolHostAndPortAreEqual(const URL& a, const URL& b) 1510 { 1511 if (a.m_schemeEnd != b.m_schemeEnd) 1512 return false; 1513 1514 int hostStartA = a.hostStart(); 1515 int hostLengthA = a.hostEnd() - hostStartA; 1516 int hostStartB = b.hostStart(); 1517 int hostLengthB = b.hostEnd() - b.hostStart(); 1518 if (hostLengthA != hostLengthB) 1519 return false; 1520 1521 // Check the scheme 1522 for (int i = 0; i < a.m_schemeEnd; ++i) 1523 if (a.string()[i] != b.string()[i]) 1524 return false; 1525 1526 // And the host 1527 for (int i = 0; i < hostLengthA; ++i) 1528 if (a.string()[hostStartA + i] != b.string()[hostStartB + i]) 1529 return false; 1530 
1531 if (a.port() != b.port()) 1532 return false; 1533 1534 return true; 1535 } 1536 1537 String encodeWithURLEscapeSequences(const String& notEncodedString) 1538 { 1539 CString asUTF8 = notEncodedString.utf8(); 1540 1541 CharBuffer buffer(asUTF8.length() * 3 + 1); 1542 char* p = buffer.data(); 1543 1544 const char* str = asUTF8.data(); 1545 const char* strEnd = str + asUTF8.length(); 1546 while (str < strEnd) { 1547 unsigned char c = *str++; 1548 if (isBadChar(c)) 1549 appendEscapedChar(p, c); 1550 else 1551 *p++ = c; 1552 } 1553 1554 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size())); 1555 1556 return String(buffer.data(), p - buffer.data()); 1557 } 1558 1559 // Appends the punycoded hostname identified by the given string and length to 1560 // the output buffer. The result will not be null terminated. 1561 static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen) 1562 { 1563 // Needs to be big enough to hold an IDN-encoded name. 1564 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. 1565 const unsigned hostnameBufferLength = 2048; 1566 1567 if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) { 1568 buffer.append(str, strLen); 1569 return; 1570 } 1571 1572 UChar hostnameBuffer[hostnameBufferLength]; 1573 UErrorCode error = U_ZERO_ERROR; 1574 int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer, 1575 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error); 1576 if (error == U_ZERO_ERROR) 1577 buffer.append(hostnameBuffer, numCharactersConverted); 1578 } 1579 1580 static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<std::pair<int, int>>& nameRanges) 1581 { 1582 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character. 1583 // Skip quoted strings so that characters in them don't confuse us. 1584 // When we find a '?' 
character, we are past the part of the URL that contains host names. 1585 1586 nameRanges.clear(); 1587 1588 int p = 0; 1589 while (1) { 1590 // Find start of host name or of quoted string. 1591 int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?"); 1592 if (hostnameOrStringStart == -1) 1593 return; 1594 UChar c = str[hostnameOrStringStart]; 1595 p = hostnameOrStringStart + 1; 1596 1597 if (c == '?') 1598 return; 1599 1600 if (c == '@') { 1601 // Find end of host name. 1602 int hostnameStart = p; 1603 int hostnameEnd = findFirstOf(str, strLen, p, ">,?"); 1604 bool done; 1605 if (hostnameEnd == -1) { 1606 hostnameEnd = strLen; 1607 done = true; 1608 } else { 1609 p = hostnameEnd; 1610 done = false; 1611 } 1612 1613 nameRanges.append(std::make_pair(hostnameStart, hostnameEnd)); 1614 1615 if (done) 1616 return; 1617 } else { 1618 // Skip quoted string. 1619 ASSERT(c == '"'); 1620 while (1) { 1621 int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\"); 1622 if (escapedCharacterOrStringEnd == -1) 1623 return; 1624 1625 c = str[escapedCharacterOrStringEnd]; 1626 p = escapedCharacterOrStringEnd + 1; 1627 1628 // If we are the end of the string, then break from the string loop back to the host name loop. 1629 if (c == '"') 1630 break; 1631 1632 // Skip escaped character. 1633 ASSERT(c == '\\'); 1634 if (p == strLen) 1635 return; 1636 1637 ++p; 1638 } 1639 } 1640 } 1641 } 1642 1643 static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset) 1644 { 1645 // Find the host name in a hierarchical URL. 1646 // It comes after a "://" sequence, with scheme characters preceding, and 1647 // this should be the first colon in the string. 1648 // It ends with the end of the string or a ":" or a path segment ending character. 1649 // If there is a "@" character, the host part is just the part after the "@". 
1650 int separator = findFirstOf(str, strLen, 0, ":"); 1651 if (separator == -1 || separator + 2 >= strLen || 1652 str[separator + 1] != '/' || str[separator + 2] != '/') 1653 return false; 1654 1655 // Check that all characters before the :// are valid scheme characters. 1656 if (!isSchemeFirstChar(str[0])) 1657 return false; 1658 for (int i = 1; i < separator; ++i) { 1659 if (!isSchemeChar(str[i])) 1660 return false; 1661 } 1662 1663 // Start after the separator. 1664 int authorityStart = separator + 3; 1665 1666 // Find terminating character. 1667 int hostnameEnd = strLen; 1668 for (int i = authorityStart; i < strLen; ++i) { 1669 UChar c = str[i]; 1670 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) { 1671 hostnameEnd = i; 1672 break; 1673 } 1674 } 1675 1676 // Find "@" for the start of the host name. 1677 int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@"); 1678 int hostnameStart; 1679 if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd) 1680 hostnameStart = authorityStart; 1681 else 1682 hostnameStart = userInfoTerminator + 1; 1683 1684 startOffset = hostnameStart; 1685 endOffset = hostnameEnd; 1686 return true; 1687 } 1688 1689 // Converts all hostnames found in the given input to punycode, preserving the 1690 // rest of the URL unchanged. The output will NOT be null-terminated. 
1691 static void encodeHostnames(const String& str, UCharBuffer& output) 1692 { 1693 output.clear(); 1694 1695 if (protocolIs(str, "mailto")) { 1696 Vector<std::pair<int, int>> hostnameRanges; 1697 findHostnamesInMailToURL(str.deprecatedCharacters(), str.length(), hostnameRanges); 1698 int n = hostnameRanges.size(); 1699 int p = 0; 1700 for (int i = 0; i < n; ++i) { 1701 const std::pair<int, int>& r = hostnameRanges[i]; 1702 output.append(&str.deprecatedCharacters()[p], r.first - p); 1703 appendEncodedHostname(output, &str.deprecatedCharacters()[r.first], r.second - r.first); 1704 p = r.second; 1705 } 1706 // This will copy either everything after the last hostname, or the 1707 // whole thing if there is no hostname. 1708 output.append(&str.deprecatedCharacters()[p], str.length() - p); 1709 } else { 1710 int hostStart, hostEnd; 1711 if (findHostnameInHierarchicalURL(str.deprecatedCharacters(), str.length(), hostStart, hostEnd)) { 1712 output.append(str.deprecatedCharacters(), hostStart); // Before hostname. 1713 appendEncodedHostname(output, &str.deprecatedCharacters()[hostStart], hostEnd - hostStart); 1714 output.append(&str.deprecatedCharacters()[hostEnd], str.length() - hostEnd); // After hostname. 1715 } else { 1716 // No hostname to encode, return the input. 1717 output.append(str.deprecatedCharacters(), str.length()); 1718 } 1719 } 1720 } 1721 1722 static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output) 1723 { 1724 UCharBuffer s; 1725 encodeHostnames(rel, s); 1726 1727 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme. 1728 1729 int pathEnd = -1; 1730 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) { 1731 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise. 
1732 pathEnd = findFirstOf(s.data(), s.size(), 0, "#?"); 1733 } 1734 1735 if (pathEnd == -1) { 1736 CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables); 1737 output.resize(decoded.length()); 1738 memcpy(output.data(), decoded.data(), decoded.length()); 1739 } else { 1740 CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables); 1741 // Unencodable characters in URLs are represented by converting 1742 // them to XML entities and escaping non-alphanumeric characters. 1743 CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables); 1744 1745 output.resize(pathDecoded.length() + otherDecoded.length()); 1746 memcpy(output.data(), pathDecoded.data(), pathDecoded.length()); 1747 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length()); 1748 } 1749 output.append('\0'); // null-terminate the output. 1750 } 1751 1752 static String substituteBackslashes(const String& string) 1753 { 1754 size_t questionPos = string.find('?'); 1755 size_t hashPos = string.find('#'); 1756 unsigned pathEnd; 1757 1758 if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos)) 1759 pathEnd = hashPos; 1760 else if (questionPos != notFound) 1761 pathEnd = questionPos; 1762 else 1763 pathEnd = string.length(); 1764 1765 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd); 1766 } 1767 1768 bool URL::isHierarchical() const 1769 { 1770 if (!m_isValid) 1771 return false; 1772 ASSERT(m_string[m_schemeEnd] == ':'); 1773 return m_string[m_schemeEnd + 1] == '/'; 1774 } 1775 1776 void URL::copyToBuffer(Vector<char, 512>& buffer) const 1777 { 1778 // FIXME: This throws away the high bytes of all the characters in the string! 1779 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs. 
1780 buffer.resize(m_string.length()); 1781 copyASCII(m_string, buffer.data()); 1782 } 1783 1784 bool protocolIs(const String& url, const char* protocol) 1785 { 1786 // Do the comparison without making a new string object. 1787 assertProtocolIsGood(protocol); 1788 for (int i = 0; ; ++i) { 1789 if (!protocol[i]) 1790 return url[i] == ':'; 1791 if (!isLetterMatchIgnoringCase(url[i], protocol[i])) 1792 return false; 1793 } 1794 } 1795 1796 bool isValidProtocol(const String& protocol) 1797 { 1798 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 1799 if (protocol.isEmpty()) 1800 return false; 1801 if (!isSchemeFirstChar(protocol[0])) 1802 return false; 1803 unsigned protocolLength = protocol.length(); 1804 for (unsigned i = 1; i < protocolLength; i++) { 1805 if (!isSchemeChar(protocol[i])) 1806 return false; 1807 } 1808 return true; 1809 } 1810 1811 #ifndef NDEBUG 1812 void URL::print() const 1813 { 1814 printf("%s\n", m_string.utf8().data()); 1815 } 1816 #endif 1817 1818 String URL::strippedForUseAsReferrer() const 1819 { 1820 URL referrer(*this); 1821 referrer.setUser(String()); 1822 referrer.setPass(String()); 1823 referrer.removeFragmentIdentifier(); 1824 return referrer.string(); 1825 } 1826 1827 bool URL::isLocalFile() const 1828 { 1829 // Including feed here might be a bad idea since drag and drop uses this check 1830 // and including feed would allow feeds to potentially let someone's blog 1831 // read the contents of the clipboard on a drag, even without a drop. 1832 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function. 1833 return protocolIs("file"); 1834 } 1835 1836 bool protocolIsJavaScript(const String& url) 1837 { 1838 return protocolIs(url, "javascript"); 1839 } 1840 1841 bool protocolIsInHTTPFamily(const String& url) 1842 { 1843 // Do the comparison without making a new string object. 
1844 return isLetterMatchIgnoringCase(url[0], 'h') 1845 && isLetterMatchIgnoringCase(url[1], 't') 1846 && isLetterMatchIgnoringCase(url[2], 't') 1847 && isLetterMatchIgnoringCase(url[3], 'p') 1848 && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':')); 1849 } 1850 1851 const URL& blankURL() 1852 { 1853 DEFINE_STATIC_LOCAL(URL, staticBlankURL, (ParsedURLString, "about:blank")); 1854 return staticBlankURL; 1855 } 1856 1857 bool URL::isBlankURL() const 1858 { 1859 return protocolIs("about"); 1860 } 1861 1862 bool isDefaultPortForProtocol(unsigned short port, const String& protocol) 1863 { 1864 if (protocol.isEmpty()) 1865 return false; 1866 1867 typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap; 1868 DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ()); 1869 if (defaultPorts.isEmpty()) { 1870 defaultPorts.set("http", 80); 1871 defaultPorts.set("https", 443); 1872 defaultPorts.set("ftp", 21); 1873 defaultPorts.set("ftps", 990); 1874 } 1875 return defaultPorts.get(protocol) == port; 1876 } 1877 1878 bool portAllowed(const URL& url) 1879 { 1880 unsigned short port = url.port(); 1881 1882 // Since most URLs don't have a port, return early for the "no port" case. 1883 if (!port) 1884 return true; 1885 1886 // This blocked port list matches the port blocking that Mozilla implements. 1887 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information. 
1888 static const unsigned short blockedPortList[] = { 1889 1, // tcpmux 1890 7, // echo 1891 9, // discard 1892 11, // systat 1893 13, // daytime 1894 15, // netstat 1895 17, // qotd 1896 19, // chargen 1897 20, // FTP-data 1898 21, // FTP-control 1899 22, // SSH 1900 23, // telnet 1901 25, // SMTP 1902 37, // time 1903 42, // name 1904 43, // nicname 1905 53, // domain 1906 77, // priv-rjs 1907 79, // finger 1908 87, // ttylink 1909 95, // supdup 1910 101, // hostriame 1911 102, // iso-tsap 1912 103, // gppitnp 1913 104, // acr-nema 1914 109, // POP2 1915 110, // POP3 1916 111, // sunrpc 1917 113, // auth 1918 115, // SFTP 1919 117, // uucp-path 1920 119, // nntp 1921 123, // NTP 1922 135, // loc-srv / epmap 1923 139, // netbios 1924 143, // IMAP2 1925 179, // BGP 1926 389, // LDAP 1927 465, // SMTP+SSL 1928 512, // print / exec 1929 513, // login 1930 514, // shell 1931 515, // printer 1932 526, // tempo 1933 530, // courier 1934 531, // Chat 1935 532, // netnews 1936 540, // UUCP 1937 556, // remotefs 1938 563, // NNTP+SSL 1939 587, // ESMTP 1940 601, // syslog-conn 1941 636, // LDAP+SSL 1942 993, // IMAP+SSL 1943 995, // POP3+SSL 1944 2049, // NFS 1945 3659, // apple-sasl / PasswordServer [Apple addition] 1946 4045, // lockd 1947 6000, // X11 1948 6665, // Alternate IRC [Apple addition] 1949 6666, // Alternate IRC [Apple addition] 1950 6667, // Standard IRC [Apple addition] 1951 6668, // Alternate IRC [Apple addition] 1952 6669, // Alternate IRC [Apple addition] 1953 invalidPortNumber, // Used to block all invalid port numbers 1954 }; 1955 const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList); 1956 1957 #ifndef NDEBUG 1958 // The port list must be sorted for binary_search to work. 
1959 static bool checkedPortList = false; 1960 if (!checkedPortList) { 1961 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p) 1962 ASSERT(*p < *(p + 1)); 1963 checkedPortList = true; 1964 } 1965 #endif 1966 1967 // If the port is not in the blocked port list, allow it. 1968 if (!std::binary_search(blockedPortList, blockedPortListEnd, port)) 1969 return true; 1970 1971 // Allow ports 21 and 22 for FTP URLs, as Mozilla does. 1972 if ((port == 21 || port == 22) && url.protocolIs("ftp")) 1973 return true; 1974 1975 // Allow any port number in a file URL, since the port number is ignored. 1976 if (url.protocolIs("file")) 1977 return true; 1978 1979 return false; 1980 } 1981 1982 String mimeTypeFromDataURL(const String& url) 1983 { 1984 ASSERT(protocolIs(url, "data")); 1985 size_t index = url.find(';'); 1986 if (index == notFound) 1987 index = url.find(','); 1988 if (index != notFound) { 1989 if (index > 5) 1990 return url.substring(5, index - 5).lower(); 1991 return "text/plain"; // Data URLs with no MIME type are considered text/plain. 1992 } 1993 return ""; 1994 } 1995 1996 String mimeTypeFromURL(const URL& url) 1997 { 1998 String decodedPath = decodeURLEscapeSequences(url.path()); 1999 String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1); 2000 2001 // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure 2002 return MIMETypeRegistry::getMIMETypeForExtension(extension); 2003 } 2004 2005 bool URL::isSafeToSendToAnotherThread() const 2006 { 2007 return m_string.isSafeToSendToAnotherThread(); 2008 } 2009 2010 String URL::stringCenterEllipsizedToLength(unsigned length) const 2011 { 2012 if (string().length() <= length) 2013 return string(); 2014 2015 return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2); 2016 } 2017 2018 }