1 /*
   2  * Copyright (C) 2004, 2007, 2008, 2011, 2012, 2013 Apple Inc. All rights reserved.
   3  * Copyright (C) 2012 Research In Motion Limited. All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26 
  27 #include "config.h"
  28 #include "URL.h"
  29 
  30 #include "DecodeEscapeSequences.h"
  31 #include "MIMETypeRegistry.h"
  32 #include "TextEncoding.h"
  33 #include <stdio.h>
  34 #include <unicode/uidna.h>
  35 #include <wtf/HashMap.h>
  36 #include <wtf/HexNumber.h>
  37 #include <wtf/StdLibExtras.h>
  38 #include <wtf/text/CString.h>
  39 #include <wtf/text/StringBuilder.h>
  40 #include <wtf/text/StringHash.h>
  41 
  42 // FIXME: This file makes too much use of the + operator on String.
  43 // We either have to optimize that operator so it doesn't involve
  44 // so many allocations, or change this to use StringBuffer instead.
  45 
  46 using namespace WTF;
  47 
  48 namespace WebCore {
  49 
  50 typedef Vector<char, 512> CharBuffer;
  51 typedef Vector<UChar, 512> UCharBuffer;
  52 
  53 static const unsigned maximumValidPortNumber = 0xFFFE;
  54 static const unsigned invalidPortNumber = 0xFFFF;
  55 
  56 static inline bool isLetterMatchIgnoringCase(UChar character, char lowercaseLetter)
  57 {
  58     ASSERT(isASCIILower(lowercaseLetter));
  59     return (character | 0x20) == lowercaseLetter;
  60 }
  61 
  62 static const char wsScheme[] = {'w', 's'};
  63 static const char ftpScheme[] = {'f', 't', 'p'};
  64 static const char ftpPort[] = {'2', '1'};
  65 static const char wssScheme[] = {'w', 's', 's'};
  66 static const char fileScheme[] = {'f', 'i', 'l', 'e'};
  67 static const char httpScheme[] = {'h', 't', 't', 'p'};
  68 static const char httpPort[] = {'8', '0'};
  69 static const char httpsScheme[] = {'h', 't', 't', 'p', 's'};
  70 static const char httpsPort[] = {'4', '4', '3'};
  71 static const char gopherScheme[] = {'g', 'o', 'p', 'h', 'e', 'r'};
  72 static const char gopherPort[] = {'7', '0'};
  73 
  74 static inline bool isLetterMatchIgnoringCase(char character, char lowercaseLetter)
  75 {
  76     ASSERT(isASCIILower(lowercaseLetter));
  77     return (character | 0x20) == lowercaseLetter;
  78 }
  79 
  80 enum URLCharacterClasses {
  81     // alpha
  82     SchemeFirstChar = 1 << 0,
  83 
  84     // ( alpha | digit | "+" | "-" | "." )
  85     SchemeChar = 1 << 1,
  86 
  87     // mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
  88     // unreserved  = alphanum | mark
  89     // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
  90     UserInfoChar = 1 << 2,
  91 
  92     // alnum | "." | "-" | "%"
  93     // The above is what the specification says, but we are lenient to
  94     // match existing practice and also allow:
  95     // "_"
  96     HostnameChar = 1 << 3,
  97 
  98     // hexdigit | ":" | "%"
  99     IPv6Char = 1 << 4,
 100 
 101     // "#" | "?" | "/" | nul
 102     PathSegmentEndChar = 1 << 5,
 103 
 104     // not allowed in path
 105     BadChar = 1 << 6
 106 };
 107 
 108 static const unsigned char characterClassTable[256] = {
 109     /* 0 nul */ PathSegmentEndChar,    /* 1 soh */ BadChar,
 110     /* 2 stx */ BadChar,    /* 3 etx */ BadChar,
 111     /* 4 eot */ BadChar,    /* 5 enq */ BadChar,    /* 6 ack */ BadChar,    /* 7 bel */ BadChar,
 112     /* 8 bs */ BadChar,     /* 9 ht */ BadChar,     /* 10 nl */ BadChar,    /* 11 vt */ BadChar,
 113     /* 12 np */ BadChar,    /* 13 cr */ BadChar,    /* 14 so */ BadChar,    /* 15 si */ BadChar,
 114     /* 16 dle */ BadChar,   /* 17 dc1 */ BadChar,   /* 18 dc2 */ BadChar,   /* 19 dc3 */ BadChar,
 115     /* 20 dc4 */ BadChar,   /* 21 nak */ BadChar,   /* 22 syn */ BadChar,   /* 23 etb */ BadChar,
 116     /* 24 can */ BadChar,   /* 25 em */ BadChar,    /* 26 sub */ BadChar,   /* 27 esc */ BadChar,
 117     /* 28 fs */ BadChar,    /* 29 gs */ BadChar,    /* 30 rs */ BadChar,    /* 31 us */ BadChar,
 118     /* 32 sp */ BadChar,    /* 33  ! */ UserInfoChar,
 119     /* 34  " */ BadChar,    /* 35  # */ PathSegmentEndChar | BadChar,
 120     /* 36  $ */ UserInfoChar,    /* 37  % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
 121     /* 38  & */ UserInfoChar,    /* 39  ' */ UserInfoChar,
 122     /* 40  ( */ UserInfoChar,    /* 41  ) */ UserInfoChar,
 123     /* 42  * */ UserInfoChar,    /* 43  + */ SchemeChar | UserInfoChar,
 124     /* 44  , */ UserInfoChar,
 125     /* 45  - */ SchemeChar | UserInfoChar | HostnameChar,
 126     /* 46  . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 127     /* 47  / */ PathSegmentEndChar,
 128     /* 48  0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 129     /* 49  1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 130     /* 50  2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 131     /* 51  3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 132     /* 52  4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 133     /* 53  5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 134     /* 54  6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 135     /* 55  7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 136     /* 56  8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 137     /* 57  9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 138     /* 58  : */ UserInfoChar | IPv6Char,    /* 59  ; */ UserInfoChar,
 139     /* 60  < */ BadChar,    /* 61  = */ UserInfoChar,
 140     /* 62  > */ BadChar,    /* 63  ? */ PathSegmentEndChar | BadChar,
 141     /* 64  @ */ 0,
 142     /* 65  A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 143     /* 66  B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 144     /* 67  C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 145     /* 68  D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 146     /* 69  E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 147     /* 70  F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 148     /* 71  G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 149     /* 72  H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 150     /* 73  I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 151     /* 74  J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 152     /* 75  K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 153     /* 76  L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 154     /* 77  M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 155     /* 78  N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 156     /* 79  O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 157     /* 80  P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 158     /* 81  Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 159     /* 82  R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 160     /* 83  S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 161     /* 84  T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 162     /* 85  U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 163     /* 86  V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 164     /* 87  W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 165     /* 88  X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 166     /* 89  Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 167     /* 90  Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 168     /* 91  [ */ 0,
 169     /* 92  \ */ 0,    /* 93  ] */ 0,
 170     /* 94  ^ */ 0,
 171     /* 95  _ */ UserInfoChar | HostnameChar,
 172     /* 96  ` */ 0,
 173     /* 97  a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 174     /* 98  b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 175     /* 99  c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 176     /* 100  d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 177     /* 101  e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 178     /* 102  f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
 179     /* 103  g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 180     /* 104  h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 181     /* 105  i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 182     /* 106  j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 183     /* 107  k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 184     /* 108  l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 185     /* 109  m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 186     /* 110  n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 187     /* 111  o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 188     /* 112  p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 189     /* 113  q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 190     /* 114  r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 191     /* 115  s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 192     /* 116  t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 193     /* 117  u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 194     /* 118  v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 195     /* 119  w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 196     /* 120  x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 197     /* 121  y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 198     /* 122  z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
 199     /* 123  { */ 0,
 200     /* 124  | */ 0,   /* 125  } */ 0,   /* 126  ~ */ UserInfoChar,   /* 127 del */ BadChar,
 201     /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
 202     /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
 203     /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
 204     /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
 205     /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
 206     /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
 207     /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
 208     /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
 209     /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
 210     /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
 211     /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
 212     /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
 213     /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
 214     /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
 215     /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
 216     /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
 217     /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
 218     /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
 219     /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
 220     /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
 221     /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
 222     /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
 223     /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
 224     /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
 225     /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
 226     /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
 227     /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
 228     /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
 229     /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
 230     /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
 231     /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
 232     /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
 233 };
 234 
 235 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
 236 static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
 237 static String substituteBackslashes(const String&);
 238 
 239 static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
 240 static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
 241 static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
 242 static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
 243 static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
 244 static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
 245 static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
 246 static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
 247 static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
 248 static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }
 249 
 250 static inline bool isSchemeCharacterMatchIgnoringCase(char character, char schemeCharacter)
 251 {
 252     ASSERT(isSchemeChar(character));
 253     ASSERT(schemeCharacter & 0x20);
 254     ASSERT(isASCIILower(schemeCharacter) || (!isASCIIUpper(schemeCharacter) && isSchemeChar(schemeCharacter)));
 255     return (character | 0x20) == schemeCharacter;
 256 }
 257 
 258 // Copies the source to the destination, assuming all the source characters are
 259 // ASCII. The destination buffer must be large enough. Null characters are allowed
 260 // in the source string, and no attempt is made to null-terminate the result.
 261 static void copyASCII(const String& string, char* dest)
 262 {
 263     if (string.isEmpty())
 264         return;
 265 
 266     if (string.is8Bit())
 267         memcpy(dest, string.characters8(), string.length());
 268     else {
 269         const UChar* src = string.characters16();
 270         size_t length = string.length();
 271         for (size_t i = 0; i < length; i++)
 272             dest[i] = static_cast<char>(src[i]);
 273     }
 274 }
 275 
 276 static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer)
 277 {
 278     buffer.resize(base.length() + len + 1);
 279     copyASCII(base, buffer.data());
 280     memcpy(buffer.data() + base.length(), rel, len);
 281     buffer[buffer.size() - 1] = '\0';
 282 }
 283 
 284 // FIXME: Move to WTFString.h eventually.
 285 // Returns the index of the first index in string |s| of any of the characters
 286 // in |toFind|. |toFind| should be a null-terminated string, all characters up
 287 // to the null will be searched. Returns int if not found.
 288 static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind)
 289 {
 290     for (int i = startPos; i < sLen; i++) {
 291         const char* cur = toFind;
 292         while (*cur) {
 293             if (s[i] == *(cur++))
 294                 return i;
 295         }
 296     }
 297     return -1;
 298 }
 299 
 300 static inline void checkEncodedString(const String& url)
 301 {
 302     ASSERT_UNUSED(url, url.containsOnlyASCII());
 303     ASSERT_UNUSED(url, url.isEmpty() || isSchemeFirstChar(url[0]));
 304 }
 305 
 306 inline bool URL::protocolIs(const String& string, const char* protocol)
 307 {
 308     return WebCore::protocolIs(string, protocol);
 309 }
 310 
 311 void URL::invalidate()
 312 {
 313     m_isValid = false;
 314     m_protocolIsInHTTPFamily = false;
 315 #if PLATFORM(JAVA)
 316     m_protocolIsInJar = false;
 317 #endif
 318     m_schemeEnd = 0;
 319     m_userStart = 0;
 320     m_userEnd = 0;
 321     m_passwordEnd = 0;
 322     m_hostEnd = 0;
 323     m_portEnd = 0;
 324     m_pathEnd = 0;
 325     m_pathAfterLastSlash = 0;
 326     m_queryEnd = 0;
 327     m_fragmentEnd = 0;
 328 }
 329 
 330 URL::URL(ParsedURLStringTag, const String& url)
 331 {
 332     parse(url);
 333     ASSERT(url == m_string);
 334 }
 335 
 336 URL::URL(const URL& base, const String& relative)
 337 {
 338     init(base, relative, UTF8Encoding());
 339 }
 340 
 341 URL::URL(const URL& base, const String& relative, const TextEncoding& encoding)
 342 {
 343     // For UTF-{7,16,32}, we want to use UTF-8 for the query part as
 344     // we do when submitting a form. A form with GET method
 345     // has its contents added to a URL as query params and it makes sense
 346     // to be consistent.
 347     init(base, relative, encoding.encodingForFormSubmission());
 348 }
 349 
 350 static bool shouldTrimFromURL(unsigned char c)
 351 {
 352     // Browsers ignore leading/trailing whitespace and control
 353     // characters from URLs.  Note that c is an *unsigned* char here
 354     // so this comparison should only catch control characters.
 355     return c <= ' ';
 356 }
 357 
 358 void URL::init(const URL& base, const String& relative, const TextEncoding& encoding)
 359 {
 360     // Allow resolutions with a null or empty base URL, but not with any other invalid one.
 361     // FIXME: Is this a good rule?
 362     if (!base.m_isValid && !base.isEmpty()) {
 363         m_string = relative;
 364         invalidate();
 365         return;
 366     }
 367 
 368     // For compatibility with Win IE, treat backslashes as if they were slashes,
 369     // as long as we're not dealing with javascript: or data: URLs.
 370     String rel = relative;
 371     if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data")))
 372         rel = substituteBackslashes(rel);
 373 
 374     bool allASCII = rel.containsOnlyASCII();
 375     CharBuffer strBuffer;
 376     char* str;
 377     size_t len;
 378     if (allASCII) {
 379         len = rel.length();
 380         strBuffer.resize(len + 1);
 381         copyASCII(rel, strBuffer.data());
 382         strBuffer[len] = 0;
 383         str = strBuffer.data();
 384     } else {
 385         encodeRelativeString(rel, encoding, strBuffer);
 386         str = strBuffer.data();
 387         len = strlen(str);
 388     }
 389 
 390     // Get rid of leading whitespace and control characters.
 391     while (len && shouldTrimFromURL(*str)) {
 392         str++;
 393         --len;
 394     }
 395 
 396     // Get rid of trailing whitespace and control characters.
 397     while (len && shouldTrimFromURL(str[len - 1]))
 398         str[--len] = '\0';
 399 
 400     // According to the RFC, the reference should be interpreted as an
 401     // absolute URI if possible, using the "leftmost, longest"
 402     // algorithm. If the URI reference is absolute it will have a
 403     // scheme, meaning that it will have a colon before the first
 404     // non-scheme element.
 405     bool absolute = false;
 406     char* p = str;
 407     if (isSchemeFirstChar(*p)) {
 408         ++p;
 409         while (isSchemeChar(*p)) {
 410             ++p;
 411         }
 412         if (*p == ':') {
 413             if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical())
 414                 str = p + 1;
 415             else
 416                 absolute = true;
 417         }
 418     }
 419 
 420     CharBuffer parseBuffer;
 421 
 422     if (absolute) {
 423         parse(str, &relative);
 424     } else {
 425         // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid
 426         // unless the relative URL is a single fragment.
 427         if (!base.isHierarchical()) {
 428             if (str[0] == '#') {
 429                 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
 430                 parse(parseBuffer.data(), &relative);
 431 #if PLATFORM(JAVA)
 432             } else if(base.isJarFile()) {
 433                 appendASCII(base.m_string.left(base.m_pathAfterLastSlash), str, len, parseBuffer);
 434                 parse(parseBuffer.data(), &relative);
 435 #endif
 436             } else {
 437                 m_string = relative;
 438                 invalidate();
 439             }
 440             return;
 441         }
 442 
 443         switch (str[0]) {
 444         case '\0':
 445             // The reference is empty, so this is a reference to the same document with any fragment identifier removed.
 446             *this = base;
 447             removeFragmentIdentifier();
 448             break;
 449         case '#': {
 450             // must be fragment-only reference
 451             appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer);
 452             parse(parseBuffer.data(), &relative);
 453             break;
 454         }
 455         case '?': {
 456             // query-only reference, special case needed for non-URL results
 457             appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer);
 458             parse(parseBuffer.data(), &relative);
 459             break;
 460         }
 461         case '/':
 462             // must be net-path or absolute-path reference
 463             if (str[1] == '/') {
 464                 // net-path
 465                 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer);
 466                 parse(parseBuffer.data(), &relative);
 467             } else {
 468                 // abs-path
 469                 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer);
 470                 parse(parseBuffer.data(), &relative);
 471             }
 472             break;
 473         default:
 474             {
 475                 // must be relative-path reference
 476 
 477                 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte.
 478                 const size_t bufferSize = base.m_pathEnd + 1 + len + 1;
 479                 parseBuffer.resize(bufferSize);
 480 
 481                 char* bufferPos = parseBuffer.data();
 482                 char* bufferStart = bufferPos;
 483 
 484                 // first copy everything before the path from the base
 485                 CharBuffer baseStringBuffer(base.m_string.length());
 486                 copyASCII(base.m_string, baseStringBuffer.data());
 487                 const char* baseString = baseStringBuffer.data();
 488                 const char* baseStringStart = baseString;
 489                 const char* pathStart = baseStringStart + base.m_portEnd;
 490                 while (baseStringStart < pathStart)
 491                     *bufferPos++ = *baseStringStart++;
 492                 char* bufferPathStart = bufferPos;
 493 
 494                 // now copy the base path
 495                 const char* baseStringEnd = baseString + base.m_pathEnd;
 496 
 497                 // go back to the last slash
 498                 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/')
 499                     baseStringEnd--;
 500 
 501                 if (baseStringEnd == baseStringStart) {
 502                     // no path in base, add a path separator if necessary
 503                     if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#')
 504                         *bufferPos++ = '/';
 505                 } else {
 506                     bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart);
 507                 }
 508 
 509                 const char* relStringStart = str;
 510                 const char* relStringPos = relStringStart;
 511 
 512                 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') {
 513                     if (relStringPos[0] == '.' && bufferPos[-1] == '/') {
 514                         if (isPathSegmentEndChar(relStringPos[1])) {
 515                             // skip over "." segment
 516                             relStringPos += 1;
 517                             if (relStringPos[0] == '/')
 518                                 relStringPos++;
 519                             continue;
 520                         } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) {
 521                             // skip over ".." segment and rewind the last segment
 522                             // the RFC leaves it up to the app to decide what to do with excess
 523                             // ".." segments - we choose to drop them since some web content
 524                             // relies on this.
 525                             relStringPos += 2;
 526                             if (relStringPos[0] == '/')
 527                                 relStringPos++;
 528                             if (bufferPos > bufferPathStart + 1)
 529                                 bufferPos--;
 530                             while (bufferPos > bufferPathStart + 1  && bufferPos[-1] != '/')
 531                                 bufferPos--;
 532                             continue;
 533                         }
 534                     }
 535 
 536                     *bufferPos = *relStringPos;
 537                     relStringPos++;
 538                     bufferPos++;
 539                 }
 540 
 541                 // all done with the path work, now copy any remainder
 542                 // of the relative reference; this will also add a null terminator
 543                 strncpy(bufferPos, relStringPos, bufferSize - (bufferPos - bufferStart));
 544 
 545                 parse(parseBuffer.data(), &relative);
 546 
 547                 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size());
 548                 break;
 549             }
 550         }
 551     }
 552 }
 553 
 554 URL URL::copy() const
 555 {
 556     URL result = *this;
 557     result.m_string = result.m_string.isolatedCopy();
 558     return result;
 559 }
 560 
 561 String URL::lastPathComponent() const
 562 {
 563     if (!hasPath())
 564         return String();
 565 
 566     unsigned end = m_pathEnd - 1;
 567     if (m_string[end] == '/')
 568         --end;
 569 
 570     size_t start = m_string.reverseFind('/', end);
 571     if (start < static_cast<unsigned>(m_portEnd))
 572         return String();
 573     ++start;
 574 
 575     return m_string.substring(start, end - start + 1);
 576 }
 577 
 578 String URL::protocol() const
 579 {
 580     return m_string.left(m_schemeEnd);
 581 }
 582 
 583 String URL::host() const
 584 {
 585     int start = hostStart();
 586     return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start));
 587 }
 588 
 589 unsigned short URL::port() const
 590 {
 591     // We return a port of 0 if there is no port specified. This can happen in two situations:
 592     // 1) The URL contains no colon after the host name and before the path component of the URL.
 593     // 2) The URL contains a colon but there's no port number before the path component of the URL begins.
 594     if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1)
 595         return 0;
 596 
 597     bool ok = false;
 598     unsigned number = charactersToUIntStrict(m_string.deprecatedCharacters() + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok);
 599     if (!ok || number > maximumValidPortNumber)
 600         return invalidPortNumber;
 601     return number;
 602 }
 603 
 604 String URL::pass() const
 605 {
 606     if (m_passwordEnd == m_userEnd)
 607         return String();
 608 
 609     return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1));
 610 }
 611 
 612 String URL::user() const
 613 {
 614     return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart));
 615 }
 616 
 617 String URL::fragmentIdentifier() const
 618 {
 619     if (m_fragmentEnd == m_queryEnd)
 620         return String();
 621 
 622     return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1));
 623 }
 624 
 625 bool URL::hasFragmentIdentifier() const
 626 {
 627     return m_fragmentEnd != m_queryEnd;
 628 }
 629 
 630 String URL::baseAsString() const
 631 {
 632     return m_string.left(m_pathAfterLastSlash);
 633 }
 634 
 635 #if !USE(CF)
 636 String URL::fileSystemPath() const
 637 {
 638     if (!isValid() || !isLocalFile())
 639         return String();
 640 
 641     return decodeURLEscapeSequences(path());
 642 }
 643 #endif
 644 
 645 #ifdef NDEBUG
 646 
 647 static inline void assertProtocolIsGood(const char*)
 648 {
 649 }
 650 
 651 #else
 652 
 653 static void assertProtocolIsGood(const char* protocol)
 654 {
 655     const char* p = protocol;
 656     while (*p) {
 657         ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z'));
 658         ++p;
 659     }
 660 }
 661 
 662 #endif
 663 
 664 bool URL::protocolIs(const char* protocol) const
 665 {
 666     assertProtocolIsGood(protocol);
 667 
 668     // JavaScript URLs are "valid" and should be executed even if URL decides they are invalid.
 669     // The free function protocolIsJavaScript() should be used instead.
 670     ASSERT(!equalIgnoringCase(protocol, String("javascript")));
 671 
 672     if (!m_isValid)
 673         return false;
 674 
 675     // Do the comparison without making a new string object.
 676     for (int i = 0; i < m_schemeEnd; ++i) {
 677         if (!protocol[i] || !isSchemeCharacterMatchIgnoringCase(m_string[i], protocol[i]))
 678             return false;
 679     }
 680     return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument.
 681 }
 682 
 683 String URL::query() const
 684 {
 685     if (m_queryEnd == m_pathEnd)
 686         return String();
 687 
 688     return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1));
 689 }
 690 
 691 String URL::path() const
 692 {
 693     return m_string.substring(m_portEnd, m_pathEnd - m_portEnd);
 694 }
 695 
 696 bool URL::setProtocol(const String& s)
 697 {
 698     // Firefox and IE remove everything after the first ':'.
 699     size_t separatorPosition = s.find(':');
 700     String newProtocol = s.substring(0, separatorPosition);
 701 
 702     if (!isValidProtocol(newProtocol))
 703         return false;
 704 
 705     if (!m_isValid) {
 706         parse(newProtocol + ':' + m_string);
 707         return true;
 708     }
 709 
 710     parse(newProtocol + m_string.substring(m_schemeEnd));
 711     return true;
 712 }
 713 
 714 void URL::setHost(const String& s)
 715 {
 716     if (!m_isValid)
 717         return;
 718 
 719     // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
 720     // and to avoid changing more than just the host.
 721 
 722     bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
 723 
 724     parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd));
 725 }
 726 
 727 void URL::removePort()
 728 {
 729     if (m_hostEnd == m_portEnd)
 730         return;
 731     parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd));
 732 }
 733 
 734 void URL::setPort(unsigned short i)
 735 {
 736     if (!m_isValid)
 737         return;
 738 
 739     bool colonNeeded = m_portEnd == m_hostEnd;
 740     int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1);
 741 
 742     parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd));
 743 }
 744 
 745 void URL::setHostAndPort(const String& hostAndPort)
 746 {
 747     if (!m_isValid)
 748         return;
 749 
 750     // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
 751     // and to avoid changing more than just host and port.
 752 
 753     bool slashSlashNeeded = m_userStart == m_schemeEnd + 1;
 754 
 755     parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd));
 756 }
 757 
 758 void URL::setUser(const String& user)
 759 {
 760     if (!m_isValid)
 761         return;
 762 
 763     // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
 764     // and to avoid changing more than just the user login.
 765 
 766     int end = m_userEnd;
 767     if (!user.isEmpty()) {
 768         String u = user;
 769         if (m_userStart == m_schemeEnd + 1)
 770             u = "//" + u;
 771         // Add '@' if we didn't have one before.
 772         if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@'))
 773             u.append('@');
 774         parse(m_string.left(m_userStart) + u + m_string.substring(end));
 775     } else {
 776         // Remove '@' if we now have neither user nor password.
 777         if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@')
 778             end += 1;
 779         // We don't want to parse in the extremely common case where we are not going to make a change.
 780         if (m_userStart != end)
 781             parse(m_string.left(m_userStart) + m_string.substring(end));
 782     }
 783 }
 784 
 785 void URL::setPass(const String& password)
 786 {
 787     if (!m_isValid)
 788         return;
 789 
 790     // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations,
 791     // and to avoid changing more than just the user password.
 792 
 793     int end = m_passwordEnd;
 794     if (!password.isEmpty()) {
 795         String p = ":" + password + "@";
 796         if (m_userEnd == m_schemeEnd + 1)
 797             p = "//" + p;
 798         // Eat the existing '@' since we are going to add our own.
 799         if (end != m_hostEnd && m_string[end] == '@')
 800             end += 1;
 801         parse(m_string.left(m_userEnd) + p + m_string.substring(end));
 802     } else {
 803         // Remove '@' if we now have neither user nor password.
 804         if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@')
 805             end += 1;
 806         // We don't want to parse in the extremely common case where we are not going to make a change.
 807         if (m_userEnd != end)
 808             parse(m_string.left(m_userEnd) + m_string.substring(end));
 809     }
 810 }
 811 
 812 void URL::setFragmentIdentifier(const String& s)
 813 {
 814     if (!m_isValid)
 815         return;
 816 
 817     // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations.
 818     parse(m_string.left(m_queryEnd) + "#" + s);
 819 }
 820 
 821 void URL::removeFragmentIdentifier()
 822 {
 823     if (!m_isValid)
 824         return;
 825     parse(m_string.left(m_queryEnd));
 826 }
 827 
 828 void URL::setQuery(const String& query)
 829 {
 830     if (!m_isValid)
 831         return;
 832 
 833     // FIXME: '#' and non-ASCII characters must be encoded and escaped.
 834     // Usually, the query is encoded using document encoding, not UTF-8, but we don't have
 835     // access to the document in this function.
 836     if ((query.isEmpty() || query[0] != '?') && !query.isNull())
 837         parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd));
 838     else
 839         parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd));
 840 
 841 }
 842 
 843 void URL::setPath(const String& s)
 844 {
 845     if (!m_isValid)
 846         return;
 847 
 848     // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts
 849     // may be inadvertently affected.
 850     String path = s;
 851     if (path.isEmpty() || path[0] != '/')
 852         path = "/" + path;
 853 
 854     parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd));
 855 }
 856 
 857 #if PLATFORM(JAVA)
 858 String URL::deprecatedString() const
 859 {
 860     if (!m_isValid)
 861         return m_string;
 862 
 863     StringBuilder result;
 864 
 865     result.append(protocol());
 866     result.append(':');
 867 
 868     StringBuilder authority;
 869 
 870     if (m_hostEnd != m_passwordEnd) {
 871         if (m_userEnd != m_userStart) {
 872             authority.append(user());
 873             authority.append('@');
 874         }
 875         authority.append(host());
 876         if (hasPort()) {
 877             authority.append(':');
 878             authority.append(String::number(port()));
 879         }
 880     }
 881 
 882     if (!authority.isEmpty()) {
 883         result.append('/');
 884         result.append('/');
 885         result.append(authority.deprecatedCharacters(), authority.length());
 886     } else if (protocolIs("file")) {
 887         result.append('/');
 888         result.append('/');
 889     }
 890 
 891     result.append(path());
 892 
 893     if (m_pathEnd != m_queryEnd) {
 894         result.append('?');
 895         result.append(query());
 896     }
 897 
 898     if (m_fragmentEnd != m_queryEnd) {
 899         result.append('#');
 900         result.append(fragmentIdentifier());
 901     }
 902 
 903     return result.toString();
 904 }
 905 #endif
 906 
 907 String decodeURLEscapeSequences(const String& string)
 908 {
 909     return decodeEscapeSequences<URLEscapeSequence>(string, UTF8Encoding());
 910 }
 911 
 912 String decodeURLEscapeSequences(const String& string, const TextEncoding& encoding)
 913 {
 914     return decodeEscapeSequences<URLEscapeSequence>(string, encoding);
 915 }
 916 
 917 // Caution: This function does not bounds check.
 918 static void appendEscapedChar(char*& buffer, unsigned char c)
 919 {
 920     *buffer++ = '%';
 921     placeByteAsHex(c, buffer);
 922 }
 923 
 924 static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length)
 925 {
 926     char* p = buffer;
 927 
 928     const char* str = strStart;
 929     const char* strEnd = strStart + length;
 930     while (str < strEnd) {
 931         unsigned char c = *str++;
 932         if (isBadChar(c)) {
 933             if (c == '%' || c == '?')
 934                 *p++ = c;
 935             else if (c != 0x09 && c != 0x0a && c != 0x0d)
 936                 appendEscapedChar(p, c);
 937         } else
 938             *p++ = c;
 939     }
 940 
 941     buffer = p;
 942 }
 943 
 944 static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length)
 945 {
 946     char* p = buffer;
 947 
 948     const char* str = strStart;
 949     const char* strEnd = strStart + length;
 950     while (str < strEnd) {
 951         unsigned char c = *str++;
 952         // Strip CR, LF and Tab from fragments, per:
 953         // https://bugs.webkit.org/show_bug.cgi?id=8770
 954         if (c == 0x09 || c == 0x0a || c == 0x0d)
 955             continue;
 956 
 957         // Chrome and IE allow non-ascii characters in fragments, however doing
 958         // so would hit an ASSERT in checkEncodedString, so for now we don't.
 959         if (c < 0x20 || c >= 127) {
 960             appendEscapedChar(p, c);
 961             continue;
 962         }
 963         *p++ = c;
 964     }
 965 
 966     buffer = p;
 967 }
 968 
 969 // copy a path, accounting for "." and ".." segments
 970 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
 971 {
 972     char* bufferPathStart = dst;
 973 
 974     // empty path is a special case, and need not have a leading slash
 975     if (srcStart != srcEnd) {
 976         const char* baseStringStart = src + srcStart;
 977         const char* baseStringEnd = src + srcEnd;
 978         const char* baseStringPos = baseStringStart;
 979 
 980         // this code is unprepared for paths that do not begin with a
 981         // slash and we should always have one in the source string
 982 #if !PLATFORM(JAVA)
 983         //in JAVA the complex protocols like "jar:file" are available.
 984         ASSERT(baseStringPos[0] == '/');
 985 #endif
 986 
 987         // copy the leading slash into the destination
 988         *dst = *baseStringPos;
 989         baseStringPos++;
 990         dst++;
 991 
 992         while (baseStringPos < baseStringEnd) {
 993             if (baseStringPos[0] == '.' && dst[-1] == '/') {
 994                 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
 995                     // skip over "." segment
 996                     baseStringPos += 2;
 997                     continue;
 998                 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
 999                                        baseStringPos + 2 == baseStringEnd)) {
1000                     // skip over ".." segment and rewind the last segment
1001                     // the RFC leaves it up to the app to decide what to do with excess
1002                     // ".." segments - we choose to drop them since some web content
1003                     // relies on this.
1004                     baseStringPos += 3;
1005                     if (dst > bufferPathStart + 1)
1006                         dst--;
1007                     while (dst > bufferPathStart && dst[-1] != '/')
1008                         dst--;
1009                     continue;
1010                 }
1011             }
1012 
1013             *dst = *baseStringPos;
1014             baseStringPos++;
1015             dst++;
1016         }
1017     }
1018     *dst = '\0';
1019     return dst - bufferPathStart;
1020 }
1021 
// Returns true when the NUL-terminated |str| contains a '.' that is
// immediately preceded by '/' or by another '.' (i.e. "/." or ".." occurs
// somewhere). Used as a cheap pre-check before dot-segment removal.
static inline bool hasSlashDotOrDotDot(const char* str)
{
    if (!*str)
        return false;

    // Start at the second character so that cursor[-1] is always valid.
    for (const char* cursor = str + 1; *cursor; ++cursor) {
        if (*cursor != '.')
            continue;
        const char previous = cursor[-1];
        if (previous == '/' || previous == '.')
            return true;
    }
    return false;
}
1035 
1036 void URL::parse(const String& string)
1037 {
1038     checkEncodedString(string);
1039 
1040     CharBuffer buffer(string.length() + 1);
1041     copyASCII(string, buffer.data());
1042     buffer[string.length()] = '\0';
1043     parse(buffer.data(), &string);
1044 }
1045 
#if PLATFORM(IOS)
// iOS-only file-level switch, true by default. It is read by equal() and by
// URL::parse(const char*, const String*) in this file.
static bool shouldCanonicalizeScheme = true;

// Sets the shouldCanonicalizeScheme flag to |enableSchemeCanonicalization|.
void enableURLSchemeCanonicalization(bool enableSchemeCanonicalization)
{
    shouldCanonicalizeScheme = enableSchemeCanonicalization;
}
#endif
1054 
1055 template<size_t length>
1056 static inline bool equal(const char* a, const char (&b)[length])
1057 {
1058 #if PLATFORM(IOS)
1059     if (!shouldCanonicalizeScheme) {
1060         for (size_t i = 0; i < length; ++i) {
1061             if (toASCIILower(a[i]) != b[i])
1062                 return false;
1063         }
1064         return true;
1065     }
1066 #endif
1067     for (size_t i = 0; i < length; ++i) {
1068         if (a[i] != b[i])
1069             return false;
1070     }
1071     return true;
1072 }
1073 
1074 template<size_t lengthB>
1075 static inline bool equal(const char* stringA, size_t lengthA, const char (&stringB)[lengthB])
1076 {
1077     return lengthA == lengthB && equal(stringA, stringB);
1078 }
1079 
1080 // List of default schemes is taken from google-url:
1081 // http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
1082 static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
1083 {
1084     // This switch is theoretically a performance optimization.  It came over when
1085     // the code was moved from google-url, but may be removed later.
1086     switch (schemeLength) {
1087     case 2:
1088         return equal(scheme, wsScheme) && equal(port, portLength, httpPort);
1089     case 3:
1090         if (equal(scheme, ftpScheme))
1091             return equal(port, portLength, ftpPort);
1092         if (equal(scheme, wssScheme))
1093             return equal(port, portLength, httpsPort);
1094         break;
1095     case 4:
1096         return equal(scheme, httpScheme) && equal(port, portLength, httpPort);
1097     case 5:
1098         return equal(scheme, httpsScheme) && equal(port, portLength, httpsPort);
1099     case 6:
1100         return equal(scheme, gopherScheme) && equal(port, portLength, gopherPort);
1101     }
1102     return false;
1103 }
1104 
// True when the host/port range is empty (hostStart == portEnd) while the
// character that ended the userinfo is '@', i.e. credentials with nothing after them.
static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userinfoEndChar)
{
    const bool hostAndPortAreEmpty = hostStart == portEnd;
    const bool credentialsArePresent = userinfoEndChar == '@';
    return credentialsArePresent && hostAndPortAreEmpty;
}
1109 
1110 static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
1111 {
1112     switch (schemeLength) {
1113     case 2:
1114         return equal(scheme, wsScheme);
1115     case 3:
1116         return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1117     case 4:
1118         return equal(scheme, httpScheme);
1119     case 5:
1120         return equal(scheme, httpsScheme);
1121     case 6:
1122         return equal(scheme, gopherScheme);
1123     }
1124     return false;
1125 }
1126 
1127 static bool isCanonicalHostnameLowercaseForScheme(const char* scheme, size_t schemeLength)
1128 {
1129     switch (schemeLength) {
1130     case 2:
1131         return equal(scheme, wsScheme);
1132     case 3:
1133         return equal(scheme, ftpScheme) || equal(scheme, wssScheme);
1134     case 4:
1135         return equal(scheme, httpScheme) || equal(scheme, fileScheme);
1136     case 5:
1137         return equal(scheme, httpsScheme);
1138     case 6:
1139         return equal(scheme, gopherScheme);
1140     }
1141     return false;
1142 }
1143 
1144 void URL::parse(const char* url, const String* originalString)
1145 {
1146     if (!url || url[0] == '\0') {
1147         // valid URL must be non-empty
1148         m_string = originalString ? *originalString : url;
1149         invalidate();
1150         return;
1151     }
1152 
1153     if (!isSchemeFirstChar(url[0])) {
1154         // scheme must start with an alphabetic character
1155         m_string = originalString ? *originalString : url;
1156         invalidate();
1157         return;
1158     }
1159 
1160     int schemeEnd = 0;
1161     while (isSchemeChar(url[schemeEnd]))
1162         schemeEnd++;
1163 
1164     if (url[schemeEnd] != ':') {
1165         m_string = originalString ? *originalString : url;
1166         invalidate();
1167         return;
1168     }
1169 
1170     int userStart = schemeEnd + 1;
1171     int userEnd;
1172     int passwordStart;
1173     int passwordEnd;
1174     int hostStart;
1175     int hostEnd;
1176     int portStart;
1177     int portEnd;
1178 
1179     bool hierarchical = url[schemeEnd + 1] == '/';
1180     bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';
1181 
1182     bool isFile = schemeEnd == 4
1183         && isLetterMatchIgnoringCase(url[0], 'f')
1184         && isLetterMatchIgnoringCase(url[1], 'i')
1185         && isLetterMatchIgnoringCase(url[2], 'l')
1186         && isLetterMatchIgnoringCase(url[3], 'e');
1187 
1188 #if PLATFORM(JAVA)
1189     m_protocolIsInJar = schemeEnd == 3
1190         && isLetterMatchIgnoringCase(url[0], 'j')
1191         && isLetterMatchIgnoringCase(url[1], 'a')
1192         && isLetterMatchIgnoringCase(url[2], 'r');
1193 #endif
1194 
1195     m_protocolIsInHTTPFamily = isLetterMatchIgnoringCase(url[0], 'h')
1196         && isLetterMatchIgnoringCase(url[1], 't')
1197         && isLetterMatchIgnoringCase(url[2], 't')
1198         && isLetterMatchIgnoringCase(url[3], 'p')
1199         && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1200 
1201     if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
1202         // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
1203         // Attempt to find an authority.
1204         // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
1205 
1206         if (hierarchical)
1207             userStart++;
1208         if (hasSecondSlash)
1209             userStart++;
1210         userEnd = userStart;
1211 
1212         int colonPos = 0;
1213         while (isUserInfoChar(url[userEnd])) {
1214             if (url[userEnd] == ':' && colonPos == 0)
1215                 colonPos = userEnd;
1216             userEnd++;
1217         }
1218 
1219         if (url[userEnd] == '@') {
1220             // actual end of the userinfo, start on the host
1221             if (colonPos != 0) {
1222                 passwordEnd = userEnd;
1223                 userEnd = colonPos;
1224                 passwordStart = colonPos + 1;
1225             } else
1226                 passwordStart = passwordEnd = userEnd;
1227 
1228             hostStart = passwordEnd + 1;
1229         } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
1230             // hit the end of the authority, must have been no user
1231             // or looks like an IPv6 hostname
1232             // either way, try to parse it as a hostname
1233             userEnd = userStart;
1234             passwordStart = passwordEnd = userEnd;
1235             hostStart = userStart;
1236         } else {
1237             // invalid character
1238             m_string = originalString ? *originalString : url;
1239             invalidate();
1240             return;
1241         }
1242 
1243         hostEnd = hostStart;
1244 
1245         // IPV6 IP address
1246         if (url[hostEnd] == '[') {
1247             hostEnd++;
1248             while (isIPv6Char(url[hostEnd]))
1249                 hostEnd++;
1250             if (url[hostEnd] == ']')
1251                 hostEnd++;
1252             else {
1253                 // invalid character
1254                 m_string = originalString ? *originalString : url;
1255                 invalidate();
1256                 return;
1257             }
1258         } else {
1259             while (isHostnameChar(url[hostEnd]))
1260                 hostEnd++;
1261         }
1262 
1263         if (url[hostEnd] == ':') {
1264             portStart = portEnd = hostEnd + 1;
1265 
1266             // possible start of port
1267             portEnd = portStart;
1268             while (isASCIIDigit(url[portEnd]))
1269                 portEnd++;
1270         } else
1271             portStart = portEnd = hostEnd;
1272 
1273         if (!isPathSegmentEndChar(url[portEnd])) {
1274             // invalid character
1275             m_string = originalString ? *originalString : url;
1276             invalidate();
1277             return;
1278         }
1279 
1280         if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[passwordEnd])) {
1281             m_string = originalString ? *originalString : url;
1282             invalidate();
1283             return;
1284         }
1285 
1286         if (userStart == portEnd && !m_protocolIsInHTTPFamily && !isFile) {
1287             // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
1288             // path segments are empty. For file, http and https only, an empty authority is allowed.
1289             userStart -= 2;
1290             userEnd = userStart;
1291             passwordStart = userEnd;
1292             passwordEnd = passwordStart;
1293             hostStart = passwordEnd;
1294             hostEnd = hostStart;
1295             portStart = hostEnd;
1296             portEnd = hostEnd;
1297         }
1298     } else {
1299         // the part after the scheme must be an opaque_part or an abs_path
1300         userEnd = userStart;
1301         passwordStart = passwordEnd = userEnd;
1302         hostStart = hostEnd = passwordEnd;
1303         portStart = portEnd = hostEnd;
1304     }
1305 
1306     int pathStart = portEnd;
1307     int pathEnd = pathStart;
1308     while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
1309         pathEnd++;
1310 
1311     int queryStart = pathEnd;
1312     int queryEnd = queryStart;
1313     if (url[queryStart] == '?') {
1314         while (url[queryEnd] && url[queryEnd] != '#')
1315             queryEnd++;
1316     }
1317 
1318     int fragmentStart = queryEnd;
1319     int fragmentEnd = fragmentStart;
1320     if (url[fragmentStart] == '#') {
1321         fragmentStart++;
1322         fragmentEnd = fragmentStart;
1323         while (url[fragmentEnd])
1324             fragmentEnd++;
1325     }
1326 
1327     // assemble it all, remembering the real ranges
1328 
1329     Vector<char, 4096> buffer(fragmentEnd * 3 + 1);
1330 
1331     char *p = buffer.data();
1332     const char *strPtr = url;
1333 
1334     // copy in the scheme
1335     const char *schemeEndPtr = url + schemeEnd;
1336 #if PLATFORM(IOS)
1337     if (shouldCanonicalizeScheme || m_protocolIsInHTTPFamily) {
1338         while (strPtr < schemeEndPtr)
1339             *p++ = toASCIILower(*strPtr++);
1340     } else {
1341         while (strPtr < schemeEndPtr)
1342             *p++ = *strPtr++;
1343     }
1344 #else
1345     while (strPtr < schemeEndPtr)
1346         *p++ = toASCIILower(*strPtr++);
1347 #endif
1348     m_schemeEnd = p - buffer.data();
1349 
1350     bool hostIsLocalHost = portEnd - userStart == 9
1351         && isLetterMatchIgnoringCase(url[userStart], 'l')
1352         && isLetterMatchIgnoringCase(url[userStart+1], 'o')
1353         && isLetterMatchIgnoringCase(url[userStart+2], 'c')
1354         && isLetterMatchIgnoringCase(url[userStart+3], 'a')
1355         && isLetterMatchIgnoringCase(url[userStart+4], 'l')
1356         && isLetterMatchIgnoringCase(url[userStart+5], 'h')
1357         && isLetterMatchIgnoringCase(url[userStart+6], 'o')
1358         && isLetterMatchIgnoringCase(url[userStart+7], 's')
1359         && isLetterMatchIgnoringCase(url[userStart+8], 't');
1360 
1361     // File URLs need a host part unless it is just file:// or file://localhost
1362     bool degenerateFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);
1363 
1364     // We drop empty credentials, but keep a colon in an empty host/port pair.
1365     // Removing hostname completely would change the structure of the URL on re-parsing.
1366     bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || hostEnd != portEnd;
1367 
1368     // add ":" after scheme
1369     *p++ = ':';
1370 
1371     // if we have at least one authority part or a file URL - add "//" and authority
1372     if (isFile ? !degenerateFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
1373         *p++ = '/';
1374         *p++ = '/';
1375 
1376         m_userStart = p - buffer.data();
1377 
1378         // copy in the user
1379         strPtr = url + userStart;
1380         const char* userEndPtr = url + userEnd;
1381         while (strPtr < userEndPtr) {
1382             char c = *strPtr++;
1383             ASSERT(isUserInfoChar(c));
1384             *p++ = c;
1385         }
1386         m_userEnd = p - buffer.data();
1387 
1388         // copy in the password
1389         if (passwordEnd != passwordStart) {
1390             *p++ = ':';
1391             strPtr = url + passwordStart;
1392             const char* passwordEndPtr = url + passwordEnd;
1393             while (strPtr < passwordEndPtr) {
1394                 char c = *strPtr++;
1395                 ASSERT(isUserInfoChar(c));
1396                 *p++ = c;
1397             }
1398         }
1399         m_passwordEnd = p - buffer.data();
1400 
1401         // If we had any user info, add "@"
1402         if (p - buffer.data() != m_userStart)
1403             *p++ = '@';
1404 
1405         // copy in the host, except in the case of a file URL with authority="localhost"
1406         if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
1407             strPtr = url + hostStart;
1408             const char* hostEndPtr = url + hostEnd;
1409             if (isCanonicalHostnameLowercaseForScheme(buffer.data(), m_schemeEnd)) {
1410                 while (strPtr < hostEndPtr) {
1411                     char c = toASCIILower(*strPtr++);
1412                     ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1413                     *p++ = c;
1414                 }
1415             } else {
1416                 while (strPtr < hostEndPtr) {
1417                     char c = *strPtr++;
1418                     ASSERT(isHostnameChar(c) || c == '[' || c == ']' || c == ':');
1419                     *p++ = c;
1420                 }
1421             }
1422         }
1423         m_hostEnd = p - buffer.data();
1424 
1425         // Copy in the port if the URL has one (and it's not default). Also, copy it if there was no hostname, so that there is still something in authority component.
1426         if (hostEnd != portStart) {
1427             const char* portStr = url + portStart;
1428             size_t portLength = portEnd - portStart;
1429             if ((portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd))
1430                 || (hostStart == hostEnd && hostEnd != portStart)) {
1431                 *p++ = ':';
1432                 const char* portEndPtr = url + portEnd;
1433                 while (portStr < portEndPtr)
1434                     *p++ = *portStr++;
1435             }
1436         }
1437         m_portEnd = p - buffer.data();
1438     } else {
1439         if (isFile) {
1440             ASSERT(degenerateFilePath);
1441             *p++ = '/';
1442             *p++ = '/';
1443         }
1444         m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();
1445     }
1446 
1447     // For canonicalization, ensure we have a '/' for no path.
1448     // Do this only for URL with protocol file, http or https.
1449     if ((m_protocolIsInHTTPFamily || isFile) && pathEnd == pathStart)
1450         *p++ = '/';
1451 
1452     // add path, escaping bad characters
1453     if (!hierarchical)
1454         escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
1455     else if (!hasSlashDotOrDotDot(url))
1456         appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
1457     else {
1458         CharBuffer pathBuffer(pathEnd - pathStart + 1);
1459         size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
1460         appendEscapingBadChars(p, pathBuffer.data(), length);
1461     }
1462 
1463     m_pathEnd = p - buffer.data();
1464 
1465     // Find the position after the last slash in the path, or
1466     // the position before the path if there are no slashes in it.
1467     int i;
1468     for (i = m_pathEnd; i > m_portEnd; --i) {
1469         if (buffer[i - 1] == '/')
1470             break;
1471     }
1472     m_pathAfterLastSlash = i;
1473 
1474     // add query, escaping bad characters
1475     appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
1476     m_queryEnd = p - buffer.data();
1477 
1478     // add fragment, escaping bad characters
1479     if (fragmentEnd != queryEnd) {
1480         *p++ = '#';
1481         escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
1482     }
1483     m_fragmentEnd = p - buffer.data();
1484 
1485     ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1486     ASSERT(buffer.size() > 0);
1487 
1488     // If we didn't end up actually changing the original string and
1489     // it was already in a String, reuse it to avoid extra allocation.
1490     if (originalString && equal(originalString->impl(), buffer.data(), m_fragmentEnd))
1491         m_string = *originalString;
1492     else
1493         m_string = String(buffer.data(), m_fragmentEnd);
1494 
1495     m_isValid = true;
1496 }
1497 
1498 bool equalIgnoringFragmentIdentifier(const URL& a, const URL& b)
1499 {
1500     if (a.m_queryEnd != b.m_queryEnd)
1501         return false;
1502     unsigned queryLength = a.m_queryEnd;
1503     for (unsigned i = 0; i < queryLength; ++i)
1504         if (a.string()[i] != b.string()[i])
1505             return false;
1506     return true;
1507 }
1508 
1509 bool protocolHostAndPortAreEqual(const URL& a, const URL& b)
1510 {
1511     if (a.m_schemeEnd != b.m_schemeEnd)
1512         return false;
1513 
1514     int hostStartA = a.hostStart();
1515     int hostLengthA = a.hostEnd() - hostStartA;
1516     int hostStartB = b.hostStart();
1517     int hostLengthB = b.hostEnd() - b.hostStart();
1518     if (hostLengthA != hostLengthB)
1519         return false;
1520 
1521     // Check the scheme
1522     for (int i = 0; i < a.m_schemeEnd; ++i)
1523         if (a.string()[i] != b.string()[i])
1524             return false;
1525 
1526     // And the host
1527     for (int i = 0; i < hostLengthA; ++i)
1528         if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
1529             return false;
1530 
1531     if (a.port() != b.port())
1532         return false;
1533 
1534     return true;
1535 }
1536 
1537 String encodeWithURLEscapeSequences(const String& notEncodedString)
1538 {
1539     CString asUTF8 = notEncodedString.utf8();
1540 
1541     CharBuffer buffer(asUTF8.length() * 3 + 1);
1542     char* p = buffer.data();
1543 
1544     const char* str = asUTF8.data();
1545     const char* strEnd = str + asUTF8.length();
1546     while (str < strEnd) {
1547         unsigned char c = *str++;
1548         if (isBadChar(c))
1549             appendEscapedChar(p, c);
1550         else
1551             *p++ = c;
1552     }
1553 
1554     ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));
1555 
1556     return String(buffer.data(), p - buffer.data());
1557 }
1558 
1559 // Appends the punycoded hostname identified by the given string and length to
1560 // the output buffer. The result will not be null terminated.
1561 static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen)
1562 {
1563     // Needs to be big enough to hold an IDN-encoded name.
1564     // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK.
1565     const unsigned hostnameBufferLength = 2048;
1566 
1567     if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) {
1568         buffer.append(str, strLen);
1569         return;
1570     }
1571 
1572     UChar hostnameBuffer[hostnameBufferLength];
1573     UErrorCode error = U_ZERO_ERROR;
1574     int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer,
1575         hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error);
1576     if (error == U_ZERO_ERROR)
1577         buffer.append(hostnameBuffer, numCharactersConverted);
1578 }
1579 
1580 static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<std::pair<int, int>>& nameRanges)
1581 {
1582     // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character.
1583     // Skip quoted strings so that characters in them don't confuse us.
1584     // When we find a '?' character, we are past the part of the URL that contains host names.
1585 
1586     nameRanges.clear();
1587 
1588     int p = 0;
1589     while (1) {
1590         // Find start of host name or of quoted string.
1591         int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?");
1592         if (hostnameOrStringStart == -1)
1593             return;
1594         UChar c = str[hostnameOrStringStart];
1595         p = hostnameOrStringStart + 1;
1596 
1597         if (c == '?')
1598             return;
1599 
1600         if (c == '@') {
1601             // Find end of host name.
1602             int hostnameStart = p;
1603             int hostnameEnd = findFirstOf(str, strLen, p, ">,?");
1604             bool done;
1605             if (hostnameEnd == -1) {
1606                 hostnameEnd = strLen;
1607                 done = true;
1608             } else {
1609                 p = hostnameEnd;
1610                 done = false;
1611             }
1612 
1613             nameRanges.append(std::make_pair(hostnameStart, hostnameEnd));
1614 
1615             if (done)
1616                 return;
1617         } else {
1618             // Skip quoted string.
1619             ASSERT(c == '"');
1620             while (1) {
1621                 int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\");
1622                 if (escapedCharacterOrStringEnd == -1)
1623                     return;
1624 
1625                 c = str[escapedCharacterOrStringEnd];
1626                 p = escapedCharacterOrStringEnd + 1;
1627 
1628                 // If we are the end of the string, then break from the string loop back to the host name loop.
1629                 if (c == '"')
1630                     break;
1631 
1632                 // Skip escaped character.
1633                 ASSERT(c == '\\');
1634                 if (p == strLen)
1635                     return;
1636 
1637                 ++p;
1638             }
1639         }
1640     }
1641 }
1642 
1643 static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset)
1644 {
1645     // Find the host name in a hierarchical URL.
1646     // It comes after a "://" sequence, with scheme characters preceding, and
1647     // this should be the first colon in the string.
1648     // It ends with the end of the string or a ":" or a path segment ending character.
1649     // If there is a "@" character, the host part is just the part after the "@".
1650     int separator = findFirstOf(str, strLen, 0, ":");
1651     if (separator == -1 || separator + 2 >= strLen ||
1652         str[separator + 1] != '/' || str[separator + 2] != '/')
1653         return false;
1654 
1655     // Check that all characters before the :// are valid scheme characters.
1656     if (!isSchemeFirstChar(str[0]))
1657         return false;
1658     for (int i = 1; i < separator; ++i) {
1659         if (!isSchemeChar(str[i]))
1660             return false;
1661     }
1662 
1663     // Start after the separator.
1664     int authorityStart = separator + 3;
1665 
1666     // Find terminating character.
1667     int hostnameEnd = strLen;
1668     for (int i = authorityStart; i < strLen; ++i) {
1669         UChar c = str[i];
1670         if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) {
1671             hostnameEnd = i;
1672             break;
1673         }
1674     }
1675 
1676     // Find "@" for the start of the host name.
1677     int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@");
1678     int hostnameStart;
1679     if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd)
1680         hostnameStart = authorityStart;
1681     else
1682         hostnameStart = userInfoTerminator + 1;
1683 
1684     startOffset = hostnameStart;
1685     endOffset = hostnameEnd;
1686     return true;
1687 }
1688 
1689 // Converts all hostnames found in the given input to punycode, preserving the
1690 // rest of the URL unchanged. The output will NOT be null-terminated.
1691 static void encodeHostnames(const String& str, UCharBuffer& output)
1692 {
1693     output.clear();
1694 
1695     if (protocolIs(str, "mailto")) {
1696         Vector<std::pair<int, int>> hostnameRanges;
1697         findHostnamesInMailToURL(str.deprecatedCharacters(), str.length(), hostnameRanges);
1698         int n = hostnameRanges.size();
1699         int p = 0;
1700         for (int i = 0; i < n; ++i) {
1701             const std::pair<int, int>& r = hostnameRanges[i];
1702             output.append(&str.deprecatedCharacters()[p], r.first - p);
1703             appendEncodedHostname(output, &str.deprecatedCharacters()[r.first], r.second - r.first);
1704             p = r.second;
1705         }
1706         // This will copy either everything after the last hostname, or the
1707         // whole thing if there is no hostname.
1708         output.append(&str.deprecatedCharacters()[p], str.length() - p);
1709     } else {
1710         int hostStart, hostEnd;
1711         if (findHostnameInHierarchicalURL(str.deprecatedCharacters(), str.length(), hostStart, hostEnd)) {
1712             output.append(str.deprecatedCharacters(), hostStart); // Before hostname.
1713             appendEncodedHostname(output, &str.deprecatedCharacters()[hostStart], hostEnd - hostStart);
1714             output.append(&str.deprecatedCharacters()[hostEnd], str.length() - hostEnd); // After hostname.
1715         } else {
1716             // No hostname to encode, return the input.
1717             output.append(str.deprecatedCharacters(), str.length());
1718         }
1719     }
1720 }
1721 
1722 static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output)
1723 {
1724     UCharBuffer s;
1725     encodeHostnames(rel, s);
1726 
1727     TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme.
1728 
1729     int pathEnd = -1;
1730     if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) {
1731         // Find the first instance of either # or ?, keep pathEnd at -1 otherwise.
1732         pathEnd = findFirstOf(s.data(), s.size(), 0, "#?");
1733     }
1734 
1735     if (pathEnd == -1) {
1736         CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables);
1737         output.resize(decoded.length());
1738         memcpy(output.data(), decoded.data(), decoded.length());
1739     } else {
1740         CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables);
1741         // Unencodable characters in URLs are represented by converting
1742         // them to XML entities and escaping non-alphanumeric characters.
1743         CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables);
1744 
1745         output.resize(pathDecoded.length() + otherDecoded.length());
1746         memcpy(output.data(), pathDecoded.data(), pathDecoded.length());
1747         memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length());
1748     }
1749     output.append('\0'); // null-terminate the output.
1750 }
1751 
1752 static String substituteBackslashes(const String& string)
1753 {
1754     size_t questionPos = string.find('?');
1755     size_t hashPos = string.find('#');
1756     unsigned pathEnd;
1757 
1758     if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos))
1759         pathEnd = hashPos;
1760     else if (questionPos != notFound)
1761         pathEnd = questionPos;
1762     else
1763         pathEnd = string.length();
1764 
1765     return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd);
1766 }
1767 
1768 bool URL::isHierarchical() const
1769 {
1770     if (!m_isValid)
1771         return false;
1772     ASSERT(m_string[m_schemeEnd] == ':');
1773     return m_string[m_schemeEnd + 1] == '/';
1774 }
1775 
1776 void URL::copyToBuffer(Vector<char, 512>& buffer) const
1777 {
1778     // FIXME: This throws away the high bytes of all the characters in the string!
1779     // That's fine for a valid URL, which is all ASCII, but not for invalid URLs.
1780     buffer.resize(m_string.length());
1781     copyASCII(m_string, buffer.data());
1782 }
1783 
1784 bool protocolIs(const String& url, const char* protocol)
1785 {
1786     // Do the comparison without making a new string object.
1787     assertProtocolIsGood(protocol);
1788     for (int i = 0; ; ++i) {
1789         if (!protocol[i])
1790             return url[i] == ':';
1791         if (!isLetterMatchIgnoringCase(url[i], protocol[i]))
1792             return false;
1793     }
1794 }
1795 
1796 bool isValidProtocol(const String& protocol)
1797 {
1798     // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
1799     if (protocol.isEmpty())
1800         return false;
1801     if (!isSchemeFirstChar(protocol[0]))
1802         return false;
1803     unsigned protocolLength = protocol.length();
1804     for (unsigned i = 1; i < protocolLength; i++) {
1805         if (!isSchemeChar(protocol[i]))
1806             return false;
1807     }
1808     return true;
1809 }
1810 
1811 #ifndef NDEBUG
1812 void URL::print() const
1813 {
1814     printf("%s\n", m_string.utf8().data());
1815 }
1816 #endif
1817 
1818 String URL::strippedForUseAsReferrer() const
1819 {
1820     URL referrer(*this);
1821     referrer.setUser(String());
1822     referrer.setPass(String());
1823     referrer.removeFragmentIdentifier();
1824     return referrer.string();
1825 }
1826 
1827 bool URL::isLocalFile() const
1828 {
1829     // Including feed here might be a bad idea since drag and drop uses this check
1830     // and including feed would allow feeds to potentially let someone's blog
1831     // read the contents of the clipboard on a drag, even without a drop.
1832     // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function.
1833     return protocolIs("file");
1834 }
1835 
1836 bool protocolIsJavaScript(const String& url)
1837 {
1838     return protocolIs(url, "javascript");
1839 }
1840 
1841 bool protocolIsInHTTPFamily(const String& url)
1842 {
1843     // Do the comparison without making a new string object.
1844     return isLetterMatchIgnoringCase(url[0], 'h')
1845         && isLetterMatchIgnoringCase(url[1], 't')
1846         && isLetterMatchIgnoringCase(url[2], 't')
1847         && isLetterMatchIgnoringCase(url[3], 'p')
1848         && (url[4] == ':' || (isLetterMatchIgnoringCase(url[4], 's') && url[5] == ':'));
1849 }
1850 
1851 const URL& blankURL()
1852 {
1853     DEFINE_STATIC_LOCAL(URL, staticBlankURL, (ParsedURLString, "about:blank"));
1854     return staticBlankURL;
1855 }
1856 
1857 bool URL::isBlankURL() const
1858 {
1859     return protocolIs("about");
1860 }
1861 
1862 bool isDefaultPortForProtocol(unsigned short port, const String& protocol)
1863 {
1864     if (protocol.isEmpty())
1865         return false;
1866 
1867     typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap;
1868     DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ());
1869     if (defaultPorts.isEmpty()) {
1870         defaultPorts.set("http", 80);
1871         defaultPorts.set("https", 443);
1872         defaultPorts.set("ftp", 21);
1873         defaultPorts.set("ftps", 990);
1874     }
1875     return defaultPorts.get(protocol) == port;
1876 }
1877 
1878 bool portAllowed(const URL& url)
1879 {
1880     unsigned short port = url.port();
1881 
1882     // Since most URLs don't have a port, return early for the "no port" case.
1883     if (!port)
1884         return true;
1885 
1886     // This blocked port list matches the port blocking that Mozilla implements.
1887     // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information.
1888     static const unsigned short blockedPortList[] = {
1889         1,    // tcpmux
1890         7,    // echo
1891         9,    // discard
1892         11,   // systat
1893         13,   // daytime
1894         15,   // netstat
1895         17,   // qotd
1896         19,   // chargen
1897         20,   // FTP-data
1898         21,   // FTP-control
1899         22,   // SSH
1900         23,   // telnet
1901         25,   // SMTP
1902         37,   // time
1903         42,   // name
1904         43,   // nicname
1905         53,   // domain
1906         77,   // priv-rjs
1907         79,   // finger
1908         87,   // ttylink
1909         95,   // supdup
1910         101,  // hostriame
1911         102,  // iso-tsap
1912         103,  // gppitnp
1913         104,  // acr-nema
1914         109,  // POP2
1915         110,  // POP3
1916         111,  // sunrpc
1917         113,  // auth
1918         115,  // SFTP
1919         117,  // uucp-path
1920         119,  // nntp
1921         123,  // NTP
1922         135,  // loc-srv / epmap
1923         139,  // netbios
1924         143,  // IMAP2
1925         179,  // BGP
1926         389,  // LDAP
1927         465,  // SMTP+SSL
1928         512,  // print / exec
1929         513,  // login
1930         514,  // shell
1931         515,  // printer
1932         526,  // tempo
1933         530,  // courier
1934         531,  // Chat
1935         532,  // netnews
1936         540,  // UUCP
1937         556,  // remotefs
1938         563,  // NNTP+SSL
1939         587,  // ESMTP
1940         601,  // syslog-conn
1941         636,  // LDAP+SSL
1942         993,  // IMAP+SSL
1943         995,  // POP3+SSL
1944         2049, // NFS
1945         3659, // apple-sasl / PasswordServer [Apple addition]
1946         4045, // lockd
1947         6000, // X11
1948         6665, // Alternate IRC [Apple addition]
1949         6666, // Alternate IRC [Apple addition]
1950         6667, // Standard IRC [Apple addition]
1951         6668, // Alternate IRC [Apple addition]
1952         6669, // Alternate IRC [Apple addition]
1953         invalidPortNumber, // Used to block all invalid port numbers
1954     };
1955     const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList);
1956 
1957 #ifndef NDEBUG
1958     // The port list must be sorted for binary_search to work.
1959     static bool checkedPortList = false;
1960     if (!checkedPortList) {
1961         for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p)
1962             ASSERT(*p < *(p + 1));
1963         checkedPortList = true;
1964     }
1965 #endif
1966 
1967     // If the port is not in the blocked port list, allow it.
1968     if (!std::binary_search(blockedPortList, blockedPortListEnd, port))
1969         return true;
1970 
1971     // Allow ports 21 and 22 for FTP URLs, as Mozilla does.
1972     if ((port == 21 || port == 22) && url.protocolIs("ftp"))
1973         return true;
1974 
1975     // Allow any port number in a file URL, since the port number is ignored.
1976     if (url.protocolIs("file"))
1977         return true;
1978 
1979     return false;
1980 }
1981 
1982 String mimeTypeFromDataURL(const String& url)
1983 {
1984     ASSERT(protocolIs(url, "data"));
1985     size_t index = url.find(';');
1986     if (index == notFound)
1987         index = url.find(',');
1988     if (index != notFound) {
1989         if (index > 5)
1990             return url.substring(5, index - 5).lower();
1991         return "text/plain"; // Data URLs with no MIME type are considered text/plain.
1992     }
1993     return "";
1994 }
1995 
1996 String mimeTypeFromURL(const URL& url)
1997 {
1998     String decodedPath = decodeURLEscapeSequences(url.path());
1999     String extension = decodedPath.substring(decodedPath.reverseFind('.') + 1);
2000 
2001     // We don't use MIMETypeRegistry::getMIMETypeForPath() because it returns "application/octet-stream" upon failure
2002     return MIMETypeRegistry::getMIMETypeForExtension(extension);
2003 }
2004 
2005 bool URL::isSafeToSendToAnotherThread() const
2006 {
2007     return m_string.isSafeToSendToAnotherThread();
2008 }
2009 
2010 String URL::stringCenterEllipsizedToLength(unsigned length) const
2011 {
2012     if (string().length() <= length)
2013         return string();
2014 
2015     return string().left(length / 2 - 1) + "..." + string().right(length / 2 - 2);
2016 }
2017 
2018 }