1 /*
   2  * Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.net;
  27 
  28 import java.io.File;
  29 import java.io.IOException;
  30 import java.io.InvalidObjectException;
  31 import java.io.ObjectInputStream;
  32 import java.io.ObjectOutputStream;
  33 import java.io.Serializable;
  34 import java.nio.ByteBuffer;
  35 import java.nio.CharBuffer;
  36 import java.nio.charset.CharsetDecoder;
  37 import java.nio.charset.CoderResult;
  38 import java.nio.charset.CodingErrorAction;
  39 import java.nio.charset.CharacterCodingException;
  40 import java.nio.file.Path;
  41 import java.text.Normalizer;
  42 import jdk.internal.access.JavaNetUriAccess;
  43 import jdk.internal.access.SharedSecrets;
  44 import sun.nio.cs.ThreadLocalCoders;
  45 
  46 import java.lang.Character;             // for javadoc
  47 import java.lang.NullPointerException;  // for javadoc
  48 
  49 
  50 /**
  51  * Represents a Uniform Resource Identifier (URI) reference.
  52  *
  53  * <p> Aside from some minor deviations noted below, an instance of this
  54  * class represents a URI reference as defined by
  55  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
  56  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
  57  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
  58  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
  59  * also supports scope_ids. The syntax and usage of scope_ids is described
  60  * <a href="Inet6Address.html#scoped">here</a>.
  61  * This class provides constructors for creating URI instances from
  62  * their components or by parsing their string forms, methods for accessing the
  63  * various components of an instance, and methods for normalizing, resolving,
  64  * and relativizing URI instances.  Instances of this class are immutable.
  65  *
  66  *
  67  * <h2> URI syntax and components </h2>
  68  *
  69  * At the highest level a URI reference (hereinafter simply "URI") in string
  70  * form has the syntax
  71  *
  72  * <blockquote>
  73  * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>]
  74  * </blockquote>
  75  *
  76  * where square brackets [...] delineate optional components and the characters
  77  * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves.
  78  *
  79  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
  80  * said to be <i>relative</i>.  URIs are also classified according to whether
  81  * they are <i>opaque</i> or <i>hierarchical</i>.
  82  *
  83  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
  84  * not begin with a slash character ({@code '/'}).  Opaque URIs are not
  85  * subject to further parsing.  Some examples of opaque URIs are:
  86  *
  87  * <blockquote><ul style="list-style-type:none">
  88  * <li>{@code mailto:java-net@www.example.com}</li>
  89  * <li>{@code news:comp.lang.java}</li>
  90  * <li>{@code urn:isbn:096139210x}</li>
  91  * </ul></blockquote>
  92  *
  93  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
  94  * scheme-specific part begins with a slash character, or a relative URI, that
  95  * is, a URI that does not specify a scheme.  Some examples of hierarchical
  96  * URIs are:
  97  *
  98  * <blockquote>
  99  * {@code http://example.com/languages/java/}<br>
 100  * {@code sample/a/index.html#28}<br>
 101  * {@code ../../demo/b/index.html}<br>
 102  * {@code file:///~/calendar}
 103  * </blockquote>
 104  *
 105  * <p> A hierarchical URI is subject to further parsing according to the syntax
 106  *
 107  * <blockquote>
 108  * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
 109  * </blockquote>
 110  *
 111  * where the characters <b>{@code :}</b>, <b>{@code /}</b>,
 112  * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves.  The
 113  * scheme-specific part of a hierarchical URI consists of the characters
 114  * between the scheme and fragment components.
 115  *
 116  * <p> The authority component of a hierarchical URI is, if specified, either
 117  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
 118  * parses according to the familiar syntax
 119  *
 120  * <blockquote>
 121  * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]
 122  * </blockquote>
 123  *
 124  * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for
 125  * themselves.  Nearly all URI schemes currently in use are server-based.  An
 126  * authority component that does not parse in this way is considered to be
 127  * registry-based.
 128  *
 129  * <p> The path component of a hierarchical URI is itself said to be absolute
 130  * if it begins with a slash character ({@code '/'}); otherwise it is
 131  * relative.  The path of a hierarchical URI that is either absolute or
 132  * specifies an authority is always absolute.
 133  *
 134  * <p> All told, then, a URI instance has the following nine components:
 135  *
 136  * <table class="striped" style="margin-left:2em">
 137  * <caption style="display:none">Describes the components of a URI: scheme, scheme-specific-part, authority, user-info, host, port, path, query, fragment</caption>
 138  * <thead>
 139  * <tr><th scope="col">Component</th><th scope="col">Type</th></tr>
 140  * </thead>
 141  * <tbody style="text-align:left">
 142  * <tr><th scope="row">scheme</th><td>{@code String}</td></tr>
 143  * <tr><th scope="row">scheme-specific-part</th><td>{@code String}</td></tr>
 144  * <tr><th scope="row">authority</th><td>{@code String}</td></tr>
 145  * <tr><th scope="row">user-info</th><td>{@code String}</td></tr>
 146  * <tr><th scope="row">host</th><td>{@code String}</td></tr>
 147  * <tr><th scope="row">port</th><td>{@code int}</td></tr>
 148  * <tr><th scope="row">path</th><td>{@code String}</td></tr>
 149  * <tr><th scope="row">query</th><td>{@code String}</td></tr>
 150  * <tr><th scope="row">fragment</th><td>{@code String}</td></tr>
 151  * </tbody>
 152  * </table>
 153  *
 154  * In a given instance any particular component is either <i>undefined</i> or
 155  * <i>defined</i> with a distinct value.  Undefined string components are
 156  * represented by {@code null}, while undefined integer components are
 157  * represented by {@code -1}.  A string component may be defined to have the
 158  * empty string as its value; this is not equivalent to that component being
 159  * undefined.
 160  *
 161  * <p> Whether a particular component is or is not defined in an instance
 162  * depends upon the type of the URI being represented.  An absolute URI has a
 163  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
 164  * possibly a fragment, but has no other components.  A hierarchical URI always
 165  * has a path (though it may be empty) and a scheme-specific-part (which at
 166  * least contains the path), and may have any of the other components.  If the
 167  * authority component is present and is server-based then the host component
 168  * will be defined and the user-information and port components may be defined.
 169  *
 170  *
 171  * <h3> Operations on URI instances </h3>
 172  *
 173  * The key operations supported by this class are those of
 174  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
 175  *
 176  * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."}
 177  * and {@code ".."} segments from the path component of a hierarchical URI.
 178  * Each {@code "."} segment is simply removed.  A {@code ".."} segment is
 179  * removed only if it is preceded by a non-{@code ".."} segment.
 180  * Normalization has no effect upon opaque URIs.
 181  *
 182  * <p> <i>Resolution</i> is the process of resolving one URI against another,
 183  * <i>base</i> URI.  The resulting URI is constructed from components of both
 184  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
 185  * base URI for those not specified in the original.  For hierarchical URIs,
 186  * the path of the original is resolved against the path of the base and then
 187  * normalized.  The result, for example, of resolving
 188  *
 189  * <blockquote>
 190  * {@code sample/a/index.html#28}
 191  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 192  * &nbsp;&nbsp;&nbsp;&nbsp;(1)
 193  * </blockquote>
 194  *
 195  * against the base URI {@code http://example.com/languages/java/} is the result
 196  * URI
 197  *
 198  * <blockquote>
 199  * {@code http://example.com/languages/java/sample/a/index.html#28}
 200  * </blockquote>
 201  *
 202  * Resolving the relative URI
 203  *
 204  * <blockquote>
 205  * {@code ../../demo/b/index.html}&nbsp;&nbsp;&nbsp;&nbsp;(2)
 206  * </blockquote>
 207  *
 208  * against this result yields, in turn,
 209  *
 210  * <blockquote>
 211  * {@code http://example.com/languages/java/demo/b/index.html}
 212  * </blockquote>
 213  *
 214  * Resolution of both absolute and relative URIs, and of both absolute and
 215  * relative paths in the case of hierarchical URIs, is supported.  Resolving
 216  * the URI {@code file:///~/calendar} against any other URI simply yields the
 217  * original URI, since it is absolute.  Resolving the relative URI (2) above
 218  * against the relative base URI (1) yields the normalized, but still relative,
 219  * URI
 220  *
 221  * <blockquote>
 222  * {@code demo/b/index.html}
 223  * </blockquote>
 224  *
 225  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
 226  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
 227  *
 228  * <blockquote>
 229  *   <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;and<br>
 230  *   <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;.<br>
 231  * </blockquote>
 232  *
 233  * This operation is often useful when constructing a document containing URIs
 234  * that must be made relative to the base URI of the document wherever
 235  * possible.  For example, relativizing the URI
 236  *
 237  * <blockquote>
 238  * {@code http://example.com/languages/java/sample/a/index.html#28}
 239  * </blockquote>
 240  *
 241  * against the base URI
 242  *
 243  * <blockquote>
 244  * {@code http://example.com/languages/java/}
 245  * </blockquote>
 246  *
 247  * yields the relative URI {@code sample/a/index.html#28}.
 248  *
 249  *
 250  * <h3> Character categories </h3>
 251  *
 252  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
 253  * various components of a URI reference.  The following categories, most of
 254  * which are taken from that specification, are used below to describe these
 255  * constraints:
 256  *
 257  * <table class="striped" style="margin-left:2em">
 258  * <caption style="display:none">Describes categories alpha, digit, alphanum, unreserved, punct, reserved, escaped, and other</caption>
 259  *   <thead>
 260  *   <tr><th scope="col">Category</th><th scope="col">Description</th></tr>
 261  *   </thead>
 262  *   <tbody style="text-align:left">
 263  *   <tr><th scope="row" style="vertical-align:top">alpha</th>
 264  *       <td>The US-ASCII alphabetic characters,
 265  *        {@code 'A'}&nbsp;through&nbsp;{@code 'Z'}
 266  *        and {@code 'a'}&nbsp;through&nbsp;{@code 'z'}</td></tr>
 267  *   <tr><th scope="row" style="vertical-align:top">digit</th>
 268  *       <td>The US-ASCII decimal digit characters,
 269  *       {@code '0'}&nbsp;through&nbsp;{@code '9'}</td></tr>
 270  *   <tr><th scope="row" style="vertical-align:top">alphanum</th>
 271  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
 272  *   <tr><th scope="row" style="vertical-align:top">unreserved</th>
 273  *       <td>All <i>alphanum</i> characters together with those in the string
 274  *        {@code "_-!.~'()*"}</td></tr>
 275  *   <tr><th scope="row" style="vertical-align:top">punct</th>
 276  *       <td>The characters in the string {@code ",;:$&+="}</td></tr>
 277  *   <tr><th scope="row" style="vertical-align:top">reserved</th>
 278  *       <td>All <i>punct</i> characters together with those in the string
 279  *        {@code "?/[]@"}</td></tr>
 280  *   <tr><th scope="row" style="vertical-align:top">escaped</th>
 281  *       <td>Escaped octets, that is, triplets consisting of the percent
 282  *           character ({@code '%'}) followed by two hexadecimal digits
 283  *           ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and
 284  *           {@code 'a'}-{@code 'f'})</td></tr>
 285  *   <tr><th scope="row" style="vertical-align:top">other</th>
 286  *       <td>The Unicode characters that are not in the US-ASCII character set,
 287  *           are not control characters (according to the {@link
 288  *           java.lang.Character#isISOControl(char) Character.isISOControl}
 289  *           method), and are not space characters (according to the {@link
 290  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
 291  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
 292  *           limited to US-ASCII)</i></td></tr>
 293  * </tbody>
 294  * </table>
 295  *
 296  * <p><a id="legal-chars"></a> The set of all legal URI characters consists of
 297  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
 298  * characters.
 299  *
 300  *
 301  * <h3> Escaped octets, quotation, encoding, and decoding </h3>
 302  *
 303  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
 304  * fragment components.  Escaping serves two purposes in URIs:
 305  *
 306  * <ul>
 307  *
 308  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
 309  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
 310  *   characters.  </p></li>
 311  *
 312  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
 313  *   component.  The user-info, path, query, and fragment components differ
 314  *   slightly in terms of which characters are considered legal and illegal.
 315  *   </p></li>
 316  *
 317  * </ul>
 318  *
 319  * These purposes are served in this class by three related operations:
 320  *
 321  * <ul>
 322  *
 323  *   <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it
 324  *   with the sequence of escaped octets that represent that character in the
 325  *   UTF-8 character set.  The Euro currency symbol ({@code '\u005Cu20AC'}),
 326  *   for example, is encoded as {@code "%E2%82%AC"}.  <i>(<b>Deviation from
 327  *   RFC&nbsp;2396</b>, which does not specify any particular character
 328  *   set.)</i> </p></li>
 329  *
 330  *   <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by
 331  *   encoding it.  The space character, for example, is quoted by replacing it
 332  *   with {@code "%20"}.  UTF-8 contains US-ASCII, hence for US-ASCII
 333  *   characters this transformation has exactly the effect required by
 334  *   RFC&nbsp;2396. </p></li>
 335  *
 336  *   <li><p><a id="decode"></a>
 337  *   A sequence of escaped octets is <i>decoded</i> by
 338  *   replacing it with the sequence of characters that it represents in the
 339  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
 340  *   effect of de-quoting any quoted US-ASCII characters as well as that of
 341  *   decoding any encoded non-US-ASCII characters.  If a <a
 342  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
 343  *   when decoding the escaped octets then the erroneous octets are replaced by
 344  *   {@code '\u005CuFFFD'}, the Unicode replacement character.  </p></li>
 345  *
 346  * </ul>
 347  *
 348  * These operations are exposed in the constructors and methods of this class
 349  * as follows:
 350  *
 351  * <ul>
 352  *
 353  *   <li><p> The {@linkplain #URI(java.lang.String) single-argument
 354  *   constructor} requires any illegal characters in its argument to be
 355  *   quoted and preserves any escaped octets and <i>other</i> characters that
 356  *   are present.  </p></li>
 357  *
 358  *   <li><p> The {@linkplain
 359  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
 360  *   multi-argument constructors} quote illegal characters as
 361  *   required by the components in which they appear.  The percent character
 362  *   ({@code '%'}) is always quoted by these constructors.  Any <i>other</i>
 363  *   characters are preserved.  </p></li>
 364  *
 365  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
 366  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
 367  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
 368  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
 369  *   values of their corresponding components in raw form, without interpreting
 370  *   any escaped octets.  The strings returned by these methods may contain
 371  *   both escaped octets and <i>other</i> characters, and will not contain any
 372  *   illegal characters.  </p></li>
 373  *
 374  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
 375  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
 376  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
 377  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
 378  *   octets in their corresponding components.  The strings returned by these
 379  *   methods may contain both <i>other</i> characters and illegal characters,
 380  *   and will not contain any escaped octets.  </p></li>
 381  *
 382  *   <li><p> The {@link #toString() toString} method returns a URI string with
 383  *   all necessary quotation but which may contain <i>other</i> characters.
 384  *   </p></li>
 385  *
 386  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
 387  *   quoted and encoded URI string that does not contain any <i>other</i>
 388  *   characters.  </p></li>
 389  *
 390  * </ul>
 391  *
 392  *
 393  * <h3> Identities </h3>
 394  *
 395  * For any URI <i>u</i>, it is always the case that
 396  *
 397  * <blockquote>
 398  * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )}&nbsp;.
 399  * </blockquote>
 400  *
 401  * For any URI <i>u</i> that does not contain redundant syntax such as two
 402  * slashes before an empty authority (as in {@code file:///tmp/}&nbsp;) or a
 403  * colon following a host name but no port (as in
 404  * {@code http://www.example.com:}&nbsp;), and that does not encode characters
 405  * except those that must be quoted, the following identities also hold:
 406  * <pre>
 407  *     new URI(<i>u</i>.getScheme(),
 408  *             <i>u</i>.getSchemeSpecificPart(),
 409  *             <i>u</i>.getFragment())
 410  *     .equals(<i>u</i>)</pre>
 411  * in all cases,
 412  * <pre>
 413  *     new URI(<i>u</i>.getScheme(),
 414  *             <i>u</i>.getAuthority(),
 415  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 416  *             <i>u</i>.getFragment())
 417  *     .equals(<i>u</i>)</pre>
 418  * if <i>u</i> is hierarchical, and
 419  * <pre>
 420  *     new URI(<i>u</i>.getScheme(),
 421  *             <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(),
 422  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 423  *             <i>u</i>.getFragment())
 424  *     .equals(<i>u</i>)</pre>
 425  * if <i>u</i> is hierarchical and has either no authority or a server-based
 426  * authority.
 427  *
 428  *
 429  * <h3> URIs, URLs, and URNs </h3>
 430  *
 431  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
 432  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
 433  * not every URI is a URL.  This is because there is another subcategory of
 434  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
 435  * specify how to locate them.  The {@code mailto}, {@code news}, and
 436  * {@code isbn} URIs shown above are examples of URNs.
 437  *
 438  * <p> The conceptual distinction between URIs and URLs is reflected in the
 439  * differences between this class and the {@link URL} class.
 440  *
 441  * <p> An instance of this class represents a URI reference in the syntactic
 442  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
 443  * A URI string is parsed according to the generic syntax without regard to the
 444  * scheme, if any, that it specifies.  No lookup of the host, if any, is
 445  * performed, and no scheme-dependent stream handler is constructed.  Equality,
 446  * hashing, and comparison are defined strictly in terms of the character
 447  * content of the instance.  In other words, a URI instance is little more than
 448  * a structured string that supports the syntactic, scheme-independent
 449  * operations of comparison, normalization, resolution, and relativization.
 450  *
 451  * <p> An instance of the {@link URL} class, by contrast, represents the
 452  * syntactic components of a URL together with some of the information required
 453  * to access the resource that it describes.  A URL must be absolute, that is,
 454  * it must always specify a scheme.  A URL string is parsed according to its
 455  * scheme.  A stream handler is always established for a URL, and in fact it is
 456  * impossible to create a URL instance for a scheme for which no handler is
 457  * available.  Equality and hashing depend upon both the scheme and the
 458  * Internet address of the host, if any; comparison is not defined.  In other
 459  * words, a URL is a structured string that supports the syntactic operation of
 460  * resolution as well as the network I/O operations of looking up the host and
 461  * opening a connection to the specified resource.
 462  *
 463  * @apiNote
 464  *
 465  * Applications working with file paths and file URIs should take great
 466  * care to use the appropriate methods to convert between the two.
 467  * The {@link Path#of(URI)} factory method and the {@link File#File(URI)}
 468  * constructor can be used to create {@link Path} or {@link File}
 469  * objects from a file URI. {@link Path#toUri()} and {@link File#toURI()}
 470  * can be used to create a {@link URI} from a file path.
 471  * Applications should never try to {@linkplain
 472  * #URI(String, String, String, int, String, String, String)
 473  * construct}, {@linkplain #URI(String) parse}, or
 474  * {@linkplain #resolve(String) resolve} a {@code URI}
 475  * from the direct string representation of a {@code File} or {@code Path}
 476  * instance.
 477  * <p>
 478  * Some components of a URL or URI, such as <i>userinfo</i>, may
 479  * be abused to construct misleading URLs or URIs. Applications
 480  * that deal with URLs or URIs should take into account
 481  * the recommendations advised in <a
 482  * href="https://tools.ietf.org/html/rfc3986#section-7">RFC3986,
 483  * Section 7, Security Considerations</a>.
 484  *
 485  * @author Mark Reinhold
 486  * @since 1.4
 487  *
 488  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
 489  * transformation format of ISO 10646</i></a>, <br><a
 490  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
 491  * Architecture</i></a>, <br><a
 492  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
 493  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
 494  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
 495  * Literal IPv6 Addresses in URLs</i></a>, <br><a
 496  * href="URISyntaxException.html">URISyntaxException</a>
 497  */
 498 
 499 public final class URI
 500     implements Comparable<URI>, Serializable
 501 {
 502 
 503     // Note: Comments containing the word "ASSERT" indicate places where a
 504     // throw of an InternalError should be replaced by an appropriate assertion
 505     // statement once asserts are enabled in the build.
 506 
 507     static final long serialVersionUID = -6052424284110960213L;
 508 
 509 
 510     // -- Properties and components of this instance --
 511 
 512     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
 513     private transient String scheme;            // null ==> relative URI
 514     private transient String fragment;          // null ==> undefined
 515 
 516     // Hierarchical URI components: [//<authority>]<path>[?<query>]
 517     private transient String authority;         // Registry or server
 518 
 519     // Server-based authority: [<userInfo>@]<host>[:<port>]
 520     private transient String userInfo;          // null ==> undefined
 521     private transient String host;              // null ==> registry-based
 522     private transient int port = -1;            // -1 ==> undefined
 523 
 524     // Remaining components of hierarchical URIs
 525     private transient String path;              // null ==> opaque
 526     private transient String query;             // null ==> undefined
 527 
 528     // The remaining fields may be computed on demand, which is safe even in
 529     // the face of multiple threads racing to initialize them
 530     private transient String schemeSpecificPart;
 531     private transient int hash;        // Zero ==> undefined
 532     // Presumably lazily cached decoded forms of the components -- confirm
 533     private transient String decodedUserInfo;
 534     private transient String decodedAuthority;
 535     private transient String decodedPath;
 536     private transient String decodedQuery;
 537     private transient String decodedFragment;
 538     private transient String decodedSchemeSpecificPart;
 539 
 540     /**
 541      * The string form of this URI.
 542      *
 543      * @serial
 544      */
 545     private volatile String string;             // The only serializable field
 546 
 547 
 548 
 549     // -- Constructors and factories --
 550 
 551     private URI() { }                           // Used internally; fields keep their default (undefined) values
 552 
 553     /**
 554      * Constructs a URI by parsing the given string.
 555      *
 556      * <p> This constructor parses the given string exactly as specified by the
 557      * grammar in <a
 558      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 559      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
 560      *
 561      * <ul>
 562      *
 563      *   <li><p> An empty authority component is permitted as long as it is
 564      *   followed by a non-empty path, a query component, or a fragment
 565      *   component.  This allows the parsing of URIs such as
 566      *   {@code "file:///foo/bar"}, which seems to be the intent of
 567      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
 568      *   authority component is empty then the user-information, host, and port
 569      *   components are undefined. </p></li>
 570      *
 571      *   <li><p> Empty relative paths are permitted; this seems to be the
 572      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
 573      *   primary consequence of this deviation is that a standalone fragment
 574      *   such as {@code "#foo"} parses as a relative URI with an empty path
 575      *   and the given fragment, and can be usefully <a
 576      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
 577      *
 578      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
 579      *   specified by <a
 580      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
 581      *   element of a dotted-quad address must contain no more than three
 582      *   decimal digits.  Each element is further constrained to have a value
 583      *   no greater than 255. </p></li>
 584      *
 585      *   <li> <p> Hostnames in host components that comprise only a single
 586      *   domain label are permitted to start with an <i>alphanum</i>
 587      *   character. This seems to be the intent of <a
 588      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 589      *   section&nbsp;3.2.2 although the grammar does not permit it. The
 590      *   consequence of this deviation is that the authority component of a
 591      *   hierarchical URI such as {@code s://123}, will parse as a server-based
 592      *   authority. </p></li>
 593      *
 594      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
 595      *   address must be enclosed in square brackets ({@code '['} and
 596      *   {@code ']'}) as specified by <a
 597      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
 598      *   IPv6 address itself must parse according to <a
 599      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
 600      *   addresses are further constrained to describe no more than sixteen
 601      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
 602      *   but not expressible in the grammar. </p></li>
 603      *
 604      *   <li><p> Characters in the <i>other</i> category are permitted wherever
 605      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
 606      *   user-information, path, query, and fragment components, as well as in
 607      *   the authority component if the authority is registry-based.  This
 608      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
 609      *   character set. </p></li>
 610      *
 611      * </ul>
 612      *
 613      * @param  str   The string to be parsed into a URI
 614      *
 615      * @throws  NullPointerException
 616      *          If {@code str} is {@code null}
 617      *
 618      * @throws  URISyntaxException
 619      *          If the given string violates RFC&nbsp;2396, as augmented
 620      *          by the above deviations
 621      */
 622     public URI(String str) throws URISyntaxException {
 623         new Parser(str).parse(false);           // Delegates all parsing to Parser
 624     }
 625 
 626     /**
 627      * Constructs a hierarchical URI from the given components.
 628      *
 629      * <p> If a scheme is given then the path, if also given, must either be
 630      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 631      * component of the new URI may be left undefined by passing {@code null}
 632      * for the corresponding parameter or, in the case of the {@code port}
 633      * parameter, by passing {@code -1}.
 634      *
 635      * <p> This constructor first builds a URI string from the given components
 636      * according to the rules specified in <a
 637      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 638      * section&nbsp;5.2, step&nbsp;7: </p>
 639      *
 640      * <ol>
 641      *
 642      *   <li><p> Initially, the result string is empty. </p></li>
 643      *
 644      *   <li><p> If a scheme is given then it is appended to the result,
 645      *   followed by a colon character ({@code ':'}).  </p></li>
 646      *
 647      *   <li><p> If user information, a host, or a port are given then the
 648      *   string {@code "//"} is appended.  </p></li>
 649      *
 650      *   <li><p> If user information is given then it is appended, followed by
 651      *   a commercial-at character ({@code '@'}).  Any character not in the
 652      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 653      *   categories is <a href="#quote">quoted</a>.  </p></li>
 654      *
 655      *   <li><p> If a host is given then it is appended.  If the host is a
 656      *   literal IPv6 address but is not enclosed in square brackets
 657      *   ({@code '['} and {@code ']'}) then the square brackets are added.
 658      *   </p></li>
 659      *
 660      *   <li><p> If a port number is given then a colon character
 661      *   ({@code ':'}) is appended, followed by the port number in decimal.
 662      *   </p></li>
 663      *
 664      *   <li><p> If a path is given then it is appended.  Any character not in
 665      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 666      *   categories, and not equal to the slash character ({@code '/'}) or the
 667      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 668      *
 669      *   <li><p> If a query is given then a question-mark character
 670      *   ({@code '?'}) is appended, followed by the query.  Any character that
 671      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 672      *   </p></li>
 673      *
 674      *   <li><p> Finally, if a fragment is given then a hash character
 675      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 676      *   that is not a legal URI character is quoted.  </p></li>
 677      *
 678      * </ol>
 679      *
 680      * <p> The resulting URI string is then parsed as if by invoking the {@link
 681      * #URI(String)} constructor and then invoking the {@link
 682      * #parseServerAuthority()} method upon the result; this may cause a {@link
 683      * URISyntaxException} to be thrown.  </p>
 684      *
 685      * @param   scheme    Scheme name
 686      * @param   userInfo  User name and authorization information
 687      * @param   host      Host name
 688      * @param   port      Port number
 689      * @param   path      Path
 690      * @param   query     Query
 691      * @param   fragment  Fragment
 692      *
 693      * @throws URISyntaxException
 694      *         If both a scheme and a path are given but the path is relative,
 695      *         if the URI string constructed from the given components violates
 696      *         RFC&nbsp;2396, or if the authority component of the string is
 697      *         present but cannot be parsed as a server-based authority
 698      */
 699     public URI(String scheme,
 700                String userInfo, String host, int port,
 701                String path, String query, String fragment)
 702         throws URISyntaxException
 703     {
 704         String s = toString(scheme, null,
 705                             null, userInfo, host, port,
 706                             path, query, fragment);
 707         checkPath(s, scheme, path);
 708         new Parser(s).parse(true);
 709     }
 710 
 711     /**
 712      * Constructs a hierarchical URI from the given components.
 713      *
 714      * <p> If a scheme is given then the path, if also given, must either be
 715      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 716      * component of the new URI may be left undefined by passing {@code null}
 717      * for the corresponding parameter.
 718      *
 719      * <p> This constructor first builds a URI string from the given components
 720      * according to the rules specified in <a
 721      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 722      * section&nbsp;5.2, step&nbsp;7: </p>
 723      *
 724      * <ol>
 725      *
 726      *   <li><p> Initially, the result string is empty.  </p></li>
 727      *
 728      *   <li><p> If a scheme is given then it is appended to the result,
 729      *   followed by a colon character ({@code ':'}).  </p></li>
 730      *
 731      *   <li><p> If an authority is given then the string {@code "//"} is
 732      *   appended, followed by the authority.  If the authority contains a
 733      *   literal IPv6 address then the address must be enclosed in square
 734      *   brackets ({@code '['} and {@code ']'}).  Any character not in the
 735      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 736      *   categories, and not equal to the commercial-at character
 737      *   ({@code '@'}), is <a href="#quote">quoted</a>.  </p></li>
 738      *
 739      *   <li><p> If a path is given then it is appended.  Any character not in
 740      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 741      *   categories, and not equal to the slash character ({@code '/'}) or the
 742      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 743      *
 744      *   <li><p> If a query is given then a question-mark character
 745      *   ({@code '?'}) is appended, followed by the query.  Any character that
 746      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 747      *   </p></li>
 748      *
 749      *   <li><p> Finally, if a fragment is given then a hash character
 750      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 751      *   that is not a legal URI character is quoted.  </p></li>
 752      *
 753      * </ol>
 754      *
 755      * <p> The resulting URI string is then parsed as if by invoking the {@link
 756      * #URI(String)} constructor and then invoking the {@link
 757      * #parseServerAuthority()} method upon the result; this may cause a {@link
 758      * URISyntaxException} to be thrown.  </p>
 759      *
 760      * @param   scheme     Scheme name
 761      * @param   authority  Authority
 762      * @param   path       Path
 763      * @param   query      Query
 764      * @param   fragment   Fragment
 765      *
 766      * @throws URISyntaxException
 767      *         If both a scheme and a path are given but the path is relative,
 768      *         if the URI string constructed from the given components violates
 769      *         RFC&nbsp;2396, or if the authority component of the string is
 770      *         present but cannot be parsed as a server-based authority
 771      */
 772     public URI(String scheme,
 773                String authority,
 774                String path, String query, String fragment)
 775         throws URISyntaxException
 776     {
 777         String s = toString(scheme, null,
 778                             authority, null, null, -1,
 779                             path, query, fragment);
 780         checkPath(s, scheme, path);
 781         new Parser(s).parse(false);
 782     }
 783 
 784     /**
 785      * Constructs a hierarchical URI from the given components.
 786      *
 787      * <p> A component may be left undefined by passing {@code null}.
 788      *
 789      * <p> This convenience constructor works as if by invoking the
 790      * seven-argument constructor as follows:
 791      *
 792      * <blockquote>
 793      * {@code new} {@link #URI(String, String, String, int, String, String, String)
 794      * URI}{@code (scheme, null, host, -1, path, null, fragment);}
 795      * </blockquote>
 796      *
 797      * @param   scheme    Scheme name
 798      * @param   host      Host name
 799      * @param   path      Path
 800      * @param   fragment  Fragment
 801      *
 802      * @throws  URISyntaxException
 803      *          If the URI string constructed from the given components
 804      *          violates RFC&nbsp;2396
 805      */
 806     public URI(String scheme, String host, String path, String fragment)
 807         throws URISyntaxException
 808     {
 809         this(scheme, null, host, -1, path, null, fragment);
 810     }
 811 
 812     /**
 813      * Constructs a URI from the given components.
 814      *
 815      * <p> A component may be left undefined by passing {@code null}.
 816      *
 817      * <p> This constructor first builds a URI in string form using the given
 818      * components as follows:  </p>
 819      *
 820      * <ol>
 821      *
 822      *   <li><p> Initially, the result string is empty.  </p></li>
 823      *
 824      *   <li><p> If a scheme is given then it is appended to the result,
 825      *   followed by a colon character ({@code ':'}).  </p></li>
 826      *
 827      *   <li><p> If a scheme-specific part is given then it is appended.  Any
 828      *   character that is not a <a href="#legal-chars">legal URI character</a>
 829      *   is <a href="#quote">quoted</a>.  </p></li>
 830      *
 831      *   <li><p> Finally, if a fragment is given then a hash character
 832      *   ({@code '#'}) is appended to the string, followed by the fragment.
 833      *   Any character that is not a legal URI character is quoted.  </p></li>
 834      *
 835      * </ol>
 836      *
 837      * <p> The resulting URI string is then parsed in order to create the new
 838      * URI instance as if by invoking the {@link #URI(String)} constructor;
 839      * this may cause a {@link URISyntaxException} to be thrown.  </p>
 840      *
 841      * @param   scheme    Scheme name
 842      * @param   ssp       Scheme-specific part
 843      * @param   fragment  Fragment
 844      *
 845      * @throws  URISyntaxException
 846      *          If the URI string constructed from the given components
 847      *          violates RFC&nbsp;2396
 848      */
 849     public URI(String scheme, String ssp, String fragment)
 850         throws URISyntaxException
 851     {
 852         new Parser(toString(scheme, ssp,
 853                             null, null, null, -1,
 854                             null, null, fragment))
 855             .parse(false);
 856     }
 857 
 858     /**
 859      * Constructs a simple URI consisting of only a scheme and a pre-validated
 860      * path. Provides a fast-path for some internal cases.
 861      */
 862     URI(String scheme, String path) {
 863         assert validSchemeAndPath(scheme, path);
 864         this.scheme = scheme;
 865         this.path = path;
 866     }
 867 
 868     private static boolean validSchemeAndPath(String scheme, String path) {
 869         try {
 870             URI u = new URI(scheme + ":" + path);
 871             return scheme.equals(u.scheme) && path.equals(u.path);
 872         } catch (URISyntaxException e) {
 873             return false;
 874         }
 875     }
 876 
 877     /**
 878      * Creates a URI by parsing the given string.
 879      *
 880      * <p> This convenience factory method works as if by invoking the {@link
 881      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
 882      * constructor is caught and wrapped in a new {@link
 883      * IllegalArgumentException} object, which is then thrown.
 884      *
 885      * <p> This method is provided for use in situations where it is known that
 886      * the given string is a legal URI, for example for URI constants declared
 887      * within a program, and so it would be considered a programming error
 888      * for the string not to parse as such.  The constructors, which throw
 889      * {@link URISyntaxException} directly, should be used in situations where a
 890      * URI is being constructed from user input or from some other source that
 891      * may be prone to errors.  </p>
 892      *
 893      * @param  str   The string to be parsed into a URI
 894      * @return The new URI
 895      *
 896      * @throws  NullPointerException
 897      *          If {@code str} is {@code null}
 898      *
 899      * @throws  IllegalArgumentException
 900      *          If the given string violates RFC&nbsp;2396
 901      */
 902     public static URI create(String str) {
 903         try {
 904             return new URI(str);
 905         } catch (URISyntaxException x) {
 906             throw new IllegalArgumentException(x.getMessage(), x);
 907         }
 908     }
 909 
 910 
 911     // -- Operations --
 912 
 913     /**
 914      * Attempts to parse this URI's authority component, if defined, into
 915      * user-information, host, and port components.
 916      *
 917      * <p> If this URI's authority component has already been recognized as
 918      * being server-based then it will already have been parsed into
 919      * user-information, host, and port components.  In this case, or if this
 920      * URI has no authority component, this method simply returns this URI.
 921      *
 922      * <p> Otherwise this method attempts once more to parse the authority
 923      * component into user-information, host, and port components, and throws
 924      * an exception describing why the authority component could not be parsed
 925      * in that way.
 926      *
 927      * <p> This method is provided because the generic URI syntax specified in
 928      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 929      * cannot always distinguish a malformed server-based authority from a
 930      * legitimate registry-based authority.  It must therefore treat some
 931      * instances of the former as instances of the latter.  The authority
 932      * component in the URI string {@code "//foo:bar"}, for example, is not a
 933      * legal server-based authority but it is legal as a registry-based
 934      * authority.
 935      *
     * <p> In many common situations, for example when working with URIs that are
 937      * known to be either URNs or URLs, the hierarchical URIs being used will
 938      * always be server-based.  They therefore must either be parsed as such or
 939      * treated as an error.  In these cases a statement such as
 940      *
 941      * <blockquote>
 942      * {@code URI }<i>u</i>{@code  = new URI(str).parseServerAuthority();}
 943      * </blockquote>
 944      *
 945      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
 946      * it has an authority component, has a server-based authority with proper
 947      * user-information, host, and port components.  Invoking this method also
 948      * ensures that if the authority could not be parsed in that way then an
 949      * appropriate diagnostic message can be issued based upon the exception
 950      * that is thrown. </p>
 951      *
 952      * @return  A URI whose authority field has been parsed
 953      *          as a server-based authority
 954      *
 955      * @throws  URISyntaxException
 956      *          If the authority component of this URI is defined
 957      *          but cannot be parsed as a server-based authority
 958      *          according to RFC&nbsp;2396
 959      */
 960     public URI parseServerAuthority()
 961         throws URISyntaxException
 962     {
 963         // We could be clever and cache the error message and index from the
 964         // exception thrown during the original parse, but that would require
 965         // either more fields or a more-obscure representation.
 966         if ((host != null) || (authority == null))
 967             return this;
 968         new Parser(toString()).parse(true);
 969         return this;
 970     }
 971 
 972     /**
 973      * Normalizes this URI's path.
 974      *
 975      * <p> If this URI is opaque, or if its path is already in normal form,
 976      * then this URI is returned.  Otherwise a new URI is constructed that is
 977      * identical to this URI except that its path is computed by normalizing
 978      * this URI's path in a manner consistent with <a
 979      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 980      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
 981      * </p>
 982      *
 983      * <ol>
 984      *
 985      *   <li><p> All {@code "."} segments are removed. </p></li>
 986      *
 987      *   <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."}
 988      *   segment then both of these segments are removed.  This step is
 989      *   repeated until it is no longer applicable. </p></li>
 990      *
 991      *   <li><p> If the path is relative, and if its first segment contains a
 992      *   colon character ({@code ':'}), then a {@code "."} segment is
 993      *   prepended.  This prevents a relative URI with a path such as
 994      *   {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a
 995      *   scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}.
 996      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
 997      *
 998      * </ol>
 999      *
1000      * <p> A normalized path will begin with one or more {@code ".."} segments
1001      * if there were insufficient non-{@code ".."} segments preceding them to
1002      * allow their removal.  A normalized path will begin with a {@code "."}
1003      * segment if one was inserted by step 3 above.  Otherwise, a normalized
1004      * path will not contain any {@code "."} or {@code ".."} segments. </p>
1005      *
1006      * @return  A URI equivalent to this URI,
1007      *          but whose path is in normal form
1008      */
1009     public URI normalize() {
1010         return normalize(this);
1011     }
1012 
1013     /**
1014      * Resolves the given URI against this URI.
1015      *
1016      * <p> If the given URI is already absolute, or if this URI is opaque, then
1017      * the given URI is returned.
1018      *
1019      * <p><a id="resolve-frag"></a> If the given URI's fragment component is
1020      * defined, its path component is empty, and its scheme, authority, and
1021      * query components are undefined, then a URI with the given fragment but
1022      * with all other components equal to those of this URI is returned.  This
1023      * allows a URI representing a standalone fragment reference, such as
1024      * {@code "#foo"}, to be usefully resolved against a base URI.
1025      *
1026      * <p> Otherwise this method constructs a new hierarchical URI in a manner
1027      * consistent with <a
1028      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1029      * section&nbsp;5.2; that is: </p>
1030      *
1031      * <ol>
1032      *
1033      *   <li><p> A new URI is constructed with this URI's scheme and the given
1034      *   URI's query and fragment components. </p></li>
1035      *
1036      *   <li><p> If the given URI has an authority component then the new URI's
1037      *   authority and path are taken from the given URI. </p></li>
1038      *
1039      *   <li><p> Otherwise the new URI's authority component is copied from
1040      *   this URI, and its path is computed as follows: </p>
1041      *
1042      *   <ol>
1043      *
1044      *     <li><p> If the given URI's path is absolute then the new URI's path
1045      *     is taken from the given URI. </p></li>
1046      *
1047      *     <li><p> Otherwise the given URI's path is relative, and so the new
1048      *     URI's path is computed by resolving the path of the given URI
1049      *     against the path of this URI.  This is done by concatenating all but
1050      *     the last segment of this URI's path, if any, with the given URI's
1051      *     path and then normalizing the result as if by invoking the {@link
1052      *     #normalize() normalize} method. </p></li>
1053      *
1054      *   </ol></li>
1055      *
1056      * </ol>
1057      *
1058      * <p> The result of this method is absolute if, and only if, either this
1059      * URI is absolute or the given URI is absolute.  </p>
1060      *
1061      * @param  uri  The URI to be resolved against this URI
1062      * @return The resulting URI
1063      *
1064      * @throws  NullPointerException
1065      *          If {@code uri} is {@code null}
1066      */
1067     public URI resolve(URI uri) {
1068         return resolve(this, uri);
1069     }
1070 
1071     /**
1072      * Constructs a new URI by parsing the given string and then resolving it
1073      * against this URI.
1074      *
1075      * <p> This convenience method works as if invoking it were equivalent to
1076      * evaluating the expression {@link #resolve(java.net.URI)
1077      * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p>
1078      *
1079      * @param  str   The string to be parsed into a URI
1080      * @return The resulting URI
1081      *
1082      * @throws  NullPointerException
1083      *          If {@code str} is {@code null}
1084      *
1085      * @throws  IllegalArgumentException
1086      *          If the given string violates RFC&nbsp;2396
1087      */
1088     public URI resolve(String str) {
1089         return resolve(URI.create(str));
1090     }
1091 
1092     /**
1093      * Relativizes the given URI against this URI.
1094      *
1095      * <p> The relativization of the given URI against this URI is computed as
1096      * follows: </p>
1097      *
1098      * <ol>
1099      *
1100      *   <li><p> If either this URI or the given URI are opaque, or if the
1101      *   scheme and authority components of the two URIs are not identical, or
1102      *   if the path of this URI is not a prefix of the path of the given URI,
1103      *   then the given URI is returned. </p></li>
1104      *
1105      *   <li><p> Otherwise a new relative hierarchical URI is constructed with
1106      *   query and fragment components taken from the given URI and with a path
1107      *   component computed by removing this URI's path from the beginning of
1108      *   the given URI's path. </p></li>
1109      *
1110      * </ol>
1111      *
1112      * @param  uri  The URI to be relativized against this URI
1113      * @return The resulting URI
1114      *
1115      * @throws  NullPointerException
1116      *          If {@code uri} is {@code null}
1117      */
1118     public URI relativize(URI uri) {
1119         return relativize(this, uri);
1120     }
1121 
1122     /**
1123      * Constructs a URL from this URI.
1124      *
1125      * <p> This convenience method works as if invoking it were equivalent to
1126      * evaluating the expression {@code new URL(this.toString())} after
1127      * first checking that this URI is absolute. </p>
1128      *
1129      * @return  A URL constructed from this URI
1130      *
1131      * @throws  IllegalArgumentException
     *          If this URI is not absolute
1133      *
1134      * @throws  MalformedURLException
1135      *          If a protocol handler for the URL could not be found,
1136      *          or if some other error occurred while constructing the URL
1137      */
1138     public URL toURL() throws MalformedURLException {
1139         return URL.fromURI(this);
1140     }
1141 
1142     // -- Component access methods --
1143 
1144     /**
1145      * Returns the scheme component of this URI.
1146      *
1147      * <p> The scheme component of a URI, if defined, only contains characters
1148      * in the <i>alphanum</i> category and in the string {@code "-.+"}.  A
1149      * scheme always starts with an <i>alpha</i> character. <p>
1150      *
1151      * The scheme component of a URI cannot contain escaped octets, hence this
1152      * method does not perform any decoding.
1153      *
1154      * @return  The scheme component of this URI,
1155      *          or {@code null} if the scheme is undefined
1156      */
1157     public String getScheme() {
1158         return scheme;
1159     }
1160 
1161     /**
1162      * Tells whether or not this URI is absolute.
1163      *
1164      * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1165      *
1166      * @return  {@code true} if, and only if, this URI is absolute
1167      */
1168     public boolean isAbsolute() {
1169         return scheme != null;
1170     }
1171 
1172     /**
1173      * Tells whether or not this URI is opaque.
1174      *
1175      * <p> A URI is opaque if, and only if, it is absolute and its
1176      * scheme-specific part does not begin with a slash character ('/').
1177      * An opaque URI has a scheme, a scheme-specific part, and possibly
1178      * a fragment; all other components are undefined. </p>
1179      *
1180      * @return  {@code true} if, and only if, this URI is opaque
1181      */
1182     public boolean isOpaque() {
1183         return path == null;
1184     }
1185 
1186     /**
1187      * Returns the raw scheme-specific part of this URI.  The scheme-specific
1188      * part is never undefined, though it may be empty.
1189      *
1190      * <p> The scheme-specific part of a URI only contains legal URI
1191      * characters. </p>
1192      *
1193      * @return  The raw scheme-specific part of this URI
1194      *          (never {@code null})
1195      */
1196     public String getRawSchemeSpecificPart() {
1197         String part = schemeSpecificPart;
1198         if (part != null) {
1199             return part;
1200         }
1201 
1202         String s = string;
1203         if (s != null) {
1204             // if string is defined, components will have been parsed
1205             int start = 0;
1206             int end = s.length();
1207             if (scheme != null) {
1208                 start = scheme.length() + 1;
1209             }
1210             if (fragment != null) {
1211                 end -= fragment.length() + 1;
1212             }
1213             if (path != null && path.length() == end - start) {
1214                 part = path;
1215             } else {
1216                 part = s.substring(start, end);
1217             }
1218         } else {
1219             StringBuilder sb = new StringBuilder();
1220             appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1221                                  host, port, getPath(), getQuery());
1222             part = sb.toString();
1223         }
1224         return schemeSpecificPart = part;
1225     }
1226 
1227     /**
1228      * Returns the decoded scheme-specific part of this URI.
1229      *
1230      * <p> The string returned by this method is equal to that returned by the
1231      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1232      * except that all sequences of escaped octets are <a
1233      * href="#decode">decoded</a>.  </p>
1234      *
1235      * @return  The decoded scheme-specific part of this URI
1236      *          (never {@code null})
1237      */
1238     public String getSchemeSpecificPart() {
1239         String part = decodedSchemeSpecificPart;
1240         if (part == null) {
1241             decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart());
1242         }
1243         return part;
1244     }
1245 
1246     /**
1247      * Returns the raw authority component of this URI.
1248      *
1249      * <p> The authority component of a URI, if defined, only contains the
1250      * commercial-at character ({@code '@'}) and characters in the
1251      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1252      * categories.  If the authority is server-based then it is further
1253      * constrained to have valid user-information, host, and port
1254      * components. </p>
1255      *
1256      * @return  The raw authority component of this URI,
1257      *          or {@code null} if the authority is undefined
1258      */
1259     public String getRawAuthority() {
1260         return authority;
1261     }
1262 
1263     /**
1264      * Returns the decoded authority component of this URI.
1265      *
1266      * <p> The string returned by this method is equal to that returned by the
1267      * {@link #getRawAuthority() getRawAuthority} method except that all
1268      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1269      *
1270      * @return  The decoded authority component of this URI,
1271      *          or {@code null} if the authority is undefined
1272      */
1273     public String getAuthority() {
1274         String auth = decodedAuthority;
1275         if ((auth == null) && (authority != null)) {
1276             decodedAuthority = auth = decode(authority);
1277         }
1278         return auth;
1279     }
1280 
1281     /**
1282      * Returns the raw user-information component of this URI.
1283      *
1284      * <p> The user-information component of a URI, if defined, only contains
1285      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1286      * <i>other</i> categories. </p>
1287      *
1288      * @return  The raw user-information component of this URI,
1289      *          or {@code null} if the user information is undefined
1290      */
1291     public String getRawUserInfo() {
1292         return userInfo;
1293     }
1294 
1295     /**
1296      * Returns the decoded user-information component of this URI.
1297      *
1298      * <p> The string returned by this method is equal to that returned by the
1299      * {@link #getRawUserInfo() getRawUserInfo} method except that all
1300      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1301      *
1302      * @return  The decoded user-information component of this URI,
1303      *          or {@code null} if the user information is undefined
1304      */
1305     public String getUserInfo() {
1306         String user = decodedUserInfo;
1307         if ((user == null) && (userInfo != null)) {
1308             decodedUserInfo = user = decode(userInfo);
1309         }
1310         return user;
1311     }
1312 
1313     /**
1314      * Returns the host component of this URI.
1315      *
1316      * <p> The host component of a URI, if defined, will have one of the
1317      * following forms: </p>
1318      *
1319      * <ul>
1320      *
1321      *   <li><p> A domain name consisting of one or more <i>labels</i>
1322      *   separated by period characters ({@code '.'}), optionally followed by
1323      *   a period character.  Each label consists of <i>alphanum</i> characters
1324      *   as well as hyphen characters ({@code '-'}), though hyphens never
1325      *   occur as the first or last characters in a label. The rightmost
1326      *   label of a domain name consisting of two or more labels, begins
1327      *   with an <i>alpha</i> character. </li>
1328      *
1329      *   <li><p> A dotted-quad IPv4 address of the form
1330      *   <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +},
1331      *   where no <i>digit</i> sequence is longer than three characters and no
1332      *   sequence has a value larger than 255. </p></li>
1333      *
1334      *   <li><p> An IPv6 address enclosed in square brackets ({@code '['} and
1335      *   {@code ']'}) and consisting of hexadecimal digits, colon characters
1336      *   ({@code ':'}), and possibly an embedded IPv4 address.  The full
1337      *   syntax of IPv6 addresses is specified in <a
1338      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1339      *   Addressing Architecture</i></a>.  </p></li>
1340      *
1341      * </ul>
1342      *
1343      * The host component of a URI cannot contain escaped octets, hence this
1344      * method does not perform any decoding.
1345      *
1346      * @return  The host component of this URI,
1347      *          or {@code null} if the host is undefined
1348      */
1349     public String getHost() {
1350         return host;
1351     }
1352 
1353     /**
1354      * Returns the port number of this URI.
1355      *
1356      * <p> The port component of a URI, if defined, is a non-negative
1357      * integer. </p>
1358      *
1359      * @return  The port component of this URI,
1360      *          or {@code -1} if the port is undefined
1361      */
1362     public int getPort() {
1363         return port;
1364     }
1365 
1366     /**
1367      * Returns the raw path component of this URI.
1368      *
1369      * <p> The path component of a URI, if defined, only contains the slash
1370      * character ({@code '/'}), the commercial-at character ({@code '@'}),
1371      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1372      * and <i>other</i> categories. </p>
1373      *
1374      * @return  The path component of this URI,
1375      *          or {@code null} if the path is undefined
1376      */
1377     public String getRawPath() {
1378         return path;
1379     }
1380 
1381     /**
1382      * Returns the decoded path component of this URI.
1383      *
1384      * <p> The string returned by this method is equal to that returned by the
1385      * {@link #getRawPath() getRawPath} method except that all sequences of
1386      * escaped octets are <a href="#decode">decoded</a>.  </p>
1387      *
1388      * @return  The decoded path component of this URI,
1389      *          or {@code null} if the path is undefined
1390      */
1391     public String getPath() {
1392         String decoded = decodedPath;
1393         if ((decoded == null) && (path != null)) {
1394             decodedPath = decoded = decode(path);
1395         }
1396         return decoded;
1397     }
1398 
1399     /**
1400      * Returns the raw query component of this URI.
1401      *
1402      * <p> The query component of a URI, if defined, only contains legal URI
1403      * characters. </p>
1404      *
1405      * @return  The raw query component of this URI,
1406      *          or {@code null} if the query is undefined
1407      */
1408     public String getRawQuery() {
1409         return query;
1410     }
1411 
1412     /**
1413      * Returns the decoded query component of this URI.
1414      *
1415      * <p> The string returned by this method is equal to that returned by the
1416      * {@link #getRawQuery() getRawQuery} method except that all sequences of
1417      * escaped octets are <a href="#decode">decoded</a>.  </p>
1418      *
1419      * @return  The decoded query component of this URI,
1420      *          or {@code null} if the query is undefined
1421      */
1422     public String getQuery() {
1423         String decoded = decodedQuery;
1424         if ((decoded == null) && (query != null)) {
1425             decodedQuery = decoded = decode(query, false);
1426         }
1427         return decoded;
1428     }
1429 
1430     /**
1431      * Returns the raw fragment component of this URI.
1432      *
1433      * <p> The fragment component of a URI, if defined, only contains legal URI
1434      * characters. </p>
1435      *
1436      * @return  The raw fragment component of this URI,
1437      *          or {@code null} if the fragment is undefined
1438      */
1439     public String getRawFragment() {
1440         return fragment;
1441     }
1442 
1443     /**
1444      * Returns the decoded fragment component of this URI.
1445      *
1446      * <p> The string returned by this method is equal to that returned by the
1447      * {@link #getRawFragment() getRawFragment} method except that all
1448      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1449      *
1450      * @return  The decoded fragment component of this URI,
1451      *          or {@code null} if the fragment is undefined
1452      */
1453     public String getFragment() {
1454         String decoded = decodedFragment;
1455         if ((decoded == null) && (fragment != null)) {
1456             decodedFragment = decoded = decode(fragment, false);
1457         }
1458         return decoded;
1459     }
1460 
1461 
1462     // -- Equality, comparison, hash code, toString, and serialization --
1463 
1464     /**
1465      * Tests this URI for equality with another object.
1466      *
1467      * <p> If the given object is not a URI then this method immediately
1468      * returns {@code false}.
1469      *
1470      * <p> For two URIs to be considered equal requires that either both are
1471      * opaque or both are hierarchical.  Their schemes must either both be
1472      * undefined or else be equal without regard to case. Their fragments
1473      * must either both be undefined or else be equal.
1474      *
1475      * <p> For two opaque URIs to be considered equal, their scheme-specific
1476      * parts must be equal.
1477      *
1478      * <p> For two hierarchical URIs to be considered equal, their paths must
1479      * be equal and their queries must either both be undefined or else be
1480      * equal.  Their authorities must either both be undefined, or both be
1481      * registry-based, or both be server-based.  If their authorities are
1482      * defined and are registry-based, then they must be equal.  If their
1483      * authorities are defined and are server-based, then their hosts must be
1484      * equal without regard to case, their port numbers must be equal, and
1485      * their user-information components must be equal.
1486      *
1487      * <p> When testing the user-information, path, query, fragment, authority,
1488      * or scheme-specific parts of two URIs for equality, the raw forms rather
1489      * than the encoded forms of these components are compared and the
1490      * hexadecimal digits of escaped octets are compared without regard to
1491      * case.
1492      *
1493      * <p> This method satisfies the general contract of the {@link
1494      * java.lang.Object#equals(Object) Object.equals} method. </p>
1495      *
1496      * @param   ob   The object to which this object is to be compared
1497      *
1498      * @return  {@code true} if, and only if, the given object is a URI that
1499      *          is identical to this URI
1500      */
1501     public boolean equals(Object ob) {
1502         if (ob == this)
1503             return true;
1504         if (!(ob instanceof URI))
1505             return false;
1506         URI that = (URI)ob;
1507         if (this.isOpaque() != that.isOpaque()) return false;
1508         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1509         if (!equal(this.fragment, that.fragment)) return false;
1510 
1511         // Opaque
1512         if (this.isOpaque())
1513             return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1514 
1515         // Hierarchical
1516         if (!equal(this.path, that.path)) return false;
1517         if (!equal(this.query, that.query)) return false;
1518 
1519         // Authorities
1520         if (this.authority == that.authority) return true;
1521         if (this.host != null) {
1522             // Server-based
1523             if (!equal(this.userInfo, that.userInfo)) return false;
1524             if (!equalIgnoringCase(this.host, that.host)) return false;
1525             if (this.port != that.port) return false;
1526         } else if (this.authority != null) {
1527             // Registry-based
1528             if (!equal(this.authority, that.authority)) return false;
1529         } else if (this.authority != that.authority) {
1530             return false;
1531         }
1532 
1533         return true;
1534     }
1535 
1536     /**
1537      * Returns a hash-code value for this URI.  The hash code is based upon all
1538      * of the URI's components, and satisfies the general contract of the
1539      * {@link java.lang.Object#hashCode() Object.hashCode} method.
1540      *
1541      * @return  A hash-code value for this URI
1542      */
1543     public int hashCode() {
1544         int h = hash;
1545         if (h == 0) {
1546             h = hashIgnoringCase(0, scheme);
1547             h = hash(h, fragment);
1548             if (isOpaque()) {
1549                 h = hash(h, schemeSpecificPart);
1550             } else {
1551                 h = hash(h, path);
1552                 h = hash(h, query);
1553                 if (host != null) {
1554                     h = hash(h, userInfo);
1555                     h = hashIgnoringCase(h, host);
1556                     h += 1949 * port;
1557                 } else {
1558                     h = hash(h, authority);
1559                 }
1560             }
1561             if (h != 0) {
1562                 hash = h;
1563             }
1564         }
1565         return h;
1566     }
1567 
1568     /**
1569      * Compares this URI to another object, which must be a URI.
1570      *
1571      * <p> When comparing corresponding components of two URIs, if one
1572      * component is undefined but the other is defined then the first is
1573      * considered to be less than the second.  Unless otherwise noted, string
1574      * components are ordered according to their natural, case-sensitive
1575      * ordering as defined by the {@link java.lang.String#compareTo(Object)
1576      * String.compareTo} method.  String components that are subject to
1577      * encoding are compared by comparing their raw forms rather than their
1578      * encoded forms.
1579      *
1580      * <p> The ordering of URIs is defined as follows: </p>
1581      *
1582      * <ul>
1583      *
     *   <li><p> Two URIs with different schemes are ordered according to the
1585      *   ordering of their schemes, without regard to case. </p></li>
1586      *
1587      *   <li><p> A hierarchical URI is considered to be less than an opaque URI
1588      *   with an identical scheme. </p></li>
1589      *
1590      *   <li><p> Two opaque URIs with identical schemes are ordered according
1591      *   to the ordering of their scheme-specific parts. </p></li>
1592      *
1593      *   <li><p> Two opaque URIs with identical schemes and scheme-specific
1594      *   parts are ordered according to the ordering of their
1595      *   fragments. </p></li>
1596      *
1597      *   <li><p> Two hierarchical URIs with identical schemes are ordered
1598      *   according to the ordering of their authority components: </p>
1599      *
1600      *   <ul>
1601      *
1602      *     <li><p> If both authority components are server-based then the URIs
1603      *     are ordered according to their user-information components; if these
1604      *     components are identical then the URIs are ordered according to the
1605      *     ordering of their hosts, without regard to case; if the hosts are
1606      *     identical then the URIs are ordered according to the ordering of
1607      *     their ports. </p></li>
1608      *
1609      *     <li><p> If one or both authority components are registry-based then
1610      *     the URIs are ordered according to the ordering of their authority
1611      *     components. </p></li>
1612      *
1613      *   </ul></li>
1614      *
1615      *   <li><p> Finally, two hierarchical URIs with identical schemes and
1616      *   authority components are ordered according to the ordering of their
1617      *   paths; if their paths are identical then they are ordered according to
1618      *   the ordering of their queries; if the queries are identical then they
1619      *   are ordered according to the order of their fragments. </p></li>
1620      *
1621      * </ul>
1622      *
1623      * <p> This method satisfies the general contract of the {@link
1624      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1625      * method. </p>
1626      *
1627      * @param   that
1628      *          The object to which this URI is to be compared
1629      *
1630      * @return  A negative integer, zero, or a positive integer as this URI is
1631      *          less than, equal to, or greater than the given URI
1632      *
1633      * @throws  ClassCastException
1634      *          If the given object is not a URI
1635      */
1636     public int compareTo(URI that) {
1637         int c;
1638 
1639         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1640             return c;
1641 
1642         if (this.isOpaque()) {
1643             if (that.isOpaque()) {
1644                 // Both opaque
1645                 if ((c = compare(this.schemeSpecificPart,
1646                                  that.schemeSpecificPart)) != 0)
1647                     return c;
1648                 return compare(this.fragment, that.fragment);
1649             }
1650             return +1;                  // Opaque > hierarchical
1651         } else if (that.isOpaque()) {
1652             return -1;                  // Hierarchical < opaque
1653         }
1654 
1655         // Hierarchical
1656         if ((this.host != null) && (that.host != null)) {
1657             // Both server-based
1658             if ((c = compare(this.userInfo, that.userInfo)) != 0)
1659                 return c;
1660             if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1661                 return c;
1662             if ((c = this.port - that.port) != 0)
1663                 return c;
1664         } else {
1665             // If one or both authorities are registry-based then we simply
1666             // compare them in the usual, case-sensitive way.  If one is
1667             // registry-based and one is server-based then the strings are
1668             // guaranteed to be unequal, hence the comparison will never return
1669             // zero and the compareTo and equals methods will remain
1670             // consistent.
1671             if ((c = compare(this.authority, that.authority)) != 0) return c;
1672         }
1673 
1674         if ((c = compare(this.path, that.path)) != 0) return c;
1675         if ((c = compare(this.query, that.query)) != 0) return c;
1676         return compare(this.fragment, that.fragment);
1677     }
1678 
1679     /**
1680      * Returns the content of this URI as a string.
1681      *
1682      * <p> If this URI was created by invoking one of the constructors in this
1683      * class then a string equivalent to the original input string, or to the
1684      * string computed from the originally-given components, as appropriate, is
1685      * returned.  Otherwise this URI was created by normalization, resolution,
1686      * or relativization, and so a string is constructed from this URI's
1687      * components according to the rules specified in <a
1688      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1689      * section&nbsp;5.2, step&nbsp;7. </p>
1690      *
1691      * @return  The string form of this URI
1692      */
1693     public String toString() {
1694         String s = string;
1695         if (s == null) {
1696             s = defineString();
1697         }
1698         return s;
1699     }
1700 
1701     private String defineString() {
1702         String s = string;
1703         if (s != null) {
1704             return s;
1705         }
1706 
1707         StringBuilder sb = new StringBuilder();
1708         if (scheme != null) {
1709             sb.append(scheme);
1710             sb.append(':');
1711         }
1712         if (isOpaque()) {
1713             sb.append(schemeSpecificPart);
1714         } else {
1715             if (host != null) {
1716                 sb.append("//");
1717                 if (userInfo != null) {
1718                     sb.append(userInfo);
1719                     sb.append('@');
1720                 }
1721                 boolean needBrackets = ((host.indexOf(':') >= 0)
1722                         && !host.startsWith("[")
1723                         && !host.endsWith("]"));
1724                 if (needBrackets) sb.append('[');
1725                 sb.append(host);
1726                 if (needBrackets) sb.append(']');
1727                 if (port != -1) {
1728                     sb.append(':');
1729                     sb.append(port);
1730                 }
1731             } else if (authority != null) {
1732                 sb.append("//");
1733                 sb.append(authority);
1734             }
1735             if (path != null)
1736                 sb.append(path);
1737             if (query != null) {
1738                 sb.append('?');
1739                 sb.append(query);
1740             }
1741         }
1742         if (fragment != null) {
1743             sb.append('#');
1744             sb.append(fragment);
1745         }
1746         return string = sb.toString();
1747     }
1748 
1749     /**
1750      * Returns the content of this URI as a US-ASCII string.
1751      *
1752      * <p> If this URI does not contain any characters in the <i>other</i>
1753      * category then an invocation of this method will return the same value as
1754      * an invocation of the {@link #toString() toString} method.  Otherwise
1755      * this method works as if by invoking that method and then <a
1756      * href="#encode">encoding</a> the result.  </p>
1757      *
1758      * @return  The string form of this URI, encoded as needed
1759      *          so that it only contains characters in the US-ASCII
1760      *          charset
1761      */
1762     public String toASCIIString() {
1763         return encode(toString());
1764     }
1765 
1766 
1767     // -- Serialization support --
1768 
1769     /**
1770      * Saves the content of this URI to the given serial stream.
1771      *
1772      * <p> The only serializable field of a URI instance is its {@code string}
1773      * field.  That field is given a value, if it does not have one already,
1774      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1775      * method of the given object-output stream is invoked. </p>
1776      *
1777      * @param  os  The object-output stream to which this object
1778      *             is to be written
1779      */
1780     private void writeObject(ObjectOutputStream os)
1781         throws IOException
1782     {
1783         defineString();
1784         os.defaultWriteObject();        // Writes the string field only
1785     }
1786 
1787     /**
1788      * Reconstitutes a URI from the given serial stream.
1789      *
1790      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1791      * invoked to read the value of the {@code string} field.  The result is
1792      * then parsed in the usual way.
1793      *
1794      * @param  is  The object-input stream from which this object
1795      *             is being read
1796      */
1797     private void readObject(ObjectInputStream is)
1798         throws ClassNotFoundException, IOException
1799     {
1800         port = -1;                      // Argh
1801         is.defaultReadObject();
1802         try {
1803             new Parser(string).parse(false);
1804         } catch (URISyntaxException x) {
1805             IOException y = new InvalidObjectException("Invalid URI");
1806             y.initCause(x);
1807             throw y;
1808         }
1809     }
1810 
1811 
1812     // -- End of public methods --
1813 
1814 
1815     // -- Utility methods for string-field comparison and hashing --
1816 
1817     // These methods return appropriate values for null string arguments,
1818     // thereby simplifying the equals, hashCode, and compareTo methods.
1819     //
1820     // The case-ignoring methods should only be applied to strings whose
1821     // characters are all known to be US-ASCII.  Because of this restriction,
1822     // these methods are faster than the similar methods in the String class.
1823 
1824     // US-ASCII only
1825     private static int toLower(char c) {
1826         if ((c >= 'A') && (c <= 'Z'))
1827             return c + ('a' - 'A');
1828         return c;
1829     }
1830 
1831     // US-ASCII only
1832     private static int toUpper(char c) {
1833         if ((c >= 'a') && (c <= 'z'))
1834             return c - ('a' - 'A');
1835         return c;
1836     }
1837 
1838     private static boolean equal(String s, String t) {
1839         if (s == t) return true;
1840         if ((s != null) && (t != null)) {
1841             if (s.length() != t.length())
1842                 return false;
1843             if (s.indexOf('%') < 0)
1844                 return s.equals(t);
1845             int n = s.length();
1846             for (int i = 0; i < n;) {
1847                 char c = s.charAt(i);
1848                 char d = t.charAt(i);
1849                 if (c != '%') {
1850                     if (c != d)
1851                         return false;
1852                     i++;
1853                     continue;
1854                 }
1855                 if (d != '%')
1856                     return false;
1857                 i++;
1858                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1859                     return false;
1860                 i++;
1861                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1862                     return false;
1863                 i++;
1864             }
1865             return true;
1866         }
1867         return false;
1868     }
1869 
1870     // US-ASCII only
1871     private static boolean equalIgnoringCase(String s, String t) {
1872         if (s == t) return true;
1873         if ((s != null) && (t != null)) {
1874             int n = s.length();
1875             if (t.length() != n)
1876                 return false;
1877             for (int i = 0; i < n; i++) {
1878                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1879                     return false;
1880             }
1881             return true;
1882         }
1883         return false;
1884     }
1885 
1886     private static int hash(int hash, String s) {
1887         if (s == null) return hash;
1888         return s.indexOf('%') < 0 ? hash * 127 + s.hashCode()
1889                                   : normalizedHash(hash, s);
1890     }
1891 
1892 
1893     private static int normalizedHash(int hash, String s) {
1894         int h = 0;
1895         for (int index = 0; index < s.length(); index++) {
1896             char ch = s.charAt(index);
1897             h = 31 * h + ch;
1898             if (ch == '%') {
1899                 /*
1900                  * Process the next two encoded characters
1901                  */
1902                 for (int i = index + 1; i < index + 3; i++)
1903                     h = 31 * h + toUpper(s.charAt(i));
1904                 index += 2;
1905             }
1906         }
1907         return hash * 127 + h;
1908     }
1909 
1910     // US-ASCII only
1911     private static int hashIgnoringCase(int hash, String s) {
1912         if (s == null) return hash;
1913         int h = hash;
1914         int n = s.length();
1915         for (int i = 0; i < n; i++)
1916             h = 31 * h + toLower(s.charAt(i));
1917         return h;
1918     }
1919 
1920     private static int compare(String s, String t) {
1921         if (s == t) return 0;
1922         if (s != null) {
1923             if (t != null)
1924                 return s.compareTo(t);
1925             else
1926                 return +1;
1927         } else {
1928             return -1;
1929         }
1930     }
1931 
1932     // US-ASCII only
1933     private static int compareIgnoringCase(String s, String t) {
1934         if (s == t) return 0;
1935         if (s != null) {
1936             if (t != null) {
1937                 int sn = s.length();
1938                 int tn = t.length();
1939                 int n = sn < tn ? sn : tn;
1940                 for (int i = 0; i < n; i++) {
1941                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1942                     if (c != 0)
1943                         return c;
1944                 }
1945                 return sn - tn;
1946             }
1947             return +1;
1948         } else {
1949             return -1;
1950         }
1951     }
1952 
1953 
1954     // -- String construction --
1955 
1956     // If a scheme is given then the path, if given, must be absolute
1957     //
1958     private static void checkPath(String s, String scheme, String path)
1959         throws URISyntaxException
1960     {
1961         if (scheme != null) {
1962             if (path != null && !path.isEmpty() && path.charAt(0) != '/')
1963                 throw new URISyntaxException(s, "Relative path in absolute URI");
1964         }
1965     }
1966 
1967     private void appendAuthority(StringBuilder sb,
1968                                  String authority,
1969                                  String userInfo,
1970                                  String host,
1971                                  int port)
1972     {
1973         if (host != null) {
1974             sb.append("//");
1975             if (userInfo != null) {
1976                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1977                 sb.append('@');
1978             }
1979             boolean needBrackets = ((host.indexOf(':') >= 0)
1980                                     && !host.startsWith("[")
1981                                     && !host.endsWith("]"));
1982             if (needBrackets) sb.append('[');
1983             sb.append(host);
1984             if (needBrackets) sb.append(']');
1985             if (port != -1) {
1986                 sb.append(':');
1987                 sb.append(port);
1988             }
1989         } else if (authority != null) {
1990             sb.append("//");
1991             if (authority.startsWith("[")) {
1992                 // authority should (but may not) contain an embedded IPv6 address
1993                 int end = authority.indexOf(']');
1994                 String doquote = authority, dontquote = "";
1995                 if (end != -1 && authority.indexOf(':') != -1) {
1996                     // the authority contains an IPv6 address
1997                     if (end == authority.length()) {
1998                         dontquote = authority;
1999                         doquote = "";
2000                     } else {
2001                         dontquote = authority.substring(0 , end + 1);
2002                         doquote = authority.substring(end + 1);
2003                     }
2004                 }
2005                 sb.append(dontquote);
2006                 sb.append(quote(doquote,
2007                             L_REG_NAME | L_SERVER,
2008                             H_REG_NAME | H_SERVER));
2009             } else {
2010                 sb.append(quote(authority,
2011                             L_REG_NAME | L_SERVER,
2012                             H_REG_NAME | H_SERVER));
2013             }
2014         }
2015     }
2016 
2017     private void appendSchemeSpecificPart(StringBuilder sb,
2018                                           String opaquePart,
2019                                           String authority,
2020                                           String userInfo,
2021                                           String host,
2022                                           int port,
2023                                           String path,
2024                                           String query)
2025     {
2026         if (opaquePart != null) {
2027             /* check if SSP begins with an IPv6 address
2028              * because we must not quote a literal IPv6 address
2029              */
2030             if (opaquePart.startsWith("//[")) {
2031                 int end =  opaquePart.indexOf(']');
2032                 if (end != -1 && opaquePart.indexOf(':')!=-1) {
2033                     String doquote, dontquote;
2034                     if (end == opaquePart.length()) {
2035                         dontquote = opaquePart;
2036                         doquote = "";
2037                     } else {
2038                         dontquote = opaquePart.substring(0,end+1);
2039                         doquote = opaquePart.substring(end+1);
2040                     }
2041                     sb.append (dontquote);
2042                     sb.append(quote(doquote, L_URIC, H_URIC));
2043                 }
2044             } else {
2045                 sb.append(quote(opaquePart, L_URIC, H_URIC));
2046             }
2047         } else {
2048             appendAuthority(sb, authority, userInfo, host, port);
2049             if (path != null)
2050                 sb.append(quote(path, L_PATH, H_PATH));
2051             if (query != null) {
2052                 sb.append('?');
2053                 sb.append(quote(query, L_URIC, H_URIC));
2054             }
2055         }
2056     }
2057 
2058     private void appendFragment(StringBuilder sb, String fragment) {
2059         if (fragment != null) {
2060             sb.append('#');
2061             sb.append(quote(fragment, L_URIC, H_URIC));
2062         }
2063     }
2064 
2065     private String toString(String scheme,
2066                             String opaquePart,
2067                             String authority,
2068                             String userInfo,
2069                             String host,
2070                             int port,
2071                             String path,
2072                             String query,
2073                             String fragment)
2074     {
2075         StringBuilder sb = new StringBuilder();
2076         if (scheme != null) {
2077             sb.append(scheme);
2078             sb.append(':');
2079         }
2080         appendSchemeSpecificPart(sb, opaquePart,
2081                                  authority, userInfo, host, port,
2082                                  path, query);
2083         appendFragment(sb, fragment);
2084         return sb.toString();
2085     }
2086 
2087     // -- Normalization, resolution, and relativization --
2088 
2089     // RFC2396 5.2 (6)
2090     private static String resolvePath(String base, String child,
2091                                       boolean absolute)
2092     {
2093         int i = base.lastIndexOf('/');
2094         int cn = child.length();
2095         String path = "";
2096 
2097         if (cn == 0) {
2098             // 5.2 (6a)
2099             if (i >= 0)
2100                 path = base.substring(0, i + 1);
2101         } else {
2102             StringBuilder sb = new StringBuilder(base.length() + cn);
2103             // 5.2 (6a)
2104             if (i >= 0)
2105                 sb.append(base, 0, i + 1);
2106             // 5.2 (6b)
2107             sb.append(child);
2108             path = sb.toString();
2109         }
2110 
2111         // 5.2 (6c-f)
2112         String np = normalize(path);
2113 
2114         // 5.2 (6g): If the result is absolute but the path begins with "../",
2115         // then we simply leave the path as-is
2116 
2117         return np;
2118     }
2119 
2120     // RFC2396 5.2
2121     private static URI resolve(URI base, URI child) {
2122         // check if child if opaque first so that NPE is thrown
2123         // if child is null.
2124         if (child.isOpaque() || base.isOpaque())
2125             return child;
2126 
2127         // 5.2 (2): Reference to current document (lone fragment)
2128         if ((child.scheme == null) && (child.authority == null)
2129             && child.path.isEmpty() && (child.fragment != null)
2130             && (child.query == null)) {
2131             if ((base.fragment != null)
2132                 && child.fragment.equals(base.fragment)) {
2133                 return base;
2134             }
2135             URI ru = new URI();
2136             ru.scheme = base.scheme;
2137             ru.authority = base.authority;
2138             ru.userInfo = base.userInfo;
2139             ru.host = base.host;
2140             ru.port = base.port;
2141             ru.path = base.path;
2142             ru.fragment = child.fragment;
2143             ru.query = base.query;
2144             return ru;
2145         }
2146 
2147         // 5.2 (3): Child is absolute
2148         if (child.scheme != null)
2149             return child;
2150 
2151         URI ru = new URI();             // Resolved URI
2152         ru.scheme = base.scheme;
2153         ru.query = child.query;
2154         ru.fragment = child.fragment;
2155 
2156         // 5.2 (4): Authority
2157         if (child.authority == null) {
2158             ru.authority = base.authority;
2159             ru.host = base.host;
2160             ru.userInfo = base.userInfo;
2161             ru.port = base.port;
2162 
2163             String cp = (child.path == null) ? "" : child.path;
2164             if (!cp.isEmpty() && cp.charAt(0) == '/') {
2165                 // 5.2 (5): Child path is absolute
2166                 ru.path = child.path;
2167             } else {
2168                 // 5.2 (6): Resolve relative path
2169                 ru.path = resolvePath(base.path, cp, base.isAbsolute());
2170             }
2171         } else {
2172             ru.authority = child.authority;
2173             ru.host = child.host;
2174             ru.userInfo = child.userInfo;
2175             ru.host = child.host;
2176             ru.port = child.port;
2177             ru.path = child.path;
2178         }
2179 
2180         // 5.2 (7): Recombine (nothing to do here)
2181         return ru;
2182     }
2183 
2184     // If the given URI's path is normal then return the URI;
2185     // o.w., return a new URI containing the normalized path.
2186     //
2187     private static URI normalize(URI u) {
2188         if (u.isOpaque() || u.path == null || u.path.isEmpty())
2189             return u;
2190 
2191         String np = normalize(u.path);
2192         if (np == u.path)
2193             return u;
2194 
2195         URI v = new URI();
2196         v.scheme = u.scheme;
2197         v.fragment = u.fragment;
2198         v.authority = u.authority;
2199         v.userInfo = u.userInfo;
2200         v.host = u.host;
2201         v.port = u.port;
2202         v.path = np;
2203         v.query = u.query;
2204         return v;
2205     }
2206 
2207     // If both URIs are hierarchical, their scheme and authority components are
2208     // identical, and the base path is a prefix of the child's path, then
2209     // return a relative URI that, when resolved against the base, yields the
2210     // child; otherwise, return the child.
2211     //
2212     private static URI relativize(URI base, URI child) {
2213         // check if child if opaque first so that NPE is thrown
2214         // if child is null.
2215         if (child.isOpaque() || base.isOpaque())
2216             return child;
2217         if (!equalIgnoringCase(base.scheme, child.scheme)
2218             || !equal(base.authority, child.authority))
2219             return child;
2220 
2221         String bp = normalize(base.path);
2222         String cp = normalize(child.path);
2223         if (!bp.equals(cp)) {
2224             if (!bp.endsWith("/"))
2225                 bp = bp + "/";
2226             if (!cp.startsWith(bp))
2227                 return child;
2228         }
2229 
2230         URI v = new URI();
2231         v.path = cp.substring(bp.length());
2232         v.query = child.query;
2233         v.fragment = child.fragment;
2234         return v;
2235     }
2236 
2237 
2238 
2239     // -- Path normalization --
2240 
2241     // The following algorithm for path normalization avoids the creation of a
2242     // string object for each segment, as well as the use of a string buffer to
2243     // compute the final result, by using a single char array and editing it in
2244     // place.  The array is first split into segments, replacing each slash
2245     // with '\0' and creating a segment-index array, each element of which is
2246     // the index of the first char in the corresponding segment.  We then walk
2247     // through both arrays, removing ".", "..", and other segments as necessary
2248     // by setting their entries in the index array to -1.  Finally, the two
2249     // arrays are used to rejoin the segments and compute the final result.
2250     //
2251     // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2252 
2253 
2254     // Check the given path to see if it might need normalization.  A path
2255     // might need normalization if it contains duplicate slashes, a "."
2256     // segment, or a ".." segment.  Return -1 if no further normalization is
2257     // possible, otherwise return the number of segments found.
2258     //
2259     // This method takes a string argument rather than a char array so that
2260     // this test can be performed without invoking path.toCharArray().
2261     //
2262     private static int needsNormalization(String path) {
2263         boolean normal = true;
2264         int ns = 0;                     // Number of segments
2265         int end = path.length() - 1;    // Index of last char in path
2266         int p = 0;                      // Index of next char in path
2267 
2268         // Skip initial slashes
2269         while (p <= end) {
2270             if (path.charAt(p) != '/') break;
2271             p++;
2272         }
2273         if (p > 1) normal = false;
2274 
2275         // Scan segments
2276         while (p <= end) {
2277 
2278             // Looking at "." or ".." ?
2279             if ((path.charAt(p) == '.')
2280                 && ((p == end)
2281                     || ((path.charAt(p + 1) == '/')
2282                         || ((path.charAt(p + 1) == '.')
2283                             && ((p + 1 == end)
2284                                 || (path.charAt(p + 2) == '/')))))) {
2285                 normal = false;
2286             }
2287             ns++;
2288 
2289             // Find beginning of next segment
2290             while (p <= end) {
2291                 if (path.charAt(p++) != '/')
2292                     continue;
2293 
2294                 // Skip redundant slashes
2295                 while (p <= end) {
2296                     if (path.charAt(p) != '/') break;
2297                     normal = false;
2298                     p++;
2299                 }
2300 
2301                 break;
2302             }
2303         }
2304 
2305         return normal ? -1 : ns;
2306     }
2307 
2308 
2309     // Split the given path into segments, replacing slashes with nulls and
2310     // filling in the given segment-index array.
2311     //
2312     // Preconditions:
2313     //   segs.length == Number of segments in path
2314     //
2315     // Postconditions:
2316     //   All slashes in path replaced by '\0'
2317     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
2318     //
2319     private static void split(char[] path, int[] segs) {
2320         int end = path.length - 1;      // Index of last char in path
2321         int p = 0;                      // Index of next char in path
2322         int i = 0;                      // Index of current segment
2323 
2324         // Skip initial slashes
2325         while (p <= end) {
2326             if (path[p] != '/') break;
2327             path[p] = '\0';
2328             p++;
2329         }
2330 
2331         while (p <= end) {
2332 
2333             // Note start of segment
2334             segs[i++] = p++;
2335 
2336             // Find beginning of next segment
2337             while (p <= end) {
2338                 if (path[p++] != '/')
2339                     continue;
2340                 path[p - 1] = '\0';
2341 
2342                 // Skip redundant slashes
2343                 while (p <= end) {
2344                     if (path[p] != '/') break;
2345                     path[p++] = '\0';
2346                 }
2347                 break;
2348             }
2349         }
2350 
2351         if (i != segs.length)
2352             throw new InternalError();  // ASSERT
2353     }
2354 
2355 
2356     // Join the segments in the given path according to the given segment-index
2357     // array, ignoring those segments whose index entries have been set to -1,
2358     // and inserting slashes as needed.  Return the length of the resulting
2359     // path.
2360     //
2361     // Preconditions:
2362     //   segs[i] == -1 implies segment i is to be ignored
2363     //   path computed by split, as above, with '\0' having replaced '/'
2364     //
2365     // Postconditions:
    //   path[0] .. path[return value - 1] == Resulting path
2367     //
2368     private static int join(char[] path, int[] segs) {
2369         int ns = segs.length;           // Number of segments
2370         int end = path.length - 1;      // Index of last char in path
2371         int p = 0;                      // Index of next path char to write
2372 
2373         if (path[p] == '\0') {
2374             // Restore initial slash for absolute paths
2375             path[p++] = '/';
2376         }
2377 
2378         for (int i = 0; i < ns; i++) {
2379             int q = segs[i];            // Current segment
2380             if (q == -1)
2381                 // Ignore this segment
2382                 continue;
2383 
2384             if (p == q) {
2385                 // We're already at this segment, so just skip to its end
2386                 while ((p <= end) && (path[p] != '\0'))
2387                     p++;
2388                 if (p <= end) {
2389                     // Preserve trailing slash
2390                     path[p++] = '/';
2391                 }
2392             } else if (p < q) {
2393                 // Copy q down to p
2394                 while ((q <= end) && (path[q] != '\0'))
2395                     path[p++] = path[q++];
2396                 if (q <= end) {
2397                     // Preserve trailing slash
2398                     path[p++] = '/';
2399                 }
2400             } else
2401                 throw new InternalError(); // ASSERT false
2402         }
2403 
2404         return p;
2405     }
2406 
2407 
2408     // Remove "." segments from the given path, and remove segment pairs
2409     // consisting of a non-".." segment followed by a ".." segment.
2410     //
2411     private static void removeDots(char[] path, int[] segs) {
2412         int ns = segs.length;
2413         int end = path.length - 1;
2414 
2415         for (int i = 0; i < ns; i++) {
2416             int dots = 0;               // Number of dots found (0, 1, or 2)
2417 
2418             // Find next occurrence of "." or ".."
2419             do {
2420                 int p = segs[i];
2421                 if (path[p] == '.') {
2422                     if (p == end) {
2423                         dots = 1;
2424                         break;
2425                     } else if (path[p + 1] == '\0') {
2426                         dots = 1;
2427                         break;
2428                     } else if ((path[p + 1] == '.')
2429                                && ((p + 1 == end)
2430                                    || (path[p + 2] == '\0'))) {
2431                         dots = 2;
2432                         break;
2433                     }
2434                 }
2435                 i++;
2436             } while (i < ns);
2437             if ((i > ns) || (dots == 0))
2438                 break;
2439 
2440             if (dots == 1) {
2441                 // Remove this occurrence of "."
2442                 segs[i] = -1;
2443             } else {
2444                 // If there is a preceding non-".." segment, remove both that
2445                 // segment and this occurrence of ".."; otherwise, leave this
2446                 // ".." segment as-is.
2447                 int j;
2448                 for (j = i - 1; j >= 0; j--) {
2449                     if (segs[j] != -1) break;
2450                 }
2451                 if (j >= 0) {
2452                     int q = segs[j];
2453                     if (!((path[q] == '.')
2454                           && (path[q + 1] == '.')
2455                           && (path[q + 2] == '\0'))) {
2456                         segs[i] = -1;
2457                         segs[j] = -1;
2458                     }
2459                 }
2460             }
2461         }
2462     }
2463 
2464 
2465     // DEVIATION: If the normalized path is relative, and if the first
2466     // segment could be parsed as a scheme name, then prepend a "." segment
2467     //
2468     private static void maybeAddLeadingDot(char[] path, int[] segs) {
2469 
2470         if (path[0] == '\0')
2471             // The path is absolute
2472             return;
2473 
2474         int ns = segs.length;
2475         int f = 0;                      // Index of first segment
2476         while (f < ns) {
2477             if (segs[f] >= 0)
2478                 break;
2479             f++;
2480         }
2481         if ((f >= ns) || (f == 0))
2482             // The path is empty, or else the original first segment survived,
2483             // in which case we already know that no leading "." is needed
2484             return;
2485 
2486         int p = segs[f];
2487         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2488         if (p >= path.length || path[p] == '\0')
2489             // No colon in first segment, so no "." needed
2490             return;
2491 
2492         // At this point we know that the first segment is unused,
2493         // hence we can insert a "." segment at that position
2494         path[0] = '.';
2495         path[1] = '\0';
2496         segs[0] = 0;
2497     }
2498 
2499 
2500     // Normalize the given path string.  A normal path string has no empty
2501     // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2502     // segments equal to ".." that are preceded by a segment not equal to "..".
2503     // In contrast to Unix-style pathname normalization, for URI paths we
2504     // always retain trailing slashes.
2505     //
2506     private static String normalize(String ps) {
2507 
2508         // Does this path need normalization?
2509         int ns = needsNormalization(ps);        // Number of segments
2510         if (ns < 0)
2511             // Nope -- just return it
2512             return ps;
2513 
2514         char[] path = ps.toCharArray();         // Path in char-array form
2515 
2516         // Split path into segments
2517         int[] segs = new int[ns];               // Segment-index array
2518         split(path, segs);
2519 
2520         // Remove dots
2521         removeDots(path, segs);
2522 
2523         // Prevent scheme-name confusion
2524         maybeAddLeadingDot(path, segs);
2525 
2526         // Join the remaining segments and return the result
2527         String s = new String(path, 0, join(path, segs));
2528         if (s.equals(ps)) {
2529             // string was already normalized
2530             return ps;
2531         }
2532         return s;
2533     }
2534 
2535 
2536 
2537     // -- Character classes for parsing --
2538 
2539     // RFC2396 precisely specifies which characters in the US-ASCII charset are
2540     // permissible in the various components of a URI reference.  We here
2541     // define a set of mask pairs to aid in enforcing these restrictions.  Each
2542     // mask pair consists of two longs, a low mask and a high mask.  Taken
2543     // together they represent a 128-bit mask, where bit i is set iff the
2544     // character with value i is permitted.
2545     //
2546     // This approach is more efficient than sequentially searching arrays of
2547     // permitted characters.  It could be made still more efficient by
2548     // precompiling the mask information so that a character's presence in a
2549     // given mask could be determined by a single table lookup.
2550 
2551     // To save startup time, we manually calculate the low-/highMask constants.
2552     // For reference, the following methods were used to calculate the values:
2553 
2554     // Compute the low-order mask for the characters in the given string
2555     //     private static long lowMask(String chars) {
2556     //        int n = chars.length();
2557     //        long m = 0;
2558     //        for (int i = 0; i < n; i++) {
2559     //            char c = chars.charAt(i);
2560     //            if (c < 64)
2561     //                m |= (1L << c);
2562     //        }
2563     //        return m;
2564     //    }
2565 
2566     // Compute the high-order mask for the characters in the given string
2567     //    private static long highMask(String chars) {
2568     //        int n = chars.length();
2569     //        long m = 0;
2570     //        for (int i = 0; i < n; i++) {
2571     //            char c = chars.charAt(i);
2572     //            if ((c >= 64) && (c < 128))
2573     //                m |= (1L << (c - 64));
2574     //        }
2575     //        return m;
2576     //    }
2577 
2578     // Compute a low-order mask for the characters
2579     // between first and last, inclusive
2580     //    private static long lowMask(char first, char last) {
2581     //        long m = 0;
2582     //        int f = Math.max(Math.min(first, 63), 0);
2583     //        int l = Math.max(Math.min(last, 63), 0);
2584     //        for (int i = f; i <= l; i++)
2585     //            m |= 1L << i;
2586     //        return m;
2587     //    }
2588 
2589     // Compute a high-order mask for the characters
2590     // between first and last, inclusive
2591     //    private static long highMask(char first, char last) {
2592     //        long m = 0;
2593     //        int f = Math.max(Math.min(first, 127), 64) - 64;
2594     //        int l = Math.max(Math.min(last, 127), 64) - 64;
2595     //        for (int i = f; i <= l; i++)
2596     //            m |= 1L << i;
2597     //        return m;
2598     //    }
2599 
2600     // Tell whether the given character is permitted by the given mask pair
2601     private static boolean match(char c, long lowMask, long highMask) {
2602         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
2603             return false;
2604         if (c < 64)
2605             return ((1L << c) & lowMask) != 0;
2606         if (c < 128)
2607             return ((1L << (c - 64)) & highMask) != 0;
2608         return false;
2609     }
2610 
2611     // Character-class masks, in reverse order from RFC2396 because
2612     // initializers for static fields cannot make forward references.
2613 
2614     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
2615     //            "8" | "9"
2616     private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9');
2617     private static final long H_DIGIT = 0L;
2618 
2619     // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
2620     //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
2621     //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
2622     private static final long L_UPALPHA = 0L;
2623     private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z');
2624 
2625     // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
2626     //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
2627     //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
2628     private static final long L_LOWALPHA = 0L;
2629     private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z');
2630 
2631     // alpha         = lowalpha | upalpha
2632     private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
2633     private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
2634 
2635     // alphanum      = alpha | digit
2636     private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
2637     private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
2638 
2639     // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
2640     //                         "a" | "b" | "c" | "d" | "e" | "f"
2641     private static final long L_HEX = L_DIGIT;
2642     private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f');
2643 
2644     // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
2645     //                 "(" | ")"
2646     private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");
2647     private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");
2648 
2649     // unreserved    = alphanum | mark
2650     private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
2651     private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
2652 
2653     // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
2654     //                 "$" | "," | "[" | "]"
2655     // Added per RFC2732: "[", "]"
2656     private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");
2657     private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");
2658 
2659     // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
2660     // characters are allowed; this is handled by the scanEscape method below.
2661     private static final long L_ESCAPED = 1L;
2662     private static final long H_ESCAPED = 0L;
2663 
2664     // uric          = reserved | unreserved | escaped
2665     private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
2666     private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
2667 
2668     // pchar         = unreserved | escaped |
2669     //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
2670     private static final long L_PCHAR
2671         = L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");
2672     private static final long H_PCHAR
2673         = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");
2674 
2675     // All valid path characters
2676     private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");
2677     private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;
2678 
2679     // Dash, for use in domainlabel and toplabel
2680     private static final long L_DASH = 0x200000000000L; // lowMask("-");
2681     private static final long H_DASH = 0x0L; // highMask("-");
2682 
2683     // Dot, for use in hostnames
2684     private static final long L_DOT = 0x400000000000L; // lowMask(".");
2685     private static final long H_DOT = 0x0L; // highMask(".");
2686 
2687     // userinfo      = *( unreserved | escaped |
2688     //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
2689     private static final long L_USERINFO
2690         = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");
2691     private static final long H_USERINFO
2692         = H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;
2693 
2694     // reg_name      = 1*( unreserved | escaped | "$" | "," |
2695     //                     ";" | ":" | "@" | "&" | "=" | "+" )
2696     private static final long L_REG_NAME
2697         = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");
2698     private static final long H_REG_NAME
2699         = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");
2700 
2701     // All valid characters for server-based authorities
2702     private static final long L_SERVER
2703         = L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");
2704     private static final long H_SERVER
2705         = H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");
2706 
2707     // Special case of server authority that represents an IPv6 address
2708     // In this case, a % does not signify an escape sequence
2709     private static final long L_SERVER_PERCENT
2710         = L_SERVER | 0x2000000000L; // lowMask("%");
2711     private static final long H_SERVER_PERCENT
2712         = H_SERVER; // | highMask("%") == 0L;
2713 
2714     // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
2715     private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-.");
2716     private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L
2717 
2718     // scope_id = alpha | digit | "_" | "."
2719     private static final long L_SCOPE_ID
2720         = L_ALPHANUM | 0x400000000000L; // lowMask("_.");
2721     private static final long H_SCOPE_ID
2722         = H_ALPHANUM | 0x80000000L; // highMask("_.");
2723 
2724     // -- Escaping and encoding --
2725 
2726     private static final char[] hexDigits = {
2727         '0', '1', '2', '3', '4', '5', '6', '7',
2728         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2729     };
2730 
2731     private static void appendEscape(StringBuilder sb, byte b) {
2732         sb.append('%');
2733         sb.append(hexDigits[(b >> 4) & 0x0f]);
2734         sb.append(hexDigits[(b >> 0) & 0x0f]);
2735     }
2736 
2737     private static void appendEncoded(StringBuilder sb, char c) {
2738         ByteBuffer bb = null;
2739         try {
2740             bb = ThreadLocalCoders.encoderFor("UTF-8")
2741                 .encode(CharBuffer.wrap("" + c));
2742         } catch (CharacterCodingException x) {
2743             assert false;
2744         }
2745         while (bb.hasRemaining()) {
2746             int b = bb.get() & 0xff;
2747             if (b >= 0x80)
2748                 appendEscape(sb, (byte)b);
2749             else
2750                 sb.append((char)b);
2751         }
2752     }
2753 
2754     // Quote any characters in s that are not permitted
2755     // by the given mask pair
2756     //
2757     private static String quote(String s, long lowMask, long highMask) {
2758         StringBuilder sb = null;
2759         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2760         for (int i = 0; i < s.length(); i++) {
2761             char c = s.charAt(i);
2762             if (c < '\u0080') {
2763                 if (!match(c, lowMask, highMask)) {
2764                     if (sb == null) {
2765                         sb = new StringBuilder();
2766                         sb.append(s, 0, i);
2767                     }
2768                     appendEscape(sb, (byte)c);
2769                 } else {
2770                     if (sb != null)
2771                         sb.append(c);
2772                 }
2773             } else if (allowNonASCII
2774                        && (Character.isSpaceChar(c)
2775                            || Character.isISOControl(c))) {
2776                 if (sb == null) {
2777                     sb = new StringBuilder();
2778                     sb.append(s, 0, i);
2779                 }
2780                 appendEncoded(sb, c);
2781             } else {
2782                 if (sb != null)
2783                     sb.append(c);
2784             }
2785         }
2786         return (sb == null) ? s : sb.toString();
2787     }
2788 
2789     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2790     // assuming that s is otherwise legal
2791     //
2792     private static String encode(String s) {
2793         int n = s.length();
2794         if (n == 0)
2795             return s;
2796 
2797         // First check whether we actually need to encode
2798         for (int i = 0;;) {
2799             if (s.charAt(i) >= '\u0080')
2800                 break;
2801             if (++i >= n)
2802                 return s;
2803         }
2804 
2805         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
2806         ByteBuffer bb = null;
2807         try {
2808             bb = ThreadLocalCoders.encoderFor("UTF-8")
2809                 .encode(CharBuffer.wrap(ns));
2810         } catch (CharacterCodingException x) {
2811             assert false;
2812         }
2813 
2814         StringBuilder sb = new StringBuilder();
2815         while (bb.hasRemaining()) {
2816             int b = bb.get() & 0xff;
2817             if (b >= 0x80)
2818                 appendEscape(sb, (byte)b);
2819             else
2820                 sb.append((char)b);
2821         }
2822         return sb.toString();
2823     }
2824 
2825     private static int decode(char c) {
2826         if ((c >= '0') && (c <= '9'))
2827             return c - '0';
2828         if ((c >= 'a') && (c <= 'f'))
2829             return c - 'a' + 10;
2830         if ((c >= 'A') && (c <= 'F'))
2831             return c - 'A' + 10;
2832         assert false;
2833         return -1;
2834     }
2835 
2836     private static byte decode(char c1, char c2) {
2837         return (byte)(  ((decode(c1) & 0xf) << 4)
2838                       | ((decode(c2) & 0xf) << 0));
2839     }
2840 
2841     // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
2842     // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
2843     // sequence of escaped octets is not valid UTF-8 then the erroneous octets
2844     // are replaced with '\uFFFD'.
2845     // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2846     //            with a scope_id
2847     //
    private static String decode(String s) {
        // Delegate with ignorePercentInBrackets == true, so that a "%"
        // found between "[" and "]" is copied through rather than decoded
        return decode(s, true);
    }
2851 
2852     // This method was introduced as a generalization of URI.decode method
2853     // to provide a fix for JDK-8037396
2854     private static String decode(String s, boolean ignorePercentInBrackets) {
2855         if (s == null)
2856             return s;
2857         int n = s.length();
2858         if (n == 0)
2859             return s;
2860         if (s.indexOf('%') < 0)
2861             return s;
2862 
2863         StringBuilder sb = new StringBuilder(n);
2864         ByteBuffer bb = ByteBuffer.allocate(n);
2865         CharBuffer cb = CharBuffer.allocate(n);
2866         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
2867                 .onMalformedInput(CodingErrorAction.REPLACE)
2868                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
2869 
2870         // This is not horribly efficient, but it will do for now
2871         char c = s.charAt(0);
2872         boolean betweenBrackets = false;
2873 
2874         for (int i = 0; i < n;) {
2875             assert c == s.charAt(i);    // Loop invariant
2876             if (c == '[') {
2877                 betweenBrackets = true;
2878             } else if (betweenBrackets && c == ']') {
2879                 betweenBrackets = false;
2880             }
2881             if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) {
2882                 sb.append(c);
2883                 if (++i >= n)
2884                     break;
2885                 c = s.charAt(i);
2886                 continue;
2887             }
2888             bb.clear();
2889             int ui = i;
2890             for (;;) {
2891                 assert (n - i >= 2);
2892                 bb.put(decode(s.charAt(++i), s.charAt(++i)));
2893                 if (++i >= n)
2894                     break;
2895                 c = s.charAt(i);
2896                 if (c != '%')
2897                     break;
2898             }
2899             bb.flip();
2900             cb.clear();
2901             dec.reset();
2902             CoderResult cr = dec.decode(bb, cb, true);
2903             assert cr.isUnderflow();
2904             cr = dec.flush(cb);
2905             assert cr.isUnderflow();
2906             sb.append(cb.flip().toString());
2907         }
2908 
2909         return sb.toString();
2910     }
2911 
2912 
2913     // -- Parsing --
2914 
2915     // For convenience we wrap the input URI string in a new instance of the
2916     // following internal class.  This saves always having to pass the input
2917     // string as an argument to each internal scan/parse method.
2918 
2919     private class Parser {
2920 
2921         private String input;           // URI input string
2922         private boolean requireServerAuthority = false;
2923 
2924         Parser(String s) {
2925             input = s;
2926             string = s;
2927         }
2928 
2929         // -- Methods for throwing URISyntaxException in various ways --
2930 
2931         private void fail(String reason) throws URISyntaxException {
2932             throw new URISyntaxException(input, reason);
2933         }
2934 
2935         private void fail(String reason, int p) throws URISyntaxException {
2936             throw new URISyntaxException(input, reason, p);
2937         }
2938 
2939         private void failExpecting(String expected, int p)
2940             throws URISyntaxException
2941         {
2942             fail("Expected " + expected, p);
2943         }
2944 
2945 
2946         // -- Simple access to the input string --
2947 
2948         // Tells whether start < end and, if so, whether charAt(start) == c
2949         //
2950         private boolean at(int start, int end, char c) {
2951             return (start < end) && (input.charAt(start) == c);
2952         }
2953 
2954         // Tells whether start + s.length() < end and, if so,
2955         // whether the chars at the start position match s exactly
2956         //
2957         private boolean at(int start, int end, String s) {
2958             int p = start;
2959             int sn = s.length();
2960             if (sn > end - p)
2961                 return false;
2962             int i = 0;
2963             while (i < sn) {
2964                 if (input.charAt(p++) != s.charAt(i)) {
2965                     break;
2966                 }
2967                 i++;
2968             }
2969             return (i == sn);
2970         }
2971 
2972 
2973         // -- Scanning --
2974 
2975         // The various scan and parse methods that follow use a uniform
2976         // convention of taking the current start position and end index as
2977         // their first two arguments.  The start is inclusive while the end is
2978         // exclusive, just as in the String class, i.e., a start/end pair
        // denotes the half-open interval [start, end) of the input string.
2980         //
2981         // These methods never proceed past the end position.  They may return
2982         // -1 to indicate outright failure, but more often they simply return
2983         // the position of the first char after the last char scanned.  Thus
2984         // a typical idiom is
2985         //
2986         //     int p = start;
2987         //     int q = scan(p, end, ...);
2988         //     if (q > p)
2989         //         // We scanned something
2990         //         ...;
2991         //     else if (q == p)
2992         //         // We scanned nothing
2993         //         ...;
2994         //     else if (q == -1)
2995         //         // Something went wrong
2996         //         ...;
2997 
2998 
2999         // Scan a specific char: If the char at the given start position is
3000         // equal to c, return the index of the next char; otherwise, return the
3001         // start position.
3002         //
3003         private int scan(int start, int end, char c) {
3004             if ((start < end) && (input.charAt(start) == c))
3005                 return start + 1;
3006             return start;
3007         }
3008 
3009         // Scan forward from the given start position.  Stop at the first char
3010         // in the err string (in which case -1 is returned), or the first char
3011         // in the stop string (in which case the index of the preceding char is
3012         // returned), or the end of the input string (in which case the length
3013         // of the input string is returned).  May return the start position if
3014         // nothing matches.
3015         //
3016         private int scan(int start, int end, String err, String stop) {
3017             int p = start;
3018             while (p < end) {
3019                 char c = input.charAt(p);
3020                 if (err.indexOf(c) >= 0)
3021                     return -1;
3022                 if (stop.indexOf(c) >= 0)
3023                     break;
3024                 p++;
3025             }
3026             return p;
3027         }
3028 
3029         // Scan forward from the given start position.  Stop at the first char
3030         // in the stop string (in which case the index of the preceding char is
3031         // returned), or the end of the input string (in which case the length
3032         // of the input string is returned).  May return the start position if
3033         // nothing matches.
3034         //
3035         private int scan(int start, int end, String stop) {
3036             int p = start;
3037             while (p < end) {
3038                 char c = input.charAt(p);
3039                 if (stop.indexOf(c) >= 0)
3040                     break;
3041                 p++;
3042             }
3043             return p;
3044         }
3045 
3046         // Scan a potential escape sequence, starting at the given position,
3047         // with the given first char (i.e., charAt(start) == c).
3048         //
3049         // This method assumes that if escapes are allowed then visible
3050         // non-US-ASCII chars are also allowed.
3051         //
3052         private int scanEscape(int start, int n, char first)
3053             throws URISyntaxException
3054         {
3055             int p = start;
3056             char c = first;
3057             if (c == '%') {
3058                 // Process escape pair
3059                 if ((p + 3 <= n)
3060                     && match(input.charAt(p + 1), L_HEX, H_HEX)
3061                     && match(input.charAt(p + 2), L_HEX, H_HEX)) {
3062                     return p + 3;
3063                 }
3064                 fail("Malformed escape pair", p);
3065             } else if ((c > 128)
3066                        && !Character.isSpaceChar(c)
3067                        && !Character.isISOControl(c)) {
3068                 // Allow unescaped but visible non-US-ASCII chars
3069                 return p + 1;
3070             }
3071             return p;
3072         }
3073 
3074         // Scan chars that match the given mask pair
3075         //
3076         private int scan(int start, int n, long lowMask, long highMask)
3077             throws URISyntaxException
3078         {
3079             int p = start;
3080             while (p < n) {
3081                 char c = input.charAt(p);
3082                 if (match(c, lowMask, highMask)) {
3083                     p++;
3084                     continue;
3085                 }
3086                 if ((lowMask & L_ESCAPED) != 0) {
3087                     int q = scanEscape(p, n, c);
3088                     if (q > p) {
3089                         p = q;
3090                         continue;
3091                     }
3092                 }
3093                 break;
3094             }
3095             return p;
3096         }
3097 
3098         // Check that each of the chars in [start, end) matches the given mask
3099         //
3100         private void checkChars(int start, int end,
3101                                 long lowMask, long highMask,
3102                                 String what)
3103             throws URISyntaxException
3104         {
3105             int p = scan(start, end, lowMask, highMask);
3106             if (p < end)
3107                 fail("Illegal character in " + what, p);
3108         }
3109 
3110         // Check that the char at position p matches the given mask
3111         //
3112         private void checkChar(int p,
3113                                long lowMask, long highMask,
3114                                String what)
3115             throws URISyntaxException
3116         {
3117             checkChars(p, p + 1, lowMask, highMask, what);
3118         }
3119 
3120 
3121         // -- Parsing --
3122 
3123         // [<scheme>:]<scheme-specific-part>[#<fragment>]
3124         //
3125         void parse(boolean rsa) throws URISyntaxException {
3126             requireServerAuthority = rsa;
3127             int n = input.length();
3128             int p = scan(0, n, "/?#", ":");
3129             if ((p >= 0) && at(p, n, ':')) {
3130                 if (p == 0)
3131                     failExpecting("scheme name", 0);
3132                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3133                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3134                 scheme = input.substring(0, p);
3135                 p++;                    // Skip ':'
3136                 if (at(p, n, '/')) {
3137                     p = parseHierarchical(p, n);
3138                 } else {
3139                     // opaque; need to create the schemeSpecificPart
3140                     int q = scan(p, n, "#");
3141                     if (q <= p)
3142                         failExpecting("scheme-specific part", p);
3143                     checkChars(p, q, L_URIC, H_URIC, "opaque part");
3144                     schemeSpecificPart = input.substring(p, q);
3145                     p = q;
3146                 }
3147             } else {
3148                 p = parseHierarchical(0, n);
3149             }
3150             if (at(p, n, '#')) {
3151                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3152                 fragment = input.substring(p + 1, n);
3153                 p = n;
3154             }
3155             if (p < n)
3156                 fail("end of URI", p);
3157         }
3158 
3159         // [//authority]<path>[?<query>]
3160         //
3161         // DEVIATION from RFC2396: We allow an empty authority component as
3162         // long as it's followed by a non-empty path, query component, or
3163         // fragment component.  This is so that URIs such as "file:///foo/bar"
3164         // will parse.  This seems to be the intent of RFC2396, though the
3165         // grammar does not permit it.  If the authority is empty then the
3166         // userInfo, host, and port components are undefined.
3167         //
3168         // DEVIATION from RFC2396: We allow empty relative paths.  This seems
3169         // to be the intent of RFC2396, but the grammar does not permit it.
3170         // The primary consequence of this deviation is that "#f" parses as a
3171         // relative URI with an empty path.
3172         //
3173         private int parseHierarchical(int start, int n)
3174             throws URISyntaxException
3175         {
3176             int p = start;
3177             if (at(p, n, '/') && at(p + 1, n, '/')) {
3178                 p += 2;
3179                 int q = scan(p, n, "/?#");
3180                 if (q > p) {
3181                     p = parseAuthority(p, q);
3182                 } else if (q < n) {
3183                     // DEVIATION: Allow empty authority prior to non-empty
3184                     // path, query component or fragment identifier
3185                 } else
3186                     failExpecting("authority", p);
3187             }
3188             int q = scan(p, n, "?#"); // DEVIATION: May be empty
3189             checkChars(p, q, L_PATH, H_PATH, "path");
3190             path = input.substring(p, q);
3191             p = q;
3192             if (at(p, n, '?')) {
3193                 p++;
3194                 q = scan(p, n, "#");
3195                 checkChars(p, q, L_URIC, H_URIC, "query");
3196                 query = input.substring(p, q);
3197                 p = q;
3198             }
3199             return p;
3200         }
3201 
3202         // authority     = server | reg_name
3203         //
3204         // Ambiguity: An authority that is a registry name rather than a server
3205         // might have a prefix that parses as a server.  We use the fact that
3206         // the authority component is always followed by '/' or the end of the
3207         // input string to resolve this: If the complete authority did not
3208         // parse as a server then we try to parse it as a registry name.
3209         //
3210         private int parseAuthority(int start, int n)
3211             throws URISyntaxException
3212         {
3213             int p = start;
3214             int q = p;
3215             URISyntaxException ex = null;
3216 
3217             boolean serverChars;
3218             boolean regChars;
3219 
3220             if (scan(p, n, "]") > p) {
3221                 // contains a literal IPv6 address, therefore % is allowed
3222                 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
3223             } else {
3224                 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
3225             }
3226             regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
3227 
3228             if (regChars && !serverChars) {
3229                 // Must be a registry-based authority
3230                 authority = input.substring(p, n);
3231                 return n;
3232             }
3233 
3234             if (serverChars) {
3235                 // Might be (probably is) a server-based authority, so attempt
3236                 // to parse it as such.  If the attempt fails, try to treat it
3237                 // as a registry-based authority.
3238                 try {
3239                     q = parseServer(p, n);
3240                     if (q < n)
3241                         failExpecting("end of authority", q);
3242                     authority = input.substring(p, n);
3243                 } catch (URISyntaxException x) {
3244                     // Undo results of failed parse
3245                     userInfo = null;
3246                     host = null;
3247                     port = -1;
3248                     if (requireServerAuthority) {
3249                         // If we're insisting upon a server-based authority,
3250                         // then just re-throw the exception
3251                         throw x;
3252                     } else {
3253                         // Save the exception in case it doesn't parse as a
3254                         // registry either
3255                         ex = x;
3256                         q = p;
3257                     }
3258                 }
3259             }
3260 
3261             if (q < n) {
3262                 if (regChars) {
3263                     // Registry-based authority
3264                     authority = input.substring(p, n);
3265                 } else if (ex != null) {
3266                     // Re-throw exception; it was probably due to
3267                     // a malformed IPv6 address
3268                     throw ex;
3269                 } else {
3270                     fail("Illegal character in authority", q);
3271                 }
3272             }
3273 
3274             return n;
3275         }
3276 
3277 
3278         // [<userinfo>@]<host>[:<port>]
3279         //
3280         private int parseServer(int start, int n)
3281             throws URISyntaxException
3282         {
3283             int p = start;
3284             int q;
3285 
3286             // userinfo
3287             q = scan(p, n, "/?#", "@");
3288             if ((q >= p) && at(q, n, '@')) {
3289                 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3290                 userInfo = input.substring(p, q);
3291                 p = q + 1;              // Skip '@'
3292             }
3293 
3294             // hostname, IPv4 address, or IPv6 address
3295             if (at(p, n, '[')) {
3296                 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3297                 p++;
3298                 q = scan(p, n, "/?#", "]");
3299                 if ((q > p) && at(q, n, ']')) {
3300                     // look for a "%" scope id
3301                     int r = scan (p, q, "%");
3302                     if (r > p) {
3303                         parseIPv6Reference(p, r);
3304                         if (r+1 == q) {
3305                             fail ("scope id expected");
3306                         }
3307                         checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
3308                                                 "scope id");
3309                     } else {
3310                         parseIPv6Reference(p, q);
3311                     }
3312                     host = input.substring(p-1, q+1);
3313                     p = q + 1;
3314                 } else {
3315                     failExpecting("closing bracket for IPv6 address", q);
3316                 }
3317             } else {
3318                 q = parseIPv4Address(p, n);
3319                 if (q <= p)
3320                     q = parseHostname(p, n);
3321                 p = q;
3322             }
3323 
3324             // port
3325             if (at(p, n, ':')) {
3326                 p++;
3327                 q = scan(p, n, "/");
3328                 if (q > p) {
3329                     checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3330                     try {
3331                         port = Integer.parseInt(input, p, q, 10);
3332                     } catch (NumberFormatException x) {
3333                         fail("Malformed port number", p);
3334                     }
3335                     p = q;
3336                 }
3337             }
3338             if (p < n)
3339                 failExpecting("port number", p);
3340 
3341             return p;
3342         }
3343 
3344         // Scan a string of decimal digits whose value fits in a byte
3345         //
3346         private int scanByte(int start, int n)
3347             throws URISyntaxException
3348         {
3349             int p = start;
3350             int q = scan(p, n, L_DIGIT, H_DIGIT);
3351             if (q <= p) return q;
3352             if (Integer.parseInt(input, p, q, 10) > 255) return p;
3353             return q;
3354         }
3355 
3356         // Scan an IPv4 address.
3357         //
3358         // If the strict argument is true then we require that the given
3359         // interval contain nothing besides an IPv4 address; if it is false
3360         // then we only require that it start with an IPv4 address.
3361         //
3362         // If the interval does not contain or start with (depending upon the
3363         // strict argument) a legal IPv4 address characters then we return -1
3364         // immediately; otherwise we insist that these characters parse as a
3365         // legal IPv4 address and throw an exception on failure.
3366         //
3367         // We assume that any string of decimal digits and dots must be an IPv4
3368         // address.  It won't parse as a hostname anyway, so making that
3369         // assumption here allows more meaningful exceptions to be thrown.
3370         //
3371         private int scanIPv4Address(int start, int n, boolean strict)
3372             throws URISyntaxException
3373         {
3374             int p = start;
3375             int q;
3376             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3377             if ((m <= p) || (strict && (m != n)))
3378                 return -1;
3379             for (;;) {
3380                 // Per RFC2732: At most three digits per byte
3381                 // Further constraint: Each element fits in a byte
3382                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3383                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3384                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3385                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3386                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3387                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3388                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3389                 if (q < m) break;
3390                 return q;
3391             }
3392             fail("Malformed IPv4 address", q);
3393             return -1;
3394         }
3395 
3396         // Take an IPv4 address: Throw an exception if the given interval
3397         // contains anything except an IPv4 address
3398         //
3399         private int takeIPv4Address(int start, int n, String expected)
3400             throws URISyntaxException
3401         {
3402             int p = scanIPv4Address(start, n, true);
3403             if (p <= start)
3404                 failExpecting(expected, start);
3405             return p;
3406         }
3407 
3408         // Attempt to parse an IPv4 address, returning -1 on failure but
3409         // allowing the given interval to contain [:<characters>] after
3410         // the IPv4 address.
3411         //
3412         private int parseIPv4Address(int start, int n) {
3413             int p;
3414 
3415             try {
3416                 p = scanIPv4Address(start, n, false);
3417             } catch (URISyntaxException x) {
3418                 return -1;
3419             } catch (NumberFormatException nfe) {
3420                 return -1;
3421             }
3422 
3423             if (p > start && p < n) {
3424                 // IPv4 address is followed by something - check that
3425                 // it's a ":" as this is the only valid character to
3426                 // follow an address.
3427                 if (input.charAt(p) != ':') {
3428                     p = -1;
3429                 }
3430             }
3431 
3432             if (p > start)
3433                 host = input.substring(start, p);
3434 
3435             return p;
3436         }
3437 
3438         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3439         // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
3440         // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
3441         //
3442         private int parseHostname(int start, int n)
3443             throws URISyntaxException
3444         {
3445             int p = start;
3446             int q;
3447             int l = -1;                 // Start of last parsed label
3448 
3449             do {
3450                 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3451                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3452                 if (q <= p)
3453                     break;
3454                 l = p;
3455                 if (q > p) {
3456                     p = q;
3457                     q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3458                     if (q > p) {
3459                         if (input.charAt(q - 1) == '-')
3460                             fail("Illegal character in hostname", q - 1);
3461                         p = q;
3462                     }
3463                 }
3464                 q = scan(p, n, '.');
3465                 if (q <= p)
3466                     break;
3467                 p = q;
3468             } while (p < n);
3469 
3470             if ((p < n) && !at(p, n, ':'))
3471                 fail("Illegal character in hostname", p);
3472 
3473             if (l < 0)
3474                 failExpecting("hostname", start);
3475 
3476             // for a fully qualified hostname check that the rightmost
3477             // label starts with an alpha character.
3478             if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
3479                 fail("Illegal character in hostname", l);
3480             }
3481 
3482             host = input.substring(start, p);
3483             return p;
3484         }
3485 
3486 
3487         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3488         //
3489         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3490         // the form ::12.34.56.78, which are clearly shown in the examples
3491         // earlier in the document.  Here is the original grammar:
3492         //
3493         //   IPv6address = hexpart [ ":" IPv4address ]
3494         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3495         //   hexseq      = hex4 *( ":" hex4)
3496         //   hex4        = 1*4HEXDIG
3497         //
3498         // We therefore use the following revised grammar:
3499         //
3500         //   IPv6address = hexseq [ ":" IPv4address ]
3501         //                 | hexseq [ "::" [ hexpost ] ]
3502         //                 | "::" [ hexpost ]
3503         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
3504         //   hexseq      = hex4 *( ":" hex4)
3505         //   hex4        = 1*4HEXDIG
3506         //
3507         // This covers all and only the following cases:
3508         //
3509         //   hexseq
3510         //   hexseq : IPv4address
3511         //   hexseq ::
3512         //   hexseq :: hexseq
3513         //   hexseq :: hexseq : IPv4address
3514         //   hexseq :: IPv4address
3515         //   :: hexseq
3516         //   :: hexseq : IPv4address
3517         //   :: IPv4address
3518         //   ::
3519         //
3520         // Additionally we constrain the IPv6 address as follows :-
3521         //
3522         //  i.  IPv6 addresses without compressed zeros should contain
3523         //      exactly 16 bytes.
3524         //
        //  ii. IPv6 addresses with compressed zeros should contain
        //      fewer than 16 bytes.
3527 
3528         private int ipv6byteCount = 0;
3529 
3530         private int parseIPv6Reference(int start, int n)
3531             throws URISyntaxException
3532         {
3533             int p = start;
3534             int q;
3535             boolean compressedZeros = false;
3536 
3537             q = scanHexSeq(p, n);
3538 
3539             if (q > p) {
3540                 p = q;
3541                 if (at(p, n, "::")) {
3542                     compressedZeros = true;
3543                     p = scanHexPost(p + 2, n);
3544                 } else if (at(p, n, ':')) {
3545                     p = takeIPv4Address(p + 1,  n, "IPv4 address");
3546                     ipv6byteCount += 4;
3547                 }
3548             } else if (at(p, n, "::")) {
3549                 compressedZeros = true;
3550                 p = scanHexPost(p + 2, n);
3551             }
3552             if (p < n)
3553                 fail("Malformed IPv6 address", start);
3554             if (ipv6byteCount > 16)
3555                 fail("IPv6 address too long", start);
3556             if (!compressedZeros && ipv6byteCount < 16)
3557                 fail("IPv6 address too short", start);
3558             if (compressedZeros && ipv6byteCount == 16)
3559                 fail("Malformed IPv6 address", start);
3560 
3561             return p;
3562         }
3563 
3564         private int scanHexPost(int start, int n)
3565             throws URISyntaxException
3566         {
3567             int p = start;
3568             int q;
3569 
3570             if (p == n)
3571                 return p;
3572 
3573             q = scanHexSeq(p, n);
3574             if (q > p) {
3575                 p = q;
3576                 if (at(p, n, ':')) {
3577                     p++;
3578                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3579                     ipv6byteCount += 4;
3580                 }
3581             } else {
3582                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3583                 ipv6byteCount += 4;
3584             }
3585             return p;
3586         }
3587 
3588         // Scan a hex sequence; return -1 if one could not be scanned
3589         //
3590         private int scanHexSeq(int start, int n)
3591             throws URISyntaxException
3592         {
3593             int p = start;
3594             int q;
3595 
3596             q = scan(p, n, L_HEX, H_HEX);
3597             if (q <= p)
3598                 return -1;
3599             if (at(q, n, '.'))          // Beginning of IPv4 address
3600                 return -1;
3601             if (q > p + 4)
3602                 fail("IPv6 hexadecimal digit sequence too long", p);
3603             ipv6byteCount += 2;
3604             p = q;
3605             while (p < n) {
3606                 if (!at(p, n, ':'))
3607                     break;
3608                 if (at(p + 1, n, ':'))
3609                     break;              // "::"
3610                 p++;
3611                 q = scan(p, n, L_HEX, H_HEX);
3612                 if (q <= p)
3613                     failExpecting("digits for an IPv6 address", p);
3614                 if (at(q, n, '.')) {    // Beginning of IPv4 address
3615                     p--;
3616                     break;
3617                 }
3618                 if (q > p + 4)
3619                     fail("IPv6 hexadecimal digit sequence too long", p);
3620                 ipv6byteCount += 2;
3621                 p = q;
3622             }
3623 
3624             return p;
3625         }
3626 
3627     }
3628     static {
3629         SharedSecrets.setJavaNetUriAccess(
3630             new JavaNetUriAccess() {
3631                 public URI create(String scheme, String path) {
3632                     return new URI(scheme, path);
3633                 }
3634             }
3635         );
3636     }
3637 }