Old src/java.base/share/classes/java/net/URI.java

   1 /*
   2  * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.net;
  27 
  28 import java.io.File;
  29 import java.io.IOException;
  30 import java.io.InvalidObjectException;
  31 import java.io.ObjectInputStream;
  32 import java.io.ObjectOutputStream;
  33 import java.io.Serializable;
  34 import java.nio.ByteBuffer;
  35 import java.nio.CharBuffer;
  36 import java.nio.charset.CharsetDecoder;
  37 import java.nio.charset.CoderResult;
  38 import java.nio.charset.CodingErrorAction;
  39 import java.nio.charset.CharacterCodingException;
  40 import java.nio.file.Path;
  41 import java.text.Normalizer;
  42 import jdk.internal.access.JavaNetUriAccess;
  43 import jdk.internal.access.SharedSecrets;
  44 import sun.nio.cs.ThreadLocalCoders;
  45 
  46 import java.lang.Character;             // for javadoc
  47 import java.lang.NullPointerException;  // for javadoc
  48 
  49 
  50 /**
  51  * Represents a Uniform Resource Identifier (URI) reference.
  52  *
  53  * <p> Aside from some minor deviations noted below, an instance of this
  54  * class represents a URI reference as defined by
  55  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
  56  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
  57  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
  58  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
  59  * also supports scope_ids. The syntax and usage of scope_ids is described
  60  * <a href="Inet6Address.html#scoped">here</a>.
  61  * This class provides constructors for creating URI instances from
  62  * their components or by parsing their string forms, methods for accessing the
  63  * various components of an instance, and methods for normalizing, resolving,
  64  * and relativizing URI instances.  Instances of this class are immutable.
  65  *
  66  *
  67  * <h3> URI syntax and components </h3>
  68  *
  69  * At the highest level a URI reference (hereinafter simply "URI") in string
  70  * form has the syntax
  71  *
  72  * <blockquote>
  73  * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>]
  74  * </blockquote>
  75  *
  76  * where square brackets [...] delineate optional components and the characters
  77  * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves.
  78  *
  79  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
  80  * said to be <i>relative</i>.  URIs are also classified according to whether
  81  * they are <i>opaque</i> or <i>hierarchical</i>.
  82  *
  83  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
  84  * not begin with a slash character ({@code '/'}).  Opaque URIs are not
  85  * subject to further parsing.  Some examples of opaque URIs are:
  86  *
  87  * <blockquote><ul style="list-style-type:none">
  88  * <li>{@code mailto:java-net@www.example.com}</li>
  89  * <li>{@code news:comp.lang.java}</li>
  90  * <li>{@code urn:isbn:096139210x}</li>
  91  * </ul></blockquote>
  92  *
  93  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
  94  * scheme-specific part begins with a slash character, or a relative URI, that
  95  * is, a URI that does not specify a scheme.  Some examples of hierarchical
  96  * URIs are:
  97  *
  98  * <blockquote>
  99  * {@code http://example.com/languages/java/}<br>
 100  * {@code sample/a/index.html#28}<br>
 101  * {@code ../../demo/b/index.html}<br>
 102  * {@code file:///~/calendar}
 103  * </blockquote>
 104  *
 105  * <p> A hierarchical URI is subject to further parsing according to the syntax
 106  *
 107  * <blockquote>
 108  * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
 109  * </blockquote>
 110  *
 111  * where the characters <b>{@code :}</b>, <b>{@code /}</b>,
 112  * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves.  The
 113  * scheme-specific part of a hierarchical URI consists of the characters
 114  * between the scheme and fragment components.
 115  *
 116  * <p> The authority component of a hierarchical URI is, if specified, either
 117  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
 118  * parses according to the familiar syntax
 119  *
 120  * <blockquote>
 121  * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]
 122  * </blockquote>
 123  *
 124  * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for
 125  * themselves.  Nearly all URI schemes currently in use are server-based.  An
 126  * authority component that does not parse in this way is considered to be
 127  * registry-based.
 128  *
 129  * <p> The path component of a hierarchical URI is itself said to be absolute
 130  * if it begins with a slash character ({@code '/'}); otherwise it is
 131  * relative.  The path of a hierarchical URI that is either absolute or
 132  * specifies an authority is always absolute.
 133  *
 134  * <p> All told, then, a URI instance has the following nine components:
 135  *
 136  * <table class="striped" style="margin-left:2em">
 137  * <caption style="display:none">Describes the components of a URI: scheme, scheme-specific-part, authority, user-info, host, port, path, query, fragment</caption>
 138  * <thead>
 139  * <tr><th scope="col">Component</th><th scope="col">Type</th></tr>
 140  * </thead>
 141  * <tbody style="text-align:left">
 142  * <tr><th scope="row">scheme</th><td>{@code String}</td></tr>
 143  * <tr><th scope="row">scheme-specific-part</th><td>{@code String}</td></tr>
 144  * <tr><th scope="row">authority</th><td>{@code String}</td></tr>
 145  * <tr><th scope="row">user-info</th><td>{@code String}</td></tr>
 146  * <tr><th scope="row">host</th><td>{@code String}</td></tr>
 147  * <tr><th scope="row">port</th><td>{@code int}</td></tr>
 148  * <tr><th scope="row">path</th><td>{@code String}</td></tr>
 149  * <tr><th scope="row">query</th><td>{@code String}</td></tr>
 150  * <tr><th scope="row">fragment</th><td>{@code String}</td></tr>
 151  * </tbody>
 152  * </table>
 153  *
 154  * In a given instance any particular component is either <i>undefined</i> or
 155  * <i>defined</i> with a distinct value.  Undefined string components are
 156  * represented by {@code null}, while undefined integer components are
 157  * represented by {@code -1}.  A string component may be defined to have the
 158  * empty string as its value; this is not equivalent to that component being
 159  * undefined.
 160  *
 161  * <p> Whether a particular component is or is not defined in an instance
 162  * depends upon the type of the URI being represented.  An absolute URI has a
 163  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
 164  * possibly a fragment, but has no other components.  A hierarchical URI always
 165  * has a path (though it may be empty) and a scheme-specific-part (which at
 166  * least contains the path), and may have any of the other components.  If the
 167  * authority component is present and is server-based then the host component
 168  * will be defined and the user-information and port components may be defined.
 169  *
 170  *
 171  * <h4> Operations on URI instances </h4>
 172  *
 173  * The key operations supported by this class are those of
 174  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
 175  *
 176  * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."}
 177  * and {@code ".."} segments from the path component of a hierarchical URI.
 178  * Each {@code "."} segment is simply removed.  A {@code ".."} segment is
 179  * removed only if it is preceded by a non-{@code ".."} segment.
 180  * Normalization has no effect upon opaque URIs.
 181  *
 182  * <p> <i>Resolution</i> is the process of resolving one URI against another,
 183  * <i>base</i> URI.  The resulting URI is constructed from components of both
 184  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
 185  * base URI for those not specified in the original.  For hierarchical URIs,
 186  * the path of the original is resolved against the path of the base and then
 187  * normalized.  The result, for example, of resolving
 188  *
 189  * <blockquote>
 190  * {@code sample/a/index.html#28}
 191  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 192  * &nbsp;&nbsp;&nbsp;&nbsp;(1)
 193  * </blockquote>
 194  *
 195  * against the base URI {@code http://example.com/languages/java/} is the result
 196  * URI
 197  *
 198  * <blockquote>
 199  * {@code http://example.com/languages/java/sample/a/index.html#28}
 200  * </blockquote>
 201  *
 202  * Resolving the relative URI
 203  *
 204  * <blockquote>
 205  * {@code ../../demo/b/index.html}&nbsp;&nbsp;&nbsp;&nbsp;(2)
 206  * </blockquote>
 207  *
 208  * against this result yields, in turn,
 209  *
 210  * <blockquote>
 211  * {@code http://example.com/languages/java/demo/b/index.html}
 212  * </blockquote>
 213  *
 214  * Resolution of both absolute and relative URIs, and of both absolute and
 215  * relative paths in the case of hierarchical URIs, is supported.  Resolving
 216  * the URI {@code file:///~/calendar} against any other URI simply yields the
 217  * original URI, since it is absolute.  Resolving the relative URI (2) above
 218  * against the relative base URI (1) yields the normalized, but still relative,
 219  * URI
 220  *
 221  * <blockquote>
 222  * {@code demo/b/index.html}
 223  * </blockquote>
 224  *
 225  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
 226  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
 227  *
 228  * <blockquote>
 229  *   <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;and<br>
 230  *   <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;.<br>
 231  * </blockquote>
 232  *
 233  * This operation is often useful when constructing a document containing URIs
 234  * that must be made relative to the base URI of the document wherever
 235  * possible.  For example, relativizing the URI
 236  *
 237  * <blockquote>
 238  * {@code http://example.com/languages/java/sample/a/index.html#28}
 239  * </blockquote>
 240  *
 241  * against the base URI
 242  *
 243  * <blockquote>
 244  * {@code http://example.com/languages/java/}
 245  * </blockquote>
 246  *
 247  * yields the relative URI {@code sample/a/index.html#28}.
 248  *
 249  *
 250  * <h4> Character categories </h4>
 251  *
 252  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
 253  * various components of a URI reference.  The following categories, most of
 254  * which are taken from that specification, are used below to describe these
 255  * constraints:
 256  *
 257  * <table class="striped" style="margin-left:2em">
 258  * <caption style="display:none">Describes categories alpha, digit, alphanum, unreserved, punct, reserved, escaped, and other</caption>
 259  *   <thead>
 260  *   <tr><th scope="col">Category</th><th scope="col">Description</th></tr>
 261  *   </thead>
 262  *   <tbody style="text-align:left">
 263  *   <tr><th scope="row" style="vertical-align:top">alpha</th>
 264  *       <td>The US-ASCII alphabetic characters,
 265  *        {@code 'A'}&nbsp;through&nbsp;{@code 'Z'}
 266  *        and {@code 'a'}&nbsp;through&nbsp;{@code 'z'}</td></tr>
 267  *   <tr><th scope="row" style="vertical-align:top">digit</th>
 268  *       <td>The US-ASCII decimal digit characters,
 269  *       {@code '0'}&nbsp;through&nbsp;{@code '9'}</td></tr>
 270  *   <tr><th scope="row" style="vertical-align:top">alphanum</th>
 271  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
 272  *   <tr><th scope="row" style="vertical-align:top">unreserved</th>
 273  *       <td>All <i>alphanum</i> characters together with those in the string
 274  *        {@code "_-!.~'()*"}</td></tr>
 275  *   <tr><th scope="row" style="vertical-align:top">punct</th>
 276  *       <td>The characters in the string {@code ",;:$&+="}</td></tr>
 277  *   <tr><th scope="row" style="vertical-align:top">reserved</th>
 278  *       <td>All <i>punct</i> characters together with those in the string
 279  *        {@code "?/[]@"}</td></tr>
 280  *   <tr><th scope="row" style="vertical-align:top">escaped</th>
 281  *       <td>Escaped octets, that is, triplets consisting of the percent
 282  *           character ({@code '%'}) followed by two hexadecimal digits
 283  *           ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and
 284  *           {@code 'a'}-{@code 'f'})</td></tr>
 285  *   <tr><th scope="row" style="vertical-align:top">other</th>
 286  *       <td>The Unicode characters that are not in the US-ASCII character set,
 287  *           are not control characters (according to the {@link
 288  *           java.lang.Character#isISOControl(char) Character.isISOControl}
 289  *           method), and are not space characters (according to the {@link
 290  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
 291  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
 292  *           limited to US-ASCII)</i></td></tr>
 293  * </tbody>
 294  * </table>
 295  *
 296  * <p><a id="legal-chars"></a> The set of all legal URI characters consists of
 297  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
 298  * characters.
 299  *
 300  *
 301  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
 302  *
 303  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
 304  * fragment components.  Escaping serves two purposes in URIs:
 305  *
 306  * <ul>
 307  *
 308  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
 309  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
 310  *   characters.  </p></li>
 311  *
 312  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
 313  *   component.  The user-info, path, query, and fragment components differ
 314  *   slightly in terms of which characters are considered legal and illegal.
 315  *   </p></li>
 316  *
 317  * </ul>
 318  *
 319  * These purposes are served in this class by three related operations:
 320  *
 321  * <ul>
 322  *
 323  *   <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it
 324  *   with the sequence of escaped octets that represent that character in the
 325  *   UTF-8 character set.  The Euro currency symbol ({@code '\u005Cu20AC'}),
 326  *   for example, is encoded as {@code "%E2%82%AC"}.  <i>(<b>Deviation from
 327  *   RFC&nbsp;2396</b>, which does not specify any particular character
 328  *   set.)</i> </p></li>
 329  *
 330  *   <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by
 331  *   encoding it.  The space character, for example, is quoted by replacing it
 332  *   with {@code "%20"}.  UTF-8 contains US-ASCII, hence for US-ASCII
 333  *   characters this transformation has exactly the effect required by
 334  *   RFC&nbsp;2396. </p></li>
 335  *
 336  *   <li><p><a id="decode"></a>
 337  *   A sequence of escaped octets is <i>decoded</i> by
 338  *   replacing it with the sequence of characters that it represents in the
 339  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
 340  *   effect of de-quoting any quoted US-ASCII characters as well as that of
 341  *   decoding any encoded non-US-ASCII characters.  If a <a
 342  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
 343  *   when decoding the escaped octets then the erroneous octets are replaced by
 344  *   {@code '\u005CuFFFD'}, the Unicode replacement character.  </p></li>
 345  *
 346  * </ul>
 347  *
 348  * These operations are exposed in the constructors and methods of this class
 349  * as follows:
 350  *
 351  * <ul>
 352  *
 353  *   <li><p> The {@linkplain #URI(java.lang.String) single-argument
 354  *   constructor} requires any illegal characters in its argument to be
 355  *   quoted and preserves any escaped octets and <i>other</i> characters that
 356  *   are present.  </p></li>
 357  *
 358  *   <li><p> The {@linkplain
 359  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
 360  *   multi-argument constructors} quote illegal characters as
 361  *   required by the components in which they appear.  The percent character
 362  *   ({@code '%'}) is always quoted by these constructors.  Any <i>other</i>
 363  *   characters are preserved.  </p></li>
 364  *
 365  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
 366  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
 367  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
 368  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
 369  *   values of their corresponding components in raw form, without interpreting
 370  *   any escaped octets.  The strings returned by these methods may contain
 371  *   both escaped octets and <i>other</i> characters, and will not contain any
 372  *   illegal characters.  </p></li>
 373  *
 374  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
 375  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
 376  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
 377  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
 378  *   octets in their corresponding components.  The strings returned by these
 379  *   methods may contain both <i>other</i> characters and illegal characters,
 380  *   and will not contain any escaped octets.  </p></li>
 381  *
 382  *   <li><p> The {@link #toString() toString} method returns a URI string with
 383  *   all necessary quotation but which may contain <i>other</i> characters.
 384  *   </p></li>
 385  *
 386  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
 387  *   quoted and encoded URI string that does not contain any <i>other</i>
 388  *   characters.  </p></li>
 389  *
 390  * </ul>
 391  *
 392  *
 393  * <h4> Identities </h4>
 394  *
 395  * For any URI <i>u</i>, it is always the case that
 396  *
 397  * <blockquote>
 398  * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )}&nbsp;.
 399  * </blockquote>
 400  *
 401  * For any URI <i>u</i> that does not contain redundant syntax such as two
 402  * slashes before an empty authority (as in {@code file:///tmp/}&nbsp;) or a
 403  * colon following a host name but no port (as in
 404  * {@code http://www.example.com:}&nbsp;), and that does not encode characters
 405  * except those that must be quoted, the following identities also hold:
 406  * <pre>
 407  *     new URI(<i>u</i>.getScheme(),
 408  *             <i>u</i>.getSchemeSpecificPart(),
 409  *             <i>u</i>.getFragment())
 410  *     .equals(<i>u</i>)</pre>
 411  * in all cases,
 412  * <pre>
 413  *     new URI(<i>u</i>.getScheme(),
 414  *             <i>u</i>.getAuthority(),
 415  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 416  *             <i>u</i>.getFragment())
 417  *     .equals(<i>u</i>)</pre>
 418  * if <i>u</i> is hierarchical, and
 419  * <pre>
 420  *     new URI(<i>u</i>.getScheme(),
 421  *             <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(),
 422  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 423  *             <i>u</i>.getFragment())
 424  *     .equals(<i>u</i>)</pre>
 425  * if <i>u</i> is hierarchical and has either no authority or a server-based
 426  * authority.
 427  *
 428  *
 429  * <h4> URIs, URLs, and URNs </h4>
 430  *
 431  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
 432  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
 433  * not every URI is a URL.  This is because there is another subcategory of
 434  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
 435  * specify how to locate them.  The {@code mailto}, {@code news}, and
 436  * {@code isbn} URIs shown above are examples of URNs.
 437  *
 438  * <p> The conceptual distinction between URIs and URLs is reflected in the
 439  * differences between this class and the {@link URL} class.
 440  *
 441  * <p> An instance of this class represents a URI reference in the syntactic
 442  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
 443  * A URI string is parsed according to the generic syntax without regard to the
 444  * scheme, if any, that it specifies.  No lookup of the host, if any, is
 445  * performed, and no scheme-dependent stream handler is constructed.  Equality,
 446  * hashing, and comparison are defined strictly in terms of the character
 447  * content of the instance.  In other words, a URI instance is little more than
 448  * a structured string that supports the syntactic, scheme-independent
 449  * operations of comparison, normalization, resolution, and relativization.
 450  *
 451  * <p> An instance of the {@link URL} class, by contrast, represents the
 452  * syntactic components of a URL together with some of the information required
 453  * to access the resource that it describes.  A URL must be absolute, that is,
 454  * it must always specify a scheme.  A URL string is parsed according to its
 455  * scheme.  A stream handler is always established for a URL, and in fact it is
 456  * impossible to create a URL instance for a scheme for which no handler is
 457  * available.  Equality and hashing depend upon both the scheme and the
 458  * Internet address of the host, if any; comparison is not defined.  In other
 459  * words, a URL is a structured string that supports the syntactic operation of
 460  * resolution as well as the network I/O operations of looking up the host and
 461  * opening a connection to the specified resource.
 462  *
 463  * @apiNote
 464  *
 465  * Applications working with file paths and file URIs should take great
 466  * care to use the appropriate methods to convert between the two.
 467  * The {@link Path#of(URI)} factory method and the {@link File#File(URI)}
 468  * constructor can be used to create {@link Path} or {@link File}
 469  * objects from a file URI. {@link Path#toUri()} and {@link File#toURI()}
 470  * can be used to create a {@link URI} from a file path.
 471  * Applications should never try to {@linkplain
 472  * #URI(String, String, String, int, String, String, String)
 473  * construct}, {@linkplain #URI(String) parse}, or
 474  * {@linkplain #resolve(String) resolve} a {@code URI}
 475  * from the direct string representation of a {@code File} or {@code Path}
 476  * instance.
 477  * <p>
 478  * Some components of a URL or URI, such as <i>userinfo</i>, may
 479  * be abused to construct misleading URLs or URIs. Applications
 480  * that deal with URLs or URIs should take into account
 481  * the recommendations advised in <a
 482  * href="https://tools.ietf.org/html/rfc3986#section-7">RFC3986,
 483  * Section 7, Security Considerations</a>.
 484  *
 485  * @author Mark Reinhold
 486  * @since 1.4
 487  *
 488  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
 489  * transformation format of ISO 10646</i></a>, <br><a
 490  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
 491  * Architecture</i></a>, <br><a
 492  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
 493  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
 494  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
 495  * Literal IPv6 Addresses in URLs</i></a>, <br><a
 496  * href="URISyntaxException.html">URISyntaxException</a>
 497  */
 498 
 499 public final class URI
 500     implements Comparable<URI>, Serializable
 501 {
 502 
 503     // Note: Comments containing the word "ASSERT" indicate places where a
 504     // throw of an InternalError should be replaced by an appropriate assertion
 505     // statement once asserts are enabled in the build.
 506 
 507     static final long serialVersionUID = -6052424284110960213L;
 508 
 509 
 510     // -- Properties and components of this instance --
 511 
 512     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
 513     private transient String scheme;            // null ==> relative URI
 514     private transient String fragment;          // null ==> undefined
 515 
 516     // Hierarchical URI components: [//<authority>]<path>[?<query>]
 517     private transient String authority;         // Registry or server
 518 
 519     // Server-based authority: [<userInfo>@]<host>[:<port>]
 520     private transient String userInfo;          // null ==> undefined
 521     private transient String host;              // null ==> registry-based
 522     private transient int port = -1;            // -1 ==> undefined
 523 
 524     // Remaining components of hierarchical URIs
 525     private transient String path;              // null ==> opaque
 526     private transient String query;             // null ==> undefined
 527 
 528     // The remaining fields may be computed on demand, which is safe even in
 529     // the face of multiple threads racing to initialize them
 530     private transient String schemeSpecificPart;
 531     private transient int hash;        // Zero ==> not yet computed
 532 
 533     private transient String decodedUserInfo;   // presumably the decoded forms returned by getUserInfo() etc. -- TODO confirm
 534     private transient String decodedAuthority;
 535     private transient String decodedPath;
 536     private transient String decodedQuery;
 537     private transient String decodedFragment;
 538     private transient String decodedSchemeSpecificPart;
 539 
 540     /**
 541      * The string form of this URI.
 542      *
 543      * @serial
 544      */
 545     private volatile String string;             // The only non-transient instance field, hence the only one serialized
 546 
 547 
 548 
 549     // -- Constructors and factories --
 550 
 551     private URI() { }                           // Used internally; fields keep their declared defaults
 552 
 553     /**
 554      * Constructs a URI by parsing the given string.
 555      *
 556      * <p> This constructor parses the given string exactly as specified by the
 557      * grammar in <a
 558      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 559      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
 560      *
 561      * <ul>
 562      *
 563      *   <li><p> An empty authority component is permitted as long as it is
 564      *   followed by a non-empty path, a query component, or a fragment
 565      *   component.  This allows the parsing of URIs such as
 566      *   {@code "file:///foo/bar"}, which seems to be the intent of
 567      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
 568      *   authority component is empty then the user-information, host, and port
 569      *   components are undefined. </p></li>
 570      *
 571      *   <li><p> Empty relative paths are permitted; this seems to be the
 572      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
 573      *   primary consequence of this deviation is that a standalone fragment
 574      *   such as {@code "#foo"} parses as a relative URI with an empty path
 575      *   and the given fragment, and can be usefully <a
 576      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
 577      *
 578      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
 579      *   specified by <a
 580      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
 581      *   element of a dotted-quad address must contain no more than three
 582      *   decimal digits.  Each element is further constrained to have a value
 583      *   no greater than 255. </p></li>
 584      *
 585      *   <li> <p> Hostnames in host components that comprise only a single
 586      *   domain label are permitted to start with an <i>alphanum</i>
 587      *   character. This seems to be the intent of <a
 588      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 589      *   section&nbsp;3.2.2 although the grammar does not permit it. The
 590      *   consequence of this deviation is that the authority component of a
 591      *   hierarchical URI such as {@code s://123}, will parse as a server-based
 592      *   authority. </p></li>
 593      *
 594      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
 595      *   address must be enclosed in square brackets ({@code '['} and
 596      *   {@code ']'}) as specified by <a
 597      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
 598      *   IPv6 address itself must parse according to <a
 599      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
 600      *   addresses are further constrained to describe no more than sixteen
 601      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
 602      *   but not expressible in the grammar. </p></li>
 603      *
 604      *   <li><p> Characters in the <i>other</i> category are permitted wherever
 605      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
 606      *   user-information, path, query, and fragment components, as well as in
 607      *   the authority component if the authority is registry-based.  This
 608      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
 609      *   character set. </p></li>
 610      *
 611      * </ul>
 612      *
 613      * @param  str   The string to be parsed into a URI
 614      *
 615      * @throws  NullPointerException
 616      *          If {@code str} is {@code null}
 617      *
 618      * @throws  URISyntaxException
 619      *          If the given string violates RFC&nbsp;2396, as augmented
 620      *          by the above deviations
 621      */
 622     public URI(String str) throws URISyntaxException {
 623         new Parser(str).parse(false);   // parsing is delegated entirely to Parser (outside this view); NOTE(review): confirm what the 'false' argument selects
 624     }
 625 
 626     /**
 627      * Constructs a hierarchical URI from the given components.
 628      *
 629      * <p> If a scheme is given then the path, if also given, must either be
 630      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 631      * component of the new URI may be left undefined by passing {@code null}
 632      * for the corresponding parameter or, in the case of the {@code port}
 633      * parameter, by passing {@code -1}.
 634      *
 635      * <p> This constructor first builds a URI string from the given components
 636      * according to the rules specified in <a
 637      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 638      * section&nbsp;5.2, step&nbsp;7: </p>
 639      *
 640      * <ol>
 641      *
 642      *   <li><p> Initially, the result string is empty. </p></li>
 643      *
 644      *   <li><p> If a scheme is given then it is appended to the result,
 645      *   followed by a colon character ({@code ':'}).  </p></li>
 646      *
 647      *   <li><p> If user information, a host, or a port are given then the
 648      *   string {@code "//"} is appended.  </p></li>
 649      *
 650      *   <li><p> If user information is given then it is appended, followed by
 651      *   a commercial-at character ({@code '@'}).  Any character not in the
 652      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 653      *   categories is <a href="#quote">quoted</a>.  </p></li>
 654      *
 655      *   <li><p> If a host is given then it is appended.  If the host is a
 656      *   literal IPv6 address but is not enclosed in square brackets
 657      *   ({@code '['} and {@code ']'}) then the square brackets are added.
 658      *   </p></li>
 659      *
 660      *   <li><p> If a port number is given then a colon character
 661      *   ({@code ':'}) is appended, followed by the port number in decimal.
 662      *   </p></li>
 663      *
 664      *   <li><p> If a path is given then it is appended.  Any character not in
 665      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 666      *   categories, and not equal to the slash character ({@code '/'}) or the
 667      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 668      *
 669      *   <li><p> If a query is given then a question-mark character
 670      *   ({@code '?'}) is appended, followed by the query.  Any character that
 671      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 672      *   </p></li>
 673      *
 674      *   <li><p> Finally, if a fragment is given then a hash character
 675      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 676      *   that is not a legal URI character is quoted.  </p></li>
 677      *
 678      * </ol>
 679      *
 680      * <p> The resulting URI string is then parsed as if by invoking the {@link
 681      * #URI(String)} constructor and then invoking the {@link
 682      * #parseServerAuthority()} method upon the result; this may cause a {@link
 683      * URISyntaxException} to be thrown.  </p>
 684      *
 685      * @param   scheme    Scheme name
 686      * @param   userInfo  User name and authorization information
 687      * @param   host      Host name
 688      * @param   port      Port number
 689      * @param   path      Path
 690      * @param   query     Query
 691      * @param   fragment  Fragment
 692      *
 693      * @throws URISyntaxException
 694      *         If both a scheme and a path are given but the path is relative,
 695      *         if the URI string constructed from the given components violates
 696      *         RFC&nbsp;2396, or if the authority component of the string is
 697      *         present but cannot be parsed as a server-based authority
 698      */
 699     public URI(String scheme,
 700                String userInfo, String host, int port,
 701                String path, String query, String fragment)
 702         throws URISyntaxException
 703     {
             // Build the URI string from the individual server-style components.
             // The second and third arguments (scheme-specific part and authority
             // in the sibling constructors) are not supplied here.
 704         String s = toString(scheme, null,
 705                             null, userInfo, host, port,
 706                             path, query, fragment);
             // NOTE(review): checkPath is declared elsewhere; presumably it rejects
             // a relative path when a scheme is given, as documented -- confirm.
 707         checkPath(s, scheme, path);
             // Same parse(true) call that parseServerAuthority() makes.
 708         new Parser(s).parse(true);
 709     }
 710 
 711     /**
 712      * Constructs a hierarchical URI from the given components.
 713      *
 714      * <p> If a scheme is given then the path, if also given, must either be
 715      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 716      * component of the new URI may be left undefined by passing {@code null}
 717      * for the corresponding parameter.
 718      *
 719      * <p> This constructor first builds a URI string from the given components
 720      * according to the rules specified in <a
 721      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 722      * section&nbsp;5.2, step&nbsp;7: </p>
 723      *
 724      * <ol>
 725      *
 726      *   <li><p> Initially, the result string is empty.  </p></li>
 727      *
 728      *   <li><p> If a scheme is given then it is appended to the result,
 729      *   followed by a colon character ({@code ':'}).  </p></li>
 730      *
 731      *   <li><p> If an authority is given then the string {@code "//"} is
 732      *   appended, followed by the authority.  If the authority contains a
 733      *   literal IPv6 address then the address must be enclosed in square
 734      *   brackets ({@code '['} and {@code ']'}).  Any character not in the
 735      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 736      *   categories, and not equal to the commercial-at character
 737      *   ({@code '@'}), is <a href="#quote">quoted</a>.  </p></li>
 738      *
 739      *   <li><p> If a path is given then it is appended.  Any character not in
 740      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 741      *   categories, and not equal to the slash character ({@code '/'}) or the
 742      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 743      *
 744      *   <li><p> If a query is given then a question-mark character
 745      *   ({@code '?'}) is appended, followed by the query.  Any character that
 746      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 747      *   </p></li>
 748      *
 749      *   <li><p> Finally, if a fragment is given then a hash character
 750      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 751      *   that is not a legal URI character is quoted.  </p></li>
 752      *
 753      * </ol>
 754      *
 755      * <p> The resulting URI string is then parsed as if by invoking the {@link
 756      * #URI(String)} constructor and then invoking the {@link
 757      * #parseServerAuthority()} method upon the result; this may cause a {@link
 758      * URISyntaxException} to be thrown.  </p>
 759      *
 760      * @param   scheme     Scheme name
 761      * @param   authority  Authority
 762      * @param   path       Path
 763      * @param   query      Query
 764      * @param   fragment   Fragment
 765      *
 766      * @throws URISyntaxException
 767      *         If both a scheme and a path are given but the path is relative,
 768      *         if the URI string constructed from the given components violates
 769      *         RFC&nbsp;2396, or if the authority component of the string is
 770      *         present but cannot be parsed as a server-based authority
 771      */
 772     public URI(String scheme,
 773                String authority,
 774                String path, String query, String fragment)
 775         throws URISyntaxException
 776     {
             // Build the URI string from a pre-assembled authority; user-info, host
             // and port are not supplied separately (null, null, -1).
 777         String s = toString(scheme, null,
 778                             authority, null, null, -1,
 779                             path, query, fragment);
             // NOTE(review): checkPath is declared elsewhere; presumably it rejects
             // a relative path when a scheme is given, as documented -- confirm.
 780         checkPath(s, scheme, path);
             // NOTE(review): the Javadoc says the result is parsed as if
             // parseServerAuthority() were then invoked, but this passes false (as
             // URI(String) does), whereas parseServerAuthority() and the
             // seven-argument constructor pass true. Confirm which is intended
             // before changing either the documentation or the code.
 781         new Parser(s).parse(false);
 782     }
 783 
 784     /**
 785      * Constructs a hierarchical URI from the given components.
 786      *
 787      * <p> A component may be left undefined by passing {@code null}.
 788      *
 789      * <p> This convenience constructor works as if by invoking the
 790      * seven-argument constructor as follows:
 791      *
 792      * <blockquote>
 793      * {@code new} {@link #URI(String, String, String, int, String, String, String)
 794      * URI}{@code (scheme, null, host, -1, path, null, fragment);}
 795      * </blockquote>
 796      *
 797      * @param   scheme    Scheme name
 798      * @param   host      Host name
 799      * @param   path      Path
 800      * @param   fragment  Fragment
 801      *
 802      * @throws  URISyntaxException
 803      *          If the URI string constructed from the given components
 804      *          violates RFC&nbsp;2396
 805      */
 806     public URI(String scheme, String host, String path, String fragment)
 807         throws URISyntaxException
 808     {
             // Delegate to the seven-argument constructor with no user-info,
             // no port (-1) and no query.
 809         this(scheme, null, host, -1, path, null, fragment);
 810     }
 811 
 812     /**
 813      * Constructs a URI from the given components.
 814      *
 815      * <p> A component may be left undefined by passing {@code null}.
 816      *
 817      * <p> This constructor first builds a URI in string form using the given
 818      * components as follows:  </p>
 819      *
 820      * <ol>
 821      *
 822      *   <li><p> Initially, the result string is empty.  </p></li>
 823      *
 824      *   <li><p> If a scheme is given then it is appended to the result,
 825      *   followed by a colon character ({@code ':'}).  </p></li>
 826      *
 827      *   <li><p> If a scheme-specific part is given then it is appended.  Any
 828      *   character that is not a <a href="#legal-chars">legal URI character</a>
 829      *   is <a href="#quote">quoted</a>.  </p></li>
 830      *
 831      *   <li><p> Finally, if a fragment is given then a hash character
 832      *   ({@code '#'}) is appended to the string, followed by the fragment.
 833      *   Any character that is not a legal URI character is quoted.  </p></li>
 834      *
 835      * </ol>
 836      *
 837      * <p> The resulting URI string is then parsed in order to create the new
 838      * URI instance as if by invoking the {@link #URI(String)} constructor;
 839      * this may cause a {@link URISyntaxException} to be thrown.  </p>
 840      *
 841      * @param   scheme    Scheme name
 842      * @param   ssp       Scheme-specific part
 843      * @param   fragment  Fragment
 844      *
 845      * @throws  URISyntaxException
 846      *          If the URI string constructed from the given components
 847      *          violates RFC&nbsp;2396
 848      */
 849     public URI(String scheme, String ssp, String fragment)
 850         throws URISyntaxException
 851     {
             // Only scheme, scheme-specific part and fragment are supplied; every
             // hierarchical component is left out (null / -1). Unlike the
             // hierarchical constructors, checkPath is not called here.
 852         new Parser(toString(scheme, ssp,
 853                             null, null, null, -1,
 854                             null, null, fragment))
 855             .parse(false);
 856     }
 857 
 858     /**
 859      * Constructs a simple URI consisting of only a scheme and a pre-validated
 860      * path. Provides a fast-path for some internal cases.
 861      */
 862     URI(String scheme, String path) {
             // Validation runs only when assertions are enabled; otherwise the
             // caller is trusted to pass an already-valid scheme and path.
 863         assert validSchemeAndPath(scheme, path);
             // The fields are assigned directly; no Parser is run on this path.
 864         this.scheme = scheme;
 865         this.path = path;
 866     }
 867 
 868     private static boolean validSchemeAndPath(String scheme, String path) {
             // Checks that "scheme:path" parses through the public constructor and
             // yields exactly the given scheme and path; a syntax error counts as
             // invalid rather than being propagated.
 869         try {
 870             URI u = new URI(scheme + ":" + path);
 871             return scheme.equals(u.scheme) && path.equals(u.path);
 872         } catch (URISyntaxException e) {
 873             return false;
 874         }
 875     }
 876 
 877     /**
 878      * Creates a URI by parsing the given string.
 879      *
 880      * <p> This convenience factory method works as if by invoking the {@link
 881      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
 882      * constructor is caught and wrapped in a new {@link
 883      * IllegalArgumentException} object, which is then thrown.
 884      *
 885      * <p> This method is provided for use in situations where it is known that
 886      * the given string is a legal URI, for example for URI constants declared
 887      * within a program, and so it would be considered a programming error
 888      * for the string not to parse as such.  The constructors, which throw
 889      * {@link URISyntaxException} directly, should be used in situations where a
 890      * URI is being constructed from user input or from some other source that
 891      * may be prone to errors.  </p>
 892      *
 893      * @param  str   The string to be parsed into a URI
 894      * @return The new URI
 895      *
 896      * @throws  NullPointerException
 897      *          If {@code str} is {@code null}
 898      *
 899      * @throws  IllegalArgumentException
 900      *          If the given string violates RFC&nbsp;2396
 901      */
 902     public static URI create(String str) {
 903         try {
 904             return new URI(str);
 905         } catch (URISyntaxException x) {
                 // Convert the checked exception to an unchecked one, keeping both
                 // its message and the original exception as the cause.
 906             throw new IllegalArgumentException(x.getMessage(), x);
 907         }
 908     }
 909 
 910 
 911     // -- Operations --
 912 
 913     /**
 914      * Attempts to parse this URI's authority component, if defined, into
 915      * user-information, host, and port components.
 916      *
 917      * <p> If this URI's authority component has already been recognized as
 918      * being server-based then it will already have been parsed into
 919      * user-information, host, and port components.  In this case, or if this
 920      * URI has no authority component, this method simply returns this URI.
 921      *
 922      * <p> Otherwise this method attempts once more to parse the authority
 923      * component into user-information, host, and port components, and throws
 924      * an exception describing why the authority component could not be parsed
 925      * in that way.
 926      *
 927      * <p> This method is provided because the generic URI syntax specified in
 928      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 929      * cannot always distinguish a malformed server-based authority from a
 930      * legitimate registry-based authority.  It must therefore treat some
 931      * instances of the former as instances of the latter.  The authority
 932      * component in the URI string {@code "//foo:bar"}, for example, is not a
 933      * legal server-based authority but it is legal as a registry-based
 934      * authority.
 935      *
 936      * <p> In many common situations, for example when working with URIs that are
 937      * known to be either URNs or URLs, the hierarchical URIs being used will
 938      * always be server-based.  They therefore must either be parsed as such or
 939      * treated as an error.  In these cases a statement such as
 940      *
 941      * <blockquote>
 942      * {@code URI }<i>u</i>{@code  = new URI(str).parseServerAuthority();}
 943      * </blockquote>
 944      *
 945      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
 946      * it has an authority component, has a server-based authority with proper
 947      * user-information, host, and port components.  Invoking this method also
 948      * ensures that if the authority could not be parsed in that way then an
 949      * appropriate diagnostic message can be issued based upon the exception
 950      * that is thrown. </p>
 951      *
 952      * @return  A URI whose authority field has been parsed
 953      *          as a server-based authority
 954      *
 955      * @throws  URISyntaxException
 956      *          If the authority component of this URI is defined
 957      *          but cannot be parsed as a server-based authority
 958      *          according to RFC&nbsp;2396
 959      */
 960     public URI parseServerAuthority()
 961         throws URISyntaxException
 962     {
 963         // We could be clever and cache the error message and index from the
 964         // exception thrown during the original parse, but that would require
 965         // either more fields or a more-obscure representation.
             // Nothing to do if a host is already set, or if there is no authority.
 966         if ((host != null) || (authority == null))
 967             return this;
             // Re-parse this URI's string form with parse(true); any
             // URISyntaxException it throws propagates to the caller.
             // NOTE(review): Parser is declared outside this view; presumably a
             // successful parse also fills in userInfo/host/port -- confirm.
 968         new Parser(toString()).parse(true);
 969         return this;
 970     }
 971 
 972     /**
 973      * Normalizes this URI's path.
 974      *
 975      * <p> If this URI is opaque, or if its path is already in normal form,
 976      * then this URI is returned.  Otherwise a new URI is constructed that is
 977      * identical to this URI except that its path is computed by normalizing
 978      * this URI's path in a manner consistent with <a
 979      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 980      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
 981      * </p>
 982      *
 983      * <ol>
 984      *
 985      *   <li><p> All {@code "."} segments are removed. </p></li>
 986      *
 987      *   <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."}
 988      *   segment then both of these segments are removed.  This step is
 989      *   repeated until it is no longer applicable. </p></li>
 990      *
 991      *   <li><p> If the path is relative, and if its first segment contains a
 992      *   colon character ({@code ':'}), then a {@code "."} segment is
 993      *   prepended.  This prevents a relative URI with a path such as
 994      *   {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a
 995      *   scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}.
 996      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
 997      *
 998      * </ol>
 999      *
1000      * <p> A normalized path will begin with one or more {@code ".."} segments
1001      * if there were insufficient non-{@code ".."} segments preceding them to
1002      * allow their removal.  A normalized path will begin with a {@code "."}
1003      * segment if one was inserted by step 3 above.  Otherwise, a normalized
1004      * path will not contain any {@code "."} or {@code ".."} segments. </p>
1005      *
1006      * @return  A URI equivalent to this URI,
1007      *          but whose path is in normal form
1008      */
1009     public URI normalize() {
             // Delegates to the one-argument normalize(URI) helper declared elsewhere
             // in this class.
1010         return normalize(this);
1011     }
1012 
1013     /**
1014      * Resolves the given URI against this URI.
1015      *
1016      * <p> If the given URI is already absolute, or if this URI is opaque, then
1017      * the given URI is returned.
1018      *
1019      * <p><a id="resolve-frag"></a> If the given URI's fragment component is
1020      * defined, its path component is empty, and its scheme, authority, and
1021      * query components are undefined, then a URI with the given fragment but
1022      * with all other components equal to those of this URI is returned.  This
1023      * allows a URI representing a standalone fragment reference, such as
1024      * {@code "#foo"}, to be usefully resolved against a base URI.
1025      *
1026      * <p> Otherwise this method constructs a new hierarchical URI in a manner
1027      * consistent with <a
1028      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1029      * section&nbsp;5.2; that is: </p>
1030      *
1031      * <ol>
1032      *
1033      *   <li><p> A new URI is constructed with this URI's scheme and the given
1034      *   URI's query and fragment components. </p></li>
1035      *
1036      *   <li><p> If the given URI has an authority component then the new URI's
1037      *   authority and path are taken from the given URI. </p></li>
1038      *
1039      *   <li><p> Otherwise the new URI's authority component is copied from
1040      *   this URI, and its path is computed as follows: </p>
1041      *
1042      *   <ol>
1043      *
1044      *     <li><p> If the given URI's path is absolute then the new URI's path
1045      *     is taken from the given URI. </p></li>
1046      *
1047      *     <li><p> Otherwise the given URI's path is relative, and so the new
1048      *     URI's path is computed by resolving the path of the given URI
1049      *     against the path of this URI.  This is done by concatenating all but
1050      *     the last segment of this URI's path, if any, with the given URI's
1051      *     path and then normalizing the result as if by invoking the {@link
1052      *     #normalize() normalize} method. </p></li>
1053      *
1054      *   </ol></li>
1055      *
1056      * </ol>
1057      *
1058      * <p> The result of this method is absolute if, and only if, either this
1059      * URI is absolute or the given URI is absolute.  </p>
1060      *
1061      * @param  uri  The URI to be resolved against this URI
1062      * @return The resulting URI
1063      *
1064      * @throws  NullPointerException
1065      *          If {@code uri} is {@code null}
1066      */
1067     public URI resolve(URI uri) {
             // Delegates to the two-argument resolve helper declared elsewhere in
             // this class, with this URI as the base.
1068         return resolve(this, uri);
1069     }
1070 
1071     /**
1072      * Constructs a new URI by parsing the given string and then resolving it
1073      * against this URI.
1074      *
1075      * <p> This convenience method works as if invoking it were equivalent to
1076      * evaluating the expression {@link #resolve(java.net.URI)
1077      * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p>
1078      *
1079      * @param  str   The string to be parsed into a URI
1080      * @return The resulting URI
1081      *
1082      * @throws  NullPointerException
1083      *          If {@code str} is {@code null}
1084      *
1085      * @throws  IllegalArgumentException
1086      *          If the given string violates RFC&nbsp;2396
1087      */
1088     public URI resolve(String str) {
             // create() reports a malformed string as IllegalArgumentException, so
             // no checked exception escapes from here.
1089         return resolve(URI.create(str));
1090     }
1091 
1092     /**
1093      * Relativizes the given URI against this URI.
1094      *
1095      * <p> The relativization of the given URI against this URI is computed as
1096      * follows: </p>
1097      *
1098      * <ol>
1099      *
1100      *   <li><p> If either this URI or the given URI are opaque, or if the
1101      *   scheme and authority components of the two URIs are not identical, or
1102      *   if the path of this URI is not a prefix of the path of the given URI,
1103      *   then the given URI is returned. </p></li>
1104      *
1105      *   <li><p> Otherwise a new relative hierarchical URI is constructed with
1106      *   query and fragment components taken from the given URI and with a path
1107      *   component computed by removing this URI's path from the beginning of
1108      *   the given URI's path. </p></li>
1109      *
1110      * </ol>
1111      *
1112      * @param  uri  The URI to be relativized against this URI
1113      * @return The resulting URI
1114      *
1115      * @throws  NullPointerException
1116      *          If {@code uri} is {@code null}
1117      */
1118     public URI relativize(URI uri) {
             // Delegates to the two-argument relativize helper declared elsewhere in
             // this class, with this URI as the base.
1119         return relativize(this, uri);
1120     }
1121 
1122     /**
1123      * Constructs a URL from this URI.
1124      *
1125      * <p> This convenience method works as if invoking it were equivalent to
1126      * evaluating the expression {@code new URL(this.toString())} after
1127      * first checking that this URI is absolute. </p>
1128      *
1129      * @return  A URL constructed from this URI
1130      *
1131      * @throws  IllegalArgumentException
1132      *          If this URI is not absolute
1133      *
1134      * @throws  MalformedURLException
1135      *          If a protocol handler for the URL could not be found,
1136      *          or if some other error occurred while constructing the URL
1137      */
1138     public URL toURL() throws MalformedURLException {
             // All of the work is done by URL.fromURI.
             // NOTE(review): the documented absoluteness check is not performed
             // here; presumably it happens inside URL.fromURI -- confirm.
1139         return URL.fromURI(this);
1140     }
1141 
1142     // -- Component access methods --
1143 
1144     /**
1145      * Returns the scheme component of this URI.
1146      *
1147      * <p> The scheme component of a URI, if defined, only contains characters
1148      * in the <i>alphanum</i> category and in the string {@code "-.+"}.  A
1149      * scheme always starts with an <i>alpha</i> character. </p>
1150      *
1151      * <p> The scheme component of a URI cannot contain escaped octets, hence this
1152      * method does not perform any decoding. </p>
1153      *
1154      * @return  The scheme component of this URI,
1155      *          or {@code null} if the scheme is undefined
1156      */
1157     public String getScheme() {
             // Returned as stored; no decoding step is applied.
1158         return scheme;
1159     }
1160 
1161     /**
1162      * Tells whether or not this URI is absolute.
1163      *
1164      * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1165      *
1166      * @return  {@code true} if, and only if, this URI is absolute
1167      */
1168     public boolean isAbsolute() {
             // Absolute means exactly "has a scheme".
1169         return scheme != null;
1170     }
1171 
1172     /**
1173      * Tells whether or not this URI is opaque.
1174      *
1175      * <p> A URI is opaque if, and only if, it is absolute and its
1176      * scheme-specific part does not begin with a slash character ('/').
1177      * An opaque URI has a scheme, a scheme-specific part, and possibly
1178      * a fragment; all other components are undefined. </p>
1179      *
1180      * @return  {@code true} if, and only if, this URI is opaque
1181      */
1182     public boolean isOpaque() {
             // An opaque URI has no path component, so an undefined path is used
             // as the test.
1183         return path == null;
1184     }
1185 
1186     /**
1187      * Returns the raw scheme-specific part of this URI.  The scheme-specific
1188      * part is never undefined, though it may be empty.
1189      *
1190      * <p> The scheme-specific part of a URI only contains legal URI
1191      * characters. </p>
1192      *
1193      * @return  The raw scheme-specific part of this URI
1194      *          (never {@code null})
1195      */
1196     public String getRawSchemeSpecificPart() {
             // Return the previously computed value if there is one.
1197         String part = schemeSpecificPart;
1198         if (part != null) {
1199             return part;
1200         }
1201 
             // Read the field once into a local and work from that copy.
1202         String s = string;
1203         if (s != null) {
1204             // if string is defined, components will have been parsed
                 // Slice the scheme-specific part out of the full string: skip the
                 // leading "scheme:" and drop the trailing "#fragment", when present.
1205             int start = 0;
1206             int end = s.length();
1207             if (scheme != null) {
                     // + 1 for the ':' that follows the scheme
1208                 start = scheme.length() + 1;
1209             }
1210             if (fragment != null) {
                     // + 1 for the '#' that precedes the fragment
1211                 end -= fragment.length() + 1;
1212             }
                 // If the path is exactly as long as the remaining span, return the
                 // existing path String instead of creating a substring.
1213             if (path != null && path.length() == end - start) {
1214                 part = path;
1215             } else {
1216                 part = s.substring(start, end);
1217             }
1218         } else {
                 // No string form is available: assemble the part from the
                 // (decoded) components. No scheme-specific part is passed in,
                 // since that is what is being computed.
1219             StringBuilder sb = new StringBuilder();
1220             appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1221                                  host, port, getPath(), getQuery());
1222             part = sb.toString();
1223         }
             // Remember the result for later calls and return it.
1224         return schemeSpecificPart = part;
1225     }
1226 
1227     /**
1228      * Returns the decoded scheme-specific part of this URI.
1229      *
1230      * <p> The string returned by this method is equal to that returned by the
1231      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1232      * except that all sequences of escaped octets are <a
1233      * href="#decode">decoded</a>.  </p>
1234      *
1235      * @return  The decoded scheme-specific part of this URI
1236      *          (never {@code null})
1237      */
1238     public String getSchemeSpecificPart() {
             // Decode on first use and keep the result in decodedSchemeSpecificPart
             // for later calls.
1239         String part = decodedSchemeSpecificPart;
1240         if (part == null) {
1241             decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart());
1242         }
1243         return part;
1244     }
1245 
1246     /**
1247      * Returns the raw authority component of this URI.
1248      *
1249      * <p> The authority component of a URI, if defined, only contains the
1250      * commercial-at character ({@code '@'}) and characters in the
1251      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1252      * categories.  If the authority is server-based then it is further
1253      * constrained to have valid user-information, host, and port
1254      * components. </p>
1255      *
1256      * @return  The raw authority component of this URI,
1257      *          or {@code null} if the authority is undefined
1258      */
1259     public String getRawAuthority() {
             // Returned as stored (still escaped); see getAuthority() for the
             // decoded form.
1260         return authority;
1261     }
1262 
1263     /**
1264      * Returns the decoded authority component of this URI.
1265      *
1266      * <p> The string returned by this method is equal to that returned by the
1267      * {@link #getRawAuthority() getRawAuthority} method except that all
1268      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1269      *
1270      * @return  The decoded authority component of this URI,
1271      *          or {@code null} if the authority is undefined
1272      */
1273     public String getAuthority() {
             // Decode on first use and keep the result; when the authority is
             // undefined nothing is decoded and null is returned.
1274         String auth = decodedAuthority;
1275         if ((auth == null) && (authority != null)) {
1276             decodedAuthority = auth = decode(authority);
1277         }
1278         return auth;
1279     }
1280 
1281     /**
1282      * Returns the raw user-information component of this URI.
1283      *
1284      * <p> The user-information component of a URI, if defined, only contains
1285      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1286      * <i>other</i> categories. </p>
1287      *
1288      * @return  The raw user-information component of this URI,
1289      *          or {@code null} if the user information is undefined
1290      */
1291     public String getRawUserInfo() {
             // Returned as stored (still escaped); see getUserInfo() for the
             // decoded form.
1292         return userInfo;
1293     }
1294 
1295     /**
1296      * Returns the decoded user-information component of this URI.
1297      *
1298      * <p> The string returned by this method is equal to that returned by the
1299      * {@link #getRawUserInfo() getRawUserInfo} method except that all
1300      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1301      *
1302      * @return  The decoded user-information component of this URI,
1303      *          or {@code null} if the user information is undefined
1304      */
1305     public String getUserInfo() {
             // Decode on first use and keep the result; when the user information
             // is undefined nothing is decoded and null is returned.
1306         String user = decodedUserInfo;
1307         if ((user == null) && (userInfo != null)) {
1308             decodedUserInfo = user = decode(userInfo);
1309         }
1310         return user;
1311     }
1312 
1313     /**
1314      * Returns the host component of this URI.
1315      *
1316      * <p> The host component of a URI, if defined, will have one of the
1317      * following forms: </p>
1318      *
1319      * <ul>
1320      *
1321      *   <li><p> A domain name consisting of one or more <i>labels</i>
1322      *   separated by period characters ({@code '.'}), optionally followed by
1323      *   a period character.  Each label consists of <i>alphanum</i> characters
1324      *   as well as hyphen characters ({@code '-'}), though hyphens never
1325      *   occur as the first or last characters in a label. The rightmost
1326      *   label of a domain name consisting of two or more labels, begins
1327      *   with an <i>alpha</i> character. </p></li>
1328      *
1329      *   <li><p> A dotted-quad IPv4 address of the form
1330      *   <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +},
1331      *   where no <i>digit</i> sequence is longer than three characters and no
1332      *   sequence has a value larger than 255. </p></li>
1333      *
1334      *   <li><p> An IPv6 address enclosed in square brackets ({@code '['} and
1335      *   {@code ']'}) and consisting of hexadecimal digits, colon characters
1336      *   ({@code ':'}), and possibly an embedded IPv4 address.  The full
1337      *   syntax of IPv6 addresses is specified in <a
1338      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1339      *   Addressing Architecture</i></a>.  </p></li>
1340      *
1341      * </ul>
1342      *
1343      * The host component of a URI cannot contain escaped octets, hence this
1344      * method does not perform any decoding.
1345      *
1346      * @return  The host component of this URI,
1347      *          or {@code null} if the host is undefined
1348      */
1349     public String getHost() {
1350         return host;
1351     }
1352 
1353     /**
1354      * Returns the port number of this URI.
1355      *
1356      * <p> The port component of a URI, if defined, is a non-negative
1357      * integer. </p>
1358      *
1359      * @return  The port component of this URI,
1360      *          or {@code -1} if the port is undefined
1361      */
1362     public int getPort() {
1363         return port;
1364     }
1365 
1366     /**
1367      * Returns the raw path component of this URI.
1368      *
1369      * <p> The path component of a URI, if defined, only contains the slash
1370      * character ({@code '/'}), the commercial-at character ({@code '@'}),
1371      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1372      * and <i>other</i> categories. </p>
1373      *
1374      * @return  The path component of this URI,
1375      *          or {@code null} if the path is undefined
1376      */
1377     public String getRawPath() {
1378         return path;
1379     }
1380 
1381     /**
1382      * Returns the decoded path component of this URI.
1383      *
1384      * <p> The string returned by this method is equal to that returned by the
1385      * {@link #getRawPath() getRawPath} method except that all sequences of
1386      * escaped octets are <a href="#decode">decoded</a>.  </p>
1387      *
1388      * @return  The decoded path component of this URI,
1389      *          or {@code null} if the path is undefined
1390      */
1391     public String getPath() {
1392         String decoded = decodedPath;
1393         if ((decoded == null) && (path != null)) {
1394             decodedPath = decoded = decode(path);
1395         }
1396         return decoded;
1397     }
1398 
1399     /**
1400      * Returns the raw query component of this URI.
1401      *
1402      * <p> The query component of a URI, if defined, only contains legal URI
1403      * characters. </p>
1404      *
1405      * @return  The raw query component of this URI,
1406      *          or {@code null} if the query is undefined
1407      */
1408     public String getRawQuery() {
1409         return query;
1410     }
1411 
1412     /**
1413      * Returns the decoded query component of this URI.
1414      *
1415      * <p> The string returned by this method is equal to that returned by the
1416      * {@link #getRawQuery() getRawQuery} method except that all sequences of
1417      * escaped octets are <a href="#decode">decoded</a>.  </p>
1418      *
1419      * @return  The decoded query component of this URI,
1420      *          or {@code null} if the query is undefined
1421      */
1422     public String getQuery() {
1423         String decoded = decodedQuery;
1424         if ((decoded == null) && (query != null)) {
1425             decodedQuery = decoded = decode(query, false);
1426         }
1427         return decoded;
1428     }
1429 
1430     /**
1431      * Returns the raw fragment component of this URI.
1432      *
1433      * <p> The fragment component of a URI, if defined, only contains legal URI
1434      * characters. </p>
1435      *
1436      * @return  The raw fragment component of this URI,
1437      *          or {@code null} if the fragment is undefined
1438      */
1439     public String getRawFragment() {
1440         return fragment;
1441     }
1442 
1443     /**
1444      * Returns the decoded fragment component of this URI.
1445      *
1446      * <p> The string returned by this method is equal to that returned by the
1447      * {@link #getRawFragment() getRawFragment} method except that all
1448      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1449      *
1450      * @return  The decoded fragment component of this URI,
1451      *          or {@code null} if the fragment is undefined
1452      */
1453     public String getFragment() {
1454         String decoded = decodedFragment;
1455         if ((decoded == null) && (fragment != null)) {
1456             decodedFragment = decoded = decode(fragment, false);
1457         }
1458         return decoded;
1459     }
1460 
1461 
1462     // -- Equality, comparison, hash code, toString, and serialization --
1463 
1464     /**
1465      * Tests this URI for equality with another object.
1466      *
1467      * <p> If the given object is not a URI then this method immediately
1468      * returns {@code false}.
1469      *
1470      * <p> For two URIs to be considered equal requires that either both are
1471      * opaque or both are hierarchical.  Their schemes must either both be
1472      * undefined or else be equal without regard to case. Their fragments
1473      * must either both be undefined or else be equal.
1474      *
1475      * <p> For two opaque URIs to be considered equal, their scheme-specific
1476      * parts must be equal.
1477      *
1478      * <p> For two hierarchical URIs to be considered equal, their paths must
1479      * be equal and their queries must either both be undefined or else be
1480      * equal.  Their authorities must either both be undefined, or both be
1481      * registry-based, or both be server-based.  If their authorities are
1482      * defined and are registry-based, then they must be equal.  If their
1483      * authorities are defined and are server-based, then their hosts must be
1484      * equal without regard to case, their port numbers must be equal, and
1485      * their user-information components must be equal.
1486      *
1487      * <p> When testing the user-information, path, query, fragment, authority,
1488      * or scheme-specific parts of two URIs for equality, the raw forms rather
1489      * than the encoded forms of these components are compared and the
1490      * hexadecimal digits of escaped octets are compared without regard to
1491      * case.
1492      *
1493      * <p> This method satisfies the general contract of the {@link
1494      * java.lang.Object#equals(Object) Object.equals} method. </p>
1495      *
1496      * @param   ob   The object to which this object is to be compared
1497      *
1498      * @return  {@code true} if, and only if, the given object is a URI that
1499      *          is identical to this URI
1500      */
1501     public boolean equals(Object ob) {
1502         if (ob == this)
1503             return true;
1504         if (!(ob instanceof URI))
1505             return false;
1506         URI that = (URI)ob;
1507         if (this.isOpaque() != that.isOpaque()) return false;
1508         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1509         if (!equal(this.fragment, that.fragment)) return false;
1510 
1511         // Opaque
1512         if (this.isOpaque())
1513             return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1514 
1515         // Hierarchical
1516         if (!equal(this.path, that.path)) return false;
1517         if (!equal(this.query, that.query)) return false;
1518 
1519         // Authorities
1520         if (this.authority == that.authority) return true;
1521         if (this.host != null) {
1522             // Server-based
1523             if (!equal(this.userInfo, that.userInfo)) return false;
1524             if (!equalIgnoringCase(this.host, that.host)) return false;
1525             if (this.port != that.port) return false;
1526         } else if (this.authority != null) {
1527             // Registry-based
1528             if (!equal(this.authority, that.authority)) return false;
1529         } else if (this.authority != that.authority) {
1530             return false;
1531         }
1532 
1533         return true;
1534     }
1535 
1536     /**
1537      * Returns a hash-code value for this URI.  The hash code is based upon all
1538      * of the URI's components, and satisfies the general contract of the
1539      * {@link java.lang.Object#hashCode() Object.hashCode} method.
1540      *
1541      * @return  A hash-code value for this URI
1542      */
1543     public int hashCode() {
1544         int h = hash;
1545         if (h == 0) {
1546             h = hashIgnoringCase(0, scheme);
1547             h = hash(h, fragment);
1548             if (isOpaque()) {
1549                 h = hash(h, schemeSpecificPart);
1550             } else {
1551                 h = hash(h, path);
1552                 h = hash(h, query);
1553                 if (host != null) {
1554                     h = hash(h, userInfo);
1555                     h = hashIgnoringCase(h, host);
1556                     h += 1949 * port;
1557                 } else {
1558                     h = hash(h, authority);
1559                 }
1560             }
1561             if (h != 0) {
1562                 hash = h;
1563             }
1564         }
1565         return h;
1566     }
1567 
1568     /**
1569      * Compares this URI to another object, which must be a URI.
1570      *
1571      * <p> When comparing corresponding components of two URIs, if one
1572      * component is undefined but the other is defined then the first is
1573      * considered to be less than the second.  Unless otherwise noted, string
1574      * components are ordered according to their natural, case-sensitive
1575      * ordering as defined by the {@link java.lang.String#compareTo(Object)
1576      * String.compareTo} method.  String components that are subject to
1577      * encoding are compared by comparing their raw forms rather than their
1578      * encoded forms.
1579      *
1580      * <p> The ordering of URIs is defined as follows: </p>
1581      *
1582      * <ul>
1583      *
1584      *   <li><p> Two URIs with different schemes are ordered according to the
1585      *   ordering of their schemes, without regard to case. </p></li>
1586      *
1587      *   <li><p> A hierarchical URI is considered to be less than an opaque URI
1588      *   with an identical scheme. </p></li>
1589      *
1590      *   <li><p> Two opaque URIs with identical schemes are ordered according
1591      *   to the ordering of their scheme-specific parts. </p></li>
1592      *
1593      *   <li><p> Two opaque URIs with identical schemes and scheme-specific
1594      *   parts are ordered according to the ordering of their
1595      *   fragments. </p></li>
1596      *
1597      *   <li><p> Two hierarchical URIs with identical schemes are ordered
1598      *   according to the ordering of their authority components: </p>
1599      *
1600      *   <ul>
1601      *
1602      *     <li><p> If both authority components are server-based then the URIs
1603      *     are ordered according to their user-information components; if these
1604      *     components are identical then the URIs are ordered according to the
1605      *     ordering of their hosts, without regard to case; if the hosts are
1606      *     identical then the URIs are ordered according to the ordering of
1607      *     their ports. </p></li>
1608      *
1609      *     <li><p> If one or both authority components are registry-based then
1610      *     the URIs are ordered according to the ordering of their authority
1611      *     components. </p></li>
1612      *
1613      *   </ul></li>
1614      *
1615      *   <li><p> Finally, two hierarchical URIs with identical schemes and
1616      *   authority components are ordered according to the ordering of their
1617      *   paths; if their paths are identical then they are ordered according to
1618      *   the ordering of their queries; if the queries are identical then they
1619      *   are ordered according to the order of their fragments. </p></li>
1620      *
1621      * </ul>
1622      *
1623      * <p> This method satisfies the general contract of the {@link
1624      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1625      * method. </p>
1626      *
1627      * @param   that
1628      *          The object to which this URI is to be compared
1629      *
1630      * @return  A negative integer, zero, or a positive integer as this URI is
1631      *          less than, equal to, or greater than the given URI
1632      *
1633      * @throws  ClassCastException
1634      *          If the given object is not a URI
1635      */
1636     public int compareTo(URI that) {
1637         int c;
1638 
1639         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1640             return c;
1641 
1642         if (this.isOpaque()) {
1643             if (that.isOpaque()) {
1644                 // Both opaque
1645                 if ((c = compare(this.schemeSpecificPart,
1646                                  that.schemeSpecificPart)) != 0)
1647                     return c;
1648                 return compare(this.fragment, that.fragment);
1649             }
1650             return +1;                  // Opaque > hierarchical
1651         } else if (that.isOpaque()) {
1652             return -1;                  // Hierarchical < opaque
1653         }
1654 
1655         // Hierarchical
1656         if ((this.host != null) && (that.host != null)) {
1657             // Both server-based
1658             if ((c = compare(this.userInfo, that.userInfo)) != 0)
1659                 return c;
1660             if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1661                 return c;
1662             if ((c = this.port - that.port) != 0)
1663                 return c;
1664         } else {
1665             // If one or both authorities are registry-based then we simply
1666             // compare them in the usual, case-sensitive way.  If one is
1667             // registry-based and one is server-based then the strings are
1668             // guaranteed to be unequal, hence the comparison will never return
1669             // zero and the compareTo and equals methods will remain
1670             // consistent.
1671             if ((c = compare(this.authority, that.authority)) != 0) return c;
1672         }
1673 
1674         if ((c = compare(this.path, that.path)) != 0) return c;
1675         if ((c = compare(this.query, that.query)) != 0) return c;
1676         return compare(this.fragment, that.fragment);
1677     }
1678 
1679     /**
1680      * Returns the content of this URI as a string.
1681      *
1682      * <p> If this URI was created by invoking one of the constructors in this
1683      * class then a string equivalent to the original input string, or to the
1684      * string computed from the originally-given components, as appropriate, is
1685      * returned.  Otherwise this URI was created by normalization, resolution,
1686      * or relativization, and so a string is constructed from this URI's
1687      * components according to the rules specified in <a
1688      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1689      * section&nbsp;5.2, step&nbsp;7. </p>
1690      *
1691      * @return  The string form of this URI
1692      */
1693     public String toString() {
1694         String s = string;
1695         if (s == null) {
1696             s = defineString();
1697         }
1698         return s;
1699     }
1700 
1701     private String defineString() {
1702         String s = string;
1703         if (s != null) {
1704             return s;
1705         }
1706 
1707         StringBuilder sb = new StringBuilder();
1708         if (scheme != null) {
1709             sb.append(scheme);
1710             sb.append(':');
1711         }
1712         if (isOpaque()) {
1713             sb.append(schemeSpecificPart);
1714         } else {
1715             if (host != null) {
1716                 sb.append("//");
1717                 if (userInfo != null) {
1718                     sb.append(userInfo);
1719                     sb.append('@');
1720                 }
1721                 boolean needBrackets = ((host.indexOf(':') >= 0)
1722                         && !host.startsWith("[")
1723                         && !host.endsWith("]"));
1724                 if (needBrackets) sb.append('[');
1725                 sb.append(host);
1726                 if (needBrackets) sb.append(']');
1727                 if (port != -1) {
1728                     sb.append(':');
1729                     sb.append(port);
1730                 }
1731             } else if (authority != null) {
1732                 sb.append("//");
1733                 sb.append(authority);
1734             }
1735             if (path != null)
1736                 sb.append(path);
1737             if (query != null) {
1738                 sb.append('?');
1739                 sb.append(query);
1740             }
1741         }
1742         if (fragment != null) {
1743             sb.append('#');
1744             sb.append(fragment);
1745         }
1746         return string = sb.toString();
1747     }
1748 
1749     /**
1750      * Returns the content of this URI as a US-ASCII string.
1751      *
1752      * <p> If this URI does not contain any characters in the <i>other</i>
1753      * category then an invocation of this method will return the same value as
1754      * an invocation of the {@link #toString() toString} method.  Otherwise
1755      * this method works as if by invoking that method and then <a
1756      * href="#encode">encoding</a> the result.  </p>
1757      *
1758      * @return  The string form of this URI, encoded as needed
1759      *          so that it only contains characters in the US-ASCII
1760      *          charset
1761      */
1762     public String toASCIIString() {
1763         return encode(toString());
1764     }
1765 
1766 
1767     // -- Serialization support --
1768 
1769     /**
1770      * Saves the content of this URI to the given serial stream.
1771      *
1772      * <p> The only serializable field of a URI instance is its {@code string}
1773      * field.  That field is given a value, if it does not have one already,
1774      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1775      * method of the given object-output stream is invoked. </p>
1776      *
1777      * @param  os  The object-output stream to which this object
1778      *             is to be written
1779      */
1780     private void writeObject(ObjectOutputStream os)
1781         throws IOException
1782     {
1783         defineString();
1784         os.defaultWriteObject();        // Writes the string field only
1785     }
1786 
1787     /**
1788      * Reconstitutes a URI from the given serial stream.
1789      *
1790      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1791      * invoked to read the value of the {@code string} field.  The result is
1792      * then parsed in the usual way.
1793      *
1794      * @param  is  The object-input stream from which this object
1795      *             is being read
1796      */
1797     private void readObject(ObjectInputStream is)
1798         throws ClassNotFoundException, IOException
1799     {
1800         port = -1;                      // Argh
1801         is.defaultReadObject();
1802         try {
1803             new Parser(string).parse(false);
1804         } catch (URISyntaxException x) {
1805             IOException y = new InvalidObjectException("Invalid URI");
1806             y.initCause(x);
1807             throw y;
1808         }
1809     }
1810 
1811 
1812     // -- End of public methods --
1813 
1814 
1815     // -- Utility methods for string-field comparison and hashing --
1816 
1817     // These methods return appropriate values for null string arguments,
1818     // thereby simplifying the equals, hashCode, and compareTo methods.
1819     //
1820     // The case-ignoring methods should only be applied to strings whose
1821     // characters are all known to be US-ASCII.  Because of this restriction,
1822     // these methods are faster than the similar methods in the String class.
1823 
1824     // US-ASCII only
1825     private static int toLower(char c) {
1826         if ((c >= 'A') && (c <= 'Z'))
1827             return c + ('a' - 'A');
1828         return c;
1829     }
1830 
1831     // US-ASCII only
1832     private static int toUpper(char c) {
1833         if ((c >= 'a') && (c <= 'z'))
1834             return c - ('a' - 'A');
1835         return c;
1836     }
1837 
1838     private static boolean equal(String s, String t) {
1839         if (s == t) return true;
1840         if ((s != null) && (t != null)) {
1841             if (s.length() != t.length())
1842                 return false;
1843             if (s.indexOf('%') < 0)
1844                 return s.equals(t);
1845             int n = s.length();
1846             for (int i = 0; i < n;) {
1847                 char c = s.charAt(i);
1848                 char d = t.charAt(i);
1849                 if (c != '%') {
1850                     if (c != d)
1851                         return false;
1852                     i++;
1853                     continue;
1854                 }
1855                 if (d != '%')
1856                     return false;
1857                 i++;
1858                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1859                     return false;
1860                 i++;
1861                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1862                     return false;
1863                 i++;
1864             }
1865             return true;
1866         }
1867         return false;
1868     }
1869 
1870     // US-ASCII only
1871     private static boolean equalIgnoringCase(String s, String t) {
1872         if (s == t) return true;
1873         if ((s != null) && (t != null)) {
1874             int n = s.length();
1875             if (t.length() != n)
1876                 return false;
1877             for (int i = 0; i < n; i++) {
1878                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1879                     return false;
1880             }
1881             return true;
1882         }
1883         return false;
1884     }
1885 
1886     private static int hash(int hash, String s) {
1887         if (s == null) return hash;
1888         return s.indexOf('%') < 0 ? hash * 127 + s.hashCode()
1889                                   : normalizedHash(hash, s);
1890     }
1891 
1892 
1893     private static int normalizedHash(int hash, String s) {
1894         int h = 0;
1895         for (int index = 0; index < s.length(); index++) {
1896             char ch = s.charAt(index);
1897             h = 31 * h + ch;
1898             if (ch == '%') {
1899                 /*
1900                  * Process the next two encoded characters
1901                  */
1902                 for (int i = index + 1; i < index + 3; i++)
1903                     h = 31 * h + toUpper(s.charAt(i));
1904                 index += 2;
1905             }
1906         }
1907         return hash * 127 + h;
1908     }
1909 
1910     // US-ASCII only
1911     private static int hashIgnoringCase(int hash, String s) {
1912         if (s == null) return hash;
1913         int h = hash;
1914         int n = s.length();
1915         for (int i = 0; i < n; i++)
1916             h = 31 * h + toLower(s.charAt(i));
1917         return h;
1918     }
1919 
1920     private static int compare(String s, String t) {
1921         if (s == t) return 0;
1922         if (s != null) {
1923             if (t != null)
1924                 return s.compareTo(t);
1925             else
1926                 return +1;
1927         } else {
1928             return -1;
1929         }
1930     }
1931 
1932     // US-ASCII only
1933     private static int compareIgnoringCase(String s, String t) {
1934         if (s == t) return 0;
1935         if (s != null) {
1936             if (t != null) {
1937                 int sn = s.length();
1938                 int tn = t.length();
1939                 int n = sn < tn ? sn : tn;
1940                 for (int i = 0; i < n; i++) {
1941                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1942                     if (c != 0)
1943                         return c;
1944                 }
1945                 return sn - tn;
1946             }
1947             return +1;
1948         } else {
1949             return -1;
1950         }
1951     }
1952 
1953 
1954     // -- String construction --
1955 
1956     // If a scheme is given then the path, if given, must be absolute
1957     //
1958     private static void checkPath(String s, String scheme, String path)
1959         throws URISyntaxException
1960     {
1961         if (scheme != null) {
1962             if ((path != null)
1963                 && ((path.length() > 0) && (path.charAt(0) != '/')))
1964                 throw new URISyntaxException(s,
1965                                              "Relative path in absolute URI");
1966         }
1967     }
1968 
1969     private void appendAuthority(StringBuilder sb,
1970                                  String authority,
1971                                  String userInfo,
1972                                  String host,
1973                                  int port)
1974     {
1975         if (host != null) {
1976             sb.append("//");
1977             if (userInfo != null) {
1978                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1979                 sb.append('@');
1980             }
1981             boolean needBrackets = ((host.indexOf(':') >= 0)
1982                                     && !host.startsWith("[")
1983                                     && !host.endsWith("]"));
1984             if (needBrackets) sb.append('[');
1985             sb.append(host);
1986             if (needBrackets) sb.append(']');
1987             if (port != -1) {
1988                 sb.append(':');
1989                 sb.append(port);
1990             }
1991         } else if (authority != null) {
1992             sb.append("//");
1993             if (authority.startsWith("[")) {
1994                 // authority should (but may not) contain an embedded IPv6 address
1995                 int end = authority.indexOf(']');
1996                 String doquote = authority, dontquote = "";
1997                 if (end != -1 && authority.indexOf(':') != -1) {
1998                     // the authority contains an IPv6 address
1999                     if (end == authority.length()) {
2000                         dontquote = authority;
2001                         doquote = "";
2002                     } else {
2003                         dontquote = authority.substring(0 , end + 1);
2004                         doquote = authority.substring(end + 1);
2005                     }
2006                 }
2007                 sb.append(dontquote);
2008                 sb.append(quote(doquote,
2009                             L_REG_NAME | L_SERVER,
2010                             H_REG_NAME | H_SERVER));
2011             } else {
2012                 sb.append(quote(authority,
2013                             L_REG_NAME | L_SERVER,
2014                             H_REG_NAME | H_SERVER));
2015             }
2016         }
2017     }
2018 
2019     private void appendSchemeSpecificPart(StringBuilder sb,
2020                                           String opaquePart,
2021                                           String authority,
2022                                           String userInfo,
2023                                           String host,
2024                                           int port,
2025                                           String path,
2026                                           String query)
2027     {
2028         if (opaquePart != null) {
2029             /* check if SSP begins with an IPv6 address
2030              * because we must not quote a literal IPv6 address
2031              */
2032             if (opaquePart.startsWith("//[")) {
2033                 int end =  opaquePart.indexOf(']');
2034                 if (end != -1 && opaquePart.indexOf(':')!=-1) {
2035                     String doquote, dontquote;
2036                     if (end == opaquePart.length()) {
2037                         dontquote = opaquePart;
2038                         doquote = "";
2039                     } else {
2040                         dontquote = opaquePart.substring(0,end+1);
2041                         doquote = opaquePart.substring(end+1);
2042                     }
2043                     sb.append (dontquote);
2044                     sb.append(quote(doquote, L_URIC, H_URIC));
2045                 }
2046             } else {
2047                 sb.append(quote(opaquePart, L_URIC, H_URIC));
2048             }
2049         } else {
2050             appendAuthority(sb, authority, userInfo, host, port);
2051             if (path != null)
2052                 sb.append(quote(path, L_PATH, H_PATH));
2053             if (query != null) {
2054                 sb.append('?');
2055                 sb.append(quote(query, L_URIC, H_URIC));
2056             }
2057         }
2058     }
2059 
2060     private void appendFragment(StringBuilder sb, String fragment) {
2061         if (fragment != null) {
2062             sb.append('#');
2063             sb.append(quote(fragment, L_URIC, H_URIC));
2064         }
2065     }
2066 
2067     private String toString(String scheme,
2068                             String opaquePart,
2069                             String authority,
2070                             String userInfo,
2071                             String host,
2072                             int port,
2073                             String path,
2074                             String query,
2075                             String fragment)
2076     {
2077         StringBuilder sb = new StringBuilder();
2078         if (scheme != null) {
2079             sb.append(scheme);
2080             sb.append(':');
2081         }
2082         appendSchemeSpecificPart(sb, opaquePart,
2083                                  authority, userInfo, host, port,
2084                                  path, query);
2085         appendFragment(sb, fragment);
2086         return sb.toString();
2087     }
2088 
2089     // -- Normalization, resolution, and relativization --
2090 
2091     // RFC2396 5.2 (6)
2092     private static String resolvePath(String base, String child,
2093                                       boolean absolute)
2094     {
2095         int i = base.lastIndexOf('/');
2096         int cn = child.length();
2097         String path = "";
2098 
2099         if (cn == 0) {
2100             // 5.2 (6a)
2101             if (i >= 0)
2102                 path = base.substring(0, i + 1);
2103         } else {
2104             StringBuilder sb = new StringBuilder(base.length() + cn);
2105             // 5.2 (6a)
2106             if (i >= 0)
2107                 sb.append(base, 0, i + 1);
2108             // 5.2 (6b)
2109             sb.append(child);
2110             path = sb.toString();
2111         }
2112 
2113         // 5.2 (6c-f)
2114         String np = normalize(path);
2115 
2116         // 5.2 (6g): If the result is absolute but the path begins with "../",
2117         // then we simply leave the path as-is
2118 
2119         return np;
2120     }
2121 
2122     // RFC2396 5.2
2123     private static URI resolve(URI base, URI child) {
2124         // check if child if opaque first so that NPE is thrown
2125         // if child is null.
2126         if (child.isOpaque() || base.isOpaque())
2127             return child;
2128 
2129         // 5.2 (2): Reference to current document (lone fragment)
2130         if ((child.scheme == null) && (child.authority == null)
2131             && child.path.isEmpty() && (child.fragment != null)
2132             && (child.query == null)) {
2133             if ((base.fragment != null)
2134                 && child.fragment.equals(base.fragment)) {
2135                 return base;
2136             }
2137             URI ru = new URI();
2138             ru.scheme = base.scheme;
2139             ru.authority = base.authority;
2140             ru.userInfo = base.userInfo;
2141             ru.host = base.host;
2142             ru.port = base.port;
2143             ru.path = base.path;
2144             ru.fragment = child.fragment;
2145             ru.query = base.query;
2146             return ru;
2147         }
2148 
2149         // 5.2 (3): Child is absolute
2150         if (child.scheme != null)
2151             return child;
2152 
2153         URI ru = new URI();             // Resolved URI
2154         ru.scheme = base.scheme;
2155         ru.query = child.query;
2156         ru.fragment = child.fragment;
2157 
2158         // 5.2 (4): Authority
2159         if (child.authority == null) {
2160             ru.authority = base.authority;
2161             ru.host = base.host;
2162             ru.userInfo = base.userInfo;
2163             ru.port = base.port;
2164 
2165             String cp = (child.path == null) ? "" : child.path;
2166             if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
2167                 // 5.2 (5): Child path is absolute
2168                 ru.path = child.path;
2169             } else {
2170                 // 5.2 (6): Resolve relative path
2171                 ru.path = resolvePath(base.path, cp, base.isAbsolute());
2172             }
2173         } else {
2174             ru.authority = child.authority;
2175             ru.host = child.host;
2176             ru.userInfo = child.userInfo;
2177             ru.host = child.host;
2178             ru.port = child.port;
2179             ru.path = child.path;
2180         }
2181 
2182         // 5.2 (7): Recombine (nothing to do here)
2183         return ru;
2184     }
2185 
2186     // If the given URI's path is normal then return the URI;
2187     // o.w., return a new URI containing the normalized path.
2188     //
2189     private static URI normalize(URI u) {
2190         if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
2191             return u;
2192 
2193         String np = normalize(u.path);
2194         if (np == u.path)
2195             return u;
2196 
2197         URI v = new URI();
2198         v.scheme = u.scheme;
2199         v.fragment = u.fragment;
2200         v.authority = u.authority;
2201         v.userInfo = u.userInfo;
2202         v.host = u.host;
2203         v.port = u.port;
2204         v.path = np;
2205         v.query = u.query;
2206         return v;
2207     }
2208 
2209     // If both URIs are hierarchical, their scheme and authority components are
2210     // identical, and the base path is a prefix of the child's path, then
2211     // return a relative URI that, when resolved against the base, yields the
2212     // child; otherwise, return the child.
2213     //
2214     private static URI relativize(URI base, URI child) {
2215         // check if child if opaque first so that NPE is thrown
2216         // if child is null.
2217         if (child.isOpaque() || base.isOpaque())
2218             return child;
2219         if (!equalIgnoringCase(base.scheme, child.scheme)
2220             || !equal(base.authority, child.authority))
2221             return child;
2222 
2223         String bp = normalize(base.path);
2224         String cp = normalize(child.path);
2225         if (!bp.equals(cp)) {
2226             if (!bp.endsWith("/"))
2227                 bp = bp + "/";
2228             if (!cp.startsWith(bp))
2229                 return child;
2230         }
2231 
2232         URI v = new URI();
2233         v.path = cp.substring(bp.length());
2234         v.query = child.query;
2235         v.fragment = child.fragment;
2236         return v;
2237     }
2238 
2239 
2240 
2241     // -- Path normalization --
2242 
2243     // The following algorithm for path normalization avoids the creation of a
2244     // string object for each segment, as well as the use of a string buffer to
2245     // compute the final result, by using a single char array and editing it in
2246     // place.  The array is first split into segments, replacing each slash
2247     // with '\0' and creating a segment-index array, each element of which is
2248     // the index of the first char in the corresponding segment.  We then walk
2249     // through both arrays, removing ".", "..", and other segments as necessary
2250     // by setting their entries in the index array to -1.  Finally, the two
2251     // arrays are used to rejoin the segments and compute the final result.
2252     //
2253     // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2254 
2255 
2256     // Check the given path to see if it might need normalization.  A path
2257     // might need normalization if it contains duplicate slashes, a "."
2258     // segment, or a ".." segment.  Return -1 if no further normalization is
2259     // possible, otherwise return the number of segments found.
2260     //
2261     // This method takes a string argument rather than a char array so that
2262     // this test can be performed without invoking path.toCharArray().
2263     //
2264     private static int needsNormalization(String path) {
2265         boolean normal = true;
2266         int ns = 0;                     // Number of segments
2267         int end = path.length() - 1;    // Index of last char in path
2268         int p = 0;                      // Index of next char in path
2269 
2270         // Skip initial slashes
2271         while (p <= end) {
2272             if (path.charAt(p) != '/') break;
2273             p++;
2274         }
2275         if (p > 1) normal = false;
2276 
2277         // Scan segments
2278         while (p <= end) {
2279 
2280             // Looking at "." or ".." ?
2281             if ((path.charAt(p) == '.')
2282                 && ((p == end)
2283                     || ((path.charAt(p + 1) == '/')
2284                         || ((path.charAt(p + 1) == '.')
2285                             && ((p + 1 == end)
2286                                 || (path.charAt(p + 2) == '/')))))) {
2287                 normal = false;
2288             }
2289             ns++;
2290 
2291             // Find beginning of next segment
2292             while (p <= end) {
2293                 if (path.charAt(p++) != '/')
2294                     continue;
2295 
2296                 // Skip redundant slashes
2297                 while (p <= end) {
2298                     if (path.charAt(p) != '/') break;
2299                     normal = false;
2300                     p++;
2301                 }
2302 
2303                 break;
2304             }
2305         }
2306 
2307         return normal ? -1 : ns;
2308     }
2309 
2310 
2311     // Split the given path into segments, replacing slashes with nulls and
2312     // filling in the given segment-index array.
2313     //
2314     // Preconditions:
2315     //   segs.length == Number of segments in path
2316     //
2317     // Postconditions:
2318     //   All slashes in path replaced by '\0'
2319     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
2320     //
2321     private static void split(char[] path, int[] segs) {
2322         int end = path.length - 1;      // Index of last char in path
2323         int p = 0;                      // Index of next char in path
2324         int i = 0;                      // Index of current segment
2325 
2326         // Skip initial slashes
2327         while (p <= end) {
2328             if (path[p] != '/') break;
2329             path[p] = '\0';
2330             p++;
2331         }
2332 
2333         while (p <= end) {
2334 
2335             // Note start of segment
2336             segs[i++] = p++;
2337 
2338             // Find beginning of next segment
2339             while (p <= end) {
2340                 if (path[p++] != '/')
2341                     continue;
2342                 path[p - 1] = '\0';
2343 
2344                 // Skip redundant slashes
2345                 while (p <= end) {
2346                     if (path[p] != '/') break;
2347                     path[p++] = '\0';
2348                 }
2349                 break;
2350             }
2351         }
2352 
2353         if (i != segs.length)
2354             throw new InternalError();  // ASSERT
2355     }
2356 
2357 
2358     // Join the segments in the given path according to the given segment-index
2359     // array, ignoring those segments whose index entries have been set to -1,
2360     // and inserting slashes as needed.  Return the length of the resulting
2361     // path.
2362     //
2363     // Preconditions:
2364     //   segs[i] == -1 implies segment i is to be ignored
2365     //   path computed by split, as above, with '\0' having replaced '/'
2366     //
2367     // Postconditions:
    //   path[0] .. path[return value - 1] == Resulting path
2369     //
2370     private static int join(char[] path, int[] segs) {
2371         int ns = segs.length;           // Number of segments
2372         int end = path.length - 1;      // Index of last char in path
2373         int p = 0;                      // Index of next path char to write
2374 
2375         if (path[p] == '\0') {
2376             // Restore initial slash for absolute paths
2377             path[p++] = '/';
2378         }
2379 
2380         for (int i = 0; i < ns; i++) {
2381             int q = segs[i];            // Current segment
2382             if (q == -1)
2383                 // Ignore this segment
2384                 continue;
2385 
2386             if (p == q) {
2387                 // We're already at this segment, so just skip to its end
2388                 while ((p <= end) && (path[p] != '\0'))
2389                     p++;
2390                 if (p <= end) {
2391                     // Preserve trailing slash
2392                     path[p++] = '/';
2393                 }
2394             } else if (p < q) {
2395                 // Copy q down to p
2396                 while ((q <= end) && (path[q] != '\0'))
2397                     path[p++] = path[q++];
2398                 if (q <= end) {
2399                     // Preserve trailing slash
2400                     path[p++] = '/';
2401                 }
2402             } else
2403                 throw new InternalError(); // ASSERT false
2404         }
2405 
2406         return p;
2407     }
2408 
2409 
2410     // Remove "." segments from the given path, and remove segment pairs
2411     // consisting of a non-".." segment followed by a ".." segment.
2412     //
2413     private static void removeDots(char[] path, int[] segs) {
2414         int ns = segs.length;
2415         int end = path.length - 1;
2416 
2417         for (int i = 0; i < ns; i++) {
2418             int dots = 0;               // Number of dots found (0, 1, or 2)
2419 
2420             // Find next occurrence of "." or ".."
2421             do {
2422                 int p = segs[i];
2423                 if (path[p] == '.') {
2424                     if (p == end) {
2425                         dots = 1;
2426                         break;
2427                     } else if (path[p + 1] == '\0') {
2428                         dots = 1;
2429                         break;
2430                     } else if ((path[p + 1] == '.')
2431                                && ((p + 1 == end)
2432                                    || (path[p + 2] == '\0'))) {
2433                         dots = 2;
2434                         break;
2435                     }
2436                 }
2437                 i++;
2438             } while (i < ns);
2439             if ((i > ns) || (dots == 0))
2440                 break;
2441 
2442             if (dots == 1) {
2443                 // Remove this occurrence of "."
2444                 segs[i] = -1;
2445             } else {
2446                 // If there is a preceding non-".." segment, remove both that
2447                 // segment and this occurrence of ".."; otherwise, leave this
2448                 // ".." segment as-is.
2449                 int j;
2450                 for (j = i - 1; j >= 0; j--) {
2451                     if (segs[j] != -1) break;
2452                 }
2453                 if (j >= 0) {
2454                     int q = segs[j];
2455                     if (!((path[q] == '.')
2456                           && (path[q + 1] == '.')
2457                           && (path[q + 2] == '\0'))) {
2458                         segs[i] = -1;
2459                         segs[j] = -1;
2460                     }
2461                 }
2462             }
2463         }
2464     }
2465 
2466 
2467     // DEVIATION: If the normalized path is relative, and if the first
2468     // segment could be parsed as a scheme name, then prepend a "." segment
2469     //
2470     private static void maybeAddLeadingDot(char[] path, int[] segs) {
2471 
2472         if (path[0] == '\0')
2473             // The path is absolute
2474             return;
2475 
2476         int ns = segs.length;
2477         int f = 0;                      // Index of first segment
2478         while (f < ns) {
2479             if (segs[f] >= 0)
2480                 break;
2481             f++;
2482         }
2483         if ((f >= ns) || (f == 0))
2484             // The path is empty, or else the original first segment survived,
2485             // in which case we already know that no leading "." is needed
2486             return;
2487 
2488         int p = segs[f];
2489         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2490         if (p >= path.length || path[p] == '\0')
2491             // No colon in first segment, so no "." needed
2492             return;
2493 
2494         // At this point we know that the first segment is unused,
2495         // hence we can insert a "." segment at that position
2496         path[0] = '.';
2497         path[1] = '\0';
2498         segs[0] = 0;
2499     }
2500 
2501 
2502     // Normalize the given path string.  A normal path string has no empty
2503     // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2504     // segments equal to ".." that are preceded by a segment not equal to "..".
2505     // In contrast to Unix-style pathname normalization, for URI paths we
2506     // always retain trailing slashes.
2507     //
2508     private static String normalize(String ps) {
2509 
2510         // Does this path need normalization?
2511         int ns = needsNormalization(ps);        // Number of segments
2512         if (ns < 0)
2513             // Nope -- just return it
2514             return ps;
2515 
2516         char[] path = ps.toCharArray();         // Path in char-array form
2517 
2518         // Split path into segments
2519         int[] segs = new int[ns];               // Segment-index array
2520         split(path, segs);
2521 
2522         // Remove dots
2523         removeDots(path, segs);
2524 
2525         // Prevent scheme-name confusion
2526         maybeAddLeadingDot(path, segs);
2527 
2528         // Join the remaining segments and return the result
2529         String s = new String(path, 0, join(path, segs));
2530         if (s.equals(ps)) {
2531             // string was already normalized
2532             return ps;
2533         }
2534         return s;
2535     }
2536 
2537 
2538 
2539     // -- Character classes for parsing --
2540 
2541     // RFC2396 precisely specifies which characters in the US-ASCII charset are
2542     // permissible in the various components of a URI reference.  We here
2543     // define a set of mask pairs to aid in enforcing these restrictions.  Each
2544     // mask pair consists of two longs, a low mask and a high mask.  Taken
2545     // together they represent a 128-bit mask, where bit i is set iff the
2546     // character with value i is permitted.
2547     //
2548     // This approach is more efficient than sequentially searching arrays of
2549     // permitted characters.  It could be made still more efficient by
2550     // precompiling the mask information so that a character's presence in a
2551     // given mask could be determined by a single table lookup.
2552 
2553     // To save startup time, we manually calculate the low-/highMask constants.
2554     // For reference, the following methods were used to calculate the values:
2555 
2556     // Compute the low-order mask for the characters in the given string
2557     //     private static long lowMask(String chars) {
2558     //        int n = chars.length();
2559     //        long m = 0;
2560     //        for (int i = 0; i < n; i++) {
2561     //            char c = chars.charAt(i);
2562     //            if (c < 64)
2563     //                m |= (1L << c);
2564     //        }
2565     //        return m;
2566     //    }
2567 
2568     // Compute the high-order mask for the characters in the given string
2569     //    private static long highMask(String chars) {
2570     //        int n = chars.length();
2571     //        long m = 0;
2572     //        for (int i = 0; i < n; i++) {
2573     //            char c = chars.charAt(i);
2574     //            if ((c >= 64) && (c < 128))
2575     //                m |= (1L << (c - 64));
2576     //        }
2577     //        return m;
2578     //    }
2579 
2580     // Compute a low-order mask for the characters
2581     // between first and last, inclusive
2582     //    private static long lowMask(char first, char last) {
2583     //        long m = 0;
2584     //        int f = Math.max(Math.min(first, 63), 0);
2585     //        int l = Math.max(Math.min(last, 63), 0);
2586     //        for (int i = f; i <= l; i++)
2587     //            m |= 1L << i;
2588     //        return m;
2589     //    }
2590 
2591     // Compute a high-order mask for the characters
2592     // between first and last, inclusive
2593     //    private static long highMask(char first, char last) {
2594     //        long m = 0;
2595     //        int f = Math.max(Math.min(first, 127), 64) - 64;
2596     //        int l = Math.max(Math.min(last, 127), 64) - 64;
2597     //        for (int i = f; i <= l; i++)
2598     //            m |= 1L << i;
2599     //        return m;
2600     //    }
2601 
2602     // Tell whether the given character is permitted by the given mask pair
2603     private static boolean match(char c, long lowMask, long highMask) {
2604         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
2605             return false;
2606         if (c < 64)
2607             return ((1L << c) & lowMask) != 0;
2608         if (c < 128)
2609             return ((1L << (c - 64)) & highMask) != 0;
2610         return false;
2611     }
2612 
2613     // Character-class masks, in reverse order from RFC2396 because
2614     // initializers for static fields cannot make forward references.
2615 
2616     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
2617     //            "8" | "9"
2618     private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9');
2619     private static final long H_DIGIT = 0L;
2620 
2621     // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
2622     //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
2623     //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
2624     private static final long L_UPALPHA = 0L;
2625     private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z');
2626 
2627     // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
2628     //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
2629     //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
2630     private static final long L_LOWALPHA = 0L;
2631     private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z');
2632 
2633     // alpha         = lowalpha | upalpha
2634     private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
2635     private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
2636 
2637     // alphanum      = alpha | digit
2638     private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
2639     private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
2640 
2641     // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
2642     //                         "a" | "b" | "c" | "d" | "e" | "f"
2643     private static final long L_HEX = L_DIGIT;
2644     private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f');
2645 
2646     // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
2647     //                 "(" | ")"
2648     private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");
2649     private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");
2650 
2651     // unreserved    = alphanum | mark
2652     private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
2653     private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
2654 
2655     // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
2656     //                 "$" | "," | "[" | "]"
2657     // Added per RFC2732: "[", "]"
2658     private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");
2659     private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");
2660 
2661     // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
2662     // characters are allowed; this is handled by the scanEscape method below.
2663     private static final long L_ESCAPED = 1L;
2664     private static final long H_ESCAPED = 0L;
2665 
2666     // uric          = reserved | unreserved | escaped
2667     private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
2668     private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
2669 
2670     // pchar         = unreserved | escaped |
2671     //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
2672     private static final long L_PCHAR
2673         = L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");
2674     private static final long H_PCHAR
2675         = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");
2676 
2677     // All valid path characters
2678     private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");
2679     private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;
2680 
2681     // Dash, for use in domainlabel and toplabel
2682     private static final long L_DASH = 0x200000000000L; // lowMask("-");
2683     private static final long H_DASH = 0x0L; // highMask("-");
2684 
2685     // Dot, for use in hostnames
2686     private static final long L_DOT = 0x400000000000L; // lowMask(".");
2687     private static final long H_DOT = 0x0L; // highMask(".");
2688 
2689     // userinfo      = *( unreserved | escaped |
2690     //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
2691     private static final long L_USERINFO
2692         = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");
2693     private static final long H_USERINFO
2694         = H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;
2695 
2696     // reg_name      = 1*( unreserved | escaped | "$" | "," |
2697     //                     ";" | ":" | "@" | "&" | "=" | "+" )
2698     private static final long L_REG_NAME
2699         = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");
2700     private static final long H_REG_NAME
2701         = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");
2702 
2703     // All valid characters for server-based authorities
2704     private static final long L_SERVER
2705         = L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");
2706     private static final long H_SERVER
2707         = H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");
2708 
2709     // Special case of server authority that represents an IPv6 address
2710     // In this case, a % does not signify an escape sequence
2711     private static final long L_SERVER_PERCENT
2712         = L_SERVER | 0x2000000000L; // lowMask("%");
2713     private static final long H_SERVER_PERCENT
2714         = H_SERVER; // | highMask("%") == 0L;
2715 
2716     // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
2717     private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-.");
2718     private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L
2719 
2720     // scope_id = alpha | digit | "_" | "."
2721     private static final long L_SCOPE_ID
2722         = L_ALPHANUM | 0x400000000000L; // lowMask("_.");
2723     private static final long H_SCOPE_ID
2724         = H_ALPHANUM | 0x80000000L; // highMask("_.");
2725 
2726     // -- Escaping and encoding --
2727 
2728     private static final char[] hexDigits = {
2729         '0', '1', '2', '3', '4', '5', '6', '7',
2730         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2731     };
2732 
2733     private static void appendEscape(StringBuilder sb, byte b) {
2734         sb.append('%');
2735         sb.append(hexDigits[(b >> 4) & 0x0f]);
2736         sb.append(hexDigits[(b >> 0) & 0x0f]);
2737     }
2738 
2739     private static void appendEncoded(StringBuilder sb, char c) {
2740         ByteBuffer bb = null;
2741         try {
2742             bb = ThreadLocalCoders.encoderFor("UTF-8")
2743                 .encode(CharBuffer.wrap("" + c));
2744         } catch (CharacterCodingException x) {
2745             assert false;
2746         }
2747         while (bb.hasRemaining()) {
2748             int b = bb.get() & 0xff;
2749             if (b >= 0x80)
2750                 appendEscape(sb, (byte)b);
2751             else
2752                 sb.append((char)b);
2753         }
2754     }
2755 
2756     // Quote any characters in s that are not permitted
2757     // by the given mask pair
2758     //
2759     private static String quote(String s, long lowMask, long highMask) {
2760         StringBuilder sb = null;
2761         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2762         for (int i = 0; i < s.length(); i++) {
2763             char c = s.charAt(i);
2764             if (c < '\u0080') {
2765                 if (!match(c, lowMask, highMask)) {
2766                     if (sb == null) {
2767                         sb = new StringBuilder();
2768                         sb.append(s, 0, i);
2769                     }
2770                     appendEscape(sb, (byte)c);
2771                 } else {
2772                     if (sb != null)
2773                         sb.append(c);
2774                 }
2775             } else if (allowNonASCII
2776                        && (Character.isSpaceChar(c)
2777                            || Character.isISOControl(c))) {
2778                 if (sb == null) {
2779                     sb = new StringBuilder();
2780                     sb.append(s, 0, i);
2781                 }
2782                 appendEncoded(sb, c);
2783             } else {
2784                 if (sb != null)
2785                     sb.append(c);
2786             }
2787         }
2788         return (sb == null) ? s : sb.toString();
2789     }
2790 
2791     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2792     // assuming that s is otherwise legal
2793     //
2794     private static String encode(String s) {
2795         int n = s.length();
2796         if (n == 0)
2797             return s;
2798 
2799         // First check whether we actually need to encode
2800         for (int i = 0;;) {
2801             if (s.charAt(i) >= '\u0080')
2802                 break;
2803             if (++i >= n)
2804                 return s;
2805         }
2806 
2807         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
2808         ByteBuffer bb = null;
2809         try {
2810             bb = ThreadLocalCoders.encoderFor("UTF-8")
2811                 .encode(CharBuffer.wrap(ns));
2812         } catch (CharacterCodingException x) {
2813             assert false;
2814         }
2815 
2816         StringBuilder sb = new StringBuilder();
2817         while (bb.hasRemaining()) {
2818             int b = bb.get() & 0xff;
2819             if (b >= 0x80)
2820                 appendEscape(sb, (byte)b);
2821             else
2822                 sb.append((char)b);
2823         }
2824         return sb.toString();
2825     }
2826 
2827     private static int decode(char c) {
2828         if ((c >= '0') && (c <= '9'))
2829             return c - '0';
2830         if ((c >= 'a') && (c <= 'f'))
2831             return c - 'a' + 10;
2832         if ((c >= 'A') && (c <= 'F'))
2833             return c - 'A' + 10;
2834         assert false;
2835         return -1;
2836     }
2837 
2838     private static byte decode(char c1, char c2) {
2839         return (byte)(  ((decode(c1) & 0xf) << 4)
2840                       | ((decode(c2) & 0xf) << 0));
2841     }
2842 
2843     // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
2844     // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
2845     // sequence of escaped octets is not valid UTF-8 then the erroneous octets
2846     // are replaced with '\uFFFD'.
2847     // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2848     //            with a scope_id
2849     //
2850     private static String decode(String s) {
2851         return decode(s, true);
2852     }
2853 
2854     // This method was introduced as a generalization of URI.decode method
2855     // to provide a fix for JDK-8037396
2856     private static String decode(String s, boolean ignorePercentInBrackets) {
2857         if (s == null)
2858             return s;
2859         int n = s.length();
2860         if (n == 0)
2861             return s;
2862         if (s.indexOf('%') < 0)
2863             return s;
2864 
2865         StringBuilder sb = new StringBuilder(n);
2866         ByteBuffer bb = ByteBuffer.allocate(n);
2867         CharBuffer cb = CharBuffer.allocate(n);
2868         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
2869                 .onMalformedInput(CodingErrorAction.REPLACE)
2870                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
2871 
2872         // This is not horribly efficient, but it will do for now
2873         char c = s.charAt(0);
2874         boolean betweenBrackets = false;
2875 
2876         for (int i = 0; i < n;) {
2877             assert c == s.charAt(i);    // Loop invariant
2878             if (c == '[') {
2879                 betweenBrackets = true;
2880             } else if (betweenBrackets && c == ']') {
2881                 betweenBrackets = false;
2882             }
2883             if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) {
2884                 sb.append(c);
2885                 if (++i >= n)
2886                     break;
2887                 c = s.charAt(i);
2888                 continue;
2889             }
2890             bb.clear();
2891             int ui = i;
2892             for (;;) {
2893                 assert (n - i >= 2);
2894                 bb.put(decode(s.charAt(++i), s.charAt(++i)));
2895                 if (++i >= n)
2896                     break;
2897                 c = s.charAt(i);
2898                 if (c != '%')
2899                     break;
2900             }
2901             bb.flip();
2902             cb.clear();
2903             dec.reset();
2904             CoderResult cr = dec.decode(bb, cb, true);
2905             assert cr.isUnderflow();
2906             cr = dec.flush(cb);
2907             assert cr.isUnderflow();
2908             sb.append(cb.flip().toString());
2909         }
2910 
2911         return sb.toString();
2912     }
2913 
2914 
2915     // -- Parsing --
2916 
2917     // For convenience we wrap the input URI string in a new instance of the
2918     // following internal class.  This saves always having to pass the input
2919     // string as an argument to each internal scan/parse method.
2920 
2921     private class Parser {
2922 
2923         private String input;           // URI input string
2924         private boolean requireServerAuthority = false;
2925 
2926         Parser(String s) {
2927             input = s;
2928             string = s;
2929         }
2930 
2931         // -- Methods for throwing URISyntaxException in various ways --
2932 
2933         private void fail(String reason) throws URISyntaxException {
2934             throw new URISyntaxException(input, reason);
2935         }
2936 
2937         private void fail(String reason, int p) throws URISyntaxException {
2938             throw new URISyntaxException(input, reason, p);
2939         }
2940 
2941         private void failExpecting(String expected, int p)
2942             throws URISyntaxException
2943         {
2944             fail("Expected " + expected, p);
2945         }
2946 
2947 
2948         // -- Simple access to the input string --
2949 
2950         // Tells whether start < end and, if so, whether charAt(start) == c
2951         //
2952         private boolean at(int start, int end, char c) {
2953             return (start < end) && (input.charAt(start) == c);
2954         }
2955 
        // Tells whether start + s.length() <= end and, if so,
        // whether the chars at the start position match s exactly
2958         //
2959         private boolean at(int start, int end, String s) {
2960             int p = start;
2961             int sn = s.length();
2962             if (sn > end - p)
2963                 return false;
2964             int i = 0;
2965             while (i < sn) {
2966                 if (input.charAt(p++) != s.charAt(i)) {
2967                     break;
2968                 }
2969                 i++;
2970             }
2971             return (i == sn);
2972         }
2973 
2974 
2975         // -- Scanning --
2976 
2977         // The various scan and parse methods that follow use a uniform
2978         // convention of taking the current start position and end index as
2979         // their first two arguments.  The start is inclusive while the end is
2980         // exclusive, just as in the String class, i.e., a start/end pair
        // denotes the half-open interval [start, end) of the input string.
2982         //
2983         // These methods never proceed past the end position.  They may return
2984         // -1 to indicate outright failure, but more often they simply return
2985         // the position of the first char after the last char scanned.  Thus
2986         // a typical idiom is
2987         //
2988         //     int p = start;
2989         //     int q = scan(p, end, ...);
2990         //     if (q > p)
2991         //         // We scanned something
2992         //         ...;
2993         //     else if (q == p)
2994         //         // We scanned nothing
2995         //         ...;
2996         //     else if (q == -1)
2997         //         // Something went wrong
2998         //         ...;
2999 
3000 
3001         // Scan a specific char: If the char at the given start position is
3002         // equal to c, return the index of the next char; otherwise, return the
3003         // start position.
3004         //
3005         private int scan(int start, int end, char c) {
3006             if ((start < end) && (input.charAt(start) == c))
3007                 return start + 1;
3008             return start;
3009         }
3010 
        // Scan forward from the given start position.  Stop at the first char
        // in the err string (in which case -1 is returned), or the first char
        // in the stop string (in which case the index of that char is
        // returned), or the end position (in which case the end position is
        // returned).  Returns the start position if the interval is empty or
        // its first char is in the stop string.
3017         //
3018         private int scan(int start, int end, String err, String stop) {
3019             int p = start;
3020             while (p < end) {
3021                 char c = input.charAt(p);
3022                 if (err.indexOf(c) >= 0)
3023                     return -1;
3024                 if (stop.indexOf(c) >= 0)
3025                     break;
3026                 p++;
3027             }
3028             return p;
3029         }
3030 
        // Scan forward from the given start position.  Stop at the first char
        // in the stop string (in which case the index of that char is
        // returned), or the end position (in which case the end position is
        // returned).  Returns the start position if the interval is empty or
        // its first char is in the stop string.
3036         //
3037         private int scan(int start, int end, String stop) {
3038             int p = start;
3039             while (p < end) {
3040                 char c = input.charAt(p);
3041                 if (stop.indexOf(c) >= 0)
3042                     break;
3043                 p++;
3044             }
3045             return p;
3046         }
3047 
3048         // Scan a potential escape sequence, starting at the given position,
3049         // with the given first char (i.e., charAt(start) == c).
3050         //
3051         // This method assumes that if escapes are allowed then visible
3052         // non-US-ASCII chars are also allowed.
3053         //
3054         private int scanEscape(int start, int n, char first)
3055             throws URISyntaxException
3056         {
3057             int p = start;
3058             char c = first;
3059             if (c == '%') {
3060                 // Process escape pair
3061                 if ((p + 3 <= n)
3062                     && match(input.charAt(p + 1), L_HEX, H_HEX)
3063                     && match(input.charAt(p + 2), L_HEX, H_HEX)) {
3064                     return p + 3;
3065                 }
3066                 fail("Malformed escape pair", p);
3067             } else if ((c > 128)
3068                        && !Character.isSpaceChar(c)
3069                        && !Character.isISOControl(c)) {
3070                 // Allow unescaped but visible non-US-ASCII chars
3071                 return p + 1;
3072             }
3073             return p;
3074         }
3075 
3076         // Scan chars that match the given mask pair
3077         //
3078         private int scan(int start, int n, long lowMask, long highMask)
3079             throws URISyntaxException
3080         {
3081             int p = start;
3082             while (p < n) {
3083                 char c = input.charAt(p);
3084                 if (match(c, lowMask, highMask)) {
3085                     p++;
3086                     continue;
3087                 }
3088                 if ((lowMask & L_ESCAPED) != 0) {
3089                     int q = scanEscape(p, n, c);
3090                     if (q > p) {
3091                         p = q;
3092                         continue;
3093                     }
3094                 }
3095                 break;
3096             }
3097             return p;
3098         }
3099 
3100         // Check that each of the chars in [start, end) matches the given mask
3101         //
3102         private void checkChars(int start, int end,
3103                                 long lowMask, long highMask,
3104                                 String what)
3105             throws URISyntaxException
3106         {
3107             int p = scan(start, end, lowMask, highMask);
3108             if (p < end)
3109                 fail("Illegal character in " + what, p);
3110         }
3111 
3112         // Check that the char at position p matches the given mask
3113         //
3114         private void checkChar(int p,
3115                                long lowMask, long highMask,
3116                                String what)
3117             throws URISyntaxException
3118         {
3119             checkChars(p, p + 1, lowMask, highMask, what);
3120         }
3121 
3122 
3123         // -- Parsing --
3124 
3125         // [<scheme>:]<scheme-specific-part>[#<fragment>]
3126         //
3127         void parse(boolean rsa) throws URISyntaxException {
3128             requireServerAuthority = rsa;
3129             int n = input.length();
3130             int p = scan(0, n, "/?#", ":");
3131             if ((p >= 0) && at(p, n, ':')) {
3132                 if (p == 0)
3133                     failExpecting("scheme name", 0);
3134                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3135                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3136                 scheme = input.substring(0, p);
3137                 p++;                    // Skip ':'
3138                 if (at(p, n, '/')) {
3139                     p = parseHierarchical(p, n);
3140                 } else {
3141                     // opaque; need to create the schemeSpecificPart
3142                     int q = scan(p, n, "#");
3143                     if (q <= p)
3144                         failExpecting("scheme-specific part", p);
3145                     checkChars(p, q, L_URIC, H_URIC, "opaque part");
3146                     schemeSpecificPart = input.substring(p, q);
3147                     p = q;
3148                 }
3149             } else {
3150                 p = parseHierarchical(0, n);
3151             }
3152             if (at(p, n, '#')) {
3153                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3154                 fragment = input.substring(p + 1, n);
3155                 p = n;
3156             }
3157             if (p < n)
3158                 fail("end of URI", p);
3159         }
3160 
3161         // [//authority]<path>[?<query>]
3162         //
3163         // DEVIATION from RFC2396: We allow an empty authority component as
3164         // long as it's followed by a non-empty path, query component, or
3165         // fragment component.  This is so that URIs such as "file:///foo/bar"
3166         // will parse.  This seems to be the intent of RFC2396, though the
3167         // grammar does not permit it.  If the authority is empty then the
3168         // userInfo, host, and port components are undefined.
3169         //
3170         // DEVIATION from RFC2396: We allow empty relative paths.  This seems
3171         // to be the intent of RFC2396, but the grammar does not permit it.
3172         // The primary consequence of this deviation is that "#f" parses as a
3173         // relative URI with an empty path.
3174         //
3175         private int parseHierarchical(int start, int n)
3176             throws URISyntaxException
3177         {
3178             int p = start;
3179             if (at(p, n, '/') && at(p + 1, n, '/')) {
3180                 p += 2;
3181                 int q = scan(p, n, "/?#");
3182                 if (q > p) {
3183                     p = parseAuthority(p, q);
3184                 } else if (q < n) {
3185                     // DEVIATION: Allow empty authority prior to non-empty
3186                     // path, query component or fragment identifier
3187                 } else
3188                     failExpecting("authority", p);
3189             }
3190             int q = scan(p, n, "?#"); // DEVIATION: May be empty
3191             checkChars(p, q, L_PATH, H_PATH, "path");
3192             path = input.substring(p, q);
3193             p = q;
3194             if (at(p, n, '?')) {
3195                 p++;
3196                 q = scan(p, n, "#");
3197                 checkChars(p, q, L_URIC, H_URIC, "query");
3198                 query = input.substring(p, q);
3199                 p = q;
3200             }
3201             return p;
3202         }
3203 
3204         // authority     = server | reg_name
3205         //
3206         // Ambiguity: An authority that is a registry name rather than a server
3207         // might have a prefix that parses as a server.  We use the fact that
3208         // the authority component is always followed by '/' or the end of the
3209         // input string to resolve this: If the complete authority did not
3210         // parse as a server then we try to parse it as a registry name.
3211         //
3212         private int parseAuthority(int start, int n)
3213             throws URISyntaxException
3214         {
3215             int p = start;
3216             int q = p;
3217             URISyntaxException ex = null;
3218 
3219             boolean serverChars;
3220             boolean regChars;
3221 
3222             if (scan(p, n, "]") > p) {
3223                 // contains a literal IPv6 address, therefore % is allowed
3224                 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
3225             } else {
3226                 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
3227             }
3228             regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
3229 
3230             if (regChars && !serverChars) {
3231                 // Must be a registry-based authority
3232                 authority = input.substring(p, n);
3233                 return n;
3234             }
3235 
3236             if (serverChars) {
3237                 // Might be (probably is) a server-based authority, so attempt
3238                 // to parse it as such.  If the attempt fails, try to treat it
3239                 // as a registry-based authority.
3240                 try {
3241                     q = parseServer(p, n);
3242                     if (q < n)
3243                         failExpecting("end of authority", q);
3244                     authority = input.substring(p, n);
3245                 } catch (URISyntaxException x) {
3246                     // Undo results of failed parse
3247                     userInfo = null;
3248                     host = null;
3249                     port = -1;
3250                     if (requireServerAuthority) {
3251                         // If we're insisting upon a server-based authority,
3252                         // then just re-throw the exception
3253                         throw x;
3254                     } else {
3255                         // Save the exception in case it doesn't parse as a
3256                         // registry either
3257                         ex = x;
3258                         q = p;
3259                     }
3260                 }
3261             }
3262 
3263             if (q < n) {
3264                 if (regChars) {
3265                     // Registry-based authority
3266                     authority = input.substring(p, n);
3267                 } else if (ex != null) {
3268                     // Re-throw exception; it was probably due to
3269                     // a malformed IPv6 address
3270                     throw ex;
3271                 } else {
3272                     fail("Illegal character in authority", q);
3273                 }
3274             }
3275 
3276             return n;
3277         }
3278 
3279 
3280         // [<userinfo>@]<host>[:<port>]
3281         //
3282         private int parseServer(int start, int n)
3283             throws URISyntaxException
3284         {
3285             int p = start;
3286             int q;
3287 
3288             // userinfo
3289             q = scan(p, n, "/?#", "@");
3290             if ((q >= p) && at(q, n, '@')) {
3291                 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3292                 userInfo = input.substring(p, q);
3293                 p = q + 1;              // Skip '@'
3294             }
3295 
3296             // hostname, IPv4 address, or IPv6 address
3297             if (at(p, n, '[')) {
3298                 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3299                 p++;
3300                 q = scan(p, n, "/?#", "]");
3301                 if ((q > p) && at(q, n, ']')) {
3302                     // look for a "%" scope id
3303                     int r = scan (p, q, "%");
3304                     if (r > p) {
3305                         parseIPv6Reference(p, r);
3306                         if (r+1 == q) {
3307                             fail ("scope id expected");
3308                         }
3309                         checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
3310                                                 "scope id");
3311                     } else {
3312                         parseIPv6Reference(p, q);
3313                     }
3314                     host = input.substring(p-1, q+1);
3315                     p = q + 1;
3316                 } else {
3317                     failExpecting("closing bracket for IPv6 address", q);
3318                 }
3319             } else {
3320                 q = parseIPv4Address(p, n);
3321                 if (q <= p)
3322                     q = parseHostname(p, n);
3323                 p = q;
3324             }
3325 
3326             // port
3327             if (at(p, n, ':')) {
3328                 p++;
3329                 q = scan(p, n, "/");
3330                 if (q > p) {
3331                     checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3332                     try {
3333                         port = Integer.parseInt(input, p, q, 10);
3334                     } catch (NumberFormatException x) {
3335                         fail("Malformed port number", p);
3336                     }
3337                     p = q;
3338                 }
3339             }
3340             if (p < n)
3341                 failExpecting("port number", p);
3342 
3343             return p;
3344         }
3345 
3346         // Scan a string of decimal digits whose value fits in a byte
3347         //
3348         private int scanByte(int start, int n)
3349             throws URISyntaxException
3350         {
3351             int p = start;
3352             int q = scan(p, n, L_DIGIT, H_DIGIT);
3353             if (q <= p) return q;
3354             if (Integer.parseInt(input, p, q, 10) > 255) return p;
3355             return q;
3356         }
3357 
3358         // Scan an IPv4 address.
3359         //
3360         // If the strict argument is true then we require that the given
3361         // interval contain nothing besides an IPv4 address; if it is false
3362         // then we only require that it start with an IPv4 address.
3363         //
        // If the interval does not contain or start with (depending upon the
        // strict argument) legal IPv4 address characters then we return -1
        // immediately; otherwise we insist that these characters parse as a
        // legal IPv4 address and throw an exception on failure.
3368         //
3369         // We assume that any string of decimal digits and dots must be an IPv4
3370         // address.  It won't parse as a hostname anyway, so making that
3371         // assumption here allows more meaningful exceptions to be thrown.
3372         //
3373         private int scanIPv4Address(int start, int n, boolean strict)
3374             throws URISyntaxException
3375         {
3376             int p = start;
3377             int q;
3378             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3379             if ((m <= p) || (strict && (m != n)))
3380                 return -1;
3381             for (;;) {
3382                 // Per RFC2732: At most three digits per byte
3383                 // Further constraint: Each element fits in a byte
3384                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3385                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3386                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3387                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3388                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3389                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3390                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3391                 if (q < m) break;
3392                 return q;
3393             }
3394             fail("Malformed IPv4 address", q);
3395             return -1;
3396         }
3397 
3398         // Take an IPv4 address: Throw an exception if the given interval
3399         // contains anything except an IPv4 address
3400         //
3401         private int takeIPv4Address(int start, int n, String expected)
3402             throws URISyntaxException
3403         {
3404             int p = scanIPv4Address(start, n, true);
3405             if (p <= start)
3406                 failExpecting(expected, start);
3407             return p;
3408         }
3409 
3410         // Attempt to parse an IPv4 address, returning -1 on failure but
3411         // allowing the given interval to contain [:<characters>] after
3412         // the IPv4 address.
3413         //
3414         private int parseIPv4Address(int start, int n) {
3415             int p;
3416 
3417             try {
3418                 p = scanIPv4Address(start, n, false);
3419             } catch (URISyntaxException x) {
3420                 return -1;
3421             } catch (NumberFormatException nfe) {
3422                 return -1;
3423             }
3424 
3425             if (p > start && p < n) {
3426                 // IPv4 address is followed by something - check that
3427                 // it's a ":" as this is the only valid character to
3428                 // follow an address.
3429                 if (input.charAt(p) != ':') {
3430                     p = -1;
3431                 }
3432             }
3433 
3434             if (p > start)
3435                 host = input.substring(start, p);
3436 
3437             return p;
3438         }
3439 
3440         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3441         // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
3442         // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
3443         //
3444         private int parseHostname(int start, int n)
3445             throws URISyntaxException
3446         {
3447             int p = start;
3448             int q;
3449             int l = -1;                 // Start of last parsed label
3450 
3451             do {
3452                 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3453                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3454                 if (q <= p)
3455                     break;
3456                 l = p;
3457                 if (q > p) {
3458                     p = q;
3459                     q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3460                     if (q > p) {
3461                         if (input.charAt(q - 1) == '-')
3462                             fail("Illegal character in hostname", q - 1);
3463                         p = q;
3464                     }
3465                 }
3466                 q = scan(p, n, '.');
3467                 if (q <= p)
3468                     break;
3469                 p = q;
3470             } while (p < n);
3471 
3472             if ((p < n) && !at(p, n, ':'))
3473                 fail("Illegal character in hostname", p);
3474 
3475             if (l < 0)
3476                 failExpecting("hostname", start);
3477 
3478             // for a fully qualified hostname check that the rightmost
3479             // label starts with an alpha character.
3480             if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
3481                 fail("Illegal character in hostname", l);
3482             }
3483 
3484             host = input.substring(start, p);
3485             return p;
3486         }
3487 
3488 
3489         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3490         //
3491         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3492         // the form ::12.34.56.78, which are clearly shown in the examples
3493         // earlier in the document.  Here is the original grammar:
3494         //
3495         //   IPv6address = hexpart [ ":" IPv4address ]
3496         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3497         //   hexseq      = hex4 *( ":" hex4)
3498         //   hex4        = 1*4HEXDIG
3499         //
3500         // We therefore use the following revised grammar:
3501         //
3502         //   IPv6address = hexseq [ ":" IPv4address ]
3503         //                 | hexseq [ "::" [ hexpost ] ]
3504         //                 | "::" [ hexpost ]
3505         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
3506         //   hexseq      = hex4 *( ":" hex4)
3507         //   hex4        = 1*4HEXDIG
3508         //
3509         // This covers all and only the following cases:
3510         //
3511         //   hexseq
3512         //   hexseq : IPv4address
3513         //   hexseq ::
3514         //   hexseq :: hexseq
3515         //   hexseq :: hexseq : IPv4address
3516         //   hexseq :: IPv4address
3517         //   :: hexseq
3518         //   :: hexseq : IPv4address
3519         //   :: IPv4address
3520         //   ::
3521         //
3522         // Additionally we constrain the IPv6 address as follows :-
3523         //
3524         //  i.  IPv6 addresses without compressed zeros should contain
3525         //      exactly 16 bytes.
3526         //
        //  ii. IPv6 addresses with compressed zeros should contain
        //      fewer than 16 bytes.
3529 
3530         private int ipv6byteCount = 0;
3531 
3532         private int parseIPv6Reference(int start, int n)
3533             throws URISyntaxException
3534         {
3535             int p = start;
3536             int q;
3537             boolean compressedZeros = false;
3538 
3539             q = scanHexSeq(p, n);
3540 
3541             if (q > p) {
3542                 p = q;
3543                 if (at(p, n, "::")) {
3544                     compressedZeros = true;
3545                     p = scanHexPost(p + 2, n);
3546                 } else if (at(p, n, ':')) {
3547                     p = takeIPv4Address(p + 1,  n, "IPv4 address");
3548                     ipv6byteCount += 4;
3549                 }
3550             } else if (at(p, n, "::")) {
3551                 compressedZeros = true;
3552                 p = scanHexPost(p + 2, n);
3553             }
3554             if (p < n)
3555                 fail("Malformed IPv6 address", start);
3556             if (ipv6byteCount > 16)
3557                 fail("IPv6 address too long", start);
3558             if (!compressedZeros && ipv6byteCount < 16)
3559                 fail("IPv6 address too short", start);
3560             if (compressedZeros && ipv6byteCount == 16)
3561                 fail("Malformed IPv6 address", start);
3562 
3563             return p;
3564         }
3565 
3566         private int scanHexPost(int start, int n)
3567             throws URISyntaxException
3568         {
3569             int p = start;
3570             int q;
3571 
3572             if (p == n)
3573                 return p;
3574 
3575             q = scanHexSeq(p, n);
3576             if (q > p) {
3577                 p = q;
3578                 if (at(p, n, ':')) {
3579                     p++;
3580                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3581                     ipv6byteCount += 4;
3582                 }
3583             } else {
3584                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3585                 ipv6byteCount += 4;
3586             }
3587             return p;
3588         }
3589 
3590         // Scan a hex sequence; return -1 if one could not be scanned
3591         //
3592         private int scanHexSeq(int start, int n)
3593             throws URISyntaxException
3594         {
3595             int p = start;
3596             int q;
3597 
3598             q = scan(p, n, L_HEX, H_HEX);
3599             if (q <= p)
3600                 return -1;
3601             if (at(q, n, '.'))          // Beginning of IPv4 address
3602                 return -1;
3603             if (q > p + 4)
3604                 fail("IPv6 hexadecimal digit sequence too long", p);
3605             ipv6byteCount += 2;
3606             p = q;
3607             while (p < n) {
3608                 if (!at(p, n, ':'))
3609                     break;
3610                 if (at(p + 1, n, ':'))
3611                     break;              // "::"
3612                 p++;
3613                 q = scan(p, n, L_HEX, H_HEX);
3614                 if (q <= p)
3615                     failExpecting("digits for an IPv6 address", p);
3616                 if (at(q, n, '.')) {    // Beginning of IPv4 address
3617                     p--;
3618                     break;
3619                 }
3620                 if (q > p + 4)
3621                     fail("IPv6 hexadecimal digit sequence too long", p);
3622                 ipv6byteCount += 2;
3623                 p = q;
3624             }
3625 
3626             return p;
3627         }
3628 
3629     }
3630     static {
3631         SharedSecrets.setJavaNetUriAccess(
3632             new JavaNetUriAccess() {
3633                 public URI create(String scheme, String path) {
3634                     return new URI(scheme, path);
3635                 }
3636             }
3637         );
3638     }
3639 }