1 /*
   2  * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.net;
  27 
  28 import java.io.IOException;
  29 import java.io.InvalidObjectException;
  30 import java.io.ObjectInputStream;
  31 import java.io.ObjectOutputStream;
  32 import java.io.Serializable;
  33 import java.nio.ByteBuffer;
  34 import java.nio.CharBuffer;
  35 import java.nio.charset.CharsetDecoder;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.CharacterCodingException;
  39 import java.text.Normalizer;
  40 import jdk.internal.misc.JavaNetUriAccess;
  41 import jdk.internal.misc.SharedSecrets;
  42 import sun.nio.cs.ThreadLocalCoders;
  43 
  44 import java.lang.Character;             // for javadoc
  45 import java.lang.NullPointerException;  // for javadoc
  46 
  47 
  48 /**
  49  * Represents a Uniform Resource Identifier (URI) reference.
  50  *
  51  * <p> Aside from some minor deviations noted below, an instance of this
  52  * class represents a URI reference as defined by
  53  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
  54  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
  55  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
  56  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
  57  * also supports scope_ids. The syntax and usage of scope_ids is described
  58  * <a href="Inet6Address.html#scoped">here</a>.
  59  * This class provides constructors for creating URI instances from
  60  * their components or by parsing their string forms, methods for accessing the
  61  * various components of an instance, and methods for normalizing, resolving,
  62  * and relativizing URI instances.  Instances of this class are immutable.
  63  *
  64  *
  65  * <h3> URI syntax and components </h3>
  66  *
  67  * At the highest level a URI reference (hereinafter simply "URI") in string
  68  * form has the syntax
  69  *
  70  * <blockquote>
  71  * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>]
  72  * </blockquote>
  73  *
  74  * where square brackets [...] delineate optional components and the characters
  75  * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves.
  76  *
  77  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
  78  * said to be <i>relative</i>.  URIs are also classified according to whether
  79  * they are <i>opaque</i> or <i>hierarchical</i>.
  80  *
  81  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
  82  * not begin with a slash character ({@code '/'}).  Opaque URIs are not
  83  * subject to further parsing.  Some examples of opaque URIs are:
  84  *
  85  * <blockquote><ul style="list-style-type:none">
  86  * <li>{@code mailto:java-net@java.sun.com}</li>
  87  * <li>{@code news:comp.lang.java}</li>
  88  * <li>{@code urn:isbn:096139210x}</li>
  89  * </ul></blockquote>
  90  *
  91  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
  92  * scheme-specific part begins with a slash character, or a relative URI, that
  93  * is, a URI that does not specify a scheme.  Some examples of hierarchical
  94  * URIs are:
  95  *
  96  * <blockquote>
  97  * {@code http://example.com/languages/java/}<br>
  98  * {@code sample/a/index.html#28}<br>
  99  * {@code ../../demo/b/index.html}<br>
 100  * {@code file:///~/calendar}
 101  * </blockquote>
 102  *
 103  * <p> A hierarchical URI is subject to further parsing according to the syntax
 104  *
 105  * <blockquote>
 106  * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
 107  * </blockquote>
 108  *
 109  * where the characters <b>{@code :}</b>, <b>{@code /}</b>,
 110  * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves.  The
 111  * scheme-specific part of a hierarchical URI consists of the characters
 112  * between the scheme and fragment components.
 113  *
 114  * <p> The authority component of a hierarchical URI is, if specified, either
 115  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
 116  * parses according to the familiar syntax
 117  *
 118  * <blockquote>
 119  * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]
 120  * </blockquote>
 121  *
 122  * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for
 123  * themselves.  Nearly all URI schemes currently in use are server-based.  An
 124  * authority component that does not parse in this way is considered to be
 125  * registry-based.
 126  *
 127  * <p> The path component of a hierarchical URI is itself said to be absolute
 128  * if it begins with a slash character ({@code '/'}); otherwise it is
 129  * relative.  The path of a hierarchical URI that is either absolute or
 130  * specifies an authority is always absolute.
 131  *
 132  * <p> All told, then, a URI instance has the following nine components:
 133  *
 134  * <table class="striped" style="margin-left:2em">
 135  * <caption style="display:none">Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment</caption>
 136  * <thead>
 137  * <tr><th scope="col">Component</th><th scope="col">Type</th></tr>
 138  * </thead>
 139  * <tbody style="text-align:left">
 140  * <tr><th scope="row">scheme</th><td>{@code String}</td></tr>
 141  * <tr><th scope="row">scheme-specific-part</th><td>{@code String}</td></tr>
 142  * <tr><th scope="row">authority</th><td>{@code String}</td></tr>
 143  * <tr><th scope="row">user-info</th><td>{@code String}</td></tr>
 144  * <tr><th scope="row">host</th><td>{@code String}</td></tr>
 145  * <tr><th scope="row">port</th><td>{@code int}</td></tr>
 146  * <tr><th scope="row">path</th><td>{@code String}</td></tr>
 147  * <tr><th scope="row">query</th><td>{@code String}</td></tr>
 148  * <tr><th scope="row">fragment</th><td>{@code String}</td></tr>
 149  * </tbody>
 150  * </table>
 151  *
 152  * In a given instance any particular component is either <i>undefined</i> or
 153  * <i>defined</i> with a distinct value.  Undefined string components are
 154  * represented by {@code null}, while undefined integer components are
 155  * represented by {@code -1}.  A string component may be defined to have the
 156  * empty string as its value; this is not equivalent to that component being
 157  * undefined.
 158  *
 159  * <p> Whether a particular component is or is not defined in an instance
 160  * depends upon the type of the URI being represented.  An absolute URI has a
 161  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
 162  * possibly a fragment, but has no other components.  A hierarchical URI always
 163  * has a path (though it may be empty) and a scheme-specific-part (which at
 164  * least contains the path), and may have any of the other components.  If the
 165  * authority component is present and is server-based then the host component
 166  * will be defined and the user-information and port components may be defined.
 167  *
 168  *
 169  * <h4> Operations on URI instances </h4>
 170  *
 171  * The key operations supported by this class are those of
 172  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
 173  *
 174  * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."}
 175  * and {@code ".."} segments from the path component of a hierarchical URI.
 176  * Each {@code "."} segment is simply removed.  A {@code ".."} segment is
 177  * removed only if it is preceded by a non-{@code ".."} segment.
 178  * Normalization has no effect upon opaque URIs.
 179  *
 180  * <p> <i>Resolution</i> is the process of resolving one URI against another,
 181  * <i>base</i> URI.  The resulting URI is constructed from components of both
 182  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
 183  * base URI for those not specified in the original.  For hierarchical URIs,
 184  * the path of the original is resolved against the path of the base and then
 185  * normalized.  The result, for example, of resolving
 186  *
 187  * <blockquote>
 188  * {@code sample/a/index.html#28}
 189  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 190  * &nbsp;&nbsp;&nbsp;&nbsp;(1)
 191  * </blockquote>
 192  *
 193  * against the base URI {@code http://example.com/languages/java/} is the result
 194  * URI
 195  *
 196  * <blockquote>
 197  * {@code http://example.com/languages/java/sample/a/index.html#28}
 198  * </blockquote>
 199  *
 200  * Resolving the relative URI
 201  *
 202  * <blockquote>
 203  * {@code ../../demo/b/index.html}&nbsp;&nbsp;&nbsp;&nbsp;(2)
 204  * </blockquote>
 205  *
 206  * against this result yields, in turn,
 207  *
 208  * <blockquote>
 209  * {@code http://example.com/languages/java/demo/b/index.html}
 210  * </blockquote>
 211  *
 212  * Resolution of both absolute and relative URIs, and of both absolute and
 213  * relative paths in the case of hierarchical URIs, is supported.  Resolving
 214  * the URI {@code file:///~calendar} against any other URI simply yields the
 215  * original URI, since it is absolute.  Resolving the relative URI (2) above
 216  * against the relative base URI (1) yields the normalized, but still relative,
 217  * URI
 218  *
 219  * <blockquote>
 220  * {@code demo/b/index.html}
 221  * </blockquote>
 222  *
 223  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
 224  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
 225  *
 226  * <blockquote>
 227  *   <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;and<br>
 228  *   <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;.<br>
 229  * </blockquote>
 230  *
 231  * This operation is often useful when constructing a document containing URIs
 232  * that must be made relative to the base URI of the document wherever
 233  * possible.  For example, relativizing the URI
 234  *
 235  * <blockquote>
 236  * {@code http://example.com/languages/java/sample/a/index.html#28}
 237  * </blockquote>
 238  *
 239  * against the base URI
 240  *
 241  * <blockquote>
 242  * {@code http://example.com/languages/java/}
 243  * </blockquote>
 244  *
 245  * yields the relative URI {@code sample/a/index.html#28}.
 246  *
 247  *
 248  * <h4> Character categories </h4>
 249  *
 250  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
 251  * various components of a URI reference.  The following categories, most of
 252  * which are taken from that specification, are used below to describe these
 253  * constraints:
 254  *
 255  * <table class="striped" style="margin-left:2em">
 256  * <caption style="display:none">Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other</caption>
 257  *   <thead>
 258  *   <tr><th scope="col">Category</th><th scope="col">Description</th></tr>
 259  *   </thead>
 260  *   <tbody style="text-align:left">
 261  *   <tr><th scope="row" style="vertical-align:top">alpha</th>
 262  *       <td>The US-ASCII alphabetic characters,
 263  *        {@code 'A'}&nbsp;through&nbsp;{@code 'Z'}
 264  *        and {@code 'a'}&nbsp;through&nbsp;{@code 'z'}</td></tr>
 265  *   <tr><th scope="row" style="vertical-align:top">digit</th>
 266  *       <td>The US-ASCII decimal digit characters,
 267  *       {@code '0'}&nbsp;through&nbsp;{@code '9'}</td></tr>
 268  *   <tr><th scope="row" style="vertical-align:top">alphanum</th>
 269  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
 270  *   <tr><th scope="row" style="vertical-align:top">unreserved</th>
 271  *       <td>All <i>alphanum</i> characters together with those in the string
 272  *        {@code "_-!.~'()*"}</td></tr>
 273  *   <tr><th scope="row" style="vertical-align:top">punct</th>
 274  *       <td>The characters in the string {@code ",;:$&+="}</td></tr>
 275  *   <tr><th scope="row" style="vertical-align:top">reserved</th>
 276  *       <td>All <i>punct</i> characters together with those in the string
 277  *        {@code "?/[]@"}</td></tr>
 278  *   <tr><th scope="row" style="vertical-align:top">escaped</th>
 279  *       <td>Escaped octets, that is, triplets consisting of the percent
 280  *           character ({@code '%'}) followed by two hexadecimal digits
 281  *           ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and
 282  *           {@code 'a'}-{@code 'f'})</td></tr>
 283  *   <tr><th scope="row" style="vertical-align:top">other</th>
 284  *       <td>The Unicode characters that are not in the US-ASCII character set,
 285  *           are not control characters (according to the {@link
 286  *           java.lang.Character#isISOControl(char) Character.isISOControl}
 287  *           method), and are not space characters (according to the {@link
 288  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
 289  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
 290  *           limited to US-ASCII)</i></td></tr>
 291  * </tbody>
 292  * </table>
 293  *
 294  * <p><a id="legal-chars"></a> The set of all legal URI characters consists of
 295  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
 296  * characters.
 297  *
 298  *
 299  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
 300  *
 301  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
 302  * fragment components.  Escaping serves two purposes in URIs:
 303  *
 304  * <ul>
 305  *
 306  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
 307  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
 308  *   characters.  </p></li>
 309  *
 310  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
 311  *   component.  The user-info, path, query, and fragment components differ
 312  *   slightly in terms of which characters are considered legal and illegal.
 313  *   </p></li>
 314  *
 315  * </ul>
 316  *
 317  * These purposes are served in this class by three related operations:
 318  *
 319  * <ul>
 320  *
 321  *   <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it
 322  *   with the sequence of escaped octets that represent that character in the
 323  *   UTF-8 character set.  The Euro currency symbol ({@code '\u005Cu20AC'}),
 324  *   for example, is encoded as {@code "%E2%82%AC"}.  <i>(<b>Deviation from
 325  *   RFC&nbsp;2396</b>, which does not specify any particular character
 326  *   set.)</i> </p></li>
 327  *
 328  *   <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by
 329  *   encoding it.  The space character, for example, is quoted by replacing it
 330  *   with {@code "%20"}.  UTF-8 contains US-ASCII, hence for US-ASCII
 331  *   characters this transformation has exactly the effect required by
 332  *   RFC&nbsp;2396. </p></li>
 333  *
 334  *   <li><p><a id="decode"></a>
 335  *   A sequence of escaped octets is <i>decoded</i> by
 336  *   replacing it with the sequence of characters that it represents in the
 337  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
 338  *   effect of de-quoting any quoted US-ASCII characters as well as that of
 339  *   decoding any encoded non-US-ASCII characters.  If a <a
 340  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
 341  *   when decoding the escaped octets then the erroneous octets are replaced by
 342  *   {@code '\u005CuFFFD'}, the Unicode replacement character.  </p></li>
 343  *
 344  * </ul>
 345  *
 346  * These operations are exposed in the constructors and methods of this class
 347  * as follows:
 348  *
 349  * <ul>
 350  *
 351  *   <li><p> The {@linkplain #URI(java.lang.String) single-argument
 352  *   constructor} requires any illegal characters in its argument to be
 353  *   quoted and preserves any escaped octets and <i>other</i> characters that
 354  *   are present.  </p></li>
 355  *
 356  *   <li><p> The {@linkplain
 357  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
 358  *   multi-argument constructors} quote illegal characters as
 359  *   required by the components in which they appear.  The percent character
 360  *   ({@code '%'}) is always quoted by these constructors.  Any <i>other</i>
 361  *   characters are preserved.  </p></li>
 362  *
 363  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
 364  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
 365  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
 366  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
 367  *   values of their corresponding components in raw form, without interpreting
 368  *   any escaped octets.  The strings returned by these methods may contain
 369  *   both escaped octets and <i>other</i> characters, and will not contain any
 370  *   illegal characters.  </p></li>
 371  *
 372  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
 373  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
 374  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
 375  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
 376  *   octets in their corresponding components.  The strings returned by these
 377  *   methods may contain both <i>other</i> characters and illegal characters,
 378  *   and will not contain any escaped octets.  </p></li>
 379  *
 380  *   <li><p> The {@link #toString() toString} method returns a URI string with
 381  *   all necessary quotation but which may contain <i>other</i> characters.
 382  *   </p></li>
 383  *
 384  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
 385  *   quoted and encoded URI string that does not contain any <i>other</i>
 386  *   characters.  </p></li>
 387  *
 388  * </ul>
 389  *
 390  *
 391  * <h4> Identities </h4>
 392  *
 393  * For any URI <i>u</i>, it is always the case that
 394  *
 395  * <blockquote>
 396  * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )}&nbsp;.
 397  * </blockquote>
 398  *
 399  * For any URI <i>u</i> that does not contain redundant syntax such as two
 400  * slashes before an empty authority (as in {@code file:///tmp/}&nbsp;) or a
 401  * colon following a host name but no port (as in
 402  * {@code http://java.sun.com:}&nbsp;), and that does not encode characters
 403  * except those that must be quoted, the following identities also hold:
 404  * <pre>
 405  *     new URI(<i>u</i>.getScheme(),
 406  *             <i>u</i>.getSchemeSpecificPart(),
 407  *             <i>u</i>.getFragment())
 408  *     .equals(<i>u</i>)</pre>
 409  * in all cases,
 410  * <pre>
 411  *     new URI(<i>u</i>.getScheme(),
 412  *             <i>u</i>.getAuthority(),
 413  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 414  *             <i>u</i>.getFragment())
 415  *     .equals(<i>u</i>)</pre>
 416  * if <i>u</i> is hierarchical, and
 417  * <pre>
 418  *     new URI(<i>u</i>.getScheme(),
 419  *             <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(),
 420  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 421  *             <i>u</i>.getFragment())
 422  *     .equals(<i>u</i>)</pre>
 423  * if <i>u</i> is hierarchical and has either no authority or a server-based
 424  * authority.
 425  *
 426  *
 427  * <h4> URIs, URLs, and URNs </h4>
 428  *
 429  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
 430  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
 431  * not every URI is a URL.  This is because there is another subcategory of
 432  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
 433  * specify how to locate them.  The {@code mailto}, {@code news}, and
 434  * {@code isbn} URIs shown above are examples of URNs.
 435  *
 436  * <p> The conceptual distinction between URIs and URLs is reflected in the
 437  * differences between this class and the {@link URL} class.
 438  *
 439  * <p> An instance of this class represents a URI reference in the syntactic
 440  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
 441  * A URI string is parsed according to the generic syntax without regard to the
 442  * scheme, if any, that it specifies.  No lookup of the host, if any, is
 443  * performed, and no scheme-dependent stream handler is constructed.  Equality,
 444  * hashing, and comparison are defined strictly in terms of the character
 445  * content of the instance.  In other words, a URI instance is little more than
 446  * a structured string that supports the syntactic, scheme-independent
 447  * operations of comparison, normalization, resolution, and relativization.
 448  *
 449  * <p> An instance of the {@link URL} class, by contrast, represents the
 450  * syntactic components of a URL together with some of the information required
 451  * to access the resource that it describes.  A URL must be absolute, that is,
 452  * it must always specify a scheme.  A URL string is parsed according to its
 453  * scheme.  A stream handler is always established for a URL, and in fact it is
 454  * impossible to create a URL instance for a scheme for which no handler is
 455  * available.  Equality and hashing depend upon both the scheme and the
 456  * Internet address of the host, if any; comparison is not defined.  In other
 457  * words, a URL is a structured string that supports the syntactic operation of
 458  * resolution as well as the network I/O operations of looking up the host and
 459  * opening a connection to the specified resource.
 460  *
 461  *
 462  * @author Mark Reinhold
 463  * @since 1.4
 464  *
 465  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
 466  * transformation format of ISO 10646</i></a>, <br><a
 467  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
 468  * Architecture</i></a>, <br><a
 469  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
 470  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
 471  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
 472  * Literal IPv6 Addresses in URLs</i></a>, <br><a
 473  * href="URISyntaxException.html">URISyntaxException</a>
 474  */
 475 
 476 public final class URI
 477     implements Comparable<URI>, Serializable
 478 {
 479 
 480     // Note: Comments containing the word "ASSERT" indicate places where a
 481     // throw of an InternalError should be replaced by an appropriate assertion
 482     // statement once asserts are enabled in the build.
 483 
    // Explicit serial version identifier for this Serializable class.
    static final long serialVersionUID = -6052424284110960213L;


    // -- Properties and components of this instance --
    //
    // String components use null to mean "undefined"; the integer port uses -1.
    // Every component field is transient: the string form declared at the end
    // of this section is the only field that is not.

    // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
    private transient String scheme;            // null ==> relative URI
    private transient String fragment;          // null ==> undefined

    // Hierarchical URI components: [//<authority>]<path>[?<query>]
    private transient String authority;         // Registry or server

    // Server-based authority: [<userInfo>@]<host>[:<port>]
    private transient String userInfo;          // null ==> undefined
    private transient String host;              // null ==> registry-based
    private transient int port = -1;            // -1 ==> undefined

    // Remaining components of hierarchical URIs
    private transient String path;              // null ==> opaque
    private transient String query;             // null ==> undefined

    // The remaining fields may be computed on demand, which is safe even in
    // the face of multiple threads racing to initialize them
    private transient String schemeSpecificPart;
    private transient int hash;        // Zero ==> undefined

    // Decoded forms of the components above; presumably filled in lazily by
    // the decoding accessors (their use is outside this section -- confirm).
    private transient String decodedUserInfo;
    private transient String decodedAuthority;
    private transient String decodedPath;
    private transient String decodedQuery;
    private transient String decodedFragment;
    private transient String decodedSchemeSpecificPart;

    /**
     * The string form of this URI.
     *
     * @serial
     */
    private volatile String string;             // The only serializable field
 523 
 524 
 525 
 526     // -- Constructors and factories --
 527 
    // Used internally.  The body is empty, so a new instance starts with every
    // field at its declared default: port is -1 and all other fields are unset.
    private URI() { }
 529 
 530     /**
 531      * Constructs a URI by parsing the given string.
 532      *
 533      * <p> This constructor parses the given string exactly as specified by the
 534      * grammar in <a
 535      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 536      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
 537      *
 538      * <ul>
 539      *
 540      *   <li><p> An empty authority component is permitted as long as it is
 541      *   followed by a non-empty path, a query component, or a fragment
 542      *   component.  This allows the parsing of URIs such as
 543      *   {@code "file:///foo/bar"}, which seems to be the intent of
 544      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
 545      *   authority component is empty then the user-information, host, and port
 546      *   components are undefined. </p></li>
 547      *
     *   <li><p> Empty relative paths are permitted; this seems to be the
     *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
     *   primary consequence of this deviation is that a standalone fragment
     *   such as {@code "#foo"} parses as a relative URI with an empty path
     *   and the given fragment, and can be usefully <a
     *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
 554      *
 555      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
 556      *   specified by <a
 557      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
 558      *   element of a dotted-quad address must contain no more than three
 559      *   decimal digits.  Each element is further constrained to have a value
 560      *   no greater than 255. </p></li>
 561      *
 562      *   <li> <p> Hostnames in host components that comprise only a single
 563      *   domain label are permitted to start with an <i>alphanum</i>
 564      *   character. This seems to be the intent of <a
 565      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 566      *   section&nbsp;3.2.2 although the grammar does not permit it. The
 567      *   consequence of this deviation is that the authority component of a
 568      *   hierarchical URI such as {@code s://123}, will parse as a server-based
 569      *   authority. </p></li>
 570      *
 571      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
 572      *   address must be enclosed in square brackets ({@code '['} and
 573      *   {@code ']'}) as specified by <a
 574      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
 575      *   IPv6 address itself must parse according to <a
 576      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
 577      *   addresses are further constrained to describe no more than sixteen
 578      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
 579      *   but not expressible in the grammar. </p></li>
 580      *
 581      *   <li><p> Characters in the <i>other</i> category are permitted wherever
 582      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
 583      *   user-information, path, query, and fragment components, as well as in
 584      *   the authority component if the authority is registry-based.  This
 585      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
 586      *   character set. </p></li>
 587      *
 588      * </ul>
 589      *
 590      * @param  str   The string to be parsed into a URI
 591      *
 592      * @throws  NullPointerException
 593      *          If {@code str} is {@code null}
 594      *
 595      * @throws  URISyntaxException
 596      *          If the given string violates RFC&nbsp;2396, as augmented
 597      *          by the above deviations
 598      */
 599     public URI(String str) throws URISyntaxException {
 600         new Parser(str).parse(false);
 601     }
 602 
 603     /**
 604      * Constructs a hierarchical URI from the given components.
 605      *
 606      * <p> If a scheme is given then the path, if also given, must either be
 607      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 608      * component of the new URI may be left undefined by passing {@code null}
 609      * for the corresponding parameter or, in the case of the {@code port}
 610      * parameter, by passing {@code -1}.
 611      *
 612      * <p> This constructor first builds a URI string from the given components
 613      * according to the rules specified in <a
 614      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 615      * section&nbsp;5.2, step&nbsp;7: </p>
 616      *
 617      * <ol>
 618      *
 619      *   <li><p> Initially, the result string is empty. </p></li>
 620      *
 621      *   <li><p> If a scheme is given then it is appended to the result,
 622      *   followed by a colon character ({@code ':'}).  </p></li>
 623      *
 624      *   <li><p> If user information, a host, or a port are given then the
 625      *   string {@code "//"} is appended.  </p></li>
 626      *
 627      *   <li><p> If user information is given then it is appended, followed by
 628      *   a commercial-at character ({@code '@'}).  Any character not in the
 629      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 630      *   categories is <a href="#quote">quoted</a>.  </p></li>
 631      *
 632      *   <li><p> If a host is given then it is appended.  If the host is a
 633      *   literal IPv6 address but is not enclosed in square brackets
 634      *   ({@code '['} and {@code ']'}) then the square brackets are added.
 635      *   </p></li>
 636      *
 637      *   <li><p> If a port number is given then a colon character
 638      *   ({@code ':'}) is appended, followed by the port number in decimal.
 639      *   </p></li>
 640      *
 641      *   <li><p> If a path is given then it is appended.  Any character not in
 642      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 643      *   categories, and not equal to the slash character ({@code '/'}) or the
 644      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 645      *
 646      *   <li><p> If a query is given then a question-mark character
 647      *   ({@code '?'}) is appended, followed by the query.  Any character that
 648      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 649      *   </p></li>
 650      *
 651      *   <li><p> Finally, if a fragment is given then a hash character
 652      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 653      *   that is not a legal URI character is quoted.  </p></li>
 654      *
 655      * </ol>
 656      *
 657      * <p> The resulting URI string is then parsed as if by invoking the {@link
 658      * #URI(String)} constructor and then invoking the {@link
 659      * #parseServerAuthority()} method upon the result; this may cause a {@link
 660      * URISyntaxException} to be thrown.  </p>
 661      *
 662      * @param   scheme    Scheme name
 663      * @param   userInfo  User name and authorization information
 664      * @param   host      Host name
 665      * @param   port      Port number
 666      * @param   path      Path
 667      * @param   query     Query
 668      * @param   fragment  Fragment
 669      *
 670      * @throws URISyntaxException
 671      *         If both a scheme and a path are given but the path is relative,
 672      *         if the URI string constructed from the given components violates
 673      *         RFC&nbsp;2396, or if the authority component of the string is
 674      *         present but cannot be parsed as a server-based authority
 675      */
 676     public URI(String scheme,
 677                String userInfo, String host, int port,
 678                String path, String query, String fragment)
 679         throws URISyntaxException
 680     {
 681         String s = toString(scheme, null,
 682                             null, userInfo, host, port,
 683                             path, query, fragment);
 684         checkPath(s, scheme, path);
 685         new Parser(s).parse(true);
 686     }
 687 
 688     /**
 689      * Constructs a hierarchical URI from the given components.
 690      *
 691      * <p> If a scheme is given then the path, if also given, must either be
 692      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 693      * component of the new URI may be left undefined by passing {@code null}
 694      * for the corresponding parameter.
 695      *
 696      * <p> This constructor first builds a URI string from the given components
 697      * according to the rules specified in <a
 698      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 699      * section&nbsp;5.2, step&nbsp;7: </p>
 700      *
 701      * <ol>
 702      *
 703      *   <li><p> Initially, the result string is empty.  </p></li>
 704      *
 705      *   <li><p> If a scheme is given then it is appended to the result,
 706      *   followed by a colon character ({@code ':'}).  </p></li>
 707      *
 708      *   <li><p> If an authority is given then the string {@code "//"} is
 709      *   appended, followed by the authority.  If the authority contains a
 710      *   literal IPv6 address then the address must be enclosed in square
 711      *   brackets ({@code '['} and {@code ']'}).  Any character not in the
 712      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 713      *   categories, and not equal to the commercial-at character
 714      *   ({@code '@'}), is <a href="#quote">quoted</a>.  </p></li>
 715      *
 716      *   <li><p> If a path is given then it is appended.  Any character not in
 717      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 718      *   categories, and not equal to the slash character ({@code '/'}) or the
 719      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 720      *
 721      *   <li><p> If a query is given then a question-mark character
 722      *   ({@code '?'}) is appended, followed by the query.  Any character that
 723      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 724      *   </p></li>
 725      *
 726      *   <li><p> Finally, if a fragment is given then a hash character
 727      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 728      *   that is not a legal URI character is quoted.  </p></li>
 729      *
 730      * </ol>
 731      *
 732      * <p> The resulting URI string is then parsed as if by invoking the {@link
 733      * #URI(String)} constructor and then invoking the {@link
 734      * #parseServerAuthority()} method upon the result; this may cause a {@link
 735      * URISyntaxException} to be thrown.  </p>
 736      *
 737      * @param   scheme     Scheme name
 738      * @param   authority  Authority
 739      * @param   path       Path
 740      * @param   query      Query
 741      * @param   fragment   Fragment
 742      *
 743      * @throws URISyntaxException
 744      *         If both a scheme and a path are given but the path is relative,
 745      *         if the URI string constructed from the given components violates
 746      *         RFC&nbsp;2396, or if the authority component of the string is
 747      *         present but cannot be parsed as a server-based authority
 748      */
 749     public URI(String scheme,
 750                String authority,
 751                String path, String query, String fragment)
 752         throws URISyntaxException
 753     {
 754         String s = toString(scheme, null,
 755                             authority, null, null, -1,
 756                             path, query, fragment);
 757         checkPath(s, scheme, path);
 758         new Parser(s).parse(false);
 759     }
 760 
 761     /**
 762      * Constructs a hierarchical URI from the given components.
 763      *
 764      * <p> A component may be left undefined by passing {@code null}.
 765      *
 766      * <p> This convenience constructor works as if by invoking the
 767      * seven-argument constructor as follows:
 768      *
 769      * <blockquote>
 770      * {@code new} {@link #URI(String, String, String, int, String, String, String)
 771      * URI}{@code (scheme, null, host, -1, path, null, fragment);}
 772      * </blockquote>
 773      *
 774      * @param   scheme    Scheme name
 775      * @param   host      Host name
 776      * @param   path      Path
 777      * @param   fragment  Fragment
 778      *
 779      * @throws  URISyntaxException
 780      *          If the URI string constructed from the given components
 781      *          violates RFC&nbsp;2396
 782      */
 783     public URI(String scheme, String host, String path, String fragment)
 784         throws URISyntaxException
 785     {
 786         this(scheme, null, host, -1, path, null, fragment);
 787     }
 788 
 789     /**
 790      * Constructs a URI from the given components.
 791      *
 792      * <p> A component may be left undefined by passing {@code null}.
 793      *
 794      * <p> This constructor first builds a URI in string form using the given
 795      * components as follows:  </p>
 796      *
 797      * <ol>
 798      *
 799      *   <li><p> Initially, the result string is empty.  </p></li>
 800      *
 801      *   <li><p> If a scheme is given then it is appended to the result,
 802      *   followed by a colon character ({@code ':'}).  </p></li>
 803      *
 804      *   <li><p> If a scheme-specific part is given then it is appended.  Any
 805      *   character that is not a <a href="#legal-chars">legal URI character</a>
 806      *   is <a href="#quote">quoted</a>.  </p></li>
 807      *
 808      *   <li><p> Finally, if a fragment is given then a hash character
 809      *   ({@code '#'}) is appended to the string, followed by the fragment.
 810      *   Any character that is not a legal URI character is quoted.  </p></li>
 811      *
 812      * </ol>
 813      *
 814      * <p> The resulting URI string is then parsed in order to create the new
 815      * URI instance as if by invoking the {@link #URI(String)} constructor;
 816      * this may cause a {@link URISyntaxException} to be thrown.  </p>
 817      *
 818      * @param   scheme    Scheme name
 819      * @param   ssp       Scheme-specific part
 820      * @param   fragment  Fragment
 821      *
 822      * @throws  URISyntaxException
 823      *          If the URI string constructed from the given components
 824      *          violates RFC&nbsp;2396
 825      */
 826     public URI(String scheme, String ssp, String fragment)
 827         throws URISyntaxException
 828     {
 829         new Parser(toString(scheme, ssp,
 830                             null, null, null, -1,
 831                             null, null, fragment))
 832             .parse(false);
 833     }
 834 
 835     /**
 836      * Constructs a simple URI consisting of only a scheme and a pre-validated
 837      * path. Provides a fast-path for some internal cases.
 838      */
 839     URI(String scheme, String path) {
 840         assert validSchemeAndPath(scheme, path);
 841         this.scheme = scheme;
 842         this.path = path;
 843     }
 844 
 845     private static boolean validSchemeAndPath(String scheme, String path) {
 846         try {
 847             URI u = new URI(scheme + ":" + path);
 848             return scheme.equals(u.scheme) && path.equals(u.path);
 849         } catch (URISyntaxException e) {
 850             return false;
 851         }
 852     }
 853 
 854     /**
 855      * Creates a URI by parsing the given string.
 856      *
 857      * <p> This convenience factory method works as if by invoking the {@link
 858      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
 859      * constructor is caught and wrapped in a new {@link
 860      * IllegalArgumentException} object, which is then thrown.
 861      *
 862      * <p> This method is provided for use in situations where it is known that
 863      * the given string is a legal URI, for example for URI constants declared
 864      * within a program, and so it would be considered a programming error
 865      * for the string not to parse as such.  The constructors, which throw
 866      * {@link URISyntaxException} directly, should be used in situations where a
 867      * URI is being constructed from user input or from some other source that
 868      * may be prone to errors.  </p>
 869      *
 870      * @param  str   The string to be parsed into a URI
 871      * @return The new URI
 872      *
 873      * @throws  NullPointerException
 874      *          If {@code str} is {@code null}
 875      *
 876      * @throws  IllegalArgumentException
 877      *          If the given string violates RFC&nbsp;2396
 878      */
 879     public static URI create(String str) {
 880         try {
 881             return new URI(str);
 882         } catch (URISyntaxException x) {
 883             throw new IllegalArgumentException(x.getMessage(), x);
 884         }
 885     }
 886 
 887 
 888     // -- Operations --
 889 
 890     /**
 891      * Attempts to parse this URI's authority component, if defined, into
 892      * user-information, host, and port components.
 893      *
 894      * <p> If this URI's authority component has already been recognized as
 895      * being server-based then it will already have been parsed into
 896      * user-information, host, and port components.  In this case, or if this
 897      * URI has no authority component, this method simply returns this URI.
 898      *
 899      * <p> Otherwise this method attempts once more to parse the authority
 900      * component into user-information, host, and port components, and throws
 901      * an exception describing why the authority component could not be parsed
 902      * in that way.
 903      *
 904      * <p> This method is provided because the generic URI syntax specified in
 905      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 906      * cannot always distinguish a malformed server-based authority from a
 907      * legitimate registry-based authority.  It must therefore treat some
 908      * instances of the former as instances of the latter.  The authority
 909      * component in the URI string {@code "//foo:bar"}, for example, is not a
 910      * legal server-based authority but it is legal as a registry-based
 911      * authority.
 912      *
 913      * <p> In many common situations, for example when working with URIs that are
 914      * known to be either URNs or URLs, the hierarchical URIs being used will
 915      * always be server-based.  They therefore must either be parsed as such or
 916      * treated as an error.  In these cases a statement such as
 917      *
 918      * <blockquote>
 919      * {@code URI }<i>u</i>{@code  = new URI(str).parseServerAuthority();}
 920      * </blockquote>
 921      *
 922      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
 923      * it has an authority component, has a server-based authority with proper
 924      * user-information, host, and port components.  Invoking this method also
 925      * ensures that if the authority could not be parsed in that way then an
 926      * appropriate diagnostic message can be issued based upon the exception
 927      * that is thrown. </p>
 928      *
 929      * @return  A URI whose authority field has been parsed
 930      *          as a server-based authority
 931      *
 932      * @throws  URISyntaxException
 933      *          If the authority component of this URI is defined
 934      *          but cannot be parsed as a server-based authority
 935      *          according to RFC&nbsp;2396
 936      */
 937     public URI parseServerAuthority()
 938         throws URISyntaxException
 939     {
 940         // We could be clever and cache the error message and index from the
 941         // exception thrown during the original parse, but that would require
 942         // either more fields or a more-obscure representation.
 943         if ((host != null) || (authority == null))
 944             return this;
 945         new Parser(toString()).parse(true);
 946         return this;
 947     }
 948 
 949     /**
 950      * Normalizes this URI's path.
 951      *
 952      * <p> If this URI is opaque, or if its path is already in normal form,
 953      * then this URI is returned.  Otherwise a new URI is constructed that is
 954      * identical to this URI except that its path is computed by normalizing
 955      * this URI's path in a manner consistent with <a
 956      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 957      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
 958      * </p>
 959      *
 960      * <ol>
 961      *
 962      *   <li><p> All {@code "."} segments are removed. </p></li>
 963      *
 964      *   <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."}
 965      *   segment then both of these segments are removed.  This step is
 966      *   repeated until it is no longer applicable. </p></li>
 967      *
 968      *   <li><p> If the path is relative, and if its first segment contains a
 969      *   colon character ({@code ':'}), then a {@code "."} segment is
 970      *   prepended.  This prevents a relative URI with a path such as
 971      *   {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a
 972      *   scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}.
 973      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
 974      *
 975      * </ol>
 976      *
 977      * <p> A normalized path will begin with one or more {@code ".."} segments
 978      * if there were insufficient non-{@code ".."} segments preceding them to
 979      * allow their removal.  A normalized path will begin with a {@code "."}
 980      * segment if one was inserted by step 3 above.  Otherwise, a normalized
 981      * path will not contain any {@code "."} or {@code ".."} segments. </p>
 982      *
 983      * @return  A URI equivalent to this URI,
 984      *          but whose path is in normal form
 985      */
 986     public URI normalize() {
 987         return normalize(this);
 988     }
 989 
 990     /**
 991      * Resolves the given URI against this URI.
 992      *
 993      * <p> If the given URI is already absolute, or if this URI is opaque, then
 994      * the given URI is returned.
 995      *
 996      * <p><a id="resolve-frag"></a> If the given URI's fragment component is
 997      * defined, its path component is empty, and its scheme, authority, and
 998      * query components are undefined, then a URI with the given fragment but
 999      * with all other components equal to those of this URI is returned.  This
1000      * allows a URI representing a standalone fragment reference, such as
1001      * {@code "#foo"}, to be usefully resolved against a base URI.
1002      *
1003      * <p> Otherwise this method constructs a new hierarchical URI in a manner
1004      * consistent with <a
1005      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1006      * section&nbsp;5.2; that is: </p>
1007      *
1008      * <ol>
1009      *
1010      *   <li><p> A new URI is constructed with this URI's scheme and the given
1011      *   URI's query and fragment components. </p></li>
1012      *
1013      *   <li><p> If the given URI has an authority component then the new URI's
1014      *   authority and path are taken from the given URI. </p></li>
1015      *
1016      *   <li><p> Otherwise the new URI's authority component is copied from
1017      *   this URI, and its path is computed as follows: </p>
1018      *
1019      *   <ol>
1020      *
1021      *     <li><p> If the given URI's path is absolute then the new URI's path
1022      *     is taken from the given URI. </p></li>
1023      *
1024      *     <li><p> Otherwise the given URI's path is relative, and so the new
1025      *     URI's path is computed by resolving the path of the given URI
1026      *     against the path of this URI.  This is done by concatenating all but
1027      *     the last segment of this URI's path, if any, with the given URI's
1028      *     path and then normalizing the result as if by invoking the {@link
1029      *     #normalize() normalize} method. </p></li>
1030      *
1031      *   </ol></li>
1032      *
1033      * </ol>
1034      *
1035      * <p> The result of this method is absolute if, and only if, either this
1036      * URI is absolute or the given URI is absolute.  </p>
1037      *
1038      * @param  uri  The URI to be resolved against this URI
1039      * @return The resulting URI
1040      *
1041      * @throws  NullPointerException
1042      *          If {@code uri} is {@code null}
1043      */
1044     public URI resolve(URI uri) {
1045         return resolve(this, uri);
1046     }
1047 
1048     /**
1049      * Constructs a new URI by parsing the given string and then resolving it
1050      * against this URI.
1051      *
1052      * <p> This convenience method works as if invoking it were equivalent to
1053      * evaluating the expression {@link #resolve(java.net.URI)
1054      * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p>
1055      *
1056      * @param  str   The string to be parsed into a URI
1057      * @return The resulting URI
1058      *
1059      * @throws  NullPointerException
1060      *          If {@code str} is {@code null}
1061      *
1062      * @throws  IllegalArgumentException
1063      *          If the given string violates RFC&nbsp;2396
1064      */
1065     public URI resolve(String str) {
1066         return resolve(URI.create(str));
1067     }
1068 
1069     /**
1070      * Relativizes the given URI against this URI.
1071      *
1072      * <p> The relativization of the given URI against this URI is computed as
1073      * follows: </p>
1074      *
1075      * <ol>
1076      *
1077      *   <li><p> If either this URI or the given URI are opaque, or if the
1078      *   scheme and authority components of the two URIs are not identical, or
1079      *   if the path of this URI is not a prefix of the path of the given URI,
1080      *   then the given URI is returned. </p></li>
1081      *
1082      *   <li><p> Otherwise a new relative hierarchical URI is constructed with
1083      *   query and fragment components taken from the given URI and with a path
1084      *   component computed by removing this URI's path from the beginning of
1085      *   the given URI's path. </p></li>
1086      *
1087      * </ol>
1088      *
1089      * @param  uri  The URI to be relativized against this URI
1090      * @return The resulting URI
1091      *
1092      * @throws  NullPointerException
1093      *          If {@code uri} is {@code null}
1094      */
1095     public URI relativize(URI uri) {
1096         return relativize(this, uri);
1097     }
1098 
1099     /**
1100      * Constructs a URL from this URI.
1101      *
1102      * <p> This convenience method works as if invoking it were equivalent to
1103      * evaluating the expression {@code new URL(this.toString())} after
1104      * first checking that this URI is absolute. </p>
1105      *
1106      * @return  A URL constructed from this URI
1107      *
1108      * @throws  IllegalArgumentException
1109      *          If this URI is not absolute
1110      *
1111      * @throws  MalformedURLException
1112      *          If a protocol handler for the URL could not be found,
1113      *          or if some other error occurred while constructing the URL
1114      */
1115     public URL toURL() throws MalformedURLException {
1116         return URL.fromURI(this);
1117     }
1118 
1119     // -- Component access methods --
1120 
1121     /**
1122      * Returns the scheme component of this URI.
1123      *
1124      * <p> The scheme component of a URI, if defined, only contains characters
1125      * in the <i>alphanum</i> category and in the string {@code "-.+"}.  A
1126      * scheme always starts with an <i>alpha</i> character. <p>
1127      *
1128      * The scheme component of a URI cannot contain escaped octets, hence this
1129      * method does not perform any decoding.
1130      *
1131      * @return  The scheme component of this URI,
1132      *          or {@code null} if the scheme is undefined
1133      */
1134     public String getScheme() {
1135         return scheme;
1136     }
1137 
1138     /**
1139      * Tells whether or not this URI is absolute.
1140      *
1141      * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1142      *
1143      * @return  {@code true} if, and only if, this URI is absolute
1144      */
1145     public boolean isAbsolute() {
1146         return scheme != null;
1147     }
1148 
1149     /**
1150      * Tells whether or not this URI is opaque.
1151      *
1152      * <p> A URI is opaque if, and only if, it is absolute and its
1153      * scheme-specific part does not begin with a slash character ('/').
1154      * An opaque URI has a scheme, a scheme-specific part, and possibly
1155      * a fragment; all other components are undefined. </p>
1156      *
1157      * @return  {@code true} if, and only if, this URI is opaque
1158      */
1159     public boolean isOpaque() {
1160         return path == null;
1161     }
1162 
1163     /**
1164      * Returns the raw scheme-specific part of this URI.  The scheme-specific
1165      * part is never undefined, though it may be empty.
1166      *
1167      * <p> The scheme-specific part of a URI only contains legal URI
1168      * characters. </p>
1169      *
1170      * @return  The raw scheme-specific part of this URI
1171      *          (never {@code null})
1172      */
1173     public String getRawSchemeSpecificPart() {
1174         String part = schemeSpecificPart;
1175         if (part != null) {
1176             return part;
1177         }
1178 
1179         String s = string;
1180         if (s != null) {
1181             // if string is defined, components will have been parsed
1182             int start = 0;
1183             int end = s.length();
1184             if (scheme != null) {
1185                 start = scheme.length() + 1;
1186             }
1187             if (fragment != null) {
1188                 end -= fragment.length() + 1;
1189             }
1190             if (path != null && path.length() == end - start) {
1191                 part = path;
1192             } else {
1193                 part = s.substring(start, end);
1194             }
1195         } else {
1196             StringBuilder sb = new StringBuilder();
1197             appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1198                                  host, port, getPath(), getQuery());
1199             part = sb.toString();
1200         }
1201         return schemeSpecificPart = part;
1202     }
1203 
1204     /**
1205      * Returns the decoded scheme-specific part of this URI.
1206      *
1207      * <p> The string returned by this method is equal to that returned by the
1208      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1209      * except that all sequences of escaped octets are <a
1210      * href="#decode">decoded</a>.  </p>
1211      *
1212      * @return  The decoded scheme-specific part of this URI
1213      *          (never {@code null})
1214      */
1215     public String getSchemeSpecificPart() {
1216         String part = decodedSchemeSpecificPart;
1217         if (part == null) {
1218             decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart());
1219         }
1220         return part;
1221     }
1222 
1223     /**
1224      * Returns the raw authority component of this URI.
1225      *
1226      * <p> The authority component of a URI, if defined, only contains the
1227      * commercial-at character ({@code '@'}) and characters in the
1228      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1229      * categories.  If the authority is server-based then it is further
1230      * constrained to have valid user-information, host, and port
1231      * components. </p>
1232      *
1233      * @return  The raw authority component of this URI,
1234      *          or {@code null} if the authority is undefined
1235      */
1236     public String getRawAuthority() {
1237         return authority;
1238     }
1239 
1240     /**
1241      * Returns the decoded authority component of this URI.
1242      *
1243      * <p> The string returned by this method is equal to that returned by the
1244      * {@link #getRawAuthority() getRawAuthority} method except that all
1245      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1246      *
1247      * @return  The decoded authority component of this URI,
1248      *          or {@code null} if the authority is undefined
1249      */
1250     public String getAuthority() {
1251         String auth = decodedAuthority;
1252         if ((auth == null) && (authority != null)) {
1253             decodedAuthority = auth = decode(authority);
1254         }
1255         return auth;
1256     }
1257 
1258     /**
1259      * Returns the raw user-information component of this URI.
1260      *
1261      * <p> The user-information component of a URI, if defined, only contains
1262      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1263      * <i>other</i> categories. </p>
1264      *
1265      * @return  The raw user-information component of this URI,
1266      *          or {@code null} if the user information is undefined
1267      */
1268     public String getRawUserInfo() {
1269         return userInfo;
1270     }
1271 
1272     /**
1273      * Returns the decoded user-information component of this URI.
1274      *
1275      * <p> The string returned by this method is equal to that returned by the
1276      * {@link #getRawUserInfo() getRawUserInfo} method except that all
1277      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1278      *
1279      * @return  The decoded user-information component of this URI,
1280      *          or {@code null} if the user information is undefined
1281      */
1282     public String getUserInfo() {
1283         String user = decodedUserInfo;
1284         if ((user == null) && (userInfo != null)) {
1285             decodedUserInfo = user = decode(userInfo);
1286         }
1287         return user;
1288     }
1289 
1290     /**
1291      * Returns the host component of this URI.
1292      *
1293      * <p> The host component of a URI, if defined, will have one of the
1294      * following forms: </p>
1295      *
1296      * <ul>
1297      *
1298      *   <li><p> A domain name consisting of one or more <i>labels</i>
1299      *   separated by period characters ({@code '.'}), optionally followed by
1300      *   a period character.  Each label consists of <i>alphanum</i> characters
1301      *   as well as hyphen characters ({@code '-'}), though hyphens never
1302      *   occur as the first or last characters in a label. The rightmost
1303      *   label of a domain name consisting of two or more labels, begins
1304      *   with an <i>alpha</i> character. </p></li>
1305      *
1306      *   <li><p> A dotted-quad IPv4 address of the form
1307      *   <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +},
1308      *   where no <i>digit</i> sequence is longer than three characters and no
1309      *   sequence has a value larger than 255. </p></li>
1310      *
1311      *   <li><p> An IPv6 address enclosed in square brackets ({@code '['} and
1312      *   {@code ']'}) and consisting of hexadecimal digits, colon characters
1313      *   ({@code ':'}), and possibly an embedded IPv4 address.  The full
1314      *   syntax of IPv6 addresses is specified in <a
1315      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1316      *   Addressing Architecture</i></a>.  </p></li>
1317      *
1318      * </ul>
1319      *
1320      * The host component of a URI cannot contain escaped octets, hence this
1321      * method does not perform any decoding.
1322      *
1323      * @return  The host component of this URI,
1324      *          or {@code null} if the host is undefined
1325      */
1326     public String getHost() {
1327         return host;
1328     }
1329 
1330     /**
1331      * Returns the port number of this URI.
1332      *
1333      * <p> The port component of a URI, if defined, is a non-negative
1334      * integer. </p>
1335      *
1336      * @return  The port component of this URI,
1337      *          or {@code -1} if the port is undefined
1338      */
1339     public int getPort() {
1340         return port;
1341     }
1342 
1343     /**
1344      * Returns the raw path component of this URI.
1345      *
1346      * <p> The path component of a URI, if defined, only contains the slash
1347      * character ({@code '/'}), the commercial-at character ({@code '@'}),
1348      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1349      * and <i>other</i> categories. </p>
1350      *
1351      * @return  The raw path component of this URI,
1352      *          or {@code null} if the path is undefined
1353      */
1354     public String getRawPath() {
1355         return path;
1356     }
1357 
1358     /**
1359      * Returns the decoded path component of this URI.
1360      *
1361      * <p> The string returned by this method is equal to that returned by the
1362      * {@link #getRawPath() getRawPath} method except that all sequences of
1363      * escaped octets are <a href="#decode">decoded</a>.  </p>
1364      *
1365      * @return  The decoded path component of this URI,
1366      *          or {@code null} if the path is undefined
1367      */
1368     public String getPath() {
1369         String decoded = decodedPath;
1370         if ((decoded == null) && (path != null)) {
1371             decodedPath = decoded = decode(path);
1372         }
1373         return decoded;
1374     }
1375 
1376     /**
1377      * Returns the raw query component of this URI.
1378      *
1379      * <p> The query component of a URI, if defined, only contains legal URI
1380      * characters. </p>
1381      *
1382      * @return  The raw query component of this URI,
1383      *          or {@code null} if the query is undefined
1384      */
1385     public String getRawQuery() {
1386         return query;
1387     }
1388 
1389     /**
1390      * Returns the decoded query component of this URI.
1391      *
1392      * <p> The string returned by this method is equal to that returned by the
1393      * {@link #getRawQuery() getRawQuery} method except that all sequences of
1394      * escaped octets are <a href="#decode">decoded</a>.  </p>
1395      *
1396      * @return  The decoded query component of this URI,
1397      *          or {@code null} if the query is undefined
1398      */
1399     public String getQuery() {
1400         String decoded = decodedQuery;
1401         if ((decoded == null) && (query != null)) {
1402             decodedQuery = decoded = decode(query, false);
1403         }
1404         return decoded;
1405     }
1406 
1407     /**
1408      * Returns the raw fragment component of this URI.
1409      *
1410      * <p> The fragment component of a URI, if defined, only contains legal URI
1411      * characters. </p>
1412      *
1413      * @return  The raw fragment component of this URI,
1414      *          or {@code null} if the fragment is undefined
1415      */
1416     public String getRawFragment() {
1417         return fragment;
1418     }
1419 
1420     /**
1421      * Returns the decoded fragment component of this URI.
1422      *
1423      * <p> The string returned by this method is equal to that returned by the
1424      * {@link #getRawFragment() getRawFragment} method except that all
1425      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1426      *
1427      * @return  The decoded fragment component of this URI,
1428      *          or {@code null} if the fragment is undefined
1429      */
1430     public String getFragment() {
1431         String decoded = decodedFragment;
1432         if ((decoded == null) && (fragment != null)) {
1433             decodedFragment = decoded = decode(fragment, false);
1434         }
1435         return decoded;
1436     }
1437 
1438 
1439     // -- Equality, comparison, hash code, toString, and serialization --
1440 
1441     /**
1442      * Tests this URI for equality with another object.
1443      *
1444      * <p> If the given object is not a URI then this method immediately
1445      * returns {@code false}.
1446      *
1447      * <p> For two URIs to be considered equal requires that either both are
1448      * opaque or both are hierarchical.  Their schemes must either both be
1449      * undefined or else be equal without regard to case. Their fragments
1450      * must either both be undefined or else be equal.
1451      *
1452      * <p> For two opaque URIs to be considered equal, their scheme-specific
1453      * parts must be equal.
1454      *
1455      * <p> For two hierarchical URIs to be considered equal, their paths must
1456      * be equal and their queries must either both be undefined or else be
1457      * equal.  Their authorities must either both be undefined, or both be
1458      * registry-based, or both be server-based.  If their authorities are
1459      * defined and are registry-based, then they must be equal.  If their
1460      * authorities are defined and are server-based, then their hosts must be
1461      * equal without regard to case, their port numbers must be equal, and
1462      * their user-information components must be equal.
1463      *
1464      * <p> When testing the user-information, path, query, fragment, authority,
1465      * or scheme-specific parts of two URIs for equality, the raw forms rather
1466      * than the decoded forms of these components are compared and the
1467      * hexadecimal digits of escaped octets are compared without regard to
1468      * case.
1469      *
1470      * <p> This method satisfies the general contract of the {@link
1471      * java.lang.Object#equals(Object) Object.equals} method. </p>
1472      *
1473      * @param   ob   The object to which this object is to be compared
1474      *
1475      * @return  {@code true} if, and only if, the given object is a URI that
1476      *          is identical to this URI
1477      */
1478     public boolean equals(Object ob) {
1479         if (ob == this)
1480             return true;
1481         if (!(ob instanceof URI))
1482             return false;
1483         URI that = (URI)ob;
1484         if (this.isOpaque() != that.isOpaque()) return false;
1485         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1486         if (!equal(this.fragment, that.fragment)) return false;
1487 
1488         // Opaque
1489         if (this.isOpaque())
1490             return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1491 
1492         // Hierarchical
1493         if (!equal(this.path, that.path)) return false;
1494         if (!equal(this.query, that.query)) return false;
1495 
1496         // Authorities
1497         if (this.authority == that.authority) return true;
1498         if (this.host != null) {
1499             // Server-based
1500             if (!equal(this.userInfo, that.userInfo)) return false;
1501             if (!equalIgnoringCase(this.host, that.host)) return false;
1502             if (this.port != that.port) return false;
1503         } else if (this.authority != null) {
1504             // Registry-based
1505             if (!equal(this.authority, that.authority)) return false;
1506         } else if (this.authority != that.authority) {
1507             return false;
1508         }
1509 
1510         return true;
1511     }
1512 
1513     /**
1514      * Returns a hash-code value for this URI.  The hash code is based upon all
1515      * of the URI's components, and satisfies the general contract of the
1516      * {@link java.lang.Object#hashCode() Object.hashCode} method.
1517      *
1518      * @return  A hash-code value for this URI
1519      */
1520     public int hashCode() {
1521         int h = hash;
1522         if (h == 0) {
1523             h = hashIgnoringCase(0, scheme);
1524             h = hash(h, fragment);
1525             if (isOpaque()) {
1526                 h = hash(h, schemeSpecificPart);
1527             } else {
1528                 h = hash(h, path);
1529                 h = hash(h, query);
1530                 if (host != null) {
1531                     h = hash(h, userInfo);
1532                     h = hashIgnoringCase(h, host);
1533                     h += 1949 * port;
1534                 } else {
1535                     h = hash(h, authority);
1536                 }
1537             }
1538             if (h != 0) {
1539                 hash = h;
1540             }
1541         }
1542         return h;
1543     }
1544 
1545     /**
1546      * Compares this URI to another object, which must be a URI.
1547      *
1548      * <p> When comparing corresponding components of two URIs, if one
1549      * component is undefined but the other is defined then the first is
1550      * considered to be less than the second.  Unless otherwise noted, string
1551      * components are ordered according to their natural, case-sensitive
1552      * ordering as defined by the {@link java.lang.String#compareTo(Object)
1553      * String.compareTo} method.  String components that are subject to
1554      * encoding are compared by comparing their raw forms rather than their
1555      * decoded forms.
1556      *
1557      * <p> The ordering of URIs is defined as follows: </p>
1558      *
1559      * <ul>
1560      *
1561      *   <li><p> Two URIs with different schemes are ordered according to the
1562      *   ordering of their schemes, without regard to case. </p></li>
1563      *
1564      *   <li><p> A hierarchical URI is considered to be less than an opaque URI
1565      *   with an identical scheme. </p></li>
1566      *
1567      *   <li><p> Two opaque URIs with identical schemes are ordered according
1568      *   to the ordering of their scheme-specific parts. </p></li>
1569      *
1570      *   <li><p> Two opaque URIs with identical schemes and scheme-specific
1571      *   parts are ordered according to the ordering of their
1572      *   fragments. </p></li>
1573      *
1574      *   <li><p> Two hierarchical URIs with identical schemes are ordered
1575      *   according to the ordering of their authority components: </p>
1576      *
1577      *   <ul>
1578      *
1579      *     <li><p> If both authority components are server-based then the URIs
1580      *     are ordered according to their user-information components; if these
1581      *     components are identical then the URIs are ordered according to the
1582      *     ordering of their hosts, without regard to case; if the hosts are
1583      *     identical then the URIs are ordered according to the ordering of
1584      *     their ports. </p></li>
1585      *
1586      *     <li><p> If one or both authority components are registry-based then
1587      *     the URIs are ordered according to the ordering of their authority
1588      *     components. </p></li>
1589      *
1590      *   </ul></li>
1591      *
1592      *   <li><p> Finally, two hierarchical URIs with identical schemes and
1593      *   authority components are ordered according to the ordering of their
1594      *   paths; if their paths are identical then they are ordered according to
1595      *   the ordering of their queries; if the queries are identical then they
1596      *   are ordered according to the order of their fragments. </p></li>
1597      *
1598      * </ul>
1599      *
1600      * <p> This method satisfies the general contract of the {@link
1601      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1602      * method. </p>
1603      *
1604      * @param   that
1605      *          The object to which this URI is to be compared
1606      *
1607      * @return  A negative integer, zero, or a positive integer as this URI is
1608      *          less than, equal to, or greater than the given URI
1609      *
1610      * @throws  ClassCastException
1611      *          If the given object is not a URI
1612      */
1613     public int compareTo(URI that) {
1614         int c;
1615 
1616         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1617             return c;
1618 
1619         if (this.isOpaque()) {
1620             if (that.isOpaque()) {
1621                 // Both opaque
1622                 if ((c = compare(this.schemeSpecificPart,
1623                                  that.schemeSpecificPart)) != 0)
1624                     return c;
1625                 return compare(this.fragment, that.fragment);
1626             }
1627             return +1;                  // Opaque > hierarchical
1628         } else if (that.isOpaque()) {
1629             return -1;                  // Hierarchical < opaque
1630         }
1631 
1632         // Hierarchical
1633         if ((this.host != null) && (that.host != null)) {
1634             // Both server-based
1635             if ((c = compare(this.userInfo, that.userInfo)) != 0)
1636                 return c;
1637             if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1638                 return c;
1639             if ((c = this.port - that.port) != 0)
1640                 return c;
1641         } else {
1642             // If one or both authorities are registry-based then we simply
1643             // compare them in the usual, case-sensitive way.  If one is
1644             // registry-based and one is server-based then the strings are
1645             // guaranteed to be unequal, hence the comparison will never return
1646             // zero and the compareTo and equals methods will remain
1647             // consistent.
1648             if ((c = compare(this.authority, that.authority)) != 0) return c;
1649         }
1650 
1651         if ((c = compare(this.path, that.path)) != 0) return c;
1652         if ((c = compare(this.query, that.query)) != 0) return c;
1653         return compare(this.fragment, that.fragment);
1654     }
1655 
1656     /**
1657      * Returns the content of this URI as a string.
1658      *
1659      * <p> If this URI was created by invoking one of the constructors in this
1660      * class then a string equivalent to the original input string, or to the
1661      * string computed from the originally-given components, as appropriate, is
1662      * returned.  Otherwise this URI was created by normalization, resolution,
1663      * or relativization, and so a string is constructed from this URI's
1664      * components according to the rules specified in <a
1665      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1666      * section&nbsp;5.2, step&nbsp;7. </p>
1667      *
1668      * @return  The string form of this URI
1669      */
1670     public String toString() {
1671         String s = string;
1672         if (s == null) {
1673             s = defineString();
1674         }
1675         return s;
1676     }
1677 
1678     private String defineString() {
1679         String s = string;
1680         if (s != null) {
1681             return s;
1682         }
1683 
1684         StringBuilder sb = new StringBuilder();
1685         if (scheme != null) {
1686             sb.append(scheme);
1687             sb.append(':');
1688         }
1689         if (isOpaque()) {
1690             sb.append(schemeSpecificPart);
1691         } else {
1692             if (host != null) {
1693                 sb.append("//");
1694                 if (userInfo != null) {
1695                     sb.append(userInfo);
1696                     sb.append('@');
1697                 }
1698                 boolean needBrackets = ((host.indexOf(':') >= 0)
1699                         && !host.startsWith("[")
1700                         && !host.endsWith("]"));
1701                 if (needBrackets) sb.append('[');
1702                 sb.append(host);
1703                 if (needBrackets) sb.append(']');
1704                 if (port != -1) {
1705                     sb.append(':');
1706                     sb.append(port);
1707                 }
1708             } else if (authority != null) {
1709                 sb.append("//");
1710                 sb.append(authority);
1711             }
1712             if (path != null)
1713                 sb.append(path);
1714             if (query != null) {
1715                 sb.append('?');
1716                 sb.append(query);
1717             }
1718         }
1719         if (fragment != null) {
1720             sb.append('#');
1721             sb.append(fragment);
1722         }
1723         return string = sb.toString();
1724     }
1725 
1726     /**
1727      * Returns the content of this URI as a US-ASCII string.
1728      *
1729      * <p> If this URI does not contain any characters in the <i>other</i>
1730      * category then an invocation of this method will return the same value as
1731      * an invocation of the {@link #toString() toString} method.  Otherwise
1732      * this method works as if by invoking that method and then <a
1733      * href="#encode">encoding</a> the result.  </p>
1734      *
1735      * @return  The string form of this URI, encoded as needed
1736      *          so that it only contains characters in the US-ASCII
1737      *          charset
1738      */
1739     public String toASCIIString() {
1740         return encode(toString());
1741     }
1742 
1743 
1744     // -- Serialization support --
1745 
1746     /**
1747      * Saves the content of this URI to the given serial stream.
1748      *
1749      * <p> The only serializable field of a URI instance is its {@code string}
1750      * field.  That field is given a value, if it does not have one already,
1751      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1752      * method of the given object-output stream is invoked. </p>
1753      *
1754      * @param  os  The object-output stream to which this object
1755      *             is to be written
1756      */
1757     private void writeObject(ObjectOutputStream os)
1758         throws IOException
1759     {
1760         defineString();
1761         os.defaultWriteObject();        // Writes the string field only
1762     }
1763 
1764     /**
1765      * Reconstitutes a URI from the given serial stream.
1766      *
1767      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1768      * invoked to read the value of the {@code string} field.  The result is
1769      * then parsed in the usual way.
1770      *
1771      * @param  is  The object-input stream from which this object
1772      *             is being read
1773      */
1774     private void readObject(ObjectInputStream is)
1775         throws ClassNotFoundException, IOException
1776     {
1777         port = -1;                      // Argh
1778         is.defaultReadObject();
1779         try {
1780             new Parser(string).parse(false);
1781         } catch (URISyntaxException x) {
1782             IOException y = new InvalidObjectException("Invalid URI");
1783             y.initCause(x);
1784             throw y;
1785         }
1786     }
1787 
1788 
1789     // -- End of public methods --
1790 
1791 
1792     // -- Utility methods for string-field comparison and hashing --
1793 
1794     // These methods return appropriate values for null string arguments,
1795     // thereby simplifying the equals, hashCode, and compareTo methods.
1796     //
1797     // The case-ignoring methods should only be applied to strings whose
1798     // characters are all known to be US-ASCII.  Because of this restriction,
1799     // these methods are faster than the similar methods in the String class.
1800 
1801     // US-ASCII only
1802     private static int toLower(char c) {
1803         if ((c >= 'A') && (c <= 'Z'))
1804             return c + ('a' - 'A');
1805         return c;
1806     }
1807 
1808     // US-ASCII only
1809     private static int toUpper(char c) {
1810         if ((c >= 'a') && (c <= 'z'))
1811             return c - ('a' - 'A');
1812         return c;
1813     }
1814 
1815     private static boolean equal(String s, String t) {
1816         if (s == t) return true;
1817         if ((s != null) && (t != null)) {
1818             if (s.length() != t.length())
1819                 return false;
1820             if (s.indexOf('%') < 0)
1821                 return s.equals(t);
1822             int n = s.length();
1823             for (int i = 0; i < n;) {
1824                 char c = s.charAt(i);
1825                 char d = t.charAt(i);
1826                 if (c != '%') {
1827                     if (c != d)
1828                         return false;
1829                     i++;
1830                     continue;
1831                 }
1832                 if (d != '%')
1833                     return false;
1834                 i++;
1835                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1836                     return false;
1837                 i++;
1838                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1839                     return false;
1840                 i++;
1841             }
1842             return true;
1843         }
1844         return false;
1845     }
1846 
1847     // US-ASCII only
1848     private static boolean equalIgnoringCase(String s, String t) {
1849         if (s == t) return true;
1850         if ((s != null) && (t != null)) {
1851             int n = s.length();
1852             if (t.length() != n)
1853                 return false;
1854             for (int i = 0; i < n; i++) {
1855                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1856                     return false;
1857             }
1858             return true;
1859         }
1860         return false;
1861     }
1862 
1863     private static int hash(int hash, String s) {
1864         if (s == null) return hash;
1865         return s.indexOf('%') < 0 ? hash * 127 + s.hashCode()
1866                                   : normalizedHash(hash, s);
1867     }
1868 
1869 
1870     private static int normalizedHash(int hash, String s) {
1871         int h = 0;
1872         for (int index = 0; index < s.length(); index++) {
1873             char ch = s.charAt(index);
1874             h = 31 * h + ch;
1875             if (ch == '%') {
1876                 /*
1877                  * Process the next two encoded characters
1878                  */
1879                 for (int i = index + 1; i < index + 3; i++)
1880                     h = 31 * h + toUpper(s.charAt(i));
1881                 index += 2;
1882             }
1883         }
1884         return hash * 127 + h;
1885     }
1886 
1887     // US-ASCII only
1888     private static int hashIgnoringCase(int hash, String s) {
1889         if (s == null) return hash;
1890         int h = hash;
1891         int n = s.length();
1892         for (int i = 0; i < n; i++)
1893             h = 31 * h + toLower(s.charAt(i));
1894         return h;
1895     }
1896 
1897     private static int compare(String s, String t) {
1898         if (s == t) return 0;
1899         if (s != null) {
1900             if (t != null)
1901                 return s.compareTo(t);
1902             else
1903                 return +1;
1904         } else {
1905             return -1;
1906         }
1907     }
1908 
1909     // US-ASCII only
1910     private static int compareIgnoringCase(String s, String t) {
1911         if (s == t) return 0;
1912         if (s != null) {
1913             if (t != null) {
1914                 int sn = s.length();
1915                 int tn = t.length();
1916                 int n = sn < tn ? sn : tn;
1917                 for (int i = 0; i < n; i++) {
1918                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1919                     if (c != 0)
1920                         return c;
1921                 }
1922                 return sn - tn;
1923             }
1924             return +1;
1925         } else {
1926             return -1;
1927         }
1928     }
1929 
1930 
1931     // -- String construction --
1932 
1933     // If a scheme is given then the path, if given, must be absolute
1934     //
1935     private static void checkPath(String s, String scheme, String path)
1936         throws URISyntaxException
1937     {
1938         if (scheme != null) {
1939             if (path != null && !path.isEmpty() && path.charAt(0) != '/')
1940                 throw new URISyntaxException(s, "Relative path in absolute URI");
1941         }
1942     }
1943 
1944     private void appendAuthority(StringBuilder sb,
1945                                  String authority,
1946                                  String userInfo,
1947                                  String host,
1948                                  int port)
1949     {
1950         if (host != null) {
1951             sb.append("//");
1952             if (userInfo != null) {
1953                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1954                 sb.append('@');
1955             }
1956             boolean needBrackets = ((host.indexOf(':') >= 0)
1957                                     && !host.startsWith("[")
1958                                     && !host.endsWith("]"));
1959             if (needBrackets) sb.append('[');
1960             sb.append(host);
1961             if (needBrackets) sb.append(']');
1962             if (port != -1) {
1963                 sb.append(':');
1964                 sb.append(port);
1965             }
1966         } else if (authority != null) {
1967             sb.append("//");
1968             if (authority.startsWith("[")) {
1969                 // authority should (but may not) contain an embedded IPv6 address
1970                 int end = authority.indexOf(']');
1971                 String doquote = authority, dontquote = "";
1972                 if (end != -1 && authority.indexOf(':') != -1) {
1973                     // the authority contains an IPv6 address
1974                     if (end == authority.length()) {
1975                         dontquote = authority;
1976                         doquote = "";
1977                     } else {
1978                         dontquote = authority.substring(0 , end + 1);
1979                         doquote = authority.substring(end + 1);
1980                     }
1981                 }
1982                 sb.append(dontquote);
1983                 sb.append(quote(doquote,
1984                             L_REG_NAME | L_SERVER,
1985                             H_REG_NAME | H_SERVER));
1986             } else {
1987                 sb.append(quote(authority,
1988                             L_REG_NAME | L_SERVER,
1989                             H_REG_NAME | H_SERVER));
1990             }
1991         }
1992     }
1993 
1994     private void appendSchemeSpecificPart(StringBuilder sb,
1995                                           String opaquePart,
1996                                           String authority,
1997                                           String userInfo,
1998                                           String host,
1999                                           int port,
2000                                           String path,
2001                                           String query)
2002     {
2003         if (opaquePart != null) {
2004             /* check if SSP begins with an IPv6 address
2005              * because we must not quote a literal IPv6 address
2006              */
2007             if (opaquePart.startsWith("//[")) {
2008                 int end =  opaquePart.indexOf(']');
2009                 if (end != -1 && opaquePart.indexOf(':')!=-1) {
2010                     String doquote, dontquote;
2011                     if (end == opaquePart.length()) {
2012                         dontquote = opaquePart;
2013                         doquote = "";
2014                     } else {
2015                         dontquote = opaquePart.substring(0,end+1);
2016                         doquote = opaquePart.substring(end+1);
2017                     }
2018                     sb.append (dontquote);
2019                     sb.append(quote(doquote, L_URIC, H_URIC));
2020                 }
2021             } else {
2022                 sb.append(quote(opaquePart, L_URIC, H_URIC));
2023             }
2024         } else {
2025             appendAuthority(sb, authority, userInfo, host, port);
2026             if (path != null)
2027                 sb.append(quote(path, L_PATH, H_PATH));
2028             if (query != null) {
2029                 sb.append('?');
2030                 sb.append(quote(query, L_URIC, H_URIC));
2031             }
2032         }
2033     }
2034 
2035     private void appendFragment(StringBuilder sb, String fragment) {
2036         if (fragment != null) {
2037             sb.append('#');
2038             sb.append(quote(fragment, L_URIC, H_URIC));
2039         }
2040     }
2041 
2042     private String toString(String scheme,
2043                             String opaquePart,
2044                             String authority,
2045                             String userInfo,
2046                             String host,
2047                             int port,
2048                             String path,
2049                             String query,
2050                             String fragment)
2051     {
2052         StringBuilder sb = new StringBuilder();
2053         if (scheme != null) {
2054             sb.append(scheme);
2055             sb.append(':');
2056         }
2057         appendSchemeSpecificPart(sb, opaquePart,
2058                                  authority, userInfo, host, port,
2059                                  path, query);
2060         appendFragment(sb, fragment);
2061         return sb.toString();
2062     }
2063 
2064     // -- Normalization, resolution, and relativization --
2065 
2066     // RFC2396 5.2 (6)
2067     private static String resolvePath(String base, String child,
2068                                       boolean absolute)
2069     {
2070         int i = base.lastIndexOf('/');
2071         int cn = child.length();
2072         String path = "";
2073 
2074         if (cn == 0) {
2075             // 5.2 (6a)
2076             if (i >= 0)
2077                 path = base.substring(0, i + 1);
2078         } else {
2079             StringBuilder sb = new StringBuilder(base.length() + cn);
2080             // 5.2 (6a)
2081             if (i >= 0)
2082                 sb.append(base, 0, i + 1);
2083             // 5.2 (6b)
2084             sb.append(child);
2085             path = sb.toString();
2086         }
2087 
2088         // 5.2 (6c-f)
2089         String np = normalize(path);
2090 
2091         // 5.2 (6g): If the result is absolute but the path begins with "../",
2092         // then we simply leave the path as-is
2093 
2094         return np;
2095     }
2096 
2097     // RFC2396 5.2
2098     private static URI resolve(URI base, URI child) {
2099         // check if child if opaque first so that NPE is thrown
2100         // if child is null.
2101         if (child.isOpaque() || base.isOpaque())
2102             return child;
2103 
2104         // 5.2 (2): Reference to current document (lone fragment)
2105         if ((child.scheme == null) && (child.authority == null)
2106             && child.path.isEmpty() && (child.fragment != null)
2107             && (child.query == null)) {
2108             if ((base.fragment != null)
2109                 && child.fragment.equals(base.fragment)) {
2110                 return base;
2111             }
2112             URI ru = new URI();
2113             ru.scheme = base.scheme;
2114             ru.authority = base.authority;
2115             ru.userInfo = base.userInfo;
2116             ru.host = base.host;
2117             ru.port = base.port;
2118             ru.path = base.path;
2119             ru.fragment = child.fragment;
2120             ru.query = base.query;
2121             return ru;
2122         }
2123 
2124         // 5.2 (3): Child is absolute
2125         if (child.scheme != null)
2126             return child;
2127 
2128         URI ru = new URI();             // Resolved URI
2129         ru.scheme = base.scheme;
2130         ru.query = child.query;
2131         ru.fragment = child.fragment;
2132 
2133         // 5.2 (4): Authority
2134         if (child.authority == null) {
2135             ru.authority = base.authority;
2136             ru.host = base.host;
2137             ru.userInfo = base.userInfo;
2138             ru.port = base.port;
2139 
2140             String cp = (child.path == null) ? "" : child.path;
2141             if (!cp.isEmpty() && cp.charAt(0) == '/') {
2142                 // 5.2 (5): Child path is absolute
2143                 ru.path = child.path;
2144             } else {
2145                 // 5.2 (6): Resolve relative path
2146                 ru.path = resolvePath(base.path, cp, base.isAbsolute());
2147             }
2148         } else {
2149             ru.authority = child.authority;
2150             ru.host = child.host;
2151             ru.userInfo = child.userInfo;
2152             ru.host = child.host;
2153             ru.port = child.port;
2154             ru.path = child.path;
2155         }
2156 
2157         // 5.2 (7): Recombine (nothing to do here)
2158         return ru;
2159     }
2160 
2161     // If the given URI's path is normal then return the URI;
2162     // o.w., return a new URI containing the normalized path.
2163     //
2164     private static URI normalize(URI u) {
2165         if (u.isOpaque() || u.path == null || u.path.isEmpty())
2166             return u;
2167 
2168         String np = normalize(u.path);
2169         if (np == u.path)
2170             return u;
2171 
2172         URI v = new URI();
2173         v.scheme = u.scheme;
2174         v.fragment = u.fragment;
2175         v.authority = u.authority;
2176         v.userInfo = u.userInfo;
2177         v.host = u.host;
2178         v.port = u.port;
2179         v.path = np;
2180         v.query = u.query;
2181         return v;
2182     }
2183 
2184     // If both URIs are hierarchical, their scheme and authority components are
2185     // identical, and the base path is a prefix of the child's path, then
2186     // return a relative URI that, when resolved against the base, yields the
2187     // child; otherwise, return the child.
2188     //
2189     private static URI relativize(URI base, URI child) {
2190         // check if child if opaque first so that NPE is thrown
2191         // if child is null.
2192         if (child.isOpaque() || base.isOpaque())
2193             return child;
2194         if (!equalIgnoringCase(base.scheme, child.scheme)
2195             || !equal(base.authority, child.authority))
2196             return child;
2197 
2198         String bp = normalize(base.path);
2199         String cp = normalize(child.path);
2200         if (!bp.equals(cp)) {
2201             if (!bp.endsWith("/"))
2202                 bp = bp + "/";
2203             if (!cp.startsWith(bp))
2204                 return child;
2205         }
2206 
2207         URI v = new URI();
2208         v.path = cp.substring(bp.length());
2209         v.query = child.query;
2210         v.fragment = child.fragment;
2211         return v;
2212     }
2213 
2214 
2215 
2216     // -- Path normalization --
2217 
2218     // The following algorithm for path normalization avoids the creation of a
2219     // string object for each segment, as well as the use of a string buffer to
2220     // compute the final result, by using a single char array and editing it in
2221     // place.  The array is first split into segments, replacing each slash
2222     // with '\0' and creating a segment-index array, each element of which is
2223     // the index of the first char in the corresponding segment.  We then walk
2224     // through both arrays, removing ".", "..", and other segments as necessary
2225     // by setting their entries in the index array to -1.  Finally, the two
2226     // arrays are used to rejoin the segments and compute the final result.
2227     //
2228     // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2229 
2230 
2231     // Check the given path to see if it might need normalization.  A path
2232     // might need normalization if it contains duplicate slashes, a "."
2233     // segment, or a ".." segment.  Return -1 if no further normalization is
2234     // possible, otherwise return the number of segments found.
2235     //
2236     // This method takes a string argument rather than a char array so that
2237     // this test can be performed without invoking path.toCharArray().
2238     //
2239     private static int needsNormalization(String path) {
2240         boolean normal = true;
2241         int ns = 0;                     // Number of segments
2242         int end = path.length() - 1;    // Index of last char in path
2243         int p = 0;                      // Index of next char in path
2244 
2245         // Skip initial slashes
2246         while (p <= end) {
2247             if (path.charAt(p) != '/') break;
2248             p++;
2249         }
2250         if (p > 1) normal = false;
2251 
2252         // Scan segments
2253         while (p <= end) {
2254 
2255             // Looking at "." or ".." ?
2256             if ((path.charAt(p) == '.')
2257                 && ((p == end)
2258                     || ((path.charAt(p + 1) == '/')
2259                         || ((path.charAt(p + 1) == '.')
2260                             && ((p + 1 == end)
2261                                 || (path.charAt(p + 2) == '/')))))) {
2262                 normal = false;
2263             }
2264             ns++;
2265 
2266             // Find beginning of next segment
2267             while (p <= end) {
2268                 if (path.charAt(p++) != '/')
2269                     continue;
2270 
2271                 // Skip redundant slashes
2272                 while (p <= end) {
2273                     if (path.charAt(p) != '/') break;
2274                     normal = false;
2275                     p++;
2276                 }
2277 
2278                 break;
2279             }
2280         }
2281 
2282         return normal ? -1 : ns;
2283     }
2284 
2285 
2286     // Split the given path into segments, replacing slashes with nulls and
2287     // filling in the given segment-index array.
2288     //
2289     // Preconditions:
2290     //   segs.length == Number of segments in path
2291     //
2292     // Postconditions:
2293     //   All slashes in path replaced by '\0'
2294     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
2295     //
2296     private static void split(char[] path, int[] segs) {
2297         int end = path.length - 1;      // Index of last char in path
2298         int p = 0;                      // Index of next char in path
2299         int i = 0;                      // Index of current segment
2300 
2301         // Skip initial slashes
2302         while (p <= end) {
2303             if (path[p] != '/') break;
2304             path[p] = '\0';
2305             p++;
2306         }
2307 
2308         while (p <= end) {
2309 
2310             // Note start of segment
2311             segs[i++] = p++;
2312 
2313             // Find beginning of next segment
2314             while (p <= end) {
2315                 if (path[p++] != '/')
2316                     continue;
2317                 path[p - 1] = '\0';
2318 
2319                 // Skip redundant slashes
2320                 while (p <= end) {
2321                     if (path[p] != '/') break;
2322                     path[p++] = '\0';
2323                 }
2324                 break;
2325             }
2326         }
2327 
2328         if (i != segs.length)
2329             throw new InternalError();  // ASSERT
2330     }
2331 
2332 
2333     // Join the segments in the given path according to the given segment-index
2334     // array, ignoring those segments whose index entries have been set to -1,
2335     // and inserting slashes as needed.  Return the length of the resulting
2336     // path.
2337     //
2338     // Preconditions:
2339     //   segs[i] == -1 implies segment i is to be ignored
2340     //   path computed by split, as above, with '\0' having replaced '/'
2341     //
2342     // Postconditions:
2343     //   path[0] .. path[return value] == Resulting path
2344     //
2345     private static int join(char[] path, int[] segs) {
2346         int ns = segs.length;           // Number of segments
2347         int end = path.length - 1;      // Index of last char in path
2348         int p = 0;                      // Index of next path char to write
2349 
2350         if (path[p] == '\0') {
2351             // Restore initial slash for absolute paths
2352             path[p++] = '/';
2353         }
2354 
2355         for (int i = 0; i < ns; i++) {
2356             int q = segs[i];            // Current segment
2357             if (q == -1)
2358                 // Ignore this segment
2359                 continue;
2360 
2361             if (p == q) {
2362                 // We're already at this segment, so just skip to its end
2363                 while ((p <= end) && (path[p] != '\0'))
2364                     p++;
2365                 if (p <= end) {
2366                     // Preserve trailing slash
2367                     path[p++] = '/';
2368                 }
2369             } else if (p < q) {
2370                 // Copy q down to p
2371                 while ((q <= end) && (path[q] != '\0'))
2372                     path[p++] = path[q++];
2373                 if (q <= end) {
2374                     // Preserve trailing slash
2375                     path[p++] = '/';
2376                 }
2377             } else
2378                 throw new InternalError(); // ASSERT false
2379         }
2380 
2381         return p;
2382     }
2383 
2384 
2385     // Remove "." segments from the given path, and remove segment pairs
2386     // consisting of a non-".." segment followed by a ".." segment.
2387     //
2388     private static void removeDots(char[] path, int[] segs) {
2389         int ns = segs.length;
2390         int end = path.length - 1;
2391 
2392         for (int i = 0; i < ns; i++) {
2393             int dots = 0;               // Number of dots found (0, 1, or 2)
2394 
2395             // Find next occurrence of "." or ".."
2396             do {
2397                 int p = segs[i];
2398                 if (path[p] == '.') {
2399                     if (p == end) {
2400                         dots = 1;
2401                         break;
2402                     } else if (path[p + 1] == '\0') {
2403                         dots = 1;
2404                         break;
2405                     } else if ((path[p + 1] == '.')
2406                                && ((p + 1 == end)
2407                                    || (path[p + 2] == '\0'))) {
2408                         dots = 2;
2409                         break;
2410                     }
2411                 }
2412                 i++;
2413             } while (i < ns);
2414             if ((i > ns) || (dots == 0))
2415                 break;
2416 
2417             if (dots == 1) {
2418                 // Remove this occurrence of "."
2419                 segs[i] = -1;
2420             } else {
2421                 // If there is a preceding non-".." segment, remove both that
2422                 // segment and this occurrence of ".."; otherwise, leave this
2423                 // ".." segment as-is.
2424                 int j;
2425                 for (j = i - 1; j >= 0; j--) {
2426                     if (segs[j] != -1) break;
2427                 }
2428                 if (j >= 0) {
2429                     int q = segs[j];
2430                     if (!((path[q] == '.')
2431                           && (path[q + 1] == '.')
2432                           && (path[q + 2] == '\0'))) {
2433                         segs[i] = -1;
2434                         segs[j] = -1;
2435                     }
2436                 }
2437             }
2438         }
2439     }
2440 
2441 
2442     // DEVIATION: If the normalized path is relative, and if the first
2443     // segment could be parsed as a scheme name, then prepend a "." segment
2444     //
2445     private static void maybeAddLeadingDot(char[] path, int[] segs) {
2446 
2447         if (path[0] == '\0')
2448             // The path is absolute
2449             return;
2450 
2451         int ns = segs.length;
2452         int f = 0;                      // Index of first segment
2453         while (f < ns) {
2454             if (segs[f] >= 0)
2455                 break;
2456             f++;
2457         }
2458         if ((f >= ns) || (f == 0))
2459             // The path is empty, or else the original first segment survived,
2460             // in which case we already know that no leading "." is needed
2461             return;
2462 
2463         int p = segs[f];
2464         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2465         if (p >= path.length || path[p] == '\0')
2466             // No colon in first segment, so no "." needed
2467             return;
2468 
2469         // At this point we know that the first segment is unused,
2470         // hence we can insert a "." segment at that position
2471         path[0] = '.';
2472         path[1] = '\0';
2473         segs[0] = 0;
2474     }
2475 
2476 
2477     // Normalize the given path string.  A normal path string has no empty
2478     // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2479     // segments equal to ".." that are preceded by a segment not equal to "..".
2480     // In contrast to Unix-style pathname normalization, for URI paths we
2481     // always retain trailing slashes.
2482     //
2483     private static String normalize(String ps) {
2484 
2485         // Does this path need normalization?
2486         int ns = needsNormalization(ps);        // Number of segments
2487         if (ns < 0)
2488             // Nope -- just return it
2489             return ps;
2490 
2491         char[] path = ps.toCharArray();         // Path in char-array form
2492 
2493         // Split path into segments
2494         int[] segs = new int[ns];               // Segment-index array
2495         split(path, segs);
2496 
2497         // Remove dots
2498         removeDots(path, segs);
2499 
2500         // Prevent scheme-name confusion
2501         maybeAddLeadingDot(path, segs);
2502 
2503         // Join the remaining segments and return the result
2504         String s = new String(path, 0, join(path, segs));
2505         if (s.equals(ps)) {
2506             // string was already normalized
2507             return ps;
2508         }
2509         return s;
2510     }
2511 
2512 
2513 
2514     // -- Character classes for parsing --
2515 
2516     // RFC2396 precisely specifies which characters in the US-ASCII charset are
2517     // permissible in the various components of a URI reference.  We here
2518     // define a set of mask pairs to aid in enforcing these restrictions.  Each
2519     // mask pair consists of two longs, a low mask and a high mask.  Taken
2520     // together they represent a 128-bit mask, where bit i is set iff the
2521     // character with value i is permitted.
2522     //
2523     // This approach is more efficient than sequentially searching arrays of
2524     // permitted characters.  It could be made still more efficient by
2525     // precompiling the mask information so that a character's presence in a
2526     // given mask could be determined by a single table lookup.
2527 
2528     // To save startup time, we manually calculate the low-/highMask constants.
2529     // For reference, the following methods were used to calculate the values:
2530 
2531     // Compute the low-order mask for the characters in the given string
2532     //     private static long lowMask(String chars) {
2533     //        int n = chars.length();
2534     //        long m = 0;
2535     //        for (int i = 0; i < n; i++) {
2536     //            char c = chars.charAt(i);
2537     //            if (c < 64)
2538     //                m |= (1L << c);
2539     //        }
2540     //        return m;
2541     //    }
2542 
2543     // Compute the high-order mask for the characters in the given string
2544     //    private static long highMask(String chars) {
2545     //        int n = chars.length();
2546     //        long m = 0;
2547     //        for (int i = 0; i < n; i++) {
2548     //            char c = chars.charAt(i);
2549     //            if ((c >= 64) && (c < 128))
2550     //                m |= (1L << (c - 64));
2551     //        }
2552     //        return m;
2553     //    }
2554 
2555     // Compute a low-order mask for the characters
2556     // between first and last, inclusive
2557     //    private static long lowMask(char first, char last) {
2558     //        long m = 0;
2559     //        int f = Math.max(Math.min(first, 63), 0);
2560     //        int l = Math.max(Math.min(last, 63), 0);
2561     //        for (int i = f; i <= l; i++)
2562     //            m |= 1L << i;
2563     //        return m;
2564     //    }
2565 
2566     // Compute a high-order mask for the characters
2567     // between first and last, inclusive
2568     //    private static long highMask(char first, char last) {
2569     //        long m = 0;
2570     //        int f = Math.max(Math.min(first, 127), 64) - 64;
2571     //        int l = Math.max(Math.min(last, 127), 64) - 64;
2572     //        for (int i = f; i <= l; i++)
2573     //            m |= 1L << i;
2574     //        return m;
2575     //    }
2576 
2577     // Tell whether the given character is permitted by the given mask pair
2578     private static boolean match(char c, long lowMask, long highMask) {
2579         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
2580             return false;
2581         if (c < 64)
2582             return ((1L << c) & lowMask) != 0;
2583         if (c < 128)
2584             return ((1L << (c - 64)) & highMask) != 0;
2585         return false;
2586     }
2587 
    // Character-class masks, in reverse order from RFC2396 because
    // initializers for static fields cannot make forward references.
    //
    // Naming convention: each L_xxx/H_xxx pair forms one 128-bit mask.  Bit i
    // of L_xxx stands for the character with value i (0 <= i < 64) and bit i
    // of H_xxx for the character with value 64 + i.  The trailing comment on
    // each literal shows the lowMask/highMask call it was computed from.

    // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
    //            "8" | "9"
    private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9');
    private static final long H_DIGIT = 0L;

    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
    private static final long L_UPALPHA = 0L;
    private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z');

    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
    private static final long L_LOWALPHA = 0L;
    private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z');

    // alpha         = lowalpha | upalpha
    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

    // alphanum      = alpha | digit
    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

    // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    //                         "a" | "b" | "c" | "d" | "e" | "f"
    private static final long L_HEX = L_DIGIT;
    private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f');

    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    //                 "(" | ")"
    private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");
    private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");

    // unreserved    = alphanum | mark
    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                 "$" | "," | "[" | "]"
    // Added per RFC2732: "[", "]"
    private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");
    private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");

    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
    // characters are allowed; this is handled by the scanEscape method below.
    // (The character with value 0 itself never matches any mask, so its bit
    // is free to carry this flag.)
    private static final long L_ESCAPED = 1L;
    private static final long H_ESCAPED = 0L;

    // uric          = reserved | unreserved | escaped
    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

    // pchar         = unreserved | escaped |
    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    private static final long L_PCHAR
        = L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");
    private static final long H_PCHAR
        = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");

    // All valid path characters
    private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");
    private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;

    // Dash, for use in domainlabel and toplabel
    private static final long L_DASH = 0x200000000000L; // lowMask("-");
    private static final long H_DASH = 0x0L; // highMask("-");

    // Dot, for use in hostnames
    private static final long L_DOT = 0x400000000000L; // lowMask(".");
    private static final long H_DOT = 0x0L; // highMask(".");

    // userinfo      = *( unreserved | escaped |
    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    private static final long L_USERINFO
        = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");
    private static final long H_USERINFO
        = H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;

    // reg_name      = 1*( unreserved | escaped | "$" | "," |
    //                     ";" | ":" | "@" | "&" | "=" | "+" )
    // ("@" is 64, so it lands in the high mask only; the low literal is
    // therefore the same as the one used for userinfo)
    private static final long L_REG_NAME
        = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");
    private static final long H_REG_NAME
        = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");

    // All valid characters for server-based authorities
    private static final long L_SERVER
        = L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");
    private static final long H_SERVER
        = H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");

    // Special case of server authority that represents an IPv6 address
    // In this case, a % does not signify an escape sequence
    private static final long L_SERVER_PERCENT
        = L_SERVER | 0x2000000000L; // lowMask("%");
    private static final long H_SERVER_PERCENT
        = H_SERVER; // | highMask("%") == 0L;

    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-.");
    private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L

    // scope_id = alpha | digit | "_" | "."
    // ("." contributes to the low mask, "_" to the high mask)
    private static final long L_SCOPE_ID
        = L_ALPHANUM | 0x400000000000L; // lowMask("_.");
    private static final long H_SCOPE_ID
        = H_ALPHANUM | 0x80000000L; // highMask("_.");
2700 
2701     // -- Escaping and encoding --
2702 
2703     private static final char[] hexDigits = {
2704         '0', '1', '2', '3', '4', '5', '6', '7',
2705         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2706     };
2707 
2708     private static void appendEscape(StringBuilder sb, byte b) {
2709         sb.append('%');
2710         sb.append(hexDigits[(b >> 4) & 0x0f]);
2711         sb.append(hexDigits[(b >> 0) & 0x0f]);
2712     }
2713 
2714     private static void appendEncoded(StringBuilder sb, char c) {
2715         ByteBuffer bb = null;
2716         try {
2717             bb = ThreadLocalCoders.encoderFor("UTF-8")
2718                 .encode(CharBuffer.wrap("" + c));
2719         } catch (CharacterCodingException x) {
2720             assert false;
2721         }
2722         while (bb.hasRemaining()) {
2723             int b = bb.get() & 0xff;
2724             if (b >= 0x80)
2725                 appendEscape(sb, (byte)b);
2726             else
2727                 sb.append((char)b);
2728         }
2729     }
2730 
2731     // Quote any characters in s that are not permitted
2732     // by the given mask pair
2733     //
2734     private static String quote(String s, long lowMask, long highMask) {
2735         StringBuilder sb = null;
2736         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2737         for (int i = 0; i < s.length(); i++) {
2738             char c = s.charAt(i);
2739             if (c < '\u0080') {
2740                 if (!match(c, lowMask, highMask)) {
2741                     if (sb == null) {
2742                         sb = new StringBuilder();
2743                         sb.append(s, 0, i);
2744                     }
2745                     appendEscape(sb, (byte)c);
2746                 } else {
2747                     if (sb != null)
2748                         sb.append(c);
2749                 }
2750             } else if (allowNonASCII
2751                        && (Character.isSpaceChar(c)
2752                            || Character.isISOControl(c))) {
2753                 if (sb == null) {
2754                     sb = new StringBuilder();
2755                     sb.append(s, 0, i);
2756                 }
2757                 appendEncoded(sb, c);
2758             } else {
2759                 if (sb != null)
2760                     sb.append(c);
2761             }
2762         }
2763         return (sb == null) ? s : sb.toString();
2764     }
2765 
2766     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2767     // assuming that s is otherwise legal
2768     //
2769     private static String encode(String s) {
2770         int n = s.length();
2771         if (n == 0)
2772             return s;
2773 
2774         // First check whether we actually need to encode
2775         for (int i = 0;;) {
2776             if (s.charAt(i) >= '\u0080')
2777                 break;
2778             if (++i >= n)
2779                 return s;
2780         }
2781 
2782         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
2783         ByteBuffer bb = null;
2784         try {
2785             bb = ThreadLocalCoders.encoderFor("UTF-8")
2786                 .encode(CharBuffer.wrap(ns));
2787         } catch (CharacterCodingException x) {
2788             assert false;
2789         }
2790 
2791         StringBuilder sb = new StringBuilder();
2792         while (bb.hasRemaining()) {
2793             int b = bb.get() & 0xff;
2794             if (b >= 0x80)
2795                 appendEscape(sb, (byte)b);
2796             else
2797                 sb.append((char)b);
2798         }
2799         return sb.toString();
2800     }
2801 
2802     private static int decode(char c) {
2803         if ((c >= '0') && (c <= '9'))
2804             return c - '0';
2805         if ((c >= 'a') && (c <= 'f'))
2806             return c - 'a' + 10;
2807         if ((c >= 'A') && (c <= 'F'))
2808             return c - 'A' + 10;
2809         assert false;
2810         return -1;
2811     }
2812 
2813     private static byte decode(char c1, char c2) {
2814         return (byte)(  ((decode(c1) & 0xf) << 4)
2815                       | ((decode(c2) & 0xf) << 0));
2816     }
2817 
    // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
    // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
    // sequence of escaped octets is not valid UTF-8 then the erroneous octets
    // are replaced with '\uFFFD'.
    // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
    //            with a scope_id
    //
    // This is a convenience form of decode(String, boolean) that always
    // passes true for ignorePercentInBrackets.
    //
    private static String decode(String s) {
        return decode(s, true);
    }
2828 
2829     // This method was introduced as a generalization of URI.decode method
2830     // to provide a fix for JDK-8037396
2831     private static String decode(String s, boolean ignorePercentInBrackets) {
2832         if (s == null)
2833             return s;
2834         int n = s.length();
2835         if (n == 0)
2836             return s;
2837         if (s.indexOf('%') < 0)
2838             return s;
2839 
2840         StringBuilder sb = new StringBuilder(n);
2841         ByteBuffer bb = ByteBuffer.allocate(n);
2842         CharBuffer cb = CharBuffer.allocate(n);
2843         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
2844                 .onMalformedInput(CodingErrorAction.REPLACE)
2845                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
2846 
2847         // This is not horribly efficient, but it will do for now
2848         char c = s.charAt(0);
2849         boolean betweenBrackets = false;
2850 
2851         for (int i = 0; i < n;) {
2852             assert c == s.charAt(i);    // Loop invariant
2853             if (c == '[') {
2854                 betweenBrackets = true;
2855             } else if (betweenBrackets && c == ']') {
2856                 betweenBrackets = false;
2857             }
2858             if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) {
2859                 sb.append(c);
2860                 if (++i >= n)
2861                     break;
2862                 c = s.charAt(i);
2863                 continue;
2864             }
2865             bb.clear();
2866             int ui = i;
2867             for (;;) {
2868                 assert (n - i >= 2);
2869                 bb.put(decode(s.charAt(++i), s.charAt(++i)));
2870                 if (++i >= n)
2871                     break;
2872                 c = s.charAt(i);
2873                 if (c != '%')
2874                     break;
2875             }
2876             bb.flip();
2877             cb.clear();
2878             dec.reset();
2879             CoderResult cr = dec.decode(bb, cb, true);
2880             assert cr.isUnderflow();
2881             cr = dec.flush(cb);
2882             assert cr.isUnderflow();
2883             sb.append(cb.flip().toString());
2884         }
2885 
2886         return sb.toString();
2887     }
2888 
2889 
2890     // -- Parsing --
2891 
2892     // For convenience we wrap the input URI string in a new instance of the
2893     // following internal class.  This saves always having to pass the input
2894     // string as an argument to each internal scan/parse method.
2895 
2896     private class Parser {
2897 
2898         private String input;           // URI input string
2899         private boolean requireServerAuthority = false;
2900 
2901         Parser(String s) {
2902             input = s;
2903             string = s;
2904         }
2905 
2906         // -- Methods for throwing URISyntaxException in various ways --
2907 
2908         private void fail(String reason) throws URISyntaxException {
2909             throw new URISyntaxException(input, reason);
2910         }
2911 
2912         private void fail(String reason, int p) throws URISyntaxException {
2913             throw new URISyntaxException(input, reason, p);
2914         }
2915 
2916         private void failExpecting(String expected, int p)
2917             throws URISyntaxException
2918         {
2919             fail("Expected " + expected, p);
2920         }
2921 
2922 
2923         // -- Simple access to the input string --
2924 
2925         // Tells whether start < end and, if so, whether charAt(start) == c
2926         //
2927         private boolean at(int start, int end, char c) {
2928             return (start < end) && (input.charAt(start) == c);
2929         }
2930 
2931         // Tells whether start + s.length() < end and, if so,
2932         // whether the chars at the start position match s exactly
2933         //
2934         private boolean at(int start, int end, String s) {
2935             int p = start;
2936             int sn = s.length();
2937             if (sn > end - p)
2938                 return false;
2939             int i = 0;
2940             while (i < sn) {
2941                 if (input.charAt(p++) != s.charAt(i)) {
2942                     break;
2943                 }
2944                 i++;
2945             }
2946             return (i == sn);
2947         }
2948 
2949 
2950         // -- Scanning --
2951 
2952         // The various scan and parse methods that follow use a uniform
2953         // convention of taking the current start position and end index as
2954         // their first two arguments.  The start is inclusive while the end is
2955         // exclusive, just as in the String class, i.e., a start/end pair
2956         // denotes the left-open interval [start, end) of the input string.
2957         //
2958         // These methods never proceed past the end position.  They may return
2959         // -1 to indicate outright failure, but more often they simply return
2960         // the position of the first char after the last char scanned.  Thus
2961         // a typical idiom is
2962         //
2963         //     int p = start;
2964         //     int q = scan(p, end, ...);
2965         //     if (q > p)
2966         //         // We scanned something
2967         //         ...;
2968         //     else if (q == p)
2969         //         // We scanned nothing
2970         //         ...;
2971         //     else if (q == -1)
2972         //         // Something went wrong
2973         //         ...;
2974 
2975 
2976         // Scan a specific char: If the char at the given start position is
2977         // equal to c, return the index of the next char; otherwise, return the
2978         // start position.
2979         //
2980         private int scan(int start, int end, char c) {
2981             if ((start < end) && (input.charAt(start) == c))
2982                 return start + 1;
2983             return start;
2984         }
2985 
2986         // Scan forward from the given start position.  Stop at the first char
2987         // in the err string (in which case -1 is returned), or the first char
2988         // in the stop string (in which case the index of the preceding char is
2989         // returned), or the end of the input string (in which case the length
2990         // of the input string is returned).  May return the start position if
2991         // nothing matches.
2992         //
2993         private int scan(int start, int end, String err, String stop) {
2994             int p = start;
2995             while (p < end) {
2996                 char c = input.charAt(p);
2997                 if (err.indexOf(c) >= 0)
2998                     return -1;
2999                 if (stop.indexOf(c) >= 0)
3000                     break;
3001                 p++;
3002             }
3003             return p;
3004         }
3005 
3006         // Scan forward from the given start position.  Stop at the first char
3007         // in the stop string (in which case the index of the preceding char is
3008         // returned), or the end of the input string (in which case the length
3009         // of the input string is returned).  May return the start position if
3010         // nothing matches.
3011         //
3012         private int scan(int start, int end, String stop) {
3013             int p = start;
3014             while (p < end) {
3015                 char c = input.charAt(p);
3016                 if (stop.indexOf(c) >= 0)
3017                     break;
3018                 p++;
3019             }
3020             return p;
3021         }
3022 
3023         // Scan a potential escape sequence, starting at the given position,
3024         // with the given first char (i.e., charAt(start) == c).
3025         //
3026         // This method assumes that if escapes are allowed then visible
3027         // non-US-ASCII chars are also allowed.
3028         //
3029         private int scanEscape(int start, int n, char first)
3030             throws URISyntaxException
3031         {
3032             int p = start;
3033             char c = first;
3034             if (c == '%') {
3035                 // Process escape pair
3036                 if ((p + 3 <= n)
3037                     && match(input.charAt(p + 1), L_HEX, H_HEX)
3038                     && match(input.charAt(p + 2), L_HEX, H_HEX)) {
3039                     return p + 3;
3040                 }
3041                 fail("Malformed escape pair", p);
3042             } else if ((c > 128)
3043                        && !Character.isSpaceChar(c)
3044                        && !Character.isISOControl(c)) {
3045                 // Allow unescaped but visible non-US-ASCII chars
3046                 return p + 1;
3047             }
3048             return p;
3049         }
3050 
3051         // Scan chars that match the given mask pair
3052         //
3053         private int scan(int start, int n, long lowMask, long highMask)
3054             throws URISyntaxException
3055         {
3056             int p = start;
3057             while (p < n) {
3058                 char c = input.charAt(p);
3059                 if (match(c, lowMask, highMask)) {
3060                     p++;
3061                     continue;
3062                 }
3063                 if ((lowMask & L_ESCAPED) != 0) {
3064                     int q = scanEscape(p, n, c);
3065                     if (q > p) {
3066                         p = q;
3067                         continue;
3068                     }
3069                 }
3070                 break;
3071             }
3072             return p;
3073         }
3074 
3075         // Check that each of the chars in [start, end) matches the given mask
3076         //
3077         private void checkChars(int start, int end,
3078                                 long lowMask, long highMask,
3079                                 String what)
3080             throws URISyntaxException
3081         {
3082             int p = scan(start, end, lowMask, highMask);
3083             if (p < end)
3084                 fail("Illegal character in " + what, p);
3085         }
3086 
3087         // Check that the char at position p matches the given mask
3088         //
3089         private void checkChar(int p,
3090                                long lowMask, long highMask,
3091                                String what)
3092             throws URISyntaxException
3093         {
3094             checkChars(p, p + 1, lowMask, highMask, what);
3095         }
3096 
3097 
3098         // -- Parsing --
3099 
3100         // [<scheme>:]<scheme-specific-part>[#<fragment>]
3101         //
3102         void parse(boolean rsa) throws URISyntaxException {
3103             requireServerAuthority = rsa;
3104             int n = input.length();
3105             int p = scan(0, n, "/?#", ":");
3106             if ((p >= 0) && at(p, n, ':')) {
3107                 if (p == 0)
3108                     failExpecting("scheme name", 0);
3109                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3110                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3111                 scheme = input.substring(0, p);
3112                 p++;                    // Skip ':'
3113                 if (at(p, n, '/')) {
3114                     p = parseHierarchical(p, n);
3115                 } else {
3116                     // opaque; need to create the schemeSpecificPart
3117                     int q = scan(p, n, "#");
3118                     if (q <= p)
3119                         failExpecting("scheme-specific part", p);
3120                     checkChars(p, q, L_URIC, H_URIC, "opaque part");
3121                     schemeSpecificPart = input.substring(p, q);
3122                     p = q;
3123                 }
3124             } else {
3125                 p = parseHierarchical(0, n);
3126             }
3127             if (at(p, n, '#')) {
3128                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3129                 fragment = input.substring(p + 1, n);
3130                 p = n;
3131             }
3132             if (p < n)
3133                 fail("end of URI", p);
3134         }
3135 
3136         // [//authority]<path>[?<query>]
3137         //
3138         // DEVIATION from RFC2396: We allow an empty authority component as
3139         // long as it's followed by a non-empty path, query component, or
3140         // fragment component.  This is so that URIs such as "file:///foo/bar"
3141         // will parse.  This seems to be the intent of RFC2396, though the
3142         // grammar does not permit it.  If the authority is empty then the
3143         // userInfo, host, and port components are undefined.
3144         //
3145         // DEVIATION from RFC2396: We allow empty relative paths.  This seems
3146         // to be the intent of RFC2396, but the grammar does not permit it.
3147         // The primary consequence of this deviation is that "#f" parses as a
3148         // relative URI with an empty path.
3149         //
3150         private int parseHierarchical(int start, int n)
3151             throws URISyntaxException
3152         {
3153             int p = start;
3154             if (at(p, n, '/') && at(p + 1, n, '/')) {
3155                 p += 2;
3156                 int q = scan(p, n, "/?#");
3157                 if (q > p) {
3158                     p = parseAuthority(p, q);
3159                 } else if (q < n) {
3160                     // DEVIATION: Allow empty authority prior to non-empty
3161                     // path, query component or fragment identifier
3162                 } else
3163                     failExpecting("authority", p);
3164             }
3165             int q = scan(p, n, "?#"); // DEVIATION: May be empty
3166             checkChars(p, q, L_PATH, H_PATH, "path");
3167             path = input.substring(p, q);
3168             p = q;
3169             if (at(p, n, '?')) {
3170                 p++;
3171                 q = scan(p, n, "#");
3172                 checkChars(p, q, L_URIC, H_URIC, "query");
3173                 query = input.substring(p, q);
3174                 p = q;
3175             }
3176             return p;
3177         }
3178 
3179         // authority     = server | reg_name
3180         //
3181         // Ambiguity: An authority that is a registry name rather than a server
3182         // might have a prefix that parses as a server.  We use the fact that
3183         // the authority component is always followed by '/' or the end of the
3184         // input string to resolve this: If the complete authority did not
3185         // parse as a server then we try to parse it as a registry name.
3186         //
3187         private int parseAuthority(int start, int n)
3188             throws URISyntaxException
3189         {
3190             int p = start;
3191             int q = p;
3192             URISyntaxException ex = null;
3193 
3194             boolean serverChars;
3195             boolean regChars;
3196 
3197             if (scan(p, n, "]") > p) {
3198                 // contains a literal IPv6 address, therefore % is allowed
3199                 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
3200             } else {
3201                 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
3202             }
3203             regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
3204 
3205             if (regChars && !serverChars) {
3206                 // Must be a registry-based authority
3207                 authority = input.substring(p, n);
3208                 return n;
3209             }
3210 
3211             if (serverChars) {
3212                 // Might be (probably is) a server-based authority, so attempt
3213                 // to parse it as such.  If the attempt fails, try to treat it
3214                 // as a registry-based authority.
3215                 try {
3216                     q = parseServer(p, n);
3217                     if (q < n)
3218                         failExpecting("end of authority", q);
3219                     authority = input.substring(p, n);
3220                 } catch (URISyntaxException x) {
3221                     // Undo results of failed parse
3222                     userInfo = null;
3223                     host = null;
3224                     port = -1;
3225                     if (requireServerAuthority) {
3226                         // If we're insisting upon a server-based authority,
3227                         // then just re-throw the exception
3228                         throw x;
3229                     } else {
3230                         // Save the exception in case it doesn't parse as a
3231                         // registry either
3232                         ex = x;
3233                         q = p;
3234                     }
3235                 }
3236             }
3237 
3238             if (q < n) {
3239                 if (regChars) {
3240                     // Registry-based authority
3241                     authority = input.substring(p, n);
3242                 } else if (ex != null) {
3243                     // Re-throw exception; it was probably due to
3244                     // a malformed IPv6 address
3245                     throw ex;
3246                 } else {
3247                     fail("Illegal character in authority", q);
3248                 }
3249             }
3250 
3251             return n;
3252         }
3253 
3254 
3255         // [<userinfo>@]<host>[:<port>]
3256         //
3257         private int parseServer(int start, int n)
3258             throws URISyntaxException
3259         {
3260             int p = start;
3261             int q;
3262 
3263             // userinfo
3264             q = scan(p, n, "/?#", "@");
3265             if ((q >= p) && at(q, n, '@')) {
3266                 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3267                 userInfo = input.substring(p, q);
3268                 p = q + 1;              // Skip '@'
3269             }
3270 
3271             // hostname, IPv4 address, or IPv6 address
3272             if (at(p, n, '[')) {
3273                 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3274                 p++;
3275                 q = scan(p, n, "/?#", "]");
3276                 if ((q > p) && at(q, n, ']')) {
3277                     // look for a "%" scope id
3278                     int r = scan (p, q, "%");
3279                     if (r > p) {
3280                         parseIPv6Reference(p, r);
3281                         if (r+1 == q) {
3282                             fail ("scope id expected");
3283                         }
3284                         checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
3285                                                 "scope id");
3286                     } else {
3287                         parseIPv6Reference(p, q);
3288                     }
3289                     host = input.substring(p-1, q+1);
3290                     p = q + 1;
3291                 } else {
3292                     failExpecting("closing bracket for IPv6 address", q);
3293                 }
3294             } else {
3295                 q = parseIPv4Address(p, n);
3296                 if (q <= p)
3297                     q = parseHostname(p, n);
3298                 p = q;
3299             }
3300 
3301             // port
3302             if (at(p, n, ':')) {
3303                 p++;
3304                 q = scan(p, n, "/");
3305                 if (q > p) {
3306                     checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3307                     try {
3308                         port = Integer.parseInt(input, p, q, 10);
3309                     } catch (NumberFormatException x) {
3310                         fail("Malformed port number", p);
3311                     }
3312                     p = q;
3313                 }
3314             }
3315             if (p < n)
3316                 failExpecting("port number", p);
3317 
3318             return p;
3319         }
3320 
3321         // Scan a string of decimal digits whose value fits in a byte
3322         //
3323         private int scanByte(int start, int n)
3324             throws URISyntaxException
3325         {
3326             int p = start;
3327             int q = scan(p, n, L_DIGIT, H_DIGIT);
3328             if (q <= p) return q;
3329             if (Integer.parseInt(input, p, q, 10) > 255) return p;
3330             return q;
3331         }
3332 
3333         // Scan an IPv4 address.
3334         //
3335         // If the strict argument is true then we require that the given
3336         // interval contain nothing besides an IPv4 address; if it is false
3337         // then we only require that it start with an IPv4 address.
3338         //
3339         // If the interval does not contain or start with (depending upon the
3340         // strict argument) a legal IPv4 address characters then we return -1
3341         // immediately; otherwise we insist that these characters parse as a
3342         // legal IPv4 address and throw an exception on failure.
3343         //
3344         // We assume that any string of decimal digits and dots must be an IPv4
3345         // address.  It won't parse as a hostname anyway, so making that
3346         // assumption here allows more meaningful exceptions to be thrown.
3347         //
3348         private int scanIPv4Address(int start, int n, boolean strict)
3349             throws URISyntaxException
3350         {
3351             int p = start;
3352             int q;
3353             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3354             if ((m <= p) || (strict && (m != n)))
3355                 return -1;
3356             for (;;) {
3357                 // Per RFC2732: At most three digits per byte
3358                 // Further constraint: Each element fits in a byte
3359                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3360                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3361                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3362                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3363                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3364                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3365                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3366                 if (q < m) break;
3367                 return q;
3368             }
3369             fail("Malformed IPv4 address", q);
3370             return -1;
3371         }
3372 
3373         // Take an IPv4 address: Throw an exception if the given interval
3374         // contains anything except an IPv4 address
3375         //
3376         private int takeIPv4Address(int start, int n, String expected)
3377             throws URISyntaxException
3378         {
3379             int p = scanIPv4Address(start, n, true);
3380             if (p <= start)
3381                 failExpecting(expected, start);
3382             return p;
3383         }
3384 
3385         // Attempt to parse an IPv4 address, returning -1 on failure but
3386         // allowing the given interval to contain [:<characters>] after
3387         // the IPv4 address.
3388         //
3389         private int parseIPv4Address(int start, int n) {
3390             int p;
3391 
3392             try {
3393                 p = scanIPv4Address(start, n, false);
3394             } catch (URISyntaxException x) {
3395                 return -1;
3396             } catch (NumberFormatException nfe) {
3397                 return -1;
3398             }
3399 
3400             if (p > start && p < n) {
3401                 // IPv4 address is followed by something - check that
3402                 // it's a ":" as this is the only valid character to
3403                 // follow an address.
3404                 if (input.charAt(p) != ':') {
3405                     p = -1;
3406                 }
3407             }
3408 
3409             if (p > start)
3410                 host = input.substring(start, p);
3411 
3412             return p;
3413         }
3414 
3415         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3416         // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
3417         // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
3418         //
3419         private int parseHostname(int start, int n)
3420             throws URISyntaxException
3421         {
3422             int p = start;
3423             int q;
3424             int l = -1;                 // Start of last parsed label
3425 
3426             do {
3427                 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3428                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3429                 if (q <= p)
3430                     break;
3431                 l = p;
3432                 if (q > p) {
3433                     p = q;
3434                     q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3435                     if (q > p) {
3436                         if (input.charAt(q - 1) == '-')
3437                             fail("Illegal character in hostname", q - 1);
3438                         p = q;
3439                     }
3440                 }
3441                 q = scan(p, n, '.');
3442                 if (q <= p)
3443                     break;
3444                 p = q;
3445             } while (p < n);
3446 
3447             if ((p < n) && !at(p, n, ':'))
3448                 fail("Illegal character in hostname", p);
3449 
3450             if (l < 0)
3451                 failExpecting("hostname", start);
3452 
3453             // for a fully qualified hostname check that the rightmost
3454             // label starts with an alpha character.
3455             if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
3456                 fail("Illegal character in hostname", l);
3457             }
3458 
3459             host = input.substring(start, p);
3460             return p;
3461         }
3462 
3463 
3464         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3465         //
3466         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3467         // the form ::12.34.56.78, which are clearly shown in the examples
3468         // earlier in the document.  Here is the original grammar:
3469         //
3470         //   IPv6address = hexpart [ ":" IPv4address ]
3471         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3472         //   hexseq      = hex4 *( ":" hex4)
3473         //   hex4        = 1*4HEXDIG
3474         //
3475         // We therefore use the following revised grammar:
3476         //
3477         //   IPv6address = hexseq [ ":" IPv4address ]
3478         //                 | hexseq [ "::" [ hexpost ] ]
3479         //                 | "::" [ hexpost ]
3480         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
3481         //   hexseq      = hex4 *( ":" hex4)
3482         //   hex4        = 1*4HEXDIG
3483         //
3484         // This covers all and only the following cases:
3485         //
3486         //   hexseq
3487         //   hexseq : IPv4address
3488         //   hexseq ::
3489         //   hexseq :: hexseq
3490         //   hexseq :: hexseq : IPv4address
3491         //   hexseq :: IPv4address
3492         //   :: hexseq
3493         //   :: hexseq : IPv4address
3494         //   :: IPv4address
3495         //   ::
3496         //
3497         // Additionally we constrain the IPv6 address as follows :-
3498         //
3499         //  i.  IPv6 addresses without compressed zeros should contain
3500         //      exactly 16 bytes.
3501         //
3502         //  ii. IPv6 addresses with compressed zeros should contain
3503         //      less than 16 bytes.
3504 
3505         private int ipv6byteCount = 0;
3506 
3507         private int parseIPv6Reference(int start, int n)
3508             throws URISyntaxException
3509         {
3510             int p = start;
3511             int q;
3512             boolean compressedZeros = false;
3513 
3514             q = scanHexSeq(p, n);
3515 
3516             if (q > p) {
3517                 p = q;
3518                 if (at(p, n, "::")) {
3519                     compressedZeros = true;
3520                     p = scanHexPost(p + 2, n);
3521                 } else if (at(p, n, ':')) {
3522                     p = takeIPv4Address(p + 1,  n, "IPv4 address");
3523                     ipv6byteCount += 4;
3524                 }
3525             } else if (at(p, n, "::")) {
3526                 compressedZeros = true;
3527                 p = scanHexPost(p + 2, n);
3528             }
3529             if (p < n)
3530                 fail("Malformed IPv6 address", start);
3531             if (ipv6byteCount > 16)
3532                 fail("IPv6 address too long", start);
3533             if (!compressedZeros && ipv6byteCount < 16)
3534                 fail("IPv6 address too short", start);
3535             if (compressedZeros && ipv6byteCount == 16)
3536                 fail("Malformed IPv6 address", start);
3537 
3538             return p;
3539         }
3540 
3541         private int scanHexPost(int start, int n)
3542             throws URISyntaxException
3543         {
3544             int p = start;
3545             int q;
3546 
3547             if (p == n)
3548                 return p;
3549 
3550             q = scanHexSeq(p, n);
3551             if (q > p) {
3552                 p = q;
3553                 if (at(p, n, ':')) {
3554                     p++;
3555                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3556                     ipv6byteCount += 4;
3557                 }
3558             } else {
3559                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3560                 ipv6byteCount += 4;
3561             }
3562             return p;
3563         }
3564 
3565         // Scan a hex sequence; return -1 if one could not be scanned
3566         //
3567         private int scanHexSeq(int start, int n)
3568             throws URISyntaxException
3569         {
3570             int p = start;
3571             int q;
3572 
3573             q = scan(p, n, L_HEX, H_HEX);
3574             if (q <= p)
3575                 return -1;
3576             if (at(q, n, '.'))          // Beginning of IPv4 address
3577                 return -1;
3578             if (q > p + 4)
3579                 fail("IPv6 hexadecimal digit sequence too long", p);
3580             ipv6byteCount += 2;
3581             p = q;
3582             while (p < n) {
3583                 if (!at(p, n, ':'))
3584                     break;
3585                 if (at(p + 1, n, ':'))
3586                     break;              // "::"
3587                 p++;
3588                 q = scan(p, n, L_HEX, H_HEX);
3589                 if (q <= p)
3590                     failExpecting("digits for an IPv6 address", p);
3591                 if (at(q, n, '.')) {    // Beginning of IPv4 address
3592                     p--;
3593                     break;
3594                 }
3595                 if (q > p + 4)
3596                     fail("IPv6 hexadecimal digit sequence too long", p);
3597                 ipv6byteCount += 2;
3598                 p = q;
3599             }
3600 
3601             return p;
3602         }
3603 
3604     }
3605     static {
3606         SharedSecrets.setJavaNetUriAccess(
3607             new JavaNetUriAccess() {
3608                 public URI create(String scheme, String path) {
3609                     return new URI(scheme, path);
3610                 }
3611             }
3612         );
3613     }
3614 }