New src/java.base/share/classes/java/net/URI.java

   1 /*
   2  * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.net;
  27 
  28 import java.io.IOException;
  29 import java.io.InvalidObjectException;
  30 import java.io.ObjectInputStream;
  31 import java.io.ObjectOutputStream;
  32 import java.io.Serializable;
  33 import java.nio.ByteBuffer;
  34 import java.nio.CharBuffer;
  35 import java.nio.charset.CharsetDecoder;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.CharacterCodingException;
  39 import java.text.Normalizer;
  40 import jdk.internal.loader.URLClassPath;
  41 import jdk.internal.misc.JavaNetAccess;
  42 import jdk.internal.misc.SharedSecrets;
  43 import sun.nio.cs.ThreadLocalCoders;
  44 
  45 import java.lang.Character;             // for javadoc
  46 import java.lang.NullPointerException;  // for javadoc
  47 
  48 
  49 /**
  50  * Represents a Uniform Resource Identifier (URI) reference.
  51  *
  52  * <p> Aside from some minor deviations noted below, an instance of this
  53  * class represents a URI reference as defined by
  54  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
  55  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
  56  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
  57  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
  58  * also supports scope_ids. The syntax and usage of scope_ids is described
  59  * <a href="Inet6Address.html#scoped">here</a>.
  60  * This class provides constructors for creating URI instances from
  61  * their components or by parsing their string forms, methods for accessing the
  62  * various components of an instance, and methods for normalizing, resolving,
  63  * and relativizing URI instances.  Instances of this class are immutable.
  64  *
  65  *
  66  * <h3> URI syntax and components </h3>
  67  *
  68  * At the highest level a URI reference (hereinafter simply "URI") in string
  69  * form has the syntax
  70  *
  71  * <blockquote>
  72  * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>]
  73  * </blockquote>
  74  *
  75  * where square brackets [...] delineate optional components and the characters
  76  * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves.
  77  *
  78  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
  79  * said to be <i>relative</i>.  URIs are also classified according to whether
  80  * they are <i>opaque</i> or <i>hierarchical</i>.
  81  *
  82  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
  83  * not begin with a slash character ({@code '/'}).  Opaque URIs are not
  84  * subject to further parsing.  Some examples of opaque URIs are:
  85  *
  86  * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
  87  * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr>
  88  * <tr><td>{@code news:comp.lang.java}</td></tr>
  89  * <tr><td>{@code urn:isbn:096139210x}</td></tr>
  90  * </table></blockquote>
  91  *
  92  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
  93  * scheme-specific part begins with a slash character, or a relative URI, that
  94  * is, a URI that does not specify a scheme.  Some examples of hierarchical
  95  * URIs are:
  96  *
  97  * <blockquote>
  98  * {@code http://example.com/languages/java/}<br>
  99  * {@code sample/a/index.html#28}<br>
 100  * {@code ../../demo/b/index.html}<br>
 101  * {@code file:///~/calendar}
 102  * </blockquote>
 103  *
 104  * <p> A hierarchical URI is subject to further parsing according to the syntax
 105  *
 106  * <blockquote>
 107  * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
 108  * </blockquote>
 109  *
 110  * where the characters <b>{@code :}</b>, <b>{@code /}</b>,
 111  * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves.  The
 112  * scheme-specific part of a hierarchical URI consists of the characters
 113  * between the scheme and fragment components.
 114  *
 115  * <p> The authority component of a hierarchical URI is, if specified, either
 116  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
 117  * parses according to the familiar syntax
 118  *
 119  * <blockquote>
 120  * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]
 121  * </blockquote>
 122  *
 123  * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for
 124  * themselves.  Nearly all URI schemes currently in use are server-based.  An
 125  * authority component that does not parse in this way is considered to be
 126  * registry-based.
 127  *
 128  * <p> The path component of a hierarchical URI is itself said to be absolute
 129  * if it begins with a slash character ({@code '/'}); otherwise it is
 130  * relative.  The path of a hierarchical URI that is either absolute or
 131  * specifies an authority is always absolute.
 132  *
 133  * <p> All told, then, a URI instance has the following nine components:
 134  *
 135  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
 136  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
 137  * <tr><td>scheme</td><td>{@code String}</td></tr>
 138  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td>{@code String}</td></tr>
 139  * <tr><td>authority</td><td>{@code String}</td></tr>
 140  * <tr><td>user-info</td><td>{@code String}</td></tr>
 141  * <tr><td>host</td><td>{@code String}</td></tr>
 142  * <tr><td>port</td><td>{@code int}</td></tr>
 143  * <tr><td>path</td><td>{@code String}</td></tr>
 144  * <tr><td>query</td><td>{@code String}</td></tr>
 145  * <tr><td>fragment</td><td>{@code String}</td></tr>
 146  * </table></blockquote>
 147  *
 148  * In a given instance any particular component is either <i>undefined</i> or
 149  * <i>defined</i> with a distinct value.  Undefined string components are
 150  * represented by {@code null}, while undefined integer components are
 151  * represented by {@code -1}.  A string component may be defined to have the
 152  * empty string as its value; this is not equivalent to that component being
 153  * undefined.
 154  *
 155  * <p> Whether a particular component is or is not defined in an instance
 156  * depends upon the type of the URI being represented.  An absolute URI has a
 157  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
 158  * possibly a fragment, but has no other components.  A hierarchical URI always
 159  * has a path (though it may be empty) and a scheme-specific-part (which at
 160  * least contains the path), and may have any of the other components.  If the
 161  * authority component is present and is server-based then the host component
 162  * will be defined and the user-information and port components may be defined.
 163  *
 164  *
 165  * <h4> Operations on URI instances </h4>
 166  *
 167  * The key operations supported by this class are those of
 168  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
 169  *
 170  * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."}
 171  * and {@code ".."} segments from the path component of a hierarchical URI.
 172  * Each {@code "."} segment is simply removed.  A {@code ".."} segment is
 173  * removed only if it is preceded by a non-{@code ".."} segment.
 174  * Normalization has no effect upon opaque URIs.
 175  *
 176  * <p> <i>Resolution</i> is the process of resolving one URI against another,
 177  * <i>base</i> URI.  The resulting URI is constructed from components of both
 178  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
 179  * base URI for those not specified in the original.  For hierarchical URIs,
 180  * the path of the original is resolved against the path of the base and then
 181  * normalized.  The result, for example, of resolving
 182  *
 183  * <blockquote>
 184  * {@code sample/a/index.html#28}
 185  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 186  * &nbsp;&nbsp;&nbsp;&nbsp;(1)
 187  * </blockquote>
 188  *
 189  * against the base URI {@code http://example.com/languages/java/} is the result
 190  * URI
 191  *
 192  * <blockquote>
 193  * {@code http://example.com/languages/java/sample/a/index.html#28}
 194  * </blockquote>
 195  *
 196  * Resolving the relative URI
 197  *
 198  * <blockquote>
 199  * {@code ../../demo/b/index.html}&nbsp;&nbsp;&nbsp;&nbsp;(2)
 200  * </blockquote>
 201  *
 202  * against this result yields, in turn,
 203  *
 204  * <blockquote>
 205  * {@code http://example.com/languages/java/demo/b/index.html}
 206  * </blockquote>
 207  *
 208  * Resolution of both absolute and relative URIs, and of both absolute and
 209  * relative paths in the case of hierarchical URIs, is supported.  Resolving
 210  * the URI {@code file:///~/calendar} against any other URI simply yields the
 211  * original URI, since it is absolute.  Resolving the relative URI (2) above
 212  * against the relative base URI (1) yields the normalized, but still relative,
 213  * URI
 214  *
 215  * <blockquote>
 216  * {@code demo/b/index.html}
 217  * </blockquote>
 218  *
 219  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
 220  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
 221  *
 222  * <blockquote>
 223  *   <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;and<br>
 224  *   <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;.<br>
 225  * </blockquote>
 226  *
 227  * This operation is often useful when constructing a document containing URIs
 228  * that must be made relative to the base URI of the document wherever
 229  * possible.  For example, relativizing the URI
 230  *
 231  * <blockquote>
 232  * {@code http://example.com/languages/java/sample/a/index.html#28}
 233  * </blockquote>
 234  *
 235  * against the base URI
 236  *
 237  * <blockquote>
 238  * {@code http://example.com/languages/java/}
 239  * </blockquote>
 240  *
 241  * yields the relative URI {@code sample/a/index.html#28}.
 242  *
 243  *
 244  * <h4> Character categories </h4>
 245  *
 246  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
 247  * various components of a URI reference.  The following categories, most of
 248  * which are taken from that specification, are used below to describe these
 249  * constraints:
 250  *
 251  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
 252  *   <tr><th valign=top><i>alpha</i></th>
 253  *       <td>The US-ASCII alphabetic characters,
 254  *        {@code 'A'}&nbsp;through&nbsp;{@code 'Z'}
 255  *        and {@code 'a'}&nbsp;through&nbsp;{@code 'z'}</td></tr>
 256  *   <tr><th valign=top><i>digit</i></th>
 257  *       <td>The US-ASCII decimal digit characters,
 258  *       {@code '0'}&nbsp;through&nbsp;{@code '9'}</td></tr>
 259  *   <tr><th valign=top><i>alphanum</i></th>
 260  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
 261  *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>
 262  *       <td>All <i>alphanum</i> characters together with those in the string
 263  *        {@code "_-!.~'()*"}</td></tr>
 264  *   <tr><th valign=top><i>punct</i></th>
 265  *       <td>The characters in the string {@code ",;:$&+="}</td></tr>
 266  *   <tr><th valign=top><i>reserved</i></th>
 267  *       <td>All <i>punct</i> characters together with those in the string
 268  *        {@code "?/[]@"}</td></tr>
 269  *   <tr><th valign=top><i>escaped</i></th>
 270  *       <td>Escaped octets, that is, triplets consisting of the percent
 271  *           character ({@code '%'}) followed by two hexadecimal digits
 272  *           ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and
 273  *           {@code 'a'}-{@code 'f'})</td></tr>
 274  *   <tr><th valign=top><i>other</i></th>
 275  *       <td>The Unicode characters that are not in the US-ASCII character set,
 276  *           are not control characters (according to the {@link
 277  *           java.lang.Character#isISOControl(char) Character.isISOControl}
 278  *           method), and are not space characters (according to the {@link
 279  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
 280  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
 281  *           limited to US-ASCII)</i></td></tr>
 282  * </table></blockquote>
 283  *
 284  * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
 285  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
 286  * characters.
 287  *
 288  *
 289  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
 290  *
 291  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
 292  * fragment components.  Escaping serves two purposes in URIs:
 293  *
 294  * <ul>
 295  *
 296  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
 297  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
 298  *   characters.  </p></li>
 299  *
 300  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
 301  *   component.  The user-info, path, query, and fragment components differ
 302  *   slightly in terms of which characters are considered legal and illegal.
 303  *   </p></li>
 304  *
 305  * </ul>
 306  *
 307  * These purposes are served in this class by three related operations:
 308  *
 309  * <ul>
 310  *
 311  *   <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
 312  *   with the sequence of escaped octets that represent that character in the
 313  *   UTF-8 character set.  The Euro currency symbol ({@code '\u005Cu20AC'}),
 314  *   for example, is encoded as {@code "%E2%82%AC"}.  <i>(<b>Deviation from
 315  *   RFC&nbsp;2396</b>, which does not specify any particular character
 316  *   set.)</i> </p></li>
 317  *
 318  *   <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
 319  *   encoding it.  The space character, for example, is quoted by replacing it
 320  *   with {@code "%20"}.  UTF-8 contains US-ASCII, hence for US-ASCII
 321  *   characters this transformation has exactly the effect required by
 322  *   RFC&nbsp;2396. </p></li>
 323  *
 324  *   <li><p><a name="decode"></a>
 325  *   A sequence of escaped octets is <i>decoded</i> by
 326  *   replacing it with the sequence of characters that it represents in the
 327  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
 328  *   effect of de-quoting any quoted US-ASCII characters as well as that of
 329  *   decoding any encoded non-US-ASCII characters.  If a <a
 330  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
 331  *   when decoding the escaped octets then the erroneous octets are replaced by
 332  *   {@code '\u005CuFFFD'}, the Unicode replacement character.  </p></li>
 333  *
 334  * </ul>
 335  *
 336  * These operations are exposed in the constructors and methods of this class
 337  * as follows:
 338  *
 339  * <ul>
 340  *
 341  *   <li><p> The {@linkplain #URI(java.lang.String) single-argument
 342  *   constructor} requires any illegal characters in its argument to be
 343  *   quoted and preserves any escaped octets and <i>other</i> characters that
 344  *   are present.  </p></li>
 345  *
 346  *   <li><p> The {@linkplain
 347  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
 348  *   multi-argument constructors} quote illegal characters as
 349  *   required by the components in which they appear.  The percent character
 350  *   ({@code '%'}) is always quoted by these constructors.  Any <i>other</i>
 351  *   characters are preserved.  </p></li>
 352  *
 353  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
 354  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
 355  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
 356  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
 357  *   values of their corresponding components in raw form, without interpreting
 358  *   any escaped octets.  The strings returned by these methods may contain
 359  *   both escaped octets and <i>other</i> characters, and will not contain any
 360  *   illegal characters.  </p></li>
 361  *
 362  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
 363  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
 364  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
 365  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
 366  *   octets in their corresponding components.  The strings returned by these
 367  *   methods may contain both <i>other</i> characters and illegal characters,
 368  *   and will not contain any escaped octets.  </p></li>
 369  *
 370  *   <li><p> The {@link #toString() toString} method returns a URI string with
 371  *   all necessary quotation but which may contain <i>other</i> characters.
 372  *   </p></li>
 373  *
 374  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
 375  *   quoted and encoded URI string that does not contain any <i>other</i>
 376  *   characters.  </p></li>
 377  *
 378  * </ul>
 379  *
 380  *
 381  * <h4> Identities </h4>
 382  *
 383  * For any URI <i>u</i>, it is always the case that
 384  *
 385  * <blockquote>
 386  * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )}&nbsp;.
 387  * </blockquote>
 388  *
 389  * For any URI <i>u</i> that does not contain redundant syntax such as two
 390  * slashes before an empty authority (as in {@code file:///tmp/}&nbsp;) or a
 391  * colon following a host name but no port (as in
 392  * {@code http://java.sun.com:}&nbsp;), and that does not encode characters
 393  * except those that must be quoted, the following identities also hold:
 394  * <pre>
 395  *     new URI(<i>u</i>.getScheme(),
 396  *             <i>u</i>.getSchemeSpecificPart(),
 397  *             <i>u</i>.getFragment())
 398  *     .equals(<i>u</i>)</pre>
 399  * in all cases,
 400  * <pre>
 401  *     new URI(<i>u</i>.getScheme(),
 402  *             <i>u</i>.getAuthority(),
 403  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 404  *             <i>u</i>.getFragment())
 405  *     .equals(<i>u</i>)</pre>
 406  * if <i>u</i> is hierarchical, and
 407  * <pre>
 408  *     new URI(<i>u</i>.getScheme(),
 409  *             <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(),
 410  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 411  *             <i>u</i>.getFragment())
 412  *     .equals(<i>u</i>)</pre>
 413  * if <i>u</i> is hierarchical and has either no authority or a server-based
 414  * authority.
 415  *
 416  *
 417  * <h4> URIs, URLs, and URNs </h4>
 418  *
 419  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
 420  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
 421  * not every URI is a URL.  This is because there is another subcategory of
 422  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
 423  * specify how to locate them.  The {@code mailto}, {@code news}, and
 424  * {@code isbn} URIs shown above are examples of URNs.
 425  *
 426  * <p> The conceptual distinction between URIs and URLs is reflected in the
 427  * differences between this class and the {@link URL} class.
 428  *
 429  * <p> An instance of this class represents a URI reference in the syntactic
 430  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
 431  * A URI string is parsed according to the generic syntax without regard to the
 432  * scheme, if any, that it specifies.  No lookup of the host, if any, is
 433  * performed, and no scheme-dependent stream handler is constructed.  Equality,
 434  * hashing, and comparison are defined strictly in terms of the character
 435  * content of the instance.  In other words, a URI instance is little more than
 436  * a structured string that supports the syntactic, scheme-independent
 437  * operations of comparison, normalization, resolution, and relativization.
 438  *
 439  * <p> An instance of the {@link URL} class, by contrast, represents the
 440  * syntactic components of a URL together with some of the information required
 441  * to access the resource that it describes.  A URL must be absolute, that is,
 442  * it must always specify a scheme.  A URL string is parsed according to its
 443  * scheme.  A stream handler is always established for a URL, and in fact it is
 444  * impossible to create a URL instance for a scheme for which no handler is
 445  * available.  Equality and hashing depend upon both the scheme and the
 446  * Internet address of the host, if any; comparison is not defined.  In other
 447  * words, a URL is a structured string that supports the syntactic operation of
 448  * resolution as well as the network I/O operations of looking up the host and
 449  * opening a connection to the specified resource.
 450  *
 451  *
 452  * @author Mark Reinhold
 453  * @since 1.4
 454  *
 455  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
 456  * transformation format of ISO 10646</i></a>, <br><a
 457  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
 458  * Architecture</i></a>, <br><a
 459  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
 460  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
 461  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
 462  * Literal IPv6 Addresses in URLs</i></a>, <br><a
 463  * href="URISyntaxException.html">URISyntaxException</a>
 464  */
 465 
 466 public final class URI
 467     implements Comparable<URI>, Serializable
 468 {
 469 
 470     // Note: Comments containing the word "ASSERT" indicate places where a
 471     // throw of an InternalError should be replaced by an appropriate assertion
 472     // statement once asserts are enabled in the build.
 473 
 474     static final long serialVersionUID = -6052424284110960213L;
 475 
 476 
 477     // -- Properties and components of this instance --
 478 
 479     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
 480     private transient String scheme;            // null ==> relative URI
 481     private transient String fragment;          // null ==> undefined
 482 
 483     // Hierarchical URI components: [//<authority>]<path>[?<query>]
 484     private transient String authority;         // Registry- or server-based; null ==> undefined
 485 
 486     // Server-based authority: [<userInfo>@]<host>[:<port>]
 487     private transient String userInfo;          // null ==> undefined
 488     private transient String host;              // null ==> registry-based or no authority
 489     private transient int port = -1;            // -1 ==> undefined
 490 
 491     // Remaining components of hierarchical URIs
 492     private transient String path;              // null ==> opaque
 493     private transient String query;             // null ==> undefined
 494 
 495     // The remaining fields may be computed on demand, which is safe even in
 496     // the face of multiple threads racing to initialize them
 497     private transient String schemeSpecificPart;
 498     private transient int hash;        // Zero ==> undefined
 499 
 500     // NOTE(review): presumably caches of the decoded component forms -- confirm against the getters
 501     private transient String decodedAuthority;
 502     private transient String decodedPath;
 503     private transient String decodedQuery;
 504     private transient String decodedFragment;
 505     private transient String decodedSchemeSpecificPart;
 506 
 507     /**
 508      * The string form of this URI.
 509      *
 510      * @serial
 511      */
 512     private volatile String string;             // The only serializable field; every field above is transient
 513 
 514 
 515 
 516     // -- Constructors and factories --
 517 
 518     private URI() { }                           // Used internally; sets nothing, so fields keep their declared defaults
 519 
 520     /**
 521      * Constructs a URI by parsing the given string.
 522      *
 523      * <p> This constructor parses the given string exactly as specified by the
 524      * grammar in <a
 525      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 526      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
 527      *
 528      * <ul>
 529      *
 530      *   <li><p> An empty authority component is permitted as long as it is
 531      *   followed by a non-empty path, a query component, or a fragment
 532      *   component.  This allows the parsing of URIs such as
 533      *   {@code "file:///foo/bar"}, which seems to be the intent of
 534      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
 535      *   authority component is empty then the user-information, host, and port
 536      *   components are undefined. </p></li>
 537      *
 538      *   <li><p> Empty relative paths are permitted; this seems to be the
 539      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
 540      *   primary consequence of this deviation is that a standalone fragment
 541      *   such as {@code "#foo"} parses as a relative URI with an empty path
 542      *   and the given fragment, and can be usefully <a
 543      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
 544      *
 545      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
 546      *   specified by <a
 547      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
 548      *   element of a dotted-quad address must contain no more than three
 549      *   decimal digits.  Each element is further constrained to have a value
 550      *   no greater than 255. </p></li>
 551      *
 552      *   <li> <p> Hostnames in host components that comprise only a single
 553      *   domain label are permitted to start with an <i>alphanum</i>
 554      *   character. This seems to be the intent of <a
 555      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 556      *   section&nbsp;3.2.2 although the grammar does not permit it. The
 557      *   consequence of this deviation is that the authority component of a
 558      *   hierarchical URI such as {@code s://123}, will parse as a server-based
 559      *   authority. </p></li>
 560      *
 561      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
 562      *   address must be enclosed in square brackets ({@code '['} and
 563      *   {@code ']'}) as specified by <a
 564      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
 565      *   IPv6 address itself must parse according to <a
 566      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
 567      *   addresses are further constrained to describe no more than sixteen
 568      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
 569      *   but not expressible in the grammar. </p></li>
 570      *
 571      *   <li><p> Characters in the <i>other</i> category are permitted wherever
 572      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
 573      *   user-information, path, query, and fragment components, as well as in
 574      *   the authority component if the authority is registry-based.  This
 575      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
 576      *   character set. </p></li>
 577      *
 578      * </ul>
 579      *
 580      * @param  str   The string to be parsed into a URI
 581      *
 582      * @throws  NullPointerException
 583      *          If {@code str} is {@code null}
 584      *
 585      * @throws  URISyntaxException
 586      *          If the given string violates RFC&nbsp;2396, as augmented
 587      *          by the above deviations
 588      */
 589     public URI(String str) throws URISyntaxException {
 590         new Parser(str).parse(false);           // Parser is declared outside this excerpt; presumably parse() populates this instance -- TODO confirm
 591     }
 592 
 593     /**
 594      * Constructs a hierarchical URI from the given components.
 595      *
 596      * <p> If a scheme is given then the path, if also given, must either be
 597      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 598      * component of the new URI may be left undefined by passing {@code null}
 599      * for the corresponding parameter or, in the case of the {@code port}
 600      * parameter, by passing {@code -1}.
 601      *
 602      * <p> This constructor first builds a URI string from the given components
 603      * according to the rules specified in <a
 604      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 605      * section&nbsp;5.2, step&nbsp;7: </p>
 606      *
 607      * <ol>
 608      *
 609      *   <li><p> Initially, the result string is empty. </p></li>
 610      *
 611      *   <li><p> If a scheme is given then it is appended to the result,
 612      *   followed by a colon character ({@code ':'}).  </p></li>
 613      *
 614      *   <li><p> If user information, a host, or a port are given then the
 615      *   string {@code "//"} is appended.  </p></li>
 616      *
 617      *   <li><p> If user information is given then it is appended, followed by
 618      *   a commercial-at character ({@code '@'}).  Any character not in the
 619      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 620      *   categories is <a href="#quote">quoted</a>.  </p></li>
 621      *
 622      *   <li><p> If a host is given then it is appended.  If the host is a
 623      *   literal IPv6 address but is not enclosed in square brackets
 624      *   ({@code '['} and {@code ']'}) then the square brackets are added.
 625      *   </p></li>
 626      *
 627      *   <li><p> If a port number is given then a colon character
 628      *   ({@code ':'}) is appended, followed by the port number in decimal.
 629      *   </p></li>
 630      *
 631      *   <li><p> If a path is given then it is appended.  Any character not in
 632      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 633      *   categories, and not equal to the slash character ({@code '/'}) or the
 634      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 635      *
 636      *   <li><p> If a query is given then a question-mark character
 637      *   ({@code '?'}) is appended, followed by the query.  Any character that
 638      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 639      *   </p></li>
 640      *
 641      *   <li><p> Finally, if a fragment is given then a hash character
 642      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 643      *   that is not a legal URI character is quoted.  </p></li>
 644      *
 645      * </ol>
 646      *
 647      * <p> The resulting URI string is then parsed as if by invoking the {@link
 648      * #URI(String)} constructor and then invoking the {@link
 649      * #parseServerAuthority()} method upon the result; this may cause a {@link
 650      * URISyntaxException} to be thrown.  </p>
 651      *
 652      * @param   scheme    Scheme name
 653      * @param   userInfo  User name and authorization information
 654      * @param   host      Host name
 655      * @param   port      Port number
 656      * @param   path      Path
 657      * @param   query     Query
 658      * @param   fragment  Fragment
 659      *
 660      * @throws URISyntaxException
 661      *         If both a scheme and a path are given but the path is relative,
 662      *         if the URI string constructed from the given components violates
 663      *         RFC&nbsp;2396, or if the authority component of the string is
 664      *         present but cannot be parsed as a server-based authority
 665      */
 666     public URI(String scheme,
 667                String userInfo, String host, int port,
 668                String path, String query, String fragment)
 669         throws URISyntaxException
 670     {
             // Assemble the URI text from the individual components.  The second
             // and third arguments are passed as null here; sibling constructors
             // use those positions for the scheme-specific part and the
             // authority, which this overload expresses as userInfo/host/port.
 671         String s = toString(scheme, null,
 672                             null, userInfo, host, port,
 673                             path, query, fragment);
             // Validate the scheme/path combination before parsing (helper is
             // declared outside this excerpt; presumably it enforces the
             // "path must be empty or start with '/'" rule -- TODO confirm).
 674         checkPath(s, scheme, path);
             // Parse the assembled text.  The 'true' argument matches the call
             // made by parseServerAuthority(), i.e. the authority is required to
             // parse as server-based.
 675         new Parser(s).parse(true);
 676     }
 677 
 678     /**
 679      * Constructs a hierarchical URI from the given components.
 680      *
 681      * <p> If a scheme is given then the path, if also given, must either be
 682      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 683      * component of the new URI may be left undefined by passing {@code null}
 684      * for the corresponding parameter.
 685      *
 686      * <p> This constructor first builds a URI string from the given components
 687      * according to the rules specified in <a
 688      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 689      * section&nbsp;5.2, step&nbsp;7: </p>
 690      *
 691      * <ol>
 692      *
 693      *   <li><p> Initially, the result string is empty.  </p></li>
 694      *
 695      *   <li><p> If a scheme is given then it is appended to the result,
 696      *   followed by a colon character ({@code ':'}).  </p></li>
 697      *
 698      *   <li><p> If an authority is given then the string {@code "//"} is
 699      *   appended, followed by the authority.  If the authority contains a
 700      *   literal IPv6 address then the address must be enclosed in square
 701      *   brackets ({@code '['} and {@code ']'}).  Any character not in the
 702      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 703      *   categories, and not equal to the commercial-at character
 704      *   ({@code '@'}), is <a href="#quote">quoted</a>.  </p></li>
 705      *
 706      *   <li><p> If a path is given then it is appended.  Any character not in
 707      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 708      *   categories, and not equal to the slash character ({@code '/'}) or the
 709      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 710      *
 711      *   <li><p> If a query is given then a question-mark character
 712      *   ({@code '?'}) is appended, followed by the query.  Any character that
 713      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 714      *   </p></li>
 715      *
 716      *   <li><p> Finally, if a fragment is given then a hash character
 717      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 718      *   that is not a legal URI character is quoted.  </p></li>
 719      *
 720      * </ol>
 721      *
 722      * <p> The resulting URI string is then parsed as if by invoking the {@link
 723      * #URI(String)} constructor and then invoking the {@link
 724      * #parseServerAuthority()} method upon the result; this may cause a {@link
 725      * URISyntaxException} to be thrown.  </p>
 726      *
 727      * @param   scheme     Scheme name
 728      * @param   authority  Authority
 729      * @param   path       Path
 730      * @param   query      Query
 731      * @param   fragment   Fragment
 732      *
 733      * @throws URISyntaxException
 734      *         If both a scheme and a path are given but the path is relative,
 735      *         if the URI string constructed from the given components violates
 736      *         RFC&nbsp;2396, or if the authority component of the string is
 737      *         present but cannot be parsed as a server-based authority
 738      */
 739     public URI(String scheme,
 740                String authority,
 741                String path, String query, String fragment)
 742         throws URISyntaxException
 743     {
             // Assemble the URI text.  The authority is supplied as one string,
             // so the userInfo/host/port positions are passed as null/null/-1.
 744         String s = toString(scheme, null,
 745                             authority, null, null, -1,
 746                             path, query, fragment);
             // Validate the scheme/path combination before parsing (helper is
             // declared outside this excerpt).
 747         checkPath(s, scheme, path);
             // NOTE(review): this passes 'false', whereas the seven-argument
             // constructor and parseServerAuthority() pass 'true'.  The Javadoc
             // of this constructor says the result is parsed as if
             // parseServerAuthority() were invoked -- confirm which behaviour
             // is intended before relying on either.
 748         new Parser(s).parse(false);
 749     }
 750 
 751     /**
 752      * Constructs a hierarchical URI from the given components.
 753      *
 754      * <p> A component may be left undefined by passing {@code null}.
 755      *
 756      * <p> This convenience constructor works as if by invoking the
 757      * seven-argument constructor as follows:
 758      *
 759      * <blockquote>
 760      * {@code new} {@link #URI(String, String, String, int, String, String, String)
 761      * URI}{@code (scheme, null, host, -1, path, null, fragment);}
 762      * </blockquote>
 763      *
 764      * @param   scheme    Scheme name
 765      * @param   host      Host name
 766      * @param   path      Path
 767      * @param   fragment  Fragment
 768      *
 769      * @throws  URISyntaxException
 770      *          If the URI string constructed from the given components
 771      *          violates RFC&nbsp;2396
 772      */
 773     public URI(String scheme, String host, String path, String fragment)
 774         throws URISyntaxException
 775     {
             // Pure delegation to the seven-argument constructor: userInfo and
             // query are left undefined (null) and the port is left undefined (-1).
 776         this(scheme, null, host, -1, path, null, fragment);
 777     }
 778 
 779     /**
 780      * Constructs a URI from the given components.
 781      *
 782      * <p> A component may be left undefined by passing {@code null}.
 783      *
 784      * <p> This constructor first builds a URI in string form using the given
 785      * components as follows:  </p>
 786      *
 787      * <ol>
 788      *
 789      *   <li><p> Initially, the result string is empty.  </p></li>
 790      *
 791      *   <li><p> If a scheme is given then it is appended to the result,
 792      *   followed by a colon character ({@code ':'}).  </p></li>
 793      *
 794      *   <li><p> If a scheme-specific part is given then it is appended.  Any
 795      *   character that is not a <a href="#legal-chars">legal URI character</a>
 796      *   is <a href="#quote">quoted</a>.  </p></li>
 797      *
 798      *   <li><p> Finally, if a fragment is given then a hash character
 799      *   ({@code '#'}) is appended to the string, followed by the fragment.
 800      *   Any character that is not a legal URI character is quoted.  </p></li>
 801      *
 802      * </ol>
 803      *
 804      * <p> The resulting URI string is then parsed in order to create the new
 805      * URI instance as if by invoking the {@link #URI(String)} constructor;
 806      * this may cause a {@link URISyntaxException} to be thrown.  </p>
 807      *
 808      * @param   scheme    Scheme name
 809      * @param   ssp       Scheme-specific part
 810      * @param   fragment  Fragment
 811      *
 812      * @throws  URISyntaxException
 813      *          If the URI string constructed from the given components
 814      *          violates RFC&nbsp;2396
 815      */
 816     public URI(String scheme, String ssp, String fragment)
 817         throws URISyntaxException
 818     {
             // Only scheme, scheme-specific part and fragment are supplied; every
             // hierarchical component slot is passed as undefined (null, or -1
             // for the port).  The assembled text is parsed with 'false', the
             // same flag the five-argument constructor uses; unlike the
             // hierarchical constructors there is no checkPath call here.
 819         new Parser(toString(scheme, ssp,
 820                             null, null, null, -1,
 821                             null, null, fragment))
 822             .parse(false);
 823     }
 824 
 825     /**
 826      * Constructs a simple URI consisting of only a scheme and a pre-validated
 827      * path. Provides a fast-path for some internal cases.
 828      */
 829     URI(String scheme, String path) {
             // Package-private fast path: the two fields are assigned directly,
             // with no Parser run and no checkPath call, so the caller is
             // responsible for passing values that are already valid.  No other
             // field is assigned by this constructor.
 830         this.scheme = scheme;
 831         this.path = path;
 832     }
 833 
 834     /**
 835      * Creates a URI by parsing the given string.
 836      *
 837      * <p> This convenience factory method works as if by invoking the {@link
 838      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
 839      * constructor is caught and wrapped in a new {@link
 840      * IllegalArgumentException} object, which is then thrown.
 841      *
 842      * <p> This method is provided for use in situations where it is known that
 843      * the given string is a legal URI, for example for URI constants declared
 844      * within a program, and so it would be considered a programming error
 845      * for the string not to parse as such.  The constructors, which throw
 846      * {@link URISyntaxException} directly, should be used in situations where a
 847      * URI is being constructed from user input or from some other source that
 848      * may be prone to errors.  </p>
 849      *
 850      * @param  str   The string to be parsed into a URI
 851      * @return The new URI
 852      *
 853      * @throws  NullPointerException
 854      *          If {@code str} is {@code null}
 855      *
 856      * @throws  IllegalArgumentException
 857      *          If the given string violates RFC&nbsp;2396
 858      */
 859     public static URI create(String str) {
 860         try {
 861             return new URI(str);
 862         } catch (URISyntaxException x) {
                 // Translate the checked exception into an unchecked one, reusing
                 // its message and keeping the original as the cause.
 863             throw new IllegalArgumentException(x.getMessage(), x);
 864         }
 865     }
 866 
 867 
 868     // -- Operations --
 869 
 870     /**
 871      * Attempts to parse this URI's authority component, if defined, into
 872      * user-information, host, and port components.
 873      *
 874      * <p> If this URI's authority component has already been recognized as
 875      * being server-based then it will already have been parsed into
 876      * user-information, host, and port components.  In this case, or if this
 877      * URI has no authority component, this method simply returns this URI.
 878      *
 879      * <p> Otherwise this method attempts once more to parse the authority
 880      * component into user-information, host, and port components, and throws
 881      * an exception describing why the authority component could not be parsed
 882      * in that way.
 883      *
 884      * <p> This method is provided because the generic URI syntax specified in
 885      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 886      * cannot always distinguish a malformed server-based authority from a
 887      * legitimate registry-based authority.  It must therefore treat some
 888      * instances of the former as instances of the latter.  The authority
 889      * component in the URI string {@code "//foo:bar"}, for example, is not a
 890      * legal server-based authority but it is legal as a registry-based
 891      * authority.
 892      *
 893      * <p> In many common situations, for example when working with URIs that are
 894      * known to be either URNs or URLs, the hierarchical URIs being used will
 895      * always be server-based.  They therefore must either be parsed as such or
 896      * treated as an error.  In these cases a statement such as
 897      *
 898      * <blockquote>
 899      * {@code URI }<i>u</i>{@code  = new URI(str).parseServerAuthority();}
 900      * </blockquote>
 901      *
 902      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
 903      * it has an authority component, has a server-based authority with proper
 904      * user-information, host, and port components.  Invoking this method also
 905      * ensures that if the authority could not be parsed in that way then an
 906      * appropriate diagnostic message can be issued based upon the exception
 907      * that is thrown. </p>
 908      *
 909      * @return  A URI whose authority field has been parsed
 910      *          as a server-based authority
 911      *
 912      * @throws  URISyntaxException
 913      *          If the authority component of this URI is defined
 914      *          but cannot be parsed as a server-based authority
 915      *          according to RFC&nbsp;2396
 916      */
 917     public URI parseServerAuthority()
 918         throws URISyntaxException
 919     {
 920         // We could be clever and cache the error message and index from the
 921         // exception thrown during the original parse, but that would require
 922         // either more fields or a more-obscure representation.
             // Nothing to do when a host is already present (the authority was
             // parsed as server-based) or when there is no authority at all.
 923         if ((host != null) || (authority == null))
 924             return this;
             // Re-parse this URI's string form, this time requiring a
             // server-based authority; a failure surfaces as URISyntaxException.
 925         new Parser(toString()).parse(true);
 926         return this;
 927     }
 928 
 929     /**
 930      * Normalizes this URI's path.
 931      *
 932      * <p> If this URI is opaque, or if its path is already in normal form,
 933      * then this URI is returned.  Otherwise a new URI is constructed that is
 934      * identical to this URI except that its path is computed by normalizing
 935      * this URI's path in a manner consistent with <a
 936      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 937      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
 938      * </p>
 939      *
 940      * <ol>
 941      *
 942      *   <li><p> All {@code "."} segments are removed. </p></li>
 943      *
 944      *   <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."}
 945      *   segment then both of these segments are removed.  This step is
 946      *   repeated until it is no longer applicable. </p></li>
 947      *
 948      *   <li><p> If the path is relative, and if its first segment contains a
 949      *   colon character ({@code ':'}), then a {@code "."} segment is
 950      *   prepended.  This prevents a relative URI with a path such as
 951      *   {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a
 952      *   scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}.
 953      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
 954      *
 955      * </ol>
 956      *
 957      * <p> A normalized path will begin with one or more {@code ".."} segments
 958      * if there were insufficient non-{@code ".."} segments preceding them to
 959      * allow their removal.  A normalized path will begin with a {@code "."}
 960      * segment if one was inserted by step 3 above.  Otherwise, a normalized
 961      * path will not contain any {@code "."} or {@code ".."} segments. </p>
 962      *
 963      * @return  A URI equivalent to this URI,
 964      *          but whose path is in normal form
 965      */
 966     public URI normalize() {
             // Instance-level entry point; the work is done by the one-argument
             // normalize helper, which is declared outside this excerpt.
 967         return normalize(this);
 968     }
 969 
 970     /**
 971      * Resolves the given URI against this URI.
 972      *
 973      * <p> If the given URI is already absolute, or if this URI is opaque, then
 974      * the given URI is returned.
 975      *
 976      * <p><a name="resolve-frag"></a> If the given URI's fragment component is
 977      * defined, its path component is empty, and its scheme, authority, and
 978      * query components are undefined, then a URI with the given fragment but
 979      * with all other components equal to those of this URI is returned.  This
 980      * allows a URI representing a standalone fragment reference, such as
 981      * {@code "#foo"}, to be usefully resolved against a base URI.
 982      *
 983      * <p> Otherwise this method constructs a new hierarchical URI in a manner
 984      * consistent with <a
 985      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 986      * section&nbsp;5.2; that is: </p>
 987      *
 988      * <ol>
 989      *
 990      *   <li><p> A new URI is constructed with this URI's scheme and the given
 991      *   URI's query and fragment components. </p></li>
 992      *
 993      *   <li><p> If the given URI has an authority component then the new URI's
 994      *   authority and path are taken from the given URI. </p></li>
 995      *
 996      *   <li><p> Otherwise the new URI's authority component is copied from
 997      *   this URI, and its path is computed as follows: </p>
 998      *
 999      *   <ol>
1000      *
1001      *     <li><p> If the given URI's path is absolute then the new URI's path
1002      *     is taken from the given URI. </p></li>
1003      *
1004      *     <li><p> Otherwise the given URI's path is relative, and so the new
1005      *     URI's path is computed by resolving the path of the given URI
1006      *     against the path of this URI.  This is done by concatenating all but
1007      *     the last segment of this URI's path, if any, with the given URI's
1008      *     path and then normalizing the result as if by invoking the {@link
1009      *     #normalize() normalize} method. </p></li>
1010      *
1011      *   </ol></li>
1012      *
1013      * </ol>
1014      *
1015      * <p> The result of this method is absolute if, and only if, either this
1016      * URI is absolute or the given URI is absolute.  </p>
1017      *
1018      * @param  uri  The URI to be resolved against this URI
1019      * @return The resulting URI
1020      *
1021      * @throws  NullPointerException
1022      *          If {@code uri} is {@code null}
1023      */
1024     public URI resolve(URI uri) {
             // This URI acts as the base (first argument) and the given URI as the
             // reference to resolve; the two-argument helper is declared outside
             // this excerpt.
1025         return resolve(this, uri);
1026     }
1027 
1028     /**
1029      * Constructs a new URI by parsing the given string and then resolving it
1030      * against this URI.
1031      *
1032      * <p> This convenience method works as if invoking it were equivalent to
1033      * evaluating the expression {@link #resolve(java.net.URI)
1034      * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p>
1035      *
1036      * @param  str   The string to be parsed into a URI
1037      * @return The resulting URI
1038      *
1039      * @throws  NullPointerException
1040      *          If {@code str} is {@code null}
1041      *
1042      * @throws  IllegalArgumentException
1043      *          If the given string violates RFC&nbsp;2396
1044      */
1045     public URI resolve(String str) {
             // Parse via create(), which reports a syntax error as an
             // IllegalArgumentException, then reuse the URI-based overload.
1046         return resolve(URI.create(str));
1047     }
1048 
1049     /**
1050      * Relativizes the given URI against this URI.
1051      *
1052      * <p> The relativization of the given URI against this URI is computed as
1053      * follows: </p>
1054      *
1055      * <ol>
1056      *
1057      *   <li><p> If either this URI or the given URI are opaque, or if the
1058      *   scheme and authority components of the two URIs are not identical, or
1059      *   if the path of this URI is not a prefix of the path of the given URI,
1060      *   then the given URI is returned. </p></li>
1061      *
1062      *   <li><p> Otherwise a new relative hierarchical URI is constructed with
1063      *   query and fragment components taken from the given URI and with a path
1064      *   component computed by removing this URI's path from the beginning of
1065      *   the given URI's path. </p></li>
1066      *
1067      * </ol>
1068      *
1069      * @param  uri  The URI to be relativized against this URI
1070      * @return The resulting URI
1071      *
1072      * @throws  NullPointerException
1073      *          If {@code uri} is {@code null}
1074      */
1075     public URI relativize(URI uri) {
             // This URI acts as the base (first argument); the two-argument helper
             // is declared outside this excerpt.
1076         return relativize(this, uri);
1077     }
1078 
1079     /**
1080      * Constructs a URL from this URI.
1081      *
1082      * <p> This convenience method works as if invoking it were equivalent to
1083      * evaluating the expression {@code new URL(this.toString())} after
1084      * first checking that this URI is absolute. </p>
1085      *
1086      * @return  A URL constructed from this URI
1087      *
1088      * @throws  IllegalArgumentException
1089      *          If this URI is not absolute
1090      *
1091      * @throws  MalformedURLException
1092      *          If a protocol handler for the URL could not be found,
1093      *          or if some other error occurred while constructing the URL
1094      */
1095     public URL toURL() throws MalformedURLException {
             // The conversion, including the documented absoluteness check, is
             // left to URL.fromURI -- presumably that is where the
             // IllegalArgumentException originates; confirm against URL.
1096         return URL.fromURI(this);
1097     }
1098 
1099     // -- Component access methods --
1100 
1101     /**
1102      * Returns the scheme component of this URI.
1103      *
1104      * <p> The scheme component of a URI, if defined, only contains characters
1105      * in the <i>alphanum</i> category and in the string {@code "-.+"}.  A
1106      * scheme always starts with an <i>alpha</i> character. <p>
1107      *
1108      * The scheme component of a URI cannot contain escaped octets, hence this
1109      * method does not perform any decoding.
1110      *
1111      * @return  The scheme component of this URI,
1112      *          or {@code null} if the scheme is undefined
1113      */
1114     public String getScheme() {
             // Returned exactly as stored; no decode step is applied.
1115         return scheme;
1116     }
1117 
1118     /**
1119      * Tells whether or not this URI is absolute.
1120      *
1121      * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1122      *
1123      * @return  {@code true} if, and only if, this URI is absolute
1124      */
1125     public boolean isAbsolute() {
             // Absolute is defined purely by the presence of a scheme.
1126         return scheme != null;
1127     }
1128 
1129     /**
1130      * Tells whether or not this URI is opaque.
1131      *
1132      * <p> A URI is opaque if, and only if, it is absolute and its
1133      * scheme-specific part does not begin with a slash character ('/').
1134      * An opaque URI has a scheme, a scheme-specific part, and possibly
1135      * a fragment; all other components are undefined. </p>
1136      *
1137      * @return  {@code true} if, and only if, this URI is opaque
1138      */
1139     public boolean isOpaque() {
             // Opacity is reported exactly when the path field is null; the
             // scheme is not consulted here.
1140         return path == null;
1141     }
1142 
1143     /**
1144      * Returns the raw scheme-specific part of this URI.  The scheme-specific
1145      * part is never undefined, though it may be empty.
1146      *
1147      * <p> The scheme-specific part of a URI only contains legal URI
1148      * characters. </p>
1149      *
1150      * @return  The raw scheme-specific part of this URI
1151      *          (never {@code null})
1152      */
1153     public String getRawSchemeSpecificPart() {
             // Return the cached value if one has already been computed.  The
             // field is read once into a local and only the local is used below.
1154         String part = schemeSpecificPart;
1155         if (part != null) {
1156             return part;
1157         }
1158 
1159         String s = string;
1160         if (s != null) {
1161             // if string is defined, components will have been parsed
                 // Carve the scheme-specific part out of the full string: it
                 // starts after "scheme:" and ends before "#fragment".
1162             int start = 0;
1163             int end = s.length();
1164             if (scheme != null) {
                     // +1 skips the ':' that follows the scheme.
1165                 start = scheme.length() + 1;
1166             }
1167             if (fragment != null) {
                     // +1 also drops the '#' that precedes the fragment.
1168                 end -= fragment.length() + 1;
1169             }
                 // When the path alone has the same length as that range, reuse
                 // the existing path string instead of taking a substring.
1170             if (path != null && path.length() == end - start) {
1171                 part = path;
1172             } else {
1173                 part = s.substring(start, end);
1174             }
1175         } else {
                 // No string form is available: rebuild the part from the
                 // individual (decoded) components via the helper declared
                 // outside this excerpt.
1176             StringBuilder sb = new StringBuilder();
1177             appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1178                                  host, port, getPath(), getQuery());
1179             part = sb.toString();
1180         }
             // Store the result for subsequent calls and return it.
1181         return schemeSpecificPart = part;
1182     }
1183 
1184     /**
1185      * Returns the decoded scheme-specific part of this URI.
1186      *
1187      * <p> The string returned by this method is equal to that returned by the
1188      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1189      * except that all sequences of escaped octets are <a
1190      * href="#decode">decoded</a>.  </p>
1191      *
1192      * @return  The decoded scheme-specific part of this URI
1193      *          (never {@code null})
1194      */
1195     public String getSchemeSpecificPart() {
             // Read the cache field once; the local is what is tested and returned.
1196         String part = decodedSchemeSpecificPart;
1197         if (part == null) {
                 // Not cached yet: decode the raw form and remember the result.
1198             decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart());
1199         }
1200         return part;
1201     }
1202 
1203     /**
1204      * Returns the raw authority component of this URI.
1205      *
1206      * <p> The authority component of a URI, if defined, only contains the
1207      * commercial-at character ({@code '@'}) and characters in the
1208      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1209      * categories.  If the authority is server-based then it is further
1210      * constrained to have valid user-information, host, and port
1211      * components. </p>
1212      *
1213      * @return  The raw authority component of this URI,
1214      *          or {@code null} if the authority is undefined
1215      */
1216     public String getRawAuthority() {
             // Returned exactly as stored; getAuthority() is the variant that
             // runs this same field through decode.
1217         return authority;
1218     }
1219 
1220     /**
1221      * Returns the decoded authority component of this URI.
1222      *
1223      * <p> The string returned by this method is equal to that returned by the
1224      * {@link #getRawAuthority() getRawAuthority} method except that all
1225      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1226      *
1227      * @return  The decoded authority component of this URI,
1228      *          or {@code null} if the authority is undefined
1229      */
1230     public String getAuthority() {
             // Read the cache field once; the local is what is tested and returned.
1231         String auth = decodedAuthority;
             // Decode only when nothing is cached and a raw authority exists;
             // with no authority the method falls through and returns null.
1232         if ((auth == null) && (authority != null)) {
1233             decodedAuthority = auth = decode(authority);
1234         }
1235         return auth;
1236     }
1237 
1238     /**
1239      * Returns the raw user-information component of this URI.
1240      *
1241      * <p> The user-information component of a URI, if defined, only contains
1242      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1243      * <i>other</i> categories. </p>
1244      *
1245      * @return  The raw user-information component of this URI,
1246      *          or {@code null} if the user information is undefined
1247      */
1248     public String getRawUserInfo() {
             // Returned exactly as stored; getUserInfo() is the variant that
             // runs this same field through decode.
1249         return userInfo;
1250     }
1251 
1252     /**
1253      * Returns the decoded user-information component of this URI.
1254      *
1255      * <p> The string returned by this method is equal to that returned by the
1256      * {@link #getRawUserInfo() getRawUserInfo} method except that all
1257      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1258      *
1259      * @return  The decoded user-information component of this URI,
1260      *          or {@code null} if the user information is undefined
1261      */
1262     public String getUserInfo() {
             // Read the cache field once; the local is what is tested and returned.
1263         String user = decodedUserInfo;
             // Decode only when nothing is cached and raw user information
             // exists; otherwise the cached value (or null) is returned.
1264         if ((user == null) && (userInfo != null)) {
1265             decodedUserInfo = user = decode(userInfo);
1266         }
1267         return user;
1268     }
1269 
1270     /**
1271      * Returns the host component of this URI.
1272      *
1273      * <p> The host component of a URI, if defined, will have one of the
1274      * following forms: </p>
1275      *
1276      * <ul>
1277      *
1278      *   <li><p> A domain name consisting of one or more <i>labels</i>
1279      *   separated by period characters ({@code '.'}), optionally followed by
1280      *   a period character.  Each label consists of <i>alphanum</i> characters
1281      *   as well as hyphen characters ({@code '-'}), though hyphens never
1282      *   occur as the first or last characters in a label. The rightmost
1283      *   label of a domain name consisting of two or more labels, begins
1284      *   with an <i>alpha</i> character. </li>
1285      *
1286      *   <li><p> A dotted-quad IPv4 address of the form
1287      *   <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +},
1288      *   where no <i>digit</i> sequence is longer than three characters and no
1289      *   sequence has a value larger than 255. </p></li>
1290      *
1291      *   <li><p> An IPv6 address enclosed in square brackets ({@code '['} and
1292      *   {@code ']'}) and consisting of hexadecimal digits, colon characters
1293      *   ({@code ':'}), and possibly an embedded IPv4 address.  The full
1294      *   syntax of IPv6 addresses is specified in <a
1295      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1296      *   Addressing Architecture</i></a>.  </p></li>
1297      *
1298      * </ul>
1299      *
1300      * The host component of a URI cannot contain escaped octets, hence this
1301      * method does not perform any decoding.
1302      *
1303      * @return  The host component of this URI,
1304      *          or {@code null} if the host is undefined
1305      */
1306     public String getHost() {
             // Returned exactly as stored; no decode step is applied to the host.
1307         return host;
1308     }
1309 
1310     /**
1311      * Returns the port number of this URI.
1312      *
1313      * <p> The port component of a URI, if defined, is a non-negative
1314      * integer. </p>
1315      *
1316      * @return  The port component of this URI,
1317      *          or {@code -1} if the port is undefined
1318      */
1319     public int getPort() {
             // Returns the stored port directly; an undefined port is represented
             // by -1 according to this method's contract.
1320         return port;
1321     }
1322 
1323     /**
1324      * Returns the raw path component of this URI.
1325      *
1326      * <p> The path component of a URI, if defined, only contains the slash
1327      * character ({@code '/'}), the commercial-at character ({@code '@'}),
1328      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1329      * and <i>other</i> categories. </p>
1330      *
1331      * @return  The path component of this URI,
1332      *          or {@code null} if the path is undefined
1333      */
1334     public String getRawPath() {
             // Returned exactly as stored; getPath() is the variant that runs
             // this same field through decode.
1335         return path;
1336     }
1337 
1338     /**
1339      * Returns the decoded path component of this URI.
1340      *
1341      * <p> The string returned by this method is equal to that returned by the
1342      * {@link #getRawPath() getRawPath} method except that all sequences of
1343      * escaped octets are <a href="#decode">decoded</a>.  </p>
1344      *
1345      * @return  The decoded path component of this URI,
1346      *          or {@code null} if the path is undefined
1347      */
1348     public String getPath() {
             // Read the cache field once; the local is what is tested and returned.
1349         String decoded = decodedPath;
             // Decode only when nothing is cached and a raw path exists;
             // otherwise the cached value (or null) is returned.
1350         if ((decoded == null) && (path != null)) {
1351             decodedPath = decoded = decode(path);
1352         }
1353         return decoded;
1354     }
1355 
1356     /**
1357      * Returns the raw query component of this URI.
1358      *
1359      * <p> The query component of a URI, if defined, only contains legal URI
1360      * characters. </p>
1361      *
1362      * @return  The raw query component of this URI,
1363      *          or {@code null} if the query is undefined
1364      */
1365     public String getRawQuery() {
1366         return query;
1367     }
1368 
1369     /**
1370      * Returns the decoded query component of this URI.
1371      *
1372      * <p> The string returned by this method is equal to that returned by the
1373      * {@link #getRawQuery() getRawQuery} method except that all sequences of
1374      * escaped octets are <a href="#decode">decoded</a>.  </p>
1375      *
1376      * @return  The decoded query component of this URI,
1377      *          or {@code null} if the query is undefined
1378      */
1379     public String getQuery() {
1380         String decoded = decodedQuery;
1381         if ((decoded == null) && (query != null)) {
1382             decodedQuery = decoded = decode(query, false);
1383         }
1384         return decoded;
1385     }
1386 
1387     /**
1388      * Returns the raw fragment component of this URI.
1389      *
1390      * <p> The fragment component of a URI, if defined, only contains legal URI
1391      * characters. </p>
1392      *
1393      * @return  The raw fragment component of this URI,
1394      *          or {@code null} if the fragment is undefined
1395      */
1396     public String getRawFragment() {
1397         return fragment;
1398     }
1399 
1400     /**
1401      * Returns the decoded fragment component of this URI.
1402      *
1403      * <p> The string returned by this method is equal to that returned by the
1404      * {@link #getRawFragment() getRawFragment} method except that all
1405      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1406      *
1407      * @return  The decoded fragment component of this URI,
1408      *          or {@code null} if the fragment is undefined
1409      */
1410     public String getFragment() {
1411         String decoded = decodedFragment;
1412         if ((decoded == null) && (fragment != null)) {
1413             decodedFragment = decoded = decode(fragment, false);
1414         }
1415         return decoded;
1416     }
1417 
1418 
1419     // -- Equality, comparison, hash code, toString, and serialization --
1420 
1421     /**
1422      * Tests this URI for equality with another object.
1423      *
1424      * <p> If the given object is not a URI then this method immediately
1425      * returns {@code false}.
1426      *
1427      * <p> For two URIs to be considered equal requires that either both are
1428      * opaque or both are hierarchical.  Their schemes must either both be
1429      * undefined or else be equal without regard to case. Their fragments
1430      * must either both be undefined or else be equal.
1431      *
1432      * <p> For two opaque URIs to be considered equal, their scheme-specific
1433      * parts must be equal.
1434      *
1435      * <p> For two hierarchical URIs to be considered equal, their paths must
1436      * be equal and their queries must either both be undefined or else be
1437      * equal.  Their authorities must either both be undefined, or both be
1438      * registry-based, or both be server-based.  If their authorities are
1439      * defined and are registry-based, then they must be equal.  If their
1440      * authorities are defined and are server-based, then their hosts must be
1441      * equal without regard to case, their port numbers must be equal, and
1442      * their user-information components must be equal.
1443      *
1444      * <p> When testing the user-information, path, query, fragment, authority,
1445      * or scheme-specific parts of two URIs for equality, the raw forms rather
1446      * than the encoded forms of these components are compared and the
1447      * hexadecimal digits of escaped octets are compared without regard to
1448      * case.
1449      *
1450      * <p> This method satisfies the general contract of the {@link
1451      * java.lang.Object#equals(Object) Object.equals} method. </p>
1452      *
1453      * @param   ob   The object to which this object is to be compared
1454      *
1455      * @return  {@code true} if, and only if, the given object is a URI that
1456      *          is identical to this URI
1457      */
1458     public boolean equals(Object ob) {
1459         if (ob == this)
1460             return true;
1461         if (!(ob instanceof URI))
1462             return false;
1463         URI that = (URI)ob;
1464         if (this.isOpaque() != that.isOpaque()) return false;
1465         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1466         if (!equal(this.fragment, that.fragment)) return false;
1467 
1468         // Opaque
1469         if (this.isOpaque())
1470             return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1471 
1472         // Hierarchical
1473         if (!equal(this.path, that.path)) return false;
1474         if (!equal(this.query, that.query)) return false;
1475 
1476         // Authorities
1477         if (this.authority == that.authority) return true;
1478         if (this.host != null) {
1479             // Server-based
1480             if (!equal(this.userInfo, that.userInfo)) return false;
1481             if (!equalIgnoringCase(this.host, that.host)) return false;
1482             if (this.port != that.port) return false;
1483         } else if (this.authority != null) {
1484             // Registry-based
1485             if (!equal(this.authority, that.authority)) return false;
1486         } else if (this.authority != that.authority) {
1487             return false;
1488         }
1489 
1490         return true;
1491     }
1492 
1493     /**
1494      * Returns a hash-code value for this URI.  The hash code is based upon all
1495      * of the URI's components, and satisfies the general contract of the
1496      * {@link java.lang.Object#hashCode() Object.hashCode} method.
1497      *
1498      * @return  A hash-code value for this URI
1499      */
1500     public int hashCode() {
1501         int h = hash;
1502         if (h == 0) {
1503             h = hashIgnoringCase(0, scheme);
1504             h = hash(h, fragment);
1505             if (isOpaque()) {
1506                 h = hash(h, schemeSpecificPart);
1507             } else {
1508                 h = hash(h, path);
1509                 h = hash(h, query);
1510                 if (host != null) {
1511                     h = hash(h, userInfo);
1512                     h = hashIgnoringCase(h, host);
1513                     h += 1949 * port;
1514                 } else {
1515                     h = hash(h, authority);
1516                 }
1517             }
1518             if (h != 0) {
1519                 hash = h;
1520             }
1521         }
1522         return h;
1523     }
1524 
1525     /**
1526      * Compares this URI to another object, which must be a URI.
1527      *
1528      * <p> When comparing corresponding components of two URIs, if one
1529      * component is undefined but the other is defined then the first is
1530      * considered to be less than the second.  Unless otherwise noted, string
1531      * components are ordered according to their natural, case-sensitive
1532      * ordering as defined by the {@link java.lang.String#compareTo(Object)
1533      * String.compareTo} method.  String components that are subject to
1534      * encoding are compared by comparing their raw forms rather than their
1535      * encoded forms.
1536      *
1537      * <p> The ordering of URIs is defined as follows: </p>
1538      *
1539      * <ul>
1540      *
1541      *   <li><p> Two URIs with different schemes are ordered according to the
1542      *   ordering of their schemes, without regard to case. </p></li>
1543      *
1544      *   <li><p> A hierarchical URI is considered to be less than an opaque URI
1545      *   with an identical scheme. </p></li>
1546      *
1547      *   <li><p> Two opaque URIs with identical schemes are ordered according
1548      *   to the ordering of their scheme-specific parts. </p></li>
1549      *
1550      *   <li><p> Two opaque URIs with identical schemes and scheme-specific
1551      *   parts are ordered according to the ordering of their
1552      *   fragments. </p></li>
1553      *
1554      *   <li><p> Two hierarchical URIs with identical schemes are ordered
1555      *   according to the ordering of their authority components: </p>
1556      *
1557      *   <ul>
1558      *
1559      *     <li><p> If both authority components are server-based then the URIs
1560      *     are ordered according to their user-information components; if these
1561      *     components are identical then the URIs are ordered according to the
1562      *     ordering of their hosts, without regard to case; if the hosts are
1563      *     identical then the URIs are ordered according to the ordering of
1564      *     their ports. </p></li>
1565      *
1566      *     <li><p> If one or both authority components are registry-based then
1567      *     the URIs are ordered according to the ordering of their authority
1568      *     components. </p></li>
1569      *
1570      *   </ul></li>
1571      *
1572      *   <li><p> Finally, two hierarchical URIs with identical schemes and
1573      *   authority components are ordered according to the ordering of their
1574      *   paths; if their paths are identical then they are ordered according to
1575      *   the ordering of their queries; if the queries are identical then they
1576      *   are ordered according to the order of their fragments. </p></li>
1577      *
1578      * </ul>
1579      *
1580      * <p> This method satisfies the general contract of the {@link
1581      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1582      * method. </p>
1583      *
1584      * @param   that
1585      *          The object to which this URI is to be compared
1586      *
1587      * @return  A negative integer, zero, or a positive integer as this URI is
1588      *          less than, equal to, or greater than the given URI
1589      *
1590      * @throws  ClassCastException
1591      *          If the given object is not a URI
1592      */
1593     public int compareTo(URI that) {
1594         int c;
1595 
1596         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1597             return c;
1598 
1599         if (this.isOpaque()) {
1600             if (that.isOpaque()) {
1601                 // Both opaque
1602                 if ((c = compare(this.schemeSpecificPart,
1603                                  that.schemeSpecificPart)) != 0)
1604                     return c;
1605                 return compare(this.fragment, that.fragment);
1606             }
1607             return +1;                  // Opaque > hierarchical
1608         } else if (that.isOpaque()) {
1609             return -1;                  // Hierarchical < opaque
1610         }
1611 
1612         // Hierarchical
1613         if ((this.host != null) && (that.host != null)) {
1614             // Both server-based
1615             if ((c = compare(this.userInfo, that.userInfo)) != 0)
1616                 return c;
1617             if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1618                 return c;
1619             if ((c = this.port - that.port) != 0)
1620                 return c;
1621         } else {
1622             // If one or both authorities are registry-based then we simply
1623             // compare them in the usual, case-sensitive way.  If one is
1624             // registry-based and one is server-based then the strings are
1625             // guaranteed to be unequal, hence the comparison will never return
1626             // zero and the compareTo and equals methods will remain
1627             // consistent.
1628             if ((c = compare(this.authority, that.authority)) != 0) return c;
1629         }
1630 
1631         if ((c = compare(this.path, that.path)) != 0) return c;
1632         if ((c = compare(this.query, that.query)) != 0) return c;
1633         return compare(this.fragment, that.fragment);
1634     }
1635 
1636     /**
1637      * Returns the content of this URI as a string.
1638      *
1639      * <p> If this URI was created by invoking one of the constructors in this
1640      * class then a string equivalent to the original input string, or to the
1641      * string computed from the originally-given components, as appropriate, is
1642      * returned.  Otherwise this URI was created by normalization, resolution,
1643      * or relativization, and so a string is constructed from this URI's
1644      * components according to the rules specified in <a
1645      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1646      * section&nbsp;5.2, step&nbsp;7. </p>
1647      *
1648      * @return  The string form of this URI
1649      */
1650     public String toString() {
1651         String s = string;
1652         if (s == null) {
1653             s = defineString();
1654         }
1655         return s;
1656     }
1657 
1658     private String defineString() {
1659         String s = string;
1660         if (s != null) {
1661             return s;
1662         }
1663 
1664         StringBuilder sb = new StringBuilder();
1665         if (scheme != null) {
1666             sb.append(scheme);
1667             sb.append(':');
1668         }
1669         if (isOpaque()) {
1670             sb.append(schemeSpecificPart);
1671         } else {
1672             if (host != null) {
1673                 sb.append("//");
1674                 if (userInfo != null) {
1675                     sb.append(userInfo);
1676                     sb.append('@');
1677                 }
1678                 boolean needBrackets = ((host.indexOf(':') >= 0)
1679                         && !host.startsWith("[")
1680                         && !host.endsWith("]"));
1681                 if (needBrackets) sb.append('[');
1682                 sb.append(host);
1683                 if (needBrackets) sb.append(']');
1684                 if (port != -1) {
1685                     sb.append(':');
1686                     sb.append(port);
1687                 }
1688             } else if (authority != null) {
1689                 sb.append("//");
1690                 sb.append(authority);
1691             }
1692             if (path != null)
1693                 sb.append(path);
1694             if (query != null) {
1695                 sb.append('?');
1696                 sb.append(query);
1697             }
1698         }
1699         if (fragment != null) {
1700             sb.append('#');
1701             sb.append(fragment);
1702         }
1703         return string = sb.toString();
1704     }
1705 
1706     /**
1707      * Returns the content of this URI as a US-ASCII string.
1708      *
1709      * <p> If this URI does not contain any characters in the <i>other</i>
1710      * category then an invocation of this method will return the same value as
1711      * an invocation of the {@link #toString() toString} method.  Otherwise
1712      * this method works as if by invoking that method and then <a
1713      * href="#encode">encoding</a> the result.  </p>
1714      *
1715      * @return  The string form of this URI, encoded as needed
1716      *          so that it only contains characters in the US-ASCII
1717      *          charset
1718      */
1719     public String toASCIIString() {
1720         return encode(toString());
1721     }
1722 
1723 
1724     // -- Serialization support --
1725 
1726     /**
1727      * Saves the content of this URI to the given serial stream.
1728      *
1729      * <p> The only serializable field of a URI instance is its {@code string}
1730      * field.  That field is given a value, if it does not have one already,
1731      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1732      * method of the given object-output stream is invoked. </p>
1733      *
1734      * @param  os  The object-output stream to which this object
1735      *             is to be written
1736      */
1737     private void writeObject(ObjectOutputStream os)
1738         throws IOException
1739     {
1740         defineString();
1741         os.defaultWriteObject();        // Writes the string field only
1742     }
1743 
1744     /**
1745      * Reconstitutes a URI from the given serial stream.
1746      *
1747      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1748      * invoked to read the value of the {@code string} field.  The result is
1749      * then parsed in the usual way.
1750      *
1751      * @param  is  The object-input stream from which this object
1752      *             is being read
1753      */
1754     private void readObject(ObjectInputStream is)
1755         throws ClassNotFoundException, IOException
1756     {
1757         port = -1;                      // Argh
1758         is.defaultReadObject();
1759         try {
1760             new Parser(string).parse(false);
1761         } catch (URISyntaxException x) {
1762             IOException y = new InvalidObjectException("Invalid URI");
1763             y.initCause(x);
1764             throw y;
1765         }
1766     }
1767 
1768 
1769     // -- End of public methods --
1770 
1771 
1772     // -- Utility methods for string-field comparison and hashing --
1773 
1774     // These methods return appropriate values for null string arguments,
1775     // thereby simplifying the equals, hashCode, and compareTo methods.
1776     //
1777     // The case-ignoring methods should only be applied to strings whose
1778     // characters are all known to be US-ASCII.  Because of this restriction,
1779     // these methods are faster than the similar methods in the String class.
1780 
1781     // US-ASCII only
1782     private static int toLower(char c) {
1783         if ((c >= 'A') && (c <= 'Z'))
1784             return c + ('a' - 'A');
1785         return c;
1786     }
1787 
1788     // US-ASCII only
1789     private static int toUpper(char c) {
1790         if ((c >= 'a') && (c <= 'z'))
1791             return c - ('a' - 'A');
1792         return c;
1793     }
1794 
1795     private static boolean equal(String s, String t) {
1796         if (s == t) return true;
1797         if ((s != null) && (t != null)) {
1798             if (s.length() != t.length())
1799                 return false;
1800             if (s.indexOf('%') < 0)
1801                 return s.equals(t);
1802             int n = s.length();
1803             for (int i = 0; i < n;) {
1804                 char c = s.charAt(i);
1805                 char d = t.charAt(i);
1806                 if (c != '%') {
1807                     if (c != d)
1808                         return false;
1809                     i++;
1810                     continue;
1811                 }
1812                 if (d != '%')
1813                     return false;
1814                 i++;
1815                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1816                     return false;
1817                 i++;
1818                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1819                     return false;
1820                 i++;
1821             }
1822             return true;
1823         }
1824         return false;
1825     }
1826 
1827     // US-ASCII only
1828     private static boolean equalIgnoringCase(String s, String t) {
1829         if (s == t) return true;
1830         if ((s != null) && (t != null)) {
1831             int n = s.length();
1832             if (t.length() != n)
1833                 return false;
1834             for (int i = 0; i < n; i++) {
1835                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1836                     return false;
1837             }
1838             return true;
1839         }
1840         return false;
1841     }
1842 
1843     private static int hash(int hash, String s) {
1844         if (s == null) return hash;
1845         return s.indexOf('%') < 0 ? hash * 127 + s.hashCode()
1846                                   : normalizedHash(hash, s);
1847     }
1848 
1849 
1850     private static int normalizedHash(int hash, String s) {
1851         int h = 0;
1852         for (int index = 0; index < s.length(); index++) {
1853             char ch = s.charAt(index);
1854             h = 31 * h + ch;
1855             if (ch == '%') {
1856                 /*
1857                  * Process the next two encoded characters
1858                  */
1859                 for (int i = index + 1; i < index + 3; i++)
1860                     h = 31 * h + toUpper(s.charAt(i));
1861                 index += 2;
1862             }
1863         }
1864         return hash * 127 + h;
1865     }
1866 
1867     // US-ASCII only
1868     private static int hashIgnoringCase(int hash, String s) {
1869         if (s == null) return hash;
1870         int h = hash;
1871         int n = s.length();
1872         for (int i = 0; i < n; i++)
1873             h = 31 * h + toLower(s.charAt(i));
1874         return h;
1875     }
1876 
1877     private static int compare(String s, String t) {
1878         if (s == t) return 0;
1879         if (s != null) {
1880             if (t != null)
1881                 return s.compareTo(t);
1882             else
1883                 return +1;
1884         } else {
1885             return -1;
1886         }
1887     }
1888 
1889     // US-ASCII only
1890     private static int compareIgnoringCase(String s, String t) {
1891         if (s == t) return 0;
1892         if (s != null) {
1893             if (t != null) {
1894                 int sn = s.length();
1895                 int tn = t.length();
1896                 int n = sn < tn ? sn : tn;
1897                 for (int i = 0; i < n; i++) {
1898                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1899                     if (c != 0)
1900                         return c;
1901                 }
1902                 return sn - tn;
1903             }
1904             return +1;
1905         } else {
1906             return -1;
1907         }
1908     }
1909 
1910 
1911     // -- String construction --
1912 
1913     // If a scheme is given then the path, if given, must be absolute
1914     //
1915     private static void checkPath(String s, String scheme, String path)
1916         throws URISyntaxException
1917     {
1918         if (scheme != null) {
1919             if ((path != null)
1920                 && ((path.length() > 0) && (path.charAt(0) != '/')))
1921                 throw new URISyntaxException(s,
1922                                              "Relative path in absolute URI");
1923         }
1924     }
1925 
1926     private void appendAuthority(StringBuilder sb,
1927                                  String authority,
1928                                  String userInfo,
1929                                  String host,
1930                                  int port)
1931     {
1932         if (host != null) {
1933             sb.append("//");
1934             if (userInfo != null) {
1935                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1936                 sb.append('@');
1937             }
1938             boolean needBrackets = ((host.indexOf(':') >= 0)
1939                                     && !host.startsWith("[")
1940                                     && !host.endsWith("]"));
1941             if (needBrackets) sb.append('[');
1942             sb.append(host);
1943             if (needBrackets) sb.append(']');
1944             if (port != -1) {
1945                 sb.append(':');
1946                 sb.append(port);
1947             }
1948         } else if (authority != null) {
1949             sb.append("//");
1950             if (authority.startsWith("[")) {
1951                 // authority should (but may not) contain an embedded IPv6 address
1952                 int end = authority.indexOf(']');
1953                 String doquote = authority, dontquote = "";
1954                 if (end != -1 && authority.indexOf(':') != -1) {
1955                     // the authority contains an IPv6 address
1956                     if (end == authority.length()) {
1957                         dontquote = authority;
1958                         doquote = "";
1959                     } else {
1960                         dontquote = authority.substring(0 , end + 1);
1961                         doquote = authority.substring(end + 1);
1962                     }
1963                 }
1964                 sb.append(dontquote);
1965                 sb.append(quote(doquote,
1966                             L_REG_NAME | L_SERVER,
1967                             H_REG_NAME | H_SERVER));
1968             } else {
1969                 sb.append(quote(authority,
1970                             L_REG_NAME | L_SERVER,
1971                             H_REG_NAME | H_SERVER));
1972             }
1973         }
1974     }
1975 
1976     private void appendSchemeSpecificPart(StringBuilder sb,
1977                                           String opaquePart,
1978                                           String authority,
1979                                           String userInfo,
1980                                           String host,
1981                                           int port,
1982                                           String path,
1983                                           String query)
1984     {
1985         if (opaquePart != null) {
1986             /* check if SSP begins with an IPv6 address
1987              * because we must not quote a literal IPv6 address
1988              */
1989             if (opaquePart.startsWith("//[")) {
1990                 int end =  opaquePart.indexOf(']');
1991                 if (end != -1 && opaquePart.indexOf(':')!=-1) {
1992                     String doquote, dontquote;
1993                     if (end == opaquePart.length()) {
1994                         dontquote = opaquePart;
1995                         doquote = "";
1996                     } else {
1997                         dontquote = opaquePart.substring(0,end+1);
1998                         doquote = opaquePart.substring(end+1);
1999                     }
2000                     sb.append (dontquote);
2001                     sb.append(quote(doquote, L_URIC, H_URIC));
2002                 }
2003             } else {
2004                 sb.append(quote(opaquePart, L_URIC, H_URIC));
2005             }
2006         } else {
2007             appendAuthority(sb, authority, userInfo, host, port);
2008             if (path != null)
2009                 sb.append(quote(path, L_PATH, H_PATH));
2010             if (query != null) {
2011                 sb.append('?');
2012                 sb.append(quote(query, L_URIC, H_URIC));
2013             }
2014         }
2015     }
2016 
2017     private void appendFragment(StringBuilder sb, String fragment) {
2018         if (fragment != null) {
2019             sb.append('#');
2020             sb.append(quote(fragment, L_URIC, H_URIC));
2021         }
2022     }
2023 
2024     private String toString(String scheme,
2025                             String opaquePart,
2026                             String authority,
2027                             String userInfo,
2028                             String host,
2029                             int port,
2030                             String path,
2031                             String query,
2032                             String fragment)
2033     {
2034         StringBuilder sb = new StringBuilder();
2035         if (scheme != null) {
2036             sb.append(scheme);
2037             sb.append(':');
2038         }
2039         appendSchemeSpecificPart(sb, opaquePart,
2040                                  authority, userInfo, host, port,
2041                                  path, query);
2042         appendFragment(sb, fragment);
2043         return sb.toString();
2044     }
2045 
2046     // -- Normalization, resolution, and relativization --
2047 
2048     // RFC2396 5.2 (6)
2049     private static String resolvePath(String base, String child,
2050                                       boolean absolute)
2051     {
2052         int i = base.lastIndexOf('/');
2053         int cn = child.length();
2054         String path = "";
2055 
2056         if (cn == 0) {
2057             // 5.2 (6a)
2058             if (i >= 0)
2059                 path = base.substring(0, i + 1);
2060         } else {
2061             StringBuilder sb = new StringBuilder(base.length() + cn);
2062             // 5.2 (6a)
2063             if (i >= 0)
2064                 sb.append(base, 0, i + 1);
2065             // 5.2 (6b)
2066             sb.append(child);
2067             path = sb.toString();
2068         }
2069 
2070         // 5.2 (6c-f)
2071         String np = normalize(path);
2072 
2073         // 5.2 (6g): If the result is absolute but the path begins with "../",
2074         // then we simply leave the path as-is
2075 
2076         return np;
2077     }
2078 
2079     // RFC2396 5.2
2080     private static URI resolve(URI base, URI child) {
2081         // check if child if opaque first so that NPE is thrown
2082         // if child is null.
2083         if (child.isOpaque() || base.isOpaque())
2084             return child;
2085 
2086         // 5.2 (2): Reference to current document (lone fragment)
2087         if ((child.scheme == null) && (child.authority == null)
2088             && child.path.isEmpty() && (child.fragment != null)
2089             && (child.query == null)) {
2090             if ((base.fragment != null)
2091                 && child.fragment.equals(base.fragment)) {
2092                 return base;
2093             }
2094             URI ru = new URI();
2095             ru.scheme = base.scheme;
2096             ru.authority = base.authority;
2097             ru.userInfo = base.userInfo;
2098             ru.host = base.host;
2099             ru.port = base.port;
2100             ru.path = base.path;
2101             ru.fragment = child.fragment;
2102             ru.query = base.query;
2103             return ru;
2104         }
2105 
2106         // 5.2 (3): Child is absolute
2107         if (child.scheme != null)
2108             return child;
2109 
2110         URI ru = new URI();             // Resolved URI
2111         ru.scheme = base.scheme;
2112         ru.query = child.query;
2113         ru.fragment = child.fragment;
2114 
2115         // 5.2 (4): Authority
2116         if (child.authority == null) {
2117             ru.authority = base.authority;
2118             ru.host = base.host;
2119             ru.userInfo = base.userInfo;
2120             ru.port = base.port;
2121 
2122             String cp = (child.path == null) ? "" : child.path;
2123             if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
2124                 // 5.2 (5): Child path is absolute
2125                 ru.path = child.path;
2126             } else {
2127                 // 5.2 (6): Resolve relative path
2128                 ru.path = resolvePath(base.path, cp, base.isAbsolute());
2129             }
2130         } else {
2131             ru.authority = child.authority;
2132             ru.host = child.host;
2133             ru.userInfo = child.userInfo;
2134             ru.host = child.host;
2135             ru.port = child.port;
2136             ru.path = child.path;
2137         }
2138 
2139         // 5.2 (7): Recombine (nothing to do here)
2140         return ru;
2141     }
2142 
2143     // If the given URI's path is normal then return the URI;
2144     // o.w., return a new URI containing the normalized path.
2145     //
2146     private static URI normalize(URI u) {
2147         if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
2148             return u;
2149 
2150         String np = normalize(u.path);
2151         if (np == u.path)
2152             return u;
2153 
2154         URI v = new URI();
2155         v.scheme = u.scheme;
2156         v.fragment = u.fragment;
2157         v.authority = u.authority;
2158         v.userInfo = u.userInfo;
2159         v.host = u.host;
2160         v.port = u.port;
2161         v.path = np;
2162         v.query = u.query;
2163         return v;
2164     }
2165 
2166     // If both URIs are hierarchical, their scheme and authority components are
2167     // identical, and the base path is a prefix of the child's path, then
2168     // return a relative URI that, when resolved against the base, yields the
2169     // child; otherwise, return the child.
2170     //
2171     private static URI relativize(URI base, URI child) {
2172         // check if child if opaque first so that NPE is thrown
2173         // if child is null.
2174         if (child.isOpaque() || base.isOpaque())
2175             return child;
2176         if (!equalIgnoringCase(base.scheme, child.scheme)
2177             || !equal(base.authority, child.authority))
2178             return child;
2179 
2180         String bp = normalize(base.path);
2181         String cp = normalize(child.path);
2182         if (!bp.equals(cp)) {
2183             if (!bp.endsWith("/"))
2184                 bp = bp + "/";
2185             if (!cp.startsWith(bp))
2186                 return child;
2187         }
2188 
2189         URI v = new URI();
2190         v.path = cp.substring(bp.length());
2191         v.query = child.query;
2192         v.fragment = child.fragment;
2193         return v;
2194     }
2195 
2196 
2197 
2198     // -- Path normalization --
2199 
2200     // The following algorithm for path normalization avoids the creation of a
2201     // string object for each segment, as well as the use of a string buffer to
2202     // compute the final result, by using a single char array and editing it in
2203     // place.  The array is first split into segments, replacing each slash
2204     // with '\0' and creating a segment-index array, each element of which is
2205     // the index of the first char in the corresponding segment.  We then walk
2206     // through both arrays, removing ".", "..", and other segments as necessary
2207     // by setting their entries in the index array to -1.  Finally, the two
2208     // arrays are used to rejoin the segments and compute the final result.
2209     //
2210     // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2211 
2212 
2213     // Check the given path to see if it might need normalization.  A path
2214     // might need normalization if it contains duplicate slashes, a "."
2215     // segment, or a ".." segment.  Return -1 if no further normalization is
2216     // possible, otherwise return the number of segments found.
2217     //
2218     // This method takes a string argument rather than a char array so that
2219     // this test can be performed without invoking path.toCharArray().
2220     //
2221     private static int needsNormalization(String path) {
2222         boolean normal = true;
2223         int ns = 0;                     // Number of segments
2224         int end = path.length() - 1;    // Index of last char in path
2225         int p = 0;                      // Index of next char in path
2226 
2227         // Skip initial slashes
2228         while (p <= end) {
2229             if (path.charAt(p) != '/') break;
2230             p++;
2231         }
2232         if (p > 1) normal = false;
2233 
2234         // Scan segments
2235         while (p <= end) {
2236 
2237             // Looking at "." or ".." ?
2238             if ((path.charAt(p) == '.')
2239                 && ((p == end)
2240                     || ((path.charAt(p + 1) == '/')
2241                         || ((path.charAt(p + 1) == '.')
2242                             && ((p + 1 == end)
2243                                 || (path.charAt(p + 2) == '/')))))) {
2244                 normal = false;
2245             }
2246             ns++;
2247 
2248             // Find beginning of next segment
2249             while (p <= end) {
2250                 if (path.charAt(p++) != '/')
2251                     continue;
2252 
2253                 // Skip redundant slashes
2254                 while (p <= end) {
2255                     if (path.charAt(p) != '/') break;
2256                     normal = false;
2257                     p++;
2258                 }
2259 
2260                 break;
2261             }
2262         }
2263 
2264         return normal ? -1 : ns;
2265     }
2266 
2267 
2268     // Split the given path into segments, replacing slashes with nulls and
2269     // filling in the given segment-index array.
2270     //
2271     // Preconditions:
2272     //   segs.length == Number of segments in path
2273     //
2274     // Postconditions:
2275     //   All slashes in path replaced by '\0'
2276     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
2277     //
2278     private static void split(char[] path, int[] segs) {
2279         int end = path.length - 1;      // Index of last char in path
2280         int p = 0;                      // Index of next char in path
2281         int i = 0;                      // Index of current segment
2282 
2283         // Skip initial slashes
2284         while (p <= end) {
2285             if (path[p] != '/') break;
2286             path[p] = '\0';
2287             p++;
2288         }
2289 
2290         while (p <= end) {
2291 
2292             // Note start of segment
2293             segs[i++] = p++;
2294 
2295             // Find beginning of next segment
2296             while (p <= end) {
2297                 if (path[p++] != '/')
2298                     continue;
2299                 path[p - 1] = '\0';
2300 
2301                 // Skip redundant slashes
2302                 while (p <= end) {
2303                     if (path[p] != '/') break;
2304                     path[p++] = '\0';
2305                 }
2306                 break;
2307             }
2308         }
2309 
2310         if (i != segs.length)
2311             throw new InternalError();  // ASSERT
2312     }
2313 
2314 
2315     // Join the segments in the given path according to the given segment-index
2316     // array, ignoring those segments whose index entries have been set to -1,
2317     // and inserting slashes as needed.  Return the length of the resulting
2318     // path.
2319     //
2320     // Preconditions:
2321     //   segs[i] == -1 implies segment i is to be ignored
2322     //   path computed by split, as above, with '\0' having replaced '/'
2323     //
2324     // Postconditions:
    //   path[0] .. path[return value - 1] == Resulting path
2326     //
2327     private static int join(char[] path, int[] segs) {
2328         int ns = segs.length;           // Number of segments
2329         int end = path.length - 1;      // Index of last char in path
2330         int p = 0;                      // Index of next path char to write
2331 
2332         if (path[p] == '\0') {
2333             // Restore initial slash for absolute paths
2334             path[p++] = '/';
2335         }
2336 
2337         for (int i = 0; i < ns; i++) {
2338             int q = segs[i];            // Current segment
2339             if (q == -1)
2340                 // Ignore this segment
2341                 continue;
2342 
2343             if (p == q) {
2344                 // We're already at this segment, so just skip to its end
2345                 while ((p <= end) && (path[p] != '\0'))
2346                     p++;
2347                 if (p <= end) {
2348                     // Preserve trailing slash
2349                     path[p++] = '/';
2350                 }
2351             } else if (p < q) {
2352                 // Copy q down to p
2353                 while ((q <= end) && (path[q] != '\0'))
2354                     path[p++] = path[q++];
2355                 if (q <= end) {
2356                     // Preserve trailing slash
2357                     path[p++] = '/';
2358                 }
2359             } else
2360                 throw new InternalError(); // ASSERT false
2361         }
2362 
2363         return p;
2364     }
2365 
2366 
2367     // Remove "." segments from the given path, and remove segment pairs
2368     // consisting of a non-".." segment followed by a ".." segment.
2369     //
2370     private static void removeDots(char[] path, int[] segs) {
2371         int ns = segs.length;
2372         int end = path.length - 1;
2373 
2374         for (int i = 0; i < ns; i++) {
2375             int dots = 0;               // Number of dots found (0, 1, or 2)
2376 
2377             // Find next occurrence of "." or ".."
2378             do {
2379                 int p = segs[i];
2380                 if (path[p] == '.') {
2381                     if (p == end) {
2382                         dots = 1;
2383                         break;
2384                     } else if (path[p + 1] == '\0') {
2385                         dots = 1;
2386                         break;
2387                     } else if ((path[p + 1] == '.')
2388                                && ((p + 1 == end)
2389                                    || (path[p + 2] == '\0'))) {
2390                         dots = 2;
2391                         break;
2392                     }
2393                 }
2394                 i++;
2395             } while (i < ns);
2396             if ((i > ns) || (dots == 0))
2397                 break;
2398 
2399             if (dots == 1) {
2400                 // Remove this occurrence of "."
2401                 segs[i] = -1;
2402             } else {
2403                 // If there is a preceding non-".." segment, remove both that
2404                 // segment and this occurrence of ".."; otherwise, leave this
2405                 // ".." segment as-is.
2406                 int j;
2407                 for (j = i - 1; j >= 0; j--) {
2408                     if (segs[j] != -1) break;
2409                 }
2410                 if (j >= 0) {
2411                     int q = segs[j];
2412                     if (!((path[q] == '.')
2413                           && (path[q + 1] == '.')
2414                           && (path[q + 2] == '\0'))) {
2415                         segs[i] = -1;
2416                         segs[j] = -1;
2417                     }
2418                 }
2419             }
2420         }
2421     }
2422 
2423 
2424     // DEVIATION: If the normalized path is relative, and if the first
2425     // segment could be parsed as a scheme name, then prepend a "." segment
2426     //
2427     private static void maybeAddLeadingDot(char[] path, int[] segs) {
2428 
2429         if (path[0] == '\0')
2430             // The path is absolute
2431             return;
2432 
2433         int ns = segs.length;
2434         int f = 0;                      // Index of first segment
2435         while (f < ns) {
2436             if (segs[f] >= 0)
2437                 break;
2438             f++;
2439         }
2440         if ((f >= ns) || (f == 0))
2441             // The path is empty, or else the original first segment survived,
2442             // in which case we already know that no leading "." is needed
2443             return;
2444 
2445         int p = segs[f];
2446         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2447         if (p >= path.length || path[p] == '\0')
2448             // No colon in first segment, so no "." needed
2449             return;
2450 
2451         // At this point we know that the first segment is unused,
2452         // hence we can insert a "." segment at that position
2453         path[0] = '.';
2454         path[1] = '\0';
2455         segs[0] = 0;
2456     }
2457 
2458 
2459     // Normalize the given path string.  A normal path string has no empty
2460     // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2461     // segments equal to ".." that are preceded by a segment not equal to "..".
2462     // In contrast to Unix-style pathname normalization, for URI paths we
2463     // always retain trailing slashes.
2464     //
2465     private static String normalize(String ps) {
2466 
2467         // Does this path need normalization?
2468         int ns = needsNormalization(ps);        // Number of segments
2469         if (ns < 0)
2470             // Nope -- just return it
2471             return ps;
2472 
2473         char[] path = ps.toCharArray();         // Path in char-array form
2474 
2475         // Split path into segments
2476         int[] segs = new int[ns];               // Segment-index array
2477         split(path, segs);
2478 
2479         // Remove dots
2480         removeDots(path, segs);
2481 
2482         // Prevent scheme-name confusion
2483         maybeAddLeadingDot(path, segs);
2484 
2485         // Join the remaining segments and return the result
2486         String s = new String(path, 0, join(path, segs));
2487         if (s.equals(ps)) {
2488             // string was already normalized
2489             return ps;
2490         }
2491         return s;
2492     }
2493 
2494 
2495 
2496     // -- Character classes for parsing --
2497 
2498     // RFC2396 precisely specifies which characters in the US-ASCII charset are
2499     // permissible in the various components of a URI reference.  We here
2500     // define a set of mask pairs to aid in enforcing these restrictions.  Each
2501     // mask pair consists of two longs, a low mask and a high mask.  Taken
2502     // together they represent a 128-bit mask, where bit i is set iff the
2503     // character with value i is permitted.
2504     //
2505     // This approach is more efficient than sequentially searching arrays of
2506     // permitted characters.  It could be made still more efficient by
2507     // precompiling the mask information so that a character's presence in a
2508     // given mask could be determined by a single table lookup.
2509 
2510     // Compute the low-order mask for the characters in the given string
2511     private static long lowMask(String chars) {
2512         int n = chars.length();
2513         long m = 0;
2514         for (int i = 0; i < n; i++) {
2515             char c = chars.charAt(i);
2516             if (c < 64)
2517                 m |= (1L << c);
2518         }
2519         return m;
2520     }
2521 
2522     // Compute the high-order mask for the characters in the given string
2523     private static long highMask(String chars) {
2524         int n = chars.length();
2525         long m = 0;
2526         for (int i = 0; i < n; i++) {
2527             char c = chars.charAt(i);
2528             if ((c >= 64) && (c < 128))
2529                 m |= (1L << (c - 64));
2530         }
2531         return m;
2532     }
2533 
2534     // Compute a low-order mask for the characters
2535     // between first and last, inclusive
2536     private static long lowMask(char first, char last) {
2537         long m = 0;
2538         int f = Math.max(Math.min(first, 63), 0);
2539         int l = Math.max(Math.min(last, 63), 0);
2540         for (int i = f; i <= l; i++)
2541             m |= 1L << i;
2542         return m;
2543     }
2544 
2545     // Compute a high-order mask for the characters
2546     // between first and last, inclusive
2547     private static long highMask(char first, char last) {
2548         long m = 0;
2549         int f = Math.max(Math.min(first, 127), 64) - 64;
2550         int l = Math.max(Math.min(last, 127), 64) - 64;
2551         for (int i = f; i <= l; i++)
2552             m |= 1L << i;
2553         return m;
2554     }
2555 
2556     // Tell whether the given character is permitted by the given mask pair
2557     private static boolean match(char c, long lowMask, long highMask) {
2558         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
2559             return false;
2560         if (c < 64)
2561             return ((1L << c) & lowMask) != 0;
2562         if (c < 128)
2563             return ((1L << (c - 64)) & highMask) != 0;
2564         return false;
2565     }
2566 
2567     // Character-class masks, in reverse order from RFC2396 because
2568     // initializers for static fields cannot make forward references.
2569 
2570     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
2571     //            "8" | "9"
    // Naming convention: each character class XXX is a pair of constants,
    // L_XXX (the low-order mask, chars 0..63) and H_XXX (the high-order
    // mask, chars 64..127).  Digits all lie below 64, so H_DIGIT is empty.
    private static final long L_DIGIT = lowMask('0', '9');
    private static final long H_DIGIT = 0L;

    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
    // (all upper-case letters are >= 64, so the low mask is empty)
    private static final long L_UPALPHA = 0L;
    private static final long H_UPALPHA = highMask('A', 'Z');

    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
    // (all lower-case letters are >= 64, so the low mask is empty)
    private static final long L_LOWALPHA = 0L;
    private static final long H_LOWALPHA = highMask('a', 'z');

    // alpha         = lowalpha | upalpha
    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

    // alphanum      = alpha | digit
    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

    // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    //                         "a" | "b" | "c" | "d" | "e" | "f"
    private static final long L_HEX = L_DIGIT;
    private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    //                 "(" | ")"
    // The same string is passed to both lowMask and highMask; each keeps
    // only the chars that fall in its own half.
    private static final long L_MARK = lowMask("-_.!~*'()");
    private static final long H_MARK = highMask("-_.!~*'()");

    // unreserved    = alphanum | mark
    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                 "$" | "," | "[" | "]"
    // Added per RFC2732: "[", "]"
    private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
    private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
    // characters are allowed; this is handled by the scanEscape method below.
    // Bit zero is free for this purpose because match() never matches the
    // character with value 0.
    private static final long L_ESCAPED = 1L;
    private static final long H_ESCAPED = 0L;

    // uric          = reserved | unreserved | escaped
    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

    // pchar         = unreserved | escaped |
    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    private static final long L_PCHAR
        = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
    private static final long H_PCHAR
        = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

    // All valid path characters: pchar plus the segment and parameter
    // delimiters "/" and ";"
    private static final long L_PATH = L_PCHAR | lowMask(";/");
    private static final long H_PATH = H_PCHAR | highMask(";/");

    // Dash, for use in domainlabel and toplabel
    private static final long L_DASH = lowMask("-");
    private static final long H_DASH = highMask("-");

    // Dot, for use in hostnames
    private static final long L_DOT = lowMask(".");
    private static final long H_DOT = highMask(".");

    // userinfo      = *( unreserved | escaped |
    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    private static final long L_USERINFO
        = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
    private static final long H_USERINFO
        = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

    // reg_name      = 1*( unreserved | escaped | "$" | "," |
    //                     ";" | ":" | "@" | "&" | "=" | "+" )
    private static final long L_REG_NAME
        = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
    private static final long H_REG_NAME
        = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

    // All valid characters for server-based authorities
    private static final long L_SERVER
        = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
    private static final long H_SERVER
        = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

    // Special case of server authority that represents an IPv6 address
    // In this case, a % does not signify an escape sequence
    private static final long L_SERVER_PERCENT
        = L_SERVER | lowMask("%");
    private static final long H_SERVER_PERCENT
        = H_SERVER | highMask("%");

    // Left bracket alone.  "[" is char 91, so only the high mask has a bit
    // set; the low mask evaluates to 0.
    private static final long L_LEFT_BRACKET = lowMask("[");
    private static final long H_LEFT_BRACKET = highMask("[");

    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    // (the mask covers the characters only; the leading-alpha requirement
    // cannot be expressed in a mask)
    private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
    private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

    // scope_id = alpha | digit | "_" | "."
    private static final long L_SCOPE_ID
        = L_ALPHANUM | lowMask("_.");
    private static final long H_SCOPE_ID
        = H_ALPHANUM | highMask("_.");
2681 
2682     // -- Escaping and encoding --
2683 
2684     private static final char[] hexDigits = {
2685         '0', '1', '2', '3', '4', '5', '6', '7',
2686         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2687     };
2688 
2689     private static void appendEscape(StringBuilder sb, byte b) {
2690         sb.append('%');
2691         sb.append(hexDigits[(b >> 4) & 0x0f]);
2692         sb.append(hexDigits[(b >> 0) & 0x0f]);
2693     }
2694 
2695     private static void appendEncoded(StringBuilder sb, char c) {
2696         ByteBuffer bb = null;
2697         try {
2698             bb = ThreadLocalCoders.encoderFor("UTF-8")
2699                 .encode(CharBuffer.wrap("" + c));
2700         } catch (CharacterCodingException x) {
2701             assert false;
2702         }
2703         while (bb.hasRemaining()) {
2704             int b = bb.get() & 0xff;
2705             if (b >= 0x80)
2706                 appendEscape(sb, (byte)b);
2707             else
2708                 sb.append((char)b);
2709         }
2710     }
2711 
2712     // Quote any characters in s that are not permitted
2713     // by the given mask pair
2714     //
2715     private static String quote(String s, long lowMask, long highMask) {
2716         StringBuilder sb = null;
2717         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2718         for (int i = 0; i < s.length(); i++) {
2719             char c = s.charAt(i);
2720             if (c < '\u0080') {
2721                 if (!match(c, lowMask, highMask)) {
2722                     if (sb == null) {
2723                         sb = new StringBuilder();
2724                         sb.append(s, 0, i);
2725                     }
2726                     appendEscape(sb, (byte)c);
2727                 } else {
2728                     if (sb != null)
2729                         sb.append(c);
2730                 }
2731             } else if (allowNonASCII
2732                        && (Character.isSpaceChar(c)
2733                            || Character.isISOControl(c))) {
2734                 if (sb == null) {
2735                     sb = new StringBuilder();
2736                     sb.append(s, 0, i);
2737                 }
2738                 appendEncoded(sb, c);
2739             } else {
2740                 if (sb != null)
2741                     sb.append(c);
2742             }
2743         }
2744         return (sb == null) ? s : sb.toString();
2745     }
2746 
2747     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2748     // assuming that s is otherwise legal
2749     //
2750     private static String encode(String s) {
2751         int n = s.length();
2752         if (n == 0)
2753             return s;
2754 
2755         // First check whether we actually need to encode
2756         for (int i = 0;;) {
2757             if (s.charAt(i) >= '\u0080')
2758                 break;
2759             if (++i >= n)
2760                 return s;
2761         }
2762 
2763         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
2764         ByteBuffer bb = null;
2765         try {
2766             bb = ThreadLocalCoders.encoderFor("UTF-8")
2767                 .encode(CharBuffer.wrap(ns));
2768         } catch (CharacterCodingException x) {
2769             assert false;
2770         }
2771 
2772         StringBuilder sb = new StringBuilder();
2773         while (bb.hasRemaining()) {
2774             int b = bb.get() & 0xff;
2775             if (b >= 0x80)
2776                 appendEscape(sb, (byte)b);
2777             else
2778                 sb.append((char)b);
2779         }
2780         return sb.toString();
2781     }
2782 
2783     private static int decode(char c) {
2784         if ((c >= '0') && (c <= '9'))
2785             return c - '0';
2786         if ((c >= 'a') && (c <= 'f'))
2787             return c - 'a' + 10;
2788         if ((c >= 'A') && (c <= 'F'))
2789             return c - 'A' + 10;
2790         assert false;
2791         return -1;
2792     }
2793 
2794     private static byte decode(char c1, char c2) {
2795         return (byte)(  ((decode(c1) & 0xf) << 4)
2796                       | ((decode(c2) & 0xf) << 0));
2797     }
2798 
2799     // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
2800     // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
2801     // sequence of escaped octets is not valid UTF-8 then the erroneous octets
2802     // are replaced with '\uFFFD'.
2803     // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2804     //            with a scope_id
2805     //
2806     private static String decode(String s) {
2807         return decode(s, true);
2808     }
2809 
2810     // This method was introduced as a generalization of URI.decode method
2811     // to provide a fix for JDK-8037396
2812     private static String decode(String s, boolean ignorePercentInBrackets) {
2813         if (s == null)
2814             return s;
2815         int n = s.length();
2816         if (n == 0)
2817             return s;
2818         if (s.indexOf('%') < 0)
2819             return s;
2820 
2821         StringBuilder sb = new StringBuilder(n);
2822         ByteBuffer bb = ByteBuffer.allocate(n);
2823         CharBuffer cb = CharBuffer.allocate(n);
2824         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
2825                 .onMalformedInput(CodingErrorAction.REPLACE)
2826                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
2827 
2828         // This is not horribly efficient, but it will do for now
2829         char c = s.charAt(0);
2830         boolean betweenBrackets = false;
2831 
2832         for (int i = 0; i < n;) {
2833             assert c == s.charAt(i);    // Loop invariant
2834             if (c == '[') {
2835                 betweenBrackets = true;
2836             } else if (betweenBrackets && c == ']') {
2837                 betweenBrackets = false;
2838             }
2839             if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) {
2840                 sb.append(c);
2841                 if (++i >= n)
2842                     break;
2843                 c = s.charAt(i);
2844                 continue;
2845             }
2846             bb.clear();
2847             int ui = i;
2848             for (;;) {
2849                 assert (n - i >= 2);
2850                 bb.put(decode(s.charAt(++i), s.charAt(++i)));
2851                 if (++i >= n)
2852                     break;
2853                 c = s.charAt(i);
2854                 if (c != '%')
2855                     break;
2856             }
2857             bb.flip();
2858             cb.clear();
2859             dec.reset();
2860             CoderResult cr = dec.decode(bb, cb, true);
2861             assert cr.isUnderflow();
2862             cr = dec.flush(cb);
2863             assert cr.isUnderflow();
2864             sb.append(cb.flip().toString());
2865         }
2866 
2867         return sb.toString();
2868     }
2869 
2870 
2871     // -- Parsing --
2872 
2873     // For convenience we wrap the input URI string in a new instance of the
2874     // following internal class.  This saves always having to pass the input
2875     // string as an argument to each internal scan/parse method.
2876 
2877     private class Parser {
2878 
2879         private String input;           // URI input string
2880         private boolean requireServerAuthority = false;
2881 
2882         Parser(String s) {
2883             input = s;
2884             string = s;
2885         }
2886 
2887         // -- Methods for throwing URISyntaxException in various ways --
2888 
2889         private void fail(String reason) throws URISyntaxException {
2890             throw new URISyntaxException(input, reason);
2891         }
2892 
2893         private void fail(String reason, int p) throws URISyntaxException {
2894             throw new URISyntaxException(input, reason, p);
2895         }
2896 
2897         private void failExpecting(String expected, int p)
2898             throws URISyntaxException
2899         {
2900             fail("Expected " + expected, p);
2901         }
2902 
2903 
2904         // -- Simple access to the input string --
2905 
2906         // Tells whether start < end and, if so, whether charAt(start) == c
2907         //
2908         private boolean at(int start, int end, char c) {
2909             return (start < end) && (input.charAt(start) == c);
2910         }
2911 
        // Tells whether start + s.length() <= end and, if so,
        // whether the chars at the start position match s exactly
2914         //
2915         private boolean at(int start, int end, String s) {
2916             int p = start;
2917             int sn = s.length();
2918             if (sn > end - p)
2919                 return false;
2920             int i = 0;
2921             while (i < sn) {
2922                 if (input.charAt(p++) != s.charAt(i)) {
2923                     break;
2924                 }
2925                 i++;
2926             }
2927             return (i == sn);
2928         }
2929 
2930 
2931         // -- Scanning --
2932 
2933         // The various scan and parse methods that follow use a uniform
2934         // convention of taking the current start position and end index as
2935         // their first two arguments.  The start is inclusive while the end is
2936         // exclusive, just as in the String class, i.e., a start/end pair
        // denotes the half-open interval [start, end) of the input string.
2938         //
2939         // These methods never proceed past the end position.  They may return
2940         // -1 to indicate outright failure, but more often they simply return
2941         // the position of the first char after the last char scanned.  Thus
2942         // a typical idiom is
2943         //
2944         //     int p = start;
2945         //     int q = scan(p, end, ...);
2946         //     if (q > p)
2947         //         // We scanned something
2948         //         ...;
2949         //     else if (q == p)
2950         //         // We scanned nothing
2951         //         ...;
2952         //     else if (q == -1)
2953         //         // Something went wrong
2954         //         ...;
2955 
2956 
2957         // Scan a specific char: If the char at the given start position is
2958         // equal to c, return the index of the next char; otherwise, return the
2959         // start position.
2960         //
2961         private int scan(int start, int end, char c) {
2962             if ((start < end) && (input.charAt(start) == c))
2963                 return start + 1;
2964             return start;
2965         }
2966 
2967         // Scan forward from the given start position.  Stop at the first char
2968         // in the err string (in which case -1 is returned), or the first char
2969         // in the stop string (in which case the index of the preceding char is
2970         // returned), or the end of the input string (in which case the length
2971         // of the input string is returned).  May return the start position if
2972         // nothing matches.
2973         //
2974         private int scan(int start, int end, String err, String stop) {
2975             int p = start;
2976             while (p < end) {
2977                 char c = input.charAt(p);
2978                 if (err.indexOf(c) >= 0)
2979                     return -1;
2980                 if (stop.indexOf(c) >= 0)
2981                     break;
2982                 p++;
2983             }
2984             return p;
2985         }
2986 
2987         // Scan forward from the given start position.  Stop at the first char
2988         // in the stop string (in which case the index of the preceding char is
2989         // returned), or the end of the input string (in which case the length
2990         // of the input string is returned).  May return the start position if
2991         // nothing matches.
2992         //
2993         private int scan(int start, int end, String stop) {
2994             int p = start;
2995             while (p < end) {
2996                 char c = input.charAt(p);
2997                 if (stop.indexOf(c) >= 0)
2998                     break;
2999                 p++;
3000             }
3001             return p;
3002         }
3003 
3004         // Scan a potential escape sequence, starting at the given position,
3005         // with the given first char (i.e., charAt(start) == c).
3006         //
3007         // This method assumes that if escapes are allowed then visible
3008         // non-US-ASCII chars are also allowed.
3009         //
3010         private int scanEscape(int start, int n, char first)
3011             throws URISyntaxException
3012         {
3013             int p = start;
3014             char c = first;
3015             if (c == '%') {
3016                 // Process escape pair
3017                 if ((p + 3 <= n)
3018                     && match(input.charAt(p + 1), L_HEX, H_HEX)
3019                     && match(input.charAt(p + 2), L_HEX, H_HEX)) {
3020                     return p + 3;
3021                 }
3022                 fail("Malformed escape pair", p);
3023             } else if ((c > 128)
3024                        && !Character.isSpaceChar(c)
3025                        && !Character.isISOControl(c)) {
3026                 // Allow unescaped but visible non-US-ASCII chars
3027                 return p + 1;
3028             }
3029             return p;
3030         }
3031 
3032         // Scan chars that match the given mask pair
3033         //
3034         private int scan(int start, int n, long lowMask, long highMask)
3035             throws URISyntaxException
3036         {
3037             int p = start;
3038             while (p < n) {
3039                 char c = input.charAt(p);
3040                 if (match(c, lowMask, highMask)) {
3041                     p++;
3042                     continue;
3043                 }
3044                 if ((lowMask & L_ESCAPED) != 0) {
3045                     int q = scanEscape(p, n, c);
3046                     if (q > p) {
3047                         p = q;
3048                         continue;
3049                     }
3050                 }
3051                 break;
3052             }
3053             return p;
3054         }
3055 
3056         // Check that each of the chars in [start, end) matches the given mask
3057         //
3058         private void checkChars(int start, int end,
3059                                 long lowMask, long highMask,
3060                                 String what)
3061             throws URISyntaxException
3062         {
3063             int p = scan(start, end, lowMask, highMask);
3064             if (p < end)
3065                 fail("Illegal character in " + what, p);
3066         }
3067 
3068         // Check that the char at position p matches the given mask
3069         //
3070         private void checkChar(int p,
3071                                long lowMask, long highMask,
3072                                String what)
3073             throws URISyntaxException
3074         {
3075             checkChars(p, p + 1, lowMask, highMask, what);
3076         }
3077 
3078 
3079         // -- Parsing --
3080 
3081         // [<scheme>:]<scheme-specific-part>[#<fragment>]
3082         //
3083         void parse(boolean rsa) throws URISyntaxException {
3084             requireServerAuthority = rsa;
3085             int n = input.length();
3086             int p = scan(0, n, "/?#", ":");
3087             if ((p >= 0) && at(p, n, ':')) {
3088                 if (p == 0)
3089                     failExpecting("scheme name", 0);
3090                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3091                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3092                 scheme = input.substring(0, p);
3093                 p++;                    // Skip ':'
3094                 if (at(p, n, '/')) {
3095                     p = parseHierarchical(p, n);
3096                 } else {
3097                     // opaque; need to create the schemeSpecificPart
3098                     int q = scan(p, n, "#");
3099                     if (q <= p)
3100                         failExpecting("scheme-specific part", p);
3101                     checkChars(p, q, L_URIC, H_URIC, "opaque part");
3102                     schemeSpecificPart = input.substring(p, q);
3103                     p = q;
3104                 }
3105             } else {
3106                 p = parseHierarchical(0, n);
3107             }
3108             if (at(p, n, '#')) {
3109                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3110                 fragment = input.substring(p + 1, n);
3111                 p = n;
3112             }
3113             if (p < n)
3114                 fail("end of URI", p);
3115         }
3116 
3117         // [//authority]<path>[?<query>]
3118         //
3119         // DEVIATION from RFC2396: We allow an empty authority component as
3120         // long as it's followed by a non-empty path, query component, or
3121         // fragment component.  This is so that URIs such as "file:///foo/bar"
3122         // will parse.  This seems to be the intent of RFC2396, though the
3123         // grammar does not permit it.  If the authority is empty then the
3124         // userInfo, host, and port components are undefined.
3125         //
3126         // DEVIATION from RFC2396: We allow empty relative paths.  This seems
3127         // to be the intent of RFC2396, but the grammar does not permit it.
3128         // The primary consequence of this deviation is that "#f" parses as a
3129         // relative URI with an empty path.
3130         //
3131         private int parseHierarchical(int start, int n)
3132             throws URISyntaxException
3133         {
3134             int p = start;
3135             if (at(p, n, '/') && at(p + 1, n, '/')) {
3136                 p += 2;
3137                 int q = scan(p, n, "/?#");
3138                 if (q > p) {
3139                     p = parseAuthority(p, q);
3140                 } else if (q < n) {
3141                     // DEVIATION: Allow empty authority prior to non-empty
3142                     // path, query component or fragment identifier
3143                 } else
3144                     failExpecting("authority", p);
3145             }
3146             int q = scan(p, n, "?#"); // DEVIATION: May be empty
3147             checkChars(p, q, L_PATH, H_PATH, "path");
3148             path = input.substring(p, q);
3149             p = q;
3150             if (at(p, n, '?')) {
3151                 p++;
3152                 q = scan(p, n, "#");
3153                 checkChars(p, q, L_URIC, H_URIC, "query");
3154                 query = input.substring(p, q);
3155                 p = q;
3156             }
3157             return p;
3158         }
3159 
3160         // authority     = server | reg_name
3161         //
3162         // Ambiguity: An authority that is a registry name rather than a server
3163         // might have a prefix that parses as a server.  We use the fact that
3164         // the authority component is always followed by '/' or the end of the
3165         // input string to resolve this: If the complete authority did not
3166         // parse as a server then we try to parse it as a registry name.
3167         //
3168         private int parseAuthority(int start, int n)
3169             throws URISyntaxException
3170         {
3171             int p = start;
3172             int q = p;
3173             URISyntaxException ex = null;
3174 
3175             boolean serverChars;
3176             boolean regChars;
3177 
3178             if (scan(p, n, "]") > p) {
3179                 // contains a literal IPv6 address, therefore % is allowed
3180                 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
3181             } else {
3182                 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
3183             }
3184             regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
3185 
3186             if (regChars && !serverChars) {
3187                 // Must be a registry-based authority
3188                 authority = input.substring(p, n);
3189                 return n;
3190             }
3191 
3192             if (serverChars) {
3193                 // Might be (probably is) a server-based authority, so attempt
3194                 // to parse it as such.  If the attempt fails, try to treat it
3195                 // as a registry-based authority.
3196                 try {
3197                     q = parseServer(p, n);
3198                     if (q < n)
3199                         failExpecting("end of authority", q);
3200                     authority = input.substring(p, n);
3201                 } catch (URISyntaxException x) {
3202                     // Undo results of failed parse
3203                     userInfo = null;
3204                     host = null;
3205                     port = -1;
3206                     if (requireServerAuthority) {
3207                         // If we're insisting upon a server-based authority,
3208                         // then just re-throw the exception
3209                         throw x;
3210                     } else {
3211                         // Save the exception in case it doesn't parse as a
3212                         // registry either
3213                         ex = x;
3214                         q = p;
3215                     }
3216                 }
3217             }
3218 
3219             if (q < n) {
3220                 if (regChars) {
3221                     // Registry-based authority
3222                     authority = input.substring(p, n);
3223                 } else if (ex != null) {
3224                     // Re-throw exception; it was probably due to
3225                     // a malformed IPv6 address
3226                     throw ex;
3227                 } else {
3228                     fail("Illegal character in authority", q);
3229                 }
3230             }
3231 
3232             return n;
3233         }
3234 
3235 
3236         // [<userinfo>@]<host>[:<port>]
3237         //
3238         private int parseServer(int start, int n)
3239             throws URISyntaxException
3240         {
3241             int p = start;
3242             int q;
3243 
3244             // userinfo
3245             q = scan(p, n, "/?#", "@");
3246             if ((q >= p) && at(q, n, '@')) {
3247                 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3248                 userInfo = input.substring(p, q);
3249                 p = q + 1;              // Skip '@'
3250             }
3251 
3252             // hostname, IPv4 address, or IPv6 address
3253             if (at(p, n, '[')) {
3254                 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3255                 p++;
3256                 q = scan(p, n, "/?#", "]");
3257                 if ((q > p) && at(q, n, ']')) {
3258                     // look for a "%" scope id
3259                     int r = scan (p, q, "%");
3260                     if (r > p) {
3261                         parseIPv6Reference(p, r);
3262                         if (r+1 == q) {
3263                             fail ("scope id expected");
3264                         }
3265                         checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
3266                                                 "scope id");
3267                     } else {
3268                         parseIPv6Reference(p, q);
3269                     }
3270                     host = input.substring(p-1, q+1);
3271                     p = q + 1;
3272                 } else {
3273                     failExpecting("closing bracket for IPv6 address", q);
3274                 }
3275             } else {
3276                 q = parseIPv4Address(p, n);
3277                 if (q <= p)
3278                     q = parseHostname(p, n);
3279                 p = q;
3280             }
3281 
3282             // port
3283             if (at(p, n, ':')) {
3284                 p++;
3285                 q = scan(p, n, "/");
3286                 if (q > p) {
3287                     checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3288                     try {
3289                         port = Integer.parseInt(input, p, q, 10);
3290                     } catch (NumberFormatException x) {
3291                         fail("Malformed port number", p);
3292                     }
3293                     p = q;
3294                 }
3295             }
3296             if (p < n)
3297                 failExpecting("port number", p);
3298 
3299             return p;
3300         }
3301 
3302         // Scan a string of decimal digits whose value fits in a byte
3303         //
3304         private int scanByte(int start, int n)
3305             throws URISyntaxException
3306         {
3307             int p = start;
3308             int q = scan(p, n, L_DIGIT, H_DIGIT);
3309             if (q <= p) return q;
3310             if (Integer.parseInt(input, p, q, 10) > 255) return p;
3311             return q;
3312         }
3313 
3314         // Scan an IPv4 address.
3315         //
3316         // If the strict argument is true then we require that the given
3317         // interval contain nothing besides an IPv4 address; if it is false
3318         // then we only require that it start with an IPv4 address.
3319         //
3320         // If the interval does not contain or start with (depending upon the
3321         // strict argument) a legal IPv4 address characters then we return -1
3322         // immediately; otherwise we insist that these characters parse as a
3323         // legal IPv4 address and throw an exception on failure.
3324         //
3325         // We assume that any string of decimal digits and dots must be an IPv4
3326         // address.  It won't parse as a hostname anyway, so making that
3327         // assumption here allows more meaningful exceptions to be thrown.
3328         //
3329         private int scanIPv4Address(int start, int n, boolean strict)
3330             throws URISyntaxException
3331         {
3332             int p = start;
3333             int q;
3334             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3335             if ((m <= p) || (strict && (m != n)))
3336                 return -1;
3337             for (;;) {
3338                 // Per RFC2732: At most three digits per byte
3339                 // Further constraint: Each element fits in a byte
3340                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3341                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3342                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3343                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3344                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3345                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3346                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3347                 if (q < m) break;
3348                 return q;
3349             }
3350             fail("Malformed IPv4 address", q);
3351             return -1;
3352         }
3353 
3354         // Take an IPv4 address: Throw an exception if the given interval
3355         // contains anything except an IPv4 address
3356         //
3357         private int takeIPv4Address(int start, int n, String expected)
3358             throws URISyntaxException
3359         {
3360             int p = scanIPv4Address(start, n, true);
3361             if (p <= start)
3362                 failExpecting(expected, start);
3363             return p;
3364         }
3365 
3366         // Attempt to parse an IPv4 address, returning -1 on failure but
3367         // allowing the given interval to contain [:<characters>] after
3368         // the IPv4 address.
3369         //
3370         private int parseIPv4Address(int start, int n) {
3371             int p;
3372 
3373             try {
3374                 p = scanIPv4Address(start, n, false);
3375             } catch (URISyntaxException x) {
3376                 return -1;
3377             } catch (NumberFormatException nfe) {
3378                 return -1;
3379             }
3380 
3381             if (p > start && p < n) {
3382                 // IPv4 address is followed by something - check that
3383                 // it's a ":" as this is the only valid character to
3384                 // follow an address.
3385                 if (input.charAt(p) != ':') {
3386                     p = -1;
3387                 }
3388             }
3389 
3390             if (p > start)
3391                 host = input.substring(start, p);
3392 
3393             return p;
3394         }
3395 
3396         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3397         // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
3398         // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
3399         //
3400         private int parseHostname(int start, int n)
3401             throws URISyntaxException
3402         {
3403             int p = start;
3404             int q;
3405             int l = -1;                 // Start of last parsed label
3406 
3407             do {
3408                 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3409                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3410                 if (q <= p)
3411                     break;
3412                 l = p;
3413                 if (q > p) {
3414                     p = q;
3415                     q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3416                     if (q > p) {
3417                         if (input.charAt(q - 1) == '-')
3418                             fail("Illegal character in hostname", q - 1);
3419                         p = q;
3420                     }
3421                 }
3422                 q = scan(p, n, '.');
3423                 if (q <= p)
3424                     break;
3425                 p = q;
3426             } while (p < n);
3427 
3428             if ((p < n) && !at(p, n, ':'))
3429                 fail("Illegal character in hostname", p);
3430 
3431             if (l < 0)
3432                 failExpecting("hostname", start);
3433 
3434             // for a fully qualified hostname check that the rightmost
3435             // label starts with an alpha character.
3436             if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
3437                 fail("Illegal character in hostname", l);
3438             }
3439 
3440             host = input.substring(start, p);
3441             return p;
3442         }
3443 
3444 
3445         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3446         //
3447         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3448         // the form ::12.34.56.78, which are clearly shown in the examples
3449         // earlier in the document.  Here is the original grammar:
3450         //
3451         //   IPv6address = hexpart [ ":" IPv4address ]
3452         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3453         //   hexseq      = hex4 *( ":" hex4)
3454         //   hex4        = 1*4HEXDIG
3455         //
3456         // We therefore use the following revised grammar:
3457         //
3458         //   IPv6address = hexseq [ ":" IPv4address ]
3459         //                 | hexseq [ "::" [ hexpost ] ]
3460         //                 | "::" [ hexpost ]
3461         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
3462         //   hexseq      = hex4 *( ":" hex4)
3463         //   hex4        = 1*4HEXDIG
3464         //
3465         // This covers all and only the following cases:
3466         //
3467         //   hexseq
3468         //   hexseq : IPv4address
3469         //   hexseq ::
3470         //   hexseq :: hexseq
3471         //   hexseq :: hexseq : IPv4address
3472         //   hexseq :: IPv4address
3473         //   :: hexseq
3474         //   :: hexseq : IPv4address
3475         //   :: IPv4address
3476         //   ::
3477         //
3478         // Additionally we constrain the IPv6 address as follows :-
3479         //
3480         //  i.  IPv6 addresses without compressed zeros should contain
3481         //      exactly 16 bytes.
3482         //
3483         //  ii. IPv6 addresses with compressed zeros should contain
3484         //      less than 16 bytes.
3485 
3486         private int ipv6byteCount = 0;
3487 
3488         private int parseIPv6Reference(int start, int n)
3489             throws URISyntaxException
3490         {
3491             int p = start;
3492             int q;
3493             boolean compressedZeros = false;
3494 
3495             q = scanHexSeq(p, n);
3496 
3497             if (q > p) {
3498                 p = q;
3499                 if (at(p, n, "::")) {
3500                     compressedZeros = true;
3501                     p = scanHexPost(p + 2, n);
3502                 } else if (at(p, n, ':')) {
3503                     p = takeIPv4Address(p + 1,  n, "IPv4 address");
3504                     ipv6byteCount += 4;
3505                 }
3506             } else if (at(p, n, "::")) {
3507                 compressedZeros = true;
3508                 p = scanHexPost(p + 2, n);
3509             }
3510             if (p < n)
3511                 fail("Malformed IPv6 address", start);
3512             if (ipv6byteCount > 16)
3513                 fail("IPv6 address too long", start);
3514             if (!compressedZeros && ipv6byteCount < 16)
3515                 fail("IPv6 address too short", start);
3516             if (compressedZeros && ipv6byteCount == 16)
3517                 fail("Malformed IPv6 address", start);
3518 
3519             return p;
3520         }
3521 
3522         private int scanHexPost(int start, int n)
3523             throws URISyntaxException
3524         {
3525             int p = start;
3526             int q;
3527 
3528             if (p == n)
3529                 return p;
3530 
3531             q = scanHexSeq(p, n);
3532             if (q > p) {
3533                 p = q;
3534                 if (at(p, n, ':')) {
3535                     p++;
3536                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3537                     ipv6byteCount += 4;
3538                 }
3539             } else {
3540                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3541                 ipv6byteCount += 4;
3542             }
3543             return p;
3544         }
3545 
3546         // Scan a hex sequence; return -1 if one could not be scanned
3547         //
3548         private int scanHexSeq(int start, int n)
3549             throws URISyntaxException
3550         {
3551             int p = start;
3552             int q;
3553 
3554             q = scan(p, n, L_HEX, H_HEX);
3555             if (q <= p)
3556                 return -1;
3557             if (at(q, n, '.'))          // Beginning of IPv4 address
3558                 return -1;
3559             if (q > p + 4)
3560                 fail("IPv6 hexadecimal digit sequence too long", p);
3561             ipv6byteCount += 2;
3562             p = q;
3563             while (p < n) {
3564                 if (!at(p, n, ':'))
3565                     break;
3566                 if (at(p + 1, n, ':'))
3567                     break;              // "::"
3568                 p++;
3569                 q = scan(p, n, L_HEX, H_HEX);
3570                 if (q <= p)
3571                     failExpecting("digits for an IPv6 address", p);
3572                 if (at(q, n, '.')) {    // Beginning of IPv4 address
3573                     p--;
3574                     break;
3575                 }
3576                 if (q > p + 4)
3577                     fail("IPv6 hexadecimal digit sequence too long", p);
3578                 ipv6byteCount += 2;
3579                 p = q;
3580             }
3581 
3582             return p;
3583         }
3584 
3585     }
// Register an access object with SharedSecrets whose createURI builds
// a URI through the (scheme, path) constructor
static {
    JavaNetAccess access = new JavaNetAccess() {
        @Override
        public URI createURI(String scheme, String path) {
            return new URI(scheme, path);
        }
    };
    SharedSecrets.setJavaNetAccess(access);
}
3595 }