1 /*
   2  * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.net;
  27 
  28 import java.io.IOException;
  29 import java.io.InvalidObjectException;
  30 import java.io.ObjectInputStream;
  31 import java.io.ObjectOutputStream;
  32 import java.io.Serializable;
  33 import java.nio.ByteBuffer;
  34 import java.nio.CharBuffer;
  35 import java.nio.charset.CharsetDecoder;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.CharacterCodingException;
  39 import java.text.Normalizer;
  40 import jdk.internal.loader.URLClassPath;
  41 import jdk.internal.misc.JavaNetUriAccess;
  42 import jdk.internal.misc.SharedSecrets;
  43 import sun.nio.cs.ThreadLocalCoders;
  44 
  45 import java.lang.Character;             // for javadoc
  46 import java.lang.NullPointerException;  // for javadoc
  47 
  48 
  49 /**
  50  * Represents a Uniform Resource Identifier (URI) reference.
  51  *
  52  * <p> Aside from some minor deviations noted below, an instance of this
  53  * class represents a URI reference as defined by
  54  * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
  55  * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
  56  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
  57  * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
  58  * also supports scope_ids. The syntax and usage of scope_ids is described
  59  * <a href="Inet6Address.html#scoped">here</a>.
  60  * This class provides constructors for creating URI instances from
  61  * their components or by parsing their string forms, methods for accessing the
  62  * various components of an instance, and methods for normalizing, resolving,
  63  * and relativizing URI instances.  Instances of this class are immutable.
  64  *
  65  *
  66  * <h3> URI syntax and components </h3>
  67  *
  68  * At the highest level a URI reference (hereinafter simply "URI") in string
  69  * form has the syntax
  70  *
  71  * <blockquote>
  72  * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>]
  73  * </blockquote>
  74  *
  75  * where square brackets [...] delineate optional components and the characters
  76  * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves.
  77  *
  78  * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
  79  * said to be <i>relative</i>.  URIs are also classified according to whether
  80  * they are <i>opaque</i> or <i>hierarchical</i>.
  81  *
  82  * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
  83  * not begin with a slash character ({@code '/'}).  Opaque URIs are not
  84  * subject to further parsing.  Some examples of opaque URIs are:
  85  *
  86  * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
  87  * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr>
  88  * <tr><td>{@code news:comp.lang.java}</td></tr>
  89  * <tr><td>{@code urn:isbn:096139210x}</td></tr>
  90  * </table></blockquote>
  91  *
  92  * <p> A <i>hierarchical</i> URI is either an absolute URI whose
  93  * scheme-specific part begins with a slash character, or a relative URI, that
  94  * is, a URI that does not specify a scheme.  Some examples of hierarchical
  95  * URIs are:
  96  *
  97  * <blockquote>
  98  * {@code http://example.com/languages/java/}<br>
  99  * {@code sample/a/index.html#28}<br>
 100  * {@code ../../demo/b/index.html}<br>
 101  * {@code file:///~/calendar}
 102  * </blockquote>
 103  *
 104  * <p> A hierarchical URI is subject to further parsing according to the syntax
 105  *
 106  * <blockquote>
 107  * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
 108  * </blockquote>
 109  *
 110  * where the characters <b>{@code :}</b>, <b>{@code /}</b>,
 111  * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves.  The
 112  * scheme-specific part of a hierarchical URI consists of the characters
 113  * between the scheme and fragment components.
 114  *
 115  * <p> The authority component of a hierarchical URI is, if specified, either
 116  * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
 117  * parses according to the familiar syntax
 118  *
 119  * <blockquote>
 120  * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]
 121  * </blockquote>
 122  *
 123  * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for
 124  * themselves.  Nearly all URI schemes currently in use are server-based.  An
 125  * authority component that does not parse in this way is considered to be
 126  * registry-based.
 127  *
 128  * <p> The path component of a hierarchical URI is itself said to be absolute
 129  * if it begins with a slash character ({@code '/'}); otherwise it is
 130  * relative.  The path of a hierarchical URI that is either absolute or
 131  * specifies an authority is always absolute.
 132  *
 133  * <p> All told, then, a URI instance has the following nine components:
 134  *
 135  * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
 136  * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
 137  * <tr><td>scheme</td><td>{@code String}</td></tr>
 138  * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td>{@code String}</td></tr>
 139  * <tr><td>authority</td><td>{@code String}</td></tr>
 140  * <tr><td>user-info</td><td>{@code String}</td></tr>
 141  * <tr><td>host</td><td>{@code String}</td></tr>
 142  * <tr><td>port</td><td>{@code int}</td></tr>
 143  * <tr><td>path</td><td>{@code String}</td></tr>
 144  * <tr><td>query</td><td>{@code String}</td></tr>
 145  * <tr><td>fragment</td><td>{@code String}</td></tr>
 146  * </table></blockquote>
 147  *
 148  * In a given instance any particular component is either <i>undefined</i> or
 149  * <i>defined</i> with a distinct value.  Undefined string components are
 150  * represented by {@code null}, while undefined integer components are
 151  * represented by {@code -1}.  A string component may be defined to have the
 152  * empty string as its value; this is not equivalent to that component being
 153  * undefined.
 154  *
 155  * <p> Whether a particular component is or is not defined in an instance
 156  * depends upon the type of the URI being represented.  An absolute URI has a
 157  * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
 158  * possibly a fragment, but has no other components.  A hierarchical URI always
 159  * has a path (though it may be empty) and a scheme-specific-part (which at
 160  * least contains the path), and may have any of the other components.  If the
 161  * authority component is present and is server-based then the host component
 162  * will be defined and the user-information and port components may be defined.
 163  *
 164  *
 165  * <h4> Operations on URI instances </h4>
 166  *
 167  * The key operations supported by this class are those of
 168  * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
 169  *
 170  * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."}
 171  * and {@code ".."} segments from the path component of a hierarchical URI.
 172  * Each {@code "."} segment is simply removed.  A {@code ".."} segment is
 173  * removed only if it is preceded by a non-{@code ".."} segment.
 174  * Normalization has no effect upon opaque URIs.
 175  *
 176  * <p> <i>Resolution</i> is the process of resolving one URI against another,
 177  * <i>base</i> URI.  The resulting URI is constructed from components of both
 178  * URIs in the manner specified by RFC&nbsp;2396, taking components from the
 179  * base URI for those not specified in the original.  For hierarchical URIs,
 180  * the path of the original is resolved against the path of the base and then
 181  * normalized.  The result, for example, of resolving
 182  *
 183  * <blockquote>
 184  * {@code sample/a/index.html#28}
 185  * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 186  * &nbsp;&nbsp;&nbsp;&nbsp;(1)
 187  * </blockquote>
 188  *
 189  * against the base URI {@code http://example.com/languages/java/} is the result
 190  * URI
 191  *
 192  * <blockquote>
 193  * {@code http://example.com/languages/java/sample/a/index.html#28}
 194  * </blockquote>
 195  *
 196  * Resolving the relative URI
 197  *
 198  * <blockquote>
 199  * {@code ../../demo/b/index.html}&nbsp;&nbsp;&nbsp;&nbsp;(2)
 200  * </blockquote>
 201  *
 202  * against this result yields, in turn,
 203  *
 204  * <blockquote>
 205  * {@code http://example.com/languages/java/demo/b/index.html}
 206  * </blockquote>
 207  *
 208  * Resolution of both absolute and relative URIs, and of both absolute and
 209  * relative paths in the case of hierarchical URIs, is supported.  Resolving
 210  * the URI {@code file:///~calendar} against any other URI simply yields the
 211  * original URI, since it is absolute.  Resolving the relative URI (2) above
 212  * against the relative base URI (1) yields the normalized, but still relative,
 213  * URI
 214  *
 215  * <blockquote>
 216  * {@code demo/b/index.html}
 217  * </blockquote>
 218  *
 219  * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
 220  * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
 221  *
 222  * <blockquote>
 223  *   <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;and<br>
 224  *   <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )}&nbsp;&nbsp;.<br>
 225  * </blockquote>
 226  *
 227  * This operation is often useful when constructing a document containing URIs
 228  * that must be made relative to the base URI of the document wherever
 229  * possible.  For example, relativizing the URI
 230  *
 231  * <blockquote>
 232  * {@code http://example.com/languages/java/sample/a/index.html#28}
 233  * </blockquote>
 234  *
 235  * against the base URI
 236  *
 237  * <blockquote>
 238  * {@code http://example.com/languages/java/}
 239  * </blockquote>
 240  *
 241  * yields the relative URI {@code sample/a/index.html#28}.
 242  *
 243  *
 244  * <h4> Character categories </h4>
 245  *
 246  * RFC&nbsp;2396 specifies precisely which characters are permitted in the
 247  * various components of a URI reference.  The following categories, most of
 248  * which are taken from that specification, are used below to describe these
 249  * constraints:
 250  *
 251  * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
 252  *   <tr><th valign=top><i>alpha</i></th>
 253  *       <td>The US-ASCII alphabetic characters,
 254  *        {@code 'A'}&nbsp;through&nbsp;{@code 'Z'}
 255  *        and {@code 'a'}&nbsp;through&nbsp;{@code 'z'}</td></tr>
 256  *   <tr><th valign=top><i>digit</i></th>
 257  *       <td>The US-ASCII decimal digit characters,
 258  *       {@code '0'}&nbsp;through&nbsp;{@code '9'}</td></tr>
 259  *   <tr><th valign=top><i>alphanum</i></th>
 260  *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
 261  *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>
 262  *       <td>All <i>alphanum</i> characters together with those in the string
 263  *        {@code "_-!.~'()*"}</td></tr>
 264  *   <tr><th valign=top><i>punct</i></th>
 265  *       <td>The characters in the string {@code ",;:$&+="}</td></tr>
 266  *   <tr><th valign=top><i>reserved</i></th>
 267  *       <td>All <i>punct</i> characters together with those in the string
 268  *        {@code "?/[]@"}</td></tr>
 269  *   <tr><th valign=top><i>escaped</i></th>
 270  *       <td>Escaped octets, that is, triplets consisting of the percent
 271  *           character ({@code '%'}) followed by two hexadecimal digits
 272  *           ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and
 273  *           {@code 'a'}-{@code 'f'})</td></tr>
 274  *   <tr><th valign=top><i>other</i></th>
 275  *       <td>The Unicode characters that are not in the US-ASCII character set,
 276  *           are not control characters (according to the {@link
 277  *           java.lang.Character#isISOControl(char) Character.isISOControl}
 278  *           method), and are not space characters (according to the {@link
 279  *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
 280  *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
 281  *           limited to US-ASCII)</i></td></tr>
 282  * </table></blockquote>
 283  *
 284  * <p><a id="legal-chars"></a> The set of all legal URI characters consists of
 285  * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
 286  * characters.
 287  *
 288  *
 289  * <h4> Escaped octets, quotation, encoding, and decoding </h4>
 290  *
 291  * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
 292  * fragment components.  Escaping serves two purposes in URIs:
 293  *
 294  * <ul>
 295  *
 296  *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
 297  *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
 298  *   characters.  </p></li>
 299  *
 300  *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
 301  *   component.  The user-info, path, query, and fragment components differ
 302  *   slightly in terms of which characters are considered legal and illegal.
 303  *   </p></li>
 304  *
 305  * </ul>
 306  *
 307  * These purposes are served in this class by three related operations:
 308  *
 309  * <ul>
 310  *
 311  *   <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it
 312  *   with the sequence of escaped octets that represent that character in the
 313  *   UTF-8 character set.  The Euro currency symbol ({@code '\u005Cu20AC'}),
 314  *   for example, is encoded as {@code "%E2%82%AC"}.  <i>(<b>Deviation from
 315  *   RFC&nbsp;2396</b>, which does not specify any particular character
 316  *   set.)</i> </p></li>
 317  *
 318  *   <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by
 319  *   encoding it.  The space character, for example, is quoted by replacing it
 320  *   with {@code "%20"}.  UTF-8 contains US-ASCII, hence for US-ASCII
 321  *   characters this transformation has exactly the effect required by
 322  *   RFC&nbsp;2396. </p></li>
 323  *
 324  *   <li><p><a id="decode"></a>
 325  *   A sequence of escaped octets is <i>decoded</i> by
 326  *   replacing it with the sequence of characters that it represents in the
 327  *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
 328  *   effect of de-quoting any quoted US-ASCII characters as well as that of
 329  *   decoding any encoded non-US-ASCII characters.  If a <a
 330  *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
 331  *   when decoding the escaped octets then the erroneous octets are replaced by
 332  *   {@code '\u005CuFFFD'}, the Unicode replacement character.  </p></li>
 333  *
 334  * </ul>
 335  *
 336  * These operations are exposed in the constructors and methods of this class
 337  * as follows:
 338  *
 339  * <ul>
 340  *
 341  *   <li><p> The {@linkplain #URI(java.lang.String) single-argument
 342  *   constructor} requires any illegal characters in its argument to be
 343  *   quoted and preserves any escaped octets and <i>other</i> characters that
 344  *   are present.  </p></li>
 345  *
 346  *   <li><p> The {@linkplain
 347  *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
 348  *   multi-argument constructors} quote illegal characters as
 349  *   required by the components in which they appear.  The percent character
 350  *   ({@code '%'}) is always quoted by these constructors.  Any <i>other</i>
 351  *   characters are preserved.  </p></li>
 352  *
 353  *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
 354  *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
 355  *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
 356  *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
 357  *   values of their corresponding components in raw form, without interpreting
 358  *   any escaped octets.  The strings returned by these methods may contain
 359  *   both escaped octets and <i>other</i> characters, and will not contain any
 360  *   illegal characters.  </p></li>
 361  *
 362  *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
 363  *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
 364  *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
 365  *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
 366  *   octets in their corresponding components.  The strings returned by these
 367  *   methods may contain both <i>other</i> characters and illegal characters,
 368  *   and will not contain any escaped octets.  </p></li>
 369  *
 370  *   <li><p> The {@link #toString() toString} method returns a URI string with
 371  *   all necessary quotation but which may contain <i>other</i> characters.
 372  *   </p></li>
 373  *
 374  *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
 375  *   quoted and encoded URI string that does not contain any <i>other</i>
 376  *   characters.  </p></li>
 377  *
 378  * </ul>
 379  *
 380  *
 381  * <h4> Identities </h4>
 382  *
 383  * For any URI <i>u</i>, it is always the case that
 384  *
 385  * <blockquote>
 386  * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )}&nbsp;.
 387  * </blockquote>
 388  *
 389  * For any URI <i>u</i> that does not contain redundant syntax such as two
 390  * slashes before an empty authority (as in {@code file:///tmp/}&nbsp;) or a
 391  * colon following a host name but no port (as in
 392  * {@code http://java.sun.com:}&nbsp;), and that does not encode characters
 393  * except those that must be quoted, the following identities also hold:
 394  * <pre>
 395  *     new URI(<i>u</i>.getScheme(),
 396  *             <i>u</i>.getSchemeSpecificPart(),
 397  *             <i>u</i>.getFragment())
 398  *     .equals(<i>u</i>)</pre>
 399  * in all cases,
 400  * <pre>
 401  *     new URI(<i>u</i>.getScheme(),
 402  *             <i>u</i>.getAuthority(),
 403  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 404  *             <i>u</i>.getFragment())
 405  *     .equals(<i>u</i>)</pre>
 406  * if <i>u</i> is hierarchical, and
 407  * <pre>
 408  *     new URI(<i>u</i>.getScheme(),
 409  *             <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(),
 410  *             <i>u</i>.getPath(), <i>u</i>.getQuery(),
 411  *             <i>u</i>.getFragment())
 412  *     .equals(<i>u</i>)</pre>
 413  * if <i>u</i> is hierarchical and has either no authority or a server-based
 414  * authority.
 415  *
 416  *
 417  * <h4> URIs, URLs, and URNs </h4>
 418  *
 419  * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
 420  * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
 421  * not every URI is a URL.  This is because there is another subcategory of
 422  * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
 423  * specify how to locate them.  The {@code mailto}, {@code news}, and
 424  * {@code isbn} URIs shown above are examples of URNs.
 425  *
 426  * <p> The conceptual distinction between URIs and URLs is reflected in the
 427  * differences between this class and the {@link URL} class.
 428  *
 429  * <p> An instance of this class represents a URI reference in the syntactic
 430  * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
 431  * A URI string is parsed according to the generic syntax without regard to the
 432  * scheme, if any, that it specifies.  No lookup of the host, if any, is
 433  * performed, and no scheme-dependent stream handler is constructed.  Equality,
 434  * hashing, and comparison are defined strictly in terms of the character
 435  * content of the instance.  In other words, a URI instance is little more than
 436  * a structured string that supports the syntactic, scheme-independent
 437  * operations of comparison, normalization, resolution, and relativization.
 438  *
 439  * <p> An instance of the {@link URL} class, by contrast, represents the
 440  * syntactic components of a URL together with some of the information required
 441  * to access the resource that it describes.  A URL must be absolute, that is,
 442  * it must always specify a scheme.  A URL string is parsed according to its
 443  * scheme.  A stream handler is always established for a URL, and in fact it is
 444  * impossible to create a URL instance for a scheme for which no handler is
 445  * available.  Equality and hashing depend upon both the scheme and the
 446  * Internet address of the host, if any; comparison is not defined.  In other
 447  * words, a URL is a structured string that supports the syntactic operation of
 448  * resolution as well as the network I/O operations of looking up the host and
 449  * opening a connection to the specified resource.
 450  *
 451  *
 452  * @author Mark Reinhold
 453  * @since 1.4
 454  *
 455  * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
 456  * transformation format of ISO 10646</i></a>, <br><a
 457  * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
 458  * Architecture</i></a>, <br><a
 459  * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
 460  * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
 461  * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
 462  * Literal IPv6 Addresses in URLs</i></a>, <br><a
 463  * href="URISyntaxException.html">URISyntaxException</a>
 464  */
 465 
 466 public final class URI
 467     implements Comparable<URI>, Serializable
 468 {
 469 
 470     // Note: Comments containing the word "ASSERT" indicate places where a
 471     // throw of an InternalError should be replaced by an appropriate assertion
 472     // statement once asserts are enabled in the build.
 473 
 474     static final long serialVersionUID = -6052424284110960213L;
 475 
 476 
 477     // -- Properties and components of this instance --
 478 
 479     // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
 480     private transient String scheme;            // null ==> relative URI
 481     private transient String fragment;          // null ==> undefined
 482 
 483     // Hierarchical URI components: [//<authority>]<path>[?<query>]
 484     private transient String authority;         // Registry or server; null ==> undefined
 485 
 486     // Server-based authority: [<userInfo>@]<host>[:<port>]
 487     private transient String userInfo;          // null ==> undefined
 488     private transient String host;              // null ==> registry-based
 489     private transient int port = -1;            // -1 ==> undefined
 490 
 491     // Remaining components of hierarchical URIs
 492     private transient String path;              // null ==> opaque
 493     private transient String query;             // null ==> undefined
 494 
 495     // The remaining fields may be computed on demand, which is safe even in
 496     // the face of multiple threads racing to initialize them
 497     private transient String schemeSpecificPart;
 498     private transient int hash;        // Zero ==> undefined
 499     // NOTE(review): decoded* presumably cache the decoded component forms; confirm
 500     private transient String decodedUserInfo;
 501     private transient String decodedAuthority;
 502     private transient String decodedPath;
 503     private transient String decodedQuery;
 504     private transient String decodedFragment;
 505     private transient String decodedSchemeSpecificPart;
 506 
 507     /**
 508      * The string form of this URI.
 509      *
 510      * @serial
 511      */
 512     private volatile String string;             // The only serializable field
 513 
 514 
 515 
 516     // -- Constructors and factories --
 517 
 518     private URI() { }       // Used internally; leaves all fields at their initial values
 519 
 520     /**
 521      * Constructs a URI by parsing the given string.
 522      *
 523      * <p> This constructor parses the given string exactly as specified by the
 524      * grammar in <a
 525      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 526      * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
 527      *
 528      * <ul>
 529      *
 530      *   <li><p> An empty authority component is permitted as long as it is
 531      *   followed by a non-empty path, a query component, or a fragment
 532      *   component.  This allows the parsing of URIs such as
 533      *   {@code "file:///foo/bar"}, which seems to be the intent of
 534      *   RFC&nbsp;2396 although the grammar does not permit it.  If the
 535      *   authority component is empty then the user-information, host, and port
 536      *   components are undefined. </p></li>
 537      *
 538      *   <li><p> Empty relative paths are permitted; this seems to be the
 539      *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
 540      *   primary consequence of this deviation is that a standalone fragment
 541      *   such as {@code "#foo"} parses as a relative URI with an empty path
 542      *   and the given fragment, and can be usefully <a
 543      *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
 544      *
 545      *   <li><p> IPv4 addresses in host components are parsed rigorously, as
 546      *   specified by <a
 547      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
 548      *   element of a dotted-quad address must contain no more than three
 549      *   decimal digits.  Each element is further constrained to have a value
 550      *   no greater than 255. </p></li>
 551      *
 552      *   <li> <p> Hostnames in host components that comprise only a single
 553      *   domain label are permitted to start with an <i>alphanum</i>
 554      *   character. This seems to be the intent of <a
 555      *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 556      *   section&nbsp;3.2.2 although the grammar does not permit it. The
 557      *   consequence of this deviation is that the authority component of a
 558      *   hierarchical URI such as {@code s://123}, will parse as a server-based
 559      *   authority. </p></li>
 560      *
 561      *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
 562      *   address must be enclosed in square brackets ({@code '['} and
 563      *   {@code ']'}) as specified by <a
 564      *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
 565      *   IPv6 address itself must parse according to <a
 566      *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
 567      *   addresses are further constrained to describe no more than sixteen
 568      *   bytes of address information, a constraint implicit in RFC&nbsp;2373
 569      *   but not expressible in the grammar. </p></li>
 570      *
 571      *   <li><p> Characters in the <i>other</i> category are permitted wherever
 572      *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
 573      *   user-information, path, query, and fragment components, as well as in
 574      *   the authority component if the authority is registry-based.  This
 575      *   allows URIs to contain Unicode characters beyond those in the US-ASCII
 576      *   character set. </p></li>
 577      *
 578      * </ul>
 579      *
 580      * @param  str   The string to be parsed into a URI
 581      *
 582      * @throws  NullPointerException
 583      *          If {@code str} is {@code null}
 584      *
 585      * @throws  URISyntaxException
 586      *          If the given string violates RFC&nbsp;2396, as augmented
 587      *          by the above deviations
 588      */
 589     public URI(String str) throws URISyntaxException {
 590         new Parser(str).parse(false);  // NOTE(review): false presumably means "server-based authority not required" — confirm against Parser.parse
 591     }
 592 
 593     /**
 594      * Constructs a hierarchical URI from the given components.
 595      *
 596      * <p> If a scheme is given then the path, if also given, must either be
 597      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 598      * component of the new URI may be left undefined by passing {@code null}
 599      * for the corresponding parameter or, in the case of the {@code port}
 600      * parameter, by passing {@code -1}.
 601      *
 602      * <p> This constructor first builds a URI string from the given components
 603      * according to the rules specified in <a
 604      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 605      * section&nbsp;5.2, step&nbsp;7: </p>
 606      *
 607      * <ol>
 608      *
 609      *   <li><p> Initially, the result string is empty. </p></li>
 610      *
 611      *   <li><p> If a scheme is given then it is appended to the result,
 612      *   followed by a colon character ({@code ':'}).  </p></li>
 613      *
 614      *   <li><p> If user information, a host, or a port are given then the
 615      *   string {@code "//"} is appended.  </p></li>
 616      *
 617      *   <li><p> If user information is given then it is appended, followed by
 618      *   a commercial-at character ({@code '@'}).  Any character not in the
 619      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 620      *   categories is <a href="#quote">quoted</a>.  </p></li>
 621      *
 622      *   <li><p> If a host is given then it is appended.  If the host is a
 623      *   literal IPv6 address but is not enclosed in square brackets
 624      *   ({@code '['} and {@code ']'}) then the square brackets are added.
 625      *   </p></li>
 626      *
 627      *   <li><p> If a port number is given then a colon character
 628      *   ({@code ':'}) is appended, followed by the port number in decimal.
 629      *   </p></li>
 630      *
 631      *   <li><p> If a path is given then it is appended.  Any character not in
 632      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 633      *   categories, and not equal to the slash character ({@code '/'}) or the
 634      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 635      *
 636      *   <li><p> If a query is given then a question-mark character
 637      *   ({@code '?'}) is appended, followed by the query.  Any character that
 638      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 639      *   </p></li>
 640      *
 641      *   <li><p> Finally, if a fragment is given then a hash character
 642      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 643      *   that is not a legal URI character is quoted.  </p></li>
 644      *
 645      * </ol>
 646      *
 647      * <p> The resulting URI string is then parsed as if by invoking the {@link
 648      * #URI(String)} constructor and then invoking the {@link
 649      * #parseServerAuthority()} method upon the result; this may cause a {@link
 650      * URISyntaxException} to be thrown.  </p>
 651      *
 652      * @param   scheme    Scheme name
 653      * @param   userInfo  User name and authorization information
 654      * @param   host      Host name
 655      * @param   port      Port number
 656      * @param   path      Path
 657      * @param   query     Query
 658      * @param   fragment  Fragment
 659      *
 660      * @throws URISyntaxException
 661      *         If both a scheme and a path are given but the path is relative,
 662      *         if the URI string constructed from the given components violates
 663      *         RFC&nbsp;2396, or if the authority component of the string is
 664      *         present but cannot be parsed as a server-based authority
 665      */
 666     public URI(String scheme,
 667                String userInfo, String host, int port,
 668                String path, String query, String fragment)
 669         throws URISyntaxException
 670     {
 671         String s = toString(scheme, null,
 672                             null, userInfo, host, port,
 673                             path, query, fragment);
 674         checkPath(s, scheme, path);
 675         new Parser(s).parse(true);
 676     }
 677 
 678     /**
 679      * Constructs a hierarchical URI from the given components.
 680      *
 681      * <p> If a scheme is given then the path, if also given, must either be
 682      * empty or begin with a slash character ({@code '/'}).  Otherwise a
 683      * component of the new URI may be left undefined by passing {@code null}
 684      * for the corresponding parameter.
 685      *
 686      * <p> This constructor first builds a URI string from the given components
 687      * according to the rules specified in <a
 688      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 689      * section&nbsp;5.2, step&nbsp;7: </p>
 690      *
 691      * <ol>
 692      *
 693      *   <li><p> Initially, the result string is empty.  </p></li>
 694      *
 695      *   <li><p> If a scheme is given then it is appended to the result,
 696      *   followed by a colon character ({@code ':'}).  </p></li>
 697      *
 698      *   <li><p> If an authority is given then the string {@code "//"} is
 699      *   appended, followed by the authority.  If the authority contains a
 700      *   literal IPv6 address then the address must be enclosed in square
 701      *   brackets ({@code '['} and {@code ']'}).  Any character not in the
 702      *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 703      *   categories, and not equal to the commercial-at character
 704      *   ({@code '@'}), is <a href="#quote">quoted</a>.  </p></li>
 705      *
 706      *   <li><p> If a path is given then it is appended.  Any character not in
 707      *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
 708      *   categories, and not equal to the slash character ({@code '/'}) or the
 709      *   commercial-at character ({@code '@'}), is quoted.  </p></li>
 710      *
 711      *   <li><p> If a query is given then a question-mark character
 712      *   ({@code '?'}) is appended, followed by the query.  Any character that
 713      *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
 714      *   </p></li>
 715      *
 716      *   <li><p> Finally, if a fragment is given then a hash character
 717      *   ({@code '#'}) is appended, followed by the fragment.  Any character
 718      *   that is not a legal URI character is quoted.  </p></li>
 719      *
 720      * </ol>
 721      *
 722      * <p> The resulting URI string is then parsed as if by invoking the {@link
 723      * #URI(String)} constructor and then invoking the {@link
 724      * #parseServerAuthority()} method upon the result; this may cause a {@link
 725      * URISyntaxException} to be thrown.  </p>
 726      *
 727      * @param   scheme     Scheme name
 728      * @param   authority  Authority
 729      * @param   path       Path
 730      * @param   query      Query
 731      * @param   fragment   Fragment
 732      *
 733      * @throws URISyntaxException
 734      *         If both a scheme and a path are given but the path is relative,
 735      *         if the URI string constructed from the given components violates
 736      *         RFC&nbsp;2396, or if the authority component of the string is
 737      *         present but cannot be parsed as a server-based authority
 738      */
 739     public URI(String scheme,
 740                String authority,
 741                String path, String query, String fragment)
 742         throws URISyntaxException
 743     {
 744         String s = toString(scheme, null,
 745                             authority, null, null, -1,
 746                             path, query, fragment);
 747         checkPath(s, scheme, path);
 748         new Parser(s).parse(false);
 749     }
 750 
 751     /**
 752      * Constructs a hierarchical URI from the given components.
 753      *
 754      * <p> A component may be left undefined by passing {@code null}.
 755      *
 756      * <p> This convenience constructor works as if by invoking the
 757      * seven-argument constructor as follows:
 758      *
 759      * <blockquote>
 760      * {@code new} {@link #URI(String, String, String, int, String, String, String)
 761      * URI}{@code (scheme, null, host, -1, path, null, fragment);}
 762      * </blockquote>
 763      *
 764      * @param   scheme    Scheme name
 765      * @param   host      Host name
 766      * @param   path      Path
 767      * @param   fragment  Fragment
 768      *
 769      * @throws  URISyntaxException
 770      *          If the URI string constructed from the given components
 771      *          violates RFC&nbsp;2396
 772      */
 773     public URI(String scheme, String host, String path, String fragment)
 774         throws URISyntaxException
 775     {
 776         this(scheme, null, host, -1, path, null, fragment);
 777     }
 778 
 779     /**
 780      * Constructs a URI from the given components.
 781      *
 782      * <p> A component may be left undefined by passing {@code null}.
 783      *
 784      * <p> This constructor first builds a URI in string form using the given
 785      * components as follows:  </p>
 786      *
 787      * <ol>
 788      *
 789      *   <li><p> Initially, the result string is empty.  </p></li>
 790      *
 791      *   <li><p> If a scheme is given then it is appended to the result,
 792      *   followed by a colon character ({@code ':'}).  </p></li>
 793      *
 794      *   <li><p> If a scheme-specific part is given then it is appended.  Any
 795      *   character that is not a <a href="#legal-chars">legal URI character</a>
 796      *   is <a href="#quote">quoted</a>.  </p></li>
 797      *
 798      *   <li><p> Finally, if a fragment is given then a hash character
 799      *   ({@code '#'}) is appended to the string, followed by the fragment.
 800      *   Any character that is not a legal URI character is quoted.  </p></li>
 801      *
 802      * </ol>
 803      *
 804      * <p> The resulting URI string is then parsed in order to create the new
 805      * URI instance as if by invoking the {@link #URI(String)} constructor;
 806      * this may cause a {@link URISyntaxException} to be thrown.  </p>
 807      *
 808      * @param   scheme    Scheme name
 809      * @param   ssp       Scheme-specific part
 810      * @param   fragment  Fragment
 811      *
 812      * @throws  URISyntaxException
 813      *          If the URI string constructed from the given components
 814      *          violates RFC&nbsp;2396
 815      */
 816     public URI(String scheme, String ssp, String fragment)
 817         throws URISyntaxException
 818     {
 819         new Parser(toString(scheme, ssp,
 820                             null, null, null, -1,
 821                             null, null, fragment))
 822             .parse(false);
 823     }
 824 
 825     /**
 826      * Constructs a simple URI consisting of only a scheme and a pre-validated
 827      * path. Provides a fast-path for some internal cases.
 828      */
 829     URI(String scheme, String path) {
 830         assert validSchemeAndPath(scheme, path);
 831         this.scheme = scheme;
 832         this.path = path;
 833     }
 834 
 835     private static boolean validSchemeAndPath(String scheme, String path) {
 836         try {
 837             URI u = new URI(scheme + ":" + path);
 838             return scheme.equals(u.scheme) && path.equals(u.path);
 839         } catch (URISyntaxException e) {
 840             return false;
 841         }
 842     }
 843 
 844     /**
 845      * Creates a URI by parsing the given string.
 846      *
 847      * <p> This convenience factory method works as if by invoking the {@link
 848      * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
 849      * constructor is caught and wrapped in a new {@link
 850      * IllegalArgumentException} object, which is then thrown.
 851      *
     * <p> This method is provided for use in situations where it is known that
     * the given string is a legal URI, for example for URI constants declared
     * within a program, and so it would be considered a programming error
     * for the string not to parse as such.  The constructors, which throw
     * {@link URISyntaxException} directly, should be used in situations where a
     * URI is being constructed from user input or from some other source that
     * may be prone to errors.  </p>
 859      *
 860      * @param  str   The string to be parsed into a URI
 861      * @return The new URI
 862      *
 863      * @throws  NullPointerException
 864      *          If {@code str} is {@code null}
 865      *
 866      * @throws  IllegalArgumentException
 867      *          If the given string violates RFC&nbsp;2396
 868      */
 869     public static URI create(String str) {
 870         try {
 871             return new URI(str);
 872         } catch (URISyntaxException x) {
 873             throw new IllegalArgumentException(x.getMessage(), x);
 874         }
 875     }
 876 
 877 
 878     // -- Operations --
 879 
 880     /**
 881      * Attempts to parse this URI's authority component, if defined, into
 882      * user-information, host, and port components.
 883      *
 884      * <p> If this URI's authority component has already been recognized as
 885      * being server-based then it will already have been parsed into
 886      * user-information, host, and port components.  In this case, or if this
 887      * URI has no authority component, this method simply returns this URI.
 888      *
 889      * <p> Otherwise this method attempts once more to parse the authority
 890      * component into user-information, host, and port components, and throws
 891      * an exception describing why the authority component could not be parsed
 892      * in that way.
 893      *
 894      * <p> This method is provided because the generic URI syntax specified in
 895      * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
 896      * cannot always distinguish a malformed server-based authority from a
 897      * legitimate registry-based authority.  It must therefore treat some
 898      * instances of the former as instances of the latter.  The authority
 899      * component in the URI string {@code "//foo:bar"}, for example, is not a
 900      * legal server-based authority but it is legal as a registry-based
 901      * authority.
 902      *
     * <p> In many common situations, for example when working with URIs that
     * are known to be either URNs or URLs, the hierarchical URIs being used
     * will always be server-based.  They therefore must either be parsed as
     * such or treated as an error.  In these cases a statement such as
 907      *
 908      * <blockquote>
 909      * {@code URI }<i>u</i>{@code  = new URI(str).parseServerAuthority();}
 910      * </blockquote>
 911      *
 912      * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
 913      * it has an authority component, has a server-based authority with proper
 914      * user-information, host, and port components.  Invoking this method also
 915      * ensures that if the authority could not be parsed in that way then an
 916      * appropriate diagnostic message can be issued based upon the exception
 917      * that is thrown. </p>
 918      *
 919      * @return  A URI whose authority field has been parsed
 920      *          as a server-based authority
 921      *
 922      * @throws  URISyntaxException
 923      *          If the authority component of this URI is defined
 924      *          but cannot be parsed as a server-based authority
 925      *          according to RFC&nbsp;2396
 926      */
 927     public URI parseServerAuthority()
 928         throws URISyntaxException
 929     {
 930         // We could be clever and cache the error message and index from the
 931         // exception thrown during the original parse, but that would require
 932         // either more fields or a more-obscure representation.
 933         if ((host != null) || (authority == null))
 934             return this;
 935         new Parser(toString()).parse(true);
 936         return this;
 937     }
 938 
 939     /**
 940      * Normalizes this URI's path.
 941      *
 942      * <p> If this URI is opaque, or if its path is already in normal form,
 943      * then this URI is returned.  Otherwise a new URI is constructed that is
 944      * identical to this URI except that its path is computed by normalizing
 945      * this URI's path in a manner consistent with <a
 946      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 947      * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
 948      * </p>
 949      *
 950      * <ol>
 951      *
 952      *   <li><p> All {@code "."} segments are removed. </p></li>
 953      *
 954      *   <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."}
 955      *   segment then both of these segments are removed.  This step is
 956      *   repeated until it is no longer applicable. </p></li>
 957      *
 958      *   <li><p> If the path is relative, and if its first segment contains a
 959      *   colon character ({@code ':'}), then a {@code "."} segment is
 960      *   prepended.  This prevents a relative URI with a path such as
 961      *   {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a
 962      *   scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}.
 963      *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
 964      *
 965      * </ol>
 966      *
 967      * <p> A normalized path will begin with one or more {@code ".."} segments
 968      * if there were insufficient non-{@code ".."} segments preceding them to
 969      * allow their removal.  A normalized path will begin with a {@code "."}
 970      * segment if one was inserted by step 3 above.  Otherwise, a normalized
 971      * path will not contain any {@code "."} or {@code ".."} segments. </p>
 972      *
 973      * @return  A URI equivalent to this URI,
 974      *          but whose path is in normal form
 975      */
 976     public URI normalize() {
 977         return normalize(this);
 978     }
 979 
 980     /**
 981      * Resolves the given URI against this URI.
 982      *
 983      * <p> If the given URI is already absolute, or if this URI is opaque, then
 984      * the given URI is returned.
 985      *
 986      * <p><a id="resolve-frag"></a> If the given URI's fragment component is
 987      * defined, its path component is empty, and its scheme, authority, and
 988      * query components are undefined, then a URI with the given fragment but
 989      * with all other components equal to those of this URI is returned.  This
 990      * allows a URI representing a standalone fragment reference, such as
 991      * {@code "#foo"}, to be usefully resolved against a base URI.
 992      *
 993      * <p> Otherwise this method constructs a new hierarchical URI in a manner
 994      * consistent with <a
 995      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 996      * section&nbsp;5.2; that is: </p>
 997      *
 998      * <ol>
 999      *
1000      *   <li><p> A new URI is constructed with this URI's scheme and the given
1001      *   URI's query and fragment components. </p></li>
1002      *
1003      *   <li><p> If the given URI has an authority component then the new URI's
1004      *   authority and path are taken from the given URI. </p></li>
1005      *
1006      *   <li><p> Otherwise the new URI's authority component is copied from
1007      *   this URI, and its path is computed as follows: </p>
1008      *
1009      *   <ol>
1010      *
1011      *     <li><p> If the given URI's path is absolute then the new URI's path
1012      *     is taken from the given URI. </p></li>
1013      *
1014      *     <li><p> Otherwise the given URI's path is relative, and so the new
1015      *     URI's path is computed by resolving the path of the given URI
1016      *     against the path of this URI.  This is done by concatenating all but
1017      *     the last segment of this URI's path, if any, with the given URI's
1018      *     path and then normalizing the result as if by invoking the {@link
1019      *     #normalize() normalize} method. </p></li>
1020      *
1021      *   </ol></li>
1022      *
1023      * </ol>
1024      *
1025      * <p> The result of this method is absolute if, and only if, either this
1026      * URI is absolute or the given URI is absolute.  </p>
1027      *
1028      * @param  uri  The URI to be resolved against this URI
1029      * @return The resulting URI
1030      *
1031      * @throws  NullPointerException
1032      *          If {@code uri} is {@code null}
1033      */
1034     public URI resolve(URI uri) {
1035         return resolve(this, uri);
1036     }
1037 
1038     /**
1039      * Constructs a new URI by parsing the given string and then resolving it
1040      * against this URI.
1041      *
1042      * <p> This convenience method works as if invoking it were equivalent to
1043      * evaluating the expression {@link #resolve(java.net.URI)
1044      * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p>
1045      *
1046      * @param  str   The string to be parsed into a URI
1047      * @return The resulting URI
1048      *
1049      * @throws  NullPointerException
1050      *          If {@code str} is {@code null}
1051      *
1052      * @throws  IllegalArgumentException
1053      *          If the given string violates RFC&nbsp;2396
1054      */
1055     public URI resolve(String str) {
1056         return resolve(URI.create(str));
1057     }
1058 
1059     /**
1060      * Relativizes the given URI against this URI.
1061      *
1062      * <p> The relativization of the given URI against this URI is computed as
1063      * follows: </p>
1064      *
1065      * <ol>
1066      *
     *   <li><p> If either this URI or the given URI is opaque, or if the
     *   scheme and authority components of the two URIs are not identical, or
     *   if the path of this URI is not a prefix of the path of the given URI,
     *   then the given URI is returned. </p></li>
1071      *
1072      *   <li><p> Otherwise a new relative hierarchical URI is constructed with
1073      *   query and fragment components taken from the given URI and with a path
1074      *   component computed by removing this URI's path from the beginning of
1075      *   the given URI's path. </p></li>
1076      *
1077      * </ol>
1078      *
1079      * @param  uri  The URI to be relativized against this URI
1080      * @return The resulting URI
1081      *
1082      * @throws  NullPointerException
1083      *          If {@code uri} is {@code null}
1084      */
1085     public URI relativize(URI uri) {
1086         return relativize(this, uri);
1087     }
1088 
1089     /**
1090      * Constructs a URL from this URI.
1091      *
1092      * <p> This convenience method works as if invoking it were equivalent to
1093      * evaluating the expression {@code new URL(this.toString())} after
1094      * first checking that this URI is absolute. </p>
1095      *
1096      * @return  A URL constructed from this URI
1097      *
     * @throws  IllegalArgumentException
     *          If this URI is not absolute
1100      *
1101      * @throws  MalformedURLException
1102      *          If a protocol handler for the URL could not be found,
1103      *          or if some other error occurred while constructing the URL
1104      */
1105     public URL toURL() throws MalformedURLException {
1106         return URL.fromURI(this);
1107     }
1108 
1109     // -- Component access methods --
1110 
1111     /**
1112      * Returns the scheme component of this URI.
1113      *
     * <p> The scheme component of a URI, if defined, only contains characters
     * in the <i>alphanum</i> category and in the string {@code "-.+"}.  A
     * scheme always starts with an <i>alpha</i> character. </p>
     *
     * <p> The scheme component of a URI cannot contain escaped octets, hence
     * this method does not perform any decoding. </p>
1120      *
1121      * @return  The scheme component of this URI,
1122      *          or {@code null} if the scheme is undefined
1123      */
1124     public String getScheme() {
1125         return scheme;
1126     }
1127 
1128     /**
1129      * Tells whether or not this URI is absolute.
1130      *
1131      * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1132      *
1133      * @return  {@code true} if, and only if, this URI is absolute
1134      */
1135     public boolean isAbsolute() {
1136         return scheme != null;
1137     }
1138 
1139     /**
1140      * Tells whether or not this URI is opaque.
1141      *
     * <p> A URI is opaque if, and only if, it is absolute and its
     * scheme-specific part does not begin with a slash character
     * ({@code '/'}).  An opaque URI has a scheme, a scheme-specific part, and
     * possibly a fragment; all other components are undefined. </p>
1146      *
1147      * @return  {@code true} if, and only if, this URI is opaque
1148      */
1149     public boolean isOpaque() {
1150         return path == null;
1151     }
1152 
1153     /**
1154      * Returns the raw scheme-specific part of this URI.  The scheme-specific
1155      * part is never undefined, though it may be empty.
1156      *
1157      * <p> The scheme-specific part of a URI only contains legal URI
1158      * characters. </p>
1159      *
1160      * @return  The raw scheme-specific part of this URI
1161      *          (never {@code null})
1162      */
1163     public String getRawSchemeSpecificPart() {
1164         String part = schemeSpecificPart;
1165         if (part != null) {
1166             return part;
1167         }
1168 
1169         String s = string;
1170         if (s != null) {
1171             // if string is defined, components will have been parsed
1172             int start = 0;
1173             int end = s.length();
1174             if (scheme != null) {
1175                 start = scheme.length() + 1;
1176             }
1177             if (fragment != null) {
1178                 end -= fragment.length() + 1;
1179             }
1180             if (path != null && path.length() == end - start) {
1181                 part = path;
1182             } else {
1183                 part = s.substring(start, end);
1184             }
1185         } else {
1186             StringBuilder sb = new StringBuilder();
1187             appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1188                                  host, port, getPath(), getQuery());
1189             part = sb.toString();
1190         }
1191         return schemeSpecificPart = part;
1192     }
1193 
1194     /**
1195      * Returns the decoded scheme-specific part of this URI.
1196      *
1197      * <p> The string returned by this method is equal to that returned by the
1198      * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1199      * except that all sequences of escaped octets are <a
1200      * href="#decode">decoded</a>.  </p>
1201      *
1202      * @return  The decoded scheme-specific part of this URI
1203      *          (never {@code null})
1204      */
1205     public String getSchemeSpecificPart() {
1206         String part = decodedSchemeSpecificPart;
1207         if (part == null) {
1208             decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart());
1209         }
1210         return part;
1211     }
1212 
1213     /**
1214      * Returns the raw authority component of this URI.
1215      *
1216      * <p> The authority component of a URI, if defined, only contains the
1217      * commercial-at character ({@code '@'}) and characters in the
1218      * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1219      * categories.  If the authority is server-based then it is further
1220      * constrained to have valid user-information, host, and port
1221      * components. </p>
1222      *
1223      * @return  The raw authority component of this URI,
1224      *          or {@code null} if the authority is undefined
1225      */
1226     public String getRawAuthority() {
1227         return authority;
1228     }
1229 
1230     /**
1231      * Returns the decoded authority component of this URI.
1232      *
1233      * <p> The string returned by this method is equal to that returned by the
1234      * {@link #getRawAuthority() getRawAuthority} method except that all
1235      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1236      *
1237      * @return  The decoded authority component of this URI,
1238      *          or {@code null} if the authority is undefined
1239      */
1240     public String getAuthority() {
1241         String auth = decodedAuthority;
1242         if ((auth == null) && (authority != null)) {
1243             decodedAuthority = auth = decode(authority);
1244         }
1245         return auth;
1246     }
1247 
1248     /**
1249      * Returns the raw user-information component of this URI.
1250      *
1251      * <p> The user-information component of a URI, if defined, only contains
1252      * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1253      * <i>other</i> categories. </p>
1254      *
1255      * @return  The raw user-information component of this URI,
1256      *          or {@code null} if the user information is undefined
1257      */
1258     public String getRawUserInfo() {
1259         return userInfo;
1260     }
1261 
1262     /**
1263      * Returns the decoded user-information component of this URI.
1264      *
1265      * <p> The string returned by this method is equal to that returned by the
1266      * {@link #getRawUserInfo() getRawUserInfo} method except that all
1267      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1268      *
1269      * @return  The decoded user-information component of this URI,
1270      *          or {@code null} if the user information is undefined
1271      */
1272     public String getUserInfo() {
1273         String user = decodedUserInfo;
1274         if ((user == null) && (userInfo != null)) {
1275             decodedUserInfo = user = decode(userInfo);
1276         }
1277         return user;
1278     }
1279 
1280     /**
1281      * Returns the host component of this URI.
1282      *
1283      * <p> The host component of a URI, if defined, will have one of the
1284      * following forms: </p>
1285      *
1286      * <ul>
1287      *
1288      *   <li><p> A domain name consisting of one or more <i>labels</i>
1289      *   separated by period characters ({@code '.'}), optionally followed by
1290      *   a period character.  Each label consists of <i>alphanum</i> characters
1291      *   as well as hyphen characters ({@code '-'}), though hyphens never
     *   occur as the first or last characters in a label. The rightmost
     *   label of a domain name consisting of two or more labels begins
     *   with an <i>alpha</i> character. </p></li>
1295      *
1296      *   <li><p> A dotted-quad IPv4 address of the form
1297      *   <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +},
1298      *   where no <i>digit</i> sequence is longer than three characters and no
1299      *   sequence has a value larger than 255. </p></li>
1300      *
1301      *   <li><p> An IPv6 address enclosed in square brackets ({@code '['} and
1302      *   {@code ']'}) and consisting of hexadecimal digits, colon characters
1303      *   ({@code ':'}), and possibly an embedded IPv4 address.  The full
1304      *   syntax of IPv6 addresses is specified in <a
1305      *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
1306      *   Addressing Architecture</i></a>.  </p></li>
1307      *
1308      * </ul>
1309      *
1310      * The host component of a URI cannot contain escaped octets, hence this
1311      * method does not perform any decoding.
1312      *
1313      * @return  The host component of this URI,
1314      *          or {@code null} if the host is undefined
1315      */
1316     public String getHost() {
1317         return host;
1318     }
1319 
1320     /**
1321      * Returns the port number of this URI.
1322      *
1323      * <p> The port component of a URI, if defined, is a non-negative
1324      * integer. </p>
1325      *
1326      * @return  The port component of this URI,
1327      *          or {@code -1} if the port is undefined
1328      */
1329     public int getPort() {
1330         return port;
1331     }
1332 
1333     /**
1334      * Returns the raw path component of this URI.
1335      *
1336      * <p> The path component of a URI, if defined, only contains the slash
1337      * character ({@code '/'}), the commercial-at character ({@code '@'}),
1338      * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1339      * and <i>other</i> categories. </p>
1340      *
1341      * @return  The path component of this URI,
1342      *          or {@code null} if the path is undefined
1343      */
1344     public String getRawPath() {
1345         return path;
1346     }
1347 
1348     /**
1349      * Returns the decoded path component of this URI.
1350      *
1351      * <p> The string returned by this method is equal to that returned by the
1352      * {@link #getRawPath() getRawPath} method except that all sequences of
1353      * escaped octets are <a href="#decode">decoded</a>.  </p>
1354      *
1355      * @return  The decoded path component of this URI,
1356      *          or {@code null} if the path is undefined
1357      */
1358     public String getPath() {
1359         String decoded = decodedPath;
1360         if ((decoded == null) && (path != null)) {
1361             decodedPath = decoded = decode(path);
1362         }
1363         return decoded;
1364     }
1365 
1366     /**
1367      * Returns the raw query component of this URI.
1368      *
1369      * <p> The query component of a URI, if defined, only contains legal URI
1370      * characters. </p>
1371      *
1372      * @return  The raw query component of this URI,
1373      *          or {@code null} if the query is undefined
1374      */
1375     public String getRawQuery() {
1376         return query;
1377     }
1378 
1379     /**
1380      * Returns the decoded query component of this URI.
1381      *
1382      * <p> The string returned by this method is equal to that returned by the
1383      * {@link #getRawQuery() getRawQuery} method except that all sequences of
1384      * escaped octets are <a href="#decode">decoded</a>.  </p>
1385      *
1386      * @return  The decoded query component of this URI,
1387      *          or {@code null} if the query is undefined
1388      */
1389     public String getQuery() {
1390         String decoded = decodedQuery;
1391         if ((decoded == null) && (query != null)) {
1392             decodedQuery = decoded = decode(query, false);
1393         }
1394         return decoded;
1395     }
1396 
1397     /**
1398      * Returns the raw fragment component of this URI.
1399      *
1400      * <p> The fragment component of a URI, if defined, only contains legal URI
1401      * characters. </p>
1402      *
1403      * @return  The raw fragment component of this URI,
1404      *          or {@code null} if the fragment is undefined
1405      */
1406     public String getRawFragment() {
1407         return fragment;
1408     }
1409 
1410     /**
1411      * Returns the decoded fragment component of this URI.
1412      *
1413      * <p> The string returned by this method is equal to that returned by the
1414      * {@link #getRawFragment() getRawFragment} method except that all
1415      * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
1416      *
1417      * @return  The decoded fragment component of this URI,
1418      *          or {@code null} if the fragment is undefined
1419      */
1420     public String getFragment() {
1421         String decoded = decodedFragment;
1422         if ((decoded == null) && (fragment != null)) {
1423             decodedFragment = decoded = decode(fragment, false);
1424         }
1425         return decoded;
1426     }
1427 
1428 
1429     // -- Equality, comparison, hash code, toString, and serialization --
1430 
1431     /**
1432      * Tests this URI for equality with another object.
1433      *
1434      * <p> If the given object is not a URI then this method immediately
1435      * returns {@code false}.
1436      *
1437      * <p> For two URIs to be considered equal requires that either both are
1438      * opaque or both are hierarchical.  Their schemes must either both be
1439      * undefined or else be equal without regard to case. Their fragments
1440      * must either both be undefined or else be equal.
1441      *
1442      * <p> For two opaque URIs to be considered equal, their scheme-specific
1443      * parts must be equal.
1444      *
1445      * <p> For two hierarchical URIs to be considered equal, their paths must
1446      * be equal and their queries must either both be undefined or else be
1447      * equal.  Their authorities must either both be undefined, or both be
1448      * registry-based, or both be server-based.  If their authorities are
1449      * defined and are registry-based, then they must be equal.  If their
1450      * authorities are defined and are server-based, then their hosts must be
1451      * equal without regard to case, their port numbers must be equal, and
1452      * their user-information components must be equal.
1453      *
1454      * <p> When testing the user-information, path, query, fragment, authority,
1455      * or scheme-specific parts of two URIs for equality, the raw forms rather
1456      * than the encoded forms of these components are compared and the
1457      * hexadecimal digits of escaped octets are compared without regard to
1458      * case.
1459      *
1460      * <p> This method satisfies the general contract of the {@link
1461      * java.lang.Object#equals(Object) Object.equals} method. </p>
1462      *
1463      * @param   ob   The object to which this object is to be compared
1464      *
1465      * @return  {@code true} if, and only if, the given object is a URI that
1466      *          is identical to this URI
1467      */
1468     public boolean equals(Object ob) {
1469         if (ob == this)
1470             return true;
1471         if (!(ob instanceof URI))
1472             return false;
1473         URI that = (URI)ob;
1474         if (this.isOpaque() != that.isOpaque()) return false;
1475         if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1476         if (!equal(this.fragment, that.fragment)) return false;
1477 
1478         // Opaque
1479         if (this.isOpaque())
1480             return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1481 
1482         // Hierarchical
1483         if (!equal(this.path, that.path)) return false;
1484         if (!equal(this.query, that.query)) return false;
1485 
1486         // Authorities
1487         if (this.authority == that.authority) return true;
1488         if (this.host != null) {
1489             // Server-based
1490             if (!equal(this.userInfo, that.userInfo)) return false;
1491             if (!equalIgnoringCase(this.host, that.host)) return false;
1492             if (this.port != that.port) return false;
1493         } else if (this.authority != null) {
1494             // Registry-based
1495             if (!equal(this.authority, that.authority)) return false;
1496         } else if (this.authority != that.authority) {
1497             return false;
1498         }
1499 
1500         return true;
1501     }
1502 
1503     /**
1504      * Returns a hash-code value for this URI.  The hash code is based upon all
1505      * of the URI's components, and satisfies the general contract of the
1506      * {@link java.lang.Object#hashCode() Object.hashCode} method.
1507      *
1508      * @return  A hash-code value for this URI
1509      */
1510     public int hashCode() {
1511         int h = hash;
1512         if (h == 0) {
1513             h = hashIgnoringCase(0, scheme);
1514             h = hash(h, fragment);
1515             if (isOpaque()) {
1516                 h = hash(h, schemeSpecificPart);
1517             } else {
1518                 h = hash(h, path);
1519                 h = hash(h, query);
1520                 if (host != null) {
1521                     h = hash(h, userInfo);
1522                     h = hashIgnoringCase(h, host);
1523                     h += 1949 * port;
1524                 } else {
1525                     h = hash(h, authority);
1526                 }
1527             }
1528             if (h != 0) {
1529                 hash = h;
1530             }
1531         }
1532         return h;
1533     }
1534 
1535     /**
1536      * Compares this URI to another object, which must be a URI.
1537      *
1538      * <p> When comparing corresponding components of two URIs, if one
1539      * component is undefined but the other is defined then the first is
1540      * considered to be less than the second.  Unless otherwise noted, string
1541      * components are ordered according to their natural, case-sensitive
1542      * ordering as defined by the {@link java.lang.String#compareTo(Object)
1543      * String.compareTo} method.  String components that are subject to
1544      * encoding are compared by comparing their raw forms rather than their
1545      * encoded forms.
1546      *
1547      * <p> The ordering of URIs is defined as follows: </p>
1548      *
1549      * <ul>
1550      *
     *   <li><p> Two URIs with different schemes are ordered according to
     *   the ordering of their schemes, without regard to case. </p></li>
1553      *
1554      *   <li><p> A hierarchical URI is considered to be less than an opaque URI
1555      *   with an identical scheme. </p></li>
1556      *
1557      *   <li><p> Two opaque URIs with identical schemes are ordered according
1558      *   to the ordering of their scheme-specific parts. </p></li>
1559      *
1560      *   <li><p> Two opaque URIs with identical schemes and scheme-specific
1561      *   parts are ordered according to the ordering of their
1562      *   fragments. </p></li>
1563      *
1564      *   <li><p> Two hierarchical URIs with identical schemes are ordered
1565      *   according to the ordering of their authority components: </p>
1566      *
1567      *   <ul>
1568      *
1569      *     <li><p> If both authority components are server-based then the URIs
1570      *     are ordered according to their user-information components; if these
1571      *     components are identical then the URIs are ordered according to the
1572      *     ordering of their hosts, without regard to case; if the hosts are
1573      *     identical then the URIs are ordered according to the ordering of
1574      *     their ports. </p></li>
1575      *
1576      *     <li><p> If one or both authority components are registry-based then
1577      *     the URIs are ordered according to the ordering of their authority
1578      *     components. </p></li>
1579      *
1580      *   </ul></li>
1581      *
1582      *   <li><p> Finally, two hierarchical URIs with identical schemes and
1583      *   authority components are ordered according to the ordering of their
1584      *   paths; if their paths are identical then they are ordered according to
1585      *   the ordering of their queries; if the queries are identical then they
1586      *   are ordered according to the order of their fragments. </p></li>
1587      *
1588      * </ul>
1589      *
1590      * <p> This method satisfies the general contract of the {@link
1591      * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1592      * method. </p>
1593      *
1594      * @param   that
1595      *          The object to which this URI is to be compared
1596      *
1597      * @return  A negative integer, zero, or a positive integer as this URI is
1598      *          less than, equal to, or greater than the given URI
1599      *
1600      * @throws  ClassCastException
1601      *          If the given object is not a URI
1602      */
1603     public int compareTo(URI that) {
1604         int c;
1605 
1606         if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1607             return c;
1608 
1609         if (this.isOpaque()) {
1610             if (that.isOpaque()) {
1611                 // Both opaque
1612                 if ((c = compare(this.schemeSpecificPart,
1613                                  that.schemeSpecificPart)) != 0)
1614                     return c;
1615                 return compare(this.fragment, that.fragment);
1616             }
1617             return +1;                  // Opaque > hierarchical
1618         } else if (that.isOpaque()) {
1619             return -1;                  // Hierarchical < opaque
1620         }
1621 
1622         // Hierarchical
1623         if ((this.host != null) && (that.host != null)) {
1624             // Both server-based
1625             if ((c = compare(this.userInfo, that.userInfo)) != 0)
1626                 return c;
1627             if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1628                 return c;
1629             if ((c = this.port - that.port) != 0)
1630                 return c;
1631         } else {
1632             // If one or both authorities are registry-based then we simply
1633             // compare them in the usual, case-sensitive way.  If one is
1634             // registry-based and one is server-based then the strings are
1635             // guaranteed to be unequal, hence the comparison will never return
1636             // zero and the compareTo and equals methods will remain
1637             // consistent.
1638             if ((c = compare(this.authority, that.authority)) != 0) return c;
1639         }
1640 
1641         if ((c = compare(this.path, that.path)) != 0) return c;
1642         if ((c = compare(this.query, that.query)) != 0) return c;
1643         return compare(this.fragment, that.fragment);
1644     }
1645 
1646     /**
1647      * Returns the content of this URI as a string.
1648      *
1649      * <p> If this URI was created by invoking one of the constructors in this
1650      * class then a string equivalent to the original input string, or to the
1651      * string computed from the originally-given components, as appropriate, is
1652      * returned.  Otherwise this URI was created by normalization, resolution,
1653      * or relativization, and so a string is constructed from this URI's
1654      * components according to the rules specified in <a
1655      * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
1656      * section&nbsp;5.2, step&nbsp;7. </p>
1657      *
1658      * @return  The string form of this URI
1659      */
1660     public String toString() {
1661         String s = string;
1662         if (s == null) {
1663             s = defineString();
1664         }
1665         return s;
1666     }
1667 
1668     private String defineString() {
1669         String s = string;
1670         if (s != null) {
1671             return s;
1672         }
1673 
1674         StringBuilder sb = new StringBuilder();
1675         if (scheme != null) {
1676             sb.append(scheme);
1677             sb.append(':');
1678         }
1679         if (isOpaque()) {
1680             sb.append(schemeSpecificPart);
1681         } else {
1682             if (host != null) {
1683                 sb.append("//");
1684                 if (userInfo != null) {
1685                     sb.append(userInfo);
1686                     sb.append('@');
1687                 }
1688                 boolean needBrackets = ((host.indexOf(':') >= 0)
1689                         && !host.startsWith("[")
1690                         && !host.endsWith("]"));
1691                 if (needBrackets) sb.append('[');
1692                 sb.append(host);
1693                 if (needBrackets) sb.append(']');
1694                 if (port != -1) {
1695                     sb.append(':');
1696                     sb.append(port);
1697                 }
1698             } else if (authority != null) {
1699                 sb.append("//");
1700                 sb.append(authority);
1701             }
1702             if (path != null)
1703                 sb.append(path);
1704             if (query != null) {
1705                 sb.append('?');
1706                 sb.append(query);
1707             }
1708         }
1709         if (fragment != null) {
1710             sb.append('#');
1711             sb.append(fragment);
1712         }
1713         return string = sb.toString();
1714     }
1715 
1716     /**
1717      * Returns the content of this URI as a US-ASCII string.
1718      *
1719      * <p> If this URI does not contain any characters in the <i>other</i>
1720      * category then an invocation of this method will return the same value as
1721      * an invocation of the {@link #toString() toString} method.  Otherwise
1722      * this method works as if by invoking that method and then <a
1723      * href="#encode">encoding</a> the result.  </p>
1724      *
1725      * @return  The string form of this URI, encoded as needed
1726      *          so that it only contains characters in the US-ASCII
1727      *          charset
1728      */
1729     public String toASCIIString() {
1730         return encode(toString());
1731     }
1732 
1733 
1734     // -- Serialization support --
1735 
1736     /**
1737      * Saves the content of this URI to the given serial stream.
1738      *
1739      * <p> The only serializable field of a URI instance is its {@code string}
1740      * field.  That field is given a value, if it does not have one already,
1741      * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1742      * method of the given object-output stream is invoked. </p>
1743      *
1744      * @param  os  The object-output stream to which this object
1745      *             is to be written
1746      */
1747     private void writeObject(ObjectOutputStream os)
1748         throws IOException
1749     {
1750         defineString();
1751         os.defaultWriteObject();        // Writes the string field only
1752     }
1753 
1754     /**
1755      * Reconstitutes a URI from the given serial stream.
1756      *
1757      * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1758      * invoked to read the value of the {@code string} field.  The result is
1759      * then parsed in the usual way.
1760      *
1761      * @param  is  The object-input stream from which this object
1762      *             is being read
1763      */
1764     private void readObject(ObjectInputStream is)
1765         throws ClassNotFoundException, IOException
1766     {
1767         port = -1;                      // Argh
1768         is.defaultReadObject();
1769         try {
1770             new Parser(string).parse(false);
1771         } catch (URISyntaxException x) {
1772             IOException y = new InvalidObjectException("Invalid URI");
1773             y.initCause(x);
1774             throw y;
1775         }
1776     }
1777 
1778 
1779     // -- End of public methods --
1780 
1781 
1782     // -- Utility methods for string-field comparison and hashing --
1783 
1784     // These methods return appropriate values for null string arguments,
1785     // thereby simplifying the equals, hashCode, and compareTo methods.
1786     //
1787     // The case-ignoring methods should only be applied to strings whose
1788     // characters are all known to be US-ASCII.  Because of this restriction,
1789     // these methods are faster than the similar methods in the String class.
1790 
1791     // US-ASCII only
1792     private static int toLower(char c) {
1793         if ((c >= 'A') && (c <= 'Z'))
1794             return c + ('a' - 'A');
1795         return c;
1796     }
1797 
1798     // US-ASCII only
1799     private static int toUpper(char c) {
1800         if ((c >= 'a') && (c <= 'z'))
1801             return c - ('a' - 'A');
1802         return c;
1803     }
1804 
1805     private static boolean equal(String s, String t) {
1806         if (s == t) return true;
1807         if ((s != null) && (t != null)) {
1808             if (s.length() != t.length())
1809                 return false;
1810             if (s.indexOf('%') < 0)
1811                 return s.equals(t);
1812             int n = s.length();
1813             for (int i = 0; i < n;) {
1814                 char c = s.charAt(i);
1815                 char d = t.charAt(i);
1816                 if (c != '%') {
1817                     if (c != d)
1818                         return false;
1819                     i++;
1820                     continue;
1821                 }
1822                 if (d != '%')
1823                     return false;
1824                 i++;
1825                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1826                     return false;
1827                 i++;
1828                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1829                     return false;
1830                 i++;
1831             }
1832             return true;
1833         }
1834         return false;
1835     }
1836 
1837     // US-ASCII only
1838     private static boolean equalIgnoringCase(String s, String t) {
1839         if (s == t) return true;
1840         if ((s != null) && (t != null)) {
1841             int n = s.length();
1842             if (t.length() != n)
1843                 return false;
1844             for (int i = 0; i < n; i++) {
1845                 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1846                     return false;
1847             }
1848             return true;
1849         }
1850         return false;
1851     }
1852 
1853     private static int hash(int hash, String s) {
1854         if (s == null) return hash;
1855         return s.indexOf('%') < 0 ? hash * 127 + s.hashCode()
1856                                   : normalizedHash(hash, s);
1857     }
1858 
1859 
1860     private static int normalizedHash(int hash, String s) {
1861         int h = 0;
1862         for (int index = 0; index < s.length(); index++) {
1863             char ch = s.charAt(index);
1864             h = 31 * h + ch;
1865             if (ch == '%') {
1866                 /*
1867                  * Process the next two encoded characters
1868                  */
1869                 for (int i = index + 1; i < index + 3; i++)
1870                     h = 31 * h + toUpper(s.charAt(i));
1871                 index += 2;
1872             }
1873         }
1874         return hash * 127 + h;
1875     }
1876 
1877     // US-ASCII only
1878     private static int hashIgnoringCase(int hash, String s) {
1879         if (s == null) return hash;
1880         int h = hash;
1881         int n = s.length();
1882         for (int i = 0; i < n; i++)
1883             h = 31 * h + toLower(s.charAt(i));
1884         return h;
1885     }
1886 
1887     private static int compare(String s, String t) {
1888         if (s == t) return 0;
1889         if (s != null) {
1890             if (t != null)
1891                 return s.compareTo(t);
1892             else
1893                 return +1;
1894         } else {
1895             return -1;
1896         }
1897     }
1898 
1899     // US-ASCII only
1900     private static int compareIgnoringCase(String s, String t) {
1901         if (s == t) return 0;
1902         if (s != null) {
1903             if (t != null) {
1904                 int sn = s.length();
1905                 int tn = t.length();
1906                 int n = sn < tn ? sn : tn;
1907                 for (int i = 0; i < n; i++) {
1908                     int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1909                     if (c != 0)
1910                         return c;
1911                 }
1912                 return sn - tn;
1913             }
1914             return +1;
1915         } else {
1916             return -1;
1917         }
1918     }
1919 
1920 
1921     // -- String construction --
1922 
1923     // If a scheme is given then the path, if given, must be absolute
1924     //
1925     private static void checkPath(String s, String scheme, String path)
1926         throws URISyntaxException
1927     {
1928         if (scheme != null) {
1929             if ((path != null)
1930                 && ((path.length() > 0) && (path.charAt(0) != '/')))
1931                 throw new URISyntaxException(s,
1932                                              "Relative path in absolute URI");
1933         }
1934     }
1935 
1936     private void appendAuthority(StringBuilder sb,
1937                                  String authority,
1938                                  String userInfo,
1939                                  String host,
1940                                  int port)
1941     {
1942         if (host != null) {
1943             sb.append("//");
1944             if (userInfo != null) {
1945                 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1946                 sb.append('@');
1947             }
1948             boolean needBrackets = ((host.indexOf(':') >= 0)
1949                                     && !host.startsWith("[")
1950                                     && !host.endsWith("]"));
1951             if (needBrackets) sb.append('[');
1952             sb.append(host);
1953             if (needBrackets) sb.append(']');
1954             if (port != -1) {
1955                 sb.append(':');
1956                 sb.append(port);
1957             }
1958         } else if (authority != null) {
1959             sb.append("//");
1960             if (authority.startsWith("[")) {
1961                 // authority should (but may not) contain an embedded IPv6 address
1962                 int end = authority.indexOf(']');
1963                 String doquote = authority, dontquote = "";
1964                 if (end != -1 && authority.indexOf(':') != -1) {
1965                     // the authority contains an IPv6 address
1966                     if (end == authority.length()) {
1967                         dontquote = authority;
1968                         doquote = "";
1969                     } else {
1970                         dontquote = authority.substring(0 , end + 1);
1971                         doquote = authority.substring(end + 1);
1972                     }
1973                 }
1974                 sb.append(dontquote);
1975                 sb.append(quote(doquote,
1976                             L_REG_NAME | L_SERVER,
1977                             H_REG_NAME | H_SERVER));
1978             } else {
1979                 sb.append(quote(authority,
1980                             L_REG_NAME | L_SERVER,
1981                             H_REG_NAME | H_SERVER));
1982             }
1983         }
1984     }
1985 
1986     private void appendSchemeSpecificPart(StringBuilder sb,
1987                                           String opaquePart,
1988                                           String authority,
1989                                           String userInfo,
1990                                           String host,
1991                                           int port,
1992                                           String path,
1993                                           String query)
1994     {
1995         if (opaquePart != null) {
1996             /* check if SSP begins with an IPv6 address
1997              * because we must not quote a literal IPv6 address
1998              */
1999             if (opaquePart.startsWith("//[")) {
2000                 int end =  opaquePart.indexOf(']');
2001                 if (end != -1 && opaquePart.indexOf(':')!=-1) {
2002                     String doquote, dontquote;
2003                     if (end == opaquePart.length()) {
2004                         dontquote = opaquePart;
2005                         doquote = "";
2006                     } else {
2007                         dontquote = opaquePart.substring(0,end+1);
2008                         doquote = opaquePart.substring(end+1);
2009                     }
2010                     sb.append (dontquote);
2011                     sb.append(quote(doquote, L_URIC, H_URIC));
2012                 }
2013             } else {
2014                 sb.append(quote(opaquePart, L_URIC, H_URIC));
2015             }
2016         } else {
2017             appendAuthority(sb, authority, userInfo, host, port);
2018             if (path != null)
2019                 sb.append(quote(path, L_PATH, H_PATH));
2020             if (query != null) {
2021                 sb.append('?');
2022                 sb.append(quote(query, L_URIC, H_URIC));
2023             }
2024         }
2025     }
2026 
2027     private void appendFragment(StringBuilder sb, String fragment) {
2028         if (fragment != null) {
2029             sb.append('#');
2030             sb.append(quote(fragment, L_URIC, H_URIC));
2031         }
2032     }
2033 
2034     private String toString(String scheme,
2035                             String opaquePart,
2036                             String authority,
2037                             String userInfo,
2038                             String host,
2039                             int port,
2040                             String path,
2041                             String query,
2042                             String fragment)
2043     {
2044         StringBuilder sb = new StringBuilder();
2045         if (scheme != null) {
2046             sb.append(scheme);
2047             sb.append(':');
2048         }
2049         appendSchemeSpecificPart(sb, opaquePart,
2050                                  authority, userInfo, host, port,
2051                                  path, query);
2052         appendFragment(sb, fragment);
2053         return sb.toString();
2054     }
2055 
2056     // -- Normalization, resolution, and relativization --
2057 
2058     // RFC2396 5.2 (6)
2059     private static String resolvePath(String base, String child,
2060                                       boolean absolute)
2061     {
2062         int i = base.lastIndexOf('/');
2063         int cn = child.length();
2064         String path = "";
2065 
2066         if (cn == 0) {
2067             // 5.2 (6a)
2068             if (i >= 0)
2069                 path = base.substring(0, i + 1);
2070         } else {
2071             StringBuilder sb = new StringBuilder(base.length() + cn);
2072             // 5.2 (6a)
2073             if (i >= 0)
2074                 sb.append(base, 0, i + 1);
2075             // 5.2 (6b)
2076             sb.append(child);
2077             path = sb.toString();
2078         }
2079 
2080         // 5.2 (6c-f)
2081         String np = normalize(path);
2082 
2083         // 5.2 (6g): If the result is absolute but the path begins with "../",
2084         // then we simply leave the path as-is
2085 
2086         return np;
2087     }
2088 
2089     // RFC2396 5.2
2090     private static URI resolve(URI base, URI child) {
2091         // check if child if opaque first so that NPE is thrown
2092         // if child is null.
2093         if (child.isOpaque() || base.isOpaque())
2094             return child;
2095 
2096         // 5.2 (2): Reference to current document (lone fragment)
2097         if ((child.scheme == null) && (child.authority == null)
2098             && child.path.isEmpty() && (child.fragment != null)
2099             && (child.query == null)) {
2100             if ((base.fragment != null)
2101                 && child.fragment.equals(base.fragment)) {
2102                 return base;
2103             }
2104             URI ru = new URI();
2105             ru.scheme = base.scheme;
2106             ru.authority = base.authority;
2107             ru.userInfo = base.userInfo;
2108             ru.host = base.host;
2109             ru.port = base.port;
2110             ru.path = base.path;
2111             ru.fragment = child.fragment;
2112             ru.query = base.query;
2113             return ru;
2114         }
2115 
2116         // 5.2 (3): Child is absolute
2117         if (child.scheme != null)
2118             return child;
2119 
2120         URI ru = new URI();             // Resolved URI
2121         ru.scheme = base.scheme;
2122         ru.query = child.query;
2123         ru.fragment = child.fragment;
2124 
2125         // 5.2 (4): Authority
2126         if (child.authority == null) {
2127             ru.authority = base.authority;
2128             ru.host = base.host;
2129             ru.userInfo = base.userInfo;
2130             ru.port = base.port;
2131 
2132             String cp = (child.path == null) ? "" : child.path;
2133             if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
2134                 // 5.2 (5): Child path is absolute
2135                 ru.path = child.path;
2136             } else {
2137                 // 5.2 (6): Resolve relative path
2138                 ru.path = resolvePath(base.path, cp, base.isAbsolute());
2139             }
2140         } else {
2141             ru.authority = child.authority;
2142             ru.host = child.host;
2143             ru.userInfo = child.userInfo;
2144             ru.host = child.host;
2145             ru.port = child.port;
2146             ru.path = child.path;
2147         }
2148 
2149         // 5.2 (7): Recombine (nothing to do here)
2150         return ru;
2151     }
2152 
2153     // If the given URI's path is normal then return the URI;
2154     // o.w., return a new URI containing the normalized path.
2155     //
2156     private static URI normalize(URI u) {
2157         if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
2158             return u;
2159 
2160         String np = normalize(u.path);
2161         if (np == u.path)
2162             return u;
2163 
2164         URI v = new URI();
2165         v.scheme = u.scheme;
2166         v.fragment = u.fragment;
2167         v.authority = u.authority;
2168         v.userInfo = u.userInfo;
2169         v.host = u.host;
2170         v.port = u.port;
2171         v.path = np;
2172         v.query = u.query;
2173         return v;
2174     }
2175 
2176     // If both URIs are hierarchical, their scheme and authority components are
2177     // identical, and the base path is a prefix of the child's path, then
2178     // return a relative URI that, when resolved against the base, yields the
2179     // child; otherwise, return the child.
2180     //
2181     private static URI relativize(URI base, URI child) {
2182         // check if child if opaque first so that NPE is thrown
2183         // if child is null.
2184         if (child.isOpaque() || base.isOpaque())
2185             return child;
2186         if (!equalIgnoringCase(base.scheme, child.scheme)
2187             || !equal(base.authority, child.authority))
2188             return child;
2189 
2190         String bp = normalize(base.path);
2191         String cp = normalize(child.path);
2192         if (!bp.equals(cp)) {
2193             if (!bp.endsWith("/"))
2194                 bp = bp + "/";
2195             if (!cp.startsWith(bp))
2196                 return child;
2197         }
2198 
2199         URI v = new URI();
2200         v.path = cp.substring(bp.length());
2201         v.query = child.query;
2202         v.fragment = child.fragment;
2203         return v;
2204     }
2205 
2206 
2207 
2208     // -- Path normalization --
2209 
2210     // The following algorithm for path normalization avoids the creation of a
2211     // string object for each segment, as well as the use of a string buffer to
2212     // compute the final result, by using a single char array and editing it in
2213     // place.  The array is first split into segments, replacing each slash
2214     // with '\0' and creating a segment-index array, each element of which is
2215     // the index of the first char in the corresponding segment.  We then walk
2216     // through both arrays, removing ".", "..", and other segments as necessary
2217     // by setting their entries in the index array to -1.  Finally, the two
2218     // arrays are used to rejoin the segments and compute the final result.
2219     //
2220     // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2221 
2222 
2223     // Check the given path to see if it might need normalization.  A path
2224     // might need normalization if it contains duplicate slashes, a "."
2225     // segment, or a ".." segment.  Return -1 if no further normalization is
2226     // possible, otherwise return the number of segments found.
2227     //
2228     // This method takes a string argument rather than a char array so that
2229     // this test can be performed without invoking path.toCharArray().
2230     //
2231     private static int needsNormalization(String path) {
2232         boolean normal = true;
2233         int ns = 0;                     // Number of segments
2234         int end = path.length() - 1;    // Index of last char in path
2235         int p = 0;                      // Index of next char in path
2236 
2237         // Skip initial slashes
2238         while (p <= end) {
2239             if (path.charAt(p) != '/') break;
2240             p++;
2241         }
2242         if (p > 1) normal = false;
2243 
2244         // Scan segments
2245         while (p <= end) {
2246 
2247             // Looking at "." or ".." ?
2248             if ((path.charAt(p) == '.')
2249                 && ((p == end)
2250                     || ((path.charAt(p + 1) == '/')
2251                         || ((path.charAt(p + 1) == '.')
2252                             && ((p + 1 == end)
2253                                 || (path.charAt(p + 2) == '/')))))) {
2254                 normal = false;
2255             }
2256             ns++;
2257 
2258             // Find beginning of next segment
2259             while (p <= end) {
2260                 if (path.charAt(p++) != '/')
2261                     continue;
2262 
2263                 // Skip redundant slashes
2264                 while (p <= end) {
2265                     if (path.charAt(p) != '/') break;
2266                     normal = false;
2267                     p++;
2268                 }
2269 
2270                 break;
2271             }
2272         }
2273 
2274         return normal ? -1 : ns;
2275     }
2276 
2277 
2278     // Split the given path into segments, replacing slashes with nulls and
2279     // filling in the given segment-index array.
2280     //
2281     // Preconditions:
2282     //   segs.length == Number of segments in path
2283     //
2284     // Postconditions:
2285     //   All slashes in path replaced by '\0'
2286     //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
2287     //
2288     private static void split(char[] path, int[] segs) {
2289         int end = path.length - 1;      // Index of last char in path
2290         int p = 0;                      // Index of next char in path
2291         int i = 0;                      // Index of current segment
2292 
2293         // Skip initial slashes
2294         while (p <= end) {
2295             if (path[p] != '/') break;
2296             path[p] = '\0';
2297             p++;
2298         }
2299 
2300         while (p <= end) {
2301 
2302             // Note start of segment
2303             segs[i++] = p++;
2304 
2305             // Find beginning of next segment
2306             while (p <= end) {
2307                 if (path[p++] != '/')
2308                     continue;
2309                 path[p - 1] = '\0';
2310 
2311                 // Skip redundant slashes
2312                 while (p <= end) {
2313                     if (path[p] != '/') break;
2314                     path[p++] = '\0';
2315                 }
2316                 break;
2317             }
2318         }
2319 
2320         if (i != segs.length)
2321             throw new InternalError();  // ASSERT
2322     }
2323 
2324 
2325     // Join the segments in the given path according to the given segment-index
2326     // array, ignoring those segments whose index entries have been set to -1,
2327     // and inserting slashes as needed.  Return the length of the resulting
2328     // path.
2329     //
2330     // Preconditions:
2331     //   segs[i] == -1 implies segment i is to be ignored
2332     //   path computed by split, as above, with '\0' having replaced '/'
2333     //
2334     // Postconditions:
    //   path[0] .. path[return value - 1] == Resulting path
2336     //
2337     private static int join(char[] path, int[] segs) {
2338         int ns = segs.length;           // Number of segments
2339         int end = path.length - 1;      // Index of last char in path
2340         int p = 0;                      // Index of next path char to write
2341 
2342         if (path[p] == '\0') {
2343             // Restore initial slash for absolute paths
2344             path[p++] = '/';
2345         }
2346 
2347         for (int i = 0; i < ns; i++) {
2348             int q = segs[i];            // Current segment
2349             if (q == -1)
2350                 // Ignore this segment
2351                 continue;
2352 
2353             if (p == q) {
2354                 // We're already at this segment, so just skip to its end
2355                 while ((p <= end) && (path[p] != '\0'))
2356                     p++;
2357                 if (p <= end) {
2358                     // Preserve trailing slash
2359                     path[p++] = '/';
2360                 }
2361             } else if (p < q) {
2362                 // Copy q down to p
2363                 while ((q <= end) && (path[q] != '\0'))
2364                     path[p++] = path[q++];
2365                 if (q <= end) {
2366                     // Preserve trailing slash
2367                     path[p++] = '/';
2368                 }
2369             } else
2370                 throw new InternalError(); // ASSERT false
2371         }
2372 
2373         return p;
2374     }
2375 
2376 
2377     // Remove "." segments from the given path, and remove segment pairs
2378     // consisting of a non-".." segment followed by a ".." segment.
2379     //
2380     private static void removeDots(char[] path, int[] segs) {
2381         int ns = segs.length;
2382         int end = path.length - 1;
2383 
2384         for (int i = 0; i < ns; i++) {
2385             int dots = 0;               // Number of dots found (0, 1, or 2)
2386 
2387             // Find next occurrence of "." or ".."
2388             do {
2389                 int p = segs[i];
2390                 if (path[p] == '.') {
2391                     if (p == end) {
2392                         dots = 1;
2393                         break;
2394                     } else if (path[p + 1] == '\0') {
2395                         dots = 1;
2396                         break;
2397                     } else if ((path[p + 1] == '.')
2398                                && ((p + 1 == end)
2399                                    || (path[p + 2] == '\0'))) {
2400                         dots = 2;
2401                         break;
2402                     }
2403                 }
2404                 i++;
2405             } while (i < ns);
2406             if ((i > ns) || (dots == 0))
2407                 break;
2408 
2409             if (dots == 1) {
2410                 // Remove this occurrence of "."
2411                 segs[i] = -1;
2412             } else {
2413                 // If there is a preceding non-".." segment, remove both that
2414                 // segment and this occurrence of ".."; otherwise, leave this
2415                 // ".." segment as-is.
2416                 int j;
2417                 for (j = i - 1; j >= 0; j--) {
2418                     if (segs[j] != -1) break;
2419                 }
2420                 if (j >= 0) {
2421                     int q = segs[j];
2422                     if (!((path[q] == '.')
2423                           && (path[q + 1] == '.')
2424                           && (path[q + 2] == '\0'))) {
2425                         segs[i] = -1;
2426                         segs[j] = -1;
2427                     }
2428                 }
2429             }
2430         }
2431     }
2432 
2433 
2434     // DEVIATION: If the normalized path is relative, and if the first
2435     // segment could be parsed as a scheme name, then prepend a "." segment
2436     //
2437     private static void maybeAddLeadingDot(char[] path, int[] segs) {
2438 
2439         if (path[0] == '\0')
2440             // The path is absolute
2441             return;
2442 
2443         int ns = segs.length;
2444         int f = 0;                      // Index of first segment
2445         while (f < ns) {
2446             if (segs[f] >= 0)
2447                 break;
2448             f++;
2449         }
2450         if ((f >= ns) || (f == 0))
2451             // The path is empty, or else the original first segment survived,
2452             // in which case we already know that no leading "." is needed
2453             return;
2454 
2455         int p = segs[f];
2456         while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2457         if (p >= path.length || path[p] == '\0')
2458             // No colon in first segment, so no "." needed
2459             return;
2460 
2461         // At this point we know that the first segment is unused,
2462         // hence we can insert a "." segment at that position
2463         path[0] = '.';
2464         path[1] = '\0';
2465         segs[0] = 0;
2466     }
2467 
2468 
2469     // Normalize the given path string.  A normal path string has no empty
2470     // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2471     // segments equal to ".." that are preceded by a segment not equal to "..".
2472     // In contrast to Unix-style pathname normalization, for URI paths we
2473     // always retain trailing slashes.
2474     //
2475     private static String normalize(String ps) {
2476 
2477         // Does this path need normalization?
2478         int ns = needsNormalization(ps);        // Number of segments
2479         if (ns < 0)
2480             // Nope -- just return it
2481             return ps;
2482 
2483         char[] path = ps.toCharArray();         // Path in char-array form
2484 
2485         // Split path into segments
2486         int[] segs = new int[ns];               // Segment-index array
2487         split(path, segs);
2488 
2489         // Remove dots
2490         removeDots(path, segs);
2491 
2492         // Prevent scheme-name confusion
2493         maybeAddLeadingDot(path, segs);
2494 
2495         // Join the remaining segments and return the result
2496         String s = new String(path, 0, join(path, segs));
2497         if (s.equals(ps)) {
2498             // string was already normalized
2499             return ps;
2500         }
2501         return s;
2502     }
2503 
2504 
2505 
2506     // -- Character classes for parsing --
2507 
2508     // RFC2396 precisely specifies which characters in the US-ASCII charset are
2509     // permissible in the various components of a URI reference.  We here
2510     // define a set of mask pairs to aid in enforcing these restrictions.  Each
2511     // mask pair consists of two longs, a low mask and a high mask.  Taken
2512     // together they represent a 128-bit mask, where bit i is set iff the
2513     // character with value i is permitted.
2514     //
2515     // This approach is more efficient than sequentially searching arrays of
2516     // permitted characters.  It could be made still more efficient by
2517     // precompiling the mask information so that a character's presence in a
2518     // given mask could be determined by a single table lookup.
2519 
2520     // Compute the low-order mask for the characters in the given string
2521     private static long lowMask(String chars) {
2522         int n = chars.length();
2523         long m = 0;
2524         for (int i = 0; i < n; i++) {
2525             char c = chars.charAt(i);
2526             if (c < 64)
2527                 m |= (1L << c);
2528         }
2529         return m;
2530     }
2531 
2532     // Compute the high-order mask for the characters in the given string
2533     private static long highMask(String chars) {
2534         int n = chars.length();
2535         long m = 0;
2536         for (int i = 0; i < n; i++) {
2537             char c = chars.charAt(i);
2538             if ((c >= 64) && (c < 128))
2539                 m |= (1L << (c - 64));
2540         }
2541         return m;
2542     }
2543 
2544     // Compute a low-order mask for the characters
2545     // between first and last, inclusive
2546     private static long lowMask(char first, char last) {
2547         long m = 0;
2548         int f = Math.max(Math.min(first, 63), 0);
2549         int l = Math.max(Math.min(last, 63), 0);
2550         for (int i = f; i <= l; i++)
2551             m |= 1L << i;
2552         return m;
2553     }
2554 
2555     // Compute a high-order mask for the characters
2556     // between first and last, inclusive
2557     private static long highMask(char first, char last) {
2558         long m = 0;
2559         int f = Math.max(Math.min(first, 127), 64) - 64;
2560         int l = Math.max(Math.min(last, 127), 64) - 64;
2561         for (int i = f; i <= l; i++)
2562             m |= 1L << i;
2563         return m;
2564     }
2565 
2566     // Tell whether the given character is permitted by the given mask pair
2567     private static boolean match(char c, long lowMask, long highMask) {
2568         if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
2569             return false;
2570         if (c < 64)
2571             return ((1L << c) & lowMask) != 0;
2572         if (c < 128)
2573             return ((1L << (c - 64)) & highMask) != 0;
2574         return false;
2575     }
2576 
2577     // Character-class masks, in reverse order from RFC2396 because
2578     // initializers for static fields cannot make forward references.
2579 
2580     // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
2581     //            "8" | "9"
    private static final long L_DIGIT = lowMask('0', '9');
    private static final long H_DIGIT = 0L;     // All digits are below 64

    // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
    //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
    //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
    private static final long L_UPALPHA = 0L;   // All letters are above 63
    private static final long H_UPALPHA = highMask('A', 'Z');

    // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
    //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
    //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
    private static final long L_LOWALPHA = 0L;  // All letters are above 63
    private static final long H_LOWALPHA = highMask('a', 'z');

    // alpha         = lowalpha | upalpha
    private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
    private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

    // alphanum      = alpha | digit
    private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
    private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

    // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
    //                         "a" | "b" | "c" | "d" | "e" | "f"
    private static final long L_HEX = L_DIGIT;
    private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

    // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
    //                 "(" | ")"
    // The same string is passed to both lowMask and highMask; each picks
    // out only the characters that belong to its half.
    private static final long L_MARK = lowMask("-_.!~*'()");
    private static final long H_MARK = highMask("-_.!~*'()");

    // unreserved    = alphanum | mark
    private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
    private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

    // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
    //                 "$" | "," | "[" | "]"
    // Added per RFC2732: "[", "]"
    private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
    private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

    // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
    // characters are allowed; this is handled by the scanEscape method below.
    // (match never reports character 0 as permitted, so the bit is free to
    // carry this flag.)
    private static final long L_ESCAPED = 1L;
    private static final long H_ESCAPED = 0L;

    // uric          = reserved | unreserved | escaped
    private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
    private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

    // pchar         = unreserved | escaped |
    //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
    private static final long L_PCHAR
        = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
    private static final long H_PCHAR
        = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

    // All valid path characters: pchar plus ";" and "/"
    private static final long L_PATH = L_PCHAR | lowMask(";/");
    private static final long H_PATH = H_PCHAR | highMask(";/");

    // Dash, for use in domainlabel and toplabel
    private static final long L_DASH = lowMask("-");
    private static final long H_DASH = highMask("-");

    // Dot, for use in hostnames
    private static final long L_DOT = lowMask(".");
    private static final long H_DOT = highMask(".");

    // userinfo      = *( unreserved | escaped |
    //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
    private static final long L_USERINFO
        = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
    private static final long H_USERINFO
        = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

    // reg_name      = 1*( unreserved | escaped | "$" | "," |
    //                     ";" | ":" | "@" | "&" | "=" | "+" )
    private static final long L_REG_NAME
        = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
    private static final long H_REG_NAME
        = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

    // All valid characters for server-based authorities
    private static final long L_SERVER
        = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
    private static final long H_SERVER
        = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

    // Special case of server authority that represents an IPv6 address
    // In this case, a % does not signify an escape sequence
    private static final long L_SERVER_PERCENT
        = L_SERVER | lowMask("%");
    private static final long H_SERVER_PERCENT
        = H_SERVER | highMask("%");
    // Left square bracket only
    private static final long L_LEFT_BRACKET = lowMask("[");
    private static final long H_LEFT_BRACKET = highMask("[");

    // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
    private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
    private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

    // scope_id = alpha | digit | "_" | "."
    private static final long L_SCOPE_ID
        = L_ALPHANUM | lowMask("_.");
    private static final long H_SCOPE_ID
        = H_ALPHANUM | highMask("_.");
2691 
2692     // -- Escaping and encoding --
2693 
    // Upper-case hexadecimal digits indexed by nibble value (0-15);
    // appendEscape uses this table to format the two digits of a %XX escape
    private static final char[] hexDigits = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
    };
2698 
2699     private static void appendEscape(StringBuilder sb, byte b) {
2700         sb.append('%');
2701         sb.append(hexDigits[(b >> 4) & 0x0f]);
2702         sb.append(hexDigits[(b >> 0) & 0x0f]);
2703     }
2704 
2705     private static void appendEncoded(StringBuilder sb, char c) {
2706         ByteBuffer bb = null;
2707         try {
2708             bb = ThreadLocalCoders.encoderFor("UTF-8")
2709                 .encode(CharBuffer.wrap("" + c));
2710         } catch (CharacterCodingException x) {
2711             assert false;
2712         }
2713         while (bb.hasRemaining()) {
2714             int b = bb.get() & 0xff;
2715             if (b >= 0x80)
2716                 appendEscape(sb, (byte)b);
2717             else
2718                 sb.append((char)b);
2719         }
2720     }
2721 
2722     // Quote any characters in s that are not permitted
2723     // by the given mask pair
2724     //
2725     private static String quote(String s, long lowMask, long highMask) {
2726         StringBuilder sb = null;
2727         boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2728         for (int i = 0; i < s.length(); i++) {
2729             char c = s.charAt(i);
2730             if (c < '\u0080') {
2731                 if (!match(c, lowMask, highMask)) {
2732                     if (sb == null) {
2733                         sb = new StringBuilder();
2734                         sb.append(s, 0, i);
2735                     }
2736                     appendEscape(sb, (byte)c);
2737                 } else {
2738                     if (sb != null)
2739                         sb.append(c);
2740                 }
2741             } else if (allowNonASCII
2742                        && (Character.isSpaceChar(c)
2743                            || Character.isISOControl(c))) {
2744                 if (sb == null) {
2745                     sb = new StringBuilder();
2746                     sb.append(s, 0, i);
2747                 }
2748                 appendEncoded(sb, c);
2749             } else {
2750                 if (sb != null)
2751                     sb.append(c);
2752             }
2753         }
2754         return (sb == null) ? s : sb.toString();
2755     }
2756 
2757     // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2758     // assuming that s is otherwise legal
2759     //
2760     private static String encode(String s) {
2761         int n = s.length();
2762         if (n == 0)
2763             return s;
2764 
2765         // First check whether we actually need to encode
2766         for (int i = 0;;) {
2767             if (s.charAt(i) >= '\u0080')
2768                 break;
2769             if (++i >= n)
2770                 return s;
2771         }
2772 
2773         String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
2774         ByteBuffer bb = null;
2775         try {
2776             bb = ThreadLocalCoders.encoderFor("UTF-8")
2777                 .encode(CharBuffer.wrap(ns));
2778         } catch (CharacterCodingException x) {
2779             assert false;
2780         }
2781 
2782         StringBuilder sb = new StringBuilder();
2783         while (bb.hasRemaining()) {
2784             int b = bb.get() & 0xff;
2785             if (b >= 0x80)
2786                 appendEscape(sb, (byte)b);
2787             else
2788                 sb.append((char)b);
2789         }
2790         return sb.toString();
2791     }
2792 
2793     private static int decode(char c) {
2794         if ((c >= '0') && (c <= '9'))
2795             return c - '0';
2796         if ((c >= 'a') && (c <= 'f'))
2797             return c - 'a' + 10;
2798         if ((c >= 'A') && (c <= 'F'))
2799             return c - 'A' + 10;
2800         assert false;
2801         return -1;
2802     }
2803 
2804     private static byte decode(char c1, char c2) {
2805         return (byte)(  ((decode(c1) & 0xf) << 4)
2806                       | ((decode(c2) & 0xf) << 0));
2807     }
2808 
2809     // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
2810     // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
2811     // sequence of escaped octets is not valid UTF-8 then the erroneous octets
2812     // are replaced with '\uFFFD'.
2813     // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2814     //            with a scope_id
2815     //
2816     private static String decode(String s) {
2817         return decode(s, true);
2818     }
2819 
2820     // This method was introduced as a generalization of URI.decode method
2821     // to provide a fix for JDK-8037396
2822     private static String decode(String s, boolean ignorePercentInBrackets) {
2823         if (s == null)
2824             return s;
2825         int n = s.length();
2826         if (n == 0)
2827             return s;
2828         if (s.indexOf('%') < 0)
2829             return s;
2830 
2831         StringBuilder sb = new StringBuilder(n);
2832         ByteBuffer bb = ByteBuffer.allocate(n);
2833         CharBuffer cb = CharBuffer.allocate(n);
2834         CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
2835                 .onMalformedInput(CodingErrorAction.REPLACE)
2836                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
2837 
2838         // This is not horribly efficient, but it will do for now
2839         char c = s.charAt(0);
2840         boolean betweenBrackets = false;
2841 
2842         for (int i = 0; i < n;) {
2843             assert c == s.charAt(i);    // Loop invariant
2844             if (c == '[') {
2845                 betweenBrackets = true;
2846             } else if (betweenBrackets && c == ']') {
2847                 betweenBrackets = false;
2848             }
2849             if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) {
2850                 sb.append(c);
2851                 if (++i >= n)
2852                     break;
2853                 c = s.charAt(i);
2854                 continue;
2855             }
2856             bb.clear();
2857             int ui = i;
2858             for (;;) {
2859                 assert (n - i >= 2);
2860                 bb.put(decode(s.charAt(++i), s.charAt(++i)));
2861                 if (++i >= n)
2862                     break;
2863                 c = s.charAt(i);
2864                 if (c != '%')
2865                     break;
2866             }
2867             bb.flip();
2868             cb.clear();
2869             dec.reset();
2870             CoderResult cr = dec.decode(bb, cb, true);
2871             assert cr.isUnderflow();
2872             cr = dec.flush(cb);
2873             assert cr.isUnderflow();
2874             sb.append(cb.flip().toString());
2875         }
2876 
2877         return sb.toString();
2878     }
2879 
2880 
2881     // -- Parsing --
2882 
2883     // For convenience we wrap the input URI string in a new instance of the
2884     // following internal class.  This saves always having to pass the input
2885     // string as an argument to each internal scan/parse method.
2886 
2887     private class Parser {
2888 
2889         private String input;           // URI input string
2890         private boolean requireServerAuthority = false;
2891 
2892         Parser(String s) {
2893             input = s;
2894             string = s;
2895         }
2896 
2897         // -- Methods for throwing URISyntaxException in various ways --
2898 
2899         private void fail(String reason) throws URISyntaxException {
2900             throw new URISyntaxException(input, reason);
2901         }
2902 
2903         private void fail(String reason, int p) throws URISyntaxException {
2904             throw new URISyntaxException(input, reason, p);
2905         }
2906 
2907         private void failExpecting(String expected, int p)
2908             throws URISyntaxException
2909         {
2910             fail("Expected " + expected, p);
2911         }
2912 
2913 
2914         // -- Simple access to the input string --
2915 
2916         // Tells whether start < end and, if so, whether charAt(start) == c
2917         //
2918         private boolean at(int start, int end, char c) {
2919             return (start < end) && (input.charAt(start) == c);
2920         }
2921 
        // Tells whether start + s.length() <= end and, if so,
        // whether the chars at the start position match s exactly
2924         //
2925         private boolean at(int start, int end, String s) {
2926             int p = start;
2927             int sn = s.length();
2928             if (sn > end - p)
2929                 return false;
2930             int i = 0;
2931             while (i < sn) {
2932                 if (input.charAt(p++) != s.charAt(i)) {
2933                     break;
2934                 }
2935                 i++;
2936             }
2937             return (i == sn);
2938         }
2939 
2940 
2941         // -- Scanning --
2942 
2943         // The various scan and parse methods that follow use a uniform
2944         // convention of taking the current start position and end index as
        // their first two arguments.  The start is inclusive while the end is
        // exclusive, just as in the String class, i.e., a start/end pair
        // denotes the half-open interval [start, end) of the input string.
2948         //
2949         // These methods never proceed past the end position.  They may return
2950         // -1 to indicate outright failure, but more often they simply return
2951         // the position of the first char after the last char scanned.  Thus
2952         // a typical idiom is
2953         //
2954         //     int p = start;
2955         //     int q = scan(p, end, ...);
2956         //     if (q > p)
2957         //         // We scanned something
2958         //         ...;
2959         //     else if (q == p)
2960         //         // We scanned nothing
2961         //         ...;
2962         //     else if (q == -1)
2963         //         // Something went wrong
2964         //         ...;
2965 
2966 
2967         // Scan a specific char: If the char at the given start position is
2968         // equal to c, return the index of the next char; otherwise, return the
2969         // start position.
2970         //
2971         private int scan(int start, int end, char c) {
2972             if ((start < end) && (input.charAt(start) == c))
2973                 return start + 1;
2974             return start;
2975         }
2976 
        // Scan forward from the given start position.  Stop at the first char
        // in the err string (in which case -1 is returned), or the first char
        // in the stop string (in which case the index of that char is
        // returned), or the end position (in which case the end position is
        // returned).  May return the start position if the char there is in
        // the stop string or if the interval is empty.
2983         //
2984         private int scan(int start, int end, String err, String stop) {
2985             int p = start;
2986             while (p < end) {
2987                 char c = input.charAt(p);
2988                 if (err.indexOf(c) >= 0)
2989                     return -1;
2990                 if (stop.indexOf(c) >= 0)
2991                     break;
2992                 p++;
2993             }
2994             return p;
2995         }
2996 
        // Scan forward from the given start position.  Stop at the first char
        // in the stop string (in which case the index of that char is
        // returned), or the end position (in which case the end position is
        // returned).  May return the start position if the char there is in
        // the stop string or if the interval is empty.
3002         //
3003         private int scan(int start, int end, String stop) {
3004             int p = start;
3005             while (p < end) {
3006                 char c = input.charAt(p);
3007                 if (stop.indexOf(c) >= 0)
3008                     break;
3009                 p++;
3010             }
3011             return p;
3012         }
3013 
3014         // Scan a potential escape sequence, starting at the given position,
3015         // with the given first char (i.e., charAt(start) == c).
3016         //
3017         // This method assumes that if escapes are allowed then visible
3018         // non-US-ASCII chars are also allowed.
3019         //
3020         private int scanEscape(int start, int n, char first)
3021             throws URISyntaxException
3022         {
3023             int p = start;
3024             char c = first;
3025             if (c == '%') {
3026                 // Process escape pair
3027                 if ((p + 3 <= n)
3028                     && match(input.charAt(p + 1), L_HEX, H_HEX)
3029                     && match(input.charAt(p + 2), L_HEX, H_HEX)) {
3030                     return p + 3;
3031                 }
3032                 fail("Malformed escape pair", p);
3033             } else if ((c > 128)
3034                        && !Character.isSpaceChar(c)
3035                        && !Character.isISOControl(c)) {
3036                 // Allow unescaped but visible non-US-ASCII chars
3037                 return p + 1;
3038             }
3039             return p;
3040         }
3041 
3042         // Scan chars that match the given mask pair
3043         //
3044         private int scan(int start, int n, long lowMask, long highMask)
3045             throws URISyntaxException
3046         {
3047             int p = start;
3048             while (p < n) {
3049                 char c = input.charAt(p);
3050                 if (match(c, lowMask, highMask)) {
3051                     p++;
3052                     continue;
3053                 }
3054                 if ((lowMask & L_ESCAPED) != 0) {
3055                     int q = scanEscape(p, n, c);
3056                     if (q > p) {
3057                         p = q;
3058                         continue;
3059                     }
3060                 }
3061                 break;
3062             }
3063             return p;
3064         }
3065 
3066         // Check that each of the chars in [start, end) matches the given mask
3067         //
3068         private void checkChars(int start, int end,
3069                                 long lowMask, long highMask,
3070                                 String what)
3071             throws URISyntaxException
3072         {
3073             int p = scan(start, end, lowMask, highMask);
3074             if (p < end)
3075                 fail("Illegal character in " + what, p);
3076         }
3077 
3078         // Check that the char at position p matches the given mask
3079         //
3080         private void checkChar(int p,
3081                                long lowMask, long highMask,
3082                                String what)
3083             throws URISyntaxException
3084         {
3085             checkChars(p, p + 1, lowMask, highMask, what);
3086         }
3087 
3088 
3089         // -- Parsing --
3090 
3091         // [<scheme>:]<scheme-specific-part>[#<fragment>]
3092         //
3093         void parse(boolean rsa) throws URISyntaxException {
3094             requireServerAuthority = rsa;
3095             int n = input.length();
3096             int p = scan(0, n, "/?#", ":");
3097             if ((p >= 0) && at(p, n, ':')) {
3098                 if (p == 0)
3099                     failExpecting("scheme name", 0);
3100                 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3101                 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3102                 scheme = input.substring(0, p);
3103                 p++;                    // Skip ':'
3104                 if (at(p, n, '/')) {
3105                     p = parseHierarchical(p, n);
3106                 } else {
3107                     // opaque; need to create the schemeSpecificPart
3108                     int q = scan(p, n, "#");
3109                     if (q <= p)
3110                         failExpecting("scheme-specific part", p);
3111                     checkChars(p, q, L_URIC, H_URIC, "opaque part");
3112                     schemeSpecificPart = input.substring(p, q);
3113                     p = q;
3114                 }
3115             } else {
3116                 p = parseHierarchical(0, n);
3117             }
3118             if (at(p, n, '#')) {
3119                 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3120                 fragment = input.substring(p + 1, n);
3121                 p = n;
3122             }
3123             if (p < n)
3124                 fail("end of URI", p);
3125         }
3126 
3127         // [//authority]<path>[?<query>]
3128         //
3129         // DEVIATION from RFC2396: We allow an empty authority component as
3130         // long as it's followed by a non-empty path, query component, or
3131         // fragment component.  This is so that URIs such as "file:///foo/bar"
3132         // will parse.  This seems to be the intent of RFC2396, though the
3133         // grammar does not permit it.  If the authority is empty then the
3134         // userInfo, host, and port components are undefined.
3135         //
3136         // DEVIATION from RFC2396: We allow empty relative paths.  This seems
3137         // to be the intent of RFC2396, but the grammar does not permit it.
3138         // The primary consequence of this deviation is that "#f" parses as a
3139         // relative URI with an empty path.
3140         //
3141         private int parseHierarchical(int start, int n)
3142             throws URISyntaxException
3143         {
3144             int p = start;
3145             if (at(p, n, '/') && at(p + 1, n, '/')) {
3146                 p += 2;
3147                 int q = scan(p, n, "/?#");
3148                 if (q > p) {
3149                     p = parseAuthority(p, q);
3150                 } else if (q < n) {
3151                     // DEVIATION: Allow empty authority prior to non-empty
3152                     // path, query component or fragment identifier
3153                 } else
3154                     failExpecting("authority", p);
3155             }
3156             int q = scan(p, n, "?#"); // DEVIATION: May be empty
3157             checkChars(p, q, L_PATH, H_PATH, "path");
3158             path = input.substring(p, q);
3159             p = q;
3160             if (at(p, n, '?')) {
3161                 p++;
3162                 q = scan(p, n, "#");
3163                 checkChars(p, q, L_URIC, H_URIC, "query");
3164                 query = input.substring(p, q);
3165                 p = q;
3166             }
3167             return p;
3168         }
3169 
3170         // authority     = server | reg_name
3171         //
3172         // Ambiguity: An authority that is a registry name rather than a server
3173         // might have a prefix that parses as a server.  We use the fact that
3174         // the authority component is always followed by '/' or the end of the
3175         // input string to resolve this: If the complete authority did not
3176         // parse as a server then we try to parse it as a registry name.
3177         //
        private int parseAuthority(int start, int n)
            throws URISyntaxException
        {
            // Parses the authority in [start, n), first as a server-based
            // authority and then, if that is not possible, as a
            // registry-based one.  Sets authority (and, via parseServer,
            // userInfo, host and port).  Returns n on success; throws
            // URISyntaxException if neither form fits.
            int p = start;
            int q = p;
            // Failure of the server-based attempt, kept so that it can be
            // re-thrown if the registry-based interpretation fails as well
            URISyntaxException ex = null;

            // Whether every char of the interval is legal in a server-based
            // (respectively registry-based) authority
            boolean serverChars;
            boolean regChars;

            // NOTE(review): scan(p, n, "]") returns n when there is no ']',
            // so this test is true for any authority that does not begin
            // with ']', not only for those holding an IPv6 literal -- confirm
            // that this is intended
            if (scan(p, n, "]") > p) {
                // contains a literal IPv6 address, therefore % is allowed
                serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
            } else {
                serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
            }
            regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);

            if (regChars && !serverChars) {
                // Must be a registry-based authority
                authority = input.substring(p, n);
                return n;
            }

            if (serverChars) {
                // Might be (probably is) a server-based authority, so attempt
                // to parse it as such.  If the attempt fails, try to treat it
                // as a registry-based authority.
                try {
                    q = parseServer(p, n);
                    if (q < n)
                        failExpecting("end of authority", q);
                    authority = input.substring(p, n);
                } catch (URISyntaxException x) {
                    // Undo results of failed parse
                    userInfo = null;
                    host = null;
                    port = -1;
                    if (requireServerAuthority) {
                        // If we're insisting upon a server-based authority,
                        // then just re-throw the exception
                        throw x;
                    } else {
                        // Save the exception in case it doesn't parse as a
                        // registry either
                        ex = x;
                        // Rewind so that the registry check below runs
                        q = p;
                    }
                }
            }

            // q < n here means that no server-based authority was accepted:
            // either the attempt above failed or it was never made
            if (q < n) {
                if (regChars) {
                    // Registry-based authority
                    authority = input.substring(p, n);
                } else if (ex != null) {
                    // Re-throw exception; it was probably due to
                    // a malformed IPv6 address
                    throw ex;
                } else {
                    fail("Illegal character in authority", q);
                }
            }

            return n;
        }
3244 
3245 
3246         // [<userinfo>@]<host>[:<port>]
3247         //
        private int parseServer(int start, int n)
            throws URISyntaxException
        {
            // Parses [start, n) as a server-based authority, setting
            // userInfo, host and port as the corresponding parts are
            // recognized.  Returns the position after the last char parsed;
            // throws URISyntaxException if the interval is not a legal
            // server-based authority.
            int p = start;
            int q;

            // userinfo
            // (the scan yields -1 if '/', '?', or '#' occurs before any '@',
            // in which case there is no user-information part)
            q = scan(p, n, "/?#", "@");
            if ((q >= p) && at(q, n, '@')) {
                checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
                userInfo = input.substring(p, q);
                p = q + 1;              // Skip '@'
            }

            // hostname, IPv4 address, or IPv6 address
            if (at(p, n, '[')) {
                // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
                p++;
                q = scan(p, n, "/?#", "]");
                if ((q > p) && at(q, n, ']')) {
                    // look for a "%" scope id
                    // (r == q when there is no '%', so the first branch is
                    // also taken for an address without a scope id; the
                    // scope-id check then covers an empty interval)
                    int r = scan (p, q, "%");
                    if (r > p) {
                        parseIPv6Reference(p, r);
                        if (r+1 == q) {
                            fail ("scope id expected");
                        }
                        checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
                                                "scope id");
                    } else {
                        parseIPv6Reference(p, q);
                    }
                    // The host keeps its enclosing square brackets
                    host = input.substring(p-1, q+1);
                    p = q + 1;
                } else {
                    failExpecting("closing bracket for IPv6 address", q);
                }
            } else {
                // Try an IPv4 address first; if nothing was consumed then
                // the host must be a hostname
                q = parseIPv4Address(p, n);
                if (q <= p)
                    q = parseHostname(p, n);
                p = q;
            }

            // port
            if (at(p, n, ':')) {
                p++;
                q = scan(p, n, "/");
                // A ':' with no digits after it is accepted and leaves the
                // port unchanged
                if (q > p) {
                    checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
                    try {
                        port = Integer.parseInt(input, p, q, 10);
                    } catch (NumberFormatException x) {
                        fail("Malformed port number", p);
                    }
                    p = q;
                }
            }
            // Anything left over at this point cannot be part of the server
            if (p < n)
                failExpecting("port number", p);

            return p;
        }
3311 
3312         // Scan a string of decimal digits whose value fits in a byte
3313         //
3314         private int scanByte(int start, int n)
3315             throws URISyntaxException
3316         {
3317             int p = start;
3318             int q = scan(p, n, L_DIGIT, H_DIGIT);
3319             if (q <= p) return q;
3320             if (Integer.parseInt(input, p, q, 10) > 255) return p;
3321             return q;
3322         }
3323 
3324         // Scan an IPv4 address.
3325         //
3326         // If the strict argument is true then we require that the given
3327         // interval contain nothing besides an IPv4 address; if it is false
3328         // then we only require that it start with an IPv4 address.
3329         //
        // If the interval does not contain or start with (depending upon the
        // strict argument) legal IPv4 address characters then we return -1
        // immediately; otherwise we insist that these characters parse as a
        // legal IPv4 address and throw an exception on failure.
3334         //
3335         // We assume that any string of decimal digits and dots must be an IPv4
3336         // address.  It won't parse as a hostname anyway, so making that
3337         // assumption here allows more meaningful exceptions to be thrown.
3338         //
3339         private int scanIPv4Address(int start, int n, boolean strict)
3340             throws URISyntaxException
3341         {
3342             int p = start;
3343             int q;
3344             int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3345             if ((m <= p) || (strict && (m != n)))
3346                 return -1;
3347             for (;;) {
3348                 // Per RFC2732: At most three digits per byte
3349                 // Further constraint: Each element fits in a byte
3350                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3351                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3352                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3353                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3354                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3355                 if ((q = scan(p, m, '.')) <= p) break;  p = q;
3356                 if ((q = scanByte(p, m)) <= p) break;   p = q;
3357                 if (q < m) break;
3358                 return q;
3359             }
3360             fail("Malformed IPv4 address", q);
3361             return -1;
3362         }
3363 
3364         // Take an IPv4 address: Throw an exception if the given interval
3365         // contains anything except an IPv4 address
3366         //
3367         private int takeIPv4Address(int start, int n, String expected)
3368             throws URISyntaxException
3369         {
3370             int p = scanIPv4Address(start, n, true);
3371             if (p <= start)
3372                 failExpecting(expected, start);
3373             return p;
3374         }
3375 
3376         // Attempt to parse an IPv4 address, returning -1 on failure but
3377         // allowing the given interval to contain [:<characters>] after
3378         // the IPv4 address.
3379         //
3380         private int parseIPv4Address(int start, int n) {
3381             int p;
3382 
3383             try {
3384                 p = scanIPv4Address(start, n, false);
3385             } catch (URISyntaxException x) {
3386                 return -1;
3387             } catch (NumberFormatException nfe) {
3388                 return -1;
3389             }
3390 
3391             if (p > start && p < n) {
3392                 // IPv4 address is followed by something - check that
3393                 // it's a ":" as this is the only valid character to
3394                 // follow an address.
3395                 if (input.charAt(p) != ':') {
3396                     p = -1;
3397                 }
3398             }
3399 
3400             if (p > start)
3401                 host = input.substring(start, p);
3402 
3403             return p;
3404         }
3405 
3406         // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3407         // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
3408         // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
3409         //
3410         private int parseHostname(int start, int n)
3411             throws URISyntaxException
3412         {
3413             int p = start;
3414             int q;
3415             int l = -1;                 // Start of last parsed label
3416 
3417             do {
3418                 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3419                 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3420                 if (q <= p)
3421                     break;
3422                 l = p;
3423                 if (q > p) {
3424                     p = q;
3425                     q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3426                     if (q > p) {
3427                         if (input.charAt(q - 1) == '-')
3428                             fail("Illegal character in hostname", q - 1);
3429                         p = q;
3430                     }
3431                 }
3432                 q = scan(p, n, '.');
3433                 if (q <= p)
3434                     break;
3435                 p = q;
3436             } while (p < n);
3437 
3438             if ((p < n) && !at(p, n, ':'))
3439                 fail("Illegal character in hostname", p);
3440 
3441             if (l < 0)
3442                 failExpecting("hostname", start);
3443 
3444             // for a fully qualified hostname check that the rightmost
3445             // label starts with an alpha character.
3446             if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
3447                 fail("Illegal character in hostname", l);
3448             }
3449 
3450             host = input.substring(start, p);
3451             return p;
3452         }
3453 
3454 
3455         // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3456         //
3457         // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3458         // the form ::12.34.56.78, which are clearly shown in the examples
3459         // earlier in the document.  Here is the original grammar:
3460         //
3461         //   IPv6address = hexpart [ ":" IPv4address ]
3462         //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3463         //   hexseq      = hex4 *( ":" hex4)
3464         //   hex4        = 1*4HEXDIG
3465         //
3466         // We therefore use the following revised grammar:
3467         //
3468         //   IPv6address = hexseq [ ":" IPv4address ]
3469         //                 | hexseq [ "::" [ hexpost ] ]
3470         //                 | "::" [ hexpost ]
3471         //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
3472         //   hexseq      = hex4 *( ":" hex4)
3473         //   hex4        = 1*4HEXDIG
3474         //
3475         // This covers all and only the following cases:
3476         //
3477         //   hexseq
3478         //   hexseq : IPv4address
3479         //   hexseq ::
3480         //   hexseq :: hexseq
3481         //   hexseq :: hexseq : IPv4address
3482         //   hexseq :: IPv4address
3483         //   :: hexseq
3484         //   :: hexseq : IPv4address
3485         //   :: IPv4address
3486         //   ::
3487         //
3488         // Additionally we constrain the IPv6 address as follows :-
3489         //
3490         //  i.  IPv6 addresses without compressed zeros should contain
3491         //      exactly 16 bytes.
3492         //
3493         //  ii. IPv6 addresses with compressed zeros should contain
3494         //      less than 16 bytes.
3495 
        // Number of address bytes accounted for so far while parsing an IPv6
        // literal: scanHexSeq adds 2 for each hex group, and 4 is added for
        // each embedded IPv4 address.  Nothing below resets it, so the length
        // checks in parseIPv6Reference rely on it being 0 on entry.
        private int ipv6byteCount = 0;
3497 
3498         private int parseIPv6Reference(int start, int n)
3499             throws URISyntaxException
3500         {
3501             int p = start;
3502             int q;
3503             boolean compressedZeros = false;
3504 
3505             q = scanHexSeq(p, n);
3506 
3507             if (q > p) {
3508                 p = q;
3509                 if (at(p, n, "::")) {
3510                     compressedZeros = true;
3511                     p = scanHexPost(p + 2, n);
3512                 } else if (at(p, n, ':')) {
3513                     p = takeIPv4Address(p + 1,  n, "IPv4 address");
3514                     ipv6byteCount += 4;
3515                 }
3516             } else if (at(p, n, "::")) {
3517                 compressedZeros = true;
3518                 p = scanHexPost(p + 2, n);
3519             }
3520             if (p < n)
3521                 fail("Malformed IPv6 address", start);
3522             if (ipv6byteCount > 16)
3523                 fail("IPv6 address too long", start);
3524             if (!compressedZeros && ipv6byteCount < 16)
3525                 fail("IPv6 address too short", start);
3526             if (compressedZeros && ipv6byteCount == 16)
3527                 fail("Malformed IPv6 address", start);
3528 
3529             return p;
3530         }
3531 
3532         private int scanHexPost(int start, int n)
3533             throws URISyntaxException
3534         {
3535             int p = start;
3536             int q;
3537 
3538             if (p == n)
3539                 return p;
3540 
3541             q = scanHexSeq(p, n);
3542             if (q > p) {
3543                 p = q;
3544                 if (at(p, n, ':')) {
3545                     p++;
3546                     p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3547                     ipv6byteCount += 4;
3548                 }
3549             } else {
3550                 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3551                 ipv6byteCount += 4;
3552             }
3553             return p;
3554         }
3555 
3556         // Scan a hex sequence; return -1 if one could not be scanned
3557         //
3558         private int scanHexSeq(int start, int n)
3559             throws URISyntaxException
3560         {
3561             int p = start;
3562             int q;
3563 
3564             q = scan(p, n, L_HEX, H_HEX);
3565             if (q <= p)
3566                 return -1;
3567             if (at(q, n, '.'))          // Beginning of IPv4 address
3568                 return -1;
3569             if (q > p + 4)
3570                 fail("IPv6 hexadecimal digit sequence too long", p);
3571             ipv6byteCount += 2;
3572             p = q;
3573             while (p < n) {
3574                 if (!at(p, n, ':'))
3575                     break;
3576                 if (at(p + 1, n, ':'))
3577                     break;              // "::"
3578                 p++;
3579                 q = scan(p, n, L_HEX, H_HEX);
3580                 if (q <= p)
3581                     failExpecting("digits for an IPv6 address", p);
3582                 if (at(q, n, '.')) {    // Beginning of IPv4 address
3583                     p--;
3584                     break;
3585                 }
3586                 if (q > p + 4)
3587                     fail("IPv6 hexadecimal digit sequence too long", p);
3588                 ipv6byteCount += 2;
3589                 p = q;
3590             }
3591 
3592             return p;
3593         }
3594 
3595     }
3596     static {
3597         SharedSecrets.setJavaNetUriAccess(
3598             new JavaNetUriAccess() {
3599                 public URI create(String scheme, String path) {
3600                     return new URI(scheme, path);
3601                 }
3602             }
3603         );
3604     }
3605 }