1 /* 2 * Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package java.net; 27 28 import java.io.File; 29 import java.io.IOException; 30 import java.io.InvalidObjectException; 31 import java.io.ObjectInputStream; 32 import java.io.ObjectOutputStream; 33 import java.io.Serializable; 34 import java.nio.ByteBuffer; 35 import java.nio.CharBuffer; 36 import java.nio.charset.CharsetDecoder; 37 import java.nio.charset.CoderResult; 38 import java.nio.charset.CodingErrorAction; 39 import java.nio.charset.CharacterCodingException; 40 import java.nio.file.Path; 41 import java.text.Normalizer; 42 import jdk.internal.access.JavaNetUriAccess; 43 import jdk.internal.access.SharedSecrets; 44 import sun.nio.cs.ThreadLocalCoders; 45 46 import java.lang.Character; // for javadoc 47 import java.lang.NullPointerException; // for javadoc 48 49 50 /** 51 * Represents a Uniform Resource Identifier (URI) reference. 52 * 53 * <p> Aside from some minor deviations noted below, an instance of this 54 * class represents a URI reference as defined by 55 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 56 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a 57 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 58 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format 59 * also supports scope_ids. The syntax and usage of scope_ids is described 60 * <a href="Inet6Address.html#scoped">here</a>. 61 * This class provides constructors for creating URI instances from 62 * their components or by parsing their string forms, methods for accessing the 63 * various components of an instance, and methods for normalizing, resolving, 64 * and relativizing URI instances. Instances of this class are immutable. 
65 * 66 * 67 * <h2> URI syntax and components </h2> 68 * 69 * At the highest level a URI reference (hereinafter simply "URI") in string 70 * form has the syntax 71 * 72 * <blockquote> 73 * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>] 74 * </blockquote> 75 * 76 * where square brackets [...] delineate optional components and the characters 77 * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves. 78 * 79 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is 80 * said to be <i>relative</i>. URIs are also classified according to whether 81 * they are <i>opaque</i> or <i>hierarchical</i>. 82 * 83 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does 84 * not begin with a slash character ({@code '/'}). Opaque URIs are not 85 * subject to further parsing. Some examples of opaque URIs are: 86 * 87 * <blockquote><ul style="list-style-type:none"> 88 * <li>{@code mailto:java-net@www.example.com}</li> 89 * <li>{@code news:comp.lang.java}</li> 90 * <li>{@code urn:isbn:096139210x}</li> 91 * </ul></blockquote> 92 * 93 * <p> A <i>hierarchical</i> URI is either an absolute URI whose 94 * scheme-specific part begins with a slash character, or a relative URI, that 95 * is, a URI that does not specify a scheme. Some examples of hierarchical 96 * URIs are: 97 * 98 * <blockquote> 99 * {@code http://example.com/languages/java/}<br> 100 * {@code sample/a/index.html#28}<br> 101 * {@code ../../demo/b/index.html}<br> 102 * {@code file:///~/calendar} 103 * </blockquote> 104 * 105 * <p> A hierarchical URI is subject to further parsing according to the syntax 106 * 107 * <blockquote> 108 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>] 109 * </blockquote> 110 * 111 * where the characters <b>{@code :}</b>, <b>{@code /}</b>, 112 * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. 
The 113 * scheme-specific part of a hierarchical URI consists of the characters 114 * between the scheme and fragment components. 115 * 116 * <p> The authority component of a hierarchical URI is, if specified, either 117 * <i>server-based</i> or <i>registry-based</i>. A server-based authority 118 * parses according to the familiar syntax 119 * 120 * <blockquote> 121 * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>] 122 * </blockquote> 123 * 124 * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for 125 * themselves. Nearly all URI schemes currently in use are server-based. An 126 * authority component that does not parse in this way is considered to be 127 * registry-based. 128 * 129 * <p> The path component of a hierarchical URI is itself said to be absolute 130 * if it begins with a slash character ({@code '/'}); otherwise it is 131 * relative. The path of a hierarchical URI that is either absolute or 132 * specifies an authority is always absolute. 
133 * 134 * <p> All told, then, a URI instance has the following nine components: 135 * 136 * <table class="striped" style="margin-left:2em"> 137 * <caption style="display:none">Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment</caption> 138 * <thead> 139 * <tr><th scope="col">Component</th><th scope="col">Type</th></tr> 140 * </thead> 141 * <tbody style="text-align:left"> 142 * <tr><th scope="row">scheme</th><td>{@code String}</td></tr> 143 * <tr><th scope="row">scheme-specific-part</th><td>{@code String}</td></tr> 144 * <tr><th scope="row">authority</th><td>{@code String}</td></tr> 145 * <tr><th scope="row">user-info</th><td>{@code String}</td></tr> 146 * <tr><th scope="row">host</th><td>{@code String}</td></tr> 147 * <tr><th scope="row">port</th><td>{@code int}</td></tr> 148 * <tr><th scope="row">path</th><td>{@code String}</td></tr> 149 * <tr><th scope="row">query</th><td>{@code String}</td></tr> 150 * <tr><th scope="row">fragment</th><td>{@code String}</td></tr> 151 * </tbody> 152 * </table> 153 * 154 * In a given instance any particular component is either <i>undefined</i> or 155 * <i>defined</i> with a distinct value. Undefined string components are 156 * represented by {@code null}, while undefined integer components are 157 * represented by {@code -1}. A string component may be defined to have the 158 * empty string as its value; this is not equivalent to that component being 159 * undefined. 160 * 161 * <p> Whether a particular component is or is not defined in an instance 162 * depends upon the type of the URI being represented. An absolute URI has a 163 * scheme component. An opaque URI has a scheme, a scheme-specific part, and 164 * possibly a fragment, but has no other components. A hierarchical URI always 165 * has a path (though it may be empty) and a scheme-specific-part (which at 166 * least contains the path), and may have any of the other components. 
If the 167 * authority component is present and is server-based then the host component 168 * will be defined and the user-information and port components may be defined. 169 * 170 * 171 * <h3> Operations on URI instances </h3> 172 * 173 * The key operations supported by this class are those of 174 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>. 175 * 176 * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."} 177 * and {@code ".."} segments from the path component of a hierarchical URI. 178 * Each {@code "."} segment is simply removed. A {@code ".."} segment is 179 * removed only if it is preceded by a non-{@code ".."} segment. 180 * Normalization has no effect upon opaque URIs. 181 * 182 * <p> <i>Resolution</i> is the process of resolving one URI against another, 183 * <i>base</i> URI. The resulting URI is constructed from components of both 184 * URIs in the manner specified by RFC 2396, taking components from the 185 * base URI for those not specified in the original. For hierarchical URIs, 186 * the path of the original is resolved against the path of the base and then 187 * normalized. The result, for example, of resolving 188 * 189 * <blockquote> 190 * {@code sample/a/index.html#28} 191 * 192 * (1) 193 * </blockquote> 194 * 195 * against the base URI {@code http://example.com/languages/java/} is the result 196 * URI 197 * 198 * <blockquote> 199 * {@code http://example.com/languages/java/sample/a/index.html#28} 200 * </blockquote> 201 * 202 * Resolving the relative URI 203 * 204 * <blockquote> 205 * {@code ../../demo/b/index.html} (2) 206 * </blockquote> 207 * 208 * against this result yields, in turn, 209 * 210 * <blockquote> 211 * {@code http://example.com/languages/java/demo/b/index.html} 212 * </blockquote> 213 * 214 * Resolution of both absolute and relative URIs, and of both absolute and 215 * relative paths in the case of hierarchical URIs, is supported. 
Resolving 216 * the URI {@code file:///~/calendar} against any other URI simply yields the 217 * original URI, since it is absolute. Resolving the relative URI (2) above 218 * against the relative base URI (1) yields the normalized, but still relative, 219 * URI 220 * 221 * <blockquote> 222 * {@code demo/b/index.html} 223 * </blockquote> 224 * 225 * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any 226 * two normalized URIs <i>u</i> and <i>v</i>, 227 * 228 * <blockquote> 229 * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} and<br> 230 * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} .<br> 231 * </blockquote> 232 * 233 * This operation is often useful when constructing a document containing URIs 234 * that must be made relative to the base URI of the document wherever 235 * possible. For example, relativizing the URI 236 * 237 * <blockquote> 238 * {@code http://example.com/languages/java/sample/a/index.html#28} 239 * </blockquote> 240 * 241 * against the base URI 242 * 243 * <blockquote> 244 * {@code http://example.com/languages/java/} 245 * </blockquote> 246 * 247 * yields the relative URI {@code sample/a/index.html#28}. 248 * 249 * 250 * <h3> Character categories </h3> 251 * 252 * RFC 2396 specifies precisely which characters are permitted in the 253 * various components of a URI reference. 
The following categories, most of 254 * which are taken from that specification, are used below to describe these 255 * constraints: 256 * 257 * <table class="striped" style="margin-left:2em"> 258 * <caption style="display:none">Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other</caption> 259 * <thead> 260 * <tr><th scope="col">Category</th><th scope="col">Description</th></tr> 261 * </thead> 262 * <tbody style="text-align:left"> 263 * <tr><th scope="row" style="vertical-align:top">alpha</th> 264 * <td>The US-ASCII alphabetic characters, 265 * {@code 'A'} through {@code 'Z'} 266 * and {@code 'a'} through {@code 'z'}</td></tr> 267 * <tr><th scope="row" style="vertical-align:top">digit</th> 268 * <td>The US-ASCII decimal digit characters, 269 * {@code '0'} through {@code '9'}</td></tr> 270 * <tr><th scope="row" style="vertical-align:top">alphanum</th> 271 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> 272 * <tr><th scope="row" style="vertical-align:top">unreserved</th> 273 * <td>All <i>alphanum</i> characters together with those in the string 274 * {@code "_-!.~'()*"}</td></tr> 275 * <tr><th scope="row" style="vertical-align:top">punct</th> 276 * <td>The characters in the string {@code ",;:$&+="}</td></tr> 277 * <tr><th scope="row" style="vertical-align:top">reserved</th> 278 * <td>All <i>punct</i> characters together with those in the string 279 * {@code "?/[]@"}</td></tr> 280 * <tr><th scope="row" style="vertical-align:top">escaped</th> 281 * <td>Escaped octets, that is, triplets consisting of the percent 282 * character ({@code '%'}) followed by two hexadecimal digits 283 * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and 284 * {@code 'a'}-{@code 'f'})</td></tr> 285 * <tr><th scope="row" style="vertical-align:top">other</th> 286 * <td>The Unicode characters that are not in the US-ASCII character set, 287 * are not control characters (according to the {@link 288 * java.lang.Character#isISOControl(char) 
Character.isISOControl} 289 * method), and are not space characters (according to the {@link 290 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} 291 * method) <i>(<b>Deviation from RFC 2396</b>, which is 292 * limited to US-ASCII)</i></td></tr> 293 * </tbody> 294 * </table> 295 * 296 * <p><a id="legal-chars"></a> The set of all legal URI characters consists of 297 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> 298 * characters. 299 * 300 * 301 * <h3> Escaped octets, quotation, encoding, and decoding </h3> 302 * 303 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and 304 * fragment components. Escaping serves two purposes in URIs: 305 * 306 * <ul> 307 * 308 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to 309 * conform strictly to RFC 2396 by not containing any <i>other</i> 310 * characters. </p></li> 311 * 312 * <li><p> To <i>quote</i> characters that are otherwise illegal in a 313 * component. The user-info, path, query, and fragment components differ 314 * slightly in terms of which characters are considered legal and illegal. 315 * </p></li> 316 * 317 * </ul> 318 * 319 * These purposes are served in this class by three related operations: 320 * 321 * <ul> 322 * 323 * <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it 324 * with the sequence of escaped octets that represent that character in the 325 * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), 326 * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from 327 * RFC 2396</b>, which does not specify any particular character 328 * set.)</i> </p></li> 329 * 330 * <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by 331 * encoding it. The space character, for example, is quoted by replacing it 332 * with {@code "%20"}. 
UTF-8 contains US-ASCII, hence for US-ASCII 333 * characters this transformation has exactly the effect required by 334 * RFC 2396. </p></li> 335 * 336 * <li><p><a id="decode"></a> 337 * A sequence of escaped octets is <i>decoded</i> by 338 * replacing it with the sequence of characters that it represents in the 339 * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the 340 * effect of de-quoting any quoted US-ASCII characters as well as that of 341 * decoding any encoded non-US-ASCII characters. If a <a 342 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs 343 * when decoding the escaped octets then the erroneous octets are replaced by 344 * {@code '\u005CuFFFD'}, the Unicode replacement character. </p></li> 345 * 346 * </ul> 347 * 348 * These operations are exposed in the constructors and methods of this class 349 * as follows: 350 * 351 * <ul> 352 * 353 * <li><p> The {@linkplain #URI(java.lang.String) single-argument 354 * constructor} requires any illegal characters in its argument to be 355 * quoted and preserves any escaped octets and <i>other</i> characters that 356 * are present. </p></li> 357 * 358 * <li><p> The {@linkplain 359 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) 360 * multi-argument constructors} quote illegal characters as 361 * required by the components in which they appear. The percent character 362 * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> 363 * characters are preserved. </p></li> 364 * 365 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() 366 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() 367 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link 368 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the 369 * values of their corresponding components in raw form, without interpreting 370 * any escaped octets. 
The strings returned by these methods may contain 371 * both escaped octets and <i>other</i> characters, and will not contain any 372 * illegal characters. </p></li> 373 * 374 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() 375 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() 376 * getFragment}, {@link #getAuthority() getAuthority}, and {@link 377 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped 378 * octets in their corresponding components. The strings returned by these 379 * methods may contain both <i>other</i> characters and illegal characters, 380 * and will not contain any escaped octets. </p></li> 381 * 382 * <li><p> The {@link #toString() toString} method returns a URI string with 383 * all necessary quotation but which may contain <i>other</i> characters. 384 * </p></li> 385 * 386 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully 387 * quoted and encoded URI string that does not contain any <i>other</i> 388 * characters. </p></li> 389 * 390 * </ul> 391 * 392 * 393 * <h3> Identities </h3> 394 * 395 * For any URI <i>u</i>, it is always the case that 396 * 397 * <blockquote> 398 * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . 
399 * </blockquote> 400 * 401 * For any URI <i>u</i> that does not contain redundant syntax such as two 402 * slashes before an empty authority (as in {@code file:///tmp/} ) or a 403 * colon following a host name but no port (as in 404 * {@code http://www.example.com:} ), and that does not encode characters 405 * except those that must be quoted, the following identities also hold: 406 * <pre> 407 * new URI(<i>u</i>.getScheme(), 408 * <i>u</i>.getSchemeSpecificPart(), 409 * <i>u</i>.getFragment()) 410 * .equals(<i>u</i>)</pre> 411 * in all cases, 412 * <pre> 413 * new URI(<i>u</i>.getScheme(), 414 * <i>u</i>.getAuthority(), 415 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 416 * <i>u</i>.getFragment()) 417 * .equals(<i>u</i>)</pre> 418 * if <i>u</i> is hierarchical, and 419 * <pre> 420 * new URI(<i>u</i>.getScheme(), 421 * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), 422 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 423 * <i>u</i>.getFragment()) 424 * .equals(<i>u</i>)</pre> 425 * if <i>u</i> is hierarchical and has either no authority or a server-based 426 * authority. 427 * 428 * 429 * <h3> URIs, URLs, and URNs </h3> 430 * 431 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform 432 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but 433 * not every URI is a URL. This is because there is another subcategory of 434 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not 435 * specify how to locate them. The {@code mailto}, {@code news}, and 436 * {@code isbn} URIs shown above are examples of URNs. 437 * 438 * <p> The conceptual distinction between URIs and URLs is reflected in the 439 * differences between this class and the {@link URL} class. 440 * 441 * <p> An instance of this class represents a URI reference in the syntactic 442 * sense defined by RFC 2396. A URI may be either absolute or relative. 
443 * A URI string is parsed according to the generic syntax without regard to the 444 * scheme, if any, that it specifies. No lookup of the host, if any, is 445 * performed, and no scheme-dependent stream handler is constructed. Equality, 446 * hashing, and comparison are defined strictly in terms of the character 447 * content of the instance. In other words, a URI instance is little more than 448 * a structured string that supports the syntactic, scheme-independent 449 * operations of comparison, normalization, resolution, and relativization. 450 * 451 * <p> An instance of the {@link URL} class, by contrast, represents the 452 * syntactic components of a URL together with some of the information required 453 * to access the resource that it describes. A URL must be absolute, that is, 454 * it must always specify a scheme. A URL string is parsed according to its 455 * scheme. A stream handler is always established for a URL, and in fact it is 456 * impossible to create a URL instance for a scheme for which no handler is 457 * available. Equality and hashing depend upon both the scheme and the 458 * Internet address of the host, if any; comparison is not defined. In other 459 * words, a URL is a structured string that supports the syntactic operation of 460 * resolution as well as the network I/O operations of looking up the host and 461 * opening a connection to the specified resource. 462 * 463 * @apiNote 464 * 465 * Applications working with file paths and file URIs should take great 466 * care to use the appropriate methods to convert between the two. 467 * The {@link Path#of(URI)} factory method and the {@link File#File(URI)} 468 * constructor can be used to create {@link Path} or {@link File} 469 * objects from a file URI. {@link Path#toUri()} and {@link File#toURI()} 470 * can be used to create a {@link URI} from a file path. 
471 * Applications should never try to {@linkplain 472 * #URI(String, String, String, int, String, String, String) 473 * construct}, {@linkplain #URI(String) parse}, or 474 * {@linkplain #resolve(String) resolve} a {@code URI} 475 * from the direct string representation of a {@code File} or {@code Path} 476 * instance. 477 * <p> 478 * Some components of a URL or URI, such as <i>userinfo</i>, may 479 * be abused to construct misleading URLs or URIs. Applications 480 * that deal with URLs or URIs should take into account 481 * the recommendations advised in <a 482 * href="https://tools.ietf.org/html/rfc3986#section-7">RFC3986, 483 * Section 7, Security Considerations</a>. 484 * 485 * @author Mark Reinhold 486 * @since 1.4 487 * 488 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a 489 * transformation format of ISO 10646</i></a>, <br><a 490 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing 491 * Architecture</i></a>, <br><a 492 * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 493 * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a 494 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 495 * Literal IPv6 Addresses in URLs</i></a>, <br><a 496 * href="URISyntaxException.html">URISyntaxException</a> 497 */ 498 499 public final class URI 500 implements Comparable<URI>, Serializable 501 { 502 503 // Note: Comments containing the word "ASSERT" indicate places where a 504 // throw of an InternalError should be replaced by an appropriate assertion 505 // statement once asserts are enabled in the build. 
// Serialization version identifier for this class.
static final long serialVersionUID = -6052424284110960213L;


// -- Properties and components of this instance --
//
// Every component field below is declared transient; the only instance
// field that is not transient is the string form declared further down.

// Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
private transient String scheme;            // null ==> relative URI
private transient String fragment;

// Hierarchical URI components: [//<authority>]<path>[?<query>]
private transient String authority;         // Registry or server

// Server-based authority: [<userInfo>@]<host>[:<port>]
private transient String userInfo;
private transient String host;              // null ==> registry-based
private transient int port = -1;            // -1 ==> undefined

// Remaining components of hierarchical URIs
private transient String path;              // null ==> opaque
private transient String query;

// The remaining fields may be computed on demand, which is safe even in
// the face of multiple threads racing to initialize them
private transient String schemeSpecificPart;
private transient int hash;                 // Zero ==> undefined

// NOTE(review): presumably these hold the decoded (unescaped) forms of the
// like-named components, filled in on demand -- confirm against the getters.
private transient String decodedUserInfo;
private transient String decodedAuthority;
private transient String decodedPath;
private transient String decodedQuery;
private transient String decodedFragment;
private transient String decodedSchemeSpecificPart;

/**
 * The string form of this URI.
 *
 * @serial
 */
private volatile String string;             // The only serializable field



// -- Constructors and factories --

// Creates an instance with every component left at its field default
// (null strings, port -1).
private URI() { }                           // Used internally

/**
 * Constructs a URI by parsing the given string.
555 * 556 * <p> This constructor parses the given string exactly as specified by the 557 * grammar in <a 558 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 559 * Appendix A, <b><i>except for the following deviations:</i></b> </p> 560 * 561 * <ul> 562 * 563 * <li><p> An empty authority component is permitted as long as it is 564 * followed by a non-empty path, a query component, or a fragment 565 * component. This allows the parsing of URIs such as 566 * {@code "file:///foo/bar"}, which seems to be the intent of 567 * RFC 2396 although the grammar does not permit it. If the 568 * authority component is empty then the user-information, host, and port 569 * components are undefined. </p></li> 570 * 571 * <li><p> Empty relative paths are permitted; this seems to be the 572 * intent of RFC 2396 although the grammar does not permit it. The 573 * primary consequence of this deviation is that a standalone fragment 574 * such as {@code "#foo"} parses as a relative URI with an empty path 575 * and the given fragment, and can be usefully <a 576 * href="#resolve-frag">resolved</a> against a base URI. </p></li> 577 * 578 * <li><p> IPv4 addresses in host components are parsed rigorously, as 579 * specified by <a 580 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each 581 * element of a dotted-quad address must contain no more than three 582 * decimal digits. Each element is further constrained to have a value 583 * no greater than 255. </p></li> 584 * 585 * <li> <p> Hostnames in host components that comprise only a single 586 * domain label are permitted to start with an <i>alphanum</i> 587 * character. This seems to be the intent of <a 588 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 589 * section 3.2.2 although the grammar does not permit it. The 590 * consequence of this deviation is that the authority component of a 591 * hierarchical URI such as {@code s://123} will parse as a server-based 592 * authority. 
</p></li> 593 * 594 * <li><p> IPv6 addresses are permitted for the host component. An IPv6 595 * address must be enclosed in square brackets ({@code '['} and 596 * {@code ']'}) as specified by <a 597 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The 598 * IPv6 address itself must parse according to <a 599 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 600 * addresses are further constrained to describe no more than sixteen 601 * bytes of address information, a constraint implicit in RFC 2373 602 * but not expressible in the grammar. </p></li> 603 * 604 * <li><p> Characters in the <i>other</i> category are permitted wherever 605 * RFC 2396 permits <i>escaped</i> octets, that is, in the 606 * user-information, path, query, and fragment components, as well as in 607 * the authority component if the authority is registry-based. This 608 * allows URIs to contain Unicode characters beyond those in the US-ASCII 609 * character set. </p></li> 610 * 611 * </ul> 612 * 613 * @param str The string to be parsed into a URI 614 * 615 * @throws NullPointerException 616 * If {@code str} is {@code null} 617 * 618 * @throws URISyntaxException 619 * If the given string violates RFC 2396, as augmented 620 * by the above deviations 621 */ 622 public URI(String str) throws URISyntaxException { 623 new Parser(str).parse(false); 624 } 625 626 /** 627 * Constructs a hierarchical URI from the given components. 628 * 629 * <p> If a scheme is given then the path, if also given, must either be 630 * empty or begin with a slash character ({@code '/'}). Otherwise a 631 * component of the new URI may be left undefined by passing {@code null} 632 * for the corresponding parameter or, in the case of the {@code port} 633 * parameter, by passing {@code -1}. 
634 * 635 * <p> This constructor first builds a URI string from the given components 636 * according to the rules specified in <a 637 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 638 * section 5.2, step 7: </p> 639 * 640 * <ol> 641 * 642 * <li><p> Initially, the result string is empty. </p></li> 643 * 644 * <li><p> If a scheme is given then it is appended to the result, 645 * followed by a colon character ({@code ':'}). </p></li> 646 * 647 * <li><p> If user information, a host, or a port are given then the 648 * string {@code "//"} is appended. </p></li> 649 * 650 * <li><p> If user information is given then it is appended, followed by 651 * a commercial-at character ({@code '@'}). Any character not in the 652 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 653 * categories is <a href="#quote">quoted</a>. </p></li> 654 * 655 * <li><p> If a host is given then it is appended. If the host is a 656 * literal IPv6 address but is not enclosed in square brackets 657 * ({@code '['} and {@code ']'}) then the square brackets are added. 658 * </p></li> 659 * 660 * <li><p> If a port number is given then a colon character 661 * ({@code ':'}) is appended, followed by the port number in decimal. 662 * </p></li> 663 * 664 * <li><p> If a path is given then it is appended. Any character not in 665 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 666 * categories, and not equal to the slash character ({@code '/'}) or the 667 * commercial-at character ({@code '@'}), is quoted. </p></li> 668 * 669 * <li><p> If a query is given then a question-mark character 670 * ({@code '?'}) is appended, followed by the query. Any character that 671 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 672 * </p></li> 673 * 674 * <li><p> Finally, if a fragment is given then a hash character 675 * ({@code '#'}) is appended, followed by the fragment. Any character 676 * that is not a legal URI character is quoted. 
</p></li> 677 * 678 * </ol> 679 * 680 * <p> The resulting URI string is then parsed as if by invoking the {@link 681 * #URI(String)} constructor and then invoking the {@link 682 * #parseServerAuthority()} method upon the result; this may cause a {@link 683 * URISyntaxException} to be thrown. </p> 684 * 685 * @param scheme Scheme name 686 * @param userInfo User name and authorization information 687 * @param host Host name 688 * @param port Port number 689 * @param path Path 690 * @param query Query 691 * @param fragment Fragment 692 * 693 * @throws URISyntaxException 694 * If both a scheme and a path are given but the path is relative, 695 * if the URI string constructed from the given components violates 696 * RFC 2396, or if the authority component of the string is 697 * present but cannot be parsed as a server-based authority 698 */ 699 public URI(String scheme, 700 String userInfo, String host, int port, 701 String path, String query, String fragment) 702 throws URISyntaxException 703 { 704 String s = toString(scheme, null, 705 null, userInfo, host, port, 706 path, query, fragment); 707 checkPath(s, scheme, path); 708 new Parser(s).parse(true); 709 } 710 711 /** 712 * Constructs a hierarchical URI from the given components. 713 * 714 * <p> If a scheme is given then the path, if also given, must either be 715 * empty or begin with a slash character ({@code '/'}). Otherwise a 716 * component of the new URI may be left undefined by passing {@code null} 717 * for the corresponding parameter. 718 * 719 * <p> This constructor first builds a URI string from the given components 720 * according to the rules specified in <a 721 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 722 * section 5.2, step 7: </p> 723 * 724 * <ol> 725 * 726 * <li><p> Initially, the result string is empty. </p></li> 727 * 728 * <li><p> If a scheme is given then it is appended to the result, 729 * followed by a colon character ({@code ':'}). 
</p></li> 730 * 731 * <li><p> If an authority is given then the string {@code "//"} is 732 * appended, followed by the authority. If the authority contains a 733 * literal IPv6 address then the address must be enclosed in square 734 * brackets ({@code '['} and {@code ']'}). Any character not in the 735 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 736 * categories, and not equal to the commercial-at character 737 * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> 738 * 739 * <li><p> If a path is given then it is appended. Any character not in 740 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 741 * categories, and not equal to the slash character ({@code '/'}) or the 742 * commercial-at character ({@code '@'}), is quoted. </p></li> 743 * 744 * <li><p> If a query is given then a question-mark character 745 * ({@code '?'}) is appended, followed by the query. Any character that 746 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 747 * </p></li> 748 * 749 * <li><p> Finally, if a fragment is given then a hash character 750 * ({@code '#'}) is appended, followed by the fragment. Any character 751 * that is not a legal URI character is quoted. </p></li> 752 * 753 * </ol> 754 * 755 * <p> The resulting URI string is then parsed as if by invoking the {@link 756 * #URI(String)} constructor and then invoking the {@link 757 * #parseServerAuthority()} method upon the result; this may cause a {@link 758 * URISyntaxException} to be thrown. 
</p> 759 * 760 * @param scheme Scheme name 761 * @param authority Authority 762 * @param path Path 763 * @param query Query 764 * @param fragment Fragment 765 * 766 * @throws URISyntaxException 767 * If both a scheme and a path are given but the path is relative, 768 * if the URI string constructed from the given components violates 769 * RFC 2396, or if the authority component of the string is 770 * present but cannot be parsed as a server-based authority 771 */ 772 public URI(String scheme, 773 String authority, 774 String path, String query, String fragment) 775 throws URISyntaxException 776 { 777 String s = toString(scheme, null, 778 authority, null, null, -1, 779 path, query, fragment); 780 checkPath(s, scheme, path); 781 new Parser(s).parse(false); 782 } 783 784 /** 785 * Constructs a hierarchical URI from the given components. 786 * 787 * <p> A component may be left undefined by passing {@code null}. 788 * 789 * <p> This convenience constructor works as if by invoking the 790 * seven-argument constructor as follows: 791 * 792 * <blockquote> 793 * {@code new} {@link #URI(String, String, String, int, String, String, String) 794 * URI}{@code (scheme, null, host, -1, path, null, fragment);} 795 * </blockquote> 796 * 797 * @param scheme Scheme name 798 * @param host Host name 799 * @param path Path 800 * @param fragment Fragment 801 * 802 * @throws URISyntaxException 803 * If the URI string constructed from the given components 804 * violates RFC 2396 805 */ 806 public URI(String scheme, String host, String path, String fragment) 807 throws URISyntaxException 808 { 809 this(scheme, null, host, -1, path, null, fragment); 810 } 811 812 /** 813 * Constructs a URI from the given components. 814 * 815 * <p> A component may be left undefined by passing {@code null}. 816 * 817 * <p> This constructor first builds a URI in string form using the given 818 * components as follows: </p> 819 * 820 * <ol> 821 * 822 * <li><p> Initially, the result string is empty. 
</p></li> 823 * 824 * <li><p> If a scheme is given then it is appended to the result, 825 * followed by a colon character ({@code ':'}). </p></li> 826 * 827 * <li><p> If a scheme-specific part is given then it is appended. Any 828 * character that is not a <a href="#legal-chars">legal URI character</a> 829 * is <a href="#quote">quoted</a>. </p></li> 830 * 831 * <li><p> Finally, if a fragment is given then a hash character 832 * ({@code '#'}) is appended to the string, followed by the fragment. 833 * Any character that is not a legal URI character is quoted. </p></li> 834 * 835 * </ol> 836 * 837 * <p> The resulting URI string is then parsed in order to create the new 838 * URI instance as if by invoking the {@link #URI(String)} constructor; 839 * this may cause a {@link URISyntaxException} to be thrown. </p> 840 * 841 * @param scheme Scheme name 842 * @param ssp Scheme-specific part 843 * @param fragment Fragment 844 * 845 * @throws URISyntaxException 846 * If the URI string constructed from the given components 847 * violates RFC 2396 848 */ 849 public URI(String scheme, String ssp, String fragment) 850 throws URISyntaxException 851 { 852 new Parser(toString(scheme, ssp, 853 null, null, null, -1, 854 null, null, fragment)) 855 .parse(false); 856 } 857 858 /** 859 * Constructs a simple URI consisting of only a scheme and a pre-validated 860 * path. Provides a fast-path for some internal cases. 861 */ 862 URI(String scheme, String path) { 863 assert validSchemeAndPath(scheme, path); 864 this.scheme = scheme; 865 this.path = path; 866 } 867 868 private static boolean validSchemeAndPath(String scheme, String path) { 869 try { 870 URI u = new URI(scheme + ":" + path); 871 return scheme.equals(u.scheme) && path.equals(u.path); 872 } catch (URISyntaxException e) { 873 return false; 874 } 875 } 876 877 /** 878 * Creates a URI by parsing the given string. 
879 * 880 * <p> This convenience factory method works as if by invoking the {@link 881 * #URI(String)} constructor; any {@link URISyntaxException} thrown by the 882 * constructor is caught and wrapped in a new {@link 883 * IllegalArgumentException} object, which is then thrown. 884 * 885 * <p> This method is provided for use in situations where it is known that 886 * the given string is a legal URI, for example for URI constants declared 887 * within a program, and so it would be considered a programming error 888 * for the string not to parse as such. The constructors, which throw 889 * {@link URISyntaxException} directly, should be used in situations where a 890 * URI is being constructed from user input or from some other source that 891 * may be prone to errors. </p> 892 * 893 * @param str The string to be parsed into a URI 894 * @return The new URI 895 * 896 * @throws NullPointerException 897 * If {@code str} is {@code null} 898 * 899 * @throws IllegalArgumentException 900 * If the given string violates RFC 2396 901 */ 902 public static URI create(String str) { 903 try { 904 return new URI(str); 905 } catch (URISyntaxException x) { 906 throw new IllegalArgumentException(x.getMessage(), x); 907 } 908 } 909 910 911 // -- Operations -- 912 913 /** 914 * Attempts to parse this URI's authority component, if defined, into 915 * user-information, host, and port components. 916 * 917 * <p> If this URI's authority component has already been recognized as 918 * being server-based then it will already have been parsed into 919 * user-information, host, and port components. In this case, or if this 920 * URI has no authority component, this method simply returns this URI. 921 * 922 * <p> Otherwise this method attempts once more to parse the authority 923 * component into user-information, host, and port components, and throws 924 * an exception describing why the authority component could not be parsed 925 * in that way. 
926 * 927 * <p> This method is provided because the generic URI syntax specified in 928 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 929 * cannot always distinguish a malformed server-based authority from a 930 * legitimate registry-based authority. It must therefore treat some 931 * instances of the former as instances of the latter. The authority 932 * component in the URI string {@code "//foo:bar"}, for example, is not a 933 * legal server-based authority but it is legal as a registry-based 934 * authority. 935 * 936 * <p> In many common situations, for example when working URIs that are 937 * known to be either URNs or URLs, the hierarchical URIs being used will 938 * always be server-based. They therefore must either be parsed as such or 939 * treated as an error. In these cases a statement such as 940 * 941 * <blockquote> 942 * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} 943 * </blockquote> 944 * 945 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if 946 * it has an authority component, has a server-based authority with proper 947 * user-information, host, and port components. Invoking this method also 948 * ensures that if the authority could not be parsed in that way then an 949 * appropriate diagnostic message can be issued based upon the exception 950 * that is thrown. </p> 951 * 952 * @return A URI whose authority field has been parsed 953 * as a server-based authority 954 * 955 * @throws URISyntaxException 956 * If the authority component of this URI is defined 957 * but cannot be parsed as a server-based authority 958 * according to RFC 2396 959 */ 960 public URI parseServerAuthority() 961 throws URISyntaxException 962 { 963 // We could be clever and cache the error message and index from the 964 // exception thrown during the original parse, but that would require 965 // either more fields or a more-obscure representation. 
966 if ((host != null) || (authority == null)) 967 return this; 968 new Parser(toString()).parse(true); 969 return this; 970 } 971 972 /** 973 * Normalizes this URI's path. 974 * 975 * <p> If this URI is opaque, or if its path is already in normal form, 976 * then this URI is returned. Otherwise a new URI is constructed that is 977 * identical to this URI except that its path is computed by normalizing 978 * this URI's path in a manner consistent with <a 979 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 980 * section 5.2, step 6, sub-steps c through f; that is: 981 * </p> 982 * 983 * <ol> 984 * 985 * <li><p> All {@code "."} segments are removed. </p></li> 986 * 987 * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} 988 * segment then both of these segments are removed. This step is 989 * repeated until it is no longer applicable. </p></li> 990 * 991 * <li><p> If the path is relative, and if its first segment contains a 992 * colon character ({@code ':'}), then a {@code "."} segment is 993 * prepended. This prevents a relative URI with a path such as 994 * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a 995 * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. 996 * <b><i>(Deviation from RFC 2396)</i></b> </p></li> 997 * 998 * </ol> 999 * 1000 * <p> A normalized path will begin with one or more {@code ".."} segments 1001 * if there were insufficient non-{@code ".."} segments preceding them to 1002 * allow their removal. A normalized path will begin with a {@code "."} 1003 * segment if one was inserted by step 3 above. Otherwise, a normalized 1004 * path will not contain any {@code "."} or {@code ".."} segments. </p> 1005 * 1006 * @return A URI equivalent to this URI, 1007 * but whose path is in normal form 1008 */ 1009 public URI normalize() { 1010 return normalize(this); 1011 } 1012 1013 /** 1014 * Resolves the given URI against this URI. 
1015 * 1016 * <p> If the given URI is already absolute, or if this URI is opaque, then 1017 * the given URI is returned. 1018 * 1019 * <p><a id="resolve-frag"></a> If the given URI's fragment component is 1020 * defined, its path component is empty, and its scheme, authority, and 1021 * query components are undefined, then a URI with the given fragment but 1022 * with all other components equal to those of this URI is returned. This 1023 * allows a URI representing a standalone fragment reference, such as 1024 * {@code "#foo"}, to be usefully resolved against a base URI. 1025 * 1026 * <p> Otherwise this method constructs a new hierarchical URI in a manner 1027 * consistent with <a 1028 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1029 * section 5.2; that is: </p> 1030 * 1031 * <ol> 1032 * 1033 * <li><p> A new URI is constructed with this URI's scheme and the given 1034 * URI's query and fragment components. </p></li> 1035 * 1036 * <li><p> If the given URI has an authority component then the new URI's 1037 * authority and path are taken from the given URI. </p></li> 1038 * 1039 * <li><p> Otherwise the new URI's authority component is copied from 1040 * this URI, and its path is computed as follows: </p> 1041 * 1042 * <ol> 1043 * 1044 * <li><p> If the given URI's path is absolute then the new URI's path 1045 * is taken from the given URI. </p></li> 1046 * 1047 * <li><p> Otherwise the given URI's path is relative, and so the new 1048 * URI's path is computed by resolving the path of the given URI 1049 * against the path of this URI. This is done by concatenating all but 1050 * the last segment of this URI's path, if any, with the given URI's 1051 * path and then normalizing the result as if by invoking the {@link 1052 * #normalize() normalize} method. </p></li> 1053 * 1054 * </ol></li> 1055 * 1056 * </ol> 1057 * 1058 * <p> The result of this method is absolute if, and only if, either this 1059 * URI is absolute or the given URI is absolute. 
</p> 1060 * 1061 * @param uri The URI to be resolved against this URI 1062 * @return The resulting URI 1063 * 1064 * @throws NullPointerException 1065 * If {@code uri} is {@code null} 1066 */ 1067 public URI resolve(URI uri) { 1068 return resolve(this, uri); 1069 } 1070 1071 /** 1072 * Constructs a new URI by parsing the given string and then resolving it 1073 * against this URI. 1074 * 1075 * <p> This convenience method works as if invoking it were equivalent to 1076 * evaluating the expression {@link #resolve(java.net.URI) 1077 * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p> 1078 * 1079 * @param str The string to be parsed into a URI 1080 * @return The resulting URI 1081 * 1082 * @throws NullPointerException 1083 * If {@code str} is {@code null} 1084 * 1085 * @throws IllegalArgumentException 1086 * If the given string violates RFC 2396 1087 */ 1088 public URI resolve(String str) { 1089 return resolve(URI.create(str)); 1090 } 1091 1092 /** 1093 * Relativizes the given URI against this URI. 1094 * 1095 * <p> The relativization of the given URI against this URI is computed as 1096 * follows: </p> 1097 * 1098 * <ol> 1099 * 1100 * <li><p> If either this URI or the given URI are opaque, or if the 1101 * scheme and authority components of the two URIs are not identical, or 1102 * if the path of this URI is not a prefix of the path of the given URI, 1103 * then the given URI is returned. </p></li> 1104 * 1105 * <li><p> Otherwise a new relative hierarchical URI is constructed with 1106 * query and fragment components taken from the given URI and with a path 1107 * component computed by removing this URI's path from the beginning of 1108 * the given URI's path. 
</p></li> 1109 * 1110 * </ol> 1111 * 1112 * @param uri The URI to be relativized against this URI 1113 * @return The resulting URI 1114 * 1115 * @throws NullPointerException 1116 * If {@code uri} is {@code null} 1117 */ 1118 public URI relativize(URI uri) { 1119 return relativize(this, uri); 1120 } 1121 1122 /** 1123 * Constructs a URL from this URI. 1124 * 1125 * <p> This convenience method works as if invoking it were equivalent to 1126 * evaluating the expression {@code new URL(this.toString())} after 1127 * first checking that this URI is absolute. </p> 1128 * 1129 * @return A URL constructed from this URI 1130 * 1131 * @throws IllegalArgumentException 1132 * If this URL is not absolute 1133 * 1134 * @throws MalformedURLException 1135 * If a protocol handler for the URL could not be found, 1136 * or if some other error occurred while constructing the URL 1137 */ 1138 public URL toURL() throws MalformedURLException { 1139 return URL.fromURI(this); 1140 } 1141 1142 // -- Component access methods -- 1143 1144 /** 1145 * Returns the scheme component of this URI. 1146 * 1147 * <p> The scheme component of a URI, if defined, only contains characters 1148 * in the <i>alphanum</i> category and in the string {@code "-.+"}. A 1149 * scheme always starts with an <i>alpha</i> character. <p> 1150 * 1151 * The scheme component of a URI cannot contain escaped octets, hence this 1152 * method does not perform any decoding. 1153 * 1154 * @return The scheme component of this URI, 1155 * or {@code null} if the scheme is undefined 1156 */ 1157 public String getScheme() { 1158 return scheme; 1159 } 1160 1161 /** 1162 * Tells whether or not this URI is absolute. 1163 * 1164 * <p> A URI is absolute if, and only if, it has a scheme component. </p> 1165 * 1166 * @return {@code true} if, and only if, this URI is absolute 1167 */ 1168 public boolean isAbsolute() { 1169 return scheme != null; 1170 } 1171 1172 /** 1173 * Tells whether or not this URI is opaque. 
1174 * 1175 * <p> A URI is opaque if, and only if, it is absolute and its 1176 * scheme-specific part does not begin with a slash character ('/'). 1177 * An opaque URI has a scheme, a scheme-specific part, and possibly 1178 * a fragment; all other components are undefined. </p> 1179 * 1180 * @return {@code true} if, and only if, this URI is opaque 1181 */ 1182 public boolean isOpaque() { 1183 return path == null; 1184 } 1185 1186 /** 1187 * Returns the raw scheme-specific part of this URI. The scheme-specific 1188 * part is never undefined, though it may be empty. 1189 * 1190 * <p> The scheme-specific part of a URI only contains legal URI 1191 * characters. </p> 1192 * 1193 * @return The raw scheme-specific part of this URI 1194 * (never {@code null}) 1195 */ 1196 public String getRawSchemeSpecificPart() { 1197 String part = schemeSpecificPart; 1198 if (part != null) { 1199 return part; 1200 } 1201 1202 String s = string; 1203 if (s != null) { 1204 // if string is defined, components will have been parsed 1205 int start = 0; 1206 int end = s.length(); 1207 if (scheme != null) { 1208 start = scheme.length() + 1; 1209 } 1210 if (fragment != null) { 1211 end -= fragment.length() + 1; 1212 } 1213 if (path != null && path.length() == end - start) { 1214 part = path; 1215 } else { 1216 part = s.substring(start, end); 1217 } 1218 } else { 1219 StringBuilder sb = new StringBuilder(); 1220 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), 1221 host, port, getPath(), getQuery()); 1222 part = sb.toString(); 1223 } 1224 return schemeSpecificPart = part; 1225 } 1226 1227 /** 1228 * Returns the decoded scheme-specific part of this URI. 1229 * 1230 * <p> The string returned by this method is equal to that returned by the 1231 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method 1232 * except that all sequences of escaped octets are <a 1233 * href="#decode">decoded</a>. 
</p> 1234 * 1235 * @return The decoded scheme-specific part of this URI 1236 * (never {@code null}) 1237 */ 1238 public String getSchemeSpecificPart() { 1239 String part = decodedSchemeSpecificPart; 1240 if (part == null) { 1241 decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart()); 1242 } 1243 return part; 1244 } 1245 1246 /** 1247 * Returns the raw authority component of this URI. 1248 * 1249 * <p> The authority component of a URI, if defined, only contains the 1250 * commercial-at character ({@code '@'}) and characters in the 1251 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> 1252 * categories. If the authority is server-based then it is further 1253 * constrained to have valid user-information, host, and port 1254 * components. </p> 1255 * 1256 * @return The raw authority component of this URI, 1257 * or {@code null} if the authority is undefined 1258 */ 1259 public String getRawAuthority() { 1260 return authority; 1261 } 1262 1263 /** 1264 * Returns the decoded authority component of this URI. 1265 * 1266 * <p> The string returned by this method is equal to that returned by the 1267 * {@link #getRawAuthority() getRawAuthority} method except that all 1268 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1269 * 1270 * @return The decoded authority component of this URI, 1271 * or {@code null} if the authority is undefined 1272 */ 1273 public String getAuthority() { 1274 String auth = decodedAuthority; 1275 if ((auth == null) && (authority != null)) { 1276 decodedAuthority = auth = decode(authority); 1277 } 1278 return auth; 1279 } 1280 1281 /** 1282 * Returns the raw user-information component of this URI. 1283 * 1284 * <p> The user-information component of a URI, if defined, only contains 1285 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and 1286 * <i>other</i> categories. 
</p> 1287 * 1288 * @return The raw user-information component of this URI, 1289 * or {@code null} if the user information is undefined 1290 */ 1291 public String getRawUserInfo() { 1292 return userInfo; 1293 } 1294 1295 /** 1296 * Returns the decoded user-information component of this URI. 1297 * 1298 * <p> The string returned by this method is equal to that returned by the 1299 * {@link #getRawUserInfo() getRawUserInfo} method except that all 1300 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1301 * 1302 * @return The decoded user-information component of this URI, 1303 * or {@code null} if the user information is undefined 1304 */ 1305 public String getUserInfo() { 1306 String user = decodedUserInfo; 1307 if ((user == null) && (userInfo != null)) { 1308 decodedUserInfo = user = decode(userInfo); 1309 } 1310 return user; 1311 } 1312 1313 /** 1314 * Returns the host component of this URI. 1315 * 1316 * <p> The host component of a URI, if defined, will have one of the 1317 * following forms: </p> 1318 * 1319 * <ul> 1320 * 1321 * <li><p> A domain name consisting of one or more <i>labels</i> 1322 * separated by period characters ({@code '.'}), optionally followed by 1323 * a period character. Each label consists of <i>alphanum</i> characters 1324 * as well as hyphen characters ({@code '-'}), though hyphens never 1325 * occur as the first or last characters in a label. The rightmost 1326 * label of a domain name consisting of two or more labels, begins 1327 * with an <i>alpha</i> character. </li> 1328 * 1329 * <li><p> A dotted-quad IPv4 address of the form 1330 * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, 1331 * where no <i>digit</i> sequence is longer than three characters and no 1332 * sequence has a value larger than 255. 
</p></li> 1333 * 1334 * <li><p> An IPv6 address enclosed in square brackets ({@code '['} and 1335 * {@code ']'}) and consisting of hexadecimal digits, colon characters 1336 * ({@code ':'}), and possibly an embedded IPv4 address. The full 1337 * syntax of IPv6 addresses is specified in <a 1338 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 1339 * Addressing Architecture</i></a>. </p></li> 1340 * 1341 * </ul> 1342 * 1343 * The host component of a URI cannot contain escaped octets, hence this 1344 * method does not perform any decoding. 1345 * 1346 * @return The host component of this URI, 1347 * or {@code null} if the host is undefined 1348 */ 1349 public String getHost() { 1350 return host; 1351 } 1352 1353 /** 1354 * Returns the port number of this URI. 1355 * 1356 * <p> The port component of a URI, if defined, is a non-negative 1357 * integer. </p> 1358 * 1359 * @return The port component of this URI, 1360 * or {@code -1} if the port is undefined 1361 */ 1362 public int getPort() { 1363 return port; 1364 } 1365 1366 /** 1367 * Returns the raw path component of this URI. 1368 * 1369 * <p> The path component of a URI, if defined, only contains the slash 1370 * character ({@code '/'}), the commercial-at character ({@code '@'}), 1371 * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, 1372 * and <i>other</i> categories. </p> 1373 * 1374 * @return The path component of this URI, 1375 * or {@code null} if the path is undefined 1376 */ 1377 public String getRawPath() { 1378 return path; 1379 } 1380 1381 /** 1382 * Returns the decoded path component of this URI. 1383 * 1384 * <p> The string returned by this method is equal to that returned by the 1385 * {@link #getRawPath() getRawPath} method except that all sequences of 1386 * escaped octets are <a href="#decode">decoded</a>. 
</p> 1387 * 1388 * @return The decoded path component of this URI, 1389 * or {@code null} if the path is undefined 1390 */ 1391 public String getPath() { 1392 String decoded = decodedPath; 1393 if ((decoded == null) && (path != null)) { 1394 decodedPath = decoded = decode(path); 1395 } 1396 return decoded; 1397 } 1398 1399 /** 1400 * Returns the raw query component of this URI. 1401 * 1402 * <p> The query component of a URI, if defined, only contains legal URI 1403 * characters. </p> 1404 * 1405 * @return The raw query component of this URI, 1406 * or {@code null} if the query is undefined 1407 */ 1408 public String getRawQuery() { 1409 return query; 1410 } 1411 1412 /** 1413 * Returns the decoded query component of this URI. 1414 * 1415 * <p> The string returned by this method is equal to that returned by the 1416 * {@link #getRawQuery() getRawQuery} method except that all sequences of 1417 * escaped octets are <a href="#decode">decoded</a>. </p> 1418 * 1419 * @return The decoded query component of this URI, 1420 * or {@code null} if the query is undefined 1421 */ 1422 public String getQuery() { 1423 String decoded = decodedQuery; 1424 if ((decoded == null) && (query != null)) { 1425 decodedQuery = decoded = decode(query, false); 1426 } 1427 return decoded; 1428 } 1429 1430 /** 1431 * Returns the raw fragment component of this URI. 1432 * 1433 * <p> The fragment component of a URI, if defined, only contains legal URI 1434 * characters. </p> 1435 * 1436 * @return The raw fragment component of this URI, 1437 * or {@code null} if the fragment is undefined 1438 */ 1439 public String getRawFragment() { 1440 return fragment; 1441 } 1442 1443 /** 1444 * Returns the decoded fragment component of this URI. 1445 * 1446 * <p> The string returned by this method is equal to that returned by the 1447 * {@link #getRawFragment() getRawFragment} method except that all 1448 * sequences of escaped octets are <a href="#decode">decoded</a>. 
</p> 1449 * 1450 * @return The decoded fragment component of this URI, 1451 * or {@code null} if the fragment is undefined 1452 */ 1453 public String getFragment() { 1454 String decoded = decodedFragment; 1455 if ((decoded == null) && (fragment != null)) { 1456 decodedFragment = decoded = decode(fragment, false); 1457 } 1458 return decoded; 1459 } 1460 1461 1462 // -- Equality, comparison, hash code, toString, and serialization -- 1463 1464 /** 1465 * Tests this URI for equality with another object. 1466 * 1467 * <p> If the given object is not a URI then this method immediately 1468 * returns {@code false}. 1469 * 1470 * <p> For two URIs to be considered equal requires that either both are 1471 * opaque or both are hierarchical. Their schemes must either both be 1472 * undefined or else be equal without regard to case. Their fragments 1473 * must either both be undefined or else be equal. 1474 * 1475 * <p> For two opaque URIs to be considered equal, their scheme-specific 1476 * parts must be equal. 1477 * 1478 * <p> For two hierarchical URIs to be considered equal, their paths must 1479 * be equal and their queries must either both be undefined or else be 1480 * equal. Their authorities must either both be undefined, or both be 1481 * registry-based, or both be server-based. If their authorities are 1482 * defined and are registry-based, then they must be equal. If their 1483 * authorities are defined and are server-based, then their hosts must be 1484 * equal without regard to case, their port numbers must be equal, and 1485 * their user-information components must be equal. 1486 * 1487 * <p> When testing the user-information, path, query, fragment, authority, 1488 * or scheme-specific parts of two URIs for equality, the raw forms rather 1489 * than the encoded forms of these components are compared and the 1490 * hexadecimal digits of escaped octets are compared without regard to 1491 * case. 
1492 * 1493 * <p> This method satisfies the general contract of the {@link 1494 * java.lang.Object#equals(Object) Object.equals} method. </p> 1495 * 1496 * @param ob The object to which this object is to be compared 1497 * 1498 * @return {@code true} if, and only if, the given object is a URI that 1499 * is identical to this URI 1500 */ 1501 public boolean equals(Object ob) { 1502 if (ob == this) 1503 return true; 1504 if (!(ob instanceof URI)) 1505 return false; 1506 URI that = (URI)ob; 1507 if (this.isOpaque() != that.isOpaque()) return false; 1508 if (!equalIgnoringCase(this.scheme, that.scheme)) return false; 1509 if (!equal(this.fragment, that.fragment)) return false; 1510 1511 // Opaque 1512 if (this.isOpaque()) 1513 return equal(this.schemeSpecificPart, that.schemeSpecificPart); 1514 1515 // Hierarchical 1516 if (!equal(this.path, that.path)) return false; 1517 if (!equal(this.query, that.query)) return false; 1518 1519 // Authorities 1520 if (this.authority == that.authority) return true; 1521 if (this.host != null) { 1522 // Server-based 1523 if (!equal(this.userInfo, that.userInfo)) return false; 1524 if (!equalIgnoringCase(this.host, that.host)) return false; 1525 if (this.port != that.port) return false; 1526 } else if (this.authority != null) { 1527 // Registry-based 1528 if (!equal(this.authority, that.authority)) return false; 1529 } else if (this.authority != that.authority) { 1530 return false; 1531 } 1532 1533 return true; 1534 } 1535 1536 /** 1537 * Returns a hash-code value for this URI. The hash code is based upon all 1538 * of the URI's components, and satisfies the general contract of the 1539 * {@link java.lang.Object#hashCode() Object.hashCode} method. 
1540 * 1541 * @return A hash-code value for this URI 1542 */ 1543 public int hashCode() { 1544 int h = hash; 1545 if (h == 0) { 1546 h = hashIgnoringCase(0, scheme); 1547 h = hash(h, fragment); 1548 if (isOpaque()) { 1549 h = hash(h, schemeSpecificPart); 1550 } else { 1551 h = hash(h, path); 1552 h = hash(h, query); 1553 if (host != null) { 1554 h = hash(h, userInfo); 1555 h = hashIgnoringCase(h, host); 1556 h += 1949 * port; 1557 } else { 1558 h = hash(h, authority); 1559 } 1560 } 1561 if (h != 0) { 1562 hash = h; 1563 } 1564 } 1565 return h; 1566 } 1567 1568 /** 1569 * Compares this URI to another object, which must be a URI. 1570 * 1571 * <p> When comparing corresponding components of two URIs, if one 1572 * component is undefined but the other is defined then the first is 1573 * considered to be less than the second. Unless otherwise noted, string 1574 * components are ordered according to their natural, case-sensitive 1575 * ordering as defined by the {@link java.lang.String#compareTo(Object) 1576 * String.compareTo} method. String components that are subject to 1577 * encoding are compared by comparing their raw forms rather than their 1578 * encoded forms. 1579 * 1580 * <p> The ordering of URIs is defined as follows: </p> 1581 * 1582 * <ul> 1583 * 1584 * <li><p> Two URIs with different schemes are ordered according the 1585 * ordering of their schemes, without regard to case. </p></li> 1586 * 1587 * <li><p> A hierarchical URI is considered to be less than an opaque URI 1588 * with an identical scheme. </p></li> 1589 * 1590 * <li><p> Two opaque URIs with identical schemes are ordered according 1591 * to the ordering of their scheme-specific parts. </p></li> 1592 * 1593 * <li><p> Two opaque URIs with identical schemes and scheme-specific 1594 * parts are ordered according to the ordering of their 1595 * fragments. 
</p></li> 1596 * 1597 * <li><p> Two hierarchical URIs with identical schemes are ordered 1598 * according to the ordering of their authority components: </p> 1599 * 1600 * <ul> 1601 * 1602 * <li><p> If both authority components are server-based then the URIs 1603 * are ordered according to their user-information components; if these 1604 * components are identical then the URIs are ordered according to the 1605 * ordering of their hosts, without regard to case; if the hosts are 1606 * identical then the URIs are ordered according to the ordering of 1607 * their ports. </p></li> 1608 * 1609 * <li><p> If one or both authority components are registry-based then 1610 * the URIs are ordered according to the ordering of their authority 1611 * components. </p></li> 1612 * 1613 * </ul></li> 1614 * 1615 * <li><p> Finally, two hierarchical URIs with identical schemes and 1616 * authority components are ordered according to the ordering of their 1617 * paths; if their paths are identical then they are ordered according to 1618 * the ordering of their queries; if the queries are identical then they 1619 * are ordered according to the order of their fragments. </p></li> 1620 * 1621 * </ul> 1622 * 1623 * <p> This method satisfies the general contract of the {@link 1624 * java.lang.Comparable#compareTo(Object) Comparable.compareTo} 1625 * method. 
</p> 1626 * 1627 * @param that 1628 * The object to which this URI is to be compared 1629 * 1630 * @return A negative integer, zero, or a positive integer as this URI is 1631 * less than, equal to, or greater than the given URI 1632 * 1633 * @throws ClassCastException 1634 * If the given object is not a URI 1635 */ 1636 public int compareTo(URI that) { 1637 int c; 1638 1639 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) 1640 return c; 1641 1642 if (this.isOpaque()) { 1643 if (that.isOpaque()) { 1644 // Both opaque 1645 if ((c = compare(this.schemeSpecificPart, 1646 that.schemeSpecificPart)) != 0) 1647 return c; 1648 return compare(this.fragment, that.fragment); 1649 } 1650 return +1; // Opaque > hierarchical 1651 } else if (that.isOpaque()) { 1652 return -1; // Hierarchical < opaque 1653 } 1654 1655 // Hierarchical 1656 if ((this.host != null) && (that.host != null)) { 1657 // Both server-based 1658 if ((c = compare(this.userInfo, that.userInfo)) != 0) 1659 return c; 1660 if ((c = compareIgnoringCase(this.host, that.host)) != 0) 1661 return c; 1662 if ((c = this.port - that.port) != 0) 1663 return c; 1664 } else { 1665 // If one or both authorities are registry-based then we simply 1666 // compare them in the usual, case-sensitive way. If one is 1667 // registry-based and one is server-based then the strings are 1668 // guaranteed to be unequal, hence the comparison will never return 1669 // zero and the compareTo and equals methods will remain 1670 // consistent. 1671 if ((c = compare(this.authority, that.authority)) != 0) return c; 1672 } 1673 1674 if ((c = compare(this.path, that.path)) != 0) return c; 1675 if ((c = compare(this.query, that.query)) != 0) return c; 1676 return compare(this.fragment, that.fragment); 1677 } 1678 1679 /** 1680 * Returns the content of this URI as a string. 
1681 * 1682 * <p> If this URI was created by invoking one of the constructors in this 1683 * class then a string equivalent to the original input string, or to the 1684 * string computed from the originally-given components, as appropriate, is 1685 * returned. Otherwise this URI was created by normalization, resolution, 1686 * or relativization, and so a string is constructed from this URI's 1687 * components according to the rules specified in <a 1688 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1689 * section 5.2, step 7. </p> 1690 * 1691 * @return The string form of this URI 1692 */ 1693 public String toString() { 1694 String s = string; 1695 if (s == null) { 1696 s = defineString(); 1697 } 1698 return s; 1699 } 1700 1701 private String defineString() { 1702 String s = string; 1703 if (s != null) { 1704 return s; 1705 } 1706 1707 StringBuilder sb = new StringBuilder(); 1708 if (scheme != null) { 1709 sb.append(scheme); 1710 sb.append(':'); 1711 } 1712 if (isOpaque()) { 1713 sb.append(schemeSpecificPart); 1714 } else { 1715 if (host != null) { 1716 sb.append("//"); 1717 if (userInfo != null) { 1718 sb.append(userInfo); 1719 sb.append('@'); 1720 } 1721 boolean needBrackets = ((host.indexOf(':') >= 0) 1722 && !host.startsWith("[") 1723 && !host.endsWith("]")); 1724 if (needBrackets) sb.append('['); 1725 sb.append(host); 1726 if (needBrackets) sb.append(']'); 1727 if (port != -1) { 1728 sb.append(':'); 1729 sb.append(port); 1730 } 1731 } else if (authority != null) { 1732 sb.append("//"); 1733 sb.append(authority); 1734 } 1735 if (path != null) 1736 sb.append(path); 1737 if (query != null) { 1738 sb.append('?'); 1739 sb.append(query); 1740 } 1741 } 1742 if (fragment != null) { 1743 sb.append('#'); 1744 sb.append(fragment); 1745 } 1746 return string = sb.toString(); 1747 } 1748 1749 /** 1750 * Returns the content of this URI as a US-ASCII string. 
1751 * 1752 * <p> If this URI does not contain any characters in the <i>other</i> 1753 * category then an invocation of this method will return the same value as 1754 * an invocation of the {@link #toString() toString} method. Otherwise 1755 * this method works as if by invoking that method and then <a 1756 * href="#encode">encoding</a> the result. </p> 1757 * 1758 * @return The string form of this URI, encoded as needed 1759 * so that it only contains characters in the US-ASCII 1760 * charset 1761 */ 1762 public String toASCIIString() { 1763 return encode(toString()); 1764 } 1765 1766 1767 // -- Serialization support -- 1768 1769 /** 1770 * Saves the content of this URI to the given serial stream. 1771 * 1772 * <p> The only serializable field of a URI instance is its {@code string} 1773 * field. That field is given a value, if it does not have one already, 1774 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} 1775 * method of the given object-output stream is invoked. </p> 1776 * 1777 * @param os The object-output stream to which this object 1778 * is to be written 1779 */ 1780 private void writeObject(ObjectOutputStream os) 1781 throws IOException 1782 { 1783 defineString(); 1784 os.defaultWriteObject(); // Writes the string field only 1785 } 1786 1787 /** 1788 * Reconstitutes a URI from the given serial stream. 1789 * 1790 * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is 1791 * invoked to read the value of the {@code string} field. The result is 1792 * then parsed in the usual way. 
1793 * 1794 * @param is The object-input stream from which this object 1795 * is being read 1796 */ 1797 private void readObject(ObjectInputStream is) 1798 throws ClassNotFoundException, IOException 1799 { 1800 port = -1; // Argh 1801 is.defaultReadObject(); 1802 try { 1803 new Parser(string).parse(false); 1804 } catch (URISyntaxException x) { 1805 IOException y = new InvalidObjectException("Invalid URI"); 1806 y.initCause(x); 1807 throw y; 1808 } 1809 } 1810 1811 1812 // -- End of public methods -- 1813 1814 1815 // -- Utility methods for string-field comparison and hashing -- 1816 1817 // These methods return appropriate values for null string arguments, 1818 // thereby simplifying the equals, hashCode, and compareTo methods. 1819 // 1820 // The case-ignoring methods should only be applied to strings whose 1821 // characters are all known to be US-ASCII. Because of this restriction, 1822 // these methods are faster than the similar methods in the String class. 1823 1824 // US-ASCII only 1825 private static int toLower(char c) { 1826 if ((c >= 'A') && (c <= 'Z')) 1827 return c + ('a' - 'A'); 1828 return c; 1829 } 1830 1831 // US-ASCII only 1832 private static int toUpper(char c) { 1833 if ((c >= 'a') && (c <= 'z')) 1834 return c - ('a' - 'A'); 1835 return c; 1836 } 1837 1838 private static boolean equal(String s, String t) { 1839 if (s == t) return true; 1840 if ((s != null) && (t != null)) { 1841 if (s.length() != t.length()) 1842 return false; 1843 if (s.indexOf('%') < 0) 1844 return s.equals(t); 1845 int n = s.length(); 1846 for (int i = 0; i < n;) { 1847 char c = s.charAt(i); 1848 char d = t.charAt(i); 1849 if (c != '%') { 1850 if (c != d) 1851 return false; 1852 i++; 1853 continue; 1854 } 1855 if (d != '%') 1856 return false; 1857 i++; 1858 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1859 return false; 1860 i++; 1861 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1862 return false; 1863 i++; 1864 } 1865 return true; 1866 } 1867 return false; 1868 } 
1869 1870 // US-ASCII only 1871 private static boolean equalIgnoringCase(String s, String t) { 1872 if (s == t) return true; 1873 if ((s != null) && (t != null)) { 1874 int n = s.length(); 1875 if (t.length() != n) 1876 return false; 1877 for (int i = 0; i < n; i++) { 1878 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1879 return false; 1880 } 1881 return true; 1882 } 1883 return false; 1884 } 1885 1886 private static int hash(int hash, String s) { 1887 if (s == null) return hash; 1888 return s.indexOf('%') < 0 ? hash * 127 + s.hashCode() 1889 : normalizedHash(hash, s); 1890 } 1891 1892 1893 private static int normalizedHash(int hash, String s) { 1894 int h = 0; 1895 for (int index = 0; index < s.length(); index++) { 1896 char ch = s.charAt(index); 1897 h = 31 * h + ch; 1898 if (ch == '%') { 1899 /* 1900 * Process the next two encoded characters 1901 */ 1902 for (int i = index + 1; i < index + 3; i++) 1903 h = 31 * h + toUpper(s.charAt(i)); 1904 index += 2; 1905 } 1906 } 1907 return hash * 127 + h; 1908 } 1909 1910 // US-ASCII only 1911 private static int hashIgnoringCase(int hash, String s) { 1912 if (s == null) return hash; 1913 int h = hash; 1914 int n = s.length(); 1915 for (int i = 0; i < n; i++) 1916 h = 31 * h + toLower(s.charAt(i)); 1917 return h; 1918 } 1919 1920 private static int compare(String s, String t) { 1921 if (s == t) return 0; 1922 if (s != null) { 1923 if (t != null) 1924 return s.compareTo(t); 1925 else 1926 return +1; 1927 } else { 1928 return -1; 1929 } 1930 } 1931 1932 // US-ASCII only 1933 private static int compareIgnoringCase(String s, String t) { 1934 if (s == t) return 0; 1935 if (s != null) { 1936 if (t != null) { 1937 int sn = s.length(); 1938 int tn = t.length(); 1939 int n = sn < tn ? 
sn : tn; 1940 for (int i = 0; i < n; i++) { 1941 int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1942 if (c != 0) 1943 return c; 1944 } 1945 return sn - tn; 1946 } 1947 return +1; 1948 } else { 1949 return -1; 1950 } 1951 } 1952 1953 1954 // -- String construction -- 1955 1956 // If a scheme is given then the path, if given, must be absolute 1957 // 1958 private static void checkPath(String s, String scheme, String path) 1959 throws URISyntaxException 1960 { 1961 if (scheme != null) { 1962 if (path != null && !path.isEmpty() && path.charAt(0) != '/') 1963 throw new URISyntaxException(s, "Relative path in absolute URI"); 1964 } 1965 } 1966 1967 private void appendAuthority(StringBuilder sb, 1968 String authority, 1969 String userInfo, 1970 String host, 1971 int port) 1972 { 1973 if (host != null) { 1974 sb.append("//"); 1975 if (userInfo != null) { 1976 sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); 1977 sb.append('@'); 1978 } 1979 boolean needBrackets = ((host.indexOf(':') >= 0) 1980 && !host.startsWith("[") 1981 && !host.endsWith("]")); 1982 if (needBrackets) sb.append('['); 1983 sb.append(host); 1984 if (needBrackets) sb.append(']'); 1985 if (port != -1) { 1986 sb.append(':'); 1987 sb.append(port); 1988 } 1989 } else if (authority != null) { 1990 sb.append("//"); 1991 if (authority.startsWith("[")) { 1992 // authority should (but may not) contain an embedded IPv6 address 1993 int end = authority.indexOf(']'); 1994 String doquote = authority, dontquote = ""; 1995 if (end != -1 && authority.indexOf(':') != -1) { 1996 // the authority contains an IPv6 address 1997 if (end == authority.length()) { 1998 dontquote = authority; 1999 doquote = ""; 2000 } else { 2001 dontquote = authority.substring(0 , end + 1); 2002 doquote = authority.substring(end + 1); 2003 } 2004 } 2005 sb.append(dontquote); 2006 sb.append(quote(doquote, 2007 L_REG_NAME | L_SERVER, 2008 H_REG_NAME | H_SERVER)); 2009 } else { 2010 sb.append(quote(authority, 2011 L_REG_NAME | L_SERVER, 2012 
H_REG_NAME | H_SERVER)); 2013 } 2014 } 2015 } 2016 2017 private void appendSchemeSpecificPart(StringBuilder sb, 2018 String opaquePart, 2019 String authority, 2020 String userInfo, 2021 String host, 2022 int port, 2023 String path, 2024 String query) 2025 { 2026 if (opaquePart != null) { 2027 /* check if SSP begins with an IPv6 address 2028 * because we must not quote a literal IPv6 address 2029 */ 2030 if (opaquePart.startsWith("//[")) { 2031 int end = opaquePart.indexOf(']'); 2032 if (end != -1 && opaquePart.indexOf(':')!=-1) { 2033 String doquote, dontquote; 2034 if (end == opaquePart.length()) { 2035 dontquote = opaquePart; 2036 doquote = ""; 2037 } else { 2038 dontquote = opaquePart.substring(0,end+1); 2039 doquote = opaquePart.substring(end+1); 2040 } 2041 sb.append (dontquote); 2042 sb.append(quote(doquote, L_URIC, H_URIC)); 2043 } 2044 } else { 2045 sb.append(quote(opaquePart, L_URIC, H_URIC)); 2046 } 2047 } else { 2048 appendAuthority(sb, authority, userInfo, host, port); 2049 if (path != null) 2050 sb.append(quote(path, L_PATH, H_PATH)); 2051 if (query != null) { 2052 sb.append('?'); 2053 sb.append(quote(query, L_URIC, H_URIC)); 2054 } 2055 } 2056 } 2057 2058 private void appendFragment(StringBuilder sb, String fragment) { 2059 if (fragment != null) { 2060 sb.append('#'); 2061 sb.append(quote(fragment, L_URIC, H_URIC)); 2062 } 2063 } 2064 2065 private String toString(String scheme, 2066 String opaquePart, 2067 String authority, 2068 String userInfo, 2069 String host, 2070 int port, 2071 String path, 2072 String query, 2073 String fragment) 2074 { 2075 StringBuilder sb = new StringBuilder(); 2076 if (scheme != null) { 2077 sb.append(scheme); 2078 sb.append(':'); 2079 } 2080 appendSchemeSpecificPart(sb, opaquePart, 2081 authority, userInfo, host, port, 2082 path, query); 2083 appendFragment(sb, fragment); 2084 return sb.toString(); 2085 } 2086 2087 // -- Normalization, resolution, and relativization -- 2088 2089 // RFC2396 5.2 (6) 2090 private static String 
resolvePath(String base, String child, 2091 boolean absolute) 2092 { 2093 int i = base.lastIndexOf('/'); 2094 int cn = child.length(); 2095 String path = ""; 2096 2097 if (cn == 0) { 2098 // 5.2 (6a) 2099 if (i >= 0) 2100 path = base.substring(0, i + 1); 2101 } else { 2102 StringBuilder sb = new StringBuilder(base.length() + cn); 2103 // 5.2 (6a) 2104 if (i >= 0) 2105 sb.append(base, 0, i + 1); 2106 // 5.2 (6b) 2107 sb.append(child); 2108 path = sb.toString(); 2109 } 2110 2111 // 5.2 (6c-f) 2112 String np = normalize(path); 2113 2114 // 5.2 (6g): If the result is absolute but the path begins with "../", 2115 // then we simply leave the path as-is 2116 2117 return np; 2118 } 2119 2120 // RFC2396 5.2 2121 private static URI resolve(URI base, URI child) { 2122 // check if child if opaque first so that NPE is thrown 2123 // if child is null. 2124 if (child.isOpaque() || base.isOpaque()) 2125 return child; 2126 2127 // 5.2 (2): Reference to current document (lone fragment) 2128 if ((child.scheme == null) && (child.authority == null) 2129 && child.path.isEmpty() && (child.fragment != null) 2130 && (child.query == null)) { 2131 if ((base.fragment != null) 2132 && child.fragment.equals(base.fragment)) { 2133 return base; 2134 } 2135 URI ru = new URI(); 2136 ru.scheme = base.scheme; 2137 ru.authority = base.authority; 2138 ru.userInfo = base.userInfo; 2139 ru.host = base.host; 2140 ru.port = base.port; 2141 ru.path = base.path; 2142 ru.fragment = child.fragment; 2143 ru.query = base.query; 2144 return ru; 2145 } 2146 2147 // 5.2 (3): Child is absolute 2148 if (child.scheme != null) 2149 return child; 2150 2151 URI ru = new URI(); // Resolved URI 2152 ru.scheme = base.scheme; 2153 ru.query = child.query; 2154 ru.fragment = child.fragment; 2155 2156 // 5.2 (4): Authority 2157 if (child.authority == null) { 2158 ru.authority = base.authority; 2159 ru.host = base.host; 2160 ru.userInfo = base.userInfo; 2161 ru.port = base.port; 2162 2163 String cp = (child.path == null) ? 
"" : child.path; 2164 if (!cp.isEmpty() && cp.charAt(0) == '/') { 2165 // 5.2 (5): Child path is absolute 2166 ru.path = child.path; 2167 } else { 2168 // 5.2 (6): Resolve relative path 2169 ru.path = resolvePath(base.path, cp, base.isAbsolute()); 2170 } 2171 } else { 2172 ru.authority = child.authority; 2173 ru.host = child.host; 2174 ru.userInfo = child.userInfo; 2175 ru.host = child.host; 2176 ru.port = child.port; 2177 ru.path = child.path; 2178 } 2179 2180 // 5.2 (7): Recombine (nothing to do here) 2181 return ru; 2182 } 2183 2184 // If the given URI's path is normal then return the URI; 2185 // o.w., return a new URI containing the normalized path. 2186 // 2187 private static URI normalize(URI u) { 2188 if (u.isOpaque() || u.path == null || u.path.isEmpty()) 2189 return u; 2190 2191 String np = normalize(u.path); 2192 if (np == u.path) 2193 return u; 2194 2195 URI v = new URI(); 2196 v.scheme = u.scheme; 2197 v.fragment = u.fragment; 2198 v.authority = u.authority; 2199 v.userInfo = u.userInfo; 2200 v.host = u.host; 2201 v.port = u.port; 2202 v.path = np; 2203 v.query = u.query; 2204 return v; 2205 } 2206 2207 // If both URIs are hierarchical, their scheme and authority components are 2208 // identical, and the base path is a prefix of the child's path, then 2209 // return a relative URI that, when resolved against the base, yields the 2210 // child; otherwise, return the child. 2211 // 2212 private static URI relativize(URI base, URI child) { 2213 // check if child if opaque first so that NPE is thrown 2214 // if child is null. 
2215 if (child.isOpaque() || base.isOpaque()) 2216 return child; 2217 if (!equalIgnoringCase(base.scheme, child.scheme) 2218 || !equal(base.authority, child.authority)) 2219 return child; 2220 2221 String bp = normalize(base.path); 2222 String cp = normalize(child.path); 2223 if (!bp.equals(cp)) { 2224 if (!bp.endsWith("/")) 2225 bp = bp + "/"; 2226 if (!cp.startsWith(bp)) 2227 return child; 2228 } 2229 2230 URI v = new URI(); 2231 v.path = cp.substring(bp.length()); 2232 v.query = child.query; 2233 v.fragment = child.fragment; 2234 return v; 2235 } 2236 2237 2238 2239 // -- Path normalization -- 2240 2241 // The following algorithm for path normalization avoids the creation of a 2242 // string object for each segment, as well as the use of a string buffer to 2243 // compute the final result, by using a single char array and editing it in 2244 // place. The array is first split into segments, replacing each slash 2245 // with '\0' and creating a segment-index array, each element of which is 2246 // the index of the first char in the corresponding segment. We then walk 2247 // through both arrays, removing ".", "..", and other segments as necessary 2248 // by setting their entries in the index array to -1. Finally, the two 2249 // arrays are used to rejoin the segments and compute the final result. 2250 // 2251 // This code is based upon src/solaris/native/java/io/canonicalize_md.c 2252 2253 2254 // Check the given path to see if it might need normalization. A path 2255 // might need normalization if it contains duplicate slashes, a "." 2256 // segment, or a ".." segment. Return -1 if no further normalization is 2257 // possible, otherwise return the number of segments found. 2258 // 2259 // This method takes a string argument rather than a char array so that 2260 // this test can be performed without invoking path.toCharArray(). 
2261 // 2262 private static int needsNormalization(String path) { 2263 boolean normal = true; 2264 int ns = 0; // Number of segments 2265 int end = path.length() - 1; // Index of last char in path 2266 int p = 0; // Index of next char in path 2267 2268 // Skip initial slashes 2269 while (p <= end) { 2270 if (path.charAt(p) != '/') break; 2271 p++; 2272 } 2273 if (p > 1) normal = false; 2274 2275 // Scan segments 2276 while (p <= end) { 2277 2278 // Looking at "." or ".." ? 2279 if ((path.charAt(p) == '.') 2280 && ((p == end) 2281 || ((path.charAt(p + 1) == '/') 2282 || ((path.charAt(p + 1) == '.') 2283 && ((p + 1 == end) 2284 || (path.charAt(p + 2) == '/')))))) { 2285 normal = false; 2286 } 2287 ns++; 2288 2289 // Find beginning of next segment 2290 while (p <= end) { 2291 if (path.charAt(p++) != '/') 2292 continue; 2293 2294 // Skip redundant slashes 2295 while (p <= end) { 2296 if (path.charAt(p) != '/') break; 2297 normal = false; 2298 p++; 2299 } 2300 2301 break; 2302 } 2303 } 2304 2305 return normal ? -1 : ns; 2306 } 2307 2308 2309 // Split the given path into segments, replacing slashes with nulls and 2310 // filling in the given segment-index array. 
2311 // 2312 // Preconditions: 2313 // segs.length == Number of segments in path 2314 // 2315 // Postconditions: 2316 // All slashes in path replaced by '\0' 2317 // segs[i] == Index of first char in segment i (0 <= i < segs.length) 2318 // 2319 private static void split(char[] path, int[] segs) { 2320 int end = path.length - 1; // Index of last char in path 2321 int p = 0; // Index of next char in path 2322 int i = 0; // Index of current segment 2323 2324 // Skip initial slashes 2325 while (p <= end) { 2326 if (path[p] != '/') break; 2327 path[p] = '\0'; 2328 p++; 2329 } 2330 2331 while (p <= end) { 2332 2333 // Note start of segment 2334 segs[i++] = p++; 2335 2336 // Find beginning of next segment 2337 while (p <= end) { 2338 if (path[p++] != '/') 2339 continue; 2340 path[p - 1] = '\0'; 2341 2342 // Skip redundant slashes 2343 while (p <= end) { 2344 if (path[p] != '/') break; 2345 path[p++] = '\0'; 2346 } 2347 break; 2348 } 2349 } 2350 2351 if (i != segs.length) 2352 throw new InternalError(); // ASSERT 2353 } 2354 2355 2356 // Join the segments in the given path according to the given segment-index 2357 // array, ignoring those segments whose index entries have been set to -1, 2358 // and inserting slashes as needed. Return the length of the resulting 2359 // path. 2360 // 2361 // Preconditions: 2362 // segs[i] == -1 implies segment i is to be ignored 2363 // path computed by split, as above, with '\0' having replaced '/' 2364 // 2365 // Postconditions: 2366 // path[0] .. 
path[return value] == Resulting path 2367 // 2368 private static int join(char[] path, int[] segs) { 2369 int ns = segs.length; // Number of segments 2370 int end = path.length - 1; // Index of last char in path 2371 int p = 0; // Index of next path char to write 2372 2373 if (path[p] == '\0') { 2374 // Restore initial slash for absolute paths 2375 path[p++] = '/'; 2376 } 2377 2378 for (int i = 0; i < ns; i++) { 2379 int q = segs[i]; // Current segment 2380 if (q == -1) 2381 // Ignore this segment 2382 continue; 2383 2384 if (p == q) { 2385 // We're already at this segment, so just skip to its end 2386 while ((p <= end) && (path[p] != '\0')) 2387 p++; 2388 if (p <= end) { 2389 // Preserve trailing slash 2390 path[p++] = '/'; 2391 } 2392 } else if (p < q) { 2393 // Copy q down to p 2394 while ((q <= end) && (path[q] != '\0')) 2395 path[p++] = path[q++]; 2396 if (q <= end) { 2397 // Preserve trailing slash 2398 path[p++] = '/'; 2399 } 2400 } else 2401 throw new InternalError(); // ASSERT false 2402 } 2403 2404 return p; 2405 } 2406 2407 2408 // Remove "." segments from the given path, and remove segment pairs 2409 // consisting of a non-".." segment followed by a ".." segment. 2410 // 2411 private static void removeDots(char[] path, int[] segs) { 2412 int ns = segs.length; 2413 int end = path.length - 1; 2414 2415 for (int i = 0; i < ns; i++) { 2416 int dots = 0; // Number of dots found (0, 1, or 2) 2417 2418 // Find next occurrence of "." or ".." 2419 do { 2420 int p = segs[i]; 2421 if (path[p] == '.') { 2422 if (p == end) { 2423 dots = 1; 2424 break; 2425 } else if (path[p + 1] == '\0') { 2426 dots = 1; 2427 break; 2428 } else if ((path[p + 1] == '.') 2429 && ((p + 1 == end) 2430 || (path[p + 2] == '\0'))) { 2431 dots = 2; 2432 break; 2433 } 2434 } 2435 i++; 2436 } while (i < ns); 2437 if ((i > ns) || (dots == 0)) 2438 break; 2439 2440 if (dots == 1) { 2441 // Remove this occurrence of "." 2442 segs[i] = -1; 2443 } else { 2444 // If there is a preceding non-".." 
segment, remove both that 2445 // segment and this occurrence of ".."; otherwise, leave this 2446 // ".." segment as-is. 2447 int j; 2448 for (j = i - 1; j >= 0; j--) { 2449 if (segs[j] != -1) break; 2450 } 2451 if (j >= 0) { 2452 int q = segs[j]; 2453 if (!((path[q] == '.') 2454 && (path[q + 1] == '.') 2455 && (path[q + 2] == '\0'))) { 2456 segs[i] = -1; 2457 segs[j] = -1; 2458 } 2459 } 2460 } 2461 } 2462 } 2463 2464 2465 // DEVIATION: If the normalized path is relative, and if the first 2466 // segment could be parsed as a scheme name, then prepend a "." segment 2467 // 2468 private static void maybeAddLeadingDot(char[] path, int[] segs) { 2469 2470 if (path[0] == '\0') 2471 // The path is absolute 2472 return; 2473 2474 int ns = segs.length; 2475 int f = 0; // Index of first segment 2476 while (f < ns) { 2477 if (segs[f] >= 0) 2478 break; 2479 f++; 2480 } 2481 if ((f >= ns) || (f == 0)) 2482 // The path is empty, or else the original first segment survived, 2483 // in which case we already know that no leading "." is needed 2484 return; 2485 2486 int p = segs[f]; 2487 while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++; 2488 if (p >= path.length || path[p] == '\0') 2489 // No colon in first segment, so no "." needed 2490 return; 2491 2492 // At this point we know that the first segment is unused, 2493 // hence we can insert a "." segment at that position 2494 path[0] = '.'; 2495 path[1] = '\0'; 2496 segs[0] = 0; 2497 } 2498 2499 2500 // Normalize the given path string. A normal path string has no empty 2501 // segments (i.e., occurrences of "//"), no segments equal to ".", and no 2502 // segments equal to ".." that are preceded by a segment not equal to "..". 2503 // In contrast to Unix-style pathname normalization, for URI paths we 2504 // always retain trailing slashes. 2505 // 2506 private static String normalize(String ps) { 2507 2508 // Does this path need normalization? 
2509 int ns = needsNormalization(ps); // Number of segments 2510 if (ns < 0) 2511 // Nope -- just return it 2512 return ps; 2513 2514 char[] path = ps.toCharArray(); // Path in char-array form 2515 2516 // Split path into segments 2517 int[] segs = new int[ns]; // Segment-index array 2518 split(path, segs); 2519 2520 // Remove dots 2521 removeDots(path, segs); 2522 2523 // Prevent scheme-name confusion 2524 maybeAddLeadingDot(path, segs); 2525 2526 // Join the remaining segments and return the result 2527 String s = new String(path, 0, join(path, segs)); 2528 if (s.equals(ps)) { 2529 // string was already normalized 2530 return ps; 2531 } 2532 return s; 2533 } 2534 2535 2536 2537 // -- Character classes for parsing -- 2538 2539 // RFC2396 precisely specifies which characters in the US-ASCII charset are 2540 // permissible in the various components of a URI reference. We here 2541 // define a set of mask pairs to aid in enforcing these restrictions. Each 2542 // mask pair consists of two longs, a low mask and a high mask. Taken 2543 // together they represent a 128-bit mask, where bit i is set iff the 2544 // character with value i is permitted. 2545 // 2546 // This approach is more efficient than sequentially searching arrays of 2547 // permitted characters. It could be made still more efficient by 2548 // precompiling the mask information so that a character's presence in a 2549 // given mask could be determined by a single table lookup. 2550 2551 // To save startup time, we manually calculate the low-/highMask constants. 
2552 // For reference, the following methods were used to calculate the values: 2553 2554 // Compute the low-order mask for the characters in the given string 2555 // private static long lowMask(String chars) { 2556 // int n = chars.length(); 2557 // long m = 0; 2558 // for (int i = 0; i < n; i++) { 2559 // char c = chars.charAt(i); 2560 // if (c < 64) 2561 // m |= (1L << c); 2562 // } 2563 // return m; 2564 // } 2565 2566 // Compute the high-order mask for the characters in the given string 2567 // private static long highMask(String chars) { 2568 // int n = chars.length(); 2569 // long m = 0; 2570 // for (int i = 0; i < n; i++) { 2571 // char c = chars.charAt(i); 2572 // if ((c >= 64) && (c < 128)) 2573 // m |= (1L << (c - 64)); 2574 // } 2575 // return m; 2576 // } 2577 2578 // Compute a low-order mask for the characters 2579 // between first and last, inclusive 2580 // private static long lowMask(char first, char last) { 2581 // long m = 0; 2582 // int f = Math.max(Math.min(first, 63), 0); 2583 // int l = Math.max(Math.min(last, 63), 0); 2584 // for (int i = f; i <= l; i++) 2585 // m |= 1L << i; 2586 // return m; 2587 // } 2588 2589 // Compute a high-order mask for the characters 2590 // between first and last, inclusive 2591 // private static long highMask(char first, char last) { 2592 // long m = 0; 2593 // int f = Math.max(Math.min(first, 127), 64) - 64; 2594 // int l = Math.max(Math.min(last, 127), 64) - 64; 2595 // for (int i = f; i <= l; i++) 2596 // m |= 1L << i; 2597 // return m; 2598 // } 2599 2600 // Tell whether the given character is permitted by the given mask pair 2601 private static boolean match(char c, long lowMask, long highMask) { 2602 if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. 
2603 return false; 2604 if (c < 64) 2605 return ((1L << c) & lowMask) != 0; 2606 if (c < 128) 2607 return ((1L << (c - 64)) & highMask) != 0; 2608 return false; 2609 } 2610 2611 // Character-class masks, in reverse order from RFC2396 because 2612 // initializers for static fields cannot make forward references. 2613 2614 // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | 2615 // "8" | "9" 2616 private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9'); 2617 private static final long H_DIGIT = 0L; 2618 2619 // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | 2620 // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | 2621 // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" 2622 private static final long L_UPALPHA = 0L; 2623 private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z'); 2624 2625 // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | 2626 // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | 2627 // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" 2628 private static final long L_LOWALPHA = 0L; 2629 private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z'); 2630 2631 // alpha = lowalpha | upalpha 2632 private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA; 2633 private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA; 2634 2635 // alphanum = alpha | digit 2636 private static final long L_ALPHANUM = L_DIGIT | L_ALPHA; 2637 private static final long H_ALPHANUM = H_DIGIT | H_ALPHA; 2638 2639 // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | 2640 // "a" | "b" | "c" | "d" | "e" | "f" 2641 private static final long L_HEX = L_DIGIT; 2642 private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f'); 2643 2644 // mark = "-" | "_" | "." | "!" 
//                 | "~" | "*" | "'" |
//                 "(" | ")"
//
// Layout shared by every L_*/H_* pair in this section: bit c of the L_ mask
// stands for the char with value c when c is below 64, and bit (c - 64) of
// the H_ mask stands for the char with value c when c is in 64..127.  The
// trailing comment on each literal records the lowMask(...)/highMask(...)
// expression the literal is annotated as being equal to.
private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");
private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");

// unreserved    = alphanum | mark
private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

// reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
//                 "$" | "," | "[" | "]"
// Added per RFC2732: "[", "]"
private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");
private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");

// The zero'th bit is used to indicate that escape pairs and non-US-ASCII
// characters are allowed; this is handled by the scanEscape method below.
// (Bit 0 is free for this purpose because it would otherwise stand for the
// NUL char.)
private static final long L_ESCAPED = 1L;
private static final long H_ESCAPED = 0L;

// uric          = reserved | unreserved | escaped
private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

// pchar         = unreserved | escaped |
//                 ":" | "@" | "&" | "=" | "+" | "$" | ","
private static final long L_PCHAR
    = L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");
private static final long H_PCHAR
    = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");

// All valid path characters
private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");
private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;

// Dash, for use in domainlabel and toplabel
private static final long L_DASH = 0x200000000000L; // lowMask("-");
private static final long H_DASH = 0x0L; // highMask("-");

// Dot, for use in hostnames
private static final long L_DOT = 0x400000000000L; // lowMask(".");
private static final long H_DOT = 0x0L; // highMask(".");

// userinfo      = *( unreserved | escaped |
//                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
private static final long L_USERINFO
    = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");
private static final long H_USERINFO
    = H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;

// reg_name      = 1*( unreserved | escaped | "$" | "," |
//                     ";" | ":" | "@" | "&" | "=" | "+" )
private static final long L_REG_NAME
    = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");
private static final long H_REG_NAME
    = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");

// All valid characters for server-based authorities
private static final long L_SERVER
    = L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");
private static final long H_SERVER
    = H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");

// Special case of server authority that represents an IPv6 address
// In this case, a % does not signify an escape sequence
private static final long L_SERVER_PERCENT
    = L_SERVER | 0x2000000000L; // lowMask("%");
private static final long H_SERVER_PERCENT
    = H_SERVER; // | highMask("%") == 0L;

// scheme        = alpha *( alpha | digit | "+" | "-" | "." )
private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-.");
private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L

// scope_id = alpha | digit | "_" | "."
2719 private static final long L_SCOPE_ID 2720 = L_ALPHANUM | 0x400000000000L; // lowMask("_."); 2721 private static final long H_SCOPE_ID 2722 = H_ALPHANUM | 0x80000000L; // highMask("_."); 2723 2724 // -- Escaping and encoding -- 2725 2726 private static final char[] hexDigits = { 2727 '0', '1', '2', '3', '4', '5', '6', '7', 2728 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 2729 }; 2730 2731 private static void appendEscape(StringBuilder sb, byte b) { 2732 sb.append('%'); 2733 sb.append(hexDigits[(b >> 4) & 0x0f]); 2734 sb.append(hexDigits[(b >> 0) & 0x0f]); 2735 } 2736 2737 private static void appendEncoded(StringBuilder sb, char c) { 2738 ByteBuffer bb = null; 2739 try { 2740 bb = ThreadLocalCoders.encoderFor("UTF-8") 2741 .encode(CharBuffer.wrap("" + c)); 2742 } catch (CharacterCodingException x) { 2743 assert false; 2744 } 2745 while (bb.hasRemaining()) { 2746 int b = bb.get() & 0xff; 2747 if (b >= 0x80) 2748 appendEscape(sb, (byte)b); 2749 else 2750 sb.append((char)b); 2751 } 2752 } 2753 2754 // Quote any characters in s that are not permitted 2755 // by the given mask pair 2756 // 2757 private static String quote(String s, long lowMask, long highMask) { 2758 StringBuilder sb = null; 2759 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); 2760 for (int i = 0; i < s.length(); i++) { 2761 char c = s.charAt(i); 2762 if (c < '\u0080') { 2763 if (!match(c, lowMask, highMask)) { 2764 if (sb == null) { 2765 sb = new StringBuilder(); 2766 sb.append(s, 0, i); 2767 } 2768 appendEscape(sb, (byte)c); 2769 } else { 2770 if (sb != null) 2771 sb.append(c); 2772 } 2773 } else if (allowNonASCII 2774 && (Character.isSpaceChar(c) 2775 || Character.isISOControl(c))) { 2776 if (sb == null) { 2777 sb = new StringBuilder(); 2778 sb.append(s, 0, i); 2779 } 2780 appendEncoded(sb, c); 2781 } else { 2782 if (sb != null) 2783 sb.append(c); 2784 } 2785 } 2786 return (sb == null) ? 
s : sb.toString(); 2787 } 2788 2789 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, 2790 // assuming that s is otherwise legal 2791 // 2792 private static String encode(String s) { 2793 int n = s.length(); 2794 if (n == 0) 2795 return s; 2796 2797 // First check whether we actually need to encode 2798 for (int i = 0;;) { 2799 if (s.charAt(i) >= '\u0080') 2800 break; 2801 if (++i >= n) 2802 return s; 2803 } 2804 2805 String ns = Normalizer.normalize(s, Normalizer.Form.NFC); 2806 ByteBuffer bb = null; 2807 try { 2808 bb = ThreadLocalCoders.encoderFor("UTF-8") 2809 .encode(CharBuffer.wrap(ns)); 2810 } catch (CharacterCodingException x) { 2811 assert false; 2812 } 2813 2814 StringBuilder sb = new StringBuilder(); 2815 while (bb.hasRemaining()) { 2816 int b = bb.get() & 0xff; 2817 if (b >= 0x80) 2818 appendEscape(sb, (byte)b); 2819 else 2820 sb.append((char)b); 2821 } 2822 return sb.toString(); 2823 } 2824 2825 private static int decode(char c) { 2826 if ((c >= '0') && (c <= '9')) 2827 return c - '0'; 2828 if ((c >= 'a') && (c <= 'f')) 2829 return c - 'a' + 10; 2830 if ((c >= 'A') && (c <= 'F')) 2831 return c - 'A' + 10; 2832 assert false; 2833 return -1; 2834 } 2835 2836 private static byte decode(char c1, char c2) { 2837 return (byte)( ((decode(c1) & 0xf) << 4) 2838 | ((decode(c2) & 0xf) << 0)); 2839 } 2840 2841 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes 2842 // that escapes are well-formed syntactically, i.e., of the form %XX. If a 2843 // sequence of escaped octets is not valid UTF-8 then the erroneous octets 2844 // are replaced with '\uFFFD'. 2845 // Exception: any "%" found between "[]" is left alone. 
It is an IPv6 literal 2846 // with a scope_id 2847 // 2848 private static String decode(String s) { 2849 return decode(s, true); 2850 } 2851 2852 // This method was introduced as a generalization of URI.decode method 2853 // to provide a fix for JDK-8037396 2854 private static String decode(String s, boolean ignorePercentInBrackets) { 2855 if (s == null) 2856 return s; 2857 int n = s.length(); 2858 if (n == 0) 2859 return s; 2860 if (s.indexOf('%') < 0) 2861 return s; 2862 2863 StringBuilder sb = new StringBuilder(n); 2864 ByteBuffer bb = ByteBuffer.allocate(n); 2865 CharBuffer cb = CharBuffer.allocate(n); 2866 CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8") 2867 .onMalformedInput(CodingErrorAction.REPLACE) 2868 .onUnmappableCharacter(CodingErrorAction.REPLACE); 2869 2870 // This is not horribly efficient, but it will do for now 2871 char c = s.charAt(0); 2872 boolean betweenBrackets = false; 2873 2874 for (int i = 0; i < n;) { 2875 assert c == s.charAt(i); // Loop invariant 2876 if (c == '[') { 2877 betweenBrackets = true; 2878 } else if (betweenBrackets && c == ']') { 2879 betweenBrackets = false; 2880 } 2881 if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) { 2882 sb.append(c); 2883 if (++i >= n) 2884 break; 2885 c = s.charAt(i); 2886 continue; 2887 } 2888 bb.clear(); 2889 int ui = i; 2890 for (;;) { 2891 assert (n - i >= 2); 2892 bb.put(decode(s.charAt(++i), s.charAt(++i))); 2893 if (++i >= n) 2894 break; 2895 c = s.charAt(i); 2896 if (c != '%') 2897 break; 2898 } 2899 bb.flip(); 2900 cb.clear(); 2901 dec.reset(); 2902 CoderResult cr = dec.decode(bb, cb, true); 2903 assert cr.isUnderflow(); 2904 cr = dec.flush(cb); 2905 assert cr.isUnderflow(); 2906 sb.append(cb.flip().toString()); 2907 } 2908 2909 return sb.toString(); 2910 } 2911 2912 2913 // -- Parsing -- 2914 2915 // For convenience we wrap the input URI string in a new instance of the 2916 // following internal class. 
This saves always having to pass the input 2917 // string as an argument to each internal scan/parse method. 2918 2919 private class Parser { 2920 2921 private String input; // URI input string 2922 private boolean requireServerAuthority = false; 2923 2924 Parser(String s) { 2925 input = s; 2926 string = s; 2927 } 2928 2929 // -- Methods for throwing URISyntaxException in various ways -- 2930 2931 private void fail(String reason) throws URISyntaxException { 2932 throw new URISyntaxException(input, reason); 2933 } 2934 2935 private void fail(String reason, int p) throws URISyntaxException { 2936 throw new URISyntaxException(input, reason, p); 2937 } 2938 2939 private void failExpecting(String expected, int p) 2940 throws URISyntaxException 2941 { 2942 fail("Expected " + expected, p); 2943 } 2944 2945 2946 // -- Simple access to the input string -- 2947 2948 // Tells whether start < end and, if so, whether charAt(start) == c 2949 // 2950 private boolean at(int start, int end, char c) { 2951 return (start < end) && (input.charAt(start) == c); 2952 } 2953 2954 // Tells whether start + s.length() < end and, if so, 2955 // whether the chars at the start position match s exactly 2956 // 2957 private boolean at(int start, int end, String s) { 2958 int p = start; 2959 int sn = s.length(); 2960 if (sn > end - p) 2961 return false; 2962 int i = 0; 2963 while (i < sn) { 2964 if (input.charAt(p++) != s.charAt(i)) { 2965 break; 2966 } 2967 i++; 2968 } 2969 return (i == sn); 2970 } 2971 2972 2973 // -- Scanning -- 2974 2975 // The various scan and parse methods that follow use a uniform 2976 // convention of taking the current start position and end index as 2977 // their first two arguments. The start is inclusive while the end is 2978 // exclusive, just as in the String class, i.e., a start/end pair 2979 // denotes the left-open interval [start, end) of the input string. 2980 // 2981 // These methods never proceed past the end position. 
They may return 2982 // -1 to indicate outright failure, but more often they simply return 2983 // the position of the first char after the last char scanned. Thus 2984 // a typical idiom is 2985 // 2986 // int p = start; 2987 // int q = scan(p, end, ...); 2988 // if (q > p) 2989 // // We scanned something 2990 // ...; 2991 // else if (q == p) 2992 // // We scanned nothing 2993 // ...; 2994 // else if (q == -1) 2995 // // Something went wrong 2996 // ...; 2997 2998 2999 // Scan a specific char: If the char at the given start position is 3000 // equal to c, return the index of the next char; otherwise, return the 3001 // start position. 3002 // 3003 private int scan(int start, int end, char c) { 3004 if ((start < end) && (input.charAt(start) == c)) 3005 return start + 1; 3006 return start; 3007 } 3008 3009 // Scan forward from the given start position. Stop at the first char 3010 // in the err string (in which case -1 is returned), or the first char 3011 // in the stop string (in which case the index of the preceding char is 3012 // returned), or the end of the input string (in which case the length 3013 // of the input string is returned). May return the start position if 3014 // nothing matches. 3015 // 3016 private int scan(int start, int end, String err, String stop) { 3017 int p = start; 3018 while (p < end) { 3019 char c = input.charAt(p); 3020 if (err.indexOf(c) >= 0) 3021 return -1; 3022 if (stop.indexOf(c) >= 0) 3023 break; 3024 p++; 3025 } 3026 return p; 3027 } 3028 3029 // Scan forward from the given start position. Stop at the first char 3030 // in the stop string (in which case the index of the preceding char is 3031 // returned), or the end of the input string (in which case the length 3032 // of the input string is returned). May return the start position if 3033 // nothing matches. 
3034 // 3035 private int scan(int start, int end, String stop) { 3036 int p = start; 3037 while (p < end) { 3038 char c = input.charAt(p); 3039 if (stop.indexOf(c) >= 0) 3040 break; 3041 p++; 3042 } 3043 return p; 3044 } 3045 3046 // Scan a potential escape sequence, starting at the given position, 3047 // with the given first char (i.e., charAt(start) == c). 3048 // 3049 // This method assumes that if escapes are allowed then visible 3050 // non-US-ASCII chars are also allowed. 3051 // 3052 private int scanEscape(int start, int n, char first) 3053 throws URISyntaxException 3054 { 3055 int p = start; 3056 char c = first; 3057 if (c == '%') { 3058 // Process escape pair 3059 if ((p + 3 <= n) 3060 && match(input.charAt(p + 1), L_HEX, H_HEX) 3061 && match(input.charAt(p + 2), L_HEX, H_HEX)) { 3062 return p + 3; 3063 } 3064 fail("Malformed escape pair", p); 3065 } else if ((c > 128) 3066 && !Character.isSpaceChar(c) 3067 && !Character.isISOControl(c)) { 3068 // Allow unescaped but visible non-US-ASCII chars 3069 return p + 1; 3070 } 3071 return p; 3072 } 3073 3074 // Scan chars that match the given mask pair 3075 // 3076 private int scan(int start, int n, long lowMask, long highMask) 3077 throws URISyntaxException 3078 { 3079 int p = start; 3080 while (p < n) { 3081 char c = input.charAt(p); 3082 if (match(c, lowMask, highMask)) { 3083 p++; 3084 continue; 3085 } 3086 if ((lowMask & L_ESCAPED) != 0) { 3087 int q = scanEscape(p, n, c); 3088 if (q > p) { 3089 p = q; 3090 continue; 3091 } 3092 } 3093 break; 3094 } 3095 return p; 3096 } 3097 3098 // Check that each of the chars in [start, end) matches the given mask 3099 // 3100 private void checkChars(int start, int end, 3101 long lowMask, long highMask, 3102 String what) 3103 throws URISyntaxException 3104 { 3105 int p = scan(start, end, lowMask, highMask); 3106 if (p < end) 3107 fail("Illegal character in " + what, p); 3108 } 3109 3110 // Check that the char at position p matches the given mask 3111 // 3112 private void 
checkChar(int p, 3113 long lowMask, long highMask, 3114 String what) 3115 throws URISyntaxException 3116 { 3117 checkChars(p, p + 1, lowMask, highMask, what); 3118 } 3119 3120 3121 // -- Parsing -- 3122 3123 // [<scheme>:]<scheme-specific-part>[#<fragment>] 3124 // 3125 void parse(boolean rsa) throws URISyntaxException { 3126 requireServerAuthority = rsa; 3127 int n = input.length(); 3128 int p = scan(0, n, "/?#", ":"); 3129 if ((p >= 0) && at(p, n, ':')) { 3130 if (p == 0) 3131 failExpecting("scheme name", 0); 3132 checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); 3133 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); 3134 scheme = input.substring(0, p); 3135 p++; // Skip ':' 3136 if (at(p, n, '/')) { 3137 p = parseHierarchical(p, n); 3138 } else { 3139 // opaque; need to create the schemeSpecificPart 3140 int q = scan(p, n, "#"); 3141 if (q <= p) 3142 failExpecting("scheme-specific part", p); 3143 checkChars(p, q, L_URIC, H_URIC, "opaque part"); 3144 schemeSpecificPart = input.substring(p, q); 3145 p = q; 3146 } 3147 } else { 3148 p = parseHierarchical(0, n); 3149 } 3150 if (at(p, n, '#')) { 3151 checkChars(p + 1, n, L_URIC, H_URIC, "fragment"); 3152 fragment = input.substring(p + 1, n); 3153 p = n; 3154 } 3155 if (p < n) 3156 fail("end of URI", p); 3157 } 3158 3159 // [//authority]<path>[?<query>] 3160 // 3161 // DEVIATION from RFC2396: We allow an empty authority component as 3162 // long as it's followed by a non-empty path, query component, or 3163 // fragment component. This is so that URIs such as "file:///foo/bar" 3164 // will parse. This seems to be the intent of RFC2396, though the 3165 // grammar does not permit it. If the authority is empty then the 3166 // userInfo, host, and port components are undefined. 3167 // 3168 // DEVIATION from RFC2396: We allow empty relative paths. This seems 3169 // to be the intent of RFC2396, but the grammar does not permit it. 
3170 // The primary consequence of this deviation is that "#f" parses as a 3171 // relative URI with an empty path. 3172 // 3173 private int parseHierarchical(int start, int n) 3174 throws URISyntaxException 3175 { 3176 int p = start; 3177 if (at(p, n, '/') && at(p + 1, n, '/')) { 3178 p += 2; 3179 int q = scan(p, n, "/?#"); 3180 if (q > p) { 3181 p = parseAuthority(p, q); 3182 } else if (q < n) { 3183 // DEVIATION: Allow empty authority prior to non-empty 3184 // path, query component or fragment identifier 3185 } else 3186 failExpecting("authority", p); 3187 } 3188 int q = scan(p, n, "?#"); // DEVIATION: May be empty 3189 checkChars(p, q, L_PATH, H_PATH, "path"); 3190 path = input.substring(p, q); 3191 p = q; 3192 if (at(p, n, '?')) { 3193 p++; 3194 q = scan(p, n, "#"); 3195 checkChars(p, q, L_URIC, H_URIC, "query"); 3196 query = input.substring(p, q); 3197 p = q; 3198 } 3199 return p; 3200 } 3201 3202 // authority = server | reg_name 3203 // 3204 // Ambiguity: An authority that is a registry name rather than a server 3205 // might have a prefix that parses as a server. We use the fact that 3206 // the authority component is always followed by '/' or the end of the 3207 // input string to resolve this: If the complete authority did not 3208 // parse as a server then we try to parse it as a registry name. 
//
// Parses [start, n) as an authority and returns n.  The authority field
// is always set on success; userInfo, host and port are left set only
// when the text parsed as a server.  Throws URISyntaxException when the
// text is neither a server nor a registry name, or when it is not a
// server and requireServerAuthority is set.
//
private int parseAuthority(int start, int n)
    throws URISyntaxException
{
    int p = start;
    int q = p;                      // How far the server parse got; p means "not at all"
    URISyntaxException ex = null;   // Failure of the server attempt, kept for a later re-throw

    boolean serverChars;            // Every char is legal in a server-based authority
    boolean regChars;               // Every char is legal in a registry name

    // NOTE(review): scan(p, n, "]") returns n when there is no ']' at all,
    // so this test holds for any authority that does not begin with ']'
    // -- confirm that this is the intended trigger for allowing '%'.
    if (scan(p, n, "]") > p) {
        // contains a literal IPv6 address, therefore % is allowed
        serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
    } else {
        serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
    }
    regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);

    if (regChars && !serverChars) {
        // Must be a registry-based authority
        authority = input.substring(p, n);
        return n;
    }

    if (serverChars) {
        // Might be (probably is) a server-based authority, so attempt
        // to parse it as such.  If the attempt fails, try to treat it
        // as a registry-based authority.
        try {
            q = parseServer(p, n);
            if (q < n)
                failExpecting("end of authority", q);
            authority = input.substring(p, n);
        } catch (URISyntaxException x) {
            // Undo results of failed parse
            userInfo = null;
            host = null;
            port = -1;
            if (requireServerAuthority) {
                // If we're insisting upon a server-based authority,
                // then just re-throw the exception
                throw x;
            } else {
                // Save the exception in case it doesn't parse as a
                // registry either
                ex = x;
                q = p;
            }
        }
    }

    // q < n here means that no server was parsed: either the attempt
    // above failed or it was never made
    if (q < n) {
        if (regChars) {
            // Registry-based authority
            authority = input.substring(p, n);
        } else if (ex != null) {
            // Re-throw exception; it was probably due to
            // a malformed IPv6 address
            throw ex;
        } else {
            fail("Illegal character in authority", q);
        }
    }

    return n;
}


// [<userinfo>@]<host>[:<port>]
//
// Parses [start, n) as a server, setting userInfo, host and port as the
// corresponding parts are found, and returns the index reached (n on
// success).  Throws URISyntaxException if any part is malformed or if
// anything is left over after the optional port.
//
private int parseServer(int start, int n)
    throws URISyntaxException
{
    int p = start;
    int q;

    // userinfo
    q = scan(p, n, "/?#", "@");
    if ((q >= p) && at(q, n, '@')) {
        checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
        userInfo = input.substring(p, q);
        p = q + 1;              // Skip '@'
    }

    // hostname, IPv4 address, or IPv6 address
    if (at(p, n, '[')) {
        // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
        p++;
        q = scan(p, n, "/?#", "]");
        if ((q > p) && at(q, n, ']')) {
            // look for a "%" scope id
            // When there is no '%', scan returns q, so r == q: the whole
            // bracket content is parsed as the address and the two scope
            // id checks below have nothing to do.  The else branch is
            // reached only when r == p, i.e. '%' is the first char.
            int r = scan (p, q, "%");
            if (r > p) {
                parseIPv6Reference(p, r);
                if (r+1 == q) {
                    fail ("scope id expected");
                }
                checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID,
                            "scope id");
            } else {
                parseIPv6Reference(p, q);
            }
            // The host keeps its enclosing brackets
            host = input.substring(p-1, q+1);
            p = q + 1;
        } else {
            failExpecting("closing bracket for IPv6 address", q);
        }
    } else {
        // Try a dotted-quad first; fall back to a hostname when no IPv4
        // address was recognized
        q = parseIPv4Address(p, n);
        if (q <= p)
            q = parseHostname(p, n);
        p = q;
    }

    // port
    if (at(p, n, ':')) {
        p++;
        q = scan(p, n, "/");
        // An empty port (a ':' followed by nothing) leaves port unset
        if (q > p) {
            checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
            try {
                port = Integer.parseInt(input, p, q, 10);
            } catch (NumberFormatException x) {
                // All digits, but the value does not fit in an int
                fail("Malformed port number", p);
            }
            p = q;
        }
    }
    if (p < n)
        failExpecting("port number", p);

    return p;
}

// Scan a string of decimal digits whose value fits in a byte
//
// Returns the index after the digits, or start when there are no digits
// or their value exceeds 255.  Integer.parseInt may throw
// NumberFormatException for a digit string too long for an int; that
// exception is not handled here.
//
private int scanByte(int start, int n)
    throws URISyntaxException
{
    int p = start;
    int q = scan(p, n, L_DIGIT, H_DIGIT);
    if (q <= p) return q;
    if (Integer.parseInt(input, p, q, 10) > 255) return p;
    return q;
}

// Scan an IPv4 address.
//
// If the strict argument is true then we require that the given
// interval contain nothing besides an IPv4 address; if it is false
// then we only require that it start with an IPv4 address.
//
// If the interval does not contain or start with (depending upon the
// strict argument) legal IPv4 address characters then we return -1
// immediately; otherwise we insist that these characters parse as a
// legal IPv4 address and throw an exception on failure.
//
// We assume that any string of decimal digits and dots must be an IPv4
// address.  It won't parse as a hostname anyway, so making that
// assumption here allows more meaningful exceptions to be thrown.
//
// On success the index just past the address is returned.
//
private int scanIPv4Address(int start, int n, boolean strict)
    throws URISyntaxException
{
    int p = start;
    int q;
    // m bounds the leading run of digits and dots; the address must
    // occupy exactly [start, m)
    int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
    if ((m <= p) || (strict && (m != n)))
        return -1;
    // Not a real loop: the body runs once and either returns the end of
    // a complete byte.byte.byte.byte address or breaks out to the
    // failure report below
    for (;;) {
        // Per RFC2732: At most three digits per byte
        // Further constraint: Each element fits in a byte
        if ((q = scanByte(p, m)) <= p) break;   p = q;
        if ((q = scan(p, m, '.')) <= p) break;  p = q;
        if ((q = scanByte(p, m)) <= p) break;   p = q;
        if ((q = scan(p, m, '.')) <= p) break;  p = q;
        if ((q = scanByte(p, m)) <= p) break;   p = q;
        if ((q = scan(p, m, '.')) <= p) break;  p = q;
        if ((q = scanByte(p, m)) <= p) break;   p = q;
        // Digits or dots left over after the fourth byte
        if (q < m) break;
        return q;
    }
    fail("Malformed IPv4 address", q);
    return -1;
}

// Take an IPv4 address: Throw an exception if the given interval
// contains anything except an IPv4 address
//
// expected is the text used in the "Expected ..." failure message.
//
private int takeIPv4Address(int start, int n, String expected)
    throws URISyntaxException
{
    int p = scanIPv4Address(start, n, true);
    if (p <= start)
        failExpecting(expected, start);
    return p;
}

// Attempt to parse an IPv4 address, returning -1 on failure but
// allowing the given interval to contain [:<characters>] after
// the IPv4 address.
3411 // 3412 private int parseIPv4Address(int start, int n) { 3413 int p; 3414 3415 try { 3416 p = scanIPv4Address(start, n, false); 3417 } catch (URISyntaxException x) { 3418 return -1; 3419 } catch (NumberFormatException nfe) { 3420 return -1; 3421 } 3422 3423 if (p > start && p < n) { 3424 // IPv4 address is followed by something - check that 3425 // it's a ":" as this is the only valid character to 3426 // follow an address. 3427 if (input.charAt(p) != ':') { 3428 p = -1; 3429 } 3430 } 3431 3432 if (p > start) 3433 host = input.substring(start, p); 3434 3435 return p; 3436 } 3437 3438 // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ] 3439 // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 3440 // toplabel = alpha | alpha *( alphanum | "-" ) alphanum 3441 // 3442 private int parseHostname(int start, int n) 3443 throws URISyntaxException 3444 { 3445 int p = start; 3446 int q; 3447 int l = -1; // Start of last parsed label 3448 3449 do { 3450 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ] 3451 q = scan(p, n, L_ALPHANUM, H_ALPHANUM); 3452 if (q <= p) 3453 break; 3454 l = p; 3455 if (q > p) { 3456 p = q; 3457 q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH); 3458 if (q > p) { 3459 if (input.charAt(q - 1) == '-') 3460 fail("Illegal character in hostname", q - 1); 3461 p = q; 3462 } 3463 } 3464 q = scan(p, n, '.'); 3465 if (q <= p) 3466 break; 3467 p = q; 3468 } while (p < n); 3469 3470 if ((p < n) && !at(p, n, ':')) 3471 fail("Illegal character in hostname", p); 3472 3473 if (l < 0) 3474 failExpecting("hostname", start); 3475 3476 // for a fully qualified hostname check that the rightmost 3477 // label starts with an alpha character. 
3478 if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) { 3479 fail("Illegal character in hostname", l); 3480 } 3481 3482 host = input.substring(start, p); 3483 return p; 3484 } 3485 3486 3487 // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture 3488 // 3489 // Bug: The grammar in RFC2373 Appendix B does not allow addresses of 3490 // the form ::12.34.56.78, which are clearly shown in the examples 3491 // earlier in the document. Here is the original grammar: 3492 // 3493 // IPv6address = hexpart [ ":" IPv4address ] 3494 // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] 3495 // hexseq = hex4 *( ":" hex4) 3496 // hex4 = 1*4HEXDIG 3497 // 3498 // We therefore use the following revised grammar: 3499 // 3500 // IPv6address = hexseq [ ":" IPv4address ] 3501 // | hexseq [ "::" [ hexpost ] ] 3502 // | "::" [ hexpost ] 3503 // hexpost = hexseq | hexseq ":" IPv4address | IPv4address 3504 // hexseq = hex4 *( ":" hex4) 3505 // hex4 = 1*4HEXDIG 3506 // 3507 // This covers all and only the following cases: 3508 // 3509 // hexseq 3510 // hexseq : IPv4address 3511 // hexseq :: 3512 // hexseq :: hexseq 3513 // hexseq :: hexseq : IPv4address 3514 // hexseq :: IPv4address 3515 // :: hexseq 3516 // :: hexseq : IPv4address 3517 // :: IPv4address 3518 // :: 3519 // 3520 // Additionally we constrain the IPv6 address as follows :- 3521 // 3522 // i. IPv6 addresses without compressed zeros should contain 3523 // exactly 16 bytes. 3524 // 3525 // ii. IPv6 addresses with compressed zeros should contain 3526 // less than 16 bytes. 
3527 3528 private int ipv6byteCount = 0; 3529 3530 private int parseIPv6Reference(int start, int n) 3531 throws URISyntaxException 3532 { 3533 int p = start; 3534 int q; 3535 boolean compressedZeros = false; 3536 3537 q = scanHexSeq(p, n); 3538 3539 if (q > p) { 3540 p = q; 3541 if (at(p, n, "::")) { 3542 compressedZeros = true; 3543 p = scanHexPost(p + 2, n); 3544 } else if (at(p, n, ':')) { 3545 p = takeIPv4Address(p + 1, n, "IPv4 address"); 3546 ipv6byteCount += 4; 3547 } 3548 } else if (at(p, n, "::")) { 3549 compressedZeros = true; 3550 p = scanHexPost(p + 2, n); 3551 } 3552 if (p < n) 3553 fail("Malformed IPv6 address", start); 3554 if (ipv6byteCount > 16) 3555 fail("IPv6 address too long", start); 3556 if (!compressedZeros && ipv6byteCount < 16) 3557 fail("IPv6 address too short", start); 3558 if (compressedZeros && ipv6byteCount == 16) 3559 fail("Malformed IPv6 address", start); 3560 3561 return p; 3562 } 3563 3564 private int scanHexPost(int start, int n) 3565 throws URISyntaxException 3566 { 3567 int p = start; 3568 int q; 3569 3570 if (p == n) 3571 return p; 3572 3573 q = scanHexSeq(p, n); 3574 if (q > p) { 3575 p = q; 3576 if (at(p, n, ':')) { 3577 p++; 3578 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3579 ipv6byteCount += 4; 3580 } 3581 } else { 3582 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3583 ipv6byteCount += 4; 3584 } 3585 return p; 3586 } 3587 3588 // Scan a hex sequence; return -1 if one could not be scanned 3589 // 3590 private int scanHexSeq(int start, int n) 3591 throws URISyntaxException 3592 { 3593 int p = start; 3594 int q; 3595 3596 q = scan(p, n, L_HEX, H_HEX); 3597 if (q <= p) 3598 return -1; 3599 if (at(q, n, '.')) // Beginning of IPv4 address 3600 return -1; 3601 if (q > p + 4) 3602 fail("IPv6 hexadecimal digit sequence too long", p); 3603 ipv6byteCount += 2; 3604 p = q; 3605 while (p < n) { 3606 if (!at(p, n, ':')) 3607 break; 3608 if (at(p + 1, n, ':')) 3609 break; // "::" 3610 p++; 3611 q = scan(p, n, 
L_HEX, H_HEX); 3612 if (q <= p) 3613 failExpecting("digits for an IPv6 address", p); 3614 if (at(q, n, '.')) { // Beginning of IPv4 address 3615 p--; 3616 break; 3617 } 3618 if (q > p + 4) 3619 fail("IPv6 hexadecimal digit sequence too long", p); 3620 ipv6byteCount += 2; 3621 p = q; 3622 } 3623 3624 return p; 3625 } 3626 3627 } 3628 static { 3629 SharedSecrets.setJavaNetUriAccess( 3630 new JavaNetUriAccess() { 3631 public URI create(String scheme, String path) { 3632 return new URI(scheme, path); 3633 } 3634 } 3635 ); 3636 } 3637 }