1 /* 2 * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package java.net; 27 28 import java.io.File; 29 import java.io.IOException; 30 import java.io.InvalidObjectException; 31 import java.io.ObjectInputStream; 32 import java.io.ObjectOutputStream; 33 import java.io.Serializable; 34 import java.nio.ByteBuffer; 35 import java.nio.CharBuffer; 36 import java.nio.charset.CharsetDecoder; 37 import java.nio.charset.CoderResult; 38 import java.nio.charset.CodingErrorAction; 39 import java.nio.charset.CharacterCodingException; 40 import java.nio.file.Path; 41 import java.text.Normalizer; 42 import jdk.internal.access.JavaNetUriAccess; 43 import jdk.internal.access.SharedSecrets; 44 import sun.nio.cs.ThreadLocalCoders; 45 46 import java.lang.Character; // for javadoc 47 import java.lang.NullPointerException; // for javadoc 48 49 50 /** 51 * Represents a Uniform Resource Identifier (URI) reference. 52 * 53 * <p> Aside from some minor deviations noted below, an instance of this 54 * class represents a URI reference as defined by 55 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 56 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a 57 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 58 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format 59 * also supports scope_ids. The syntax and usage of scope_ids is described 60 * <a href="Inet6Address.html#scoped">here</a>. 61 * This class provides constructors for creating URI instances from 62 * their components or by parsing their string forms, methods for accessing the 63 * various components of an instance, and methods for normalizing, resolving, 64 * and relativizing URI instances. Instances of this class are immutable. 
65 * 66 * 67 * <h3> URI syntax and components </h3> 68 * 69 * At the highest level a URI reference (hereinafter simply "URI") in string 70 * form has the syntax 71 * 72 * <blockquote> 73 * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>] 74 * </blockquote> 75 * 76 * where square brackets [...] delineate optional components and the characters 77 * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves. 78 * 79 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is 80 * said to be <i>relative</i>. URIs are also classified according to whether 81 * they are <i>opaque</i> or <i>hierarchical</i>. 82 * 83 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does 84 * not begin with a slash character ({@code '/'}). Opaque URIs are not 85 * subject to further parsing. Some examples of opaque URIs are: 86 * 87 * <blockquote><ul style="list-style-type:none"> 88 * <li>{@code mailto:java-net@www.example.com}</li> 89 * <li>{@code news:comp.lang.java}</li> 90 * <li>{@code urn:isbn:096139210x}</li> 91 * </ul></blockquote> 92 * 93 * <p> A <i>hierarchical</i> URI is either an absolute URI whose 94 * scheme-specific part begins with a slash character, or a relative URI, that 95 * is, a URI that does not specify a scheme. Some examples of hierarchical 96 * URIs are: 97 * 98 * <blockquote> 99 * {@code http://example.com/languages/java/}<br> 100 * {@code sample/a/index.html#28}<br> 101 * {@code ../../demo/b/index.html}<br> 102 * {@code file:///~/calendar} 103 * </blockquote> 104 * 105 * <p> A hierarchical URI is subject to further parsing according to the syntax 106 * 107 * <blockquote> 108 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>] 109 * </blockquote> 110 * 111 * where the characters <b>{@code :}</b>, <b>{@code /}</b>, 112 * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. 
The 113 * scheme-specific part of a hierarchical URI consists of the characters 114 * between the scheme and fragment components. 115 * 116 * <p> The authority component of a hierarchical URI is, if specified, either 117 * <i>server-based</i> or <i>registry-based</i>. A server-based authority 118 * parses according to the familiar syntax 119 * 120 * <blockquote> 121 * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>] 122 * </blockquote> 123 * 124 * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for 125 * themselves. Nearly all URI schemes currently in use are server-based. An 126 * authority component that does not parse in this way is considered to be 127 * registry-based. 128 * 129 * <p> The path component of a hierarchical URI is itself said to be absolute 130 * if it begins with a slash character ({@code '/'}); otherwise it is 131 * relative. The path of a hierarchical URI that is either absolute or 132 * specifies an authority is always absolute. 
133 * 134 * <p> All told, then, a URI instance has the following nine components: 135 * 136 * <table class="striped" style="margin-left:2em"> 137 * <caption style="display:none">Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment</caption> 138 * <thead> 139 * <tr><th scope="col">Component</th><th scope="col">Type</th></tr> 140 * </thead> 141 * <tbody style="text-align:left"> 142 * <tr><th scope="row">scheme</th><td>{@code String}</td></tr> 143 * <tr><th scope="row">scheme-specific-part</th><td>{@code String}</td></tr> 144 * <tr><th scope="row">authority</th><td>{@code String}</td></tr> 145 * <tr><th scope="row">user-info</th><td>{@code String}</td></tr> 146 * <tr><th scope="row">host</th><td>{@code String}</td></tr> 147 * <tr><th scope="row">port</th><td>{@code int}</td></tr> 148 * <tr><th scope="row">path</th><td>{@code String}</td></tr> 149 * <tr><th scope="row">query</th><td>{@code String}</td></tr> 150 * <tr><th scope="row">fragment</th><td>{@code String}</td></tr> 151 * </tbody> 152 * </table> 153 * 154 * In a given instance any particular component is either <i>undefined</i> or 155 * <i>defined</i> with a distinct value. Undefined string components are 156 * represented by {@code null}, while undefined integer components are 157 * represented by {@code -1}. A string component may be defined to have the 158 * empty string as its value; this is not equivalent to that component being 159 * undefined. 160 * 161 * <p> Whether a particular component is or is not defined in an instance 162 * depends upon the type of the URI being represented. An absolute URI has a 163 * scheme component. An opaque URI has a scheme, a scheme-specific part, and 164 * possibly a fragment, but has no other components. A hierarchical URI always 165 * has a path (though it may be empty) and a scheme-specific-part (which at 166 * least contains the path), and may have any of the other components. 
If the 167 * authority component is present and is server-based then the host component 168 * will be defined and the user-information and port components may be defined. 169 * 170 * 171 * <h4> Operations on URI instances </h4> 172 * 173 * The key operations supported by this class are those of 174 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>. 175 * 176 * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."} 177 * and {@code ".."} segments from the path component of a hierarchical URI. 178 * Each {@code "."} segment is simply removed. A {@code ".."} segment is 179 * removed only if it is preceded by a non-{@code ".."} segment. 180 * Normalization has no effect upon opaque URIs. 181 * 182 * <p> <i>Resolution</i> is the process of resolving one URI against another, 183 * <i>base</i> URI. The resulting URI is constructed from components of both 184 * URIs in the manner specified by RFC 2396, taking components from the 185 * base URI for those not specified in the original. For hierarchical URIs, 186 * the path of the original is resolved against the path of the base and then 187 * normalized. The result, for example, of resolving 188 * 189 * <blockquote> 190 * {@code sample/a/index.html#28} 191 * 192 * (1) 193 * </blockquote> 194 * 195 * against the base URI {@code http://example.com/languages/java/} is the result 196 * URI 197 * 198 * <blockquote> 199 * {@code http://example.com/languages/java/sample/a/index.html#28} 200 * </blockquote> 201 * 202 * Resolving the relative URI 203 * 204 * <blockquote> 205 * {@code ../../demo/b/index.html} (2) 206 * </blockquote> 207 * 208 * against this result yields, in turn, 209 * 210 * <blockquote> 211 * {@code http://example.com/languages/java/demo/b/index.html} 212 * </blockquote> 213 * 214 * Resolution of both absolute and relative URIs, and of both absolute and 215 * relative paths in the case of hierarchical URIs, is supported. 
Resolving 216 * the URI {@code file:///~calendar} against any other URI simply yields the 217 * original URI, since it is absolute. Resolving the relative URI (2) above 218 * against the relative base URI (1) yields the normalized, but still relative, 219 * URI 220 * 221 * <blockquote> 222 * {@code demo/b/index.html} 223 * </blockquote> 224 * 225 * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any 226 * two normalized URIs <i>u</i> and <i>v</i>, 227 * 228 * <blockquote> 229 * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} and<br> 230 * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} .<br> 231 * </blockquote> 232 * 233 * This operation is often useful when constructing a document containing URIs 234 * that must be made relative to the base URI of the document wherever 235 * possible. For example, relativizing the URI 236 * 237 * <blockquote> 238 * {@code http://example.com/languages/java/sample/a/index.html#28} 239 * </blockquote> 240 * 241 * against the base URI 242 * 243 * <blockquote> 244 * {@code http://example.com/languages/java/} 245 * </blockquote> 246 * 247 * yields the relative URI {@code sample/a/index.html#28}. 248 * 249 * 250 * <h4> Character categories </h4> 251 * 252 * RFC 2396 specifies precisely which characters are permitted in the 253 * various components of a URI reference. 
The following categories, most of 254 * which are taken from that specification, are used below to describe these 255 * constraints: 256 * 257 * <table class="striped" style="margin-left:2em"> 258 * <caption style="display:none">Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other</caption> 259 * <thead> 260 * <tr><th scope="col">Category</th><th scope="col">Description</th></tr> 261 * </thead> 262 * <tbody style="text-align:left"> 263 * <tr><th scope="row" style="vertical-align:top">alpha</th> 264 * <td>The US-ASCII alphabetic characters, 265 * {@code 'A'} through {@code 'Z'} 266 * and {@code 'a'} through {@code 'z'}</td></tr> 267 * <tr><th scope="row" style="vertical-align:top">digit</th> 268 * <td>The US-ASCII decimal digit characters, 269 * {@code '0'} through {@code '9'}</td></tr> 270 * <tr><th scope="row" style="vertical-align:top">alphanum</th> 271 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> 272 * <tr><th scope="row" style="vertical-align:top">unreserved</th> 273 * <td>All <i>alphanum</i> characters together with those in the string 274 * {@code "_-!.~'()*"}</td></tr> 275 * <tr><th scope="row" style="vertical-align:top">punct</th> 276 * <td>The characters in the string {@code ",;:$&+="}</td></tr> 277 * <tr><th scope="row" style="vertical-align:top">reserved</th> 278 * <td>All <i>punct</i> characters together with those in the string 279 * {@code "?/[]@"}</td></tr> 280 * <tr><th scope="row" style="vertical-align:top">escaped</th> 281 * <td>Escaped octets, that is, triplets consisting of the percent 282 * character ({@code '%'}) followed by two hexadecimal digits 283 * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and 284 * {@code 'a'}-{@code 'f'})</td></tr> 285 * <tr><th scope="row" style="vertical-align:top">other</th> 286 * <td>The Unicode characters that are not in the US-ASCII character set, 287 * are not control characters (according to the {@link 288 * java.lang.Character#isISOControl(char) 
Character.isISOControl} 289 * method), and are not space characters (according to the {@link 290 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} 291 * method) <i>(<b>Deviation from RFC 2396</b>, which is 292 * limited to US-ASCII)</i></td></tr> 293 * </tbody> 294 * </table> 295 * 296 * <p><a id="legal-chars"></a> The set of all legal URI characters consists of 297 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> 298 * characters. 299 * 300 * 301 * <h4> Escaped octets, quotation, encoding, and decoding </h4> 302 * 303 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and 304 * fragment components. Escaping serves two purposes in URIs: 305 * 306 * <ul> 307 * 308 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to 309 * conform strictly to RFC 2396 by not containing any <i>other</i> 310 * characters. </p></li> 311 * 312 * <li><p> To <i>quote</i> characters that are otherwise illegal in a 313 * component. The user-info, path, query, and fragment components differ 314 * slightly in terms of which characters are considered legal and illegal. 315 * </p></li> 316 * 317 * </ul> 318 * 319 * These purposes are served in this class by three related operations: 320 * 321 * <ul> 322 * 323 * <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it 324 * with the sequence of escaped octets that represent that character in the 325 * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), 326 * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from 327 * RFC 2396</b>, which does not specify any particular character 328 * set.)</i> </p></li> 329 * 330 * <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by 331 * encoding it. The space character, for example, is quoted by replacing it 332 * with {@code "%20"}. 
UTF-8 contains US-ASCII, hence for US-ASCII 333 * characters this transformation has exactly the effect required by 334 * RFC 2396. </p></li> 335 * 336 * <li><p><a id="decode"></a> 337 * A sequence of escaped octets is <i>decoded</i> by 338 * replacing it with the sequence of characters that it represents in the 339 * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the 340 * effect of de-quoting any quoted US-ASCII characters as well as that of 341 * decoding any encoded non-US-ASCII characters. If a <a 342 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs 343 * when decoding the escaped octets then the erroneous octets are replaced by 344 * {@code '\u005CuFFFD'}, the Unicode replacement character. </p></li> 345 * 346 * </ul> 347 * 348 * These operations are exposed in the constructors and methods of this class 349 * as follows: 350 * 351 * <ul> 352 * 353 * <li><p> The {@linkplain #URI(java.lang.String) single-argument 354 * constructor} requires any illegal characters in its argument to be 355 * quoted and preserves any escaped octets and <i>other</i> characters that 356 * are present. </p></li> 357 * 358 * <li><p> The {@linkplain 359 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) 360 * multi-argument constructors} quote illegal characters as 361 * required by the components in which they appear. The percent character 362 * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> 363 * characters are preserved. </p></li> 364 * 365 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() 366 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() 367 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link 368 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the 369 * values of their corresponding components in raw form, without interpreting 370 * any escaped octets. 
The strings returned by these methods may contain 371 * both escaped octets and <i>other</i> characters, and will not contain any 372 * illegal characters. </p></li> 373 * 374 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() 375 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() 376 * getFragment}, {@link #getAuthority() getAuthority}, and {@link 377 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped 378 * octets in their corresponding components. The strings returned by these 379 * methods may contain both <i>other</i> characters and illegal characters, 380 * and will not contain any escaped octets. </p></li> 381 * 382 * <li><p> The {@link #toString() toString} method returns a URI string with 383 * all necessary quotation but which may contain <i>other</i> characters. 384 * </p></li> 385 * 386 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully 387 * quoted and encoded URI string that does not contain any <i>other</i> 388 * characters. </p></li> 389 * 390 * </ul> 391 * 392 * 393 * <h4> Identities </h4> 394 * 395 * For any URI <i>u</i>, it is always the case that 396 * 397 * <blockquote> 398 * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . 
399 * </blockquote> 400 * 401 * For any URI <i>u</i> that does not contain redundant syntax such as two 402 * slashes before an empty authority (as in {@code file:///tmp/} ) or a 403 * colon following a host name but no port (as in 404 * {@code http://www.example.com:} ), and that does not encode characters 405 * except those that must be quoted, the following identities also hold: 406 * <pre> 407 * new URI(<i>u</i>.getScheme(), 408 * <i>u</i>.getSchemeSpecificPart(), 409 * <i>u</i>.getFragment()) 410 * .equals(<i>u</i>)</pre> 411 * in all cases, 412 * <pre> 413 * new URI(<i>u</i>.getScheme(), 414 * <i>u</i>.getAuthority(), 415 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 416 * <i>u</i>.getFragment()) 417 * .equals(<i>u</i>)</pre> 418 * if <i>u</i> is hierarchical, and 419 * <pre> 420 * new URI(<i>u</i>.getScheme(), 421 * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), 422 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 423 * <i>u</i>.getFragment()) 424 * .equals(<i>u</i>)</pre> 425 * if <i>u</i> is hierarchical and has either no authority or a server-based 426 * authority. 427 * 428 * 429 * <h4> URIs, URLs, and URNs </h4> 430 * 431 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform 432 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but 433 * not every URI is a URL. This is because there is another subcategory of 434 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not 435 * specify how to locate them. The {@code mailto}, {@code news}, and 436 * {@code isbn} URIs shown above are examples of URNs. 437 * 438 * <p> The conceptual distinction between URIs and URLs is reflected in the 439 * differences between this class and the {@link URL} class. 440 * 441 * <p> An instance of this class represents a URI reference in the syntactic 442 * sense defined by RFC 2396. A URI may be either absolute or relative. 
443 * A URI string is parsed according to the generic syntax without regard to the 444 * scheme, if any, that it specifies. No lookup of the host, if any, is 445 * performed, and no scheme-dependent stream handler is constructed. Equality, 446 * hashing, and comparison are defined strictly in terms of the character 447 * content of the instance. In other words, a URI instance is little more than 448 * a structured string that supports the syntactic, scheme-independent 449 * operations of comparison, normalization, resolution, and relativization. 450 * 451 * <p> An instance of the {@link URL} class, by contrast, represents the 452 * syntactic components of a URL together with some of the information required 453 * to access the resource that it describes. A URL must be absolute, that is, 454 * it must always specify a scheme. A URL string is parsed according to its 455 * scheme. A stream handler is always established for a URL, and in fact it is 456 * impossible to create a URL instance for a scheme for which no handler is 457 * available. Equality and hashing depend upon both the scheme and the 458 * Internet address of the host, if any; comparison is not defined. In other 459 * words, a URL is a structured string that supports the syntactic operation of 460 * resolution as well as the network I/O operations of looking up the host and 461 * opening a connection to the specified resource. 462 * 463 * @apiNote 464 * 465 * Applications working with file paths and file URIs should take great 466 * care to use the appropriate methods to convert between the two. 467 * The {@link Path#of(URI)} factory method and the {@link File#File(URI)} 468 * constructor can be used to create {@link Path} or {@link File} 469 * objects from a file URI. {@link Path#toUri()} and {@link File#toURI()} 470 * can be used to create a {@link URI} from a file path. 
471 * Applications should never try to {@linkplain 472 * #URI(String, String, String, int, String, String, String) 473 * construct}, {@linkplain #URI(String) parse}, or 474 * {@linkplain #resolve(String) resolve} a {@code URI} 475 * from the direct string representation of a {@code File} or {@code Path} 476 * instance. 477 * <p> 478 * Some components of a URL or URI, such as <i>userinfo</i>, may 479 * be abused to construct misleading URLs or URIs. Applications 480 * that deal with URLs or URIs should take into account 481 * the recommendations advised in <a 482 * href="https://tools.ietf.org/html/rfc3986#section-7">RFC3986, 483 * Section 7, Security Considerations</a>. 484 * 485 * @author Mark Reinhold 486 * @since 1.4 487 * 488 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a 489 * transformation format of ISO 10646</i></a>, <br><a 490 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing 491 * Architecture</i></a>, <br><a 492 * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 493 * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a 494 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 495 * Literal IPv6 Addresses in URLs</i></a>, <br><a 496 * href="URISyntaxException.html">URISyntaxException</a> 497 */ 498 499 public final class URI 500 implements Comparable<URI>, Serializable 501 { 502 503 // Note: Comments containing the word "ASSERT" indicate places where a 504 // throw of an InternalError should be replaced by an appropriate assertion 505 // statement once asserts are enabled in the build. 
506 507 static final long serialVersionUID = -6052424284110960213L; 508 509 510 // -- Properties and components of this instance -- 511 512 // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>] 513 private transient String scheme; // null ==> relative URI 514 private transient String fragment; 515 516 // Hierarchical URI components: [//<authority>]<path>[?<query>] 517 private transient String authority; // Registry or server 518 519 // Server-based authority: [<userInfo>@]<host>[:<port>] 520 private transient String userInfo; 521 private transient String host; // null ==> registry-based 522 private transient int port = -1; // -1 ==> undefined 523 524 // Remaining components of hierarchical URIs 525 private transient String path; // null ==> opaque 526 private transient String query; 527 528 // The remaining fields may be computed on demand, which is safe even in 529 // the face of multiple threads racing to initialize them 530 private transient String schemeSpecificPart; 531 private transient int hash; // Zero ==> undefined 532 533 private transient String decodedUserInfo; 534 private transient String decodedAuthority; 535 private transient String decodedPath; 536 private transient String decodedQuery; 537 private transient String decodedFragment; 538 private transient String decodedSchemeSpecificPart; 539 540 /** 541 * The string form of this URI. 542 * 543 * @serial 544 */ 545 private volatile String string; // The only serializable field 546 547 548 549 // -- Constructors and factories -- 550 551 private URI() { } // Used internally 552 553 /** 554 * Constructs a URI by parsing the given string. 
555 * 556 * <p> This constructor parses the given string exactly as specified by the 557 * grammar in <a 558 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 559 * Appendix A, <b><i>except for the following deviations:</i></b> </p> 560 * 561 * <ul> 562 * 563 * <li><p> An empty authority component is permitted as long as it is 564 * followed by a non-empty path, a query component, or a fragment 565 * component. This allows the parsing of URIs such as 566 * {@code "file:///foo/bar"}, which seems to be the intent of 567 * RFC 2396 although the grammar does not permit it. If the 568 * authority component is empty then the user-information, host, and port 569 * components are undefined. </p></li> 570 * 571 * <li><p> Empty relative paths are permitted; this seems to be the 572 * intent of RFC 2396 although the grammar does not permit it. The 573 * primary consequence of this deviation is that a standalone fragment 574 * such as {@code "#foo"} parses as a relative URI with an empty path 575 * and the given fragment, and can be usefully <a 576 * href="#resolve-frag">resolved</a> against a base URI. 577 * 578 * <li><p> IPv4 addresses in host components are parsed rigorously, as 579 * specified by <a 580 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each 581 * element of a dotted-quad address must contain no more than three 582 * decimal digits. Each element is further constrained to have a value 583 * no greater than 255. </p></li> 584 * 585 * <li> <p> Hostnames in host components that comprise only a single 586 * domain label are permitted to start with an <i>alphanum</i> 587 * character. This seems to be the intent of <a 588 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 589 * section 3.2.2 although the grammar does not permit it. The 590 * consequence of this deviation is that the authority component of a 591 * hierarchical URI such as {@code s://123}, will parse as a server-based 592 * authority. 
</p></li> 593 * 594 * <li><p> IPv6 addresses are permitted for the host component. An IPv6 595 * address must be enclosed in square brackets ({@code '['} and 596 * {@code ']'}) as specified by <a 597 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The 598 * IPv6 address itself must parse according to <a 599 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 600 * addresses are further constrained to describe no more than sixteen 601 * bytes of address information, a constraint implicit in RFC 2373 602 * but not expressible in the grammar. </p></li> 603 * 604 * <li><p> Characters in the <i>other</i> category are permitted wherever 605 * RFC 2396 permits <i>escaped</i> octets, that is, in the 606 * user-information, path, query, and fragment components, as well as in 607 * the authority component if the authority is registry-based. This 608 * allows URIs to contain Unicode characters beyond those in the US-ASCII 609 * character set. </p></li> 610 * 611 * </ul> 612 * 613 * @param str The string to be parsed into a URI 614 * 615 * @throws NullPointerException 616 * If {@code str} is {@code null} 617 * 618 * @throws URISyntaxException 619 * If the given string violates RFC 2396, as augmented 620 * by the above deviations 621 */ 622 public URI(String str) throws URISyntaxException { 623 new Parser(str).parse(false); 624 } 625 626 /** 627 * Constructs a hierarchical URI from the given components. 628 * 629 * <p> If a scheme is given then the path, if also given, must either be 630 * empty or begin with a slash character ({@code '/'}). Otherwise a 631 * component of the new URI may be left undefined by passing {@code null} 632 * for the corresponding parameter or, in the case of the {@code port} 633 * parameter, by passing {@code -1}. 
634 * 635 * <p> This constructor first builds a URI string from the given components 636 * according to the rules specified in <a 637 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 638 * section 5.2, step 7: </p> 639 * 640 * <ol> 641 * 642 * <li><p> Initially, the result string is empty. </p></li> 643 * 644 * <li><p> If a scheme is given then it is appended to the result, 645 * followed by a colon character ({@code ':'}). </p></li> 646 * 647 * <li><p> If user information, a host, or a port are given then the 648 * string {@code "//"} is appended. </p></li> 649 * 650 * <li><p> If user information is given then it is appended, followed by 651 * a commercial-at character ({@code '@'}). Any character not in the 652 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 653 * categories is <a href="#quote">quoted</a>. </p></li> 654 * 655 * <li><p> If a host is given then it is appended. If the host is a 656 * literal IPv6 address but is not enclosed in square brackets 657 * ({@code '['} and {@code ']'}) then the square brackets are added. 658 * </p></li> 659 * 660 * <li><p> If a port number is given then a colon character 661 * ({@code ':'}) is appended, followed by the port number in decimal. 662 * </p></li> 663 * 664 * <li><p> If a path is given then it is appended. Any character not in 665 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 666 * categories, and not equal to the slash character ({@code '/'}) or the 667 * commercial-at character ({@code '@'}), is quoted. </p></li> 668 * 669 * <li><p> If a query is given then a question-mark character 670 * ({@code '?'}) is appended, followed by the query. Any character that 671 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 672 * </p></li> 673 * 674 * <li><p> Finally, if a fragment is given then a hash character 675 * ({@code '#'}) is appended, followed by the fragment. Any character 676 * that is not a legal URI character is quoted. 
</p></li> 677 * 678 * </ol> 679 * 680 * <p> The resulting URI string is then parsed as if by invoking the {@link 681 * #URI(String)} constructor and then invoking the {@link 682 * #parseServerAuthority()} method upon the result; this may cause a {@link 683 * URISyntaxException} to be thrown. </p> 684 * 685 * @param scheme Scheme name 686 * @param userInfo User name and authorization information 687 * @param host Host name 688 * @param port Port number 689 * @param path Path 690 * @param query Query 691 * @param fragment Fragment 692 * 693 * @throws URISyntaxException 694 * If both a scheme and a path are given but the path is relative, 695 * if the URI string constructed from the given components violates 696 * RFC 2396, or if the authority component of the string is 697 * present but cannot be parsed as a server-based authority 698 */ 699 public URI(String scheme, 700 String userInfo, String host, int port, 701 String path, String query, String fragment) 702 throws URISyntaxException 703 { 704 String s = toString(scheme, null, 705 null, userInfo, host, port, 706 path, query, fragment); 707 checkPath(s, scheme, path); 708 new Parser(s).parse(true); 709 } 710 711 /** 712 * Constructs a hierarchical URI from the given components. 713 * 714 * <p> If a scheme is given then the path, if also given, must either be 715 * empty or begin with a slash character ({@code '/'}). Otherwise a 716 * component of the new URI may be left undefined by passing {@code null} 717 * for the corresponding parameter. 718 * 719 * <p> This constructor first builds a URI string from the given components 720 * according to the rules specified in <a 721 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 722 * section 5.2, step 7: </p> 723 * 724 * <ol> 725 * 726 * <li><p> Initially, the result string is empty. </p></li> 727 * 728 * <li><p> If a scheme is given then it is appended to the result, 729 * followed by a colon character ({@code ':'}). 
</p></li> 730 * 731 * <li><p> If an authority is given then the string {@code "//"} is 732 * appended, followed by the authority. If the authority contains a 733 * literal IPv6 address then the address must be enclosed in square 734 * brackets ({@code '['} and {@code ']'}). Any character not in the 735 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 736 * categories, and not equal to the commercial-at character 737 * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> 738 * 739 * <li><p> If a path is given then it is appended. Any character not in 740 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 741 * categories, and not equal to the slash character ({@code '/'}) or the 742 * commercial-at character ({@code '@'}), is quoted. </p></li> 743 * 744 * <li><p> If a query is given then a question-mark character 745 * ({@code '?'}) is appended, followed by the query. Any character that 746 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 747 * </p></li> 748 * 749 * <li><p> Finally, if a fragment is given then a hash character 750 * ({@code '#'}) is appended, followed by the fragment. Any character 751 * that is not a legal URI character is quoted. </p></li> 752 * 753 * </ol> 754 * 755 * <p> The resulting URI string is then parsed as if by invoking the {@link 756 * #URI(String)} constructor and then invoking the {@link 757 * #parseServerAuthority()} method upon the result; this may cause a {@link 758 * URISyntaxException} to be thrown. 
</p> 759 * 760 * @param scheme Scheme name 761 * @param authority Authority 762 * @param path Path 763 * @param query Query 764 * @param fragment Fragment 765 * 766 * @throws URISyntaxException 767 * If both a scheme and a path are given but the path is relative, 768 * if the URI string constructed from the given components violates 769 * RFC 2396, or if the authority component of the string is 770 * present but cannot be parsed as a server-based authority 771 */ 772 public URI(String scheme, 773 String authority, 774 String path, String query, String fragment) 775 throws URISyntaxException 776 { 777 String s = toString(scheme, null, 778 authority, null, null, -1, 779 path, query, fragment); 780 checkPath(s, scheme, path); 781 new Parser(s).parse(false); 782 } 783 784 /** 785 * Constructs a hierarchical URI from the given components. 786 * 787 * <p> A component may be left undefined by passing {@code null}. 788 * 789 * <p> This convenience constructor works as if by invoking the 790 * seven-argument constructor as follows: 791 * 792 * <blockquote> 793 * {@code new} {@link #URI(String, String, String, int, String, String, String) 794 * URI}{@code (scheme, null, host, -1, path, null, fragment);} 795 * </blockquote> 796 * 797 * @param scheme Scheme name 798 * @param host Host name 799 * @param path Path 800 * @param fragment Fragment 801 * 802 * @throws URISyntaxException 803 * If the URI string constructed from the given components 804 * violates RFC 2396 805 */ 806 public URI(String scheme, String host, String path, String fragment) 807 throws URISyntaxException 808 { 809 this(scheme, null, host, -1, path, null, fragment); 810 } 811 812 /** 813 * Constructs a URI from the given components. 814 * 815 * <p> A component may be left undefined by passing {@code null}. 816 * 817 * <p> This constructor first builds a URI in string form using the given 818 * components as follows: </p> 819 * 820 * <ol> 821 * 822 * <li><p> Initially, the result string is empty. 
</p></li> 823 * 824 * <li><p> If a scheme is given then it is appended to the result, 825 * followed by a colon character ({@code ':'}). </p></li> 826 * 827 * <li><p> If a scheme-specific part is given then it is appended. Any 828 * character that is not a <a href="#legal-chars">legal URI character</a> 829 * is <a href="#quote">quoted</a>. </p></li> 830 * 831 * <li><p> Finally, if a fragment is given then a hash character 832 * ({@code '#'}) is appended to the string, followed by the fragment. 833 * Any character that is not a legal URI character is quoted. </p></li> 834 * 835 * </ol> 836 * 837 * <p> The resulting URI string is then parsed in order to create the new 838 * URI instance as if by invoking the {@link #URI(String)} constructor; 839 * this may cause a {@link URISyntaxException} to be thrown. </p> 840 * 841 * @param scheme Scheme name 842 * @param ssp Scheme-specific part 843 * @param fragment Fragment 844 * 845 * @throws URISyntaxException 846 * If the URI string constructed from the given components 847 * violates RFC 2396 848 */ 849 public URI(String scheme, String ssp, String fragment) 850 throws URISyntaxException 851 { 852 new Parser(toString(scheme, ssp, 853 null, null, null, -1, 854 null, null, fragment)) 855 .parse(false); 856 } 857 858 /** 859 * Constructs a simple URI consisting of only a scheme and a pre-validated 860 * path. Provides a fast-path for some internal cases. 861 */ 862 URI(String scheme, String path) { 863 assert validSchemeAndPath(scheme, path); 864 this.scheme = scheme; 865 this.path = path; 866 } 867 868 private static boolean validSchemeAndPath(String scheme, String path) { 869 try { 870 URI u = new URI(scheme + ":" + path); 871 return scheme.equals(u.scheme) && path.equals(u.path); 872 } catch (URISyntaxException e) { 873 return false; 874 } 875 } 876 877 /** 878 * Creates a URI by parsing the given string. 
879 * 880 * <p> This convenience factory method works as if by invoking the {@link 881 * #URI(String)} constructor; any {@link URISyntaxException} thrown by the 882 * constructor is caught and wrapped in a new {@link 883 * IllegalArgumentException} object, which is then thrown. 884 * 885 * <p> This method is provided for use in situations where it is known that 886 * the given string is a legal URI, for example for URI constants declared 887 * within a program, and so it would be considered a programming error 888 * for the string not to parse as such. The constructors, which throw 889 * {@link URISyntaxException} directly, should be used in situations where a 890 * URI is being constructed from user input or from some other source that 891 * may be prone to errors. </p> 892 * 893 * @param str The string to be parsed into a URI 894 * @return The new URI 895 * 896 * @throws NullPointerException 897 * If {@code str} is {@code null} 898 * 899 * @throws IllegalArgumentException 900 * If the given string violates RFC 2396 901 */ 902 public static URI create(String str) { 903 try { 904 return new URI(str); 905 } catch (URISyntaxException x) { 906 throw new IllegalArgumentException(x.getMessage(), x); 907 } 908 } 909 910 911 // -- Operations -- 912 913 /** 914 * Attempts to parse this URI's authority component, if defined, into 915 * user-information, host, and port components. 916 * 917 * <p> If this URI's authority component has already been recognized as 918 * being server-based then it will already have been parsed into 919 * user-information, host, and port components. In this case, or if this 920 * URI has no authority component, this method simply returns this URI. 921 * 922 * <p> Otherwise this method attempts once more to parse the authority 923 * component into user-information, host, and port components, and throws 924 * an exception describing why the authority component could not be parsed 925 * in that way. 
926 * 927 * <p> This method is provided because the generic URI syntax specified in 928 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 929 * cannot always distinguish a malformed server-based authority from a 930 * legitimate registry-based authority. It must therefore treat some 931 * instances of the former as instances of the latter. The authority 932 * component in the URI string {@code "//foo:bar"}, for example, is not a 933 * legal server-based authority but it is legal as a registry-based 934 * authority. 935 * 936 * <p> In many common situations, for example when working URIs that are 937 * known to be either URNs or URLs, the hierarchical URIs being used will 938 * always be server-based. They therefore must either be parsed as such or 939 * treated as an error. In these cases a statement such as 940 * 941 * <blockquote> 942 * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} 943 * </blockquote> 944 * 945 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if 946 * it has an authority component, has a server-based authority with proper 947 * user-information, host, and port components. Invoking this method also 948 * ensures that if the authority could not be parsed in that way then an 949 * appropriate diagnostic message can be issued based upon the exception 950 * that is thrown. </p> 951 * 952 * @return A URI whose authority field has been parsed 953 * as a server-based authority 954 * 955 * @throws URISyntaxException 956 * If the authority component of this URI is defined 957 * but cannot be parsed as a server-based authority 958 * according to RFC 2396 959 */ 960 public URI parseServerAuthority() 961 throws URISyntaxException 962 { 963 // We could be clever and cache the error message and index from the 964 // exception thrown during the original parse, but that would require 965 // either more fields or a more-obscure representation. 
966 if ((host != null) || (authority == null)) 967 return this; 968 new Parser(toString()).parse(true); 969 return this; 970 } 971 972 /** 973 * Normalizes this URI's path. 974 * 975 * <p> If this URI is opaque, or if its path is already in normal form, 976 * then this URI is returned. Otherwise a new URI is constructed that is 977 * identical to this URI except that its path is computed by normalizing 978 * this URI's path in a manner consistent with <a 979 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 980 * section 5.2, step 6, sub-steps c through f; that is: 981 * </p> 982 * 983 * <ol> 984 * 985 * <li><p> All {@code "."} segments are removed. </p></li> 986 * 987 * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} 988 * segment then both of these segments are removed. This step is 989 * repeated until it is no longer applicable. </p></li> 990 * 991 * <li><p> If the path is relative, and if its first segment contains a 992 * colon character ({@code ':'}), then a {@code "."} segment is 993 * prepended. This prevents a relative URI with a path such as 994 * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a 995 * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. 996 * <b><i>(Deviation from RFC 2396)</i></b> </p></li> 997 * 998 * </ol> 999 * 1000 * <p> A normalized path will begin with one or more {@code ".."} segments 1001 * if there were insufficient non-{@code ".."} segments preceding them to 1002 * allow their removal. A normalized path will begin with a {@code "."} 1003 * segment if one was inserted by step 3 above. Otherwise, a normalized 1004 * path will not contain any {@code "."} or {@code ".."} segments. </p> 1005 * 1006 * @return A URI equivalent to this URI, 1007 * but whose path is in normal form 1008 */ 1009 public URI normalize() { 1010 return normalize(this); 1011 } 1012 1013 /** 1014 * Resolves the given URI against this URI. 
1015 * 1016 * <p> If the given URI is already absolute, or if this URI is opaque, then 1017 * the given URI is returned. 1018 * 1019 * <p><a id="resolve-frag"></a> If the given URI's fragment component is 1020 * defined, its path component is empty, and its scheme, authority, and 1021 * query components are undefined, then a URI with the given fragment but 1022 * with all other components equal to those of this URI is returned. This 1023 * allows a URI representing a standalone fragment reference, such as 1024 * {@code "#foo"}, to be usefully resolved against a base URI. 1025 * 1026 * <p> Otherwise this method constructs a new hierarchical URI in a manner 1027 * consistent with <a 1028 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1029 * section 5.2; that is: </p> 1030 * 1031 * <ol> 1032 * 1033 * <li><p> A new URI is constructed with this URI's scheme and the given 1034 * URI's query and fragment components. </p></li> 1035 * 1036 * <li><p> If the given URI has an authority component then the new URI's 1037 * authority and path are taken from the given URI. </p></li> 1038 * 1039 * <li><p> Otherwise the new URI's authority component is copied from 1040 * this URI, and its path is computed as follows: </p> 1041 * 1042 * <ol> 1043 * 1044 * <li><p> If the given URI's path is absolute then the new URI's path 1045 * is taken from the given URI. </p></li> 1046 * 1047 * <li><p> Otherwise the given URI's path is relative, and so the new 1048 * URI's path is computed by resolving the path of the given URI 1049 * against the path of this URI. This is done by concatenating all but 1050 * the last segment of this URI's path, if any, with the given URI's 1051 * path and then normalizing the result as if by invoking the {@link 1052 * #normalize() normalize} method. </p></li> 1053 * 1054 * </ol></li> 1055 * 1056 * </ol> 1057 * 1058 * <p> The result of this method is absolute if, and only if, either this 1059 * URI is absolute or the given URI is absolute. 
</p> 1060 * 1061 * @param uri The URI to be resolved against this URI 1062 * @return The resulting URI 1063 * 1064 * @throws NullPointerException 1065 * If {@code uri} is {@code null} 1066 */ 1067 public URI resolve(URI uri) { 1068 return resolve(this, uri); 1069 } 1070 1071 /** 1072 * Constructs a new URI by parsing the given string and then resolving it 1073 * against this URI. 1074 * 1075 * <p> This convenience method works as if invoking it were equivalent to 1076 * evaluating the expression {@link #resolve(java.net.URI) 1077 * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p> 1078 * 1079 * @param str The string to be parsed into a URI 1080 * @return The resulting URI 1081 * 1082 * @throws NullPointerException 1083 * If {@code str} is {@code null} 1084 * 1085 * @throws IllegalArgumentException 1086 * If the given string violates RFC 2396 1087 */ 1088 public URI resolve(String str) { 1089 return resolve(URI.create(str)); 1090 } 1091 1092 /** 1093 * Relativizes the given URI against this URI. 1094 * 1095 * <p> The relativization of the given URI against this URI is computed as 1096 * follows: </p> 1097 * 1098 * <ol> 1099 * 1100 * <li><p> If either this URI or the given URI are opaque, or if the 1101 * scheme and authority components of the two URIs are not identical, or 1102 * if the path of this URI is not a prefix of the path of the given URI, 1103 * then the given URI is returned. </p></li> 1104 * 1105 * <li><p> Otherwise a new relative hierarchical URI is constructed with 1106 * query and fragment components taken from the given URI and with a path 1107 * component computed by removing this URI's path from the beginning of 1108 * the given URI's path. 
</p></li> 1109 * 1110 * </ol> 1111 * 1112 * @param uri The URI to be relativized against this URI 1113 * @return The resulting URI 1114 * 1115 * @throws NullPointerException 1116 * If {@code uri} is {@code null} 1117 */ 1118 public URI relativize(URI uri) { 1119 return relativize(this, uri); 1120 } 1121 1122 /** 1123 * Constructs a URL from this URI. 1124 * 1125 * <p> This convenience method works as if invoking it were equivalent to 1126 * evaluating the expression {@code new URL(this.toString())} after 1127 * first checking that this URI is absolute. </p> 1128 * 1129 * @return A URL constructed from this URI 1130 * 1131 * @throws IllegalArgumentException 1132 * If this URL is not absolute 1133 * 1134 * @throws MalformedURLException 1135 * If a protocol handler for the URL could not be found, 1136 * or if some other error occurred while constructing the URL 1137 */ 1138 public URL toURL() throws MalformedURLException { 1139 return URL.fromURI(this); 1140 } 1141 1142 // -- Component access methods -- 1143 1144 /** 1145 * Returns the scheme component of this URI. 1146 * 1147 * <p> The scheme component of a URI, if defined, only contains characters 1148 * in the <i>alphanum</i> category and in the string {@code "-.+"}. A 1149 * scheme always starts with an <i>alpha</i> character. <p> 1150 * 1151 * The scheme component of a URI cannot contain escaped octets, hence this 1152 * method does not perform any decoding. 1153 * 1154 * @return The scheme component of this URI, 1155 * or {@code null} if the scheme is undefined 1156 */ 1157 public String getScheme() { 1158 return scheme; 1159 } 1160 1161 /** 1162 * Tells whether or not this URI is absolute. 1163 * 1164 * <p> A URI is absolute if, and only if, it has a scheme component. </p> 1165 * 1166 * @return {@code true} if, and only if, this URI is absolute 1167 */ 1168 public boolean isAbsolute() { 1169 return scheme != null; 1170 } 1171 1172 /** 1173 * Tells whether or not this URI is opaque. 
1174 * 1175 * <p> A URI is opaque if, and only if, it is absolute and its 1176 * scheme-specific part does not begin with a slash character ('/'). 1177 * An opaque URI has a scheme, a scheme-specific part, and possibly 1178 * a fragment; all other components are undefined. </p> 1179 * 1180 * @return {@code true} if, and only if, this URI is opaque 1181 */ 1182 public boolean isOpaque() { 1183 return path == null; 1184 } 1185 1186 /** 1187 * Returns the raw scheme-specific part of this URI. The scheme-specific 1188 * part is never undefined, though it may be empty. 1189 * 1190 * <p> The scheme-specific part of a URI only contains legal URI 1191 * characters. </p> 1192 * 1193 * @return The raw scheme-specific part of this URI 1194 * (never {@code null}) 1195 */ 1196 public String getRawSchemeSpecificPart() { 1197 String part = schemeSpecificPart; 1198 if (part != null) { 1199 return part; 1200 } 1201 1202 String s = string; 1203 if (s != null) { 1204 // if string is defined, components will have been parsed 1205 int start = 0; 1206 int end = s.length(); 1207 if (scheme != null) { 1208 start = scheme.length() + 1; 1209 } 1210 if (fragment != null) { 1211 end -= fragment.length() + 1; 1212 } 1213 if (path != null && path.length() == end - start) { 1214 part = path; 1215 } else { 1216 part = s.substring(start, end); 1217 } 1218 } else { 1219 StringBuilder sb = new StringBuilder(); 1220 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), 1221 host, port, getPath(), getQuery()); 1222 part = sb.toString(); 1223 } 1224 return schemeSpecificPart = part; 1225 } 1226 1227 /** 1228 * Returns the decoded scheme-specific part of this URI. 1229 * 1230 * <p> The string returned by this method is equal to that returned by the 1231 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method 1232 * except that all sequences of escaped octets are <a 1233 * href="#decode">decoded</a>. 
</p> 1234 * 1235 * @return The decoded scheme-specific part of this URI 1236 * (never {@code null}) 1237 */ 1238 public String getSchemeSpecificPart() { 1239 String part = decodedSchemeSpecificPart; 1240 if (part == null) { 1241 decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart()); 1242 } 1243 return part; 1244 } 1245 1246 /** 1247 * Returns the raw authority component of this URI. 1248 * 1249 * <p> The authority component of a URI, if defined, only contains the 1250 * commercial-at character ({@code '@'}) and characters in the 1251 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> 1252 * categories. If the authority is server-based then it is further 1253 * constrained to have valid user-information, host, and port 1254 * components. </p> 1255 * 1256 * @return The raw authority component of this URI, 1257 * or {@code null} if the authority is undefined 1258 */ 1259 public String getRawAuthority() { 1260 return authority; 1261 } 1262 1263 /** 1264 * Returns the decoded authority component of this URI. 1265 * 1266 * <p> The string returned by this method is equal to that returned by the 1267 * {@link #getRawAuthority() getRawAuthority} method except that all 1268 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1269 * 1270 * @return The decoded authority component of this URI, 1271 * or {@code null} if the authority is undefined 1272 */ 1273 public String getAuthority() { 1274 String auth = decodedAuthority; 1275 if ((auth == null) && (authority != null)) { 1276 decodedAuthority = auth = decode(authority); 1277 } 1278 return auth; 1279 } 1280 1281 /** 1282 * Returns the raw user-information component of this URI. 1283 * 1284 * <p> The user-information component of a URI, if defined, only contains 1285 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and 1286 * <i>other</i> categories. 
</p> 1287 * 1288 * @return The raw user-information component of this URI, 1289 * or {@code null} if the user information is undefined 1290 */ 1291 public String getRawUserInfo() { 1292 return userInfo; 1293 } 1294 1295 /** 1296 * Returns the decoded user-information component of this URI. 1297 * 1298 * <p> The string returned by this method is equal to that returned by the 1299 * {@link #getRawUserInfo() getRawUserInfo} method except that all 1300 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1301 * 1302 * @return The decoded user-information component of this URI, 1303 * or {@code null} if the user information is undefined 1304 */ 1305 public String getUserInfo() { 1306 String user = decodedUserInfo; 1307 if ((user == null) && (userInfo != null)) { 1308 decodedUserInfo = user = decode(userInfo); 1309 } 1310 return user; 1311 } 1312 1313 /** 1314 * Returns the host component of this URI. 1315 * 1316 * <p> The host component of a URI, if defined, will have one of the 1317 * following forms: </p> 1318 * 1319 * <ul> 1320 * 1321 * <li><p> A domain name consisting of one or more <i>labels</i> 1322 * separated by period characters ({@code '.'}), optionally followed by 1323 * a period character. Each label consists of <i>alphanum</i> characters 1324 * as well as hyphen characters ({@code '-'}), though hyphens never 1325 * occur as the first or last characters in a label. The rightmost 1326 * label of a domain name consisting of two or more labels, begins 1327 * with an <i>alpha</i> character. </li> 1328 * 1329 * <li><p> A dotted-quad IPv4 address of the form 1330 * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, 1331 * where no <i>digit</i> sequence is longer than three characters and no 1332 * sequence has a value larger than 255. 
</p></li> 1333 * 1334 * <li><p> An IPv6 address enclosed in square brackets ({@code '['} and 1335 * {@code ']'}) and consisting of hexadecimal digits, colon characters 1336 * ({@code ':'}), and possibly an embedded IPv4 address. The full 1337 * syntax of IPv6 addresses is specified in <a 1338 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 1339 * Addressing Architecture</i></a>. </p></li> 1340 * 1341 * </ul> 1342 * 1343 * The host component of a URI cannot contain escaped octets, hence this 1344 * method does not perform any decoding. 1345 * 1346 * @return The host component of this URI, 1347 * or {@code null} if the host is undefined 1348 */ 1349 public String getHost() { 1350 return host; 1351 } 1352 1353 /** 1354 * Returns the port number of this URI. 1355 * 1356 * <p> The port component of a URI, if defined, is a non-negative 1357 * integer. </p> 1358 * 1359 * @return The port component of this URI, 1360 * or {@code -1} if the port is undefined 1361 */ 1362 public int getPort() { 1363 return port; 1364 } 1365 1366 /** 1367 * Returns the raw path component of this URI. 1368 * 1369 * <p> The path component of a URI, if defined, only contains the slash 1370 * character ({@code '/'}), the commercial-at character ({@code '@'}), 1371 * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, 1372 * and <i>other</i> categories. </p> 1373 * 1374 * @return The path component of this URI, 1375 * or {@code null} if the path is undefined 1376 */ 1377 public String getRawPath() { 1378 return path; 1379 } 1380 1381 /** 1382 * Returns the decoded path component of this URI. 1383 * 1384 * <p> The string returned by this method is equal to that returned by the 1385 * {@link #getRawPath() getRawPath} method except that all sequences of 1386 * escaped octets are <a href="#decode">decoded</a>. 
</p> 1387 * 1388 * @return The decoded path component of this URI, 1389 * or {@code null} if the path is undefined 1390 */ 1391 public String getPath() { 1392 String decoded = decodedPath; 1393 if ((decoded == null) && (path != null)) { 1394 decodedPath = decoded = decode(path); 1395 } 1396 return decoded; 1397 } 1398 1399 /** 1400 * Returns the raw query component of this URI. 1401 * 1402 * <p> The query component of a URI, if defined, only contains legal URI 1403 * characters. </p> 1404 * 1405 * @return The raw query component of this URI, 1406 * or {@code null} if the query is undefined 1407 */ 1408 public String getRawQuery() { 1409 return query; 1410 } 1411 1412 /** 1413 * Returns the decoded query component of this URI. 1414 * 1415 * <p> The string returned by this method is equal to that returned by the 1416 * {@link #getRawQuery() getRawQuery} method except that all sequences of 1417 * escaped octets are <a href="#decode">decoded</a>. </p> 1418 * 1419 * @return The decoded query component of this URI, 1420 * or {@code null} if the query is undefined 1421 */ 1422 public String getQuery() { 1423 String decoded = decodedQuery; 1424 if ((decoded == null) && (query != null)) { 1425 decodedQuery = decoded = decode(query, false); 1426 } 1427 return decoded; 1428 } 1429 1430 /** 1431 * Returns the raw fragment component of this URI. 1432 * 1433 * <p> The fragment component of a URI, if defined, only contains legal URI 1434 * characters. </p> 1435 * 1436 * @return The raw fragment component of this URI, 1437 * or {@code null} if the fragment is undefined 1438 */ 1439 public String getRawFragment() { 1440 return fragment; 1441 } 1442 1443 /** 1444 * Returns the decoded fragment component of this URI. 1445 * 1446 * <p> The string returned by this method is equal to that returned by the 1447 * {@link #getRawFragment() getRawFragment} method except that all 1448 * sequences of escaped octets are <a href="#decode">decoded</a>. 
</p> 1449 * 1450 * @return The decoded fragment component of this URI, 1451 * or {@code null} if the fragment is undefined 1452 */ 1453 public String getFragment() { 1454 String decoded = decodedFragment; 1455 if ((decoded == null) && (fragment != null)) { 1456 decodedFragment = decoded = decode(fragment, false); 1457 } 1458 return decoded; 1459 } 1460 1461 1462 // -- Equality, comparison, hash code, toString, and serialization -- 1463 1464 /** 1465 * Tests this URI for equality with another object. 1466 * 1467 * <p> If the given object is not a URI then this method immediately 1468 * returns {@code false}. 1469 * 1470 * <p> For two URIs to be considered equal requires that either both are 1471 * opaque or both are hierarchical. Their schemes must either both be 1472 * undefined or else be equal without regard to case. Their fragments 1473 * must either both be undefined or else be equal. 1474 * 1475 * <p> For two opaque URIs to be considered equal, their scheme-specific 1476 * parts must be equal. 1477 * 1478 * <p> For two hierarchical URIs to be considered equal, their paths must 1479 * be equal and their queries must either both be undefined or else be 1480 * equal. Their authorities must either both be undefined, or both be 1481 * registry-based, or both be server-based. If their authorities are 1482 * defined and are registry-based, then they must be equal. If their 1483 * authorities are defined and are server-based, then their hosts must be 1484 * equal without regard to case, their port numbers must be equal, and 1485 * their user-information components must be equal. 1486 * 1487 * <p> When testing the user-information, path, query, fragment, authority, 1488 * or scheme-specific parts of two URIs for equality, the raw forms rather 1489 * than the encoded forms of these components are compared and the 1490 * hexadecimal digits of escaped octets are compared without regard to 1491 * case. 
1492 * 1493 * <p> This method satisfies the general contract of the {@link 1494 * java.lang.Object#equals(Object) Object.equals} method. </p> 1495 * 1496 * @param ob The object to which this object is to be compared 1497 * 1498 * @return {@code true} if, and only if, the given object is a URI that 1499 * is identical to this URI 1500 */ 1501 public boolean equals(Object ob) { 1502 if (ob == this) 1503 return true; 1504 if (!(ob instanceof URI)) 1505 return false; 1506 URI that = (URI)ob; 1507 if (this.isOpaque() != that.isOpaque()) return false; 1508 if (!equalIgnoringCase(this.scheme, that.scheme)) return false; 1509 if (!equal(this.fragment, that.fragment)) return false; 1510 1511 // Opaque 1512 if (this.isOpaque()) 1513 return equal(this.schemeSpecificPart, that.schemeSpecificPart); 1514 1515 // Hierarchical 1516 if (!equal(this.path, that.path)) return false; 1517 if (!equal(this.query, that.query)) return false; 1518 1519 // Authorities 1520 if (this.authority == that.authority) return true; 1521 if (this.host != null) { 1522 // Server-based 1523 if (!equal(this.userInfo, that.userInfo)) return false; 1524 if (!equalIgnoringCase(this.host, that.host)) return false; 1525 if (this.port != that.port) return false; 1526 } else if (this.authority != null) { 1527 // Registry-based 1528 if (!equal(this.authority, that.authority)) return false; 1529 } else if (this.authority != that.authority) { 1530 return false; 1531 } 1532 1533 return true; 1534 } 1535 1536 /** 1537 * Returns a hash-code value for this URI. The hash code is based upon all 1538 * of the URI's components, and satisfies the general contract of the 1539 * {@link java.lang.Object#hashCode() Object.hashCode} method. 
1540 * 1541 * @return A hash-code value for this URI 1542 */ 1543 public int hashCode() { 1544 int h = hash; 1545 if (h == 0) { 1546 h = hashIgnoringCase(0, scheme); 1547 h = hash(h, fragment); 1548 if (isOpaque()) { 1549 h = hash(h, schemeSpecificPart); 1550 } else { 1551 h = hash(h, path); 1552 h = hash(h, query); 1553 if (host != null) { 1554 h = hash(h, userInfo); 1555 h = hashIgnoringCase(h, host); 1556 h += 1949 * port; 1557 } else { 1558 h = hash(h, authority); 1559 } 1560 } 1561 if (h != 0) { 1562 hash = h; 1563 } 1564 } 1565 return h; 1566 } 1567 1568 /** 1569 * Compares this URI to another object, which must be a URI. 1570 * 1571 * <p> When comparing corresponding components of two URIs, if one 1572 * component is undefined but the other is defined then the first is 1573 * considered to be less than the second. Unless otherwise noted, string 1574 * components are ordered according to their natural, case-sensitive 1575 * ordering as defined by the {@link java.lang.String#compareTo(Object) 1576 * String.compareTo} method. String components that are subject to 1577 * encoding are compared by comparing their raw forms rather than their 1578 * encoded forms. 1579 * 1580 * <p> The ordering of URIs is defined as follows: </p> 1581 * 1582 * <ul> 1583 * 1584 * <li><p> Two URIs with different schemes are ordered according the 1585 * ordering of their schemes, without regard to case. </p></li> 1586 * 1587 * <li><p> A hierarchical URI is considered to be less than an opaque URI 1588 * with an identical scheme. </p></li> 1589 * 1590 * <li><p> Two opaque URIs with identical schemes are ordered according 1591 * to the ordering of their scheme-specific parts. </p></li> 1592 * 1593 * <li><p> Two opaque URIs with identical schemes and scheme-specific 1594 * parts are ordered according to the ordering of their 1595 * fragments. 
</p></li> 1596 * 1597 * <li><p> Two hierarchical URIs with identical schemes are ordered 1598 * according to the ordering of their authority components: </p> 1599 * 1600 * <ul> 1601 * 1602 * <li><p> If both authority components are server-based then the URIs 1603 * are ordered according to their user-information components; if these 1604 * components are identical then the URIs are ordered according to the 1605 * ordering of their hosts, without regard to case; if the hosts are 1606 * identical then the URIs are ordered according to the ordering of 1607 * their ports. </p></li> 1608 * 1609 * <li><p> If one or both authority components are registry-based then 1610 * the URIs are ordered according to the ordering of their authority 1611 * components. </p></li> 1612 * 1613 * </ul></li> 1614 * 1615 * <li><p> Finally, two hierarchical URIs with identical schemes and 1616 * authority components are ordered according to the ordering of their 1617 * paths; if their paths are identical then they are ordered according to 1618 * the ordering of their queries; if the queries are identical then they 1619 * are ordered according to the order of their fragments. </p></li> 1620 * 1621 * </ul> 1622 * 1623 * <p> This method satisfies the general contract of the {@link 1624 * java.lang.Comparable#compareTo(Object) Comparable.compareTo} 1625 * method. 
</p> 1626 * 1627 * @param that 1628 * The object to which this URI is to be compared 1629 * 1630 * @return A negative integer, zero, or a positive integer as this URI is 1631 * less than, equal to, or greater than the given URI 1632 * 1633 * @throws ClassCastException 1634 * If the given object is not a URI 1635 */ 1636 public int compareTo(URI that) { 1637 int c; 1638 1639 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) 1640 return c; 1641 1642 if (this.isOpaque()) { 1643 if (that.isOpaque()) { 1644 // Both opaque 1645 if ((c = compare(this.schemeSpecificPart, 1646 that.schemeSpecificPart)) != 0) 1647 return c; 1648 return compare(this.fragment, that.fragment); 1649 } 1650 return +1; // Opaque > hierarchical 1651 } else if (that.isOpaque()) { 1652 return -1; // Hierarchical < opaque 1653 } 1654 1655 // Hierarchical 1656 if ((this.host != null) && (that.host != null)) { 1657 // Both server-based 1658 if ((c = compare(this.userInfo, that.userInfo)) != 0) 1659 return c; 1660 if ((c = compareIgnoringCase(this.host, that.host)) != 0) 1661 return c; 1662 if ((c = this.port - that.port) != 0) 1663 return c; 1664 } else { 1665 // If one or both authorities are registry-based then we simply 1666 // compare them in the usual, case-sensitive way. If one is 1667 // registry-based and one is server-based then the strings are 1668 // guaranteed to be unequal, hence the comparison will never return 1669 // zero and the compareTo and equals methods will remain 1670 // consistent. 1671 if ((c = compare(this.authority, that.authority)) != 0) return c; 1672 } 1673 1674 if ((c = compare(this.path, that.path)) != 0) return c; 1675 if ((c = compare(this.query, that.query)) != 0) return c; 1676 return compare(this.fragment, that.fragment); 1677 } 1678 1679 /** 1680 * Returns the content of this URI as a string. 
1681 * 1682 * <p> If this URI was created by invoking one of the constructors in this 1683 * class then a string equivalent to the original input string, or to the 1684 * string computed from the originally-given components, as appropriate, is 1685 * returned. Otherwise this URI was created by normalization, resolution, 1686 * or relativization, and so a string is constructed from this URI's 1687 * components according to the rules specified in <a 1688 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1689 * section 5.2, step 7. </p> 1690 * 1691 * @return The string form of this URI 1692 */ 1693 public String toString() { 1694 String s = string; 1695 if (s == null) { 1696 s = defineString(); 1697 } 1698 return s; 1699 } 1700 1701 private String defineString() { 1702 String s = string; 1703 if (s != null) { 1704 return s; 1705 } 1706 1707 StringBuilder sb = new StringBuilder(); 1708 if (scheme != null) { 1709 sb.append(scheme); 1710 sb.append(':'); 1711 } 1712 if (isOpaque()) { 1713 sb.append(schemeSpecificPart); 1714 } else { 1715 if (host != null) { 1716 sb.append("//"); 1717 if (userInfo != null) { 1718 sb.append(userInfo); 1719 sb.append('@'); 1720 } 1721 boolean needBrackets = ((host.indexOf(':') >= 0) 1722 && !host.startsWith("[") 1723 && !host.endsWith("]")); 1724 if (needBrackets) sb.append('['); 1725 sb.append(host); 1726 if (needBrackets) sb.append(']'); 1727 if (port != -1) { 1728 sb.append(':'); 1729 sb.append(port); 1730 } 1731 } else if (authority != null) { 1732 sb.append("//"); 1733 sb.append(authority); 1734 } 1735 if (path != null) 1736 sb.append(path); 1737 if (query != null) { 1738 sb.append('?'); 1739 sb.append(query); 1740 } 1741 } 1742 if (fragment != null) { 1743 sb.append('#'); 1744 sb.append(fragment); 1745 } 1746 return string = sb.toString(); 1747 } 1748 1749 /** 1750 * Returns the content of this URI as a US-ASCII string. 
1751 * 1752 * <p> If this URI does not contain any characters in the <i>other</i> 1753 * category then an invocation of this method will return the same value as 1754 * an invocation of the {@link #toString() toString} method. Otherwise 1755 * this method works as if by invoking that method and then <a 1756 * href="#encode">encoding</a> the result. </p> 1757 * 1758 * @return The string form of this URI, encoded as needed 1759 * so that it only contains characters in the US-ASCII 1760 * charset 1761 */ 1762 public String toASCIIString() { 1763 return encode(toString()); 1764 } 1765 1766 1767 // -- Serialization support -- 1768 1769 /** 1770 * Saves the content of this URI to the given serial stream. 1771 * 1772 * <p> The only serializable field of a URI instance is its {@code string} 1773 * field. That field is given a value, if it does not have one already, 1774 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} 1775 * method of the given object-output stream is invoked. </p> 1776 * 1777 * @param os The object-output stream to which this object 1778 * is to be written 1779 */ 1780 private void writeObject(ObjectOutputStream os) 1781 throws IOException 1782 { 1783 defineString(); 1784 os.defaultWriteObject(); // Writes the string field only 1785 } 1786 1787 /** 1788 * Reconstitutes a URI from the given serial stream. 1789 * 1790 * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is 1791 * invoked to read the value of the {@code string} field. The result is 1792 * then parsed in the usual way. 
1793 * 1794 * @param is The object-input stream from which this object 1795 * is being read 1796 */ 1797 private void readObject(ObjectInputStream is) 1798 throws ClassNotFoundException, IOException 1799 { 1800 port = -1; // Argh 1801 is.defaultReadObject(); 1802 try { 1803 new Parser(string).parse(false); 1804 } catch (URISyntaxException x) { 1805 IOException y = new InvalidObjectException("Invalid URI"); 1806 y.initCause(x); 1807 throw y; 1808 } 1809 } 1810 1811 1812 // -- End of public methods -- 1813 1814 1815 // -- Utility methods for string-field comparison and hashing -- 1816 1817 // These methods return appropriate values for null string arguments, 1818 // thereby simplifying the equals, hashCode, and compareTo methods. 1819 // 1820 // The case-ignoring methods should only be applied to strings whose 1821 // characters are all known to be US-ASCII. Because of this restriction, 1822 // these methods are faster than the similar methods in the String class. 1823 1824 // US-ASCII only 1825 private static int toLower(char c) { 1826 if ((c >= 'A') && (c <= 'Z')) 1827 return c + ('a' - 'A'); 1828 return c; 1829 } 1830 1831 // US-ASCII only 1832 private static int toUpper(char c) { 1833 if ((c >= 'a') && (c <= 'z')) 1834 return c - ('a' - 'A'); 1835 return c; 1836 } 1837 1838 private static boolean equal(String s, String t) { 1839 if (s == t) return true; 1840 if ((s != null) && (t != null)) { 1841 if (s.length() != t.length()) 1842 return false; 1843 if (s.indexOf('%') < 0) 1844 return s.equals(t); 1845 int n = s.length(); 1846 for (int i = 0; i < n;) { 1847 char c = s.charAt(i); 1848 char d = t.charAt(i); 1849 if (c != '%') { 1850 if (c != d) 1851 return false; 1852 i++; 1853 continue; 1854 } 1855 if (d != '%') 1856 return false; 1857 i++; 1858 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1859 return false; 1860 i++; 1861 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1862 return false; 1863 i++; 1864 } 1865 return true; 1866 } 1867 return false; 1868 } 
1869 1870 // US-ASCII only 1871 private static boolean equalIgnoringCase(String s, String t) { 1872 if (s == t) return true; 1873 if ((s != null) && (t != null)) { 1874 int n = s.length(); 1875 if (t.length() != n) 1876 return false; 1877 for (int i = 0; i < n; i++) { 1878 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1879 return false; 1880 } 1881 return true; 1882 } 1883 return false; 1884 } 1885 1886 private static int hash(int hash, String s) { 1887 if (s == null) return hash; 1888 return s.indexOf('%') < 0 ? hash * 127 + s.hashCode() 1889 : normalizedHash(hash, s); 1890 } 1891 1892 1893 private static int normalizedHash(int hash, String s) { 1894 int h = 0; 1895 for (int index = 0; index < s.length(); index++) { 1896 char ch = s.charAt(index); 1897 h = 31 * h + ch; 1898 if (ch == '%') { 1899 /* 1900 * Process the next two encoded characters 1901 */ 1902 for (int i = index + 1; i < index + 3; i++) 1903 h = 31 * h + toUpper(s.charAt(i)); 1904 index += 2; 1905 } 1906 } 1907 return hash * 127 + h; 1908 } 1909 1910 // US-ASCII only 1911 private static int hashIgnoringCase(int hash, String s) { 1912 if (s == null) return hash; 1913 int h = hash; 1914 int n = s.length(); 1915 for (int i = 0; i < n; i++) 1916 h = 31 * h + toLower(s.charAt(i)); 1917 return h; 1918 } 1919 1920 private static int compare(String s, String t) { 1921 if (s == t) return 0; 1922 if (s != null) { 1923 if (t != null) 1924 return s.compareTo(t); 1925 else 1926 return +1; 1927 } else { 1928 return -1; 1929 } 1930 } 1931 1932 // US-ASCII only 1933 private static int compareIgnoringCase(String s, String t) { 1934 if (s == t) return 0; 1935 if (s != null) { 1936 if (t != null) { 1937 int sn = s.length(); 1938 int tn = t.length(); 1939 int n = sn < tn ? 
sn : tn; 1940 for (int i = 0; i < n; i++) { 1941 int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1942 if (c != 0) 1943 return c; 1944 } 1945 return sn - tn; 1946 } 1947 return +1; 1948 } else { 1949 return -1; 1950 } 1951 } 1952 1953 1954 // -- String construction -- 1955 1956 // If a scheme is given then the path, if given, must be absolute 1957 // 1958 private static void checkPath(String s, String scheme, String path) 1959 throws URISyntaxException 1960 { 1961 if (scheme != null) { 1962 if ((path != null) 1963 && ((path.length() > 0) && (path.charAt(0) != '/'))) 1964 throw new URISyntaxException(s, 1965 "Relative path in absolute URI"); 1966 } 1967 } 1968 1969 private void appendAuthority(StringBuilder sb, 1970 String authority, 1971 String userInfo, 1972 String host, 1973 int port) 1974 { 1975 if (host != null) { 1976 sb.append("//"); 1977 if (userInfo != null) { 1978 sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); 1979 sb.append('@'); 1980 } 1981 boolean needBrackets = ((host.indexOf(':') >= 0) 1982 && !host.startsWith("[") 1983 && !host.endsWith("]")); 1984 if (needBrackets) sb.append('['); 1985 sb.append(host); 1986 if (needBrackets) sb.append(']'); 1987 if (port != -1) { 1988 sb.append(':'); 1989 sb.append(port); 1990 } 1991 } else if (authority != null) { 1992 sb.append("//"); 1993 if (authority.startsWith("[")) { 1994 // authority should (but may not) contain an embedded IPv6 address 1995 int end = authority.indexOf(']'); 1996 String doquote = authority, dontquote = ""; 1997 if (end != -1 && authority.indexOf(':') != -1) { 1998 // the authority contains an IPv6 address 1999 if (end == authority.length()) { 2000 dontquote = authority; 2001 doquote = ""; 2002 } else { 2003 dontquote = authority.substring(0 , end + 1); 2004 doquote = authority.substring(end + 1); 2005 } 2006 } 2007 sb.append(dontquote); 2008 sb.append(quote(doquote, 2009 L_REG_NAME | L_SERVER, 2010 H_REG_NAME | H_SERVER)); 2011 } else { 2012 sb.append(quote(authority, 2013 
L_REG_NAME | L_SERVER, 2014 H_REG_NAME | H_SERVER)); 2015 } 2016 } 2017 } 2018 2019 private void appendSchemeSpecificPart(StringBuilder sb, 2020 String opaquePart, 2021 String authority, 2022 String userInfo, 2023 String host, 2024 int port, 2025 String path, 2026 String query) 2027 { 2028 if (opaquePart != null) { 2029 /* check if SSP begins with an IPv6 address 2030 * because we must not quote a literal IPv6 address 2031 */ 2032 if (opaquePart.startsWith("//[")) { 2033 int end = opaquePart.indexOf(']'); 2034 if (end != -1 && opaquePart.indexOf(':')!=-1) { 2035 String doquote, dontquote; 2036 if (end == opaquePart.length()) { 2037 dontquote = opaquePart; 2038 doquote = ""; 2039 } else { 2040 dontquote = opaquePart.substring(0,end+1); 2041 doquote = opaquePart.substring(end+1); 2042 } 2043 sb.append (dontquote); 2044 sb.append(quote(doquote, L_URIC, H_URIC)); 2045 } 2046 } else { 2047 sb.append(quote(opaquePart, L_URIC, H_URIC)); 2048 } 2049 } else { 2050 appendAuthority(sb, authority, userInfo, host, port); 2051 if (path != null) 2052 sb.append(quote(path, L_PATH, H_PATH)); 2053 if (query != null) { 2054 sb.append('?'); 2055 sb.append(quote(query, L_URIC, H_URIC)); 2056 } 2057 } 2058 } 2059 2060 private void appendFragment(StringBuilder sb, String fragment) { 2061 if (fragment != null) { 2062 sb.append('#'); 2063 sb.append(quote(fragment, L_URIC, H_URIC)); 2064 } 2065 } 2066 2067 private String toString(String scheme, 2068 String opaquePart, 2069 String authority, 2070 String userInfo, 2071 String host, 2072 int port, 2073 String path, 2074 String query, 2075 String fragment) 2076 { 2077 StringBuilder sb = new StringBuilder(); 2078 if (scheme != null) { 2079 sb.append(scheme); 2080 sb.append(':'); 2081 } 2082 appendSchemeSpecificPart(sb, opaquePart, 2083 authority, userInfo, host, port, 2084 path, query); 2085 appendFragment(sb, fragment); 2086 return sb.toString(); 2087 } 2088 2089 // -- Normalization, resolution, and relativization -- 2090 2091 // RFC2396 5.2 
(6) 2092 private static String resolvePath(String base, String child, 2093 boolean absolute) 2094 { 2095 int i = base.lastIndexOf('/'); 2096 int cn = child.length(); 2097 String path = ""; 2098 2099 if (cn == 0) { 2100 // 5.2 (6a) 2101 if (i >= 0) 2102 path = base.substring(0, i + 1); 2103 } else { 2104 StringBuilder sb = new StringBuilder(base.length() + cn); 2105 // 5.2 (6a) 2106 if (i >= 0) 2107 sb.append(base, 0, i + 1); 2108 // 5.2 (6b) 2109 sb.append(child); 2110 path = sb.toString(); 2111 } 2112 2113 // 5.2 (6c-f) 2114 String np = normalize(path); 2115 2116 // 5.2 (6g): If the result is absolute but the path begins with "../", 2117 // then we simply leave the path as-is 2118 2119 return np; 2120 } 2121 2122 // RFC2396 5.2 2123 private static URI resolve(URI base, URI child) { 2124 // check if child if opaque first so that NPE is thrown 2125 // if child is null. 2126 if (child.isOpaque() || base.isOpaque()) 2127 return child; 2128 2129 // 5.2 (2): Reference to current document (lone fragment) 2130 if ((child.scheme == null) && (child.authority == null) 2131 && child.path.isEmpty() && (child.fragment != null) 2132 && (child.query == null)) { 2133 if ((base.fragment != null) 2134 && child.fragment.equals(base.fragment)) { 2135 return base; 2136 } 2137 URI ru = new URI(); 2138 ru.scheme = base.scheme; 2139 ru.authority = base.authority; 2140 ru.userInfo = base.userInfo; 2141 ru.host = base.host; 2142 ru.port = base.port; 2143 ru.path = base.path; 2144 ru.fragment = child.fragment; 2145 ru.query = base.query; 2146 return ru; 2147 } 2148 2149 // 5.2 (3): Child is absolute 2150 if (child.scheme != null) 2151 return child; 2152 2153 URI ru = new URI(); // Resolved URI 2154 ru.scheme = base.scheme; 2155 ru.query = child.query; 2156 ru.fragment = child.fragment; 2157 2158 // 5.2 (4): Authority 2159 if (child.authority == null) { 2160 ru.authority = base.authority; 2161 ru.host = base.host; 2162 ru.userInfo = base.userInfo; 2163 ru.port = base.port; 2164 2165 String cp 
= (child.path == null) ? "" : child.path; 2166 if ((cp.length() > 0) && (cp.charAt(0) == '/')) { 2167 // 5.2 (5): Child path is absolute 2168 ru.path = child.path; 2169 } else { 2170 // 5.2 (6): Resolve relative path 2171 ru.path = resolvePath(base.path, cp, base.isAbsolute()); 2172 } 2173 } else { 2174 ru.authority = child.authority; 2175 ru.host = child.host; 2176 ru.userInfo = child.userInfo; 2177 ru.host = child.host; 2178 ru.port = child.port; 2179 ru.path = child.path; 2180 } 2181 2182 // 5.2 (7): Recombine (nothing to do here) 2183 return ru; 2184 } 2185 2186 // If the given URI's path is normal then return the URI; 2187 // o.w., return a new URI containing the normalized path. 2188 // 2189 private static URI normalize(URI u) { 2190 if (u.isOpaque() || (u.path == null) || (u.path.length() == 0)) 2191 return u; 2192 2193 String np = normalize(u.path); 2194 if (np == u.path) 2195 return u; 2196 2197 URI v = new URI(); 2198 v.scheme = u.scheme; 2199 v.fragment = u.fragment; 2200 v.authority = u.authority; 2201 v.userInfo = u.userInfo; 2202 v.host = u.host; 2203 v.port = u.port; 2204 v.path = np; 2205 v.query = u.query; 2206 return v; 2207 } 2208 2209 // If both URIs are hierarchical, their scheme and authority components are 2210 // identical, and the base path is a prefix of the child's path, then 2211 // return a relative URI that, when resolved against the base, yields the 2212 // child; otherwise, return the child. 2213 // 2214 private static URI relativize(URI base, URI child) { 2215 // check if child if opaque first so that NPE is thrown 2216 // if child is null. 
2217 if (child.isOpaque() || base.isOpaque()) 2218 return child; 2219 if (!equalIgnoringCase(base.scheme, child.scheme) 2220 || !equal(base.authority, child.authority)) 2221 return child; 2222 2223 String bp = normalize(base.path); 2224 String cp = normalize(child.path); 2225 if (!bp.equals(cp)) { 2226 if (!bp.endsWith("/")) 2227 bp = bp + "/"; 2228 if (!cp.startsWith(bp)) 2229 return child; 2230 } 2231 2232 URI v = new URI(); 2233 v.path = cp.substring(bp.length()); 2234 v.query = child.query; 2235 v.fragment = child.fragment; 2236 return v; 2237 } 2238 2239 2240 2241 // -- Path normalization -- 2242 2243 // The following algorithm for path normalization avoids the creation of a 2244 // string object for each segment, as well as the use of a string buffer to 2245 // compute the final result, by using a single char array and editing it in 2246 // place. The array is first split into segments, replacing each slash 2247 // with '\0' and creating a segment-index array, each element of which is 2248 // the index of the first char in the corresponding segment. We then walk 2249 // through both arrays, removing ".", "..", and other segments as necessary 2250 // by setting their entries in the index array to -1. Finally, the two 2251 // arrays are used to rejoin the segments and compute the final result. 2252 // 2253 // This code is based upon src/solaris/native/java/io/canonicalize_md.c 2254 2255 2256 // Check the given path to see if it might need normalization. A path 2257 // might need normalization if it contains duplicate slashes, a "." 2258 // segment, or a ".." segment. Return -1 if no further normalization is 2259 // possible, otherwise return the number of segments found. 2260 // 2261 // This method takes a string argument rather than a char array so that 2262 // this test can be performed without invoking path.toCharArray(). 
2263 // 2264 private static int needsNormalization(String path) { 2265 boolean normal = true; 2266 int ns = 0; // Number of segments 2267 int end = path.length() - 1; // Index of last char in path 2268 int p = 0; // Index of next char in path 2269 2270 // Skip initial slashes 2271 while (p <= end) { 2272 if (path.charAt(p) != '/') break; 2273 p++; 2274 } 2275 if (p > 1) normal = false; 2276 2277 // Scan segments 2278 while (p <= end) { 2279 2280 // Looking at "." or ".." ? 2281 if ((path.charAt(p) == '.') 2282 && ((p == end) 2283 || ((path.charAt(p + 1) == '/') 2284 || ((path.charAt(p + 1) == '.') 2285 && ((p + 1 == end) 2286 || (path.charAt(p + 2) == '/')))))) { 2287 normal = false; 2288 } 2289 ns++; 2290 2291 // Find beginning of next segment 2292 while (p <= end) { 2293 if (path.charAt(p++) != '/') 2294 continue; 2295 2296 // Skip redundant slashes 2297 while (p <= end) { 2298 if (path.charAt(p) != '/') break; 2299 normal = false; 2300 p++; 2301 } 2302 2303 break; 2304 } 2305 } 2306 2307 return normal ? -1 : ns; 2308 } 2309 2310 2311 // Split the given path into segments, replacing slashes with nulls and 2312 // filling in the given segment-index array. 
2313 // 2314 // Preconditions: 2315 // segs.length == Number of segments in path 2316 // 2317 // Postconditions: 2318 // All slashes in path replaced by '\0' 2319 // segs[i] == Index of first char in segment i (0 <= i < segs.length) 2320 // 2321 private static void split(char[] path, int[] segs) { 2322 int end = path.length - 1; // Index of last char in path 2323 int p = 0; // Index of next char in path 2324 int i = 0; // Index of current segment 2325 2326 // Skip initial slashes 2327 while (p <= end) { 2328 if (path[p] != '/') break; 2329 path[p] = '\0'; 2330 p++; 2331 } 2332 2333 while (p <= end) { 2334 2335 // Note start of segment 2336 segs[i++] = p++; 2337 2338 // Find beginning of next segment 2339 while (p <= end) { 2340 if (path[p++] != '/') 2341 continue; 2342 path[p - 1] = '\0'; 2343 2344 // Skip redundant slashes 2345 while (p <= end) { 2346 if (path[p] != '/') break; 2347 path[p++] = '\0'; 2348 } 2349 break; 2350 } 2351 } 2352 2353 if (i != segs.length) 2354 throw new InternalError(); // ASSERT 2355 } 2356 2357 2358 // Join the segments in the given path according to the given segment-index 2359 // array, ignoring those segments whose index entries have been set to -1, 2360 // and inserting slashes as needed. Return the length of the resulting 2361 // path. 2362 // 2363 // Preconditions: 2364 // segs[i] == -1 implies segment i is to be ignored 2365 // path computed by split, as above, with '\0' having replaced '/' 2366 // 2367 // Postconditions: 2368 // path[0] .. 
//   path[return value] == Resulting path
//
// The returned value is the number of chars written, so the result
// occupies path[0] up to, but not including, path[return value].
//
private static int join(char[] path, int[] segs) {
    int ns = segs.length;           // Number of segments
    int end = path.length - 1;      // Index of last char in path
    int p = 0;                      // Index of next path char to write

    if (path[p] == '\0') {
        // Restore initial slash for absolute paths
        path[p++] = '/';
    }

    for (int i = 0; i < ns; i++) {
        int q = segs[i];            // Current segment
        if (q == -1)
            // Ignore this segment
            continue;

        if (p == q) {
            // We're already at this segment, so just skip to its end
            while ((p <= end) && (path[p] != '\0'))
                p++;
            if (p <= end) {
                // Preserve trailing slash
                path[p++] = '/';
            }
        } else if (p < q) {
            // Copy q down to p
            while ((q <= end) && (path[q] != '\0'))
                path[p++] = path[q++];
            if (q <= end) {
                // Preserve trailing slash
                path[p++] = '/';
            }
        } else
            // The write index must never overtake the segment being read
            throw new InternalError(); // ASSERT false
    }

    return p;
}


// Remove "." segments from the given path, and remove segment pairs
// consisting of a non-".." segment followed by a ".." segment.
//
// Segments are removed by setting their entries in segs to -1; the chars
// in path are not modified.
//
private static void removeDots(char[] path, int[] segs) {
    int ns = segs.length;
    int end = path.length - 1;

    for (int i = 0; i < ns; i++) {
        int dots = 0;               // Number of dots found (0, 1, or 2)

        // Find next occurrence of "." or ".."
        do {
            int p = segs[i];
            if (path[p] == '.') {
                if (p == end) {
                    // "." is the last char of the path
                    dots = 1;
                    break;
                } else if (path[p + 1] == '\0') {
                    // "." followed by a separator
                    dots = 1;
                    break;
                } else if ((path[p + 1] == '.')
                           && ((p + 1 == end)
                               || (path[p + 2] == '\0'))) {
                    // ".." at the end of the path or before a separator
                    dots = 2;
                    break;
                }
            }
            i++;
        } while (i < ns);
        // Stop once no further dot segment was found
        if ((i > ns) || (dots == 0))
            break;

        if (dots == 1) {
            // Remove this occurrence of "."
            segs[i] = -1;
        } else {
            // If there is a preceding non-".." segment, remove both that
            // segment and this occurrence of ".."; otherwise, leave this
            // ".." segment as-is.
            int j;
            // Find the nearest preceding segment that is still present
            for (j = i - 1; j >= 0; j--) {
                if (segs[j] != -1) break;
            }
            if (j >= 0) {
                int q = segs[j];
                if (!((path[q] == '.')
                      && (path[q + 1] == '.')
                      && (path[q + 2] == '\0'))) {
                    segs[i] = -1;
                    segs[j] = -1;
                }
            }
        }
    }
}


// DEVIATION: If the normalized path is relative, and if the first
// segment could be parsed as a scheme name, then prepend a "." segment
//
private static void maybeAddLeadingDot(char[] path, int[] segs) {

    if (path[0] == '\0')
        // The path is absolute
        return;

    int ns = segs.length;
    int f = 0;                      // Index of first segment
    while (f < ns) {
        if (segs[f] >= 0)
            break;
        f++;
    }
    if ((f >= ns) || (f == 0))
        // The path is empty, or else the original first segment survived,
        // in which case we already know that no leading "." is needed
        return;

    // Look for a colon within the first surviving segment
    int p = segs[f];
    while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
    if (p >= path.length || path[p] == '\0')
        // No colon in first segment, so no "." needed
        return;

    // At this point we know that the first segment is unused,
    // hence we can insert a "." segment at that position
    // (f > 0 here, so segs[0] is currently negative)
    path[0] = '.';
    path[1] = '\0';
    segs[0] = 0;
}


// Normalize the given path string.  A normal path string has no empty
// segments (i.e., occurrences of "//"), no segments equal to ".", and no
// segments equal to ".." that are preceded by a segment not equal to "..".
// In contrast to Unix-style pathname normalization, for URI paths we
// always retain trailing slashes.
//
// If normalization does not change the path then the given String object
// itself is returned rather than an equal copy.
//
private static String normalize(String ps) {

    // Does this path need normalization?
    int ns = needsNormalization(ps);        // Number of segments
    if (ns < 0)
        // Nope -- just return it
        return ps;

    char[] path = ps.toCharArray();         // Path in char-array form

    // Split path into segments
    int[] segs = new int[ns];               // Segment-index array
    split(path, segs);

    // Remove dots
    removeDots(path, segs);

    // Prevent scheme-name confusion
    maybeAddLeadingDot(path, segs);

    // Join the remaining segments and return the result
    String s = new String(path, 0, join(path, segs));
    if (s.equals(ps)) {
        // string was already normalized
        return ps;
    }
    return s;
}



// -- Character classes for parsing --

// RFC2396 precisely specifies which characters in the US-ASCII charset are
// permissible in the various components of a URI reference.  We here
// define a set of mask pairs to aid in enforcing these restrictions.  Each
// mask pair consists of two longs, a low mask and a high mask.  Taken
// together they represent a 128-bit mask, where bit i is set iff the
// character with value i is permitted.
//
// This approach is more efficient than sequentially searching arrays of
// permitted characters.  It could be made still more efficient by
// precompiling the mask information so that a character's presence in a
// given mask could be determined by a single table lookup.

// To save startup time, we manually calculate the low-/highMask constants.
2554 // For reference, the following methods were used to calculate the values: 2555 2556 // Compute the low-order mask for the characters in the given string 2557 // private static long lowMask(String chars) { 2558 // int n = chars.length(); 2559 // long m = 0; 2560 // for (int i = 0; i < n; i++) { 2561 // char c = chars.charAt(i); 2562 // if (c < 64) 2563 // m |= (1L << c); 2564 // } 2565 // return m; 2566 // } 2567 2568 // Compute the high-order mask for the characters in the given string 2569 // private static long highMask(String chars) { 2570 // int n = chars.length(); 2571 // long m = 0; 2572 // for (int i = 0; i < n; i++) { 2573 // char c = chars.charAt(i); 2574 // if ((c >= 64) && (c < 128)) 2575 // m |= (1L << (c - 64)); 2576 // } 2577 // return m; 2578 // } 2579 2580 // Compute a low-order mask for the characters 2581 // between first and last, inclusive 2582 // private static long lowMask(char first, char last) { 2583 // long m = 0; 2584 // int f = Math.max(Math.min(first, 63), 0); 2585 // int l = Math.max(Math.min(last, 63), 0); 2586 // for (int i = f; i <= l; i++) 2587 // m |= 1L << i; 2588 // return m; 2589 // } 2590 2591 // Compute a high-order mask for the characters 2592 // between first and last, inclusive 2593 // private static long highMask(char first, char last) { 2594 // long m = 0; 2595 // int f = Math.max(Math.min(first, 127), 64) - 64; 2596 // int l = Math.max(Math.min(last, 127), 64) - 64; 2597 // for (int i = f; i <= l; i++) 2598 // m |= 1L << i; 2599 // return m; 2600 // } 2601 2602 // Tell whether the given character is permitted by the given mask pair 2603 private static boolean match(char c, long lowMask, long highMask) { 2604 if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. 
2605 return false; 2606 if (c < 64) 2607 return ((1L << c) & lowMask) != 0; 2608 if (c < 128) 2609 return ((1L << (c - 64)) & highMask) != 0; 2610 return false; 2611 } 2612 2613 // Character-class masks, in reverse order from RFC2396 because 2614 // initializers for static fields cannot make forward references. 2615 2616 // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | 2617 // "8" | "9" 2618 private static final long L_DIGIT = 0x3FF000000000000L; // lowMask('0', '9'); 2619 private static final long H_DIGIT = 0L; 2620 2621 // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | 2622 // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | 2623 // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" 2624 private static final long L_UPALPHA = 0L; 2625 private static final long H_UPALPHA = 0x7FFFFFEL; // highMask('A', 'Z'); 2626 2627 // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | 2628 // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | 2629 // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" 2630 private static final long L_LOWALPHA = 0L; 2631 private static final long H_LOWALPHA = 0x7FFFFFE00000000L; // highMask('a', 'z'); 2632 2633 // alpha = lowalpha | upalpha 2634 private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA; 2635 private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA; 2636 2637 // alphanum = alpha | digit 2638 private static final long L_ALPHANUM = L_DIGIT | L_ALPHA; 2639 private static final long H_ALPHANUM = H_DIGIT | H_ALPHA; 2640 2641 // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | 2642 // "a" | "b" | "c" | "d" | "e" | "f" 2643 private static final long L_HEX = L_DIGIT; 2644 private static final long H_HEX = 0x7E0000007EL; // highMask('A', 'F') | highMask('a', 'f'); 2645 2646 // mark = "-" | "_" | "." | "!" 
// (continuation of the RFC 2396 "mark" production begun on the previous line)
//                   | "~" | "*" | "'" |
//                 "(" | ")"
//
// Reading the masks: in every L_*/H_* pair, bit c of the low mask stands for
// char c in 0..63 and bit (c - 64) of the high mask for char c in 64..127;
// this is the test applied by match(c, lowMask, highMask).
private static final long L_MARK = 0x678200000000L; // lowMask("-_.!~*'()");
private static final long H_MARK = 0x4000000080000000L; // highMask("-_.!~*'()");

// unreserved    = alphanum | mark
private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

// reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
//                 "$" | "," | "[" | "]"
// Added per RFC2732: "[", "]"
private static final long L_RESERVED = 0xAC00985000000000L; // lowMask(";/?:@&=+$,[]");
private static final long H_RESERVED = 0x28000001L; // highMask(";/?:@&=+$,[]");

// The zero'th bit is used to indicate that escape pairs and non-US-ASCII
// characters are allowed; this is handled by the scanEscape method below.
// (Bit 0 of a low mask would otherwise denote the NUL char.)
private static final long L_ESCAPED = 1L;
private static final long H_ESCAPED = 0L;

// uric          = reserved | unreserved | escaped
private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

// pchar         = unreserved | escaped |
//                 ":" | "@" | "&" | "=" | "+" | "$" | ","
private static final long L_PCHAR
    = L_UNRESERVED | L_ESCAPED | 0x2400185000000000L; // lowMask(":@&=+$,");
private static final long H_PCHAR
    = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask(":@&=+$,");  ("@" is char 64, i.e. high bit 0)

// All valid path characters: pchar plus the segment separators ";" and "/"
private static final long L_PATH = L_PCHAR | 0x800800000000000L; // lowMask(";/");
private static final long H_PATH = H_PCHAR; // highMask(";/") == 0x0L;

// Dash, for use in domainlabel and toplabel
private static final long L_DASH = 0x200000000000L; // lowMask("-");
private static final long H_DASH = 0x0L; // highMask("-");

// Dot, for use in hostnames
private static final long L_DOT = 0x400000000000L; // lowMask(".");
private static final long H_DOT = 0x0L; // highMask(".");

// userinfo      = *( unreserved | escaped |
//                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
private static final long L_USERINFO
    = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask(";:&=+$,");
private static final long H_USERINFO
    = H_UNRESERVED | H_ESCAPED; // | highMask(";:&=+$,") == 0L;

// reg_name      = 1*( unreserved | escaped | "$" | "," |
//                     ";" | ":" | "@" | "&" | "=" | "+" )
// Same low-half extras as userinfo; the only addition, "@", lives in the
// high half.
private static final long L_REG_NAME
    = L_UNRESERVED | L_ESCAPED | 0x2C00185000000000L; // lowMask("$,;:@&=+");
private static final long H_REG_NAME
    = H_UNRESERVED | H_ESCAPED | 0x1L; // highMask("$,;:@&=+");

// All valid characters for server-based authorities
private static final long L_SERVER
    = L_USERINFO | L_ALPHANUM | L_DASH | 0x400400000000000L; // lowMask(".:@[]");
private static final long H_SERVER
    = H_USERINFO | H_ALPHANUM | H_DASH | 0x28000001L; // highMask(".:@[]");

// Special case of server authority that represents an IPv6 address
// In this case, a % does not signify an escape sequence
private static final long L_SERVER_PERCENT
    = L_SERVER | 0x2000000000L; // lowMask("%");
private static final long H_SERVER_PERCENT
    = H_SERVER; // | highMask("%") == 0L;

// scheme        = alpha *( alpha | digit | "+" | "-" | "." )
private static final long L_SCHEME = L_ALPHA | L_DIGIT | 0x680000000000L; // lowMask("+-.");
private static final long H_SCHEME = H_ALPHA | H_DIGIT; // | highMask("+-.") == 0L

// scope_id = alpha | digit | "_" | "."
2721 private static final long L_SCOPE_ID 2722 = L_ALPHANUM | 0x400000000000L; // lowMask("_."); 2723 private static final long H_SCOPE_ID 2724 = H_ALPHANUM | 0x80000000L; // highMask("_."); 2725 2726 // -- Escaping and encoding -- 2727 2728 private static final char[] hexDigits = { 2729 '0', '1', '2', '3', '4', '5', '6', '7', 2730 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 2731 }; 2732 2733 private static void appendEscape(StringBuilder sb, byte b) { 2734 sb.append('%'); 2735 sb.append(hexDigits[(b >> 4) & 0x0f]); 2736 sb.append(hexDigits[(b >> 0) & 0x0f]); 2737 } 2738 2739 private static void appendEncoded(StringBuilder sb, char c) { 2740 ByteBuffer bb = null; 2741 try { 2742 bb = ThreadLocalCoders.encoderFor("UTF-8") 2743 .encode(CharBuffer.wrap("" + c)); 2744 } catch (CharacterCodingException x) { 2745 assert false; 2746 } 2747 while (bb.hasRemaining()) { 2748 int b = bb.get() & 0xff; 2749 if (b >= 0x80) 2750 appendEscape(sb, (byte)b); 2751 else 2752 sb.append((char)b); 2753 } 2754 } 2755 2756 // Quote any characters in s that are not permitted 2757 // by the given mask pair 2758 // 2759 private static String quote(String s, long lowMask, long highMask) { 2760 StringBuilder sb = null; 2761 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); 2762 for (int i = 0; i < s.length(); i++) { 2763 char c = s.charAt(i); 2764 if (c < '\u0080') { 2765 if (!match(c, lowMask, highMask)) { 2766 if (sb == null) { 2767 sb = new StringBuilder(); 2768 sb.append(s, 0, i); 2769 } 2770 appendEscape(sb, (byte)c); 2771 } else { 2772 if (sb != null) 2773 sb.append(c); 2774 } 2775 } else if (allowNonASCII 2776 && (Character.isSpaceChar(c) 2777 || Character.isISOControl(c))) { 2778 if (sb == null) { 2779 sb = new StringBuilder(); 2780 sb.append(s, 0, i); 2781 } 2782 appendEncoded(sb, c); 2783 } else { 2784 if (sb != null) 2785 sb.append(c); 2786 } 2787 } 2788 return (sb == null) ? 
s : sb.toString(); 2789 } 2790 2791 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, 2792 // assuming that s is otherwise legal 2793 // 2794 private static String encode(String s) { 2795 int n = s.length(); 2796 if (n == 0) 2797 return s; 2798 2799 // First check whether we actually need to encode 2800 for (int i = 0;;) { 2801 if (s.charAt(i) >= '\u0080') 2802 break; 2803 if (++i >= n) 2804 return s; 2805 } 2806 2807 String ns = Normalizer.normalize(s, Normalizer.Form.NFC); 2808 ByteBuffer bb = null; 2809 try { 2810 bb = ThreadLocalCoders.encoderFor("UTF-8") 2811 .encode(CharBuffer.wrap(ns)); 2812 } catch (CharacterCodingException x) { 2813 assert false; 2814 } 2815 2816 StringBuilder sb = new StringBuilder(); 2817 while (bb.hasRemaining()) { 2818 int b = bb.get() & 0xff; 2819 if (b >= 0x80) 2820 appendEscape(sb, (byte)b); 2821 else 2822 sb.append((char)b); 2823 } 2824 return sb.toString(); 2825 } 2826 2827 private static int decode(char c) { 2828 if ((c >= '0') && (c <= '9')) 2829 return c - '0'; 2830 if ((c >= 'a') && (c <= 'f')) 2831 return c - 'a' + 10; 2832 if ((c >= 'A') && (c <= 'F')) 2833 return c - 'A' + 10; 2834 assert false; 2835 return -1; 2836 } 2837 2838 private static byte decode(char c1, char c2) { 2839 return (byte)( ((decode(c1) & 0xf) << 4) 2840 | ((decode(c2) & 0xf) << 0)); 2841 } 2842 2843 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes 2844 // that escapes are well-formed syntactically, i.e., of the form %XX. If a 2845 // sequence of escaped octets is not valid UTF-8 then the erroneous octets 2846 // are replaced with '\uFFFD'. 2847 // Exception: any "%" found between "[]" is left alone. 
It is an IPv6 literal 2848 // with a scope_id 2849 // 2850 private static String decode(String s) { 2851 return decode(s, true); 2852 } 2853 2854 // This method was introduced as a generalization of URI.decode method 2855 // to provide a fix for JDK-8037396 2856 private static String decode(String s, boolean ignorePercentInBrackets) { 2857 if (s == null) 2858 return s; 2859 int n = s.length(); 2860 if (n == 0) 2861 return s; 2862 if (s.indexOf('%') < 0) 2863 return s; 2864 2865 StringBuilder sb = new StringBuilder(n); 2866 ByteBuffer bb = ByteBuffer.allocate(n); 2867 CharBuffer cb = CharBuffer.allocate(n); 2868 CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8") 2869 .onMalformedInput(CodingErrorAction.REPLACE) 2870 .onUnmappableCharacter(CodingErrorAction.REPLACE); 2871 2872 // This is not horribly efficient, but it will do for now 2873 char c = s.charAt(0); 2874 boolean betweenBrackets = false; 2875 2876 for (int i = 0; i < n;) { 2877 assert c == s.charAt(i); // Loop invariant 2878 if (c == '[') { 2879 betweenBrackets = true; 2880 } else if (betweenBrackets && c == ']') { 2881 betweenBrackets = false; 2882 } 2883 if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) { 2884 sb.append(c); 2885 if (++i >= n) 2886 break; 2887 c = s.charAt(i); 2888 continue; 2889 } 2890 bb.clear(); 2891 int ui = i; 2892 for (;;) { 2893 assert (n - i >= 2); 2894 bb.put(decode(s.charAt(++i), s.charAt(++i))); 2895 if (++i >= n) 2896 break; 2897 c = s.charAt(i); 2898 if (c != '%') 2899 break; 2900 } 2901 bb.flip(); 2902 cb.clear(); 2903 dec.reset(); 2904 CoderResult cr = dec.decode(bb, cb, true); 2905 assert cr.isUnderflow(); 2906 cr = dec.flush(cb); 2907 assert cr.isUnderflow(); 2908 sb.append(cb.flip().toString()); 2909 } 2910 2911 return sb.toString(); 2912 } 2913 2914 2915 // -- Parsing -- 2916 2917 // For convenience we wrap the input URI string in a new instance of the 2918 // following internal class. 
This saves always having to pass the input 2919 // string as an argument to each internal scan/parse method. 2920 2921 private class Parser { 2922 2923 private String input; // URI input string 2924 private boolean requireServerAuthority = false; 2925 2926 Parser(String s) { 2927 input = s; 2928 string = s; 2929 } 2930 2931 // -- Methods for throwing URISyntaxException in various ways -- 2932 2933 private void fail(String reason) throws URISyntaxException { 2934 throw new URISyntaxException(input, reason); 2935 } 2936 2937 private void fail(String reason, int p) throws URISyntaxException { 2938 throw new URISyntaxException(input, reason, p); 2939 } 2940 2941 private void failExpecting(String expected, int p) 2942 throws URISyntaxException 2943 { 2944 fail("Expected " + expected, p); 2945 } 2946 2947 2948 // -- Simple access to the input string -- 2949 2950 // Tells whether start < end and, if so, whether charAt(start) == c 2951 // 2952 private boolean at(int start, int end, char c) { 2953 return (start < end) && (input.charAt(start) == c); 2954 } 2955 2956 // Tells whether start + s.length() < end and, if so, 2957 // whether the chars at the start position match s exactly 2958 // 2959 private boolean at(int start, int end, String s) { 2960 int p = start; 2961 int sn = s.length(); 2962 if (sn > end - p) 2963 return false; 2964 int i = 0; 2965 while (i < sn) { 2966 if (input.charAt(p++) != s.charAt(i)) { 2967 break; 2968 } 2969 i++; 2970 } 2971 return (i == sn); 2972 } 2973 2974 2975 // -- Scanning -- 2976 2977 // The various scan and parse methods that follow use a uniform 2978 // convention of taking the current start position and end index as 2979 // their first two arguments. The start is inclusive while the end is 2980 // exclusive, just as in the String class, i.e., a start/end pair 2981 // denotes the left-open interval [start, end) of the input string. 2982 // 2983 // These methods never proceed past the end position. 
They may return 2984 // -1 to indicate outright failure, but more often they simply return 2985 // the position of the first char after the last char scanned. Thus 2986 // a typical idiom is 2987 // 2988 // int p = start; 2989 // int q = scan(p, end, ...); 2990 // if (q > p) 2991 // // We scanned something 2992 // ...; 2993 // else if (q == p) 2994 // // We scanned nothing 2995 // ...; 2996 // else if (q == -1) 2997 // // Something went wrong 2998 // ...; 2999 3000 3001 // Scan a specific char: If the char at the given start position is 3002 // equal to c, return the index of the next char; otherwise, return the 3003 // start position. 3004 // 3005 private int scan(int start, int end, char c) { 3006 if ((start < end) && (input.charAt(start) == c)) 3007 return start + 1; 3008 return start; 3009 } 3010 3011 // Scan forward from the given start position. Stop at the first char 3012 // in the err string (in which case -1 is returned), or the first char 3013 // in the stop string (in which case the index of the preceding char is 3014 // returned), or the end of the input string (in which case the length 3015 // of the input string is returned). May return the start position if 3016 // nothing matches. 3017 // 3018 private int scan(int start, int end, String err, String stop) { 3019 int p = start; 3020 while (p < end) { 3021 char c = input.charAt(p); 3022 if (err.indexOf(c) >= 0) 3023 return -1; 3024 if (stop.indexOf(c) >= 0) 3025 break; 3026 p++; 3027 } 3028 return p; 3029 } 3030 3031 // Scan forward from the given start position. Stop at the first char 3032 // in the stop string (in which case the index of the preceding char is 3033 // returned), or the end of the input string (in which case the length 3034 // of the input string is returned). May return the start position if 3035 // nothing matches. 
3036 // 3037 private int scan(int start, int end, String stop) { 3038 int p = start; 3039 while (p < end) { 3040 char c = input.charAt(p); 3041 if (stop.indexOf(c) >= 0) 3042 break; 3043 p++; 3044 } 3045 return p; 3046 } 3047 3048 // Scan a potential escape sequence, starting at the given position, 3049 // with the given first char (i.e., charAt(start) == c). 3050 // 3051 // This method assumes that if escapes are allowed then visible 3052 // non-US-ASCII chars are also allowed. 3053 // 3054 private int scanEscape(int start, int n, char first) 3055 throws URISyntaxException 3056 { 3057 int p = start; 3058 char c = first; 3059 if (c == '%') { 3060 // Process escape pair 3061 if ((p + 3 <= n) 3062 && match(input.charAt(p + 1), L_HEX, H_HEX) 3063 && match(input.charAt(p + 2), L_HEX, H_HEX)) { 3064 return p + 3; 3065 } 3066 fail("Malformed escape pair", p); 3067 } else if ((c > 128) 3068 && !Character.isSpaceChar(c) 3069 && !Character.isISOControl(c)) { 3070 // Allow unescaped but visible non-US-ASCII chars 3071 return p + 1; 3072 } 3073 return p; 3074 } 3075 3076 // Scan chars that match the given mask pair 3077 // 3078 private int scan(int start, int n, long lowMask, long highMask) 3079 throws URISyntaxException 3080 { 3081 int p = start; 3082 while (p < n) { 3083 char c = input.charAt(p); 3084 if (match(c, lowMask, highMask)) { 3085 p++; 3086 continue; 3087 } 3088 if ((lowMask & L_ESCAPED) != 0) { 3089 int q = scanEscape(p, n, c); 3090 if (q > p) { 3091 p = q; 3092 continue; 3093 } 3094 } 3095 break; 3096 } 3097 return p; 3098 } 3099 3100 // Check that each of the chars in [start, end) matches the given mask 3101 // 3102 private void checkChars(int start, int end, 3103 long lowMask, long highMask, 3104 String what) 3105 throws URISyntaxException 3106 { 3107 int p = scan(start, end, lowMask, highMask); 3108 if (p < end) 3109 fail("Illegal character in " + what, p); 3110 } 3111 3112 // Check that the char at position p matches the given mask 3113 // 3114 private void 
checkChar(int p, 3115 long lowMask, long highMask, 3116 String what) 3117 throws URISyntaxException 3118 { 3119 checkChars(p, p + 1, lowMask, highMask, what); 3120 } 3121 3122 3123 // -- Parsing -- 3124 3125 // [<scheme>:]<scheme-specific-part>[#<fragment>] 3126 // 3127 void parse(boolean rsa) throws URISyntaxException { 3128 requireServerAuthority = rsa; 3129 int n = input.length(); 3130 int p = scan(0, n, "/?#", ":"); 3131 if ((p >= 0) && at(p, n, ':')) { 3132 if (p == 0) 3133 failExpecting("scheme name", 0); 3134 checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); 3135 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); 3136 scheme = input.substring(0, p); 3137 p++; // Skip ':' 3138 if (at(p, n, '/')) { 3139 p = parseHierarchical(p, n); 3140 } else { 3141 // opaque; need to create the schemeSpecificPart 3142 int q = scan(p, n, "#"); 3143 if (q <= p) 3144 failExpecting("scheme-specific part", p); 3145 checkChars(p, q, L_URIC, H_URIC, "opaque part"); 3146 schemeSpecificPart = input.substring(p, q); 3147 p = q; 3148 } 3149 } else { 3150 p = parseHierarchical(0, n); 3151 } 3152 if (at(p, n, '#')) { 3153 checkChars(p + 1, n, L_URIC, H_URIC, "fragment"); 3154 fragment = input.substring(p + 1, n); 3155 p = n; 3156 } 3157 if (p < n) 3158 fail("end of URI", p); 3159 } 3160 3161 // [//authority]<path>[?<query>] 3162 // 3163 // DEVIATION from RFC2396: We allow an empty authority component as 3164 // long as it's followed by a non-empty path, query component, or 3165 // fragment component. This is so that URIs such as "file:///foo/bar" 3166 // will parse. This seems to be the intent of RFC2396, though the 3167 // grammar does not permit it. If the authority is empty then the 3168 // userInfo, host, and port components are undefined. 3169 // 3170 // DEVIATION from RFC2396: We allow empty relative paths. This seems 3171 // to be the intent of RFC2396, but the grammar does not permit it. 
3172 // The primary consequence of this deviation is that "#f" parses as a 3173 // relative URI with an empty path. 3174 // 3175 private int parseHierarchical(int start, int n) 3176 throws URISyntaxException 3177 { 3178 int p = start; 3179 if (at(p, n, '/') && at(p + 1, n, '/')) { 3180 p += 2; 3181 int q = scan(p, n, "/?#"); 3182 if (q > p) { 3183 p = parseAuthority(p, q); 3184 } else if (q < n) { 3185 // DEVIATION: Allow empty authority prior to non-empty 3186 // path, query component or fragment identifier 3187 } else 3188 failExpecting("authority", p); 3189 } 3190 int q = scan(p, n, "?#"); // DEVIATION: May be empty 3191 checkChars(p, q, L_PATH, H_PATH, "path"); 3192 path = input.substring(p, q); 3193 p = q; 3194 if (at(p, n, '?')) { 3195 p++; 3196 q = scan(p, n, "#"); 3197 checkChars(p, q, L_URIC, H_URIC, "query"); 3198 query = input.substring(p, q); 3199 p = q; 3200 } 3201 return p; 3202 } 3203 3204 // authority = server | reg_name 3205 // 3206 // Ambiguity: An authority that is a registry name rather than a server 3207 // might have a prefix that parses as a server. We use the fact that 3208 // the authority component is always followed by '/' or the end of the 3209 // input string to resolve this: If the complete authority did not 3210 // parse as a server then we try to parse it as a registry name. 
3211 // 3212 private int parseAuthority(int start, int n) 3213 throws URISyntaxException 3214 { 3215 int p = start; 3216 int q = p; 3217 URISyntaxException ex = null; 3218 3219 boolean serverChars; 3220 boolean regChars; 3221 3222 if (scan(p, n, "]") > p) { 3223 // contains a literal IPv6 address, therefore % is allowed 3224 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n); 3225 } else { 3226 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n); 3227 } 3228 regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n); 3229 3230 if (regChars && !serverChars) { 3231 // Must be a registry-based authority 3232 authority = input.substring(p, n); 3233 return n; 3234 } 3235 3236 if (serverChars) { 3237 // Might be (probably is) a server-based authority, so attempt 3238 // to parse it as such. If the attempt fails, try to treat it 3239 // as a registry-based authority. 3240 try { 3241 q = parseServer(p, n); 3242 if (q < n) 3243 failExpecting("end of authority", q); 3244 authority = input.substring(p, n); 3245 } catch (URISyntaxException x) { 3246 // Undo results of failed parse 3247 userInfo = null; 3248 host = null; 3249 port = -1; 3250 if (requireServerAuthority) { 3251 // If we're insisting upon a server-based authority, 3252 // then just re-throw the exception 3253 throw x; 3254 } else { 3255 // Save the exception in case it doesn't parse as a 3256 // registry either 3257 ex = x; 3258 q = p; 3259 } 3260 } 3261 } 3262 3263 if (q < n) { 3264 if (regChars) { 3265 // Registry-based authority 3266 authority = input.substring(p, n); 3267 } else if (ex != null) { 3268 // Re-throw exception; it was probably due to 3269 // a malformed IPv6 address 3270 throw ex; 3271 } else { 3272 fail("Illegal character in authority", q); 3273 } 3274 } 3275 3276 return n; 3277 } 3278 3279 3280 // [<userinfo>@]<host>[:<port>] 3281 // 3282 private int parseServer(int start, int n) 3283 throws URISyntaxException 3284 { 3285 int p = start; 3286 int q; 3287 3288 // userinfo 3289 q = 
scan(p, n, "/?#", "@"); 3290 if ((q >= p) && at(q, n, '@')) { 3291 checkChars(p, q, L_USERINFO, H_USERINFO, "user info"); 3292 userInfo = input.substring(p, q); 3293 p = q + 1; // Skip '@' 3294 } 3295 3296 // hostname, IPv4 address, or IPv6 address 3297 if (at(p, n, '[')) { 3298 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732 3299 p++; 3300 q = scan(p, n, "/?#", "]"); 3301 if ((q > p) && at(q, n, ']')) { 3302 // look for a "%" scope id 3303 int r = scan (p, q, "%"); 3304 if (r > p) { 3305 parseIPv6Reference(p, r); 3306 if (r+1 == q) { 3307 fail ("scope id expected"); 3308 } 3309 checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID, 3310 "scope id"); 3311 } else { 3312 parseIPv6Reference(p, q); 3313 } 3314 host = input.substring(p-1, q+1); 3315 p = q + 1; 3316 } else { 3317 failExpecting("closing bracket for IPv6 address", q); 3318 } 3319 } else { 3320 q = parseIPv4Address(p, n); 3321 if (q <= p) 3322 q = parseHostname(p, n); 3323 p = q; 3324 } 3325 3326 // port 3327 if (at(p, n, ':')) { 3328 p++; 3329 q = scan(p, n, "/"); 3330 if (q > p) { 3331 checkChars(p, q, L_DIGIT, H_DIGIT, "port number"); 3332 try { 3333 port = Integer.parseInt(input, p, q, 10); 3334 } catch (NumberFormatException x) { 3335 fail("Malformed port number", p); 3336 } 3337 p = q; 3338 } 3339 } 3340 if (p < n) 3341 failExpecting("port number", p); 3342 3343 return p; 3344 } 3345 3346 // Scan a string of decimal digits whose value fits in a byte 3347 // 3348 private int scanByte(int start, int n) 3349 throws URISyntaxException 3350 { 3351 int p = start; 3352 int q = scan(p, n, L_DIGIT, H_DIGIT); 3353 if (q <= p) return q; 3354 if (Integer.parseInt(input, p, q, 10) > 255) return p; 3355 return q; 3356 } 3357 3358 // Scan an IPv4 address. 3359 // 3360 // If the strict argument is true then we require that the given 3361 // interval contain nothing besides an IPv4 address; if it is false 3362 // then we only require that it start with an IPv4 address. 
3363 // 3364 // If the interval does not contain or start with (depending upon the 3365 // strict argument) a legal IPv4 address characters then we return -1 3366 // immediately; otherwise we insist that these characters parse as a 3367 // legal IPv4 address and throw an exception on failure. 3368 // 3369 // We assume that any string of decimal digits and dots must be an IPv4 3370 // address. It won't parse as a hostname anyway, so making that 3371 // assumption here allows more meaningful exceptions to be thrown. 3372 // 3373 private int scanIPv4Address(int start, int n, boolean strict) 3374 throws URISyntaxException 3375 { 3376 int p = start; 3377 int q; 3378 int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT); 3379 if ((m <= p) || (strict && (m != n))) 3380 return -1; 3381 for (;;) { 3382 // Per RFC2732: At most three digits per byte 3383 // Further constraint: Each element fits in a byte 3384 if ((q = scanByte(p, m)) <= p) break; p = q; 3385 if ((q = scan(p, m, '.')) <= p) break; p = q; 3386 if ((q = scanByte(p, m)) <= p) break; p = q; 3387 if ((q = scan(p, m, '.')) <= p) break; p = q; 3388 if ((q = scanByte(p, m)) <= p) break; p = q; 3389 if ((q = scan(p, m, '.')) <= p) break; p = q; 3390 if ((q = scanByte(p, m)) <= p) break; p = q; 3391 if (q < m) break; 3392 return q; 3393 } 3394 fail("Malformed IPv4 address", q); 3395 return -1; 3396 } 3397 3398 // Take an IPv4 address: Throw an exception if the given interval 3399 // contains anything except an IPv4 address 3400 // 3401 private int takeIPv4Address(int start, int n, String expected) 3402 throws URISyntaxException 3403 { 3404 int p = scanIPv4Address(start, n, true); 3405 if (p <= start) 3406 failExpecting(expected, start); 3407 return p; 3408 } 3409 3410 // Attempt to parse an IPv4 address, returning -1 on failure but 3411 // allowing the given interval to contain [:<characters>] after 3412 // the IPv4 address. 
//
// On success the host field is set to the address text and the index
// after the address is returned.
//
private int parseIPv4Address(int start, int n) {
    int p;

    try {
        p = scanIPv4Address(start, n, false);
    } catch (URISyntaxException x) {
        // Not a well-formed address: report "no IPv4 address here" so the
        // caller can try something else
        return -1;
    } catch (NumberFormatException nfe) {
        return -1;
    }

    if (p > start && p < n) {
        // IPv4 address is followed by something - check that
        // it's a ":" as this is the only valid character to
        // follow an address.
        if (input.charAt(p) != ':') {
            p = -1;
        }
    }

    if (p > start)
        host = input.substring(start, p);

    return p;
}

// hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
// domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
// toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
//
// Parses a hostname starting at start, stores it in the host field and
// returns the index after it.  The hostname must be followed by ':' or
// extend to n.
//
private int parseHostname(int start, int n)
    throws URISyntaxException
{
    int p = start;
    int q;
    int l = -1;                 // Start of last parsed label

    // Each iteration consumes one label and, if present, the '.' after it
    do {
        // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
        q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
        if (q <= p)
            break;
        l = p;
        if (q > p) {
            p = q;
            q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
            if (q > p) {
                // A label may contain dashes but must not end with one
                if (input.charAt(q - 1) == '-')
                    fail("Illegal character in hostname", q - 1);
                p = q;
            }
        }
        q = scan(p, n, '.');
        if (q <= p)
            break;
        p = q;
    } while (p < n);

    if ((p < n) && !at(p, n, ':'))
        fail("Illegal character in hostname", p);

    // l is still -1 when not even one label was scanned
    if (l < 0)
        failExpecting("hostname", start);

    // for a fully qualified hostname check that the rightmost
    // label starts with an alpha character.
    if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) {
        fail("Illegal character in hostname", l);
    }

    host = input.substring(start, p);
    return p;
}


// IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
//
// Bug: The grammar in RFC2373 Appendix B does not allow addresses of
// the form ::12.34.56.78, which are clearly shown in the examples
// earlier in the document.  Here is the original grammar:
//
//   IPv6address = hexpart [ ":" IPv4address ]
//   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
//   hexseq      = hex4 *( ":" hex4)
//   hex4        = 1*4HEXDIG
//
// We therefore use the following revised grammar:
//
//   IPv6address = hexseq [ ":" IPv4address ]
//                 | hexseq [ "::" [ hexpost ] ]
//                 | "::" [ hexpost ]
//   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
//   hexseq      = hex4 *( ":" hex4)
//   hex4        = 1*4HEXDIG
//
// This covers all and only the following cases:
//
//   hexseq
//   hexseq : IPv4address
//   hexseq ::
//   hexseq :: hexseq
//   hexseq :: hexseq : IPv4address
//   hexseq :: IPv4address
//   :: hexseq
//   :: hexseq : IPv4address
//   :: IPv4address
//   ::
//
// Additionally we constrain the IPv6 address as follows :-
//
//  i.  IPv6 addresses without compressed zeros should contain
//      exactly 16 bytes.
//
//  ii. IPv6 addresses with compressed zeros should contain
//      less than 16 bytes.

// Running count of address bytes recognized so far: scanHexSeq adds 2 per
// hex group and each embedded IPv4 address adds 4.  It is never reset in
// the code below, so the count is only meaningful for one IPv6 parse.
private int ipv6byteCount = 0;

// Parses [start, n) as the inside of an IPv6 literal (brackets and scope
// id excluded) according to the revised grammar, then checks the byte
// count against the length constraints.  Returns the index after the
// address; throws if the interval is not a valid address.
//
private int parseIPv6Reference(int start, int n)
    throws URISyntaxException
{
    int p = start;
    int q;
    boolean compressedZeros = false;

    q = scanHexSeq(p, n);

    if (q > p) {
        // hexseq, optionally followed by "::" [ hexpost ] or
        // ":" IPv4address
        p = q;
        if (at(p, n, "::")) {
            compressedZeros = true;
            p = scanHexPost(p + 2, n);
        } else if (at(p, n, ':')) {
            p = takeIPv4Address(p + 1,  n, "IPv4 address");
            ipv6byteCount += 4;
        }
    } else if (at(p, n, "::")) {
        // "::" [ hexpost ]
        compressedZeros = true;
        p = scanHexPost(p + 2, n);
    }
    if (p < n)
        fail("Malformed IPv6 address", start);
    if (ipv6byteCount > 16)
        fail("IPv6 address too long", start);
    if (!compressedZeros && ipv6byteCount < 16)
        fail("IPv6 address too short", start);
    // "::" must stand for at least one omitted group
    if (compressedZeros && ipv6byteCount == 16)
        fail("Malformed IPv6 address", start);

    return p;
}

// Scans the part that may follow "::" --
//   hexpost = hexseq | hexseq ":" IPv4address | IPv4address
// An empty interval is accepted and returned unchanged.
//
private int scanHexPost(int start, int n)
    throws URISyntaxException
{
    int p = start;
    int q;

    if (p == n)
        return p;

    q = scanHexSeq(p, n);
    if (q > p) {
        p = q;
        if (at(p, n, ':')) {
            p++;
            p = takeIPv4Address(p, n, "hex digits or IPv4 address");
            ipv6byteCount += 4;
        }
    } else {
        // No hex sequence here, so the rest must be an IPv4 address
        p = takeIPv4Address(p, n, "hex digits or IPv4 address");
        ipv6byteCount += 4;
    }
    return p;
}

// Scan a hex sequence; return -1 if one could not be scanned
//
// Adds 2 to ipv6byteCount for every group accepted.  Stops, without
// consuming them, before a "::" and before a trailing group that turns
// out to be the first element of an IPv4 address (digits followed by '.').
//
private int scanHexSeq(int start, int n)
    throws URISyntaxException
{
    int p = start;
    int q;

    q = scan(p, n, L_HEX, H_HEX);
    if (q <= p)
        return -1;
    if (at(q, n, '.'))          // Beginning of IPv4 address
        return -1;
    if (q > p + 4)
        fail("IPv6 hexadecimal digit sequence too long", p);
    ipv6byteCount += 2;
    p = q;
    while (p < n) {
        if (!at(p, n, ':'))
            break;
        if (at(p + 1, n, ':'))
            break;              // "::"
        p++;
        q = scan(p, n, L_HEX, H_HEX);
        if (q <= p)
            failExpecting("digits for an IPv6 address", p);
        if (at(q, n, '.')) {    // Beginning of IPv4 address
            // Step back onto the ':' so the caller sees ":" IPv4address
            p--;
            break;
        }
        if (q > p + 4)
            fail("IPv6 hexadecimal digit sequence too long", p);
        ipv6byteCount += 2;
        p = q;
    }

    return p;
}

}   // end of class Parser

// Registers an accessor with SharedSecrets through which other JDK code
// can reach the (scheme, path) constructor invoked below.
static {
    SharedSecrets.setJavaNetUriAccess(
        new JavaNetUriAccess() {
            public URI create(String scheme, String path) {
                return new URI(scheme, path);
            }
        }
    );
}
}   // end of the enclosing class, whose header lies outside this chunk