1 /* 2 * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package java.net; 27 28 import java.io.IOException; 29 import java.io.InvalidObjectException; 30 import java.io.ObjectInputStream; 31 import java.io.ObjectOutputStream; 32 import java.io.Serializable; 33 import java.nio.ByteBuffer; 34 import java.nio.CharBuffer; 35 import java.nio.charset.CharsetDecoder; 36 import java.nio.charset.CoderResult; 37 import java.nio.charset.CodingErrorAction; 38 import java.nio.charset.CharacterCodingException; 39 import java.text.Normalizer; 40 import jdk.internal.loader.URLClassPath; 41 import jdk.internal.misc.JavaNetUriAccess; 42 import jdk.internal.misc.SharedSecrets; 43 import sun.nio.cs.ThreadLocalCoders; 44 45 import java.lang.Character; // for javadoc 46 import java.lang.NullPointerException; // for javadoc 47 48 49 /** 50 * Represents a Uniform Resource Identifier (URI) reference. 51 * 52 * <p> Aside from some minor deviations noted below, an instance of this 53 * class represents a URI reference as defined by 54 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 55 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a 56 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 57 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format 58 * also supports scope_ids. The syntax and usage of scope_ids is described 59 * <a href="Inet6Address.html#scoped">here</a>. 60 * This class provides constructors for creating URI instances from 61 * their components or by parsing their string forms, methods for accessing the 62 * various components of an instance, and methods for normalizing, resolving, 63 * and relativizing URI instances. Instances of this class are immutable. 
*
 *
 * <h3> URI syntax and components </h3>
 *
 * At the highest level a URI reference (hereinafter simply "URI") in string
 * form has the syntax
 *
 * <blockquote>
 * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>]
 * </blockquote>
 *
 * where square brackets [...] delineate optional components and the characters
 * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves.
 *
 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
 * said to be <i>relative</i>. URIs are also classified according to whether
 * they are <i>opaque</i> or <i>hierarchical</i>.
 *
 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
 * not begin with a slash character ({@code '/'}). Opaque URIs are not
 * subject to further parsing. Some examples of opaque URIs are:
 *
 * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
 * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr>
 * <tr><td>{@code news:comp.lang.java}</td></tr>
 * <tr><td>{@code urn:isbn:096139210x}</td></tr>
 * </table></blockquote>
 *
 * <p> A <i>hierarchical</i> URI is either an absolute URI whose
 * scheme-specific part begins with a slash character, or a relative URI, that
 * is, a URI that does not specify a scheme.
Some examples of hierarchical 95 * URIs are: 96 * 97 * <blockquote> 98 * {@code http://example.com/languages/java/}<br> 99 * {@code sample/a/index.html#28}<br> 100 * {@code ../../demo/b/index.html}<br> 101 * {@code file:///~/calendar} 102 * </blockquote> 103 * 104 * <p> A hierarchical URI is subject to further parsing according to the syntax 105 * 106 * <blockquote> 107 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>] 108 * </blockquote> 109 * 110 * where the characters <b>{@code :}</b>, <b>{@code /}</b>, 111 * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. The 112 * scheme-specific part of a hierarchical URI consists of the characters 113 * between the scheme and fragment components. 114 * 115 * <p> The authority component of a hierarchical URI is, if specified, either 116 * <i>server-based</i> or <i>registry-based</i>. A server-based authority 117 * parses according to the familiar syntax 118 * 119 * <blockquote> 120 * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>] 121 * </blockquote> 122 * 123 * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for 124 * themselves. Nearly all URI schemes currently in use are server-based. An 125 * authority component that does not parse in this way is considered to be 126 * registry-based. 127 * 128 * <p> The path component of a hierarchical URI is itself said to be absolute 129 * if it begins with a slash character ({@code '/'}); otherwise it is 130 * relative. The path of a hierarchical URI that is either absolute or 131 * specifies an authority is always absolute. 
132 * 133 * <p> All told, then, a URI instance has the following nine components: 134 * 135 * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment"> 136 * <tr><th><i>Component</i></th><th><i>Type</i></th></tr> 137 * <tr><td>scheme</td><td>{@code String}</td></tr> 138 * <tr><td>scheme-specific-part </td><td>{@code String}</td></tr> 139 * <tr><td>authority</td><td>{@code String}</td></tr> 140 * <tr><td>user-info</td><td>{@code String}</td></tr> 141 * <tr><td>host</td><td>{@code String}</td></tr> 142 * <tr><td>port</td><td>{@code int}</td></tr> 143 * <tr><td>path</td><td>{@code String}</td></tr> 144 * <tr><td>query</td><td>{@code String}</td></tr> 145 * <tr><td>fragment</td><td>{@code String}</td></tr> 146 * </table></blockquote> 147 * 148 * In a given instance any particular component is either <i>undefined</i> or 149 * <i>defined</i> with a distinct value. Undefined string components are 150 * represented by {@code null}, while undefined integer components are 151 * represented by {@code -1}. A string component may be defined to have the 152 * empty string as its value; this is not equivalent to that component being 153 * undefined. 154 * 155 * <p> Whether a particular component is or is not defined in an instance 156 * depends upon the type of the URI being represented. An absolute URI has a 157 * scheme component. An opaque URI has a scheme, a scheme-specific part, and 158 * possibly a fragment, but has no other components. A hierarchical URI always 159 * has a path (though it may be empty) and a scheme-specific-part (which at 160 * least contains the path), and may have any of the other components. If the 161 * authority component is present and is server-based then the host component 162 * will be defined and the user-information and port components may be defined. 
*
 *
 * <h4> Operations on URI instances </h4>
 *
 * The key operations supported by this class are those of
 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
 *
 * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."}
 * and {@code ".."} segments from the path component of a hierarchical URI.
 * Each {@code "."} segment is simply removed. A {@code ".."} segment is
 * removed only if it is preceded by a non-{@code ".."} segment.
 * Normalization has no effect upon opaque URIs.
 *
 * <p> <i>Resolution</i> is the process of resolving one URI against another,
 * <i>base</i> URI. The resulting URI is constructed from components of both
 * URIs in the manner specified by RFC 2396, taking components from the
 * base URI for those not specified in the original. For hierarchical URIs,
 * the path of the original is resolved against the path of the base and then
 * normalized. The result, for example, of resolving
 *
 * <blockquote>
 * {@code sample/a/index.html#28}
 *
 * (1)
 * </blockquote>
 *
 * against the base URI {@code http://example.com/languages/java/} is the result
 * URI
 *
 * <blockquote>
 * {@code http://example.com/languages/java/sample/a/index.html#28}
 * </blockquote>
 *
 * Resolving the relative URI
 *
 * <blockquote>
 * {@code ../../demo/b/index.html} (2)
 * </blockquote>
 *
 * against this result yields, in turn,
 *
 * <blockquote>
 * {@code http://example.com/languages/java/demo/b/index.html}
 * </blockquote>
 *
 * Resolution of both absolute and relative URIs, and of both absolute and
 * relative paths in the case of hierarchical URIs, is supported. Resolving
 * the URI {@code file:///~/calendar} against any other URI simply yields the
 * original URI, since it is absolute.
Resolving the relative URI (2) above 212 * against the relative base URI (1) yields the normalized, but still relative, 213 * URI 214 * 215 * <blockquote> 216 * {@code demo/b/index.html} 217 * </blockquote> 218 * 219 * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any 220 * two normalized URIs <i>u</i> and <i>v</i>, 221 * 222 * <blockquote> 223 * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} and<br> 224 * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} .<br> 225 * </blockquote> 226 * 227 * This operation is often useful when constructing a document containing URIs 228 * that must be made relative to the base URI of the document wherever 229 * possible. For example, relativizing the URI 230 * 231 * <blockquote> 232 * {@code http://example.com/languages/java/sample/a/index.html#28} 233 * </blockquote> 234 * 235 * against the base URI 236 * 237 * <blockquote> 238 * {@code http://example.com/languages/java/} 239 * </blockquote> 240 * 241 * yields the relative URI {@code sample/a/index.html#28}. 242 * 243 * 244 * <h4> Character categories </h4> 245 * 246 * RFC 2396 specifies precisely which characters are permitted in the 247 * various components of a URI reference. 
The following categories, most of 248 * which are taken from that specification, are used below to describe these 249 * constraints: 250 * 251 * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other"> 252 * <tr><th valign=top><i>alpha</i></th> 253 * <td>The US-ASCII alphabetic characters, 254 * {@code 'A'} through {@code 'Z'} 255 * and {@code 'a'} through {@code 'z'}</td></tr> 256 * <tr><th valign=top><i>digit</i></th> 257 * <td>The US-ASCII decimal digit characters, 258 * {@code '0'} through {@code '9'}</td></tr> 259 * <tr><th valign=top><i>alphanum</i></th> 260 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> 261 * <tr><th valign=top><i>unreserved</i> </th> 262 * <td>All <i>alphanum</i> characters together with those in the string 263 * {@code "_-!.~'()*"}</td></tr> 264 * <tr><th valign=top><i>punct</i></th> 265 * <td>The characters in the string {@code ",;:$&+="}</td></tr> 266 * <tr><th valign=top><i>reserved</i></th> 267 * <td>All <i>punct</i> characters together with those in the string 268 * {@code "?/[]@"}</td></tr> 269 * <tr><th valign=top><i>escaped</i></th> 270 * <td>Escaped octets, that is, triplets consisting of the percent 271 * character ({@code '%'}) followed by two hexadecimal digits 272 * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and 273 * {@code 'a'}-{@code 'f'})</td></tr> 274 * <tr><th valign=top><i>other</i></th> 275 * <td>The Unicode characters that are not in the US-ASCII character set, 276 * are not control characters (according to the {@link 277 * java.lang.Character#isISOControl(char) Character.isISOControl} 278 * method), and are not space characters (according to the {@link 279 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} 280 * method) <i>(<b>Deviation from RFC 2396</b>, which is 281 * limited to US-ASCII)</i></td></tr> 282 * </table></blockquote> 283 * 284 * <p><a id="legal-chars"></a> The set of all legal URI characters 
consists of 285 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> 286 * characters. 287 * 288 * 289 * <h4> Escaped octets, quotation, encoding, and decoding </h4> 290 * 291 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and 292 * fragment components. Escaping serves two purposes in URIs: 293 * 294 * <ul> 295 * 296 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to 297 * conform strictly to RFC 2396 by not containing any <i>other</i> 298 * characters. </p></li> 299 * 300 * <li><p> To <i>quote</i> characters that are otherwise illegal in a 301 * component. The user-info, path, query, and fragment components differ 302 * slightly in terms of which characters are considered legal and illegal. 303 * </p></li> 304 * 305 * </ul> 306 * 307 * These purposes are served in this class by three related operations: 308 * 309 * <ul> 310 * 311 * <li><p><a id="encode"></a> A character is <i>encoded</i> by replacing it 312 * with the sequence of escaped octets that represent that character in the 313 * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), 314 * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from 315 * RFC 2396</b>, which does not specify any particular character 316 * set.)</i> </p></li> 317 * 318 * <li><p><a id="quote"></a> An illegal character is <i>quoted</i> simply by 319 * encoding it. The space character, for example, is quoted by replacing it 320 * with {@code "%20"}. UTF-8 contains US-ASCII, hence for US-ASCII 321 * characters this transformation has exactly the effect required by 322 * RFC 2396. </p></li> 323 * 324 * <li><p><a id="decode"></a> 325 * A sequence of escaped octets is <i>decoded</i> by 326 * replacing it with the sequence of characters that it represents in the 327 * UTF-8 character set. 
UTF-8 contains US-ASCII, hence decoding has the 328 * effect of de-quoting any quoted US-ASCII characters as well as that of 329 * decoding any encoded non-US-ASCII characters. If a <a 330 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs 331 * when decoding the escaped octets then the erroneous octets are replaced by 332 * {@code '\u005CuFFFD'}, the Unicode replacement character. </p></li> 333 * 334 * </ul> 335 * 336 * These operations are exposed in the constructors and methods of this class 337 * as follows: 338 * 339 * <ul> 340 * 341 * <li><p> The {@linkplain #URI(java.lang.String) single-argument 342 * constructor} requires any illegal characters in its argument to be 343 * quoted and preserves any escaped octets and <i>other</i> characters that 344 * are present. </p></li> 345 * 346 * <li><p> The {@linkplain 347 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) 348 * multi-argument constructors} quote illegal characters as 349 * required by the components in which they appear. The percent character 350 * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> 351 * characters are preserved. </p></li> 352 * 353 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() 354 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() 355 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link 356 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the 357 * values of their corresponding components in raw form, without interpreting 358 * any escaped octets. The strings returned by these methods may contain 359 * both escaped octets and <i>other</i> characters, and will not contain any 360 * illegal characters. 
</p></li> 361 * 362 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() 363 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() 364 * getFragment}, {@link #getAuthority() getAuthority}, and {@link 365 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped 366 * octets in their corresponding components. The strings returned by these 367 * methods may contain both <i>other</i> characters and illegal characters, 368 * and will not contain any escaped octets. </p></li> 369 * 370 * <li><p> The {@link #toString() toString} method returns a URI string with 371 * all necessary quotation but which may contain <i>other</i> characters. 372 * </p></li> 373 * 374 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully 375 * quoted and encoded URI string that does not contain any <i>other</i> 376 * characters. </p></li> 377 * 378 * </ul> 379 * 380 * 381 * <h4> Identities </h4> 382 * 383 * For any URI <i>u</i>, it is always the case that 384 * 385 * <blockquote> 386 * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . 
387 * </blockquote> 388 * 389 * For any URI <i>u</i> that does not contain redundant syntax such as two 390 * slashes before an empty authority (as in {@code file:///tmp/} ) or a 391 * colon following a host name but no port (as in 392 * {@code http://java.sun.com:} ), and that does not encode characters 393 * except those that must be quoted, the following identities also hold: 394 * <pre> 395 * new URI(<i>u</i>.getScheme(), 396 * <i>u</i>.getSchemeSpecificPart(), 397 * <i>u</i>.getFragment()) 398 * .equals(<i>u</i>)</pre> 399 * in all cases, 400 * <pre> 401 * new URI(<i>u</i>.getScheme(), 402 * <i>u</i>.getAuthority(), 403 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 404 * <i>u</i>.getFragment()) 405 * .equals(<i>u</i>)</pre> 406 * if <i>u</i> is hierarchical, and 407 * <pre> 408 * new URI(<i>u</i>.getScheme(), 409 * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), 410 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 411 * <i>u</i>.getFragment()) 412 * .equals(<i>u</i>)</pre> 413 * if <i>u</i> is hierarchical and has either no authority or a server-based 414 * authority. 415 * 416 * 417 * <h4> URIs, URLs, and URNs </h4> 418 * 419 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform 420 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but 421 * not every URI is a URL. This is because there is another subcategory of 422 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not 423 * specify how to locate them. The {@code mailto}, {@code news}, and 424 * {@code isbn} URIs shown above are examples of URNs. 425 * 426 * <p> The conceptual distinction between URIs and URLs is reflected in the 427 * differences between this class and the {@link URL} class. 428 * 429 * <p> An instance of this class represents a URI reference in the syntactic 430 * sense defined by RFC 2396. A URI may be either absolute or relative. 
431 * A URI string is parsed according to the generic syntax without regard to the 432 * scheme, if any, that it specifies. No lookup of the host, if any, is 433 * performed, and no scheme-dependent stream handler is constructed. Equality, 434 * hashing, and comparison are defined strictly in terms of the character 435 * content of the instance. In other words, a URI instance is little more than 436 * a structured string that supports the syntactic, scheme-independent 437 * operations of comparison, normalization, resolution, and relativization. 438 * 439 * <p> An instance of the {@link URL} class, by contrast, represents the 440 * syntactic components of a URL together with some of the information required 441 * to access the resource that it describes. A URL must be absolute, that is, 442 * it must always specify a scheme. A URL string is parsed according to its 443 * scheme. A stream handler is always established for a URL, and in fact it is 444 * impossible to create a URL instance for a scheme for which no handler is 445 * available. Equality and hashing depend upon both the scheme and the 446 * Internet address of the host, if any; comparison is not defined. In other 447 * words, a URL is a structured string that supports the syntactic operation of 448 * resolution as well as the network I/O operations of looking up the host and 449 * opening a connection to the specified resource. 
450 * 451 * 452 * @author Mark Reinhold 453 * @since 1.4 454 * 455 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a 456 * transformation format of ISO 10646</i></a>, <br><a 457 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing 458 * Architecture</i></a>, <br><a 459 * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 460 * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a 461 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 462 * Literal IPv6 Addresses in URLs</i></a>, <br><a 463 * href="URISyntaxException.html">URISyntaxException</a> 464 */ 465 466 public final class URI 467 implements Comparable<URI>, Serializable 468 { 469 470 // Note: Comments containing the word "ASSERT" indicate places where a 471 // throw of an InternalError should be replaced by an appropriate assertion 472 // statement once asserts are enabled in the build. 473 474 static final long serialVersionUID = -6052424284110960213L; 475 476 477 // -- Properties and components of this instance -- 478 479 // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>] 480 private transient String scheme; // null ==> relative URI 481 private transient String fragment; 482 483 // Hierarchical URI components: [//<authority>]<path>[?<query>] 484 private transient String authority; // Registry or server 485 486 // Server-based authority: [<userInfo>@]<host>[:<port>] 487 private transient String userInfo; 488 private transient String host; // null ==> registry-based 489 private transient int port = -1; // -1 ==> undefined 490 491 // Remaining components of hierarchical URIs 492 private transient String path; // null ==> opaque 493 private transient String query; 494 495 // The remaining fields may be computed on demand, which is safe even in 496 // the face of multiple threads racing to initialize them 497 private transient String schemeSpecificPart; 498 private transient int hash; // Zero ==> undefined 
499 500 private transient String decodedUserInfo; 501 private transient String decodedAuthority; 502 private transient String decodedPath; 503 private transient String decodedQuery; 504 private transient String decodedFragment; 505 private transient String decodedSchemeSpecificPart; 506 507 /** 508 * The string form of this URI. 509 * 510 * @serial 511 */ 512 private volatile String string; // The only serializable field 513 514 515 516 // -- Constructors and factories -- 517 518 private URI() { } // Used internally 519 520 /** 521 * Constructs a URI by parsing the given string. 522 * 523 * <p> This constructor parses the given string exactly as specified by the 524 * grammar in <a 525 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 526 * Appendix A, <b><i>except for the following deviations:</i></b> </p> 527 * 528 * <ul> 529 * 530 * <li><p> An empty authority component is permitted as long as it is 531 * followed by a non-empty path, a query component, or a fragment 532 * component. This allows the parsing of URIs such as 533 * {@code "file:///foo/bar"}, which seems to be the intent of 534 * RFC 2396 although the grammar does not permit it. If the 535 * authority component is empty then the user-information, host, and port 536 * components are undefined. </p></li> 537 * 538 * <li><p> Empty relative paths are permitted; this seems to be the 539 * intent of RFC 2396 although the grammar does not permit it. The 540 * primary consequence of this deviation is that a standalone fragment 541 * such as {@code "#foo"} parses as a relative URI with an empty path 542 * and the given fragment, and can be usefully <a 543 * href="#resolve-frag">resolved</a> against a base URI. 544 * 545 * <li><p> IPv4 addresses in host components are parsed rigorously, as 546 * specified by <a 547 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each 548 * element of a dotted-quad address must contain no more than three 549 * decimal digits. 
Each element is further constrained to have a value 550 * no greater than 255. </p></li> 551 * 552 * <li> <p> Hostnames in host components that comprise only a single 553 * domain label are permitted to start with an <i>alphanum</i> 554 * character. This seems to be the intent of <a 555 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 556 * section 3.2.2 although the grammar does not permit it. The 557 * consequence of this deviation is that the authority component of a 558 * hierarchical URI such as {@code s://123}, will parse as a server-based 559 * authority. </p></li> 560 * 561 * <li><p> IPv6 addresses are permitted for the host component. An IPv6 562 * address must be enclosed in square brackets ({@code '['} and 563 * {@code ']'}) as specified by <a 564 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The 565 * IPv6 address itself must parse according to <a 566 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 567 * addresses are further constrained to describe no more than sixteen 568 * bytes of address information, a constraint implicit in RFC 2373 569 * but not expressible in the grammar. </p></li> 570 * 571 * <li><p> Characters in the <i>other</i> category are permitted wherever 572 * RFC 2396 permits <i>escaped</i> octets, that is, in the 573 * user-information, path, query, and fragment components, as well as in 574 * the authority component if the authority is registry-based. This 575 * allows URIs to contain Unicode characters beyond those in the US-ASCII 576 * character set. 
</p></li> 577 * 578 * </ul> 579 * 580 * @param str The string to be parsed into a URI 581 * 582 * @throws NullPointerException 583 * If {@code str} is {@code null} 584 * 585 * @throws URISyntaxException 586 * If the given string violates RFC 2396, as augmented 587 * by the above deviations 588 */ 589 public URI(String str) throws URISyntaxException { 590 new Parser(str).parse(false); 591 } 592 593 /** 594 * Constructs a hierarchical URI from the given components. 595 * 596 * <p> If a scheme is given then the path, if also given, must either be 597 * empty or begin with a slash character ({@code '/'}). Otherwise a 598 * component of the new URI may be left undefined by passing {@code null} 599 * for the corresponding parameter or, in the case of the {@code port} 600 * parameter, by passing {@code -1}. 601 * 602 * <p> This constructor first builds a URI string from the given components 603 * according to the rules specified in <a 604 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 605 * section 5.2, step 7: </p> 606 * 607 * <ol> 608 * 609 * <li><p> Initially, the result string is empty. </p></li> 610 * 611 * <li><p> If a scheme is given then it is appended to the result, 612 * followed by a colon character ({@code ':'}). </p></li> 613 * 614 * <li><p> If user information, a host, or a port are given then the 615 * string {@code "//"} is appended. </p></li> 616 * 617 * <li><p> If user information is given then it is appended, followed by 618 * a commercial-at character ({@code '@'}). Any character not in the 619 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 620 * categories is <a href="#quote">quoted</a>. </p></li> 621 * 622 * <li><p> If a host is given then it is appended. If the host is a 623 * literal IPv6 address but is not enclosed in square brackets 624 * ({@code '['} and {@code ']'}) then the square brackets are added. 
625 * </p></li> 626 * 627 * <li><p> If a port number is given then a colon character 628 * ({@code ':'}) is appended, followed by the port number in decimal. 629 * </p></li> 630 * 631 * <li><p> If a path is given then it is appended. Any character not in 632 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 633 * categories, and not equal to the slash character ({@code '/'}) or the 634 * commercial-at character ({@code '@'}), is quoted. </p></li> 635 * 636 * <li><p> If a query is given then a question-mark character 637 * ({@code '?'}) is appended, followed by the query. Any character that 638 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 639 * </p></li> 640 * 641 * <li><p> Finally, if a fragment is given then a hash character 642 * ({@code '#'}) is appended, followed by the fragment. Any character 643 * that is not a legal URI character is quoted. </p></li> 644 * 645 * </ol> 646 * 647 * <p> The resulting URI string is then parsed as if by invoking the {@link 648 * #URI(String)} constructor and then invoking the {@link 649 * #parseServerAuthority()} method upon the result; this may cause a {@link 650 * URISyntaxException} to be thrown. 
</p> 651 * 652 * @param scheme Scheme name 653 * @param userInfo User name and authorization information 654 * @param host Host name 655 * @param port Port number 656 * @param path Path 657 * @param query Query 658 * @param fragment Fragment 659 * 660 * @throws URISyntaxException 661 * If both a scheme and a path are given but the path is relative, 662 * if the URI string constructed from the given components violates 663 * RFC 2396, or if the authority component of the string is 664 * present but cannot be parsed as a server-based authority 665 */ 666 public URI(String scheme, 667 String userInfo, String host, int port, 668 String path, String query, String fragment) 669 throws URISyntaxException 670 { 671 String s = toString(scheme, null, 672 null, userInfo, host, port, 673 path, query, fragment); 674 checkPath(s, scheme, path); 675 new Parser(s).parse(true); 676 } 677 678 /** 679 * Constructs a hierarchical URI from the given components. 680 * 681 * <p> If a scheme is given then the path, if also given, must either be 682 * empty or begin with a slash character ({@code '/'}). Otherwise a 683 * component of the new URI may be left undefined by passing {@code null} 684 * for the corresponding parameter. 685 * 686 * <p> This constructor first builds a URI string from the given components 687 * according to the rules specified in <a 688 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 689 * section 5.2, step 7: </p> 690 * 691 * <ol> 692 * 693 * <li><p> Initially, the result string is empty. </p></li> 694 * 695 * <li><p> If a scheme is given then it is appended to the result, 696 * followed by a colon character ({@code ':'}). </p></li> 697 * 698 * <li><p> If an authority is given then the string {@code "//"} is 699 * appended, followed by the authority. If the authority contains a 700 * literal IPv6 address then the address must be enclosed in square 701 * brackets ({@code '['} and {@code ']'}). 
Any character not in the 702 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 703 * categories, and not equal to the commercial-at character 704 * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> 705 * 706 * <li><p> If a path is given then it is appended. Any character not in 707 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 708 * categories, and not equal to the slash character ({@code '/'}) or the 709 * commercial-at character ({@code '@'}), is quoted. </p></li> 710 * 711 * <li><p> If a query is given then a question-mark character 712 * ({@code '?'}) is appended, followed by the query. Any character that 713 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 714 * </p></li> 715 * 716 * <li><p> Finally, if a fragment is given then a hash character 717 * ({@code '#'}) is appended, followed by the fragment. Any character 718 * that is not a legal URI character is quoted. </p></li> 719 * 720 * </ol> 721 * 722 * <p> The resulting URI string is then parsed as if by invoking the {@link 723 * #URI(String)} constructor and then invoking the {@link 724 * #parseServerAuthority()} method upon the result; this may cause a {@link 725 * URISyntaxException} to be thrown. 
</p> 726 * 727 * @param scheme Scheme name 728 * @param authority Authority 729 * @param path Path 730 * @param query Query 731 * @param fragment Fragment 732 * 733 * @throws URISyntaxException 734 * If both a scheme and a path are given but the path is relative, 735 * if the URI string constructed from the given components violates 736 * RFC 2396, or if the authority component of the string is 737 * present but cannot be parsed as a server-based authority 738 */ 739 public URI(String scheme, 740 String authority, 741 String path, String query, String fragment) 742 throws URISyntaxException 743 { 744 String s = toString(scheme, null, 745 authority, null, null, -1, 746 path, query, fragment); 747 checkPath(s, scheme, path); 748 new Parser(s).parse(false); 749 } 750 751 /** 752 * Constructs a hierarchical URI from the given components. 753 * 754 * <p> A component may be left undefined by passing {@code null}. 755 * 756 * <p> This convenience constructor works as if by invoking the 757 * seven-argument constructor as follows: 758 * 759 * <blockquote> 760 * {@code new} {@link #URI(String, String, String, int, String, String, String) 761 * URI}{@code (scheme, null, host, -1, path, null, fragment);} 762 * </blockquote> 763 * 764 * @param scheme Scheme name 765 * @param host Host name 766 * @param path Path 767 * @param fragment Fragment 768 * 769 * @throws URISyntaxException 770 * If the URI string constructed from the given components 771 * violates RFC 2396 772 */ 773 public URI(String scheme, String host, String path, String fragment) 774 throws URISyntaxException 775 { 776 this(scheme, null, host, -1, path, null, fragment); 777 } 778 779 /** 780 * Constructs a URI from the given components. 781 * 782 * <p> A component may be left undefined by passing {@code null}. 783 * 784 * <p> This constructor first builds a URI in string form using the given 785 * components as follows: </p> 786 * 787 * <ol> 788 * 789 * <li><p> Initially, the result string is empty. 
</p></li> 790 * 791 * <li><p> If a scheme is given then it is appended to the result, 792 * followed by a colon character ({@code ':'}). </p></li> 793 * 794 * <li><p> If a scheme-specific part is given then it is appended. Any 795 * character that is not a <a href="#legal-chars">legal URI character</a> 796 * is <a href="#quote">quoted</a>. </p></li> 797 * 798 * <li><p> Finally, if a fragment is given then a hash character 799 * ({@code '#'}) is appended to the string, followed by the fragment. 800 * Any character that is not a legal URI character is quoted. </p></li> 801 * 802 * </ol> 803 * 804 * <p> The resulting URI string is then parsed in order to create the new 805 * URI instance as if by invoking the {@link #URI(String)} constructor; 806 * this may cause a {@link URISyntaxException} to be thrown. </p> 807 * 808 * @param scheme Scheme name 809 * @param ssp Scheme-specific part 810 * @param fragment Fragment 811 * 812 * @throws URISyntaxException 813 * If the URI string constructed from the given components 814 * violates RFC 2396 815 */ 816 public URI(String scheme, String ssp, String fragment) 817 throws URISyntaxException 818 { 819 new Parser(toString(scheme, ssp, 820 null, null, null, -1, 821 null, null, fragment)) 822 .parse(false); 823 } 824 825 /** 826 * Constructs a simple URI consisting of only a scheme and a pre-validated 827 * path. Provides a fast-path for some internal cases. 828 */ 829 URI(String scheme, String path) { 830 assert validSchemeAndPath(scheme, path); 831 this.scheme = scheme; 832 this.path = path; 833 } 834 835 private static boolean validSchemeAndPath(String scheme, String path) { 836 try { 837 URI u = new URI(scheme + ":" + path); 838 return scheme.equals(u.scheme) && path.equals(u.path); 839 } catch (URISyntaxException e) { 840 return false; 841 } 842 } 843 844 /** 845 * Creates a URI by parsing the given string. 
*
     * <p> This convenience factory method works as if by invoking the {@link
     * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
     * constructor is caught and wrapped in a new {@link
     * IllegalArgumentException} object, which is then thrown.
     *
     * <p> This method is provided for use in situations where it is known that
     * the given string is a legal URI, for example for URI constants declared
     * within a program, and so it would be considered a programming error
     * for the string not to parse as such. The constructors, which throw
     * {@link URISyntaxException} directly, should be used in situations where
     * a URI is being constructed from user input or from some other source
     * that may be prone to errors. </p>
     *
     * @param  str   The string to be parsed into a URI
     * @return The new URI
     *
     * @throws  NullPointerException
     *          If {@code str} is {@code null}
     *
     * @throws  IllegalArgumentException
     *          If the given string violates RFC 2396
     */
    public static URI create(String str) {
        try {
            return new URI(str);
        } catch (URISyntaxException x) {
            // Rethrow as unchecked, reusing the message and keeping the
            // original exception as the cause.
            throw new IllegalArgumentException(x.getMessage(), x);
        }
    }


    // -- Operations --

    /**
     * Attempts to parse this URI's authority component, if defined, into
     * user-information, host, and port components.
     *
     * <p> If this URI's authority component has already been recognized as
     * being server-based then it will already have been parsed into
     * user-information, host, and port components. In this case, or if this
     * URI has no authority component, this method simply returns this URI.
     *
     * <p> Otherwise this method attempts once more to parse the authority
     * component into user-information, host, and port components, and throws
     * an exception describing why the authority component could not be parsed
     * in that way.
893 * 894 * <p> This method is provided because the generic URI syntax specified in 895 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 896 * cannot always distinguish a malformed server-based authority from a 897 * legitimate registry-based authority. It must therefore treat some 898 * instances of the former as instances of the latter. The authority 899 * component in the URI string {@code "//foo:bar"}, for example, is not a 900 * legal server-based authority but it is legal as a registry-based 901 * authority. 902 * 903 * <p> In many common situations, for example when working URIs that are 904 * known to be either URNs or URLs, the hierarchical URIs being used will 905 * always be server-based. They therefore must either be parsed as such or 906 * treated as an error. In these cases a statement such as 907 * 908 * <blockquote> 909 * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} 910 * </blockquote> 911 * 912 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if 913 * it has an authority component, has a server-based authority with proper 914 * user-information, host, and port components. Invoking this method also 915 * ensures that if the authority could not be parsed in that way then an 916 * appropriate diagnostic message can be issued based upon the exception 917 * that is thrown. </p> 918 * 919 * @return A URI whose authority field has been parsed 920 * as a server-based authority 921 * 922 * @throws URISyntaxException 923 * If the authority component of this URI is defined 924 * but cannot be parsed as a server-based authority 925 * according to RFC 2396 926 */ 927 public URI parseServerAuthority() 928 throws URISyntaxException 929 { 930 // We could be clever and cache the error message and index from the 931 // exception thrown during the original parse, but that would require 932 // either more fields or a more-obscure representation. 
933 if ((host != null) || (authority == null)) 934 return this; 935 new Parser(toString()).parse(true); 936 return this; 937 } 938 939 /** 940 * Normalizes this URI's path. 941 * 942 * <p> If this URI is opaque, or if its path is already in normal form, 943 * then this URI is returned. Otherwise a new URI is constructed that is 944 * identical to this URI except that its path is computed by normalizing 945 * this URI's path in a manner consistent with <a 946 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 947 * section 5.2, step 6, sub-steps c through f; that is: 948 * </p> 949 * 950 * <ol> 951 * 952 * <li><p> All {@code "."} segments are removed. </p></li> 953 * 954 * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} 955 * segment then both of these segments are removed. This step is 956 * repeated until it is no longer applicable. </p></li> 957 * 958 * <li><p> If the path is relative, and if its first segment contains a 959 * colon character ({@code ':'}), then a {@code "."} segment is 960 * prepended. This prevents a relative URI with a path such as 961 * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a 962 * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. 963 * <b><i>(Deviation from RFC 2396)</i></b> </p></li> 964 * 965 * </ol> 966 * 967 * <p> A normalized path will begin with one or more {@code ".."} segments 968 * if there were insufficient non-{@code ".."} segments preceding them to 969 * allow their removal. A normalized path will begin with a {@code "."} 970 * segment if one was inserted by step 3 above. Otherwise, a normalized 971 * path will not contain any {@code "."} or {@code ".."} segments. </p> 972 * 973 * @return A URI equivalent to this URI, 974 * but whose path is in normal form 975 */ 976 public URI normalize() { 977 return normalize(this); 978 } 979 980 /** 981 * Resolves the given URI against this URI. 
982 * 983 * <p> If the given URI is already absolute, or if this URI is opaque, then 984 * the given URI is returned. 985 * 986 * <p><a id="resolve-frag"></a> If the given URI's fragment component is 987 * defined, its path component is empty, and its scheme, authority, and 988 * query components are undefined, then a URI with the given fragment but 989 * with all other components equal to those of this URI is returned. This 990 * allows a URI representing a standalone fragment reference, such as 991 * {@code "#foo"}, to be usefully resolved against a base URI. 992 * 993 * <p> Otherwise this method constructs a new hierarchical URI in a manner 994 * consistent with <a 995 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 996 * section 5.2; that is: </p> 997 * 998 * <ol> 999 * 1000 * <li><p> A new URI is constructed with this URI's scheme and the given 1001 * URI's query and fragment components. </p></li> 1002 * 1003 * <li><p> If the given URI has an authority component then the new URI's 1004 * authority and path are taken from the given URI. </p></li> 1005 * 1006 * <li><p> Otherwise the new URI's authority component is copied from 1007 * this URI, and its path is computed as follows: </p> 1008 * 1009 * <ol> 1010 * 1011 * <li><p> If the given URI's path is absolute then the new URI's path 1012 * is taken from the given URI. </p></li> 1013 * 1014 * <li><p> Otherwise the given URI's path is relative, and so the new 1015 * URI's path is computed by resolving the path of the given URI 1016 * against the path of this URI. This is done by concatenating all but 1017 * the last segment of this URI's path, if any, with the given URI's 1018 * path and then normalizing the result as if by invoking the {@link 1019 * #normalize() normalize} method. </p></li> 1020 * 1021 * </ol></li> 1022 * 1023 * </ol> 1024 * 1025 * <p> The result of this method is absolute if, and only if, either this 1026 * URI is absolute or the given URI is absolute. 
</p>
     *
     * @param  uri  The URI to be resolved against this URI
     * @return The resulting URI
     *
     * @throws  NullPointerException
     *          If {@code uri} is {@code null}
     */
    public URI resolve(URI uri) {
        // This URI acts as the base; the work is done by the two-argument
        // resolve overload.
        return resolve(this, uri);
    }

    /**
     * Constructs a new URI by parsing the given string and then resolving it
     * against this URI.
     *
     * <p> This convenience method works as if invoking it were equivalent to
     * evaluating the expression {@link #resolve(java.net.URI)
     * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p>
     *
     * @param  str   The string to be parsed into a URI
     * @return The resulting URI
     *
     * @throws  NullPointerException
     *          If {@code str} is {@code null}
     *
     * @throws  IllegalArgumentException
     *          If the given string violates RFC 2396
     */
    public URI resolve(String str) {
        // URI.create wraps any syntax error in an IllegalArgumentException
        return resolve(URI.create(str));
    }

    /**
     * Relativizes the given URI against this URI.
     *
     * <p> The relativization of the given URI against this URI is computed as
     * follows: </p>
     *
     * <ol>
     *
     *   <li><p> If either this URI or the given URI are opaque, or if the
     *   scheme and authority components of the two URIs are not identical, or
     *   if the path of this URI is not a prefix of the path of the given URI,
     *   then the given URI is returned. </p></li>
     *
     *   <li><p> Otherwise a new relative hierarchical URI is constructed with
     *   query and fragment components taken from the given URI and with a path
     *   component computed by removing this URI's path from the beginning of
     *   the given URI's path.
</p></li>
     *
     * </ol>
     *
     * @param  uri  The URI to be relativized against this URI
     * @return The resulting URI
     *
     * @throws  NullPointerException
     *          If {@code uri} is {@code null}
     */
    public URI relativize(URI uri) {
        // This URI acts as the base; the work is done by the two-argument
        // relativize overload.
        return relativize(this, uri);
    }

    /**
     * Constructs a URL from this URI.
     *
     * <p> This convenience method works as if invoking it were equivalent to
     * evaluating the expression {@code new URL(this.toString())} after
     * first checking that this URI is absolute. </p>
     *
     * @return  A URL constructed from this URI
     *
     * @throws  IllegalArgumentException
     *          If this URI is not absolute
     *
     * @throws  MalformedURLException
     *          If a protocol handler for the URL could not be found,
     *          or if some other error occurred while constructing the URL
     */
    public URL toURL() throws MalformedURLException {
        return URL.fromURI(this);
    }

    // -- Component access methods --

    /**
     * Returns the scheme component of this URI.
     *
     * <p> The scheme component of a URI, if defined, only contains characters
     * in the <i>alphanum</i> category and in the string {@code "-.+"}. A
     * scheme always starts with an <i>alpha</i> character. </p>
     *
     * <p> The scheme component of a URI cannot contain escaped octets, hence
     * this method does not perform any decoding. </p>
     *
     * @return  The scheme component of this URI,
     *          or {@code null} if the scheme is undefined
     */
    public String getScheme() {
        return scheme;
    }

    /**
     * Tells whether or not this URI is absolute.
     *
     * <p> A URI is absolute if, and only if, it has a scheme component. </p>
     *
     * @return  {@code true} if, and only if, this URI is absolute
     */
    public boolean isAbsolute() {
        return scheme != null;
    }

    /**
     * Tells whether or not this URI is opaque.
1141 * 1142 * <p> A URI is opaque if, and only if, it is absolute and its 1143 * scheme-specific part does not begin with a slash character ('/'). 1144 * An opaque URI has a scheme, a scheme-specific part, and possibly 1145 * a fragment; all other components are undefined. </p> 1146 * 1147 * @return {@code true} if, and only if, this URI is opaque 1148 */ 1149 public boolean isOpaque() { 1150 return path == null; 1151 } 1152 1153 /** 1154 * Returns the raw scheme-specific part of this URI. The scheme-specific 1155 * part is never undefined, though it may be empty. 1156 * 1157 * <p> The scheme-specific part of a URI only contains legal URI 1158 * characters. </p> 1159 * 1160 * @return The raw scheme-specific part of this URI 1161 * (never {@code null}) 1162 */ 1163 public String getRawSchemeSpecificPart() { 1164 String part = schemeSpecificPart; 1165 if (part != null) { 1166 return part; 1167 } 1168 1169 String s = string; 1170 if (s != null) { 1171 // if string is defined, components will have been parsed 1172 int start = 0; 1173 int end = s.length(); 1174 if (scheme != null) { 1175 start = scheme.length() + 1; 1176 } 1177 if (fragment != null) { 1178 end -= fragment.length() + 1; 1179 } 1180 if (path != null && path.length() == end - start) { 1181 part = path; 1182 } else { 1183 part = s.substring(start, end); 1184 } 1185 } else { 1186 StringBuilder sb = new StringBuilder(); 1187 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), 1188 host, port, getPath(), getQuery()); 1189 part = sb.toString(); 1190 } 1191 return schemeSpecificPart = part; 1192 } 1193 1194 /** 1195 * Returns the decoded scheme-specific part of this URI. 1196 * 1197 * <p> The string returned by this method is equal to that returned by the 1198 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method 1199 * except that all sequences of escaped octets are <a 1200 * href="#decode">decoded</a>. 
</p> 1201 * 1202 * @return The decoded scheme-specific part of this URI 1203 * (never {@code null}) 1204 */ 1205 public String getSchemeSpecificPart() { 1206 String part = decodedSchemeSpecificPart; 1207 if (part == null) { 1208 decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart()); 1209 } 1210 return part; 1211 } 1212 1213 /** 1214 * Returns the raw authority component of this URI. 1215 * 1216 * <p> The authority component of a URI, if defined, only contains the 1217 * commercial-at character ({@code '@'}) and characters in the 1218 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> 1219 * categories. If the authority is server-based then it is further 1220 * constrained to have valid user-information, host, and port 1221 * components. </p> 1222 * 1223 * @return The raw authority component of this URI, 1224 * or {@code null} if the authority is undefined 1225 */ 1226 public String getRawAuthority() { 1227 return authority; 1228 } 1229 1230 /** 1231 * Returns the decoded authority component of this URI. 1232 * 1233 * <p> The string returned by this method is equal to that returned by the 1234 * {@link #getRawAuthority() getRawAuthority} method except that all 1235 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1236 * 1237 * @return The decoded authority component of this URI, 1238 * or {@code null} if the authority is undefined 1239 */ 1240 public String getAuthority() { 1241 String auth = decodedAuthority; 1242 if ((auth == null) && (authority != null)) { 1243 decodedAuthority = auth = decode(authority); 1244 } 1245 return auth; 1246 } 1247 1248 /** 1249 * Returns the raw user-information component of this URI. 1250 * 1251 * <p> The user-information component of a URI, if defined, only contains 1252 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and 1253 * <i>other</i> categories. 
</p> 1254 * 1255 * @return The raw user-information component of this URI, 1256 * or {@code null} if the user information is undefined 1257 */ 1258 public String getRawUserInfo() { 1259 return userInfo; 1260 } 1261 1262 /** 1263 * Returns the decoded user-information component of this URI. 1264 * 1265 * <p> The string returned by this method is equal to that returned by the 1266 * {@link #getRawUserInfo() getRawUserInfo} method except that all 1267 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1268 * 1269 * @return The decoded user-information component of this URI, 1270 * or {@code null} if the user information is undefined 1271 */ 1272 public String getUserInfo() { 1273 String user = decodedUserInfo; 1274 if ((user == null) && (userInfo != null)) { 1275 decodedUserInfo = user = decode(userInfo); 1276 } 1277 return user; 1278 } 1279 1280 /** 1281 * Returns the host component of this URI. 1282 * 1283 * <p> The host component of a URI, if defined, will have one of the 1284 * following forms: </p> 1285 * 1286 * <ul> 1287 * 1288 * <li><p> A domain name consisting of one or more <i>labels</i> 1289 * separated by period characters ({@code '.'}), optionally followed by 1290 * a period character. Each label consists of <i>alphanum</i> characters 1291 * as well as hyphen characters ({@code '-'}), though hyphens never 1292 * occur as the first or last characters in a label. The rightmost 1293 * label of a domain name consisting of two or more labels, begins 1294 * with an <i>alpha</i> character. </li> 1295 * 1296 * <li><p> A dotted-quad IPv4 address of the form 1297 * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, 1298 * where no <i>digit</i> sequence is longer than three characters and no 1299 * sequence has a value larger than 255. 
</p></li>
     *
     *   <li><p> An IPv6 address enclosed in square brackets ({@code '['} and
     *   {@code ']'}) and consisting of hexadecimal digits, colon characters
     *   ({@code ':'}), and possibly an embedded IPv4 address. The full
     *   syntax of IPv6 addresses is specified in <a
     *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6
     *   Addressing Architecture</i></a>. </p></li>
     *
     * </ul>
     *
     * <p> The host component of a URI cannot contain escaped octets, hence
     * this method does not perform any decoding. </p>
     *
     * @return  The host component of this URI,
     *          or {@code null} if the host is undefined
     */
    public String getHost() {
        return host;
    }

    /**
     * Returns the port number of this URI.
     *
     * <p> The port component of a URI, if defined, is a non-negative
     * integer. </p>
     *
     * @return  The port component of this URI,
     *          or {@code -1} if the port is undefined
     */
    public int getPort() {
        return port;
    }

    /**
     * Returns the raw path component of this URI.
     *
     * <p> The path component of a URI, if defined, only contains the slash
     * character ({@code '/'}), the commercial-at character ({@code '@'}),
     * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
     * and <i>other</i> categories. </p>
     *
     * @return  The path component of this URI,
     *          or {@code null} if the path is undefined
     */
    public String getRawPath() {
        return path;
    }

    /**
     * Returns the decoded path component of this URI.
     *
     * <p> The string returned by this method is equal to that returned by the
     * {@link #getRawPath() getRawPath} method except that all sequences of
     * escaped octets are <a href="#decode">decoded</a>.
</p> 1354 * 1355 * @return The decoded path component of this URI, 1356 * or {@code null} if the path is undefined 1357 */ 1358 public String getPath() { 1359 String decoded = decodedPath; 1360 if ((decoded == null) && (path != null)) { 1361 decodedPath = decoded = decode(path); 1362 } 1363 return decoded; 1364 } 1365 1366 /** 1367 * Returns the raw query component of this URI. 1368 * 1369 * <p> The query component of a URI, if defined, only contains legal URI 1370 * characters. </p> 1371 * 1372 * @return The raw query component of this URI, 1373 * or {@code null} if the query is undefined 1374 */ 1375 public String getRawQuery() { 1376 return query; 1377 } 1378 1379 /** 1380 * Returns the decoded query component of this URI. 1381 * 1382 * <p> The string returned by this method is equal to that returned by the 1383 * {@link #getRawQuery() getRawQuery} method except that all sequences of 1384 * escaped octets are <a href="#decode">decoded</a>. </p> 1385 * 1386 * @return The decoded query component of this URI, 1387 * or {@code null} if the query is undefined 1388 */ 1389 public String getQuery() { 1390 String decoded = decodedQuery; 1391 if ((decoded == null) && (query != null)) { 1392 decodedQuery = decoded = decode(query, false); 1393 } 1394 return decoded; 1395 } 1396 1397 /** 1398 * Returns the raw fragment component of this URI. 1399 * 1400 * <p> The fragment component of a URI, if defined, only contains legal URI 1401 * characters. </p> 1402 * 1403 * @return The raw fragment component of this URI, 1404 * or {@code null} if the fragment is undefined 1405 */ 1406 public String getRawFragment() { 1407 return fragment; 1408 } 1409 1410 /** 1411 * Returns the decoded fragment component of this URI. 1412 * 1413 * <p> The string returned by this method is equal to that returned by the 1414 * {@link #getRawFragment() getRawFragment} method except that all 1415 * sequences of escaped octets are <a href="#decode">decoded</a>. 
</p> 1416 * 1417 * @return The decoded fragment component of this URI, 1418 * or {@code null} if the fragment is undefined 1419 */ 1420 public String getFragment() { 1421 String decoded = decodedFragment; 1422 if ((decoded == null) && (fragment != null)) { 1423 decodedFragment = decoded = decode(fragment, false); 1424 } 1425 return decoded; 1426 } 1427 1428 1429 // -- Equality, comparison, hash code, toString, and serialization -- 1430 1431 /** 1432 * Tests this URI for equality with another object. 1433 * 1434 * <p> If the given object is not a URI then this method immediately 1435 * returns {@code false}. 1436 * 1437 * <p> For two URIs to be considered equal requires that either both are 1438 * opaque or both are hierarchical. Their schemes must either both be 1439 * undefined or else be equal without regard to case. Their fragments 1440 * must either both be undefined or else be equal. 1441 * 1442 * <p> For two opaque URIs to be considered equal, their scheme-specific 1443 * parts must be equal. 1444 * 1445 * <p> For two hierarchical URIs to be considered equal, their paths must 1446 * be equal and their queries must either both be undefined or else be 1447 * equal. Their authorities must either both be undefined, or both be 1448 * registry-based, or both be server-based. If their authorities are 1449 * defined and are registry-based, then they must be equal. If their 1450 * authorities are defined and are server-based, then their hosts must be 1451 * equal without regard to case, their port numbers must be equal, and 1452 * their user-information components must be equal. 1453 * 1454 * <p> When testing the user-information, path, query, fragment, authority, 1455 * or scheme-specific parts of two URIs for equality, the raw forms rather 1456 * than the encoded forms of these components are compared and the 1457 * hexadecimal digits of escaped octets are compared without regard to 1458 * case. 
1459 * 1460 * <p> This method satisfies the general contract of the {@link 1461 * java.lang.Object#equals(Object) Object.equals} method. </p> 1462 * 1463 * @param ob The object to which this object is to be compared 1464 * 1465 * @return {@code true} if, and only if, the given object is a URI that 1466 * is identical to this URI 1467 */ 1468 public boolean equals(Object ob) { 1469 if (ob == this) 1470 return true; 1471 if (!(ob instanceof URI)) 1472 return false; 1473 URI that = (URI)ob; 1474 if (this.isOpaque() != that.isOpaque()) return false; 1475 if (!equalIgnoringCase(this.scheme, that.scheme)) return false; 1476 if (!equal(this.fragment, that.fragment)) return false; 1477 1478 // Opaque 1479 if (this.isOpaque()) 1480 return equal(this.schemeSpecificPart, that.schemeSpecificPart); 1481 1482 // Hierarchical 1483 if (!equal(this.path, that.path)) return false; 1484 if (!equal(this.query, that.query)) return false; 1485 1486 // Authorities 1487 if (this.authority == that.authority) return true; 1488 if (this.host != null) { 1489 // Server-based 1490 if (!equal(this.userInfo, that.userInfo)) return false; 1491 if (!equalIgnoringCase(this.host, that.host)) return false; 1492 if (this.port != that.port) return false; 1493 } else if (this.authority != null) { 1494 // Registry-based 1495 if (!equal(this.authority, that.authority)) return false; 1496 } else if (this.authority != that.authority) { 1497 return false; 1498 } 1499 1500 return true; 1501 } 1502 1503 /** 1504 * Returns a hash-code value for this URI. The hash code is based upon all 1505 * of the URI's components, and satisfies the general contract of the 1506 * {@link java.lang.Object#hashCode() Object.hashCode} method. 
1507 * 1508 * @return A hash-code value for this URI 1509 */ 1510 public int hashCode() { 1511 int h = hash; 1512 if (h == 0) { 1513 h = hashIgnoringCase(0, scheme); 1514 h = hash(h, fragment); 1515 if (isOpaque()) { 1516 h = hash(h, schemeSpecificPart); 1517 } else { 1518 h = hash(h, path); 1519 h = hash(h, query); 1520 if (host != null) { 1521 h = hash(h, userInfo); 1522 h = hashIgnoringCase(h, host); 1523 h += 1949 * port; 1524 } else { 1525 h = hash(h, authority); 1526 } 1527 } 1528 if (h != 0) { 1529 hash = h; 1530 } 1531 } 1532 return h; 1533 } 1534 1535 /** 1536 * Compares this URI to another object, which must be a URI. 1537 * 1538 * <p> When comparing corresponding components of two URIs, if one 1539 * component is undefined but the other is defined then the first is 1540 * considered to be less than the second. Unless otherwise noted, string 1541 * components are ordered according to their natural, case-sensitive 1542 * ordering as defined by the {@link java.lang.String#compareTo(Object) 1543 * String.compareTo} method. String components that are subject to 1544 * encoding are compared by comparing their raw forms rather than their 1545 * encoded forms. 1546 * 1547 * <p> The ordering of URIs is defined as follows: </p> 1548 * 1549 * <ul> 1550 * 1551 * <li><p> Two URIs with different schemes are ordered according the 1552 * ordering of their schemes, without regard to case. </p></li> 1553 * 1554 * <li><p> A hierarchical URI is considered to be less than an opaque URI 1555 * with an identical scheme. </p></li> 1556 * 1557 * <li><p> Two opaque URIs with identical schemes are ordered according 1558 * to the ordering of their scheme-specific parts. </p></li> 1559 * 1560 * <li><p> Two opaque URIs with identical schemes and scheme-specific 1561 * parts are ordered according to the ordering of their 1562 * fragments. 
</p></li> 1563 * 1564 * <li><p> Two hierarchical URIs with identical schemes are ordered 1565 * according to the ordering of their authority components: </p> 1566 * 1567 * <ul> 1568 * 1569 * <li><p> If both authority components are server-based then the URIs 1570 * are ordered according to their user-information components; if these 1571 * components are identical then the URIs are ordered according to the 1572 * ordering of their hosts, without regard to case; if the hosts are 1573 * identical then the URIs are ordered according to the ordering of 1574 * their ports. </p></li> 1575 * 1576 * <li><p> If one or both authority components are registry-based then 1577 * the URIs are ordered according to the ordering of their authority 1578 * components. </p></li> 1579 * 1580 * </ul></li> 1581 * 1582 * <li><p> Finally, two hierarchical URIs with identical schemes and 1583 * authority components are ordered according to the ordering of their 1584 * paths; if their paths are identical then they are ordered according to 1585 * the ordering of their queries; if the queries are identical then they 1586 * are ordered according to the order of their fragments. </p></li> 1587 * 1588 * </ul> 1589 * 1590 * <p> This method satisfies the general contract of the {@link 1591 * java.lang.Comparable#compareTo(Object) Comparable.compareTo} 1592 * method. 
</p> 1593 * 1594 * @param that 1595 * The object to which this URI is to be compared 1596 * 1597 * @return A negative integer, zero, or a positive integer as this URI is 1598 * less than, equal to, or greater than the given URI 1599 * 1600 * @throws ClassCastException 1601 * If the given object is not a URI 1602 */ 1603 public int compareTo(URI that) { 1604 int c; 1605 1606 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) 1607 return c; 1608 1609 if (this.isOpaque()) { 1610 if (that.isOpaque()) { 1611 // Both opaque 1612 if ((c = compare(this.schemeSpecificPart, 1613 that.schemeSpecificPart)) != 0) 1614 return c; 1615 return compare(this.fragment, that.fragment); 1616 } 1617 return +1; // Opaque > hierarchical 1618 } else if (that.isOpaque()) { 1619 return -1; // Hierarchical < opaque 1620 } 1621 1622 // Hierarchical 1623 if ((this.host != null) && (that.host != null)) { 1624 // Both server-based 1625 if ((c = compare(this.userInfo, that.userInfo)) != 0) 1626 return c; 1627 if ((c = compareIgnoringCase(this.host, that.host)) != 0) 1628 return c; 1629 if ((c = this.port - that.port) != 0) 1630 return c; 1631 } else { 1632 // If one or both authorities are registry-based then we simply 1633 // compare them in the usual, case-sensitive way. If one is 1634 // registry-based and one is server-based then the strings are 1635 // guaranteed to be unequal, hence the comparison will never return 1636 // zero and the compareTo and equals methods will remain 1637 // consistent. 1638 if ((c = compare(this.authority, that.authority)) != 0) return c; 1639 } 1640 1641 if ((c = compare(this.path, that.path)) != 0) return c; 1642 if ((c = compare(this.query, that.query)) != 0) return c; 1643 return compare(this.fragment, that.fragment); 1644 } 1645 1646 /** 1647 * Returns the content of this URI as a string. 
1648 * 1649 * <p> If this URI was created by invoking one of the constructors in this 1650 * class then a string equivalent to the original input string, or to the 1651 * string computed from the originally-given components, as appropriate, is 1652 * returned. Otherwise this URI was created by normalization, resolution, 1653 * or relativization, and so a string is constructed from this URI's 1654 * components according to the rules specified in <a 1655 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1656 * section 5.2, step 7. </p> 1657 * 1658 * @return The string form of this URI 1659 */ 1660 public String toString() { 1661 String s = string; 1662 if (s == null) { 1663 s = defineString(); 1664 } 1665 return s; 1666 } 1667 1668 private String defineString() { 1669 String s = string; 1670 if (s != null) { 1671 return s; 1672 } 1673 1674 StringBuilder sb = new StringBuilder(); 1675 if (scheme != null) { 1676 sb.append(scheme); 1677 sb.append(':'); 1678 } 1679 if (isOpaque()) { 1680 sb.append(schemeSpecificPart); 1681 } else { 1682 if (host != null) { 1683 sb.append("//"); 1684 if (userInfo != null) { 1685 sb.append(userInfo); 1686 sb.append('@'); 1687 } 1688 boolean needBrackets = ((host.indexOf(':') >= 0) 1689 && !host.startsWith("[") 1690 && !host.endsWith("]")); 1691 if (needBrackets) sb.append('['); 1692 sb.append(host); 1693 if (needBrackets) sb.append(']'); 1694 if (port != -1) { 1695 sb.append(':'); 1696 sb.append(port); 1697 } 1698 } else if (authority != null) { 1699 sb.append("//"); 1700 sb.append(authority); 1701 } 1702 if (path != null) 1703 sb.append(path); 1704 if (query != null) { 1705 sb.append('?'); 1706 sb.append(query); 1707 } 1708 } 1709 if (fragment != null) { 1710 sb.append('#'); 1711 sb.append(fragment); 1712 } 1713 return string = sb.toString(); 1714 } 1715 1716 /** 1717 * Returns the content of this URI as a US-ASCII string. 
1718 * 1719 * <p> If this URI does not contain any characters in the <i>other</i> 1720 * category then an invocation of this method will return the same value as 1721 * an invocation of the {@link #toString() toString} method. Otherwise 1722 * this method works as if by invoking that method and then <a 1723 * href="#encode">encoding</a> the result. </p> 1724 * 1725 * @return The string form of this URI, encoded as needed 1726 * so that it only contains characters in the US-ASCII 1727 * charset 1728 */ 1729 public String toASCIIString() { 1730 return encode(toString()); 1731 } 1732 1733 1734 // -- Serialization support -- 1735 1736 /** 1737 * Saves the content of this URI to the given serial stream. 1738 * 1739 * <p> The only serializable field of a URI instance is its {@code string} 1740 * field. That field is given a value, if it does not have one already, 1741 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} 1742 * method of the given object-output stream is invoked. </p> 1743 * 1744 * @param os The object-output stream to which this object 1745 * is to be written 1746 */ 1747 private void writeObject(ObjectOutputStream os) 1748 throws IOException 1749 { 1750 defineString(); 1751 os.defaultWriteObject(); // Writes the string field only 1752 } 1753 1754 /** 1755 * Reconstitutes a URI from the given serial stream. 1756 * 1757 * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is 1758 * invoked to read the value of the {@code string} field. The result is 1759 * then parsed in the usual way. 
1760 * 1761 * @param is The object-input stream from which this object 1762 * is being read 1763 */ 1764 private void readObject(ObjectInputStream is) 1765 throws ClassNotFoundException, IOException 1766 { 1767 port = -1; // Argh 1768 is.defaultReadObject(); 1769 try { 1770 new Parser(string).parse(false); 1771 } catch (URISyntaxException x) { 1772 IOException y = new InvalidObjectException("Invalid URI"); 1773 y.initCause(x); 1774 throw y; 1775 } 1776 } 1777 1778 1779 // -- End of public methods -- 1780 1781 1782 // -- Utility methods for string-field comparison and hashing -- 1783 1784 // These methods return appropriate values for null string arguments, 1785 // thereby simplifying the equals, hashCode, and compareTo methods. 1786 // 1787 // The case-ignoring methods should only be applied to strings whose 1788 // characters are all known to be US-ASCII. Because of this restriction, 1789 // these methods are faster than the similar methods in the String class. 1790 1791 // US-ASCII only 1792 private static int toLower(char c) { 1793 if ((c >= 'A') && (c <= 'Z')) 1794 return c + ('a' - 'A'); 1795 return c; 1796 } 1797 1798 // US-ASCII only 1799 private static int toUpper(char c) { 1800 if ((c >= 'a') && (c <= 'z')) 1801 return c - ('a' - 'A'); 1802 return c; 1803 } 1804 1805 private static boolean equal(String s, String t) { 1806 if (s == t) return true; 1807 if ((s != null) && (t != null)) { 1808 if (s.length() != t.length()) 1809 return false; 1810 if (s.indexOf('%') < 0) 1811 return s.equals(t); 1812 int n = s.length(); 1813 for (int i = 0; i < n;) { 1814 char c = s.charAt(i); 1815 char d = t.charAt(i); 1816 if (c != '%') { 1817 if (c != d) 1818 return false; 1819 i++; 1820 continue; 1821 } 1822 if (d != '%') 1823 return false; 1824 i++; 1825 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1826 return false; 1827 i++; 1828 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1829 return false; 1830 i++; 1831 } 1832 return true; 1833 } 1834 return false; 1835 } 
1836 1837 // US-ASCII only 1838 private static boolean equalIgnoringCase(String s, String t) { 1839 if (s == t) return true; 1840 if ((s != null) && (t != null)) { 1841 int n = s.length(); 1842 if (t.length() != n) 1843 return false; 1844 for (int i = 0; i < n; i++) { 1845 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1846 return false; 1847 } 1848 return true; 1849 } 1850 return false; 1851 } 1852 1853 private static int hash(int hash, String s) { 1854 if (s == null) return hash; 1855 return s.indexOf('%') < 0 ? hash * 127 + s.hashCode() 1856 : normalizedHash(hash, s); 1857 } 1858 1859 1860 private static int normalizedHash(int hash, String s) { 1861 int h = 0; 1862 for (int index = 0; index < s.length(); index++) { 1863 char ch = s.charAt(index); 1864 h = 31 * h + ch; 1865 if (ch == '%') { 1866 /* 1867 * Process the next two encoded characters 1868 */ 1869 for (int i = index + 1; i < index + 3; i++) 1870 h = 31 * h + toUpper(s.charAt(i)); 1871 index += 2; 1872 } 1873 } 1874 return hash * 127 + h; 1875 } 1876 1877 // US-ASCII only 1878 private static int hashIgnoringCase(int hash, String s) { 1879 if (s == null) return hash; 1880 int h = hash; 1881 int n = s.length(); 1882 for (int i = 0; i < n; i++) 1883 h = 31 * h + toLower(s.charAt(i)); 1884 return h; 1885 } 1886 1887 private static int compare(String s, String t) { 1888 if (s == t) return 0; 1889 if (s != null) { 1890 if (t != null) 1891 return s.compareTo(t); 1892 else 1893 return +1; 1894 } else { 1895 return -1; 1896 } 1897 } 1898 1899 // US-ASCII only 1900 private static int compareIgnoringCase(String s, String t) { 1901 if (s == t) return 0; 1902 if (s != null) { 1903 if (t != null) { 1904 int sn = s.length(); 1905 int tn = t.length(); 1906 int n = sn < tn ? 
sn : tn; 1907 for (int i = 0; i < n; i++) { 1908 int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1909 if (c != 0) 1910 return c; 1911 } 1912 return sn - tn; 1913 } 1914 return +1; 1915 } else { 1916 return -1; 1917 } 1918 } 1919 1920 1921 // -- String construction -- 1922 1923 // If a scheme is given then the path, if given, must be absolute 1924 // 1925 private static void checkPath(String s, String scheme, String path) 1926 throws URISyntaxException 1927 { 1928 if (scheme != null) { 1929 if ((path != null) 1930 && ((path.length() > 0) && (path.charAt(0) != '/'))) 1931 throw new URISyntaxException(s, 1932 "Relative path in absolute URI"); 1933 } 1934 } 1935 1936 private void appendAuthority(StringBuilder sb, 1937 String authority, 1938 String userInfo, 1939 String host, 1940 int port) 1941 { 1942 if (host != null) { 1943 sb.append("//"); 1944 if (userInfo != null) { 1945 sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); 1946 sb.append('@'); 1947 } 1948 boolean needBrackets = ((host.indexOf(':') >= 0) 1949 && !host.startsWith("[") 1950 && !host.endsWith("]")); 1951 if (needBrackets) sb.append('['); 1952 sb.append(host); 1953 if (needBrackets) sb.append(']'); 1954 if (port != -1) { 1955 sb.append(':'); 1956 sb.append(port); 1957 } 1958 } else if (authority != null) { 1959 sb.append("//"); 1960 if (authority.startsWith("[")) { 1961 // authority should (but may not) contain an embedded IPv6 address 1962 int end = authority.indexOf(']'); 1963 String doquote = authority, dontquote = ""; 1964 if (end != -1 && authority.indexOf(':') != -1) { 1965 // the authority contains an IPv6 address 1966 if (end == authority.length()) { 1967 dontquote = authority; 1968 doquote = ""; 1969 } else { 1970 dontquote = authority.substring(0 , end + 1); 1971 doquote = authority.substring(end + 1); 1972 } 1973 } 1974 sb.append(dontquote); 1975 sb.append(quote(doquote, 1976 L_REG_NAME | L_SERVER, 1977 H_REG_NAME | H_SERVER)); 1978 } else { 1979 sb.append(quote(authority, 1980 
L_REG_NAME | L_SERVER, 1981 H_REG_NAME | H_SERVER)); 1982 } 1983 } 1984 } 1985 1986 private void appendSchemeSpecificPart(StringBuilder sb, 1987 String opaquePart, 1988 String authority, 1989 String userInfo, 1990 String host, 1991 int port, 1992 String path, 1993 String query) 1994 { 1995 if (opaquePart != null) { 1996 /* check if SSP begins with an IPv6 address 1997 * because we must not quote a literal IPv6 address 1998 */ 1999 if (opaquePart.startsWith("//[")) { 2000 int end = opaquePart.indexOf(']'); 2001 if (end != -1 && opaquePart.indexOf(':')!=-1) { 2002 String doquote, dontquote; 2003 if (end == opaquePart.length()) { 2004 dontquote = opaquePart; 2005 doquote = ""; 2006 } else { 2007 dontquote = opaquePart.substring(0,end+1); 2008 doquote = opaquePart.substring(end+1); 2009 } 2010 sb.append (dontquote); 2011 sb.append(quote(doquote, L_URIC, H_URIC)); 2012 } 2013 } else { 2014 sb.append(quote(opaquePart, L_URIC, H_URIC)); 2015 } 2016 } else { 2017 appendAuthority(sb, authority, userInfo, host, port); 2018 if (path != null) 2019 sb.append(quote(path, L_PATH, H_PATH)); 2020 if (query != null) { 2021 sb.append('?'); 2022 sb.append(quote(query, L_URIC, H_URIC)); 2023 } 2024 } 2025 } 2026 2027 private void appendFragment(StringBuilder sb, String fragment) { 2028 if (fragment != null) { 2029 sb.append('#'); 2030 sb.append(quote(fragment, L_URIC, H_URIC)); 2031 } 2032 } 2033 2034 private String toString(String scheme, 2035 String opaquePart, 2036 String authority, 2037 String userInfo, 2038 String host, 2039 int port, 2040 String path, 2041 String query, 2042 String fragment) 2043 { 2044 StringBuilder sb = new StringBuilder(); 2045 if (scheme != null) { 2046 sb.append(scheme); 2047 sb.append(':'); 2048 } 2049 appendSchemeSpecificPart(sb, opaquePart, 2050 authority, userInfo, host, port, 2051 path, query); 2052 appendFragment(sb, fragment); 2053 return sb.toString(); 2054 } 2055 2056 // -- Normalization, resolution, and relativization -- 2057 2058 // RFC2396 5.2 
(6) 2059 private static String resolvePath(String base, String child, 2060 boolean absolute) 2061 { 2062 int i = base.lastIndexOf('/'); 2063 int cn = child.length(); 2064 String path = ""; 2065 2066 if (cn == 0) { 2067 // 5.2 (6a) 2068 if (i >= 0) 2069 path = base.substring(0, i + 1); 2070 } else { 2071 StringBuilder sb = new StringBuilder(base.length() + cn); 2072 // 5.2 (6a) 2073 if (i >= 0) 2074 sb.append(base, 0, i + 1); 2075 // 5.2 (6b) 2076 sb.append(child); 2077 path = sb.toString(); 2078 } 2079 2080 // 5.2 (6c-f) 2081 String np = normalize(path); 2082 2083 // 5.2 (6g): If the result is absolute but the path begins with "../", 2084 // then we simply leave the path as-is 2085 2086 return np; 2087 } 2088 2089 // RFC2396 5.2 2090 private static URI resolve(URI base, URI child) { 2091 // check if child if opaque first so that NPE is thrown 2092 // if child is null. 2093 if (child.isOpaque() || base.isOpaque()) 2094 return child; 2095 2096 // 5.2 (2): Reference to current document (lone fragment) 2097 if ((child.scheme == null) && (child.authority == null) 2098 && child.path.isEmpty() && (child.fragment != null) 2099 && (child.query == null)) { 2100 if ((base.fragment != null) 2101 && child.fragment.equals(base.fragment)) { 2102 return base; 2103 } 2104 URI ru = new URI(); 2105 ru.scheme = base.scheme; 2106 ru.authority = base.authority; 2107 ru.userInfo = base.userInfo; 2108 ru.host = base.host; 2109 ru.port = base.port; 2110 ru.path = base.path; 2111 ru.fragment = child.fragment; 2112 ru.query = base.query; 2113 return ru; 2114 } 2115 2116 // 5.2 (3): Child is absolute 2117 if (child.scheme != null) 2118 return child; 2119 2120 URI ru = new URI(); // Resolved URI 2121 ru.scheme = base.scheme; 2122 ru.query = child.query; 2123 ru.fragment = child.fragment; 2124 2125 // 5.2 (4): Authority 2126 if (child.authority == null) { 2127 ru.authority = base.authority; 2128 ru.host = base.host; 2129 ru.userInfo = base.userInfo; 2130 ru.port = base.port; 2131 2132 String cp 
= (child.path == null) ? "" : child.path; 2133 if ((cp.length() > 0) && (cp.charAt(0) == '/')) { 2134 // 5.2 (5): Child path is absolute 2135 ru.path = child.path; 2136 } else { 2137 // 5.2 (6): Resolve relative path 2138 ru.path = resolvePath(base.path, cp, base.isAbsolute()); 2139 } 2140 } else { 2141 ru.authority = child.authority; 2142 ru.host = child.host; 2143 ru.userInfo = child.userInfo; 2144 ru.host = child.host; 2145 ru.port = child.port; 2146 ru.path = child.path; 2147 } 2148 2149 // 5.2 (7): Recombine (nothing to do here) 2150 return ru; 2151 } 2152 2153 // If the given URI's path is normal then return the URI; 2154 // o.w., return a new URI containing the normalized path. 2155 // 2156 private static URI normalize(URI u) { 2157 if (u.isOpaque() || (u.path == null) || (u.path.length() == 0)) 2158 return u; 2159 2160 String np = normalize(u.path); 2161 if (np == u.path) 2162 return u; 2163 2164 URI v = new URI(); 2165 v.scheme = u.scheme; 2166 v.fragment = u.fragment; 2167 v.authority = u.authority; 2168 v.userInfo = u.userInfo; 2169 v.host = u.host; 2170 v.port = u.port; 2171 v.path = np; 2172 v.query = u.query; 2173 return v; 2174 } 2175 2176 // If both URIs are hierarchical, their scheme and authority components are 2177 // identical, and the base path is a prefix of the child's path, then 2178 // return a relative URI that, when resolved against the base, yields the 2179 // child; otherwise, return the child. 2180 // 2181 private static URI relativize(URI base, URI child) { 2182 // check if child if opaque first so that NPE is thrown 2183 // if child is null. 
2184 if (child.isOpaque() || base.isOpaque()) 2185 return child; 2186 if (!equalIgnoringCase(base.scheme, child.scheme) 2187 || !equal(base.authority, child.authority)) 2188 return child; 2189 2190 String bp = normalize(base.path); 2191 String cp = normalize(child.path); 2192 if (!bp.equals(cp)) { 2193 if (!bp.endsWith("/")) 2194 bp = bp + "/"; 2195 if (!cp.startsWith(bp)) 2196 return child; 2197 } 2198 2199 URI v = new URI(); 2200 v.path = cp.substring(bp.length()); 2201 v.query = child.query; 2202 v.fragment = child.fragment; 2203 return v; 2204 } 2205 2206 2207 2208 // -- Path normalization -- 2209 2210 // The following algorithm for path normalization avoids the creation of a 2211 // string object for each segment, as well as the use of a string buffer to 2212 // compute the final result, by using a single char array and editing it in 2213 // place. The array is first split into segments, replacing each slash 2214 // with '\0' and creating a segment-index array, each element of which is 2215 // the index of the first char in the corresponding segment. We then walk 2216 // through both arrays, removing ".", "..", and other segments as necessary 2217 // by setting their entries in the index array to -1. Finally, the two 2218 // arrays are used to rejoin the segments and compute the final result. 2219 // 2220 // This code is based upon src/solaris/native/java/io/canonicalize_md.c 2221 2222 2223 // Check the given path to see if it might need normalization. A path 2224 // might need normalization if it contains duplicate slashes, a "." 2225 // segment, or a ".." segment. Return -1 if no further normalization is 2226 // possible, otherwise return the number of segments found. 2227 // 2228 // This method takes a string argument rather than a char array so that 2229 // this test can be performed without invoking path.toCharArray(). 
2230 // 2231 private static int needsNormalization(String path) { 2232 boolean normal = true; 2233 int ns = 0; // Number of segments 2234 int end = path.length() - 1; // Index of last char in path 2235 int p = 0; // Index of next char in path 2236 2237 // Skip initial slashes 2238 while (p <= end) { 2239 if (path.charAt(p) != '/') break; 2240 p++; 2241 } 2242 if (p > 1) normal = false; 2243 2244 // Scan segments 2245 while (p <= end) { 2246 2247 // Looking at "." or ".." ? 2248 if ((path.charAt(p) == '.') 2249 && ((p == end) 2250 || ((path.charAt(p + 1) == '/') 2251 || ((path.charAt(p + 1) == '.') 2252 && ((p + 1 == end) 2253 || (path.charAt(p + 2) == '/')))))) { 2254 normal = false; 2255 } 2256 ns++; 2257 2258 // Find beginning of next segment 2259 while (p <= end) { 2260 if (path.charAt(p++) != '/') 2261 continue; 2262 2263 // Skip redundant slashes 2264 while (p <= end) { 2265 if (path.charAt(p) != '/') break; 2266 normal = false; 2267 p++; 2268 } 2269 2270 break; 2271 } 2272 } 2273 2274 return normal ? -1 : ns; 2275 } 2276 2277 2278 // Split the given path into segments, replacing slashes with nulls and 2279 // filling in the given segment-index array. 
2280 // 2281 // Preconditions: 2282 // segs.length == Number of segments in path 2283 // 2284 // Postconditions: 2285 // All slashes in path replaced by '\0' 2286 // segs[i] == Index of first char in segment i (0 <= i < segs.length) 2287 // 2288 private static void split(char[] path, int[] segs) { 2289 int end = path.length - 1; // Index of last char in path 2290 int p = 0; // Index of next char in path 2291 int i = 0; // Index of current segment 2292 2293 // Skip initial slashes 2294 while (p <= end) { 2295 if (path[p] != '/') break; 2296 path[p] = '\0'; 2297 p++; 2298 } 2299 2300 while (p <= end) { 2301 2302 // Note start of segment 2303 segs[i++] = p++; 2304 2305 // Find beginning of next segment 2306 while (p <= end) { 2307 if (path[p++] != '/') 2308 continue; 2309 path[p - 1] = '\0'; 2310 2311 // Skip redundant slashes 2312 while (p <= end) { 2313 if (path[p] != '/') break; 2314 path[p++] = '\0'; 2315 } 2316 break; 2317 } 2318 } 2319 2320 if (i != segs.length) 2321 throw new InternalError(); // ASSERT 2322 } 2323 2324 2325 // Join the segments in the given path according to the given segment-index 2326 // array, ignoring those segments whose index entries have been set to -1, 2327 // and inserting slashes as needed. Return the length of the resulting 2328 // path. 2329 // 2330 // Preconditions: 2331 // segs[i] == -1 implies segment i is to be ignored 2332 // path computed by split, as above, with '\0' having replaced '/' 2333 // 2334 // Postconditions: 2335 // path[0] .. 
//   path[return value] == Resulting path
//
// The surviving segments are compacted towards the front of the same
// array: a segment that is already in place is merely skipped over, one
// that sits further right is copied down to the write position.
private static int join(char[] path, int[] segs) {
    int ns = segs.length;           // Number of segments
    int end = path.length - 1;      // Index of last char in path
    int p = 0;                      // Index of next path char to write

    if (path[p] == '\0') {
        // Restore initial slash for absolute paths
        path[p++] = '/';
    }

    for (int i = 0; i < ns; i++) {
        int q = segs[i];            // Current segment
        if (q == -1)
            // Ignore this segment
            continue;

        if (p == q) {
            // We're already at this segment, so just skip to its end
            while ((p <= end) && (path[p] != '\0'))
                p++;
            if (p <= end) {
                // Preserve trailing slash
                path[p++] = '/';
            }
        } else if (p < q) {
            // Copy q down to p
            while ((q <= end) && (path[q] != '\0'))
                path[p++] = path[q++];
            if (q <= end) {
                // Preserve trailing slash
                path[p++] = '/';
            }
        } else
            // The write position has overtaken the segment start
            throw new InternalError(); // ASSERT false
    }

    return p;
}


// Remove "." segments from the given path, and remove segment pairs
// consisting of a non-".." segment followed by a ".." segment.
//
// Segments are removed by setting their entries in segs to -1; the chars
// in path are only read here, never written.
private static void removeDots(char[] path, int[] segs) {
    int ns = segs.length;
    int end = path.length - 1;

    for (int i = 0; i < ns; i++) {
        int dots = 0;               // Number of dots found (0, 1, or 2)

        // Find next occurrence of "." or ".."
        // (the inner loop advances the outer loop's index i)
        do {
            int p = segs[i];
            if (path[p] == '.') {
                if (p == end) {
                    // "." as the very last char of the path
                    dots = 1;
                    break;
                } else if (path[p + 1] == '\0') {
                    // "." followed by a separator
                    dots = 1;
                    break;
                } else if ((path[p + 1] == '.')
                           && ((p + 1 == end)
                               || (path[p + 2] == '\0'))) {
                    // ".." at the end of the path or before a separator
                    dots = 2;
                    break;
                }
            }
            i++;
        } while (i < ns);
        // No further dot segment was found: we are done
        if ((i > ns) || (dots == 0))
            break;

        if (dots == 1) {
            // Remove this occurrence of "."
            segs[i] = -1;
        } else {
            // If there is a preceding non-".." segment, remove both that
            // segment and this occurrence of ".."; otherwise, leave this
            // ".." segment as-is.
            int j;
            // Walk back to the nearest segment that has not been removed
            for (j = i - 1; j >= 0; j--) {
                if (segs[j] != -1) break;
            }
            if (j >= 0) {
                int q = segs[j];
                if (!((path[q] == '.')
                      && (path[q + 1] == '.')
                      && (path[q + 2] == '\0'))) {
                    segs[i] = -1;
                    segs[j] = -1;
                }
            }
        }
    }
}


// DEVIATION: If the normalized path is relative, and if the first
// segment could be parsed as a scheme name, then prepend a "." segment
//
private static void maybeAddLeadingDot(char[] path, int[] segs) {

    if (path[0] == '\0')
        // The path is absolute
        return;

    int ns = segs.length;
    int f = 0;                      // Index of first segment
    // Find the first segment that has not been removed
    while (f < ns) {
        if (segs[f] >= 0)
            break;
        f++;
    }
    if ((f >= ns) || (f == 0))
        // The path is empty, or else the original first segment survived,
        // in which case we already know that no leading "." is needed
        return;

    // Look for a colon within the first surviving segment
    int p = segs[f];
    while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
    if (p >= path.length || path[p] == '\0')
        // No colon in first segment, so no "." needed
        return;

    // At this point we know that the first segment is unused,
    // hence we can insert a "." segment at that position
    path[0] = '.';
    path[1] = '\0';
    segs[0] = 0;
}


// Normalize the given path string.  A normal path string has no empty
// segments (i.e., occurrences of "//"), no segments equal to ".", and no
// segments equal to ".." that are preceded by a segment not equal to "..".
// In contrast to Unix-style pathname normalization, for URI paths we
// always retain trailing slashes.
//
// The argument itself (the same object) is returned whenever the path
// turns out to be normal already.
private static String normalize(String ps) {

    // Does this path need normalization?
    int ns = needsNormalization(ps);        // Number of segments
    if (ns < 0)
        // Nope -- just return it
        return ps;

    char[] path = ps.toCharArray();         // Path in char-array form

    // Split path into segments
    int[] segs = new int[ns];               // Segment-index array
    split(path, segs);

    // Remove dots
    removeDots(path, segs);

    // Prevent scheme-name confusion
    maybeAddLeadingDot(path, segs);

    // Join the remaining segments and return the result
    String s = new String(path, 0, join(path, segs));
    if (s.equals(ps)) {
        // string was already normalized
        return ps;
    }
    return s;
}



// -- Character classes for parsing --

// RFC2396 precisely specifies which characters in the US-ASCII charset are
// permissible in the various components of a URI reference.  We here
// define a set of mask pairs to aid in enforcing these restrictions.  Each
// mask pair consists of two longs, a low mask and a high mask.  Taken
// together they represent a 128-bit mask, where bit i is set iff the
// character with value i is permitted.
//
// This approach is more efficient than sequentially searching arrays of
// permitted characters.  It could be made still more efficient by
// precompiling the mask information so that a character's presence in a
// given mask could be determined by a single table lookup.
2519 2520 // Compute the low-order mask for the characters in the given string 2521 private static long lowMask(String chars) { 2522 int n = chars.length(); 2523 long m = 0; 2524 for (int i = 0; i < n; i++) { 2525 char c = chars.charAt(i); 2526 if (c < 64) 2527 m |= (1L << c); 2528 } 2529 return m; 2530 } 2531 2532 // Compute the high-order mask for the characters in the given string 2533 private static long highMask(String chars) { 2534 int n = chars.length(); 2535 long m = 0; 2536 for (int i = 0; i < n; i++) { 2537 char c = chars.charAt(i); 2538 if ((c >= 64) && (c < 128)) 2539 m |= (1L << (c - 64)); 2540 } 2541 return m; 2542 } 2543 2544 // Compute a low-order mask for the characters 2545 // between first and last, inclusive 2546 private static long lowMask(char first, char last) { 2547 long m = 0; 2548 int f = Math.max(Math.min(first, 63), 0); 2549 int l = Math.max(Math.min(last, 63), 0); 2550 for (int i = f; i <= l; i++) 2551 m |= 1L << i; 2552 return m; 2553 } 2554 2555 // Compute a high-order mask for the characters 2556 // between first and last, inclusive 2557 private static long highMask(char first, char last) { 2558 long m = 0; 2559 int f = Math.max(Math.min(first, 127), 64) - 64; 2560 int l = Math.max(Math.min(last, 127), 64) - 64; 2561 for (int i = f; i <= l; i++) 2562 m |= 1L << i; 2563 return m; 2564 } 2565 2566 // Tell whether the given character is permitted by the given mask pair 2567 private static boolean match(char c, long lowMask, long highMask) { 2568 if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. 2569 return false; 2570 if (c < 64) 2571 return ((1L << c) & lowMask) != 0; 2572 if (c < 128) 2573 return ((1L << (c - 64)) & highMask) != 0; 2574 return false; 2575 } 2576 2577 // Character-class masks, in reverse order from RFC2396 because 2578 // initializers for static fields cannot make forward references. 

// Each character class below is a pair of constants: L_xxx holds the
// low-order mask and H_xxx the high-order mask of the class.  A class is
// defined only in terms of classes declared before it.

// digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
//            "8" | "9"
private static final long L_DIGIT = lowMask('0', '9');
private static final long H_DIGIT = 0L;

// upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
//            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
//            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
private static final long L_UPALPHA = 0L;
private static final long H_UPALPHA = highMask('A', 'Z');

// lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
//            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
//            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
private static final long L_LOWALPHA = 0L;
private static final long H_LOWALPHA = highMask('a', 'z');

// alpha         = lowalpha | upalpha
private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;

// alphanum      = alpha | digit
private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;

// hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
//                         "a" | "b" | "c" | "d" | "e" | "f"
private static final long L_HEX = L_DIGIT;
private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');

// mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
//                 "(" | ")"
private static final long L_MARK = lowMask("-_.!~*'()");
private static final long H_MARK = highMask("-_.!~*'()");

// unreserved    = alphanum | mark
private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;

// reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
//                 "$" | "," | "[" | "]"
// Added per RFC2732: "[", "]"
private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
private static final long H_RESERVED = highMask(";/?:@&=+$,[]");

// The zero'th bit is used to indicate that escape pairs and non-US-ASCII
// characters are allowed; this is handled by the scanEscape method below.
private static final long L_ESCAPED = 1L;
private static final long H_ESCAPED = 0L;

// uric          = reserved | unreserved | escaped
private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;

// pchar         = unreserved | escaped |
//                 ":" | "@" | "&" | "=" | "+" | "$" | ","
private static final long L_PCHAR
    = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
private static final long H_PCHAR
    = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");

// All valid path characters
private static final long L_PATH = L_PCHAR | lowMask(";/");
private static final long H_PATH = H_PCHAR | highMask(";/");

// Dash, for use in domainlabel and toplabel
private static final long L_DASH = lowMask("-");
private static final long H_DASH = highMask("-");

// Dot, for use in hostnames
private static final long L_DOT = lowMask(".");
private static final long H_DOT = highMask(".");

// userinfo      = *( unreserved | escaped |
//                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
private static final long L_USERINFO
    = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
private static final long H_USERINFO
    = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");

// reg_name      = 1*( unreserved | escaped | "$" | "," |
//                     ";" | ":" | "@" | "&" | "=" | "+" )
private static final long L_REG_NAME
    = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
private static final long H_REG_NAME
    = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");

// All valid characters for server-based authorities
private static final long L_SERVER
    = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
private static final long H_SERVER
    = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");

// Special case of server authority that represents an IPv6 address
// In this case, a % does not signify an escape sequence
private static final long L_SERVER_PERCENT
    = L_SERVER | lowMask("%");
private static final long H_SERVER_PERCENT
    = H_SERVER | highMask("%");
private static final long L_LEFT_BRACKET = lowMask("[");
private static final long H_LEFT_BRACKET = highMask("[");

// scheme        = alpha *( alpha | digit | "+" | "-" | "." )
private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");

// scope_id = alpha | digit | "_" | "."
2687 private static final long L_SCOPE_ID 2688 = L_ALPHANUM | lowMask("_."); 2689 private static final long H_SCOPE_ID 2690 = H_ALPHANUM | highMask("_."); 2691 2692 // -- Escaping and encoding -- 2693 2694 private static final char[] hexDigits = { 2695 '0', '1', '2', '3', '4', '5', '6', '7', 2696 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 2697 }; 2698 2699 private static void appendEscape(StringBuilder sb, byte b) { 2700 sb.append('%'); 2701 sb.append(hexDigits[(b >> 4) & 0x0f]); 2702 sb.append(hexDigits[(b >> 0) & 0x0f]); 2703 } 2704 2705 private static void appendEncoded(StringBuilder sb, char c) { 2706 ByteBuffer bb = null; 2707 try { 2708 bb = ThreadLocalCoders.encoderFor("UTF-8") 2709 .encode(CharBuffer.wrap("" + c)); 2710 } catch (CharacterCodingException x) { 2711 assert false; 2712 } 2713 while (bb.hasRemaining()) { 2714 int b = bb.get() & 0xff; 2715 if (b >= 0x80) 2716 appendEscape(sb, (byte)b); 2717 else 2718 sb.append((char)b); 2719 } 2720 } 2721 2722 // Quote any characters in s that are not permitted 2723 // by the given mask pair 2724 // 2725 private static String quote(String s, long lowMask, long highMask) { 2726 StringBuilder sb = null; 2727 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); 2728 for (int i = 0; i < s.length(); i++) { 2729 char c = s.charAt(i); 2730 if (c < '\u0080') { 2731 if (!match(c, lowMask, highMask)) { 2732 if (sb == null) { 2733 sb = new StringBuilder(); 2734 sb.append(s, 0, i); 2735 } 2736 appendEscape(sb, (byte)c); 2737 } else { 2738 if (sb != null) 2739 sb.append(c); 2740 } 2741 } else if (allowNonASCII 2742 && (Character.isSpaceChar(c) 2743 || Character.isISOControl(c))) { 2744 if (sb == null) { 2745 sb = new StringBuilder(); 2746 sb.append(s, 0, i); 2747 } 2748 appendEncoded(sb, c); 2749 } else { 2750 if (sb != null) 2751 sb.append(c); 2752 } 2753 } 2754 return (sb == null) ? 
s : sb.toString(); 2755 } 2756 2757 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, 2758 // assuming that s is otherwise legal 2759 // 2760 private static String encode(String s) { 2761 int n = s.length(); 2762 if (n == 0) 2763 return s; 2764 2765 // First check whether we actually need to encode 2766 for (int i = 0;;) { 2767 if (s.charAt(i) >= '\u0080') 2768 break; 2769 if (++i >= n) 2770 return s; 2771 } 2772 2773 String ns = Normalizer.normalize(s, Normalizer.Form.NFC); 2774 ByteBuffer bb = null; 2775 try { 2776 bb = ThreadLocalCoders.encoderFor("UTF-8") 2777 .encode(CharBuffer.wrap(ns)); 2778 } catch (CharacterCodingException x) { 2779 assert false; 2780 } 2781 2782 StringBuilder sb = new StringBuilder(); 2783 while (bb.hasRemaining()) { 2784 int b = bb.get() & 0xff; 2785 if (b >= 0x80) 2786 appendEscape(sb, (byte)b); 2787 else 2788 sb.append((char)b); 2789 } 2790 return sb.toString(); 2791 } 2792 2793 private static int decode(char c) { 2794 if ((c >= '0') && (c <= '9')) 2795 return c - '0'; 2796 if ((c >= 'a') && (c <= 'f')) 2797 return c - 'a' + 10; 2798 if ((c >= 'A') && (c <= 'F')) 2799 return c - 'A' + 10; 2800 assert false; 2801 return -1; 2802 } 2803 2804 private static byte decode(char c1, char c2) { 2805 return (byte)( ((decode(c1) & 0xf) << 4) 2806 | ((decode(c2) & 0xf) << 0)); 2807 } 2808 2809 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes 2810 // that escapes are well-formed syntactically, i.e., of the form %XX. If a 2811 // sequence of escaped octets is not valid UTF-8 then the erroneous octets 2812 // are replaced with '\uFFFD'. 2813 // Exception: any "%" found between "[]" is left alone. 
It is an IPv6 literal 2814 // with a scope_id 2815 // 2816 private static String decode(String s) { 2817 return decode(s, true); 2818 } 2819 2820 // This method was introduced as a generalization of URI.decode method 2821 // to provide a fix for JDK-8037396 2822 private static String decode(String s, boolean ignorePercentInBrackets) { 2823 if (s == null) 2824 return s; 2825 int n = s.length(); 2826 if (n == 0) 2827 return s; 2828 if (s.indexOf('%') < 0) 2829 return s; 2830 2831 StringBuilder sb = new StringBuilder(n); 2832 ByteBuffer bb = ByteBuffer.allocate(n); 2833 CharBuffer cb = CharBuffer.allocate(n); 2834 CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8") 2835 .onMalformedInput(CodingErrorAction.REPLACE) 2836 .onUnmappableCharacter(CodingErrorAction.REPLACE); 2837 2838 // This is not horribly efficient, but it will do for now 2839 char c = s.charAt(0); 2840 boolean betweenBrackets = false; 2841 2842 for (int i = 0; i < n;) { 2843 assert c == s.charAt(i); // Loop invariant 2844 if (c == '[') { 2845 betweenBrackets = true; 2846 } else if (betweenBrackets && c == ']') { 2847 betweenBrackets = false; 2848 } 2849 if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) { 2850 sb.append(c); 2851 if (++i >= n) 2852 break; 2853 c = s.charAt(i); 2854 continue; 2855 } 2856 bb.clear(); 2857 int ui = i; 2858 for (;;) { 2859 assert (n - i >= 2); 2860 bb.put(decode(s.charAt(++i), s.charAt(++i))); 2861 if (++i >= n) 2862 break; 2863 c = s.charAt(i); 2864 if (c != '%') 2865 break; 2866 } 2867 bb.flip(); 2868 cb.clear(); 2869 dec.reset(); 2870 CoderResult cr = dec.decode(bb, cb, true); 2871 assert cr.isUnderflow(); 2872 cr = dec.flush(cb); 2873 assert cr.isUnderflow(); 2874 sb.append(cb.flip().toString()); 2875 } 2876 2877 return sb.toString(); 2878 } 2879 2880 2881 // -- Parsing -- 2882 2883 // For convenience we wrap the input URI string in a new instance of the 2884 // following internal class. 
This saves always having to pass the input 2885 // string as an argument to each internal scan/parse method. 2886 2887 private class Parser { 2888 2889 private String input; // URI input string 2890 private boolean requireServerAuthority = false; 2891 2892 Parser(String s) { 2893 input = s; 2894 string = s; 2895 } 2896 2897 // -- Methods for throwing URISyntaxException in various ways -- 2898 2899 private void fail(String reason) throws URISyntaxException { 2900 throw new URISyntaxException(input, reason); 2901 } 2902 2903 private void fail(String reason, int p) throws URISyntaxException { 2904 throw new URISyntaxException(input, reason, p); 2905 } 2906 2907 private void failExpecting(String expected, int p) 2908 throws URISyntaxException 2909 { 2910 fail("Expected " + expected, p); 2911 } 2912 2913 2914 // -- Simple access to the input string -- 2915 2916 // Tells whether start < end and, if so, whether charAt(start) == c 2917 // 2918 private boolean at(int start, int end, char c) { 2919 return (start < end) && (input.charAt(start) == c); 2920 } 2921 2922 // Tells whether start + s.length() < end and, if so, 2923 // whether the chars at the start position match s exactly 2924 // 2925 private boolean at(int start, int end, String s) { 2926 int p = start; 2927 int sn = s.length(); 2928 if (sn > end - p) 2929 return false; 2930 int i = 0; 2931 while (i < sn) { 2932 if (input.charAt(p++) != s.charAt(i)) { 2933 break; 2934 } 2935 i++; 2936 } 2937 return (i == sn); 2938 } 2939 2940 2941 // -- Scanning -- 2942 2943 // The various scan and parse methods that follow use a uniform 2944 // convention of taking the current start position and end index as 2945 // their first two arguments. The start is inclusive while the end is 2946 // exclusive, just as in the String class, i.e., a start/end pair 2947 // denotes the left-open interval [start, end) of the input string. 2948 // 2949 // These methods never proceed past the end position. 
They may return 2950 // -1 to indicate outright failure, but more often they simply return 2951 // the position of the first char after the last char scanned. Thus 2952 // a typical idiom is 2953 // 2954 // int p = start; 2955 // int q = scan(p, end, ...); 2956 // if (q > p) 2957 // // We scanned something 2958 // ...; 2959 // else if (q == p) 2960 // // We scanned nothing 2961 // ...; 2962 // else if (q == -1) 2963 // // Something went wrong 2964 // ...; 2965 2966 2967 // Scan a specific char: If the char at the given start position is 2968 // equal to c, return the index of the next char; otherwise, return the 2969 // start position. 2970 // 2971 private int scan(int start, int end, char c) { 2972 if ((start < end) && (input.charAt(start) == c)) 2973 return start + 1; 2974 return start; 2975 } 2976 2977 // Scan forward from the given start position. Stop at the first char 2978 // in the err string (in which case -1 is returned), or the first char 2979 // in the stop string (in which case the index of the preceding char is 2980 // returned), or the end of the input string (in which case the length 2981 // of the input string is returned). May return the start position if 2982 // nothing matches. 2983 // 2984 private int scan(int start, int end, String err, String stop) { 2985 int p = start; 2986 while (p < end) { 2987 char c = input.charAt(p); 2988 if (err.indexOf(c) >= 0) 2989 return -1; 2990 if (stop.indexOf(c) >= 0) 2991 break; 2992 p++; 2993 } 2994 return p; 2995 } 2996 2997 // Scan forward from the given start position. Stop at the first char 2998 // in the stop string (in which case the index of the preceding char is 2999 // returned), or the end of the input string (in which case the length 3000 // of the input string is returned). May return the start position if 3001 // nothing matches. 
3002 // 3003 private int scan(int start, int end, String stop) { 3004 int p = start; 3005 while (p < end) { 3006 char c = input.charAt(p); 3007 if (stop.indexOf(c) >= 0) 3008 break; 3009 p++; 3010 } 3011 return p; 3012 } 3013 3014 // Scan a potential escape sequence, starting at the given position, 3015 // with the given first char (i.e., charAt(start) == c). 3016 // 3017 // This method assumes that if escapes are allowed then visible 3018 // non-US-ASCII chars are also allowed. 3019 // 3020 private int scanEscape(int start, int n, char first) 3021 throws URISyntaxException 3022 { 3023 int p = start; 3024 char c = first; 3025 if (c == '%') { 3026 // Process escape pair 3027 if ((p + 3 <= n) 3028 && match(input.charAt(p + 1), L_HEX, H_HEX) 3029 && match(input.charAt(p + 2), L_HEX, H_HEX)) { 3030 return p + 3; 3031 } 3032 fail("Malformed escape pair", p); 3033 } else if ((c > 128) 3034 && !Character.isSpaceChar(c) 3035 && !Character.isISOControl(c)) { 3036 // Allow unescaped but visible non-US-ASCII chars 3037 return p + 1; 3038 } 3039 return p; 3040 } 3041 3042 // Scan chars that match the given mask pair 3043 // 3044 private int scan(int start, int n, long lowMask, long highMask) 3045 throws URISyntaxException 3046 { 3047 int p = start; 3048 while (p < n) { 3049 char c = input.charAt(p); 3050 if (match(c, lowMask, highMask)) { 3051 p++; 3052 continue; 3053 } 3054 if ((lowMask & L_ESCAPED) != 0) { 3055 int q = scanEscape(p, n, c); 3056 if (q > p) { 3057 p = q; 3058 continue; 3059 } 3060 } 3061 break; 3062 } 3063 return p; 3064 } 3065 3066 // Check that each of the chars in [start, end) matches the given mask 3067 // 3068 private void checkChars(int start, int end, 3069 long lowMask, long highMask, 3070 String what) 3071 throws URISyntaxException 3072 { 3073 int p = scan(start, end, lowMask, highMask); 3074 if (p < end) 3075 fail("Illegal character in " + what, p); 3076 } 3077 3078 // Check that the char at position p matches the given mask 3079 // 3080 private void 
checkChar(int p, 3081 long lowMask, long highMask, 3082 String what) 3083 throws URISyntaxException 3084 { 3085 checkChars(p, p + 1, lowMask, highMask, what); 3086 } 3087 3088 3089 // -- Parsing -- 3090 3091 // [<scheme>:]<scheme-specific-part>[#<fragment>] 3092 // 3093 void parse(boolean rsa) throws URISyntaxException { 3094 requireServerAuthority = rsa; 3095 int n = input.length(); 3096 int p = scan(0, n, "/?#", ":"); 3097 if ((p >= 0) && at(p, n, ':')) { 3098 if (p == 0) 3099 failExpecting("scheme name", 0); 3100 checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); 3101 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); 3102 scheme = input.substring(0, p); 3103 p++; // Skip ':' 3104 if (at(p, n, '/')) { 3105 p = parseHierarchical(p, n); 3106 } else { 3107 // opaque; need to create the schemeSpecificPart 3108 int q = scan(p, n, "#"); 3109 if (q <= p) 3110 failExpecting("scheme-specific part", p); 3111 checkChars(p, q, L_URIC, H_URIC, "opaque part"); 3112 schemeSpecificPart = input.substring(p, q); 3113 p = q; 3114 } 3115 } else { 3116 p = parseHierarchical(0, n); 3117 } 3118 if (at(p, n, '#')) { 3119 checkChars(p + 1, n, L_URIC, H_URIC, "fragment"); 3120 fragment = input.substring(p + 1, n); 3121 p = n; 3122 } 3123 if (p < n) 3124 fail("end of URI", p); 3125 } 3126 3127 // [//authority]<path>[?<query>] 3128 // 3129 // DEVIATION from RFC2396: We allow an empty authority component as 3130 // long as it's followed by a non-empty path, query component, or 3131 // fragment component. This is so that URIs such as "file:///foo/bar" 3132 // will parse. This seems to be the intent of RFC2396, though the 3133 // grammar does not permit it. If the authority is empty then the 3134 // userInfo, host, and port components are undefined. 3135 // 3136 // DEVIATION from RFC2396: We allow empty relative paths. This seems 3137 // to be the intent of RFC2396, but the grammar does not permit it. 
3138 // The primary consequence of this deviation is that "#f" parses as a 3139 // relative URI with an empty path. 3140 // 3141 private int parseHierarchical(int start, int n) 3142 throws URISyntaxException 3143 { 3144 int p = start; 3145 if (at(p, n, '/') && at(p + 1, n, '/')) { 3146 p += 2; 3147 int q = scan(p, n, "/?#"); 3148 if (q > p) { 3149 p = parseAuthority(p, q); 3150 } else if (q < n) { 3151 // DEVIATION: Allow empty authority prior to non-empty 3152 // path, query component or fragment identifier 3153 } else 3154 failExpecting("authority", p); 3155 } 3156 int q = scan(p, n, "?#"); // DEVIATION: May be empty 3157 checkChars(p, q, L_PATH, H_PATH, "path"); 3158 path = input.substring(p, q); 3159 p = q; 3160 if (at(p, n, '?')) { 3161 p++; 3162 q = scan(p, n, "#"); 3163 checkChars(p, q, L_URIC, H_URIC, "query"); 3164 query = input.substring(p, q); 3165 p = q; 3166 } 3167 return p; 3168 } 3169 3170 // authority = server | reg_name 3171 // 3172 // Ambiguity: An authority that is a registry name rather than a server 3173 // might have a prefix that parses as a server. We use the fact that 3174 // the authority component is always followed by '/' or the end of the 3175 // input string to resolve this: If the complete authority did not 3176 // parse as a server then we try to parse it as a registry name. 
3177 // 3178 private int parseAuthority(int start, int n) 3179 throws URISyntaxException 3180 { 3181 int p = start; 3182 int q = p; 3183 URISyntaxException ex = null; 3184 3185 boolean serverChars; 3186 boolean regChars; 3187 3188 if (scan(p, n, "]") > p) { 3189 // contains a literal IPv6 address, therefore % is allowed 3190 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n); 3191 } else { 3192 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n); 3193 } 3194 regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n); 3195 3196 if (regChars && !serverChars) { 3197 // Must be a registry-based authority 3198 authority = input.substring(p, n); 3199 return n; 3200 } 3201 3202 if (serverChars) { 3203 // Might be (probably is) a server-based authority, so attempt 3204 // to parse it as such. If the attempt fails, try to treat it 3205 // as a registry-based authority. 3206 try { 3207 q = parseServer(p, n); 3208 if (q < n) 3209 failExpecting("end of authority", q); 3210 authority = input.substring(p, n); 3211 } catch (URISyntaxException x) { 3212 // Undo results of failed parse 3213 userInfo = null; 3214 host = null; 3215 port = -1; 3216 if (requireServerAuthority) { 3217 // If we're insisting upon a server-based authority, 3218 // then just re-throw the exception 3219 throw x; 3220 } else { 3221 // Save the exception in case it doesn't parse as a 3222 // registry either 3223 ex = x; 3224 q = p; 3225 } 3226 } 3227 } 3228 3229 if (q < n) { 3230 if (regChars) { 3231 // Registry-based authority 3232 authority = input.substring(p, n); 3233 } else if (ex != null) { 3234 // Re-throw exception; it was probably due to 3235 // a malformed IPv6 address 3236 throw ex; 3237 } else { 3238 fail("Illegal character in authority", q); 3239 } 3240 } 3241 3242 return n; 3243 } 3244 3245 3246 // [<userinfo>@]<host>[:<port>] 3247 // 3248 private int parseServer(int start, int n) 3249 throws URISyntaxException 3250 { 3251 int p = start; 3252 int q; 3253 3254 // userinfo 3255 q = 
scan(p, n, "/?#", "@"); 3256 if ((q >= p) && at(q, n, '@')) { 3257 checkChars(p, q, L_USERINFO, H_USERINFO, "user info"); 3258 userInfo = input.substring(p, q); 3259 p = q + 1; // Skip '@' 3260 } 3261 3262 // hostname, IPv4 address, or IPv6 address 3263 if (at(p, n, '[')) { 3264 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732 3265 p++; 3266 q = scan(p, n, "/?#", "]"); 3267 if ((q > p) && at(q, n, ']')) { 3268 // look for a "%" scope id 3269 int r = scan (p, q, "%"); 3270 if (r > p) { 3271 parseIPv6Reference(p, r); 3272 if (r+1 == q) { 3273 fail ("scope id expected"); 3274 } 3275 checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID, 3276 "scope id"); 3277 } else { 3278 parseIPv6Reference(p, q); 3279 } 3280 host = input.substring(p-1, q+1); 3281 p = q + 1; 3282 } else { 3283 failExpecting("closing bracket for IPv6 address", q); 3284 } 3285 } else { 3286 q = parseIPv4Address(p, n); 3287 if (q <= p) 3288 q = parseHostname(p, n); 3289 p = q; 3290 } 3291 3292 // port 3293 if (at(p, n, ':')) { 3294 p++; 3295 q = scan(p, n, "/"); 3296 if (q > p) { 3297 checkChars(p, q, L_DIGIT, H_DIGIT, "port number"); 3298 try { 3299 port = Integer.parseInt(input, p, q, 10); 3300 } catch (NumberFormatException x) { 3301 fail("Malformed port number", p); 3302 } 3303 p = q; 3304 } 3305 } 3306 if (p < n) 3307 failExpecting("port number", p); 3308 3309 return p; 3310 } 3311 3312 // Scan a string of decimal digits whose value fits in a byte 3313 // 3314 private int scanByte(int start, int n) 3315 throws URISyntaxException 3316 { 3317 int p = start; 3318 int q = scan(p, n, L_DIGIT, H_DIGIT); 3319 if (q <= p) return q; 3320 if (Integer.parseInt(input, p, q, 10) > 255) return p; 3321 return q; 3322 } 3323 3324 // Scan an IPv4 address. 3325 // 3326 // If the strict argument is true then we require that the given 3327 // interval contain nothing besides an IPv4 address; if it is false 3328 // then we only require that it start with an IPv4 address. 
3329 // 3330 // If the interval does not contain or start with (depending upon the 3331 // strict argument) a legal IPv4 address characters then we return -1 3332 // immediately; otherwise we insist that these characters parse as a 3333 // legal IPv4 address and throw an exception on failure. 3334 // 3335 // We assume that any string of decimal digits and dots must be an IPv4 3336 // address. It won't parse as a hostname anyway, so making that 3337 // assumption here allows more meaningful exceptions to be thrown. 3338 // 3339 private int scanIPv4Address(int start, int n, boolean strict) 3340 throws URISyntaxException 3341 { 3342 int p = start; 3343 int q; 3344 int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT); 3345 if ((m <= p) || (strict && (m != n))) 3346 return -1; 3347 for (;;) { 3348 // Per RFC2732: At most three digits per byte 3349 // Further constraint: Each element fits in a byte 3350 if ((q = scanByte(p, m)) <= p) break; p = q; 3351 if ((q = scan(p, m, '.')) <= p) break; p = q; 3352 if ((q = scanByte(p, m)) <= p) break; p = q; 3353 if ((q = scan(p, m, '.')) <= p) break; p = q; 3354 if ((q = scanByte(p, m)) <= p) break; p = q; 3355 if ((q = scan(p, m, '.')) <= p) break; p = q; 3356 if ((q = scanByte(p, m)) <= p) break; p = q; 3357 if (q < m) break; 3358 return q; 3359 } 3360 fail("Malformed IPv4 address", q); 3361 return -1; 3362 } 3363 3364 // Take an IPv4 address: Throw an exception if the given interval 3365 // contains anything except an IPv4 address 3366 // 3367 private int takeIPv4Address(int start, int n, String expected) 3368 throws URISyntaxException 3369 { 3370 int p = scanIPv4Address(start, n, true); 3371 if (p <= start) 3372 failExpecting(expected, start); 3373 return p; 3374 } 3375 3376 // Attempt to parse an IPv4 address, returning -1 on failure but 3377 // allowing the given interval to contain [:<characters>] after 3378 // the IPv4 address. 
3379 // 3380 private int parseIPv4Address(int start, int n) { 3381 int p; 3382 3383 try { 3384 p = scanIPv4Address(start, n, false); 3385 } catch (URISyntaxException x) { 3386 return -1; 3387 } catch (NumberFormatException nfe) { 3388 return -1; 3389 } 3390 3391 if (p > start && p < n) { 3392 // IPv4 address is followed by something - check that 3393 // it's a ":" as this is the only valid character to 3394 // follow an address. 3395 if (input.charAt(p) != ':') { 3396 p = -1; 3397 } 3398 } 3399 3400 if (p > start) 3401 host = input.substring(start, p); 3402 3403 return p; 3404 } 3405 3406 // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ] 3407 // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 3408 // toplabel = alpha | alpha *( alphanum | "-" ) alphanum 3409 // 3410 private int parseHostname(int start, int n) 3411 throws URISyntaxException 3412 { 3413 int p = start; 3414 int q; 3415 int l = -1; // Start of last parsed label 3416 3417 do { 3418 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ] 3419 q = scan(p, n, L_ALPHANUM, H_ALPHANUM); 3420 if (q <= p) 3421 break; 3422 l = p; 3423 if (q > p) { 3424 p = q; 3425 q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH); 3426 if (q > p) { 3427 if (input.charAt(q - 1) == '-') 3428 fail("Illegal character in hostname", q - 1); 3429 p = q; 3430 } 3431 } 3432 q = scan(p, n, '.'); 3433 if (q <= p) 3434 break; 3435 p = q; 3436 } while (p < n); 3437 3438 if ((p < n) && !at(p, n, ':')) 3439 fail("Illegal character in hostname", p); 3440 3441 if (l < 0) 3442 failExpecting("hostname", start); 3443 3444 // for a fully qualified hostname check that the rightmost 3445 // label starts with an alpha character. 
3446 if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) { 3447 fail("Illegal character in hostname", l); 3448 } 3449 3450 host = input.substring(start, p); 3451 return p; 3452 } 3453 3454 3455 // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture 3456 // 3457 // Bug: The grammar in RFC2373 Appendix B does not allow addresses of 3458 // the form ::12.34.56.78, which are clearly shown in the examples 3459 // earlier in the document. Here is the original grammar: 3460 // 3461 // IPv6address = hexpart [ ":" IPv4address ] 3462 // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] 3463 // hexseq = hex4 *( ":" hex4) 3464 // hex4 = 1*4HEXDIG 3465 // 3466 // We therefore use the following revised grammar: 3467 // 3468 // IPv6address = hexseq [ ":" IPv4address ] 3469 // | hexseq [ "::" [ hexpost ] ] 3470 // | "::" [ hexpost ] 3471 // hexpost = hexseq | hexseq ":" IPv4address | IPv4address 3472 // hexseq = hex4 *( ":" hex4) 3473 // hex4 = 1*4HEXDIG 3474 // 3475 // This covers all and only the following cases: 3476 // 3477 // hexseq 3478 // hexseq : IPv4address 3479 // hexseq :: 3480 // hexseq :: hexseq 3481 // hexseq :: hexseq : IPv4address 3482 // hexseq :: IPv4address 3483 // :: hexseq 3484 // :: hexseq : IPv4address 3485 // :: IPv4address 3486 // :: 3487 // 3488 // Additionally we constrain the IPv6 address as follows :- 3489 // 3490 // i. IPv6 addresses without compressed zeros should contain 3491 // exactly 16 bytes. 3492 // 3493 // ii. IPv6 addresses with compressed zeros should contain 3494 // less than 16 bytes. 
3495 3496 private int ipv6byteCount = 0; 3497 3498 private int parseIPv6Reference(int start, int n) 3499 throws URISyntaxException 3500 { 3501 int p = start; 3502 int q; 3503 boolean compressedZeros = false; 3504 3505 q = scanHexSeq(p, n); 3506 3507 if (q > p) { 3508 p = q; 3509 if (at(p, n, "::")) { 3510 compressedZeros = true; 3511 p = scanHexPost(p + 2, n); 3512 } else if (at(p, n, ':')) { 3513 p = takeIPv4Address(p + 1, n, "IPv4 address"); 3514 ipv6byteCount += 4; 3515 } 3516 } else if (at(p, n, "::")) { 3517 compressedZeros = true; 3518 p = scanHexPost(p + 2, n); 3519 } 3520 if (p < n) 3521 fail("Malformed IPv6 address", start); 3522 if (ipv6byteCount > 16) 3523 fail("IPv6 address too long", start); 3524 if (!compressedZeros && ipv6byteCount < 16) 3525 fail("IPv6 address too short", start); 3526 if (compressedZeros && ipv6byteCount == 16) 3527 fail("Malformed IPv6 address", start); 3528 3529 return p; 3530 } 3531 3532 private int scanHexPost(int start, int n) 3533 throws URISyntaxException 3534 { 3535 int p = start; 3536 int q; 3537 3538 if (p == n) 3539 return p; 3540 3541 q = scanHexSeq(p, n); 3542 if (q > p) { 3543 p = q; 3544 if (at(p, n, ':')) { 3545 p++; 3546 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3547 ipv6byteCount += 4; 3548 } 3549 } else { 3550 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3551 ipv6byteCount += 4; 3552 } 3553 return p; 3554 } 3555 3556 // Scan a hex sequence; return -1 if one could not be scanned 3557 // 3558 private int scanHexSeq(int start, int n) 3559 throws URISyntaxException 3560 { 3561 int p = start; 3562 int q; 3563 3564 q = scan(p, n, L_HEX, H_HEX); 3565 if (q <= p) 3566 return -1; 3567 if (at(q, n, '.')) // Beginning of IPv4 address 3568 return -1; 3569 if (q > p + 4) 3570 fail("IPv6 hexadecimal digit sequence too long", p); 3571 ipv6byteCount += 2; 3572 p = q; 3573 while (p < n) { 3574 if (!at(p, n, ':')) 3575 break; 3576 if (at(p + 1, n, ':')) 3577 break; // "::" 3578 p++; 3579 q = scan(p, n, 
L_HEX, H_HEX); 3580 if (q <= p) 3581 failExpecting("digits for an IPv6 address", p); 3582 if (at(q, n, '.')) { // Beginning of IPv4 address 3583 p--; 3584 break; 3585 } 3586 if (q > p + 4) 3587 fail("IPv6 hexadecimal digit sequence too long", p); 3588 ipv6byteCount += 2; 3589 p = q; 3590 } 3591 3592 return p; 3593 } 3594 3595 } 3596 static { 3597 SharedSecrets.setJavaNetUriAccess( 3598 new JavaNetUriAccess() { 3599 public URI create(String scheme, String path) { 3600 return new URI(scheme, path); 3601 } 3602 } 3603 ); 3604 } 3605 }