1 /* 2 * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package java.net; 27 28 import java.io.IOException; 29 import java.io.InvalidObjectException; 30 import java.io.ObjectInputStream; 31 import java.io.ObjectOutputStream; 32 import java.io.Serializable; 33 import java.nio.ByteBuffer; 34 import java.nio.CharBuffer; 35 import java.nio.charset.CharsetDecoder; 36 import java.nio.charset.CoderResult; 37 import java.nio.charset.CodingErrorAction; 38 import java.nio.charset.CharacterCodingException; 39 import java.text.Normalizer; 40 import jdk.internal.loader.URLClassPath; 41 import jdk.internal.misc.JavaNetAccess; 42 import jdk.internal.misc.SharedSecrets; 43 import sun.nio.cs.ThreadLocalCoders; 44 45 import java.lang.Character; // for javadoc 46 import java.lang.NullPointerException; // for javadoc 47 48 49 /** 50 * Represents a Uniform Resource Identifier (URI) reference. 51 * 52 * <p> Aside from some minor deviations noted below, an instance of this 53 * class represents a URI reference as defined by 54 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 55 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a 56 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 57 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format 58 * also supports scope_ids. The syntax and usage of scope_ids is described 59 * <a href="Inet6Address.html#scoped">here</a>. 60 * This class provides constructors for creating URI instances from 61 * their components or by parsing their string forms, methods for accessing the 62 * various components of an instance, and methods for normalizing, resolving, 63 * and relativizing URI instances. Instances of this class are immutable. 
64 * 65 * 66 * <h3> URI syntax and components </h3> 67 * 68 * At the highest level a URI reference (hereinafter simply "URI") in string 69 * form has the syntax 70 * 71 * <blockquote> 72 * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>] 73 * </blockquote> 74 * 75 * where square brackets [...] delineate optional components and the characters 76 * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves. 77 * 78 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is 79 * said to be <i>relative</i>. URIs are also classified according to whether 80 * they are <i>opaque</i> or <i>hierarchical</i>. 81 * 82 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does 83 * not begin with a slash character ({@code '/'}). Opaque URIs are not 84 * subject to further parsing. Some examples of opaque URIs are: 85 * 86 * <blockquote><table cellpadding=0 cellspacing=0 summary="layout"> 87 * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr> 88 * <tr><td>{@code news:comp.lang.java}</td></tr> 89 * <tr><td>{@code urn:isbn:096139210x}</td></tr> 90 * </table></blockquote> 91 * 92 * <p> A <i>hierarchical</i> URI is either an absolute URI whose 93 * scheme-specific part begins with a slash character, or a relative URI, that 94 * is, a URI that does not specify a scheme.
Some examples of hierarchical 95 * URIs are: 96 * 97 * <blockquote> 98 * {@code http://example.com/languages/java/}<br> 99 * {@code sample/a/index.html#28}<br> 100 * {@code ../../demo/b/index.html}<br> 101 * {@code file:///~/calendar} 102 * </blockquote> 103 * 104 * <p> A hierarchical URI is subject to further parsing according to the syntax 105 * 106 * <blockquote> 107 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>] 108 * </blockquote> 109 * 110 * where the characters <b>{@code :}</b>, <b>{@code /}</b>, 111 * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. The 112 * scheme-specific part of a hierarchical URI consists of the characters 113 * between the scheme and fragment components. 114 * 115 * <p> The authority component of a hierarchical URI is, if specified, either 116 * <i>server-based</i> or <i>registry-based</i>. A server-based authority 117 * parses according to the familiar syntax 118 * 119 * <blockquote> 120 * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>] 121 * </blockquote> 122 * 123 * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for 124 * themselves. Nearly all URI schemes currently in use are server-based. An 125 * authority component that does not parse in this way is considered to be 126 * registry-based. 127 * 128 * <p> The path component of a hierarchical URI is itself said to be absolute 129 * if it begins with a slash character ({@code '/'}); otherwise it is 130 * relative. The path of a hierarchical URI that is either absolute or 131 * specifies an authority is always absolute. 
132 * 133 * <p> All told, then, a URI instance has the following nine components: 134 * 135 * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment"> 136 * <tr><th><i>Component</i></th><th><i>Type</i></th></tr> 137 * <tr><td>scheme</td><td>{@code String}</td></tr> 138 * <tr><td>scheme-specific-part </td><td>{@code String}</td></tr> 139 * <tr><td>authority</td><td>{@code String}</td></tr> 140 * <tr><td>user-info</td><td>{@code String}</td></tr> 141 * <tr><td>host</td><td>{@code String}</td></tr> 142 * <tr><td>port</td><td>{@code int}</td></tr> 143 * <tr><td>path</td><td>{@code String}</td></tr> 144 * <tr><td>query</td><td>{@code String}</td></tr> 145 * <tr><td>fragment</td><td>{@code String}</td></tr> 146 * </table></blockquote> 147 * 148 * In a given instance any particular component is either <i>undefined</i> or 149 * <i>defined</i> with a distinct value. Undefined string components are 150 * represented by {@code null}, while undefined integer components are 151 * represented by {@code -1}. A string component may be defined to have the 152 * empty string as its value; this is not equivalent to that component being 153 * undefined. 154 * 155 * <p> Whether a particular component is or is not defined in an instance 156 * depends upon the type of the URI being represented. An absolute URI has a 157 * scheme component. An opaque URI has a scheme, a scheme-specific part, and 158 * possibly a fragment, but has no other components. A hierarchical URI always 159 * has a path (though it may be empty) and a scheme-specific-part (which at 160 * least contains the path), and may have any of the other components. If the 161 * authority component is present and is server-based then the host component 162 * will be defined and the user-information and port components may be defined. 
163 * 164 * 165 * <h4> Operations on URI instances </h4> 166 * 167 * The key operations supported by this class are those of 168 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>. 169 * 170 * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."} 171 * and {@code ".."} segments from the path component of a hierarchical URI. 172 * Each {@code "."} segment is simply removed. A {@code ".."} segment is 173 * removed only if it is preceded by a non-{@code ".."} segment. 174 * Normalization has no effect upon opaque URIs. 175 * 176 * <p> <i>Resolution</i> is the process of resolving one URI against another, 177 * <i>base</i> URI. The resulting URI is constructed from components of both 178 * URIs in the manner specified by RFC 2396, taking components from the 179 * base URI for those not specified in the original. For hierarchical URIs, 180 * the path of the original is resolved against the path of the base and then 181 * normalized. The result, for example, of resolving 182 * 183 * <blockquote> 184 * {@code sample/a/index.html#28} 185 * 186 * (1) 187 * </blockquote> 188 * 189 * against the base URI {@code http://example.com/languages/java/} is the result 190 * URI 191 * 192 * <blockquote> 193 * {@code http://example.com/languages/java/sample/a/index.html#28} 194 * </blockquote> 195 * 196 * Resolving the relative URI 197 * 198 * <blockquote> 199 * {@code ../../demo/b/index.html} (2) 200 * </blockquote> 201 * 202 * against this result yields, in turn, 203 * 204 * <blockquote> 205 * {@code http://example.com/languages/java/demo/b/index.html} 206 * </blockquote> 207 * 208 * Resolution of both absolute and relative URIs, and of both absolute and 209 * relative paths in the case of hierarchical URIs, is supported. Resolving 210 * the URI {@code file:///~/calendar} against any other URI simply yields the 211 * original URI, since it is absolute.
Resolving the relative URI (2) above 212 * against the relative base URI (1) yields the normalized, but still relative, 213 * URI 214 * 215 * <blockquote> 216 * {@code demo/b/index.html} 217 * </blockquote> 218 * 219 * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any 220 * two normalized URIs <i>u</i> and <i>v</i>, 221 * 222 * <blockquote> 223 * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} and<br> 224 * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} .<br> 225 * </blockquote> 226 * 227 * This operation is often useful when constructing a document containing URIs 228 * that must be made relative to the base URI of the document wherever 229 * possible. For example, relativizing the URI 230 * 231 * <blockquote> 232 * {@code http://example.com/languages/java/sample/a/index.html#28} 233 * </blockquote> 234 * 235 * against the base URI 236 * 237 * <blockquote> 238 * {@code http://example.com/languages/java/} 239 * </blockquote> 240 * 241 * yields the relative URI {@code sample/a/index.html#28}. 242 * 243 * 244 * <h4> Character categories </h4> 245 * 246 * RFC 2396 specifies precisely which characters are permitted in the 247 * various components of a URI reference. 
The following categories, most of 248 * which are taken from that specification, are used below to describe these 249 * constraints: 250 * 251 * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other"> 252 * <tr><th valign=top><i>alpha</i></th> 253 * <td>The US-ASCII alphabetic characters, 254 * {@code 'A'} through {@code 'Z'} 255 * and {@code 'a'} through {@code 'z'}</td></tr> 256 * <tr><th valign=top><i>digit</i></th> 257 * <td>The US-ASCII decimal digit characters, 258 * {@code '0'} through {@code '9'}</td></tr> 259 * <tr><th valign=top><i>alphanum</i></th> 260 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> 261 * <tr><th valign=top><i>unreserved</i> </th> 262 * <td>All <i>alphanum</i> characters together with those in the string 263 * {@code "_-!.~'()*"}</td></tr> 264 * <tr><th valign=top><i>punct</i></th> 265 * <td>The characters in the string {@code ",;:$&+="}</td></tr> 266 * <tr><th valign=top><i>reserved</i></th> 267 * <td>All <i>punct</i> characters together with those in the string 268 * {@code "?/[]@"}</td></tr> 269 * <tr><th valign=top><i>escaped</i></th> 270 * <td>Escaped octets, that is, triplets consisting of the percent 271 * character ({@code '%'}) followed by two hexadecimal digits 272 * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and 273 * {@code 'a'}-{@code 'f'})</td></tr> 274 * <tr><th valign=top><i>other</i></th> 275 * <td>The Unicode characters that are not in the US-ASCII character set, 276 * are not control characters (according to the {@link 277 * java.lang.Character#isISOControl(char) Character.isISOControl} 278 * method), and are not space characters (according to the {@link 279 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} 280 * method) <i>(<b>Deviation from RFC 2396</b>, which is 281 * limited to US-ASCII)</i></td></tr> 282 * </table></blockquote> 283 * 284 * <p><a name="legal-chars"></a> The set of all legal URI characters 
consists of 285 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> 286 * characters. 287 * 288 * 289 * <h4> Escaped octets, quotation, encoding, and decoding </h4> 290 * 291 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and 292 * fragment components. Escaping serves two purposes in URIs: 293 * 294 * <ul> 295 * 296 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to 297 * conform strictly to RFC 2396 by not containing any <i>other</i> 298 * characters. </p></li> 299 * 300 * <li><p> To <i>quote</i> characters that are otherwise illegal in a 301 * component. The user-info, path, query, and fragment components differ 302 * slightly in terms of which characters are considered legal and illegal. 303 * </p></li> 304 * 305 * </ul> 306 * 307 * These purposes are served in this class by three related operations: 308 * 309 * <ul> 310 * 311 * <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it 312 * with the sequence of escaped octets that represent that character in the 313 * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), 314 * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from 315 * RFC 2396</b>, which does not specify any particular character 316 * set.)</i> </p></li> 317 * 318 * <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by 319 * encoding it. The space character, for example, is quoted by replacing it 320 * with {@code "%20"}. UTF-8 contains US-ASCII, hence for US-ASCII 321 * characters this transformation has exactly the effect required by 322 * RFC 2396. </p></li> 323 * 324 * <li><p><a name="decode"></a> 325 * A sequence of escaped octets is <i>decoded</i> by 326 * replacing it with the sequence of characters that it represents in the 327 * UTF-8 character set. 
UTF-8 contains US-ASCII, hence decoding has the 328 * effect of de-quoting any quoted US-ASCII characters as well as that of 329 * decoding any encoded non-US-ASCII characters. If a <a 330 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs 331 * when decoding the escaped octets then the erroneous octets are replaced by 332 * {@code '\u005CuFFFD'}, the Unicode replacement character. </p></li> 333 * 334 * </ul> 335 * 336 * These operations are exposed in the constructors and methods of this class 337 * as follows: 338 * 339 * <ul> 340 * 341 * <li><p> The {@linkplain #URI(java.lang.String) single-argument 342 * constructor} requires any illegal characters in its argument to be 343 * quoted and preserves any escaped octets and <i>other</i> characters that 344 * are present. </p></li> 345 * 346 * <li><p> The {@linkplain 347 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) 348 * multi-argument constructors} quote illegal characters as 349 * required by the components in which they appear. The percent character 350 * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> 351 * characters are preserved. </p></li> 352 * 353 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() 354 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() 355 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link 356 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the 357 * values of their corresponding components in raw form, without interpreting 358 * any escaped octets. The strings returned by these methods may contain 359 * both escaped octets and <i>other</i> characters, and will not contain any 360 * illegal characters. 
</p></li> 361 * 362 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() 363 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() 364 * getFragment}, {@link #getAuthority() getAuthority}, and {@link 365 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped 366 * octets in their corresponding components. The strings returned by these 367 * methods may contain both <i>other</i> characters and illegal characters, 368 * and will not contain any escaped octets. </p></li> 369 * 370 * <li><p> The {@link #toString() toString} method returns a URI string with 371 * all necessary quotation but which may contain <i>other</i> characters. 372 * </p></li> 373 * 374 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully 375 * quoted and encoded URI string that does not contain any <i>other</i> 376 * characters. </p></li> 377 * 378 * </ul> 379 * 380 * 381 * <h4> Identities </h4> 382 * 383 * For any URI <i>u</i>, it is always the case that 384 * 385 * <blockquote> 386 * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . 
387 * </blockquote> 388 * 389 * For any URI <i>u</i> that does not contain redundant syntax such as two 390 * slashes before an empty authority (as in {@code file:///tmp/} ) or a 391 * colon following a host name but no port (as in 392 * {@code http://java.sun.com:} ), and that does not encode characters 393 * except those that must be quoted, the following identities also hold: 394 * <pre> 395 * new URI(<i>u</i>.getScheme(), 396 * <i>u</i>.getSchemeSpecificPart(), 397 * <i>u</i>.getFragment()) 398 * .equals(<i>u</i>)</pre> 399 * in all cases, 400 * <pre> 401 * new URI(<i>u</i>.getScheme(), 402 * <i>u</i>.getAuthority(), 403 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 404 * <i>u</i>.getFragment()) 405 * .equals(<i>u</i>)</pre> 406 * if <i>u</i> is hierarchical, and 407 * <pre> 408 * new URI(<i>u</i>.getScheme(), 409 * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), 410 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 411 * <i>u</i>.getFragment()) 412 * .equals(<i>u</i>)</pre> 413 * if <i>u</i> is hierarchical and has either no authority or a server-based 414 * authority. 415 * 416 * 417 * <h4> URIs, URLs, and URNs </h4> 418 * 419 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform 420 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but 421 * not every URI is a URL. This is because there is another subcategory of 422 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not 423 * specify how to locate them. The {@code mailto}, {@code news}, and 424 * {@code isbn} URIs shown above are examples of URNs. 425 * 426 * <p> The conceptual distinction between URIs and URLs is reflected in the 427 * differences between this class and the {@link URL} class. 428 * 429 * <p> An instance of this class represents a URI reference in the syntactic 430 * sense defined by RFC 2396. A URI may be either absolute or relative. 
431 * A URI string is parsed according to the generic syntax without regard to the 432 * scheme, if any, that it specifies. No lookup of the host, if any, is 433 * performed, and no scheme-dependent stream handler is constructed. Equality, 434 * hashing, and comparison are defined strictly in terms of the character 435 * content of the instance. In other words, a URI instance is little more than 436 * a structured string that supports the syntactic, scheme-independent 437 * operations of comparison, normalization, resolution, and relativization. 438 * 439 * <p> An instance of the {@link URL} class, by contrast, represents the 440 * syntactic components of a URL together with some of the information required 441 * to access the resource that it describes. A URL must be absolute, that is, 442 * it must always specify a scheme. A URL string is parsed according to its 443 * scheme. A stream handler is always established for a URL, and in fact it is 444 * impossible to create a URL instance for a scheme for which no handler is 445 * available. Equality and hashing depend upon both the scheme and the 446 * Internet address of the host, if any; comparison is not defined. In other 447 * words, a URL is a structured string that supports the syntactic operation of 448 * resolution as well as the network I/O operations of looking up the host and 449 * opening a connection to the specified resource. 
450 * 451 * 452 * @author Mark Reinhold 453 * @since 1.4 454 * 455 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a 456 * transformation format of ISO 10646</i></a>, <br><a 457 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing 458 * Architecture</i></a>, <br><a 459 * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 460 * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a 461 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 462 * Literal IPv6 Addresses in URLs</i></a>, <br><a 463 * href="URISyntaxException.html">URISyntaxException</a> 464 */ 465 466 public final class URI 467 implements Comparable<URI>, Serializable 468 { 469 470 // Note: Comments containing the word "ASSERT" indicate places where a 471 // throw of an InternalError should be replaced by an appropriate assertion 472 // statement once asserts are enabled in the build. 473 474 static final long serialVersionUID = -6052424284110960213L; 475 476 477 // -- Properties and components of this instance -- 478 479 // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>] 480 private transient String scheme; // null ==> relative URI 481 private transient String fragment; 482 483 // Hierarchical URI components: [//<authority>]<path>[?<query>] 484 private transient String authority; // Registry or server 485 486 // Server-based authority: [<userInfo>@]<host>[:<port>] 487 private transient String userInfo; 488 private transient String host; // null ==> registry-based 489 private transient int port = -1; // -1 ==> undefined 490 491 // Remaining components of hierarchical URIs 492 private transient String path; // null ==> opaque 493 private transient String query; 494 495 // The remaining fields may be computed on demand, which is safe even in 496 // the face of multiple threads racing to initialize them 497 private transient String schemeSpecificPart; 498 private transient int hash; // Zero ==> undefined 
499 500 private transient String decodedUserInfo; 501 private transient String decodedAuthority; 502 private transient String decodedPath; 503 private transient String decodedQuery; 504 private transient String decodedFragment; 505 private transient String decodedSchemeSpecificPart; 506 507 /** 508 * The string form of this URI. 509 * 510 * @serial 511 */ 512 private volatile String string; // The only serializable field 513 514 515 516 // -- Constructors and factories -- 517 518 private URI() { } // Used internally 519 520 /** 521 * Constructs a URI by parsing the given string. 522 * 523 * <p> This constructor parses the given string exactly as specified by the 524 * grammar in <a 525 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 526 * Appendix A, <b><i>except for the following deviations:</i></b> </p> 527 * 528 * <ul> 529 * 530 * <li><p> An empty authority component is permitted as long as it is 531 * followed by a non-empty path, a query component, or a fragment 532 * component. This allows the parsing of URIs such as 533 * {@code "file:///foo/bar"}, which seems to be the intent of 534 * RFC 2396 although the grammar does not permit it. If the 535 * authority component is empty then the user-information, host, and port 536 * components are undefined. </p></li> 537 * 538 * <li><p> Empty relative paths are permitted; this seems to be the 539 * intent of RFC 2396 although the grammar does not permit it. The 540 * primary consequence of this deviation is that a standalone fragment 541 * such as {@code "#foo"} parses as a relative URI with an empty path 542 * and the given fragment, and can be usefully <a 543 * href="#resolve-frag">resolved</a> against a base URI. 544 * 545 * <li><p> IPv4 addresses in host components are parsed rigorously, as 546 * specified by <a 547 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each 548 * element of a dotted-quad address must contain no more than three 549 * decimal digits. 
Each element is further constrained to have a value 550 * no greater than 255. </p></li> 551 * 552 * <li> <p> Hostnames in host components that comprise only a single 553 * domain label are permitted to start with an <i>alphanum</i> 554 * character. This seems to be the intent of <a 555 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 556 * section 3.2.2 although the grammar does not permit it. The 557 * consequence of this deviation is that the authority component of a 558 * hierarchical URI such as {@code s://123}, will parse as a server-based 559 * authority. </p></li> 560 * 561 * <li><p> IPv6 addresses are permitted for the host component. An IPv6 562 * address must be enclosed in square brackets ({@code '['} and 563 * {@code ']'}) as specified by <a 564 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The 565 * IPv6 address itself must parse according to <a 566 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 567 * addresses are further constrained to describe no more than sixteen 568 * bytes of address information, a constraint implicit in RFC 2373 569 * but not expressible in the grammar. </p></li> 570 * 571 * <li><p> Characters in the <i>other</i> category are permitted wherever 572 * RFC 2396 permits <i>escaped</i> octets, that is, in the 573 * user-information, path, query, and fragment components, as well as in 574 * the authority component if the authority is registry-based. This 575 * allows URIs to contain Unicode characters beyond those in the US-ASCII 576 * character set. 
</p></li> 577 * 578 * </ul> 579 * 580 * @param str The string to be parsed into a URI 581 * 582 * @throws NullPointerException 583 * If {@code str} is {@code null} 584 * 585 * @throws URISyntaxException 586 * If the given string violates RFC 2396, as augmented 587 * by the above deviations 588 */ 589 public URI(String str) throws URISyntaxException { 590 new Parser(str).parse(false); 591 } 592 593 /** 594 * Constructs a hierarchical URI from the given components. 595 * 596 * <p> If a scheme is given then the path, if also given, must either be 597 * empty or begin with a slash character ({@code '/'}). Otherwise a 598 * component of the new URI may be left undefined by passing {@code null} 599 * for the corresponding parameter or, in the case of the {@code port} 600 * parameter, by passing {@code -1}. 601 * 602 * <p> This constructor first builds a URI string from the given components 603 * according to the rules specified in <a 604 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 605 * section 5.2, step 7: </p> 606 * 607 * <ol> 608 * 609 * <li><p> Initially, the result string is empty. </p></li> 610 * 611 * <li><p> If a scheme is given then it is appended to the result, 612 * followed by a colon character ({@code ':'}). </p></li> 613 * 614 * <li><p> If user information, a host, or a port are given then the 615 * string {@code "//"} is appended. </p></li> 616 * 617 * <li><p> If user information is given then it is appended, followed by 618 * a commercial-at character ({@code '@'}). Any character not in the 619 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 620 * categories is <a href="#quote">quoted</a>. </p></li> 621 * 622 * <li><p> If a host is given then it is appended. If the host is a 623 * literal IPv6 address but is not enclosed in square brackets 624 * ({@code '['} and {@code ']'}) then the square brackets are added. 
625 * </p></li> 626 * 627 * <li><p> If a port number is given then a colon character 628 * ({@code ':'}) is appended, followed by the port number in decimal. 629 * </p></li> 630 * 631 * <li><p> If a path is given then it is appended. Any character not in 632 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 633 * categories, and not equal to the slash character ({@code '/'}) or the 634 * commercial-at character ({@code '@'}), is quoted. </p></li> 635 * 636 * <li><p> If a query is given then a question-mark character 637 * ({@code '?'}) is appended, followed by the query. Any character that 638 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 639 * </p></li> 640 * 641 * <li><p> Finally, if a fragment is given then a hash character 642 * ({@code '#'}) is appended, followed by the fragment. Any character 643 * that is not a legal URI character is quoted. </p></li> 644 * 645 * </ol> 646 * 647 * <p> The resulting URI string is then parsed as if by invoking the {@link 648 * #URI(String)} constructor and then invoking the {@link 649 * #parseServerAuthority()} method upon the result; this may cause a {@link 650 * URISyntaxException} to be thrown. 
</p> 651 * 652 * @param scheme Scheme name 653 * @param userInfo User name and authorization information 654 * @param host Host name 655 * @param port Port number 656 * @param path Path 657 * @param query Query 658 * @param fragment Fragment 659 * 660 * @throws URISyntaxException 661 * If both a scheme and a path are given but the path is relative, 662 * if the URI string constructed from the given components violates 663 * RFC 2396, or if the authority component of the string is 664 * present but cannot be parsed as a server-based authority 665 */ 666 public URI(String scheme, 667 String userInfo, String host, int port, 668 String path, String query, String fragment) 669 throws URISyntaxException 670 { 671 String s = toString(scheme, null, 672 null, userInfo, host, port, 673 path, query, fragment); 674 checkPath(s, scheme, path); 675 new Parser(s).parse(true); 676 } 677 678 /** 679 * Constructs a hierarchical URI from the given components. 680 * 681 * <p> If a scheme is given then the path, if also given, must either be 682 * empty or begin with a slash character ({@code '/'}). Otherwise a 683 * component of the new URI may be left undefined by passing {@code null} 684 * for the corresponding parameter. 685 * 686 * <p> This constructor first builds a URI string from the given components 687 * according to the rules specified in <a 688 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 689 * section 5.2, step 7: </p> 690 * 691 * <ol> 692 * 693 * <li><p> Initially, the result string is empty. </p></li> 694 * 695 * <li><p> If a scheme is given then it is appended to the result, 696 * followed by a colon character ({@code ':'}). </p></li> 697 * 698 * <li><p> If an authority is given then the string {@code "//"} is 699 * appended, followed by the authority. If the authority contains a 700 * literal IPv6 address then the address must be enclosed in square 701 * brackets ({@code '['} and {@code ']'}). 
Any character not in the 702 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 703 * categories, and not equal to the commercial-at character 704 * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> 705 * 706 * <li><p> If a path is given then it is appended. Any character not in 707 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 708 * categories, and not equal to the slash character ({@code '/'}) or the 709 * commercial-at character ({@code '@'}), is quoted. </p></li> 710 * 711 * <li><p> If a query is given then a question-mark character 712 * ({@code '?'}) is appended, followed by the query. Any character that 713 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 714 * </p></li> 715 * 716 * <li><p> Finally, if a fragment is given then a hash character 717 * ({@code '#'}) is appended, followed by the fragment. Any character 718 * that is not a legal URI character is quoted. </p></li> 719 * 720 * </ol> 721 * 722 * <p> The resulting URI string is then parsed as if by invoking the {@link 723 * #URI(String)} constructor and then invoking the {@link 724 * #parseServerAuthority()} method upon the result; this may cause a {@link 725 * URISyntaxException} to be thrown. 
</p> 726 * 727 * @param scheme Scheme name 728 * @param authority Authority 729 * @param path Path 730 * @param query Query 731 * @param fragment Fragment 732 * 733 * @throws URISyntaxException 734 * If both a scheme and a path are given but the path is relative, 735 * if the URI string constructed from the given components violates 736 * RFC 2396, or if the authority component of the string is 737 * present but cannot be parsed as a server-based authority 738 */ 739 public URI(String scheme, 740 String authority, 741 String path, String query, String fragment) 742 throws URISyntaxException 743 { 744 String s = toString(scheme, null, 745 authority, null, null, -1, 746 path, query, fragment); 747 checkPath(s, scheme, path); 748 new Parser(s).parse(false); 749 } 750 751 /** 752 * Constructs a hierarchical URI from the given components. 753 * 754 * <p> A component may be left undefined by passing {@code null}. 755 * 756 * <p> This convenience constructor works as if by invoking the 757 * seven-argument constructor as follows: 758 * 759 * <blockquote> 760 * {@code new} {@link #URI(String, String, String, int, String, String, String) 761 * URI}{@code (scheme, null, host, -1, path, null, fragment);} 762 * </blockquote> 763 * 764 * @param scheme Scheme name 765 * @param host Host name 766 * @param path Path 767 * @param fragment Fragment 768 * 769 * @throws URISyntaxException 770 * If the URI string constructed from the given components 771 * violates RFC 2396 772 */ 773 public URI(String scheme, String host, String path, String fragment) 774 throws URISyntaxException 775 { 776 this(scheme, null, host, -1, path, null, fragment); 777 } 778 779 /** 780 * Constructs a URI from the given components. 781 * 782 * <p> A component may be left undefined by passing {@code null}. 783 * 784 * <p> This constructor first builds a URI in string form using the given 785 * components as follows: </p> 786 * 787 * <ol> 788 * 789 * <li><p> Initially, the result string is empty. 
</p></li> 790 * 791 * <li><p> If a scheme is given then it is appended to the result, 792 * followed by a colon character ({@code ':'}). </p></li> 793 * 794 * <li><p> If a scheme-specific part is given then it is appended. Any 795 * character that is not a <a href="#legal-chars">legal URI character</a> 796 * is <a href="#quote">quoted</a>. </p></li> 797 * 798 * <li><p> Finally, if a fragment is given then a hash character 799 * ({@code '#'}) is appended to the string, followed by the fragment. 800 * Any character that is not a legal URI character is quoted. </p></li> 801 * 802 * </ol> 803 * 804 * <p> The resulting URI string is then parsed in order to create the new 805 * URI instance as if by invoking the {@link #URI(String)} constructor; 806 * this may cause a {@link URISyntaxException} to be thrown. </p> 807 * 808 * @param scheme Scheme name 809 * @param ssp Scheme-specific part 810 * @param fragment Fragment 811 * 812 * @throws URISyntaxException 813 * If the URI string constructed from the given components 814 * violates RFC 2396 815 */ 816 public URI(String scheme, String ssp, String fragment) 817 throws URISyntaxException 818 { 819 new Parser(toString(scheme, ssp, 820 null, null, null, -1, 821 null, null, fragment)) 822 .parse(false); 823 } 824 825 /** 826 * Constructs a simple URI consisting of only a scheme and a pre-validated 827 * path. Provides a fast-path for some internal cases. 828 */ 829 URI(String scheme, String path) { 830 this.scheme = scheme; 831 this.path = path; 832 } 833 834 /** 835 * Creates a URI by parsing the given string. 836 * 837 * <p> This convenience factory method works as if by invoking the {@link 838 * #URI(String)} constructor; any {@link URISyntaxException} thrown by the 839 * constructor is caught and wrapped in a new {@link 840 * IllegalArgumentException} object, which is then thrown. 
841 * 842 * <p> This method is provided for use in situations where it is known that 843 * the given string is a legal URI, for example for URI constants declared 844 * within in a program, and so it would be considered a programming error 845 * for the string not to parse as such. The constructors, which throw 846 * {@link URISyntaxException} directly, should be used situations where a 847 * URI is being constructed from user input or from some other source that 848 * may be prone to errors. </p> 849 * 850 * @param str The string to be parsed into a URI 851 * @return The new URI 852 * 853 * @throws NullPointerException 854 * If {@code str} is {@code null} 855 * 856 * @throws IllegalArgumentException 857 * If the given string violates RFC 2396 858 */ 859 public static URI create(String str) { 860 try { 861 return new URI(str); 862 } catch (URISyntaxException x) { 863 throw new IllegalArgumentException(x.getMessage(), x); 864 } 865 } 866 867 868 // -- Operations -- 869 870 /** 871 * Attempts to parse this URI's authority component, if defined, into 872 * user-information, host, and port components. 873 * 874 * <p> If this URI's authority component has already been recognized as 875 * being server-based then it will already have been parsed into 876 * user-information, host, and port components. In this case, or if this 877 * URI has no authority component, this method simply returns this URI. 878 * 879 * <p> Otherwise this method attempts once more to parse the authority 880 * component into user-information, host, and port components, and throws 881 * an exception describing why the authority component could not be parsed 882 * in that way. 883 * 884 * <p> This method is provided because the generic URI syntax specified in 885 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 886 * cannot always distinguish a malformed server-based authority from a 887 * legitimate registry-based authority. 
It must therefore treat some 888 * instances of the former as instances of the latter. The authority 889 * component in the URI string {@code "//foo:bar"}, for example, is not a 890 * legal server-based authority but it is legal as a registry-based 891 * authority. 892 * 893 * <p> In many common situations, for example when working URIs that are 894 * known to be either URNs or URLs, the hierarchical URIs being used will 895 * always be server-based. They therefore must either be parsed as such or 896 * treated as an error. In these cases a statement such as 897 * 898 * <blockquote> 899 * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} 900 * </blockquote> 901 * 902 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if 903 * it has an authority component, has a server-based authority with proper 904 * user-information, host, and port components. Invoking this method also 905 * ensures that if the authority could not be parsed in that way then an 906 * appropriate diagnostic message can be issued based upon the exception 907 * that is thrown. </p> 908 * 909 * @return A URI whose authority field has been parsed 910 * as a server-based authority 911 * 912 * @throws URISyntaxException 913 * If the authority component of this URI is defined 914 * but cannot be parsed as a server-based authority 915 * according to RFC 2396 916 */ 917 public URI parseServerAuthority() 918 throws URISyntaxException 919 { 920 // We could be clever and cache the error message and index from the 921 // exception thrown during the original parse, but that would require 922 // either more fields or a more-obscure representation. 923 if ((host != null) || (authority == null)) 924 return this; 925 new Parser(toString()).parse(true); 926 return this; 927 } 928 929 /** 930 * Normalizes this URI's path. 931 * 932 * <p> If this URI is opaque, or if its path is already in normal form, 933 * then this URI is returned. 
Otherwise a new URI is constructed that is 934 * identical to this URI except that its path is computed by normalizing 935 * this URI's path in a manner consistent with <a 936 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 937 * section 5.2, step 6, sub-steps c through f; that is: 938 * </p> 939 * 940 * <ol> 941 * 942 * <li><p> All {@code "."} segments are removed. </p></li> 943 * 944 * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} 945 * segment then both of these segments are removed. This step is 946 * repeated until it is no longer applicable. </p></li> 947 * 948 * <li><p> If the path is relative, and if its first segment contains a 949 * colon character ({@code ':'}), then a {@code "."} segment is 950 * prepended. This prevents a relative URI with a path such as 951 * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a 952 * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. 953 * <b><i>(Deviation from RFC 2396)</i></b> </p></li> 954 * 955 * </ol> 956 * 957 * <p> A normalized path will begin with one or more {@code ".."} segments 958 * if there were insufficient non-{@code ".."} segments preceding them to 959 * allow their removal. A normalized path will begin with a {@code "."} 960 * segment if one was inserted by step 3 above. Otherwise, a normalized 961 * path will not contain any {@code "."} or {@code ".."} segments. </p> 962 * 963 * @return A URI equivalent to this URI, 964 * but whose path is in normal form 965 */ 966 public URI normalize() { 967 return normalize(this); 968 } 969 970 /** 971 * Resolves the given URI against this URI. 972 * 973 * <p> If the given URI is already absolute, or if this URI is opaque, then 974 * the given URI is returned. 
*
     * <p><a name="resolve-frag"></a> If the given URI's fragment component is
     * defined, its path component is empty, and its scheme, authority, and
     * query components are undefined, then a URI with the given fragment but
     * with all other components equal to those of this URI is returned.  This
     * allows a URI representing a standalone fragment reference, such as
     * {@code "#foo"}, to be usefully resolved against a base URI.
     *
     * <p> Otherwise this method constructs a new hierarchical URI in a manner
     * consistent with <a
     * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
     * section 5.2; that is: </p>
     *
     * <ol>
     *
     *   <li><p> A new URI is constructed with this URI's scheme and the given
     *   URI's query and fragment components. </p></li>
     *
     *   <li><p> If the given URI has an authority component then the new URI's
     *   authority and path are taken from the given URI. </p></li>
     *
     *   <li><p> Otherwise the new URI's authority component is copied from
     *   this URI, and its path is computed as follows: </p>
     *
     *   <ol>
     *
     *     <li><p> If the given URI's path is absolute then the new URI's path
     *     is taken from the given URI. </p></li>
     *
     *     <li><p> Otherwise the given URI's path is relative, and so the new
     *     URI's path is computed by resolving the path of the given URI
     *     against the path of this URI.  This is done by concatenating all but
     *     the last segment of this URI's path, if any, with the given URI's
     *     path and then normalizing the result as if by invoking the {@link
     *     #normalize() normalize} method. </p></li>
     *
     *   </ol></li>
     *
     * </ol>
     *
     * <p> The result of this method is absolute if, and only if, either this
     * URI is absolute or the given URI is absolute.
</p> 1017 * 1018 * @param uri The URI to be resolved against this URI 1019 * @return The resulting URI 1020 * 1021 * @throws NullPointerException 1022 * If {@code uri} is {@code null} 1023 */ 1024 public URI resolve(URI uri) { 1025 return resolve(this, uri); 1026 } 1027 1028 /** 1029 * Constructs a new URI by parsing the given string and then resolving it 1030 * against this URI. 1031 * 1032 * <p> This convenience method works as if invoking it were equivalent to 1033 * evaluating the expression {@link #resolve(java.net.URI) 1034 * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p> 1035 * 1036 * @param str The string to be parsed into a URI 1037 * @return The resulting URI 1038 * 1039 * @throws NullPointerException 1040 * If {@code str} is {@code null} 1041 * 1042 * @throws IllegalArgumentException 1043 * If the given string violates RFC 2396 1044 */ 1045 public URI resolve(String str) { 1046 return resolve(URI.create(str)); 1047 } 1048 1049 /** 1050 * Relativizes the given URI against this URI. 1051 * 1052 * <p> The relativization of the given URI against this URI is computed as 1053 * follows: </p> 1054 * 1055 * <ol> 1056 * 1057 * <li><p> If either this URI or the given URI are opaque, or if the 1058 * scheme and authority components of the two URIs are not identical, or 1059 * if the path of this URI is not a prefix of the path of the given URI, 1060 * then the given URI is returned. </p></li> 1061 * 1062 * <li><p> Otherwise a new relative hierarchical URI is constructed with 1063 * query and fragment components taken from the given URI and with a path 1064 * component computed by removing this URI's path from the beginning of 1065 * the given URI's path. 
</p></li> 1066 * 1067 * </ol> 1068 * 1069 * @param uri The URI to be relativized against this URI 1070 * @return The resulting URI 1071 * 1072 * @throws NullPointerException 1073 * If {@code uri} is {@code null} 1074 */ 1075 public URI relativize(URI uri) { 1076 return relativize(this, uri); 1077 } 1078 1079 /** 1080 * Constructs a URL from this URI. 1081 * 1082 * <p> This convenience method works as if invoking it were equivalent to 1083 * evaluating the expression {@code new URL(this.toString())} after 1084 * first checking that this URI is absolute. </p> 1085 * 1086 * @return A URL constructed from this URI 1087 * 1088 * @throws IllegalArgumentException 1089 * If this URL is not absolute 1090 * 1091 * @throws MalformedURLException 1092 * If a protocol handler for the URL could not be found, 1093 * or if some other error occurred while constructing the URL 1094 */ 1095 public URL toURL() throws MalformedURLException { 1096 return URL.fromURI(this); 1097 } 1098 1099 // -- Component access methods -- 1100 1101 /** 1102 * Returns the scheme component of this URI. 1103 * 1104 * <p> The scheme component of a URI, if defined, only contains characters 1105 * in the <i>alphanum</i> category and in the string {@code "-.+"}. A 1106 * scheme always starts with an <i>alpha</i> character. <p> 1107 * 1108 * The scheme component of a URI cannot contain escaped octets, hence this 1109 * method does not perform any decoding. 1110 * 1111 * @return The scheme component of this URI, 1112 * or {@code null} if the scheme is undefined 1113 */ 1114 public String getScheme() { 1115 return scheme; 1116 } 1117 1118 /** 1119 * Tells whether or not this URI is absolute. 1120 * 1121 * <p> A URI is absolute if, and only if, it has a scheme component. </p> 1122 * 1123 * @return {@code true} if, and only if, this URI is absolute 1124 */ 1125 public boolean isAbsolute() { 1126 return scheme != null; 1127 } 1128 1129 /** 1130 * Tells whether or not this URI is opaque. 
1131 * 1132 * <p> A URI is opaque if, and only if, it is absolute and its 1133 * scheme-specific part does not begin with a slash character ('/'). 1134 * An opaque URI has a scheme, a scheme-specific part, and possibly 1135 * a fragment; all other components are undefined. </p> 1136 * 1137 * @return {@code true} if, and only if, this URI is opaque 1138 */ 1139 public boolean isOpaque() { 1140 return path == null; 1141 } 1142 1143 /** 1144 * Returns the raw scheme-specific part of this URI. The scheme-specific 1145 * part is never undefined, though it may be empty. 1146 * 1147 * <p> The scheme-specific part of a URI only contains legal URI 1148 * characters. </p> 1149 * 1150 * @return The raw scheme-specific part of this URI 1151 * (never {@code null}) 1152 */ 1153 public String getRawSchemeSpecificPart() { 1154 String part = schemeSpecificPart; 1155 if (part != null) { 1156 return part; 1157 } 1158 1159 String s = string; 1160 if (s != null) { 1161 // if string is defined, components will have been parsed 1162 int start = 0; 1163 int end = s.length(); 1164 if (scheme != null) { 1165 start = scheme.length() + 1; 1166 } 1167 if (fragment != null) { 1168 end -= fragment.length() + 1; 1169 } 1170 if (path != null && path.length() == end - start) { 1171 part = path; 1172 } else { 1173 part = s.substring(start, end); 1174 } 1175 } else { 1176 StringBuilder sb = new StringBuilder(); 1177 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), 1178 host, port, getPath(), getQuery()); 1179 part = sb.toString(); 1180 } 1181 return schemeSpecificPart = part; 1182 } 1183 1184 /** 1185 * Returns the decoded scheme-specific part of this URI. 1186 * 1187 * <p> The string returned by this method is equal to that returned by the 1188 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method 1189 * except that all sequences of escaped octets are <a 1190 * href="#decode">decoded</a>. 
</p> 1191 * 1192 * @return The decoded scheme-specific part of this URI 1193 * (never {@code null}) 1194 */ 1195 public String getSchemeSpecificPart() { 1196 String part = decodedSchemeSpecificPart; 1197 if (part == null) { 1198 decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart()); 1199 } 1200 return part; 1201 } 1202 1203 /** 1204 * Returns the raw authority component of this URI. 1205 * 1206 * <p> The authority component of a URI, if defined, only contains the 1207 * commercial-at character ({@code '@'}) and characters in the 1208 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> 1209 * categories. If the authority is server-based then it is further 1210 * constrained to have valid user-information, host, and port 1211 * components. </p> 1212 * 1213 * @return The raw authority component of this URI, 1214 * or {@code null} if the authority is undefined 1215 */ 1216 public String getRawAuthority() { 1217 return authority; 1218 } 1219 1220 /** 1221 * Returns the decoded authority component of this URI. 1222 * 1223 * <p> The string returned by this method is equal to that returned by the 1224 * {@link #getRawAuthority() getRawAuthority} method except that all 1225 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1226 * 1227 * @return The decoded authority component of this URI, 1228 * or {@code null} if the authority is undefined 1229 */ 1230 public String getAuthority() { 1231 String auth = decodedAuthority; 1232 if ((auth == null) && (authority != null)) { 1233 decodedAuthority = auth = decode(authority); 1234 } 1235 return auth; 1236 } 1237 1238 /** 1239 * Returns the raw user-information component of this URI. 1240 * 1241 * <p> The user-information component of a URI, if defined, only contains 1242 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and 1243 * <i>other</i> categories. 
</p> 1244 * 1245 * @return The raw user-information component of this URI, 1246 * or {@code null} if the user information is undefined 1247 */ 1248 public String getRawUserInfo() { 1249 return userInfo; 1250 } 1251 1252 /** 1253 * Returns the decoded user-information component of this URI. 1254 * 1255 * <p> The string returned by this method is equal to that returned by the 1256 * {@link #getRawUserInfo() getRawUserInfo} method except that all 1257 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1258 * 1259 * @return The decoded user-information component of this URI, 1260 * or {@code null} if the user information is undefined 1261 */ 1262 public String getUserInfo() { 1263 String user = decodedUserInfo; 1264 if ((user == null) && (userInfo != null)) { 1265 decodedUserInfo = user = decode(userInfo); 1266 } 1267 return user; 1268 } 1269 1270 /** 1271 * Returns the host component of this URI. 1272 * 1273 * <p> The host component of a URI, if defined, will have one of the 1274 * following forms: </p> 1275 * 1276 * <ul> 1277 * 1278 * <li><p> A domain name consisting of one or more <i>labels</i> 1279 * separated by period characters ({@code '.'}), optionally followed by 1280 * a period character. Each label consists of <i>alphanum</i> characters 1281 * as well as hyphen characters ({@code '-'}), though hyphens never 1282 * occur as the first or last characters in a label. The rightmost 1283 * label of a domain name consisting of two or more labels, begins 1284 * with an <i>alpha</i> character. </li> 1285 * 1286 * <li><p> A dotted-quad IPv4 address of the form 1287 * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, 1288 * where no <i>digit</i> sequence is longer than three characters and no 1289 * sequence has a value larger than 255. 
</p></li> 1290 * 1291 * <li><p> An IPv6 address enclosed in square brackets ({@code '['} and 1292 * {@code ']'}) and consisting of hexadecimal digits, colon characters 1293 * ({@code ':'}), and possibly an embedded IPv4 address. The full 1294 * syntax of IPv6 addresses is specified in <a 1295 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 1296 * Addressing Architecture</i></a>. </p></li> 1297 * 1298 * </ul> 1299 * 1300 * The host component of a URI cannot contain escaped octets, hence this 1301 * method does not perform any decoding. 1302 * 1303 * @return The host component of this URI, 1304 * or {@code null} if the host is undefined 1305 */ 1306 public String getHost() { 1307 return host; 1308 } 1309 1310 /** 1311 * Returns the port number of this URI. 1312 * 1313 * <p> The port component of a URI, if defined, is a non-negative 1314 * integer. </p> 1315 * 1316 * @return The port component of this URI, 1317 * or {@code -1} if the port is undefined 1318 */ 1319 public int getPort() { 1320 return port; 1321 } 1322 1323 /** 1324 * Returns the raw path component of this URI. 1325 * 1326 * <p> The path component of a URI, if defined, only contains the slash 1327 * character ({@code '/'}), the commercial-at character ({@code '@'}), 1328 * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, 1329 * and <i>other</i> categories. </p> 1330 * 1331 * @return The path component of this URI, 1332 * or {@code null} if the path is undefined 1333 */ 1334 public String getRawPath() { 1335 return path; 1336 } 1337 1338 /** 1339 * Returns the decoded path component of this URI. 1340 * 1341 * <p> The string returned by this method is equal to that returned by the 1342 * {@link #getRawPath() getRawPath} method except that all sequences of 1343 * escaped octets are <a href="#decode">decoded</a>. 
</p> 1344 * 1345 * @return The decoded path component of this URI, 1346 * or {@code null} if the path is undefined 1347 */ 1348 public String getPath() { 1349 String decoded = decodedPath; 1350 if ((decoded == null) && (path != null)) { 1351 decodedPath = decoded = decode(path); 1352 } 1353 return decoded; 1354 } 1355 1356 /** 1357 * Returns the raw query component of this URI. 1358 * 1359 * <p> The query component of a URI, if defined, only contains legal URI 1360 * characters. </p> 1361 * 1362 * @return The raw query component of this URI, 1363 * or {@code null} if the query is undefined 1364 */ 1365 public String getRawQuery() { 1366 return query; 1367 } 1368 1369 /** 1370 * Returns the decoded query component of this URI. 1371 * 1372 * <p> The string returned by this method is equal to that returned by the 1373 * {@link #getRawQuery() getRawQuery} method except that all sequences of 1374 * escaped octets are <a href="#decode">decoded</a>. </p> 1375 * 1376 * @return The decoded query component of this URI, 1377 * or {@code null} if the query is undefined 1378 */ 1379 public String getQuery() { 1380 String decoded = decodedQuery; 1381 if ((decoded == null) && (query != null)) { 1382 decodedQuery = decoded = decode(query, false); 1383 } 1384 return decoded; 1385 } 1386 1387 /** 1388 * Returns the raw fragment component of this URI. 1389 * 1390 * <p> The fragment component of a URI, if defined, only contains legal URI 1391 * characters. </p> 1392 * 1393 * @return The raw fragment component of this URI, 1394 * or {@code null} if the fragment is undefined 1395 */ 1396 public String getRawFragment() { 1397 return fragment; 1398 } 1399 1400 /** 1401 * Returns the decoded fragment component of this URI. 1402 * 1403 * <p> The string returned by this method is equal to that returned by the 1404 * {@link #getRawFragment() getRawFragment} method except that all 1405 * sequences of escaped octets are <a href="#decode">decoded</a>. 
</p> 1406 * 1407 * @return The decoded fragment component of this URI, 1408 * or {@code null} if the fragment is undefined 1409 */ 1410 public String getFragment() { 1411 String decoded = decodedFragment; 1412 if ((decoded == null) && (fragment != null)) { 1413 decodedFragment = decoded = decode(fragment, false); 1414 } 1415 return decoded; 1416 } 1417 1418 1419 // -- Equality, comparison, hash code, toString, and serialization -- 1420 1421 /** 1422 * Tests this URI for equality with another object. 1423 * 1424 * <p> If the given object is not a URI then this method immediately 1425 * returns {@code false}. 1426 * 1427 * <p> For two URIs to be considered equal requires that either both are 1428 * opaque or both are hierarchical. Their schemes must either both be 1429 * undefined or else be equal without regard to case. Their fragments 1430 * must either both be undefined or else be equal. 1431 * 1432 * <p> For two opaque URIs to be considered equal, their scheme-specific 1433 * parts must be equal. 1434 * 1435 * <p> For two hierarchical URIs to be considered equal, their paths must 1436 * be equal and their queries must either both be undefined or else be 1437 * equal. Their authorities must either both be undefined, or both be 1438 * registry-based, or both be server-based. If their authorities are 1439 * defined and are registry-based, then they must be equal. If their 1440 * authorities are defined and are server-based, then their hosts must be 1441 * equal without regard to case, their port numbers must be equal, and 1442 * their user-information components must be equal. 1443 * 1444 * <p> When testing the user-information, path, query, fragment, authority, 1445 * or scheme-specific parts of two URIs for equality, the raw forms rather 1446 * than the encoded forms of these components are compared and the 1447 * hexadecimal digits of escaped octets are compared without regard to 1448 * case. 
1449 * 1450 * <p> This method satisfies the general contract of the {@link 1451 * java.lang.Object#equals(Object) Object.equals} method. </p> 1452 * 1453 * @param ob The object to which this object is to be compared 1454 * 1455 * @return {@code true} if, and only if, the given object is a URI that 1456 * is identical to this URI 1457 */ 1458 public boolean equals(Object ob) { 1459 if (ob == this) 1460 return true; 1461 if (!(ob instanceof URI)) 1462 return false; 1463 URI that = (URI)ob; 1464 if (this.isOpaque() != that.isOpaque()) return false; 1465 if (!equalIgnoringCase(this.scheme, that.scheme)) return false; 1466 if (!equal(this.fragment, that.fragment)) return false; 1467 1468 // Opaque 1469 if (this.isOpaque()) 1470 return equal(this.schemeSpecificPart, that.schemeSpecificPart); 1471 1472 // Hierarchical 1473 if (!equal(this.path, that.path)) return false; 1474 if (!equal(this.query, that.query)) return false; 1475 1476 // Authorities 1477 if (this.authority == that.authority) return true; 1478 if (this.host != null) { 1479 // Server-based 1480 if (!equal(this.userInfo, that.userInfo)) return false; 1481 if (!equalIgnoringCase(this.host, that.host)) return false; 1482 if (this.port != that.port) return false; 1483 } else if (this.authority != null) { 1484 // Registry-based 1485 if (!equal(this.authority, that.authority)) return false; 1486 } else if (this.authority != that.authority) { 1487 return false; 1488 } 1489 1490 return true; 1491 } 1492 1493 /** 1494 * Returns a hash-code value for this URI. The hash code is based upon all 1495 * of the URI's components, and satisfies the general contract of the 1496 * {@link java.lang.Object#hashCode() Object.hashCode} method. 
1497 * 1498 * @return A hash-code value for this URI 1499 */ 1500 public int hashCode() { 1501 int h = hash; 1502 if (h == 0) { 1503 h = hashIgnoringCase(0, scheme); 1504 h = hash(h, fragment); 1505 if (isOpaque()) { 1506 h = hash(h, schemeSpecificPart); 1507 } else { 1508 h = hash(h, path); 1509 h = hash(h, query); 1510 if (host != null) { 1511 h = hash(h, userInfo); 1512 h = hashIgnoringCase(h, host); 1513 h += 1949 * port; 1514 } else { 1515 h = hash(h, authority); 1516 } 1517 } 1518 if (h != 0) { 1519 hash = h; 1520 } 1521 } 1522 return h; 1523 } 1524 1525 /** 1526 * Compares this URI to another object, which must be a URI. 1527 * 1528 * <p> When comparing corresponding components of two URIs, if one 1529 * component is undefined but the other is defined then the first is 1530 * considered to be less than the second. Unless otherwise noted, string 1531 * components are ordered according to their natural, case-sensitive 1532 * ordering as defined by the {@link java.lang.String#compareTo(Object) 1533 * String.compareTo} method. String components that are subject to 1534 * encoding are compared by comparing their raw forms rather than their 1535 * encoded forms. 1536 * 1537 * <p> The ordering of URIs is defined as follows: </p> 1538 * 1539 * <ul> 1540 * 1541 * <li><p> Two URIs with different schemes are ordered according the 1542 * ordering of their schemes, without regard to case. </p></li> 1543 * 1544 * <li><p> A hierarchical URI is considered to be less than an opaque URI 1545 * with an identical scheme. </p></li> 1546 * 1547 * <li><p> Two opaque URIs with identical schemes are ordered according 1548 * to the ordering of their scheme-specific parts. </p></li> 1549 * 1550 * <li><p> Two opaque URIs with identical schemes and scheme-specific 1551 * parts are ordered according to the ordering of their 1552 * fragments. 
</p></li>
     *
     *   <li><p> Two hierarchical URIs with identical schemes are ordered
     *   according to the ordering of their authority components: </p>
     *
     *   <ul>
     *
     *     <li><p> If both authority components are server-based then the URIs
     *     are ordered according to their user-information components; if these
     *     components are identical then the URIs are ordered according to the
     *     ordering of their hosts, without regard to case; if the hosts are
     *     identical then the URIs are ordered according to the ordering of
     *     their ports. </p></li>
     *
     *     <li><p> If one or both authority components are registry-based then
     *     the URIs are ordered according to the ordering of their authority
     *     components. </p></li>
     *
     *   </ul></li>
     *
     *   <li><p> Finally, two hierarchical URIs with identical schemes and
     *   authority components are ordered according to the ordering of their
     *   paths; if their paths are identical then they are ordered according to
     *   the ordering of their queries; if the queries are identical then they
     *   are ordered according to the order of their fragments. </p></li>
     *
     * </ul>
     *
     * <p> This method satisfies the general contract of the {@link
     * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
     * method.
</p> 1583 * 1584 * @param that 1585 * The object to which this URI is to be compared 1586 * 1587 * @return A negative integer, zero, or a positive integer as this URI is 1588 * less than, equal to, or greater than the given URI 1589 * 1590 * @throws ClassCastException 1591 * If the given object is not a URI 1592 */ 1593 public int compareTo(URI that) { 1594 int c; 1595 1596 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) 1597 return c; 1598 1599 if (this.isOpaque()) { 1600 if (that.isOpaque()) { 1601 // Both opaque 1602 if ((c = compare(this.schemeSpecificPart, 1603 that.schemeSpecificPart)) != 0) 1604 return c; 1605 return compare(this.fragment, that.fragment); 1606 } 1607 return +1; // Opaque > hierarchical 1608 } else if (that.isOpaque()) { 1609 return -1; // Hierarchical < opaque 1610 } 1611 1612 // Hierarchical 1613 if ((this.host != null) && (that.host != null)) { 1614 // Both server-based 1615 if ((c = compare(this.userInfo, that.userInfo)) != 0) 1616 return c; 1617 if ((c = compareIgnoringCase(this.host, that.host)) != 0) 1618 return c; 1619 if ((c = this.port - that.port) != 0) 1620 return c; 1621 } else { 1622 // If one or both authorities are registry-based then we simply 1623 // compare them in the usual, case-sensitive way. If one is 1624 // registry-based and one is server-based then the strings are 1625 // guaranteed to be unequal, hence the comparison will never return 1626 // zero and the compareTo and equals methods will remain 1627 // consistent. 1628 if ((c = compare(this.authority, that.authority)) != 0) return c; 1629 } 1630 1631 if ((c = compare(this.path, that.path)) != 0) return c; 1632 if ((c = compare(this.query, that.query)) != 0) return c; 1633 return compare(this.fragment, that.fragment); 1634 } 1635 1636 /** 1637 * Returns the content of this URI as a string. 
1638 * 1639 * <p> If this URI was created by invoking one of the constructors in this 1640 * class then a string equivalent to the original input string, or to the 1641 * string computed from the originally-given components, as appropriate, is 1642 * returned. Otherwise this URI was created by normalization, resolution, 1643 * or relativization, and so a string is constructed from this URI's 1644 * components according to the rules specified in <a 1645 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1646 * section 5.2, step 7. </p> 1647 * 1648 * @return The string form of this URI 1649 */ 1650 public String toString() { 1651 String s = string; 1652 if (s == null) { 1653 s = defineString(); 1654 } 1655 return s; 1656 } 1657 1658 private String defineString() { 1659 String s = string; 1660 if (s != null) { 1661 return s; 1662 } 1663 1664 StringBuilder sb = new StringBuilder(); 1665 if (scheme != null) { 1666 sb.append(scheme); 1667 sb.append(':'); 1668 } 1669 if (isOpaque()) { 1670 sb.append(schemeSpecificPart); 1671 } else { 1672 if (host != null) { 1673 sb.append("//"); 1674 if (userInfo != null) { 1675 sb.append(userInfo); 1676 sb.append('@'); 1677 } 1678 boolean needBrackets = ((host.indexOf(':') >= 0) 1679 && !host.startsWith("[") 1680 && !host.endsWith("]")); 1681 if (needBrackets) sb.append('['); 1682 sb.append(host); 1683 if (needBrackets) sb.append(']'); 1684 if (port != -1) { 1685 sb.append(':'); 1686 sb.append(port); 1687 } 1688 } else if (authority != null) { 1689 sb.append("//"); 1690 sb.append(authority); 1691 } 1692 if (path != null) 1693 sb.append(path); 1694 if (query != null) { 1695 sb.append('?'); 1696 sb.append(query); 1697 } 1698 } 1699 if (fragment != null) { 1700 sb.append('#'); 1701 sb.append(fragment); 1702 } 1703 return string = sb.toString(); 1704 } 1705 1706 /** 1707 * Returns the content of this URI as a US-ASCII string. 
1708 * 1709 * <p> If this URI does not contain any characters in the <i>other</i> 1710 * category then an invocation of this method will return the same value as 1711 * an invocation of the {@link #toString() toString} method. Otherwise 1712 * this method works as if by invoking that method and then <a 1713 * href="#encode">encoding</a> the result. </p> 1714 * 1715 * @return The string form of this URI, encoded as needed 1716 * so that it only contains characters in the US-ASCII 1717 * charset 1718 */ 1719 public String toASCIIString() { 1720 return encode(toString()); 1721 } 1722 1723 1724 // -- Serialization support -- 1725 1726 /** 1727 * Saves the content of this URI to the given serial stream. 1728 * 1729 * <p> The only serializable field of a URI instance is its {@code string} 1730 * field. That field is given a value, if it does not have one already, 1731 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} 1732 * method of the given object-output stream is invoked. </p> 1733 * 1734 * @param os The object-output stream to which this object 1735 * is to be written 1736 */ 1737 private void writeObject(ObjectOutputStream os) 1738 throws IOException 1739 { 1740 defineString(); 1741 os.defaultWriteObject(); // Writes the string field only 1742 } 1743 1744 /** 1745 * Reconstitutes a URI from the given serial stream. 1746 * 1747 * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is 1748 * invoked to read the value of the {@code string} field. The result is 1749 * then parsed in the usual way. 
1750 * 1751 * @param is The object-input stream from which this object 1752 * is being read 1753 */ 1754 private void readObject(ObjectInputStream is) 1755 throws ClassNotFoundException, IOException 1756 { 1757 port = -1; // Argh 1758 is.defaultReadObject(); 1759 try { 1760 new Parser(string).parse(false); 1761 } catch (URISyntaxException x) { 1762 IOException y = new InvalidObjectException("Invalid URI"); 1763 y.initCause(x); 1764 throw y; 1765 } 1766 } 1767 1768 1769 // -- End of public methods -- 1770 1771 1772 // -- Utility methods for string-field comparison and hashing -- 1773 1774 // These methods return appropriate values for null string arguments, 1775 // thereby simplifying the equals, hashCode, and compareTo methods. 1776 // 1777 // The case-ignoring methods should only be applied to strings whose 1778 // characters are all known to be US-ASCII. Because of this restriction, 1779 // these methods are faster than the similar methods in the String class. 1780 1781 // US-ASCII only 1782 private static int toLower(char c) { 1783 if ((c >= 'A') && (c <= 'Z')) 1784 return c + ('a' - 'A'); 1785 return c; 1786 } 1787 1788 // US-ASCII only 1789 private static int toUpper(char c) { 1790 if ((c >= 'a') && (c <= 'z')) 1791 return c - ('a' - 'A'); 1792 return c; 1793 } 1794 1795 private static boolean equal(String s, String t) { 1796 if (s == t) return true; 1797 if ((s != null) && (t != null)) { 1798 if (s.length() != t.length()) 1799 return false; 1800 if (s.indexOf('%') < 0) 1801 return s.equals(t); 1802 int n = s.length(); 1803 for (int i = 0; i < n;) { 1804 char c = s.charAt(i); 1805 char d = t.charAt(i); 1806 if (c != '%') { 1807 if (c != d) 1808 return false; 1809 i++; 1810 continue; 1811 } 1812 if (d != '%') 1813 return false; 1814 i++; 1815 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1816 return false; 1817 i++; 1818 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1819 return false; 1820 i++; 1821 } 1822 return true; 1823 } 1824 return false; 1825 } 
1826 1827 // US-ASCII only 1828 private static boolean equalIgnoringCase(String s, String t) { 1829 if (s == t) return true; 1830 if ((s != null) && (t != null)) { 1831 int n = s.length(); 1832 if (t.length() != n) 1833 return false; 1834 for (int i = 0; i < n; i++) { 1835 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1836 return false; 1837 } 1838 return true; 1839 } 1840 return false; 1841 } 1842 1843 private static int hash(int hash, String s) { 1844 if (s == null) return hash; 1845 return s.indexOf('%') < 0 ? hash * 127 + s.hashCode() 1846 : normalizedHash(hash, s); 1847 } 1848 1849 1850 private static int normalizedHash(int hash, String s) { 1851 int h = 0; 1852 for (int index = 0; index < s.length(); index++) { 1853 char ch = s.charAt(index); 1854 h = 31 * h + ch; 1855 if (ch == '%') { 1856 /* 1857 * Process the next two encoded characters 1858 */ 1859 for (int i = index + 1; i < index + 3; i++) 1860 h = 31 * h + toUpper(s.charAt(i)); 1861 index += 2; 1862 } 1863 } 1864 return hash * 127 + h; 1865 } 1866 1867 // US-ASCII only 1868 private static int hashIgnoringCase(int hash, String s) { 1869 if (s == null) return hash; 1870 int h = hash; 1871 int n = s.length(); 1872 for (int i = 0; i < n; i++) 1873 h = 31 * h + toLower(s.charAt(i)); 1874 return h; 1875 } 1876 1877 private static int compare(String s, String t) { 1878 if (s == t) return 0; 1879 if (s != null) { 1880 if (t != null) 1881 return s.compareTo(t); 1882 else 1883 return +1; 1884 } else { 1885 return -1; 1886 } 1887 } 1888 1889 // US-ASCII only 1890 private static int compareIgnoringCase(String s, String t) { 1891 if (s == t) return 0; 1892 if (s != null) { 1893 if (t != null) { 1894 int sn = s.length(); 1895 int tn = t.length(); 1896 int n = sn < tn ? 
sn : tn; 1897 for (int i = 0; i < n; i++) { 1898 int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1899 if (c != 0) 1900 return c; 1901 } 1902 return sn - tn; 1903 } 1904 return +1; 1905 } else { 1906 return -1; 1907 } 1908 } 1909 1910 1911 // -- String construction -- 1912 1913 // If a scheme is given then the path, if given, must be absolute 1914 // 1915 private static void checkPath(String s, String scheme, String path) 1916 throws URISyntaxException 1917 { 1918 if (scheme != null) { 1919 if ((path != null) 1920 && ((path.length() > 0) && (path.charAt(0) != '/'))) 1921 throw new URISyntaxException(s, 1922 "Relative path in absolute URI"); 1923 } 1924 } 1925 1926 private void appendAuthority(StringBuilder sb, 1927 String authority, 1928 String userInfo, 1929 String host, 1930 int port) 1931 { 1932 if (host != null) { 1933 sb.append("//"); 1934 if (userInfo != null) { 1935 sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); 1936 sb.append('@'); 1937 } 1938 boolean needBrackets = ((host.indexOf(':') >= 0) 1939 && !host.startsWith("[") 1940 && !host.endsWith("]")); 1941 if (needBrackets) sb.append('['); 1942 sb.append(host); 1943 if (needBrackets) sb.append(']'); 1944 if (port != -1) { 1945 sb.append(':'); 1946 sb.append(port); 1947 } 1948 } else if (authority != null) { 1949 sb.append("//"); 1950 if (authority.startsWith("[")) { 1951 // authority should (but may not) contain an embedded IPv6 address 1952 int end = authority.indexOf(']'); 1953 String doquote = authority, dontquote = ""; 1954 if (end != -1 && authority.indexOf(':') != -1) { 1955 // the authority contains an IPv6 address 1956 if (end == authority.length()) { 1957 dontquote = authority; 1958 doquote = ""; 1959 } else { 1960 dontquote = authority.substring(0 , end + 1); 1961 doquote = authority.substring(end + 1); 1962 } 1963 } 1964 sb.append(dontquote); 1965 sb.append(quote(doquote, 1966 L_REG_NAME | L_SERVER, 1967 H_REG_NAME | H_SERVER)); 1968 } else { 1969 sb.append(quote(authority, 1970 
L_REG_NAME | L_SERVER, 1971 H_REG_NAME | H_SERVER)); 1972 } 1973 } 1974 } 1975 1976 private void appendSchemeSpecificPart(StringBuilder sb, 1977 String opaquePart, 1978 String authority, 1979 String userInfo, 1980 String host, 1981 int port, 1982 String path, 1983 String query) 1984 { 1985 if (opaquePart != null) { 1986 /* check if SSP begins with an IPv6 address 1987 * because we must not quote a literal IPv6 address 1988 */ 1989 if (opaquePart.startsWith("//[")) { 1990 int end = opaquePart.indexOf(']'); 1991 if (end != -1 && opaquePart.indexOf(':')!=-1) { 1992 String doquote, dontquote; 1993 if (end == opaquePart.length()) { 1994 dontquote = opaquePart; 1995 doquote = ""; 1996 } else { 1997 dontquote = opaquePart.substring(0,end+1); 1998 doquote = opaquePart.substring(end+1); 1999 } 2000 sb.append (dontquote); 2001 sb.append(quote(doquote, L_URIC, H_URIC)); 2002 } 2003 } else { 2004 sb.append(quote(opaquePart, L_URIC, H_URIC)); 2005 } 2006 } else { 2007 appendAuthority(sb, authority, userInfo, host, port); 2008 if (path != null) 2009 sb.append(quote(path, L_PATH, H_PATH)); 2010 if (query != null) { 2011 sb.append('?'); 2012 sb.append(quote(query, L_URIC, H_URIC)); 2013 } 2014 } 2015 } 2016 2017 private void appendFragment(StringBuilder sb, String fragment) { 2018 if (fragment != null) { 2019 sb.append('#'); 2020 sb.append(quote(fragment, L_URIC, H_URIC)); 2021 } 2022 } 2023 2024 private String toString(String scheme, 2025 String opaquePart, 2026 String authority, 2027 String userInfo, 2028 String host, 2029 int port, 2030 String path, 2031 String query, 2032 String fragment) 2033 { 2034 StringBuilder sb = new StringBuilder(); 2035 if (scheme != null) { 2036 sb.append(scheme); 2037 sb.append(':'); 2038 } 2039 appendSchemeSpecificPart(sb, opaquePart, 2040 authority, userInfo, host, port, 2041 path, query); 2042 appendFragment(sb, fragment); 2043 return sb.toString(); 2044 } 2045 2046 // -- Normalization, resolution, and relativization -- 2047 2048 // RFC2396 5.2 
(6) 2049 private static String resolvePath(String base, String child, 2050 boolean absolute) 2051 { 2052 int i = base.lastIndexOf('/'); 2053 int cn = child.length(); 2054 String path = ""; 2055 2056 if (cn == 0) { 2057 // 5.2 (6a) 2058 if (i >= 0) 2059 path = base.substring(0, i + 1); 2060 } else { 2061 StringBuilder sb = new StringBuilder(base.length() + cn); 2062 // 5.2 (6a) 2063 if (i >= 0) 2064 sb.append(base, 0, i + 1); 2065 // 5.2 (6b) 2066 sb.append(child); 2067 path = sb.toString(); 2068 } 2069 2070 // 5.2 (6c-f) 2071 String np = normalize(path); 2072 2073 // 5.2 (6g): If the result is absolute but the path begins with "../", 2074 // then we simply leave the path as-is 2075 2076 return np; 2077 } 2078 2079 // RFC2396 5.2 2080 private static URI resolve(URI base, URI child) { 2081 // check if child if opaque first so that NPE is thrown 2082 // if child is null. 2083 if (child.isOpaque() || base.isOpaque()) 2084 return child; 2085 2086 // 5.2 (2): Reference to current document (lone fragment) 2087 if ((child.scheme == null) && (child.authority == null) 2088 && child.path.isEmpty() && (child.fragment != null) 2089 && (child.query == null)) { 2090 if ((base.fragment != null) 2091 && child.fragment.equals(base.fragment)) { 2092 return base; 2093 } 2094 URI ru = new URI(); 2095 ru.scheme = base.scheme; 2096 ru.authority = base.authority; 2097 ru.userInfo = base.userInfo; 2098 ru.host = base.host; 2099 ru.port = base.port; 2100 ru.path = base.path; 2101 ru.fragment = child.fragment; 2102 ru.query = base.query; 2103 return ru; 2104 } 2105 2106 // 5.2 (3): Child is absolute 2107 if (child.scheme != null) 2108 return child; 2109 2110 URI ru = new URI(); // Resolved URI 2111 ru.scheme = base.scheme; 2112 ru.query = child.query; 2113 ru.fragment = child.fragment; 2114 2115 // 5.2 (4): Authority 2116 if (child.authority == null) { 2117 ru.authority = base.authority; 2118 ru.host = base.host; 2119 ru.userInfo = base.userInfo; 2120 ru.port = base.port; 2121 2122 String cp 
= (child.path == null) ? "" : child.path; 2123 if ((cp.length() > 0) && (cp.charAt(0) == '/')) { 2124 // 5.2 (5): Child path is absolute 2125 ru.path = child.path; 2126 } else { 2127 // 5.2 (6): Resolve relative path 2128 ru.path = resolvePath(base.path, cp, base.isAbsolute()); 2129 } 2130 } else { 2131 ru.authority = child.authority; 2132 ru.host = child.host; 2133 ru.userInfo = child.userInfo; 2134 ru.host = child.host; 2135 ru.port = child.port; 2136 ru.path = child.path; 2137 } 2138 2139 // 5.2 (7): Recombine (nothing to do here) 2140 return ru; 2141 } 2142 2143 // If the given URI's path is normal then return the URI; 2144 // o.w., return a new URI containing the normalized path. 2145 // 2146 private static URI normalize(URI u) { 2147 if (u.isOpaque() || (u.path == null) || (u.path.length() == 0)) 2148 return u; 2149 2150 String np = normalize(u.path); 2151 if (np == u.path) 2152 return u; 2153 2154 URI v = new URI(); 2155 v.scheme = u.scheme; 2156 v.fragment = u.fragment; 2157 v.authority = u.authority; 2158 v.userInfo = u.userInfo; 2159 v.host = u.host; 2160 v.port = u.port; 2161 v.path = np; 2162 v.query = u.query; 2163 return v; 2164 } 2165 2166 // If both URIs are hierarchical, their scheme and authority components are 2167 // identical, and the base path is a prefix of the child's path, then 2168 // return a relative URI that, when resolved against the base, yields the 2169 // child; otherwise, return the child. 2170 // 2171 private static URI relativize(URI base, URI child) { 2172 // check if child if opaque first so that NPE is thrown 2173 // if child is null. 
2174 if (child.isOpaque() || base.isOpaque()) 2175 return child; 2176 if (!equalIgnoringCase(base.scheme, child.scheme) 2177 || !equal(base.authority, child.authority)) 2178 return child; 2179 2180 String bp = normalize(base.path); 2181 String cp = normalize(child.path); 2182 if (!bp.equals(cp)) { 2183 if (!bp.endsWith("/")) 2184 bp = bp + "/"; 2185 if (!cp.startsWith(bp)) 2186 return child; 2187 } 2188 2189 URI v = new URI(); 2190 v.path = cp.substring(bp.length()); 2191 v.query = child.query; 2192 v.fragment = child.fragment; 2193 return v; 2194 } 2195 2196 2197 2198 // -- Path normalization -- 2199 2200 // The following algorithm for path normalization avoids the creation of a 2201 // string object for each segment, as well as the use of a string buffer to 2202 // compute the final result, by using a single char array and editing it in 2203 // place. The array is first split into segments, replacing each slash 2204 // with '\0' and creating a segment-index array, each element of which is 2205 // the index of the first char in the corresponding segment. We then walk 2206 // through both arrays, removing ".", "..", and other segments as necessary 2207 // by setting their entries in the index array to -1. Finally, the two 2208 // arrays are used to rejoin the segments and compute the final result. 2209 // 2210 // This code is based upon src/solaris/native/java/io/canonicalize_md.c 2211 2212 2213 // Check the given path to see if it might need normalization. A path 2214 // might need normalization if it contains duplicate slashes, a "." 2215 // segment, or a ".." segment. Return -1 if no further normalization is 2216 // possible, otherwise return the number of segments found. 2217 // 2218 // This method takes a string argument rather than a char array so that 2219 // this test can be performed without invoking path.toCharArray(). 
2220 // 2221 private static int needsNormalization(String path) { 2222 boolean normal = true; 2223 int ns = 0; // Number of segments 2224 int end = path.length() - 1; // Index of last char in path 2225 int p = 0; // Index of next char in path 2226 2227 // Skip initial slashes 2228 while (p <= end) { 2229 if (path.charAt(p) != '/') break; 2230 p++; 2231 } 2232 if (p > 1) normal = false; 2233 2234 // Scan segments 2235 while (p <= end) { 2236 2237 // Looking at "." or ".." ? 2238 if ((path.charAt(p) == '.') 2239 && ((p == end) 2240 || ((path.charAt(p + 1) == '/') 2241 || ((path.charAt(p + 1) == '.') 2242 && ((p + 1 == end) 2243 || (path.charAt(p + 2) == '/')))))) { 2244 normal = false; 2245 } 2246 ns++; 2247 2248 // Find beginning of next segment 2249 while (p <= end) { 2250 if (path.charAt(p++) != '/') 2251 continue; 2252 2253 // Skip redundant slashes 2254 while (p <= end) { 2255 if (path.charAt(p) != '/') break; 2256 normal = false; 2257 p++; 2258 } 2259 2260 break; 2261 } 2262 } 2263 2264 return normal ? -1 : ns; 2265 } 2266 2267 2268 // Split the given path into segments, replacing slashes with nulls and 2269 // filling in the given segment-index array. 
2270 // 2271 // Preconditions: 2272 // segs.length == Number of segments in path 2273 // 2274 // Postconditions: 2275 // All slashes in path replaced by '\0' 2276 // segs[i] == Index of first char in segment i (0 <= i < segs.length) 2277 // 2278 private static void split(char[] path, int[] segs) { 2279 int end = path.length - 1; // Index of last char in path 2280 int p = 0; // Index of next char in path 2281 int i = 0; // Index of current segment 2282 2283 // Skip initial slashes 2284 while (p <= end) { 2285 if (path[p] != '/') break; 2286 path[p] = '\0'; 2287 p++; 2288 } 2289 2290 while (p <= end) { 2291 2292 // Note start of segment 2293 segs[i++] = p++; 2294 2295 // Find beginning of next segment 2296 while (p <= end) { 2297 if (path[p++] != '/') 2298 continue; 2299 path[p - 1] = '\0'; 2300 2301 // Skip redundant slashes 2302 while (p <= end) { 2303 if (path[p] != '/') break; 2304 path[p++] = '\0'; 2305 } 2306 break; 2307 } 2308 } 2309 2310 if (i != segs.length) 2311 throw new InternalError(); // ASSERT 2312 } 2313 2314 2315 // Join the segments in the given path according to the given segment-index 2316 // array, ignoring those segments whose index entries have been set to -1, 2317 // and inserting slashes as needed. Return the length of the resulting 2318 // path. 2319 // 2320 // Preconditions: 2321 // segs[i] == -1 implies segment i is to be ignored 2322 // path computed by split, as above, with '\0' having replaced '/' 2323 // 2324 // Postconditions: 2325 // path[0] .. 
path[return value] == Resulting path 2326 // 2327 private static int join(char[] path, int[] segs) { 2328 int ns = segs.length; // Number of segments 2329 int end = path.length - 1; // Index of last char in path 2330 int p = 0; // Index of next path char to write 2331 2332 if (path[p] == '\0') { 2333 // Restore initial slash for absolute paths 2334 path[p++] = '/'; 2335 } 2336 2337 for (int i = 0; i < ns; i++) { 2338 int q = segs[i]; // Current segment 2339 if (q == -1) 2340 // Ignore this segment 2341 continue; 2342 2343 if (p == q) { 2344 // We're already at this segment, so just skip to its end 2345 while ((p <= end) && (path[p] != '\0')) 2346 p++; 2347 if (p <= end) { 2348 // Preserve trailing slash 2349 path[p++] = '/'; 2350 } 2351 } else if (p < q) { 2352 // Copy q down to p 2353 while ((q <= end) && (path[q] != '\0')) 2354 path[p++] = path[q++]; 2355 if (q <= end) { 2356 // Preserve trailing slash 2357 path[p++] = '/'; 2358 } 2359 } else 2360 throw new InternalError(); // ASSERT false 2361 } 2362 2363 return p; 2364 } 2365 2366 2367 // Remove "." segments from the given path, and remove segment pairs 2368 // consisting of a non-".." segment followed by a ".." segment. 2369 // 2370 private static void removeDots(char[] path, int[] segs) { 2371 int ns = segs.length; 2372 int end = path.length - 1; 2373 2374 for (int i = 0; i < ns; i++) { 2375 int dots = 0; // Number of dots found (0, 1, or 2) 2376 2377 // Find next occurrence of "." or ".." 2378 do { 2379 int p = segs[i]; 2380 if (path[p] == '.') { 2381 if (p == end) { 2382 dots = 1; 2383 break; 2384 } else if (path[p + 1] == '\0') { 2385 dots = 1; 2386 break; 2387 } else if ((path[p + 1] == '.') 2388 && ((p + 1 == end) 2389 || (path[p + 2] == '\0'))) { 2390 dots = 2; 2391 break; 2392 } 2393 } 2394 i++; 2395 } while (i < ns); 2396 if ((i > ns) || (dots == 0)) 2397 break; 2398 2399 if (dots == 1) { 2400 // Remove this occurrence of "." 2401 segs[i] = -1; 2402 } else { 2403 // If there is a preceding non-".." 
segment, remove both that 2404 // segment and this occurrence of ".."; otherwise, leave this 2405 // ".." segment as-is. 2406 int j; 2407 for (j = i - 1; j >= 0; j--) { 2408 if (segs[j] != -1) break; 2409 } 2410 if (j >= 0) { 2411 int q = segs[j]; 2412 if (!((path[q] == '.') 2413 && (path[q + 1] == '.') 2414 && (path[q + 2] == '\0'))) { 2415 segs[i] = -1; 2416 segs[j] = -1; 2417 } 2418 } 2419 } 2420 } 2421 } 2422 2423 2424 // DEVIATION: If the normalized path is relative, and if the first 2425 // segment could be parsed as a scheme name, then prepend a "." segment 2426 // 2427 private static void maybeAddLeadingDot(char[] path, int[] segs) { 2428 2429 if (path[0] == '\0') 2430 // The path is absolute 2431 return; 2432 2433 int ns = segs.length; 2434 int f = 0; // Index of first segment 2435 while (f < ns) { 2436 if (segs[f] >= 0) 2437 break; 2438 f++; 2439 } 2440 if ((f >= ns) || (f == 0)) 2441 // The path is empty, or else the original first segment survived, 2442 // in which case we already know that no leading "." is needed 2443 return; 2444 2445 int p = segs[f]; 2446 while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++; 2447 if (p >= path.length || path[p] == '\0') 2448 // No colon in first segment, so no "." needed 2449 return; 2450 2451 // At this point we know that the first segment is unused, 2452 // hence we can insert a "." segment at that position 2453 path[0] = '.'; 2454 path[1] = '\0'; 2455 segs[0] = 0; 2456 } 2457 2458 2459 // Normalize the given path string. A normal path string has no empty 2460 // segments (i.e., occurrences of "//"), no segments equal to ".", and no 2461 // segments equal to ".." that are preceded by a segment not equal to "..". 2462 // In contrast to Unix-style pathname normalization, for URI paths we 2463 // always retain trailing slashes. 2464 // 2465 private static String normalize(String ps) { 2466 2467 // Does this path need normalization? 
2468 int ns = needsNormalization(ps); // Number of segments 2469 if (ns < 0) 2470 // Nope -- just return it 2471 return ps; 2472 2473 char[] path = ps.toCharArray(); // Path in char-array form 2474 2475 // Split path into segments 2476 int[] segs = new int[ns]; // Segment-index array 2477 split(path, segs); 2478 2479 // Remove dots 2480 removeDots(path, segs); 2481 2482 // Prevent scheme-name confusion 2483 maybeAddLeadingDot(path, segs); 2484 2485 // Join the remaining segments and return the result 2486 String s = new String(path, 0, join(path, segs)); 2487 if (s.equals(ps)) { 2488 // string was already normalized 2489 return ps; 2490 } 2491 return s; 2492 } 2493 2494 2495 2496 // -- Character classes for parsing -- 2497 2498 // RFC2396 precisely specifies which characters in the US-ASCII charset are 2499 // permissible in the various components of a URI reference. We here 2500 // define a set of mask pairs to aid in enforcing these restrictions. Each 2501 // mask pair consists of two longs, a low mask and a high mask. Taken 2502 // together they represent a 128-bit mask, where bit i is set iff the 2503 // character with value i is permitted. 2504 // 2505 // This approach is more efficient than sequentially searching arrays of 2506 // permitted characters. It could be made still more efficient by 2507 // precompiling the mask information so that a character's presence in a 2508 // given mask could be determined by a single table lookup. 
2509 2510 // Compute the low-order mask for the characters in the given string 2511 private static long lowMask(String chars) { 2512 int n = chars.length(); 2513 long m = 0; 2514 for (int i = 0; i < n; i++) { 2515 char c = chars.charAt(i); 2516 if (c < 64) 2517 m |= (1L << c); 2518 } 2519 return m; 2520 } 2521 2522 // Compute the high-order mask for the characters in the given string 2523 private static long highMask(String chars) { 2524 int n = chars.length(); 2525 long m = 0; 2526 for (int i = 0; i < n; i++) { 2527 char c = chars.charAt(i); 2528 if ((c >= 64) && (c < 128)) 2529 m |= (1L << (c - 64)); 2530 } 2531 return m; 2532 } 2533 2534 // Compute a low-order mask for the characters 2535 // between first and last, inclusive 2536 private static long lowMask(char first, char last) { 2537 long m = 0; 2538 int f = Math.max(Math.min(first, 63), 0); 2539 int l = Math.max(Math.min(last, 63), 0); 2540 for (int i = f; i <= l; i++) 2541 m |= 1L << i; 2542 return m; 2543 } 2544 2545 // Compute a high-order mask for the characters 2546 // between first and last, inclusive 2547 private static long highMask(char first, char last) { 2548 long m = 0; 2549 int f = Math.max(Math.min(first, 127), 64) - 64; 2550 int l = Math.max(Math.min(last, 127), 64) - 64; 2551 for (int i = f; i <= l; i++) 2552 m |= 1L << i; 2553 return m; 2554 } 2555 2556 // Tell whether the given character is permitted by the given mask pair 2557 private static boolean match(char c, long lowMask, long highMask) { 2558 if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. 2559 return false; 2560 if (c < 64) 2561 return ((1L << c) & lowMask) != 0; 2562 if (c < 128) 2563 return ((1L << (c - 64)) & highMask) != 0; 2564 return false; 2565 } 2566 2567 // Character-class masks, in reverse order from RFC2396 because 2568 // initializers for static fields cannot make forward references. 
2569 2570 // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | 2571 // "8" | "9" 2572 private static final long L_DIGIT = lowMask('0', '9'); 2573 private static final long H_DIGIT = 0L; 2574 2575 // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | 2576 // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | 2577 // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" 2578 private static final long L_UPALPHA = 0L; 2579 private static final long H_UPALPHA = highMask('A', 'Z'); 2580 2581 // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | 2582 // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | 2583 // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" 2584 private static final long L_LOWALPHA = 0L; 2585 private static final long H_LOWALPHA = highMask('a', 'z'); 2586 2587 // alpha = lowalpha | upalpha 2588 private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA; 2589 private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA; 2590 2591 // alphanum = alpha | digit 2592 private static final long L_ALPHANUM = L_DIGIT | L_ALPHA; 2593 private static final long H_ALPHANUM = H_DIGIT | H_ALPHA; 2594 2595 // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | 2596 // "a" | "b" | "c" | "d" | "e" | "f" 2597 private static final long L_HEX = L_DIGIT; 2598 private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f'); 2599 2600 // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | 2601 // "(" | ")" 2602 private static final long L_MARK = lowMask("-_.!~*'()"); 2603 private static final long H_MARK = highMask("-_.!~*'()"); 2604 2605 // unreserved = alphanum | mark 2606 private static final long L_UNRESERVED = L_ALPHANUM | L_MARK; 2607 private static final long H_UNRESERVED = H_ALPHANUM | H_MARK; 2608 2609 // reserved = ";" | "/" | "?" 
| ":" | "@" | "&" | "=" | "+" | 2610 // "$" | "," | "[" | "]" 2611 // Added per RFC2732: "[", "]" 2612 private static final long L_RESERVED = lowMask(";/?:@&=+$,[]"); 2613 private static final long H_RESERVED = highMask(";/?:@&=+$,[]"); 2614 2615 // The zero'th bit is used to indicate that escape pairs and non-US-ASCII 2616 // characters are allowed; this is handled by the scanEscape method below. 2617 private static final long L_ESCAPED = 1L; 2618 private static final long H_ESCAPED = 0L; 2619 2620 // uric = reserved | unreserved | escaped 2621 private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED; 2622 private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED; 2623 2624 // pchar = unreserved | escaped | 2625 // ":" | "@" | "&" | "=" | "+" | "$" | "," 2626 private static final long L_PCHAR 2627 = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,"); 2628 private static final long H_PCHAR 2629 = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,"); 2630 2631 // All valid path characters 2632 private static final long L_PATH = L_PCHAR | lowMask(";/"); 2633 private static final long H_PATH = H_PCHAR | highMask(";/"); 2634 2635 // Dash, for use in domainlabel and toplabel 2636 private static final long L_DASH = lowMask("-"); 2637 private static final long H_DASH = highMask("-"); 2638 2639 // Dot, for use in hostnames 2640 private static final long L_DOT = lowMask("."); 2641 private static final long H_DOT = highMask("."); 2642 2643 // userinfo = *( unreserved | escaped | 2644 // ";" | ":" | "&" | "=" | "+" | "$" | "," ) 2645 private static final long L_USERINFO 2646 = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,"); 2647 private static final long H_USERINFO 2648 = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,"); 2649 2650 // reg_name = 1*( unreserved | escaped | "$" | "," | 2651 // ";" | ":" | "@" | "&" | "=" | "+" ) 2652 private static final long L_REG_NAME 2653 = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+"); 2654 private static final long 
H_REG_NAME 2655 = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+"); 2656 2657 // All valid characters for server-based authorities 2658 private static final long L_SERVER 2659 = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]"); 2660 private static final long H_SERVER 2661 = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]"); 2662 2663 // Special case of server authority that represents an IPv6 address 2664 // In this case, a % does not signify an escape sequence 2665 private static final long L_SERVER_PERCENT 2666 = L_SERVER | lowMask("%"); 2667 private static final long H_SERVER_PERCENT 2668 = H_SERVER | highMask("%"); 2669 private static final long L_LEFT_BRACKET = lowMask("["); 2670 private static final long H_LEFT_BRACKET = highMask("["); 2671 2672 // scheme = alpha *( alpha | digit | "+" | "-" | "." ) 2673 private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-."); 2674 private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-."); 2675 2676 // scope_id = alpha | digit | "_" | "." 
2677 private static final long L_SCOPE_ID 2678 = L_ALPHANUM | lowMask("_."); 2679 private static final long H_SCOPE_ID 2680 = H_ALPHANUM | highMask("_."); 2681 2682 // -- Escaping and encoding -- 2683 2684 private static final char[] hexDigits = { 2685 '0', '1', '2', '3', '4', '5', '6', '7', 2686 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 2687 }; 2688 2689 private static void appendEscape(StringBuilder sb, byte b) { 2690 sb.append('%'); 2691 sb.append(hexDigits[(b >> 4) & 0x0f]); 2692 sb.append(hexDigits[(b >> 0) & 0x0f]); 2693 } 2694 2695 private static void appendEncoded(StringBuilder sb, char c) { 2696 ByteBuffer bb = null; 2697 try { 2698 bb = ThreadLocalCoders.encoderFor("UTF-8") 2699 .encode(CharBuffer.wrap("" + c)); 2700 } catch (CharacterCodingException x) { 2701 assert false; 2702 } 2703 while (bb.hasRemaining()) { 2704 int b = bb.get() & 0xff; 2705 if (b >= 0x80) 2706 appendEscape(sb, (byte)b); 2707 else 2708 sb.append((char)b); 2709 } 2710 } 2711 2712 // Quote any characters in s that are not permitted 2713 // by the given mask pair 2714 // 2715 private static String quote(String s, long lowMask, long highMask) { 2716 StringBuilder sb = null; 2717 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); 2718 for (int i = 0; i < s.length(); i++) { 2719 char c = s.charAt(i); 2720 if (c < '\u0080') { 2721 if (!match(c, lowMask, highMask)) { 2722 if (sb == null) { 2723 sb = new StringBuilder(); 2724 sb.append(s, 0, i); 2725 } 2726 appendEscape(sb, (byte)c); 2727 } else { 2728 if (sb != null) 2729 sb.append(c); 2730 } 2731 } else if (allowNonASCII 2732 && (Character.isSpaceChar(c) 2733 || Character.isISOControl(c))) { 2734 if (sb == null) { 2735 sb = new StringBuilder(); 2736 sb.append(s, 0, i); 2737 } 2738 appendEncoded(sb, c); 2739 } else { 2740 if (sb != null) 2741 sb.append(c); 2742 } 2743 } 2744 return (sb == null) ? 
s : sb.toString(); 2745 } 2746 2747 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, 2748 // assuming that s is otherwise legal 2749 // 2750 private static String encode(String s) { 2751 int n = s.length(); 2752 if (n == 0) 2753 return s; 2754 2755 // First check whether we actually need to encode 2756 for (int i = 0;;) { 2757 if (s.charAt(i) >= '\u0080') 2758 break; 2759 if (++i >= n) 2760 return s; 2761 } 2762 2763 String ns = Normalizer.normalize(s, Normalizer.Form.NFC); 2764 ByteBuffer bb = null; 2765 try { 2766 bb = ThreadLocalCoders.encoderFor("UTF-8") 2767 .encode(CharBuffer.wrap(ns)); 2768 } catch (CharacterCodingException x) { 2769 assert false; 2770 } 2771 2772 StringBuilder sb = new StringBuilder(); 2773 while (bb.hasRemaining()) { 2774 int b = bb.get() & 0xff; 2775 if (b >= 0x80) 2776 appendEscape(sb, (byte)b); 2777 else 2778 sb.append((char)b); 2779 } 2780 return sb.toString(); 2781 } 2782 2783 private static int decode(char c) { 2784 if ((c >= '0') && (c <= '9')) 2785 return c - '0'; 2786 if ((c >= 'a') && (c <= 'f')) 2787 return c - 'a' + 10; 2788 if ((c >= 'A') && (c <= 'F')) 2789 return c - 'A' + 10; 2790 assert false; 2791 return -1; 2792 } 2793 2794 private static byte decode(char c1, char c2) { 2795 return (byte)( ((decode(c1) & 0xf) << 4) 2796 | ((decode(c2) & 0xf) << 0)); 2797 } 2798 2799 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes 2800 // that escapes are well-formed syntactically, i.e., of the form %XX. If a 2801 // sequence of escaped octets is not valid UTF-8 then the erroneous octets 2802 // are replaced with '\uFFFD'. 2803 // Exception: any "%" found between "[]" is left alone. 
It is an IPv6 literal 2804 // with a scope_id 2805 // 2806 private static String decode(String s) { 2807 return decode(s, true); 2808 } 2809 2810 // This method was introduced as a generalization of URI.decode method 2811 // to provide a fix for JDK-8037396 2812 private static String decode(String s, boolean ignorePercentInBrackets) { 2813 if (s == null) 2814 return s; 2815 int n = s.length(); 2816 if (n == 0) 2817 return s; 2818 if (s.indexOf('%') < 0) 2819 return s; 2820 2821 StringBuilder sb = new StringBuilder(n); 2822 ByteBuffer bb = ByteBuffer.allocate(n); 2823 CharBuffer cb = CharBuffer.allocate(n); 2824 CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8") 2825 .onMalformedInput(CodingErrorAction.REPLACE) 2826 .onUnmappableCharacter(CodingErrorAction.REPLACE); 2827 2828 // This is not horribly efficient, but it will do for now 2829 char c = s.charAt(0); 2830 boolean betweenBrackets = false; 2831 2832 for (int i = 0; i < n;) { 2833 assert c == s.charAt(i); // Loop invariant 2834 if (c == '[') { 2835 betweenBrackets = true; 2836 } else if (betweenBrackets && c == ']') { 2837 betweenBrackets = false; 2838 } 2839 if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) { 2840 sb.append(c); 2841 if (++i >= n) 2842 break; 2843 c = s.charAt(i); 2844 continue; 2845 } 2846 bb.clear(); 2847 int ui = i; 2848 for (;;) { 2849 assert (n - i >= 2); 2850 bb.put(decode(s.charAt(++i), s.charAt(++i))); 2851 if (++i >= n) 2852 break; 2853 c = s.charAt(i); 2854 if (c != '%') 2855 break; 2856 } 2857 bb.flip(); 2858 cb.clear(); 2859 dec.reset(); 2860 CoderResult cr = dec.decode(bb, cb, true); 2861 assert cr.isUnderflow(); 2862 cr = dec.flush(cb); 2863 assert cr.isUnderflow(); 2864 sb.append(cb.flip().toString()); 2865 } 2866 2867 return sb.toString(); 2868 } 2869 2870 2871 // -- Parsing -- 2872 2873 // For convenience we wrap the input URI string in a new instance of the 2874 // following internal class. 
This saves always having to pass the input 2875 // string as an argument to each internal scan/parse method. 2876 2877 private class Parser { 2878 2879 private String input; // URI input string 2880 private boolean requireServerAuthority = false; 2881 2882 Parser(String s) { 2883 input = s; 2884 string = s; 2885 } 2886 2887 // -- Methods for throwing URISyntaxException in various ways -- 2888 2889 private void fail(String reason) throws URISyntaxException { 2890 throw new URISyntaxException(input, reason); 2891 } 2892 2893 private void fail(String reason, int p) throws URISyntaxException { 2894 throw new URISyntaxException(input, reason, p); 2895 } 2896 2897 private void failExpecting(String expected, int p) 2898 throws URISyntaxException 2899 { 2900 fail("Expected " + expected, p); 2901 } 2902 2903 2904 // -- Simple access to the input string -- 2905 2906 // Tells whether start < end and, if so, whether charAt(start) == c 2907 // 2908 private boolean at(int start, int end, char c) { 2909 return (start < end) && (input.charAt(start) == c); 2910 } 2911 2912 // Tells whether start + s.length() < end and, if so, 2913 // whether the chars at the start position match s exactly 2914 // 2915 private boolean at(int start, int end, String s) { 2916 int p = start; 2917 int sn = s.length(); 2918 if (sn > end - p) 2919 return false; 2920 int i = 0; 2921 while (i < sn) { 2922 if (input.charAt(p++) != s.charAt(i)) { 2923 break; 2924 } 2925 i++; 2926 } 2927 return (i == sn); 2928 } 2929 2930 2931 // -- Scanning -- 2932 2933 // The various scan and parse methods that follow use a uniform 2934 // convention of taking the current start position and end index as 2935 // their first two arguments. The start is inclusive while the end is 2936 // exclusive, just as in the String class, i.e., a start/end pair 2937 // denotes the left-open interval [start, end) of the input string. 2938 // 2939 // These methods never proceed past the end position. 
They may return 2940 // -1 to indicate outright failure, but more often they simply return 2941 // the position of the first char after the last char scanned. Thus 2942 // a typical idiom is 2943 // 2944 // int p = start; 2945 // int q = scan(p, end, ...); 2946 // if (q > p) 2947 // // We scanned something 2948 // ...; 2949 // else if (q == p) 2950 // // We scanned nothing 2951 // ...; 2952 // else if (q == -1) 2953 // // Something went wrong 2954 // ...; 2955 2956 2957 // Scan a specific char: If the char at the given start position is 2958 // equal to c, return the index of the next char; otherwise, return the 2959 // start position. 2960 // 2961 private int scan(int start, int end, char c) { 2962 if ((start < end) && (input.charAt(start) == c)) 2963 return start + 1; 2964 return start; 2965 } 2966 2967 // Scan forward from the given start position. Stop at the first char 2968 // in the err string (in which case -1 is returned), or the first char 2969 // in the stop string (in which case the index of the preceding char is 2970 // returned), or the end of the input string (in which case the length 2971 // of the input string is returned). May return the start position if 2972 // nothing matches. 2973 // 2974 private int scan(int start, int end, String err, String stop) { 2975 int p = start; 2976 while (p < end) { 2977 char c = input.charAt(p); 2978 if (err.indexOf(c) >= 0) 2979 return -1; 2980 if (stop.indexOf(c) >= 0) 2981 break; 2982 p++; 2983 } 2984 return p; 2985 } 2986 2987 // Scan forward from the given start position. Stop at the first char 2988 // in the stop string (in which case the index of the preceding char is 2989 // returned), or the end of the input string (in which case the length 2990 // of the input string is returned). May return the start position if 2991 // nothing matches. 
2992 // 2993 private int scan(int start, int end, String stop) { 2994 int p = start; 2995 while (p < end) { 2996 char c = input.charAt(p); 2997 if (stop.indexOf(c) >= 0) 2998 break; 2999 p++; 3000 } 3001 return p; 3002 } 3003 3004 // Scan a potential escape sequence, starting at the given position, 3005 // with the given first char (i.e., charAt(start) == c). 3006 // 3007 // This method assumes that if escapes are allowed then visible 3008 // non-US-ASCII chars are also allowed. 3009 // 3010 private int scanEscape(int start, int n, char first) 3011 throws URISyntaxException 3012 { 3013 int p = start; 3014 char c = first; 3015 if (c == '%') { 3016 // Process escape pair 3017 if ((p + 3 <= n) 3018 && match(input.charAt(p + 1), L_HEX, H_HEX) 3019 && match(input.charAt(p + 2), L_HEX, H_HEX)) { 3020 return p + 3; 3021 } 3022 fail("Malformed escape pair", p); 3023 } else if ((c > 128) 3024 && !Character.isSpaceChar(c) 3025 && !Character.isISOControl(c)) { 3026 // Allow unescaped but visible non-US-ASCII chars 3027 return p + 1; 3028 } 3029 return p; 3030 } 3031 3032 // Scan chars that match the given mask pair 3033 // 3034 private int scan(int start, int n, long lowMask, long highMask) 3035 throws URISyntaxException 3036 { 3037 int p = start; 3038 while (p < n) { 3039 char c = input.charAt(p); 3040 if (match(c, lowMask, highMask)) { 3041 p++; 3042 continue; 3043 } 3044 if ((lowMask & L_ESCAPED) != 0) { 3045 int q = scanEscape(p, n, c); 3046 if (q > p) { 3047 p = q; 3048 continue; 3049 } 3050 } 3051 break; 3052 } 3053 return p; 3054 } 3055 3056 // Check that each of the chars in [start, end) matches the given mask 3057 // 3058 private void checkChars(int start, int end, 3059 long lowMask, long highMask, 3060 String what) 3061 throws URISyntaxException 3062 { 3063 int p = scan(start, end, lowMask, highMask); 3064 if (p < end) 3065 fail("Illegal character in " + what, p); 3066 } 3067 3068 // Check that the char at position p matches the given mask 3069 // 3070 private void 
checkChar(int p, 3071 long lowMask, long highMask, 3072 String what) 3073 throws URISyntaxException 3074 { 3075 checkChars(p, p + 1, lowMask, highMask, what); 3076 } 3077 3078 3079 // -- Parsing -- 3080 3081 // [<scheme>:]<scheme-specific-part>[#<fragment>] 3082 // 3083 void parse(boolean rsa) throws URISyntaxException { 3084 requireServerAuthority = rsa; 3085 int n = input.length(); 3086 int p = scan(0, n, "/?#", ":"); 3087 if ((p >= 0) && at(p, n, ':')) { 3088 if (p == 0) 3089 failExpecting("scheme name", 0); 3090 checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); 3091 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); 3092 scheme = input.substring(0, p); 3093 p++; // Skip ':' 3094 if (at(p, n, '/')) { 3095 p = parseHierarchical(p, n); 3096 } else { 3097 // opaque; need to create the schemeSpecificPart 3098 int q = scan(p, n, "#"); 3099 if (q <= p) 3100 failExpecting("scheme-specific part", p); 3101 checkChars(p, q, L_URIC, H_URIC, "opaque part"); 3102 schemeSpecificPart = input.substring(p, q); 3103 p = q; 3104 } 3105 } else { 3106 p = parseHierarchical(0, n); 3107 } 3108 if (at(p, n, '#')) { 3109 checkChars(p + 1, n, L_URIC, H_URIC, "fragment"); 3110 fragment = input.substring(p + 1, n); 3111 p = n; 3112 } 3113 if (p < n) 3114 fail("end of URI", p); 3115 } 3116 3117 // [//authority]<path>[?<query>] 3118 // 3119 // DEVIATION from RFC2396: We allow an empty authority component as 3120 // long as it's followed by a non-empty path, query component, or 3121 // fragment component. This is so that URIs such as "file:///foo/bar" 3122 // will parse. This seems to be the intent of RFC2396, though the 3123 // grammar does not permit it. If the authority is empty then the 3124 // userInfo, host, and port components are undefined. 3125 // 3126 // DEVIATION from RFC2396: We allow empty relative paths. This seems 3127 // to be the intent of RFC2396, but the grammar does not permit it. 
3128 // The primary consequence of this deviation is that "#f" parses as a 3129 // relative URI with an empty path. 3130 // 3131 private int parseHierarchical(int start, int n) 3132 throws URISyntaxException 3133 { 3134 int p = start; 3135 if (at(p, n, '/') && at(p + 1, n, '/')) { 3136 p += 2; 3137 int q = scan(p, n, "/?#"); 3138 if (q > p) { 3139 p = parseAuthority(p, q); 3140 } else if (q < n) { 3141 // DEVIATION: Allow empty authority prior to non-empty 3142 // path, query component or fragment identifier 3143 } else 3144 failExpecting("authority", p); 3145 } 3146 int q = scan(p, n, "?#"); // DEVIATION: May be empty 3147 checkChars(p, q, L_PATH, H_PATH, "path"); 3148 path = input.substring(p, q); 3149 p = q; 3150 if (at(p, n, '?')) { 3151 p++; 3152 q = scan(p, n, "#"); 3153 checkChars(p, q, L_URIC, H_URIC, "query"); 3154 query = input.substring(p, q); 3155 p = q; 3156 } 3157 return p; 3158 } 3159 3160 // authority = server | reg_name 3161 // 3162 // Ambiguity: An authority that is a registry name rather than a server 3163 // might have a prefix that parses as a server. We use the fact that 3164 // the authority component is always followed by '/' or the end of the 3165 // input string to resolve this: If the complete authority did not 3166 // parse as a server then we try to parse it as a registry name. 
3167 // 3168 private int parseAuthority(int start, int n) 3169 throws URISyntaxException 3170 { 3171 int p = start; 3172 int q = p; 3173 URISyntaxException ex = null; 3174 3175 boolean serverChars; 3176 boolean regChars; 3177 3178 if (scan(p, n, "]") > p) { 3179 // contains a literal IPv6 address, therefore % is allowed 3180 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n); 3181 } else { 3182 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n); 3183 } 3184 regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n); 3185 3186 if (regChars && !serverChars) { 3187 // Must be a registry-based authority 3188 authority = input.substring(p, n); 3189 return n; 3190 } 3191 3192 if (serverChars) { 3193 // Might be (probably is) a server-based authority, so attempt 3194 // to parse it as such. If the attempt fails, try to treat it 3195 // as a registry-based authority. 3196 try { 3197 q = parseServer(p, n); 3198 if (q < n) 3199 failExpecting("end of authority", q); 3200 authority = input.substring(p, n); 3201 } catch (URISyntaxException x) { 3202 // Undo results of failed parse 3203 userInfo = null; 3204 host = null; 3205 port = -1; 3206 if (requireServerAuthority) { 3207 // If we're insisting upon a server-based authority, 3208 // then just re-throw the exception 3209 throw x; 3210 } else { 3211 // Save the exception in case it doesn't parse as a 3212 // registry either 3213 ex = x; 3214 q = p; 3215 } 3216 } 3217 } 3218 3219 if (q < n) { 3220 if (regChars) { 3221 // Registry-based authority 3222 authority = input.substring(p, n); 3223 } else if (ex != null) { 3224 // Re-throw exception; it was probably due to 3225 // a malformed IPv6 address 3226 throw ex; 3227 } else { 3228 fail("Illegal character in authority", q); 3229 } 3230 } 3231 3232 return n; 3233 } 3234 3235 3236 // [<userinfo>@]<host>[:<port>] 3237 // 3238 private int parseServer(int start, int n) 3239 throws URISyntaxException 3240 { 3241 int p = start; 3242 int q; 3243 3244 // userinfo 3245 q = 
scan(p, n, "/?#", "@"); 3246 if ((q >= p) && at(q, n, '@')) { 3247 checkChars(p, q, L_USERINFO, H_USERINFO, "user info"); 3248 userInfo = input.substring(p, q); 3249 p = q + 1; // Skip '@' 3250 } 3251 3252 // hostname, IPv4 address, or IPv6 address 3253 if (at(p, n, '[')) { 3254 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732 3255 p++; 3256 q = scan(p, n, "/?#", "]"); 3257 if ((q > p) && at(q, n, ']')) { 3258 // look for a "%" scope id 3259 int r = scan (p, q, "%"); 3260 if (r > p) { 3261 parseIPv6Reference(p, r); 3262 if (r+1 == q) { 3263 fail ("scope id expected"); 3264 } 3265 checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID, 3266 "scope id"); 3267 } else { 3268 parseIPv6Reference(p, q); 3269 } 3270 host = input.substring(p-1, q+1); 3271 p = q + 1; 3272 } else { 3273 failExpecting("closing bracket for IPv6 address", q); 3274 } 3275 } else { 3276 q = parseIPv4Address(p, n); 3277 if (q <= p) 3278 q = parseHostname(p, n); 3279 p = q; 3280 } 3281 3282 // port 3283 if (at(p, n, ':')) { 3284 p++; 3285 q = scan(p, n, "/"); 3286 if (q > p) { 3287 checkChars(p, q, L_DIGIT, H_DIGIT, "port number"); 3288 try { 3289 port = Integer.parseInt(input, p, q, 10); 3290 } catch (NumberFormatException x) { 3291 fail("Malformed port number", p); 3292 } 3293 p = q; 3294 } 3295 } 3296 if (p < n) 3297 failExpecting("port number", p); 3298 3299 return p; 3300 } 3301 3302 // Scan a string of decimal digits whose value fits in a byte 3303 // 3304 private int scanByte(int start, int n) 3305 throws URISyntaxException 3306 { 3307 int p = start; 3308 int q = scan(p, n, L_DIGIT, H_DIGIT); 3309 if (q <= p) return q; 3310 if (Integer.parseInt(input, p, q, 10) > 255) return p; 3311 return q; 3312 } 3313 3314 // Scan an IPv4 address. 3315 // 3316 // If the strict argument is true then we require that the given 3317 // interval contain nothing besides an IPv4 address; if it is false 3318 // then we only require that it start with an IPv4 address. 
3319 // 3320 // If the interval does not contain or start with (depending upon the 3321 // strict argument) a legal IPv4 address characters then we return -1 3322 // immediately; otherwise we insist that these characters parse as a 3323 // legal IPv4 address and throw an exception on failure. 3324 // 3325 // We assume that any string of decimal digits and dots must be an IPv4 3326 // address. It won't parse as a hostname anyway, so making that 3327 // assumption here allows more meaningful exceptions to be thrown. 3328 // 3329 private int scanIPv4Address(int start, int n, boolean strict) 3330 throws URISyntaxException 3331 { 3332 int p = start; 3333 int q; 3334 int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT); 3335 if ((m <= p) || (strict && (m != n))) 3336 return -1; 3337 for (;;) { 3338 // Per RFC2732: At most three digits per byte 3339 // Further constraint: Each element fits in a byte 3340 if ((q = scanByte(p, m)) <= p) break; p = q; 3341 if ((q = scan(p, m, '.')) <= p) break; p = q; 3342 if ((q = scanByte(p, m)) <= p) break; p = q; 3343 if ((q = scan(p, m, '.')) <= p) break; p = q; 3344 if ((q = scanByte(p, m)) <= p) break; p = q; 3345 if ((q = scan(p, m, '.')) <= p) break; p = q; 3346 if ((q = scanByte(p, m)) <= p) break; p = q; 3347 if (q < m) break; 3348 return q; 3349 } 3350 fail("Malformed IPv4 address", q); 3351 return -1; 3352 } 3353 3354 // Take an IPv4 address: Throw an exception if the given interval 3355 // contains anything except an IPv4 address 3356 // 3357 private int takeIPv4Address(int start, int n, String expected) 3358 throws URISyntaxException 3359 { 3360 int p = scanIPv4Address(start, n, true); 3361 if (p <= start) 3362 failExpecting(expected, start); 3363 return p; 3364 } 3365 3366 // Attempt to parse an IPv4 address, returning -1 on failure but 3367 // allowing the given interval to contain [:<characters>] after 3368 // the IPv4 address. 
3369 // 3370 private int parseIPv4Address(int start, int n) { 3371 int p; 3372 3373 try { 3374 p = scanIPv4Address(start, n, false); 3375 } catch (URISyntaxException x) { 3376 return -1; 3377 } catch (NumberFormatException nfe) { 3378 return -1; 3379 } 3380 3381 if (p > start && p < n) { 3382 // IPv4 address is followed by something - check that 3383 // it's a ":" as this is the only valid character to 3384 // follow an address. 3385 if (input.charAt(p) != ':') { 3386 p = -1; 3387 } 3388 } 3389 3390 if (p > start) 3391 host = input.substring(start, p); 3392 3393 return p; 3394 } 3395 3396 // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ] 3397 // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 3398 // toplabel = alpha | alpha *( alphanum | "-" ) alphanum 3399 // 3400 private int parseHostname(int start, int n) 3401 throws URISyntaxException 3402 { 3403 int p = start; 3404 int q; 3405 int l = -1; // Start of last parsed label 3406 3407 do { 3408 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ] 3409 q = scan(p, n, L_ALPHANUM, H_ALPHANUM); 3410 if (q <= p) 3411 break; 3412 l = p; 3413 if (q > p) { 3414 p = q; 3415 q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH); 3416 if (q > p) { 3417 if (input.charAt(q - 1) == '-') 3418 fail("Illegal character in hostname", q - 1); 3419 p = q; 3420 } 3421 } 3422 q = scan(p, n, '.'); 3423 if (q <= p) 3424 break; 3425 p = q; 3426 } while (p < n); 3427 3428 if ((p < n) && !at(p, n, ':')) 3429 fail("Illegal character in hostname", p); 3430 3431 if (l < 0) 3432 failExpecting("hostname", start); 3433 3434 // for a fully qualified hostname check that the rightmost 3435 // label starts with an alpha character. 
3436 if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) { 3437 fail("Illegal character in hostname", l); 3438 } 3439 3440 host = input.substring(start, p); 3441 return p; 3442 } 3443 3444 3445 // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture 3446 // 3447 // Bug: The grammar in RFC2373 Appendix B does not allow addresses of 3448 // the form ::12.34.56.78, which are clearly shown in the examples 3449 // earlier in the document. Here is the original grammar: 3450 // 3451 // IPv6address = hexpart [ ":" IPv4address ] 3452 // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] 3453 // hexseq = hex4 *( ":" hex4) 3454 // hex4 = 1*4HEXDIG 3455 // 3456 // We therefore use the following revised grammar: 3457 // 3458 // IPv6address = hexseq [ ":" IPv4address ] 3459 // | hexseq [ "::" [ hexpost ] ] 3460 // | "::" [ hexpost ] 3461 // hexpost = hexseq | hexseq ":" IPv4address | IPv4address 3462 // hexseq = hex4 *( ":" hex4) 3463 // hex4 = 1*4HEXDIG 3464 // 3465 // This covers all and only the following cases: 3466 // 3467 // hexseq 3468 // hexseq : IPv4address 3469 // hexseq :: 3470 // hexseq :: hexseq 3471 // hexseq :: hexseq : IPv4address 3472 // hexseq :: IPv4address 3473 // :: hexseq 3474 // :: hexseq : IPv4address 3475 // :: IPv4address 3476 // :: 3477 // 3478 // Additionally we constrain the IPv6 address as follows :- 3479 // 3480 // i. IPv6 addresses without compressed zeros should contain 3481 // exactly 16 bytes. 3482 // 3483 // ii. IPv6 addresses with compressed zeros should contain 3484 // less than 16 bytes. 
3485 3486 private int ipv6byteCount = 0; 3487 3488 private int parseIPv6Reference(int start, int n) 3489 throws URISyntaxException 3490 { 3491 int p = start; 3492 int q; 3493 boolean compressedZeros = false; 3494 3495 q = scanHexSeq(p, n); 3496 3497 if (q > p) { 3498 p = q; 3499 if (at(p, n, "::")) { 3500 compressedZeros = true; 3501 p = scanHexPost(p + 2, n); 3502 } else if (at(p, n, ':')) { 3503 p = takeIPv4Address(p + 1, n, "IPv4 address"); 3504 ipv6byteCount += 4; 3505 } 3506 } else if (at(p, n, "::")) { 3507 compressedZeros = true; 3508 p = scanHexPost(p + 2, n); 3509 } 3510 if (p < n) 3511 fail("Malformed IPv6 address", start); 3512 if (ipv6byteCount > 16) 3513 fail("IPv6 address too long", start); 3514 if (!compressedZeros && ipv6byteCount < 16) 3515 fail("IPv6 address too short", start); 3516 if (compressedZeros && ipv6byteCount == 16) 3517 fail("Malformed IPv6 address", start); 3518 3519 return p; 3520 } 3521 3522 private int scanHexPost(int start, int n) 3523 throws URISyntaxException 3524 { 3525 int p = start; 3526 int q; 3527 3528 if (p == n) 3529 return p; 3530 3531 q = scanHexSeq(p, n); 3532 if (q > p) { 3533 p = q; 3534 if (at(p, n, ':')) { 3535 p++; 3536 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3537 ipv6byteCount += 4; 3538 } 3539 } else { 3540 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3541 ipv6byteCount += 4; 3542 } 3543 return p; 3544 } 3545 3546 // Scan a hex sequence; return -1 if one could not be scanned 3547 // 3548 private int scanHexSeq(int start, int n) 3549 throws URISyntaxException 3550 { 3551 int p = start; 3552 int q; 3553 3554 q = scan(p, n, L_HEX, H_HEX); 3555 if (q <= p) 3556 return -1; 3557 if (at(q, n, '.')) // Beginning of IPv4 address 3558 return -1; 3559 if (q > p + 4) 3560 fail("IPv6 hexadecimal digit sequence too long", p); 3561 ipv6byteCount += 2; 3562 p = q; 3563 while (p < n) { 3564 if (!at(p, n, ':')) 3565 break; 3566 if (at(p + 1, n, ':')) 3567 break; // "::" 3568 p++; 3569 q = scan(p, n, 
L_HEX, H_HEX); 3570 if (q <= p) 3571 failExpecting("digits for an IPv6 address", p); 3572 if (at(q, n, '.')) { // Beginning of IPv4 address 3573 p--; 3574 break; 3575 } 3576 if (q > p + 4) 3577 fail("IPv6 hexadecimal digit sequence too long", p); 3578 ipv6byteCount += 2; 3579 p = q; 3580 } 3581 3582 return p; 3583 } 3584 3585 } 3586 static { 3587 SharedSecrets.setJavaNetAccess( 3588 new JavaNetAccess() { 3589 public URI createURI(String scheme, String path) { 3590 return new URI(scheme, path); 3591 } 3592 } 3593 ); 3594 } 3595 }