1 /* 2 * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.net; 27 28 import java.io.IOException; 29 import java.io.InvalidObjectException; 30 import java.io.ObjectInputStream; 31 import java.io.ObjectOutputStream; 32 import java.io.Serializable; 33 import java.nio.ByteBuffer; 34 import java.nio.CharBuffer; 35 import java.nio.charset.CharsetDecoder; 36 import java.nio.charset.CoderResult; 37 import java.nio.charset.CodingErrorAction; 38 import java.nio.charset.CharacterCodingException; 39 import java.text.Normalizer; 40 import sun.nio.cs.ThreadLocalCoders; 41 42 import java.lang.Character; // for javadoc 43 import java.lang.NullPointerException; // for javadoc 44 45 46 /** 47 * Represents a Uniform Resource Identifier (URI) reference. 
48 * 49 * <p> Aside from some minor deviations noted below, an instance of this 50 * class represents a URI reference as defined by 51 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 52 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a 53 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 54 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format 55 * also supports scope_ids. The syntax and usage of scope_ids is described 56 * <a href="Inet6Address.html#scoped">here</a>. 57 * This class provides constructors for creating URI instances from 58 * their components or by parsing their string forms, methods for accessing the 59 * various components of an instance, and methods for normalizing, resolving, 60 * and relativizing URI instances. Instances of this class are immutable. 61 * 62 * 63 * <h3> URI syntax and components </h3> 64 * 65 * At the highest level a URI reference (hereinafter simply "URI") in string 66 * form has the syntax 67 * 68 * <blockquote> 69 * [<i>scheme</i><b>{@code :}</b>]<i>scheme-specific-part</i>[<b>{@code #}</b><i>fragment</i>] 70 * </blockquote> 71 * 72 * where square brackets [...] delineate optional components and the characters 73 * <b>{@code :}</b> and <b>{@code #}</b> stand for themselves. 74 * 75 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is 76 * said to be <i>relative</i>. URIs are also classified according to whether 77 * they are <i>opaque</i> or <i>hierarchical</i>. 78 * 79 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does 80 * not begin with a slash character ({@code '/'}). Opaque URIs are not 81 * subject to further parsing. 
Some examples of opaque URIs are:
 *
 * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
 * <tr><td>{@code mailto:java-net@java.sun.com}</td></tr>
 * <tr><td>{@code news:comp.lang.java}</td></tr>
 * <tr><td>{@code urn:isbn:096139210x}</td></tr>
 * </table></blockquote>
 *
 * <p> A <i>hierarchical</i> URI is either an absolute URI whose
 * scheme-specific part begins with a slash character, or a relative URI, that
 * is, a URI that does not specify a scheme. Some examples of hierarchical
 * URIs are:
 *
 * <blockquote>
 * {@code http://example.com/languages/java/}<br>
 * {@code sample/a/index.html#28}<br>
 * {@code ../../demo/b/index.html}<br>
 * {@code file:///~/calendar}
 * </blockquote>
 *
 * <p> A hierarchical URI is subject to further parsing according to the syntax
 *
 * <blockquote>
 * [<i>scheme</i><b>{@code :}</b>][<b>{@code //}</b><i>authority</i>][<i>path</i>][<b>{@code ?}</b><i>query</i>][<b>{@code #}</b><i>fragment</i>]
 * </blockquote>
 *
 * where the characters <b>{@code :}</b>, <b>{@code /}</b>,
 * <b>{@code ?}</b>, and <b>{@code #}</b> stand for themselves. The
 * scheme-specific part of a hierarchical URI consists of the characters
 * between the scheme and fragment components.
 *
 * <p> The authority component of a hierarchical URI is, if specified, either
 * <i>server-based</i> or <i>registry-based</i>. A server-based authority
 * parses according to the familiar syntax
 *
 * <blockquote>
 * [<i>user-info</i><b>{@code @}</b>]<i>host</i>[<b>{@code :}</b><i>port</i>]
 * </blockquote>
 *
 * where the characters <b>{@code @}</b> and <b>{@code :}</b> stand for
 * themselves. Nearly all URI schemes currently in use are server-based. An
 * authority component that does not parse in this way is considered to be
 * registry-based.
124 * 125 * <p> The path component of a hierarchical URI is itself said to be absolute 126 * if it begins with a slash character ({@code '/'}); otherwise it is 127 * relative. The path of a hierarchical URI that is either absolute or 128 * specifies an authority is always absolute. 129 * 130 * <p> All told, then, a URI instance has the following nine components: 131 * 132 * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment"> 133 * <tr><th><i>Component</i></th><th><i>Type</i></th></tr> 134 * <tr><td>scheme</td><td>{@code String}</td></tr> 135 * <tr><td>scheme-specific-part </td><td>{@code String}</td></tr> 136 * <tr><td>authority</td><td>{@code String}</td></tr> 137 * <tr><td>user-info</td><td>{@code String}</td></tr> 138 * <tr><td>host</td><td>{@code String}</td></tr> 139 * <tr><td>port</td><td>{@code int}</td></tr> 140 * <tr><td>path</td><td>{@code String}</td></tr> 141 * <tr><td>query</td><td>{@code String}</td></tr> 142 * <tr><td>fragment</td><td>{@code String}</td></tr> 143 * </table></blockquote> 144 * 145 * In a given instance any particular component is either <i>undefined</i> or 146 * <i>defined</i> with a distinct value. Undefined string components are 147 * represented by {@code null}, while undefined integer components are 148 * represented by {@code -1}. A string component may be defined to have the 149 * empty string as its value; this is not equivalent to that component being 150 * undefined. 151 * 152 * <p> Whether a particular component is or is not defined in an instance 153 * depends upon the type of the URI being represented. An absolute URI has a 154 * scheme component. An opaque URI has a scheme, a scheme-specific part, and 155 * possibly a fragment, but has no other components. A hierarchical URI always 156 * has a path (though it may be empty) and a scheme-specific-part (which at 157 * least contains the path), and may have any of the other components. 
If the 158 * authority component is present and is server-based then the host component 159 * will be defined and the user-information and port components may be defined. 160 * 161 * 162 * <h4> Operations on URI instances </h4> 163 * 164 * The key operations supported by this class are those of 165 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>. 166 * 167 * <p> <i>Normalization</i> is the process of removing unnecessary {@code "."} 168 * and {@code ".."} segments from the path component of a hierarchical URI. 169 * Each {@code "."} segment is simply removed. A {@code ".."} segment is 170 * removed only if it is preceded by a non-{@code ".."} segment. 171 * Normalization has no effect upon opaque URIs. 172 * 173 * <p> <i>Resolution</i> is the process of resolving one URI against another, 174 * <i>base</i> URI. The resulting URI is constructed from components of both 175 * URIs in the manner specified by RFC 2396, taking components from the 176 * base URI for those not specified in the original. For hierarchical URIs, 177 * the path of the original is resolved against the path of the base and then 178 * normalized. The result, for example, of resolving 179 * 180 * <blockquote> 181 * {@code sample/a/index.html#28} 182 * 183 * (1) 184 * </blockquote> 185 * 186 * against the base URI {@code http://example.com/languages/java/} is the result 187 * URI 188 * 189 * <blockquote> 190 * {@code http://example.com/languages/java/sample/a/index.html#28} 191 * </blockquote> 192 * 193 * Resolving the relative URI 194 * 195 * <blockquote> 196 * {@code ../../demo/b/index.html} (2) 197 * </blockquote> 198 * 199 * against this result yields, in turn, 200 * 201 * <blockquote> 202 * {@code http://example.com/languages/java/demo/b/index.html} 203 * </blockquote> 204 * 205 * Resolution of both absolute and relative URIs, and of both absolute and 206 * relative paths in the case of hierarchical URIs, is supported. 
Resolving 207 * the URI {@code file:///~calendar} against any other URI simply yields the 208 * original URI, since it is absolute. Resolving the relative URI (2) above 209 * against the relative base URI (1) yields the normalized, but still relative, 210 * URI 211 * 212 * <blockquote> 213 * {@code demo/b/index.html} 214 * </blockquote> 215 * 216 * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any 217 * two normalized URIs <i>u</i> and <i>v</i>, 218 * 219 * <blockquote> 220 * <i>u</i>{@code .relativize(}<i>u</i>{@code .resolve(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} and<br> 221 * <i>u</i>{@code .resolve(}<i>u</i>{@code .relativize(}<i>v</i>{@code )).equals(}<i>v</i>{@code )} .<br> 222 * </blockquote> 223 * 224 * This operation is often useful when constructing a document containing URIs 225 * that must be made relative to the base URI of the document wherever 226 * possible. For example, relativizing the URI 227 * 228 * <blockquote> 229 * {@code http://example.com/languages/java/sample/a/index.html#28} 230 * </blockquote> 231 * 232 * against the base URI 233 * 234 * <blockquote> 235 * {@code http://example.com/languages/java/} 236 * </blockquote> 237 * 238 * yields the relative URI {@code sample/a/index.html#28}. 239 * 240 * 241 * <h4> Character categories </h4> 242 * 243 * RFC 2396 specifies precisely which characters are permitted in the 244 * various components of a URI reference. 
The following categories, most of 245 * which are taken from that specification, are used below to describe these 246 * constraints: 247 * 248 * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other"> 249 * <tr><th valign=top><i>alpha</i></th> 250 * <td>The US-ASCII alphabetic characters, 251 * {@code 'A'} through {@code 'Z'} 252 * and {@code 'a'} through {@code 'z'}</td></tr> 253 * <tr><th valign=top><i>digit</i></th> 254 * <td>The US-ASCII decimal digit characters, 255 * {@code '0'} through {@code '9'}</td></tr> 256 * <tr><th valign=top><i>alphanum</i></th> 257 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr> 258 * <tr><th valign=top><i>unreserved</i> </th> 259 * <td>All <i>alphanum</i> characters together with those in the string 260 * {@code "_-!.~'()*"}</td></tr> 261 * <tr><th valign=top><i>punct</i></th> 262 * <td>The characters in the string {@code ",;:$&+="}</td></tr> 263 * <tr><th valign=top><i>reserved</i></th> 264 * <td>All <i>punct</i> characters together with those in the string 265 * {@code "?/[]@"}</td></tr> 266 * <tr><th valign=top><i>escaped</i></th> 267 * <td>Escaped octets, that is, triplets consisting of the percent 268 * character ({@code '%'}) followed by two hexadecimal digits 269 * ({@code '0'}-{@code '9'}, {@code 'A'}-{@code 'F'}, and 270 * {@code 'a'}-{@code 'f'})</td></tr> 271 * <tr><th valign=top><i>other</i></th> 272 * <td>The Unicode characters that are not in the US-ASCII character set, 273 * are not control characters (according to the {@link 274 * java.lang.Character#isISOControl(char) Character.isISOControl} 275 * method), and are not space characters (according to the {@link 276 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar} 277 * method) <i>(<b>Deviation from RFC 2396</b>, which is 278 * limited to US-ASCII)</i></td></tr> 279 * </table></blockquote> 280 * 281 * <p><a name="legal-chars"></a> The set of all legal URI characters 
consists of 282 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i> 283 * characters. 284 * 285 * 286 * <h4> Escaped octets, quotation, encoding, and decoding </h4> 287 * 288 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and 289 * fragment components. Escaping serves two purposes in URIs: 290 * 291 * <ul> 292 * 293 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to 294 * conform strictly to RFC 2396 by not containing any <i>other</i> 295 * characters. </p></li> 296 * 297 * <li><p> To <i>quote</i> characters that are otherwise illegal in a 298 * component. The user-info, path, query, and fragment components differ 299 * slightly in terms of which characters are considered legal and illegal. 300 * </p></li> 301 * 302 * </ul> 303 * 304 * These purposes are served in this class by three related operations: 305 * 306 * <ul> 307 * 308 * <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it 309 * with the sequence of escaped octets that represent that character in the 310 * UTF-8 character set. The Euro currency symbol ({@code '\u005Cu20AC'}), 311 * for example, is encoded as {@code "%E2%82%AC"}. <i>(<b>Deviation from 312 * RFC 2396</b>, which does not specify any particular character 313 * set.)</i> </p></li> 314 * 315 * <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by 316 * encoding it. The space character, for example, is quoted by replacing it 317 * with {@code "%20"}. UTF-8 contains US-ASCII, hence for US-ASCII 318 * characters this transformation has exactly the effect required by 319 * RFC 2396. </p></li> 320 * 321 * <li><p><a name="decode"></a> 322 * A sequence of escaped octets is <i>decoded</i> by 323 * replacing it with the sequence of characters that it represents in the 324 * UTF-8 character set. 
UTF-8 contains US-ASCII, hence decoding has the 325 * effect of de-quoting any quoted US-ASCII characters as well as that of 326 * decoding any encoded non-US-ASCII characters. If a <a 327 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs 328 * when decoding the escaped octets then the erroneous octets are replaced by 329 * {@code '\u005CuFFFD'}, the Unicode replacement character. </p></li> 330 * 331 * </ul> 332 * 333 * These operations are exposed in the constructors and methods of this class 334 * as follows: 335 * 336 * <ul> 337 * 338 * <li><p> The {@linkplain #URI(java.lang.String) single-argument 339 * constructor} requires any illegal characters in its argument to be 340 * quoted and preserves any escaped octets and <i>other</i> characters that 341 * are present. </p></li> 342 * 343 * <li><p> The {@linkplain 344 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String) 345 * multi-argument constructors} quote illegal characters as 346 * required by the components in which they appear. The percent character 347 * ({@code '%'}) is always quoted by these constructors. Any <i>other</i> 348 * characters are preserved. </p></li> 349 * 350 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath() 351 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment() 352 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link 353 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the 354 * values of their corresponding components in raw form, without interpreting 355 * any escaped octets. The strings returned by these methods may contain 356 * both escaped octets and <i>other</i> characters, and will not contain any 357 * illegal characters. 
</p></li> 358 * 359 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath() 360 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment() 361 * getFragment}, {@link #getAuthority() getAuthority}, and {@link 362 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped 363 * octets in their corresponding components. The strings returned by these 364 * methods may contain both <i>other</i> characters and illegal characters, 365 * and will not contain any escaped octets. </p></li> 366 * 367 * <li><p> The {@link #toString() toString} method returns a URI string with 368 * all necessary quotation but which may contain <i>other</i> characters. 369 * </p></li> 370 * 371 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully 372 * quoted and encoded URI string that does not contain any <i>other</i> 373 * characters. </p></li> 374 * 375 * </ul> 376 * 377 * 378 * <h4> Identities </h4> 379 * 380 * For any URI <i>u</i>, it is always the case that 381 * 382 * <blockquote> 383 * {@code new URI(}<i>u</i>{@code .toString()).equals(}<i>u</i>{@code )} . 
384 * </blockquote> 385 * 386 * For any URI <i>u</i> that does not contain redundant syntax such as two 387 * slashes before an empty authority (as in {@code file:///tmp/} ) or a 388 * colon following a host name but no port (as in 389 * {@code http://java.sun.com:} ), and that does not encode characters 390 * except those that must be quoted, the following identities also hold: 391 * <pre> 392 * new URI(<i>u</i>.getScheme(), 393 * <i>u</i>.getSchemeSpecificPart(), 394 * <i>u</i>.getFragment()) 395 * .equals(<i>u</i>)</pre> 396 * in all cases, 397 * <pre> 398 * new URI(<i>u</i>.getScheme(), 399 * <i>u</i>.getAuthority(), 400 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 401 * <i>u</i>.getFragment()) 402 * .equals(<i>u</i>)</pre> 403 * if <i>u</i> is hierarchical, and 404 * <pre> 405 * new URI(<i>u</i>.getScheme(), 406 * <i>u</i>.getUserInfo(), <i>u</i>.getHost(), <i>u</i>.getPort(), 407 * <i>u</i>.getPath(), <i>u</i>.getQuery(), 408 * <i>u</i>.getFragment()) 409 * .equals(<i>u</i>)</pre> 410 * if <i>u</i> is hierarchical and has either no authority or a server-based 411 * authority. 412 * 413 * 414 * <h4> URIs, URLs, and URNs </h4> 415 * 416 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform 417 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but 418 * not every URI is a URL. This is because there is another subcategory of 419 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not 420 * specify how to locate them. The {@code mailto}, {@code news}, and 421 * {@code isbn} URIs shown above are examples of URNs. 422 * 423 * <p> The conceptual distinction between URIs and URLs is reflected in the 424 * differences between this class and the {@link URL} class. 425 * 426 * <p> An instance of this class represents a URI reference in the syntactic 427 * sense defined by RFC 2396. A URI may be either absolute or relative. 
428 * A URI string is parsed according to the generic syntax without regard to the 429 * scheme, if any, that it specifies. No lookup of the host, if any, is 430 * performed, and no scheme-dependent stream handler is constructed. Equality, 431 * hashing, and comparison are defined strictly in terms of the character 432 * content of the instance. In other words, a URI instance is little more than 433 * a structured string that supports the syntactic, scheme-independent 434 * operations of comparison, normalization, resolution, and relativization. 435 * 436 * <p> An instance of the {@link URL} class, by contrast, represents the 437 * syntactic components of a URL together with some of the information required 438 * to access the resource that it describes. A URL must be absolute, that is, 439 * it must always specify a scheme. A URL string is parsed according to its 440 * scheme. A stream handler is always established for a URL, and in fact it is 441 * impossible to create a URL instance for a scheme for which no handler is 442 * available. Equality and hashing depend upon both the scheme and the 443 * Internet address of the host, if any; comparison is not defined. In other 444 * words, a URL is a structured string that supports the syntactic operation of 445 * resolution as well as the network I/O operations of looking up the host and 446 * opening a connection to the specified resource. 
447 * 448 * 449 * @author Mark Reinhold 450 * @since 1.4 451 * 452 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a 453 * transformation format of ISO 10646</i></a>, <br><a 454 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing 455 * Architecture</i></a>, <br><a 456 * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform 457 * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a 458 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for 459 * Literal IPv6 Addresses in URLs</i></a>, <br><a 460 * href="URISyntaxException.html">URISyntaxException</a> 461 */ 462 463 public final class URI 464 implements Comparable<URI>, Serializable 465 { 466 467 // Note: Comments containing the word "ASSERT" indicate places where a 468 // throw of an InternalError should be replaced by an appropriate assertion 469 // statement once asserts are enabled in the build. 470 471 static final long serialVersionUID = -6052424284110960213L; 472 473 474 // -- Properties and components of this instance -- 475 476 // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>] 477 private transient String scheme; // null ==> relative URI 478 private transient String fragment; 479 480 // Hierarchical URI components: [//<authority>]<path>[?<query>] 481 private transient String authority; // Registry or server 482 483 // Server-based authority: [<userInfo>@]<host>[:<port>] 484 private transient String userInfo; 485 private transient String host; // null ==> registry-based 486 private transient int port = -1; // -1 ==> undefined 487 488 // Remaining components of hierarchical URIs 489 private transient String path; // null ==> opaque 490 private transient String query; 491 492 // The remaining fields may be computed on demand, which is safe even in 493 // the face of multiple threads racing to initialize them 494 private transient String schemeSpecificPart; 495 private transient int hash; // Zero ==> undefined 
496 497 private transient String decodedUserInfo; 498 private transient String decodedAuthority; 499 private transient String decodedPath; 500 private transient String decodedQuery; 501 private transient String decodedFragment; 502 private transient String decodedSchemeSpecificPart; 503 504 /** 505 * The string form of this URI. 506 * 507 * @serial 508 */ 509 private volatile String string; // The only serializable field 510 511 512 513 // -- Constructors and factories -- 514 515 private URI() { } // Used internally 516 517 /** 518 * Constructs a URI by parsing the given string. 519 * 520 * <p> This constructor parses the given string exactly as specified by the 521 * grammar in <a 522 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 523 * Appendix A, <b><i>except for the following deviations:</i></b> </p> 524 * 525 * <ul> 526 * 527 * <li><p> An empty authority component is permitted as long as it is 528 * followed by a non-empty path, a query component, or a fragment 529 * component. This allows the parsing of URIs such as 530 * {@code "file:///foo/bar"}, which seems to be the intent of 531 * RFC 2396 although the grammar does not permit it. If the 532 * authority component is empty then the user-information, host, and port 533 * components are undefined. </p></li> 534 * 535 * <li><p> Empty relative paths are permitted; this seems to be the 536 * intent of RFC 2396 although the grammar does not permit it. The 537 * primary consequence of this deviation is that a standalone fragment 538 * such as {@code "#foo"} parses as a relative URI with an empty path 539 * and the given fragment, and can be usefully <a 540 * href="#resolve-frag">resolved</a> against a base URI. 541 * 542 * <li><p> IPv4 addresses in host components are parsed rigorously, as 543 * specified by <a 544 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each 545 * element of a dotted-quad address must contain no more than three 546 * decimal digits. 
Each element is further constrained to have a value 547 * no greater than 255. </p></li> 548 * 549 * <li> <p> Hostnames in host components that comprise only a single 550 * domain label are permitted to start with an <i>alphanum</i> 551 * character. This seems to be the intent of <a 552 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 553 * section 3.2.2 although the grammar does not permit it. The 554 * consequence of this deviation is that the authority component of a 555 * hierarchical URI such as {@code s://123}, will parse as a server-based 556 * authority. </p></li> 557 * 558 * <li><p> IPv6 addresses are permitted for the host component. An IPv6 559 * address must be enclosed in square brackets ({@code '['} and 560 * {@code ']'}) as specified by <a 561 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The 562 * IPv6 address itself must parse according to <a 563 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6 564 * addresses are further constrained to describe no more than sixteen 565 * bytes of address information, a constraint implicit in RFC 2373 566 * but not expressible in the grammar. </p></li> 567 * 568 * <li><p> Characters in the <i>other</i> category are permitted wherever 569 * RFC 2396 permits <i>escaped</i> octets, that is, in the 570 * user-information, path, query, and fragment components, as well as in 571 * the authority component if the authority is registry-based. This 572 * allows URIs to contain Unicode characters beyond those in the US-ASCII 573 * character set. 
</p></li> 574 * 575 * </ul> 576 * 577 * @param str The string to be parsed into a URI 578 * 579 * @throws NullPointerException 580 * If {@code str} is {@code null} 581 * 582 * @throws URISyntaxException 583 * If the given string violates RFC 2396, as augmented 584 * by the above deviations 585 */ 586 public URI(String str) throws URISyntaxException { 587 new Parser(str).parse(false); 588 } 589 590 /** 591 * Constructs a hierarchical URI from the given components. 592 * 593 * <p> If a scheme is given then the path, if also given, must either be 594 * empty or begin with a slash character ({@code '/'}). Otherwise a 595 * component of the new URI may be left undefined by passing {@code null} 596 * for the corresponding parameter or, in the case of the {@code port} 597 * parameter, by passing {@code -1}. 598 * 599 * <p> This constructor first builds a URI string from the given components 600 * according to the rules specified in <a 601 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 602 * section 5.2, step 7: </p> 603 * 604 * <ol> 605 * 606 * <li><p> Initially, the result string is empty. </p></li> 607 * 608 * <li><p> If a scheme is given then it is appended to the result, 609 * followed by a colon character ({@code ':'}). </p></li> 610 * 611 * <li><p> If user information, a host, or a port are given then the 612 * string {@code "//"} is appended. </p></li> 613 * 614 * <li><p> If user information is given then it is appended, followed by 615 * a commercial-at character ({@code '@'}). Any character not in the 616 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 617 * categories is <a href="#quote">quoted</a>. </p></li> 618 * 619 * <li><p> If a host is given then it is appended. If the host is a 620 * literal IPv6 address but is not enclosed in square brackets 621 * ({@code '['} and {@code ']'}) then the square brackets are added. 
622 * </p></li> 623 * 624 * <li><p> If a port number is given then a colon character 625 * ({@code ':'}) is appended, followed by the port number in decimal. 626 * </p></li> 627 * 628 * <li><p> If a path is given then it is appended. Any character not in 629 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 630 * categories, and not equal to the slash character ({@code '/'}) or the 631 * commercial-at character ({@code '@'}), is quoted. </p></li> 632 * 633 * <li><p> If a query is given then a question-mark character 634 * ({@code '?'}) is appended, followed by the query. Any character that 635 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 636 * </p></li> 637 * 638 * <li><p> Finally, if a fragment is given then a hash character 639 * ({@code '#'}) is appended, followed by the fragment. Any character 640 * that is not a legal URI character is quoted. </p></li> 641 * 642 * </ol> 643 * 644 * <p> The resulting URI string is then parsed as if by invoking the {@link 645 * #URI(String)} constructor and then invoking the {@link 646 * #parseServerAuthority()} method upon the result; this may cause a {@link 647 * URISyntaxException} to be thrown. 
</p> 648 * 649 * @param scheme Scheme name 650 * @param userInfo User name and authorization information 651 * @param host Host name 652 * @param port Port number 653 * @param path Path 654 * @param query Query 655 * @param fragment Fragment 656 * 657 * @throws URISyntaxException 658 * If both a scheme and a path are given but the path is relative, 659 * if the URI string constructed from the given components violates 660 * RFC 2396, or if the authority component of the string is 661 * present but cannot be parsed as a server-based authority 662 */ 663 public URI(String scheme, 664 String userInfo, String host, int port, 665 String path, String query, String fragment) 666 throws URISyntaxException 667 { 668 String s = toString(scheme, null, 669 null, userInfo, host, port, 670 path, query, fragment); 671 checkPath(s, scheme, path); 672 new Parser(s).parse(true); 673 } 674 675 /** 676 * Constructs a hierarchical URI from the given components. 677 * 678 * <p> If a scheme is given then the path, if also given, must either be 679 * empty or begin with a slash character ({@code '/'}). Otherwise a 680 * component of the new URI may be left undefined by passing {@code null} 681 * for the corresponding parameter. 682 * 683 * <p> This constructor first builds a URI string from the given components 684 * according to the rules specified in <a 685 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 686 * section 5.2, step 7: </p> 687 * 688 * <ol> 689 * 690 * <li><p> Initially, the result string is empty. </p></li> 691 * 692 * <li><p> If a scheme is given then it is appended to the result, 693 * followed by a colon character ({@code ':'}). </p></li> 694 * 695 * <li><p> If an authority is given then the string {@code "//"} is 696 * appended, followed by the authority. If the authority contains a 697 * literal IPv6 address then the address must be enclosed in square 698 * brackets ({@code '['} and {@code ']'}). 
Any character not in the 699 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 700 * categories, and not equal to the commercial-at character 701 * ({@code '@'}), is <a href="#quote">quoted</a>. </p></li> 702 * 703 * <li><p> If a path is given then it is appended. Any character not in 704 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i> 705 * categories, and not equal to the slash character ({@code '/'}) or the 706 * commercial-at character ({@code '@'}), is quoted. </p></li> 707 * 708 * <li><p> If a query is given then a question-mark character 709 * ({@code '?'}) is appended, followed by the query. Any character that 710 * is not a <a href="#legal-chars">legal URI character</a> is quoted. 711 * </p></li> 712 * 713 * <li><p> Finally, if a fragment is given then a hash character 714 * ({@code '#'}) is appended, followed by the fragment. Any character 715 * that is not a legal URI character is quoted. </p></li> 716 * 717 * </ol> 718 * 719 * <p> The resulting URI string is then parsed as if by invoking the {@link 720 * #URI(String)} constructor and then invoking the {@link 721 * #parseServerAuthority()} method upon the result; this may cause a {@link 722 * URISyntaxException} to be thrown. 
</p> 723 * 724 * @param scheme Scheme name 725 * @param authority Authority 726 * @param path Path 727 * @param query Query 728 * @param fragment Fragment 729 * 730 * @throws URISyntaxException 731 * If both a scheme and a path are given but the path is relative, 732 * if the URI string constructed from the given components violates 733 * RFC 2396, or if the authority component of the string is 734 * present but cannot be parsed as a server-based authority 735 */ 736 public URI(String scheme, 737 String authority, 738 String path, String query, String fragment) 739 throws URISyntaxException 740 { 741 String s = toString(scheme, null, 742 authority, null, null, -1, 743 path, query, fragment); 744 checkPath(s, scheme, path); 745 new Parser(s).parse(false); 746 } 747 748 /** 749 * Constructs a hierarchical URI from the given components. 750 * 751 * <p> A component may be left undefined by passing {@code null}. 752 * 753 * <p> This convenience constructor works as if by invoking the 754 * seven-argument constructor as follows: 755 * 756 * <blockquote> 757 * {@code new} {@link #URI(String, String, String, int, String, String, String) 758 * URI}{@code (scheme, null, host, -1, path, null, fragment);} 759 * </blockquote> 760 * 761 * @param scheme Scheme name 762 * @param host Host name 763 * @param path Path 764 * @param fragment Fragment 765 * 766 * @throws URISyntaxException 767 * If the URI string constructed from the given components 768 * violates RFC 2396 769 */ 770 public URI(String scheme, String host, String path, String fragment) 771 throws URISyntaxException 772 { 773 this(scheme, null, host, -1, path, null, fragment); 774 } 775 776 /** 777 * Constructs a URI from the given components. 778 * 779 * <p> A component may be left undefined by passing {@code null}. 780 * 781 * <p> This constructor first builds a URI in string form using the given 782 * components as follows: </p> 783 * 784 * <ol> 785 * 786 * <li><p> Initially, the result string is empty. 
</p></li> 787 * 788 * <li><p> If a scheme is given then it is appended to the result, 789 * followed by a colon character ({@code ':'}). </p></li> 790 * 791 * <li><p> If a scheme-specific part is given then it is appended. Any 792 * character that is not a <a href="#legal-chars">legal URI character</a> 793 * is <a href="#quote">quoted</a>. </p></li> 794 * 795 * <li><p> Finally, if a fragment is given then a hash character 796 * ({@code '#'}) is appended to the string, followed by the fragment. 797 * Any character that is not a legal URI character is quoted. </p></li> 798 * 799 * </ol> 800 * 801 * <p> The resulting URI string is then parsed in order to create the new 802 * URI instance as if by invoking the {@link #URI(String)} constructor; 803 * this may cause a {@link URISyntaxException} to be thrown. </p> 804 * 805 * @param scheme Scheme name 806 * @param ssp Scheme-specific part 807 * @param fragment Fragment 808 * 809 * @throws URISyntaxException 810 * If the URI string constructed from the given components 811 * violates RFC 2396 812 */ 813 public URI(String scheme, String ssp, String fragment) 814 throws URISyntaxException 815 { 816 new Parser(toString(scheme, ssp, 817 null, null, null, -1, 818 null, null, fragment)) 819 .parse(false); 820 } 821 822 /** 823 * Creates a URI by parsing the given string. 824 * 825 * <p> This convenience factory method works as if by invoking the {@link 826 * #URI(String)} constructor; any {@link URISyntaxException} thrown by the 827 * constructor is caught and wrapped in a new {@link 828 * IllegalArgumentException} object, which is then thrown. 829 * 830 * <p> This method is provided for use in situations where it is known that 831 * the given string is a legal URI, for example for URI constants declared 832 * within in a program, and so it would be considered a programming error 833 * for the string not to parse as such. 
The constructors, which throw 834 * {@link URISyntaxException} directly, should be used situations where a 835 * URI is being constructed from user input or from some other source that 836 * may be prone to errors. </p> 837 * 838 * @param str The string to be parsed into a URI 839 * @return The new URI 840 * 841 * @throws NullPointerException 842 * If {@code str} is {@code null} 843 * 844 * @throws IllegalArgumentException 845 * If the given string violates RFC 2396 846 */ 847 public static URI create(String str) { 848 try { 849 return new URI(str); 850 } catch (URISyntaxException x) { 851 throw new IllegalArgumentException(x.getMessage(), x); 852 } 853 } 854 855 856 // -- Operations -- 857 858 /** 859 * Attempts to parse this URI's authority component, if defined, into 860 * user-information, host, and port components. 861 * 862 * <p> If this URI's authority component has already been recognized as 863 * being server-based then it will already have been parsed into 864 * user-information, host, and port components. In this case, or if this 865 * URI has no authority component, this method simply returns this URI. 866 * 867 * <p> Otherwise this method attempts once more to parse the authority 868 * component into user-information, host, and port components, and throws 869 * an exception describing why the authority component could not be parsed 870 * in that way. 871 * 872 * <p> This method is provided because the generic URI syntax specified in 873 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a> 874 * cannot always distinguish a malformed server-based authority from a 875 * legitimate registry-based authority. It must therefore treat some 876 * instances of the former as instances of the latter. The authority 877 * component in the URI string {@code "//foo:bar"}, for example, is not a 878 * legal server-based authority but it is legal as a registry-based 879 * authority. 
880 * 881 * <p> In many common situations, for example when working URIs that are 882 * known to be either URNs or URLs, the hierarchical URIs being used will 883 * always be server-based. They therefore must either be parsed as such or 884 * treated as an error. In these cases a statement such as 885 * 886 * <blockquote> 887 * {@code URI }<i>u</i>{@code = new URI(str).parseServerAuthority();} 888 * </blockquote> 889 * 890 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if 891 * it has an authority component, has a server-based authority with proper 892 * user-information, host, and port components. Invoking this method also 893 * ensures that if the authority could not be parsed in that way then an 894 * appropriate diagnostic message can be issued based upon the exception 895 * that is thrown. </p> 896 * 897 * @return A URI whose authority field has been parsed 898 * as a server-based authority 899 * 900 * @throws URISyntaxException 901 * If the authority component of this URI is defined 902 * but cannot be parsed as a server-based authority 903 * according to RFC 2396 904 */ 905 public URI parseServerAuthority() 906 throws URISyntaxException 907 { 908 // We could be clever and cache the error message and index from the 909 // exception thrown during the original parse, but that would require 910 // either more fields or a more-obscure representation. 911 if ((host != null) || (authority == null)) 912 return this; 913 new Parser(toString()).parse(true); 914 return this; 915 } 916 917 /** 918 * Normalizes this URI's path. 919 * 920 * <p> If this URI is opaque, or if its path is already in normal form, 921 * then this URI is returned. 
Otherwise a new URI is constructed that is 922 * identical to this URI except that its path is computed by normalizing 923 * this URI's path in a manner consistent with <a 924 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 925 * section 5.2, step 6, sub-steps c through f; that is: 926 * </p> 927 * 928 * <ol> 929 * 930 * <li><p> All {@code "."} segments are removed. </p></li> 931 * 932 * <li><p> If a {@code ".."} segment is preceded by a non-{@code ".."} 933 * segment then both of these segments are removed. This step is 934 * repeated until it is no longer applicable. </p></li> 935 * 936 * <li><p> If the path is relative, and if its first segment contains a 937 * colon character ({@code ':'}), then a {@code "."} segment is 938 * prepended. This prevents a relative URI with a path such as 939 * {@code "a:b/c/d"} from later being re-parsed as an opaque URI with a 940 * scheme of {@code "a"} and a scheme-specific part of {@code "b/c/d"}. 941 * <b><i>(Deviation from RFC 2396)</i></b> </p></li> 942 * 943 * </ol> 944 * 945 * <p> A normalized path will begin with one or more {@code ".."} segments 946 * if there were insufficient non-{@code ".."} segments preceding them to 947 * allow their removal. A normalized path will begin with a {@code "."} 948 * segment if one was inserted by step 3 above. Otherwise, a normalized 949 * path will not contain any {@code "."} or {@code ".."} segments. </p> 950 * 951 * @return A URI equivalent to this URI, 952 * but whose path is in normal form 953 */ 954 public URI normalize() { 955 return normalize(this); 956 } 957 958 /** 959 * Resolves the given URI against this URI. 960 * 961 * <p> If the given URI is already absolute, or if this URI is opaque, then 962 * the given URI is returned. 
963 * 964 * <p><a name="resolve-frag"></a> If the given URI's fragment component is 965 * defined, its path component is empty, and its scheme, authority, and 966 * query components are undefined, then a URI with the given fragment but 967 * with all other components equal to those of this URI is returned. This 968 * allows a URI representing a standalone fragment reference, such as 969 * {@code "#foo"}, to be usefully resolved against a base URI. 970 * 971 * <p> Otherwise this method constructs a new hierarchical URI in a manner 972 * consistent with <a 973 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 974 * section 5.2; that is: </p> 975 * 976 * <ol> 977 * 978 * <li><p> A new URI is constructed with this URI's scheme and the given 979 * URI's query and fragment components. </p></li> 980 * 981 * <li><p> If the given URI has an authority component then the new URI's 982 * authority and path are taken from the given URI. </p></li> 983 * 984 * <li><p> Otherwise the new URI's authority component is copied from 985 * this URI, and its path is computed as follows: </p> 986 * 987 * <ol> 988 * 989 * <li><p> If the given URI's path is absolute then the new URI's path 990 * is taken from the given URI. </p></li> 991 * 992 * <li><p> Otherwise the given URI's path is relative, and so the new 993 * URI's path is computed by resolving the path of the given URI 994 * against the path of this URI. This is done by concatenating all but 995 * the last segment of this URI's path, if any, with the given URI's 996 * path and then normalizing the result as if by invoking the {@link 997 * #normalize() normalize} method. </p></li> 998 * 999 * </ol></li> 1000 * 1001 * </ol> 1002 * 1003 * <p> The result of this method is absolute if, and only if, either this 1004 * URI is absolute or the given URI is absolute. 
</p> 1005 * 1006 * @param uri The URI to be resolved against this URI 1007 * @return The resulting URI 1008 * 1009 * @throws NullPointerException 1010 * If {@code uri} is {@code null} 1011 */ 1012 public URI resolve(URI uri) { 1013 return resolve(this, uri); 1014 } 1015 1016 /** 1017 * Constructs a new URI by parsing the given string and then resolving it 1018 * against this URI. 1019 * 1020 * <p> This convenience method works as if invoking it were equivalent to 1021 * evaluating the expression {@link #resolve(java.net.URI) 1022 * resolve}{@code (URI.}{@link #create(String) create}{@code (str))}. </p> 1023 * 1024 * @param str The string to be parsed into a URI 1025 * @return The resulting URI 1026 * 1027 * @throws NullPointerException 1028 * If {@code str} is {@code null} 1029 * 1030 * @throws IllegalArgumentException 1031 * If the given string violates RFC 2396 1032 */ 1033 public URI resolve(String str) { 1034 return resolve(URI.create(str)); 1035 } 1036 1037 /** 1038 * Relativizes the given URI against this URI. 1039 * 1040 * <p> The relativization of the given URI against this URI is computed as 1041 * follows: </p> 1042 * 1043 * <ol> 1044 * 1045 * <li><p> If either this URI or the given URI are opaque, or if the 1046 * scheme and authority components of the two URIs are not identical, or 1047 * if the path of this URI is not a prefix of the path of the given URI, 1048 * then the given URI is returned. </p></li> 1049 * 1050 * <li><p> Otherwise a new relative hierarchical URI is constructed with 1051 * query and fragment components taken from the given URI and with a path 1052 * component computed by removing this URI's path from the beginning of 1053 * the given URI's path. 
</p></li> 1054 * 1055 * </ol> 1056 * 1057 * @param uri The URI to be relativized against this URI 1058 * @return The resulting URI 1059 * 1060 * @throws NullPointerException 1061 * If {@code uri} is {@code null} 1062 */ 1063 public URI relativize(URI uri) { 1064 return relativize(this, uri); 1065 } 1066 1067 /** 1068 * Constructs a URL from this URI. 1069 * 1070 * <p> This convenience method works as if invoking it were equivalent to 1071 * evaluating the expression {@code new URL(this.toString())} after 1072 * first checking that this URI is absolute. </p> 1073 * 1074 * @return A URL constructed from this URI 1075 * 1076 * @throws IllegalArgumentException 1077 * If this URL is not absolute 1078 * 1079 * @throws MalformedURLException 1080 * If a protocol handler for the URL could not be found, 1081 * or if some other error occurred while constructing the URL 1082 */ 1083 public URL toURL() throws MalformedURLException { 1084 return URL.fromURI(this); 1085 } 1086 1087 // -- Component access methods -- 1088 1089 /** 1090 * Returns the scheme component of this URI. 1091 * 1092 * <p> The scheme component of a URI, if defined, only contains characters 1093 * in the <i>alphanum</i> category and in the string {@code "-.+"}. A 1094 * scheme always starts with an <i>alpha</i> character. <p> 1095 * 1096 * The scheme component of a URI cannot contain escaped octets, hence this 1097 * method does not perform any decoding. 1098 * 1099 * @return The scheme component of this URI, 1100 * or {@code null} if the scheme is undefined 1101 */ 1102 public String getScheme() { 1103 return scheme; 1104 } 1105 1106 /** 1107 * Tells whether or not this URI is absolute. 1108 * 1109 * <p> A URI is absolute if, and only if, it has a scheme component. </p> 1110 * 1111 * @return {@code true} if, and only if, this URI is absolute 1112 */ 1113 public boolean isAbsolute() { 1114 return scheme != null; 1115 } 1116 1117 /** 1118 * Tells whether or not this URI is opaque. 
1119 * 1120 * <p> A URI is opaque if, and only if, it is absolute and its 1121 * scheme-specific part does not begin with a slash character ('/'). 1122 * An opaque URI has a scheme, a scheme-specific part, and possibly 1123 * a fragment; all other components are undefined. </p> 1124 * 1125 * @return {@code true} if, and only if, this URI is opaque 1126 */ 1127 public boolean isOpaque() { 1128 return path == null; 1129 } 1130 1131 /** 1132 * Returns the raw scheme-specific part of this URI. The scheme-specific 1133 * part is never undefined, though it may be empty. 1134 * 1135 * <p> The scheme-specific part of a URI only contains legal URI 1136 * characters. </p> 1137 * 1138 * @return The raw scheme-specific part of this URI 1139 * (never {@code null}) 1140 */ 1141 public String getRawSchemeSpecificPart() { 1142 String part = schemeSpecificPart; 1143 if (part != null) { 1144 return part; 1145 } 1146 1147 String s = string; 1148 if (s != null) { 1149 // if string is defined, components will have been parsed 1150 int start = 0; 1151 int end = s.length(); 1152 if (scheme != null) { 1153 start = scheme.length() + 1; 1154 } 1155 if (fragment != null) { 1156 end -= fragment.length() + 1; 1157 } 1158 if (path != null && path.length() == end - start) { 1159 part = path; 1160 } else { 1161 part = s.substring(start, end); 1162 } 1163 } else { 1164 StringBuilder sb = new StringBuilder(); 1165 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(), 1166 host, port, getPath(), getQuery()); 1167 part = sb.toString(); 1168 } 1169 return schemeSpecificPart = part; 1170 } 1171 1172 /** 1173 * Returns the decoded scheme-specific part of this URI. 1174 * 1175 * <p> The string returned by this method is equal to that returned by the 1176 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method 1177 * except that all sequences of escaped octets are <a 1178 * href="#decode">decoded</a>. 
</p> 1179 * 1180 * @return The decoded scheme-specific part of this URI 1181 * (never {@code null}) 1182 */ 1183 public String getSchemeSpecificPart() { 1184 String part = decodedSchemeSpecificPart; 1185 if (part == null) { 1186 decodedSchemeSpecificPart = part = decode(getRawSchemeSpecificPart()); 1187 } 1188 return part; 1189 } 1190 1191 /** 1192 * Returns the raw authority component of this URI. 1193 * 1194 * <p> The authority component of a URI, if defined, only contains the 1195 * commercial-at character ({@code '@'}) and characters in the 1196 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i> 1197 * categories. If the authority is server-based then it is further 1198 * constrained to have valid user-information, host, and port 1199 * components. </p> 1200 * 1201 * @return The raw authority component of this URI, 1202 * or {@code null} if the authority is undefined 1203 */ 1204 public String getRawAuthority() { 1205 return authority; 1206 } 1207 1208 /** 1209 * Returns the decoded authority component of this URI. 1210 * 1211 * <p> The string returned by this method is equal to that returned by the 1212 * {@link #getRawAuthority() getRawAuthority} method except that all 1213 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1214 * 1215 * @return The decoded authority component of this URI, 1216 * or {@code null} if the authority is undefined 1217 */ 1218 public String getAuthority() { 1219 String auth = decodedAuthority; 1220 if ((auth == null) && (authority != null)) { 1221 decodedAuthority = auth = decode(authority); 1222 } 1223 return auth; 1224 } 1225 1226 /** 1227 * Returns the raw user-information component of this URI. 1228 * 1229 * <p> The user-information component of a URI, if defined, only contains 1230 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and 1231 * <i>other</i> categories. 
</p> 1232 * 1233 * @return The raw user-information component of this URI, 1234 * or {@code null} if the user information is undefined 1235 */ 1236 public String getRawUserInfo() { 1237 return userInfo; 1238 } 1239 1240 /** 1241 * Returns the decoded user-information component of this URI. 1242 * 1243 * <p> The string returned by this method is equal to that returned by the 1244 * {@link #getRawUserInfo() getRawUserInfo} method except that all 1245 * sequences of escaped octets are <a href="#decode">decoded</a>. </p> 1246 * 1247 * @return The decoded user-information component of this URI, 1248 * or {@code null} if the user information is undefined 1249 */ 1250 public String getUserInfo() { 1251 String user = decodedUserInfo; 1252 if ((user == null) && (userInfo != null)) { 1253 decodedUserInfo = user = decode(userInfo); 1254 } 1255 return user; 1256 } 1257 1258 /** 1259 * Returns the host component of this URI. 1260 * 1261 * <p> The host component of a URI, if defined, will have one of the 1262 * following forms: </p> 1263 * 1264 * <ul> 1265 * 1266 * <li><p> A domain name consisting of one or more <i>labels</i> 1267 * separated by period characters ({@code '.'}), optionally followed by 1268 * a period character. Each label consists of <i>alphanum</i> characters 1269 * as well as hyphen characters ({@code '-'}), though hyphens never 1270 * occur as the first or last characters in a label. The rightmost 1271 * label of a domain name consisting of two or more labels, begins 1272 * with an <i>alpha</i> character. </li> 1273 * 1274 * <li><p> A dotted-quad IPv4 address of the form 1275 * <i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +.}<i>digit</i>{@code +}, 1276 * where no <i>digit</i> sequence is longer than three characters and no 1277 * sequence has a value larger than 255. 
</p></li> 1278 * 1279 * <li><p> An IPv6 address enclosed in square brackets ({@code '['} and 1280 * {@code ']'}) and consisting of hexadecimal digits, colon characters 1281 * ({@code ':'}), and possibly an embedded IPv4 address. The full 1282 * syntax of IPv6 addresses is specified in <a 1283 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 1284 * Addressing Architecture</i></a>. </p></li> 1285 * 1286 * </ul> 1287 * 1288 * The host component of a URI cannot contain escaped octets, hence this 1289 * method does not perform any decoding. 1290 * 1291 * @return The host component of this URI, 1292 * or {@code null} if the host is undefined 1293 */ 1294 public String getHost() { 1295 return host; 1296 } 1297 1298 /** 1299 * Returns the port number of this URI. 1300 * 1301 * <p> The port component of a URI, if defined, is a non-negative 1302 * integer. </p> 1303 * 1304 * @return The port component of this URI, 1305 * or {@code -1} if the port is undefined 1306 */ 1307 public int getPort() { 1308 return port; 1309 } 1310 1311 /** 1312 * Returns the raw path component of this URI. 1313 * 1314 * <p> The path component of a URI, if defined, only contains the slash 1315 * character ({@code '/'}), the commercial-at character ({@code '@'}), 1316 * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, 1317 * and <i>other</i> categories. </p> 1318 * 1319 * @return The path component of this URI, 1320 * or {@code null} if the path is undefined 1321 */ 1322 public String getRawPath() { 1323 return path; 1324 } 1325 1326 /** 1327 * Returns the decoded path component of this URI. 1328 * 1329 * <p> The string returned by this method is equal to that returned by the 1330 * {@link #getRawPath() getRawPath} method except that all sequences of 1331 * escaped octets are <a href="#decode">decoded</a>. 
</p> 1332 * 1333 * @return The decoded path component of this URI, 1334 * or {@code null} if the path is undefined 1335 */ 1336 public String getPath() { 1337 String decoded = decodedPath; 1338 if ((decoded == null) && (path != null)) { 1339 decodedPath = decoded = decode(path); 1340 } 1341 return decoded; 1342 } 1343 1344 /** 1345 * Returns the raw query component of this URI. 1346 * 1347 * <p> The query component of a URI, if defined, only contains legal URI 1348 * characters. </p> 1349 * 1350 * @return The raw query component of this URI, 1351 * or {@code null} if the query is undefined 1352 */ 1353 public String getRawQuery() { 1354 return query; 1355 } 1356 1357 /** 1358 * Returns the decoded query component of this URI. 1359 * 1360 * <p> The string returned by this method is equal to that returned by the 1361 * {@link #getRawQuery() getRawQuery} method except that all sequences of 1362 * escaped octets are <a href="#decode">decoded</a>. </p> 1363 * 1364 * @return The decoded query component of this URI, 1365 * or {@code null} if the query is undefined 1366 */ 1367 public String getQuery() { 1368 String decoded = decodedQuery; 1369 if ((decoded == null) && (query != null)) { 1370 decodedQuery = decoded = decode(query, false); 1371 } 1372 return decoded; 1373 } 1374 1375 /** 1376 * Returns the raw fragment component of this URI. 1377 * 1378 * <p> The fragment component of a URI, if defined, only contains legal URI 1379 * characters. </p> 1380 * 1381 * @return The raw fragment component of this URI, 1382 * or {@code null} if the fragment is undefined 1383 */ 1384 public String getRawFragment() { 1385 return fragment; 1386 } 1387 1388 /** 1389 * Returns the decoded fragment component of this URI. 1390 * 1391 * <p> The string returned by this method is equal to that returned by the 1392 * {@link #getRawFragment() getRawFragment} method except that all 1393 * sequences of escaped octets are <a href="#decode">decoded</a>. 
</p> 1394 * 1395 * @return The decoded fragment component of this URI, 1396 * or {@code null} if the fragment is undefined 1397 */ 1398 public String getFragment() { 1399 String decoded = decodedFragment; 1400 if ((decoded == null) && (fragment != null)) { 1401 decodedFragment = decoded = decode(fragment, false); 1402 } 1403 return decoded; 1404 } 1405 1406 1407 // -- Equality, comparison, hash code, toString, and serialization -- 1408 1409 /** 1410 * Tests this URI for equality with another object. 1411 * 1412 * <p> If the given object is not a URI then this method immediately 1413 * returns {@code false}. 1414 * 1415 * <p> For two URIs to be considered equal requires that either both are 1416 * opaque or both are hierarchical. Their schemes must either both be 1417 * undefined or else be equal without regard to case. Their fragments 1418 * must either both be undefined or else be equal. 1419 * 1420 * <p> For two opaque URIs to be considered equal, their scheme-specific 1421 * parts must be equal. 1422 * 1423 * <p> For two hierarchical URIs to be considered equal, their paths must 1424 * be equal and their queries must either both be undefined or else be 1425 * equal. Their authorities must either both be undefined, or both be 1426 * registry-based, or both be server-based. If their authorities are 1427 * defined and are registry-based, then they must be equal. If their 1428 * authorities are defined and are server-based, then their hosts must be 1429 * equal without regard to case, their port numbers must be equal, and 1430 * their user-information components must be equal. 1431 * 1432 * <p> When testing the user-information, path, query, fragment, authority, 1433 * or scheme-specific parts of two URIs for equality, the raw forms rather 1434 * than the encoded forms of these components are compared and the 1435 * hexadecimal digits of escaped octets are compared without regard to 1436 * case. 
1437 * 1438 * <p> This method satisfies the general contract of the {@link 1439 * java.lang.Object#equals(Object) Object.equals} method. </p> 1440 * 1441 * @param ob The object to which this object is to be compared 1442 * 1443 * @return {@code true} if, and only if, the given object is a URI that 1444 * is identical to this URI 1445 */ 1446 public boolean equals(Object ob) { 1447 if (ob == this) 1448 return true; 1449 if (!(ob instanceof URI)) 1450 return false; 1451 URI that = (URI)ob; 1452 if (this.isOpaque() != that.isOpaque()) return false; 1453 if (!equalIgnoringCase(this.scheme, that.scheme)) return false; 1454 if (!equal(this.fragment, that.fragment)) return false; 1455 1456 // Opaque 1457 if (this.isOpaque()) 1458 return equal(this.schemeSpecificPart, that.schemeSpecificPart); 1459 1460 // Hierarchical 1461 if (!equal(this.path, that.path)) return false; 1462 if (!equal(this.query, that.query)) return false; 1463 1464 // Authorities 1465 if (this.authority == that.authority) return true; 1466 if (this.host != null) { 1467 // Server-based 1468 if (!equal(this.userInfo, that.userInfo)) return false; 1469 if (!equalIgnoringCase(this.host, that.host)) return false; 1470 if (this.port != that.port) return false; 1471 } else if (this.authority != null) { 1472 // Registry-based 1473 if (!equal(this.authority, that.authority)) return false; 1474 } else if (this.authority != that.authority) { 1475 return false; 1476 } 1477 1478 return true; 1479 } 1480 1481 /** 1482 * Returns a hash-code value for this URI. The hash code is based upon all 1483 * of the URI's components, and satisfies the general contract of the 1484 * {@link java.lang.Object#hashCode() Object.hashCode} method. 
1485 * 1486 * @return A hash-code value for this URI 1487 */ 1488 public int hashCode() { 1489 int h = hash; 1490 if (h == 0) { 1491 h = hashIgnoringCase(0, scheme); 1492 h = hash(h, fragment); 1493 if (isOpaque()) { 1494 h = hash(h, schemeSpecificPart); 1495 } else { 1496 h = hash(h, path); 1497 h = hash(h, query); 1498 if (host != null) { 1499 h = hash(h, userInfo); 1500 h = hashIgnoringCase(h, host); 1501 h += 1949 * port; 1502 } else { 1503 h = hash(h, authority); 1504 } 1505 } 1506 if (h != 0) { 1507 hash = h; 1508 } 1509 } 1510 return h; 1511 } 1512 1513 /** 1514 * Compares this URI to another object, which must be a URI. 1515 * 1516 * <p> When comparing corresponding components of two URIs, if one 1517 * component is undefined but the other is defined then the first is 1518 * considered to be less than the second. Unless otherwise noted, string 1519 * components are ordered according to their natural, case-sensitive 1520 * ordering as defined by the {@link java.lang.String#compareTo(Object) 1521 * String.compareTo} method. String components that are subject to 1522 * encoding are compared by comparing their raw forms rather than their 1523 * encoded forms. 1524 * 1525 * <p> The ordering of URIs is defined as follows: </p> 1526 * 1527 * <ul> 1528 * 1529 * <li><p> Two URIs with different schemes are ordered according the 1530 * ordering of their schemes, without regard to case. </p></li> 1531 * 1532 * <li><p> A hierarchical URI is considered to be less than an opaque URI 1533 * with an identical scheme. </p></li> 1534 * 1535 * <li><p> Two opaque URIs with identical schemes are ordered according 1536 * to the ordering of their scheme-specific parts. </p></li> 1537 * 1538 * <li><p> Two opaque URIs with identical schemes and scheme-specific 1539 * parts are ordered according to the ordering of their 1540 * fragments. 
</p></li> 1541 * 1542 * <li><p> Two hierarchical URIs with identical schemes are ordered 1543 * according to the ordering of their authority components: </p> 1544 * 1545 * <ul> 1546 * 1547 * <li><p> If both authority components are server-based then the URIs 1548 * are ordered according to their user-information components; if these 1549 * components are identical then the URIs are ordered according to the 1550 * ordering of their hosts, without regard to case; if the hosts are 1551 * identical then the URIs are ordered according to the ordering of 1552 * their ports. </p></li> 1553 * 1554 * <li><p> If one or both authority components are registry-based then 1555 * the URIs are ordered according to the ordering of their authority 1556 * components. </p></li> 1557 * 1558 * </ul></li> 1559 * 1560 * <li><p> Finally, two hierarchical URIs with identical schemes and 1561 * authority components are ordered according to the ordering of their 1562 * paths; if their paths are identical then they are ordered according to 1563 * the ordering of their queries; if the queries are identical then they 1564 * are ordered according to the order of their fragments. </p></li> 1565 * 1566 * </ul> 1567 * 1568 * <p> This method satisfies the general contract of the {@link 1569 * java.lang.Comparable#compareTo(Object) Comparable.compareTo} 1570 * method. 
</p> 1571 * 1572 * @param that 1573 * The object to which this URI is to be compared 1574 * 1575 * @return A negative integer, zero, or a positive integer as this URI is 1576 * less than, equal to, or greater than the given URI 1577 * 1578 * @throws ClassCastException 1579 * If the given object is not a URI 1580 */ 1581 public int compareTo(URI that) { 1582 int c; 1583 1584 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0) 1585 return c; 1586 1587 if (this.isOpaque()) { 1588 if (that.isOpaque()) { 1589 // Both opaque 1590 if ((c = compare(this.schemeSpecificPart, 1591 that.schemeSpecificPart)) != 0) 1592 return c; 1593 return compare(this.fragment, that.fragment); 1594 } 1595 return +1; // Opaque > hierarchical 1596 } else if (that.isOpaque()) { 1597 return -1; // Hierarchical < opaque 1598 } 1599 1600 // Hierarchical 1601 if ((this.host != null) && (that.host != null)) { 1602 // Both server-based 1603 if ((c = compare(this.userInfo, that.userInfo)) != 0) 1604 return c; 1605 if ((c = compareIgnoringCase(this.host, that.host)) != 0) 1606 return c; 1607 if ((c = this.port - that.port) != 0) 1608 return c; 1609 } else { 1610 // If one or both authorities are registry-based then we simply 1611 // compare them in the usual, case-sensitive way. If one is 1612 // registry-based and one is server-based then the strings are 1613 // guaranteed to be unequal, hence the comparison will never return 1614 // zero and the compareTo and equals methods will remain 1615 // consistent. 1616 if ((c = compare(this.authority, that.authority)) != 0) return c; 1617 } 1618 1619 if ((c = compare(this.path, that.path)) != 0) return c; 1620 if ((c = compare(this.query, that.query)) != 0) return c; 1621 return compare(this.fragment, that.fragment); 1622 } 1623 1624 /** 1625 * Returns the content of this URI as a string. 
1626 * 1627 * <p> If this URI was created by invoking one of the constructors in this 1628 * class then a string equivalent to the original input string, or to the 1629 * string computed from the originally-given components, as appropriate, is 1630 * returned. Otherwise this URI was created by normalization, resolution, 1631 * or relativization, and so a string is constructed from this URI's 1632 * components according to the rules specified in <a 1633 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>, 1634 * section 5.2, step 7. </p> 1635 * 1636 * @return The string form of this URI 1637 */ 1638 public String toString() { 1639 String s = string; 1640 if (s == null) { 1641 s = defineString(); 1642 } 1643 return s; 1644 } 1645 1646 private String defineString() { 1647 String s = string; 1648 if (s != null) { 1649 return s; 1650 } 1651 1652 StringBuilder sb = new StringBuilder(); 1653 if (scheme != null) { 1654 sb.append(scheme); 1655 sb.append(':'); 1656 } 1657 if (isOpaque()) { 1658 sb.append(schemeSpecificPart); 1659 } else { 1660 if (host != null) { 1661 sb.append("//"); 1662 if (userInfo != null) { 1663 sb.append(userInfo); 1664 sb.append('@'); 1665 } 1666 boolean needBrackets = ((host.indexOf(':') >= 0) 1667 && !host.startsWith("[") 1668 && !host.endsWith("]")); 1669 if (needBrackets) sb.append('['); 1670 sb.append(host); 1671 if (needBrackets) sb.append(']'); 1672 if (port != -1) { 1673 sb.append(':'); 1674 sb.append(port); 1675 } 1676 } else if (authority != null) { 1677 sb.append("//"); 1678 sb.append(authority); 1679 } 1680 if (path != null) 1681 sb.append(path); 1682 if (query != null) { 1683 sb.append('?'); 1684 sb.append(query); 1685 } 1686 } 1687 if (fragment != null) { 1688 sb.append('#'); 1689 sb.append(fragment); 1690 } 1691 return string = sb.toString(); 1692 } 1693 1694 /** 1695 * Returns the content of this URI as a US-ASCII string. 
1696 * 1697 * <p> If this URI does not contain any characters in the <i>other</i> 1698 * category then an invocation of this method will return the same value as 1699 * an invocation of the {@link #toString() toString} method. Otherwise 1700 * this method works as if by invoking that method and then <a 1701 * href="#encode">encoding</a> the result. </p> 1702 * 1703 * @return The string form of this URI, encoded as needed 1704 * so that it only contains characters in the US-ASCII 1705 * charset 1706 */ 1707 public String toASCIIString() { 1708 return encode(toString()); 1709 } 1710 1711 1712 // -- Serialization support -- 1713 1714 /** 1715 * Saves the content of this URI to the given serial stream. 1716 * 1717 * <p> The only serializable field of a URI instance is its {@code string} 1718 * field. That field is given a value, if it does not have one already, 1719 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()} 1720 * method of the given object-output stream is invoked. </p> 1721 * 1722 * @param os The object-output stream to which this object 1723 * is to be written 1724 */ 1725 private void writeObject(ObjectOutputStream os) 1726 throws IOException 1727 { 1728 defineString(); 1729 os.defaultWriteObject(); // Writes the string field only 1730 } 1731 1732 /** 1733 * Reconstitutes a URI from the given serial stream. 1734 * 1735 * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is 1736 * invoked to read the value of the {@code string} field. The result is 1737 * then parsed in the usual way. 
1738 * 1739 * @param is The object-input stream from which this object 1740 * is being read 1741 */ 1742 private void readObject(ObjectInputStream is) 1743 throws ClassNotFoundException, IOException 1744 { 1745 port = -1; // Argh 1746 is.defaultReadObject(); 1747 try { 1748 new Parser(string).parse(false); 1749 } catch (URISyntaxException x) { 1750 IOException y = new InvalidObjectException("Invalid URI"); 1751 y.initCause(x); 1752 throw y; 1753 } 1754 } 1755 1756 1757 // -- End of public methods -- 1758 1759 1760 // -- Utility methods for string-field comparison and hashing -- 1761 1762 // These methods return appropriate values for null string arguments, 1763 // thereby simplifying the equals, hashCode, and compareTo methods. 1764 // 1765 // The case-ignoring methods should only be applied to strings whose 1766 // characters are all known to be US-ASCII. Because of this restriction, 1767 // these methods are faster than the similar methods in the String class. 1768 1769 // US-ASCII only 1770 private static int toLower(char c) { 1771 if ((c >= 'A') && (c <= 'Z')) 1772 return c + ('a' - 'A'); 1773 return c; 1774 } 1775 1776 // US-ASCII only 1777 private static int toUpper(char c) { 1778 if ((c >= 'a') && (c <= 'z')) 1779 return c - ('a' - 'A'); 1780 return c; 1781 } 1782 1783 private static boolean equal(String s, String t) { 1784 if (s == t) return true; 1785 if ((s != null) && (t != null)) { 1786 if (s.length() != t.length()) 1787 return false; 1788 if (s.indexOf('%') < 0) 1789 return s.equals(t); 1790 int n = s.length(); 1791 for (int i = 0; i < n;) { 1792 char c = s.charAt(i); 1793 char d = t.charAt(i); 1794 if (c != '%') { 1795 if (c != d) 1796 return false; 1797 i++; 1798 continue; 1799 } 1800 if (d != '%') 1801 return false; 1802 i++; 1803 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1804 return false; 1805 i++; 1806 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1807 return false; 1808 i++; 1809 } 1810 return true; 1811 } 1812 return false; 1813 } 
1814 1815 // US-ASCII only 1816 private static boolean equalIgnoringCase(String s, String t) { 1817 if (s == t) return true; 1818 if ((s != null) && (t != null)) { 1819 int n = s.length(); 1820 if (t.length() != n) 1821 return false; 1822 for (int i = 0; i < n; i++) { 1823 if (toLower(s.charAt(i)) != toLower(t.charAt(i))) 1824 return false; 1825 } 1826 return true; 1827 } 1828 return false; 1829 } 1830 1831 private static int hash(int hash, String s) { 1832 if (s == null) return hash; 1833 return s.indexOf('%') < 0 ? hash * 127 + s.hashCode() 1834 : normalizedHash(hash, s); 1835 } 1836 1837 1838 private static int normalizedHash(int hash, String s) { 1839 int h = 0; 1840 for (int index = 0; index < s.length(); index++) { 1841 char ch = s.charAt(index); 1842 h = 31 * h + ch; 1843 if (ch == '%') { 1844 /* 1845 * Process the next two encoded characters 1846 */ 1847 for (int i = index + 1; i < index + 3; i++) 1848 h = 31 * h + toUpper(s.charAt(i)); 1849 index += 2; 1850 } 1851 } 1852 return hash * 127 + h; 1853 } 1854 1855 // US-ASCII only 1856 private static int hashIgnoringCase(int hash, String s) { 1857 if (s == null) return hash; 1858 int h = hash; 1859 int n = s.length(); 1860 for (int i = 0; i < n; i++) 1861 h = 31 * h + toLower(s.charAt(i)); 1862 return h; 1863 } 1864 1865 private static int compare(String s, String t) { 1866 if (s == t) return 0; 1867 if (s != null) { 1868 if (t != null) 1869 return s.compareTo(t); 1870 else 1871 return +1; 1872 } else { 1873 return -1; 1874 } 1875 } 1876 1877 // US-ASCII only 1878 private static int compareIgnoringCase(String s, String t) { 1879 if (s == t) return 0; 1880 if (s != null) { 1881 if (t != null) { 1882 int sn = s.length(); 1883 int tn = t.length(); 1884 int n = sn < tn ? 
sn : tn; 1885 for (int i = 0; i < n; i++) { 1886 int c = toLower(s.charAt(i)) - toLower(t.charAt(i)); 1887 if (c != 0) 1888 return c; 1889 } 1890 return sn - tn; 1891 } 1892 return +1; 1893 } else { 1894 return -1; 1895 } 1896 } 1897 1898 1899 // -- String construction -- 1900 1901 // If a scheme is given then the path, if given, must be absolute 1902 // 1903 private static void checkPath(String s, String scheme, String path) 1904 throws URISyntaxException 1905 { 1906 if (scheme != null) { 1907 if ((path != null) 1908 && ((path.length() > 0) && (path.charAt(0) != '/'))) 1909 throw new URISyntaxException(s, 1910 "Relative path in absolute URI"); 1911 } 1912 } 1913 1914 private void appendAuthority(StringBuilder sb, 1915 String authority, 1916 String userInfo, 1917 String host, 1918 int port) 1919 { 1920 if (host != null) { 1921 sb.append("//"); 1922 if (userInfo != null) { 1923 sb.append(quote(userInfo, L_USERINFO, H_USERINFO)); 1924 sb.append('@'); 1925 } 1926 boolean needBrackets = ((host.indexOf(':') >= 0) 1927 && !host.startsWith("[") 1928 && !host.endsWith("]")); 1929 if (needBrackets) sb.append('['); 1930 sb.append(host); 1931 if (needBrackets) sb.append(']'); 1932 if (port != -1) { 1933 sb.append(':'); 1934 sb.append(port); 1935 } 1936 } else if (authority != null) { 1937 sb.append("//"); 1938 if (authority.startsWith("[")) { 1939 // authority should (but may not) contain an embedded IPv6 address 1940 int end = authority.indexOf(']'); 1941 String doquote = authority, dontquote = ""; 1942 if (end != -1 && authority.indexOf(':') != -1) { 1943 // the authority contains an IPv6 address 1944 if (end == authority.length()) { 1945 dontquote = authority; 1946 doquote = ""; 1947 } else { 1948 dontquote = authority.substring(0 , end + 1); 1949 doquote = authority.substring(end + 1); 1950 } 1951 } 1952 sb.append(dontquote); 1953 sb.append(quote(doquote, 1954 L_REG_NAME | L_SERVER, 1955 H_REG_NAME | H_SERVER)); 1956 } else { 1957 sb.append(quote(authority, 1958 
L_REG_NAME | L_SERVER, 1959 H_REG_NAME | H_SERVER)); 1960 } 1961 } 1962 } 1963 1964 private void appendSchemeSpecificPart(StringBuilder sb, 1965 String opaquePart, 1966 String authority, 1967 String userInfo, 1968 String host, 1969 int port, 1970 String path, 1971 String query) 1972 { 1973 if (opaquePart != null) { 1974 /* check if SSP begins with an IPv6 address 1975 * because we must not quote a literal IPv6 address 1976 */ 1977 if (opaquePart.startsWith("//[")) { 1978 int end = opaquePart.indexOf(']'); 1979 if (end != -1 && opaquePart.indexOf(':')!=-1) { 1980 String doquote, dontquote; 1981 if (end == opaquePart.length()) { 1982 dontquote = opaquePart; 1983 doquote = ""; 1984 } else { 1985 dontquote = opaquePart.substring(0,end+1); 1986 doquote = opaquePart.substring(end+1); 1987 } 1988 sb.append (dontquote); 1989 sb.append(quote(doquote, L_URIC, H_URIC)); 1990 } 1991 } else { 1992 sb.append(quote(opaquePart, L_URIC, H_URIC)); 1993 } 1994 } else { 1995 appendAuthority(sb, authority, userInfo, host, port); 1996 if (path != null) 1997 sb.append(quote(path, L_PATH, H_PATH)); 1998 if (query != null) { 1999 sb.append('?'); 2000 sb.append(quote(query, L_URIC, H_URIC)); 2001 } 2002 } 2003 } 2004 2005 private void appendFragment(StringBuilder sb, String fragment) { 2006 if (fragment != null) { 2007 sb.append('#'); 2008 sb.append(quote(fragment, L_URIC, H_URIC)); 2009 } 2010 } 2011 2012 private String toString(String scheme, 2013 String opaquePart, 2014 String authority, 2015 String userInfo, 2016 String host, 2017 int port, 2018 String path, 2019 String query, 2020 String fragment) 2021 { 2022 StringBuilder sb = new StringBuilder(); 2023 if (scheme != null) { 2024 sb.append(scheme); 2025 sb.append(':'); 2026 } 2027 appendSchemeSpecificPart(sb, opaquePart, 2028 authority, userInfo, host, port, 2029 path, query); 2030 appendFragment(sb, fragment); 2031 return sb.toString(); 2032 } 2033 2034 // -- Normalization, resolution, and relativization -- 2035 2036 // RFC2396 5.2 
(6) 2037 private static String resolvePath(String base, String child, 2038 boolean absolute) 2039 { 2040 int i = base.lastIndexOf('/'); 2041 int cn = child.length(); 2042 String path = ""; 2043 2044 if (cn == 0) { 2045 // 5.2 (6a) 2046 if (i >= 0) 2047 path = base.substring(0, i + 1); 2048 } else { 2049 StringBuilder sb = new StringBuilder(base.length() + cn); 2050 // 5.2 (6a) 2051 if (i >= 0) 2052 sb.append(base, 0, i + 1); 2053 // 5.2 (6b) 2054 sb.append(child); 2055 path = sb.toString(); 2056 } 2057 2058 // 5.2 (6c-f) 2059 String np = normalize(path); 2060 2061 // 5.2 (6g): If the result is absolute but the path begins with "../", 2062 // then we simply leave the path as-is 2063 2064 return np; 2065 } 2066 2067 // RFC2396 5.2 2068 private static URI resolve(URI base, URI child) { 2069 // check if child if opaque first so that NPE is thrown 2070 // if child is null. 2071 if (child.isOpaque() || base.isOpaque()) 2072 return child; 2073 2074 // 5.2 (2): Reference to current document (lone fragment) 2075 if ((child.scheme == null) && (child.authority == null) 2076 && child.path.isEmpty() && (child.fragment != null) 2077 && (child.query == null)) { 2078 if ((base.fragment != null) 2079 && child.fragment.equals(base.fragment)) { 2080 return base; 2081 } 2082 URI ru = new URI(); 2083 ru.scheme = base.scheme; 2084 ru.authority = base.authority; 2085 ru.userInfo = base.userInfo; 2086 ru.host = base.host; 2087 ru.port = base.port; 2088 ru.path = base.path; 2089 ru.fragment = child.fragment; 2090 ru.query = base.query; 2091 return ru; 2092 } 2093 2094 // 5.2 (3): Child is absolute 2095 if (child.scheme != null) 2096 return child; 2097 2098 URI ru = new URI(); // Resolved URI 2099 ru.scheme = base.scheme; 2100 ru.query = child.query; 2101 ru.fragment = child.fragment; 2102 2103 // 5.2 (4): Authority 2104 if (child.authority == null) { 2105 ru.authority = base.authority; 2106 ru.host = base.host; 2107 ru.userInfo = base.userInfo; 2108 ru.port = base.port; 2109 2110 String cp 
= (child.path == null) ? "" : child.path; 2111 if ((cp.length() > 0) && (cp.charAt(0) == '/')) { 2112 // 5.2 (5): Child path is absolute 2113 ru.path = child.path; 2114 } else { 2115 // 5.2 (6): Resolve relative path 2116 ru.path = resolvePath(base.path, cp, base.isAbsolute()); 2117 } 2118 } else { 2119 ru.authority = child.authority; 2120 ru.host = child.host; 2121 ru.userInfo = child.userInfo; 2122 ru.host = child.host; 2123 ru.port = child.port; 2124 ru.path = child.path; 2125 } 2126 2127 // 5.2 (7): Recombine (nothing to do here) 2128 return ru; 2129 } 2130 2131 // If the given URI's path is normal then return the URI; 2132 // o.w., return a new URI containing the normalized path. 2133 // 2134 private static URI normalize(URI u) { 2135 if (u.isOpaque() || (u.path == null) || (u.path.length() == 0)) 2136 return u; 2137 2138 String np = normalize(u.path); 2139 if (np == u.path) 2140 return u; 2141 2142 URI v = new URI(); 2143 v.scheme = u.scheme; 2144 v.fragment = u.fragment; 2145 v.authority = u.authority; 2146 v.userInfo = u.userInfo; 2147 v.host = u.host; 2148 v.port = u.port; 2149 v.path = np; 2150 v.query = u.query; 2151 return v; 2152 } 2153 2154 // If both URIs are hierarchical, their scheme and authority components are 2155 // identical, and the base path is a prefix of the child's path, then 2156 // return a relative URI that, when resolved against the base, yields the 2157 // child; otherwise, return the child. 2158 // 2159 private static URI relativize(URI base, URI child) { 2160 // check if child if opaque first so that NPE is thrown 2161 // if child is null. 
2162 if (child.isOpaque() || base.isOpaque()) 2163 return child; 2164 if (!equalIgnoringCase(base.scheme, child.scheme) 2165 || !equal(base.authority, child.authority)) 2166 return child; 2167 2168 String bp = normalize(base.path); 2169 String cp = normalize(child.path); 2170 if (!bp.equals(cp)) { 2171 if (!bp.endsWith("/")) 2172 bp = bp + "/"; 2173 if (!cp.startsWith(bp)) 2174 return child; 2175 } 2176 2177 URI v = new URI(); 2178 v.path = cp.substring(bp.length()); 2179 v.query = child.query; 2180 v.fragment = child.fragment; 2181 return v; 2182 } 2183 2184 2185 2186 // -- Path normalization -- 2187 2188 // The following algorithm for path normalization avoids the creation of a 2189 // string object for each segment, as well as the use of a string buffer to 2190 // compute the final result, by using a single char array and editing it in 2191 // place. The array is first split into segments, replacing each slash 2192 // with '\0' and creating a segment-index array, each element of which is 2193 // the index of the first char in the corresponding segment. We then walk 2194 // through both arrays, removing ".", "..", and other segments as necessary 2195 // by setting their entries in the index array to -1. Finally, the two 2196 // arrays are used to rejoin the segments and compute the final result. 2197 // 2198 // This code is based upon src/solaris/native/java/io/canonicalize_md.c 2199 2200 2201 // Check the given path to see if it might need normalization. A path 2202 // might need normalization if it contains duplicate slashes, a "." 2203 // segment, or a ".." segment. Return -1 if no further normalization is 2204 // possible, otherwise return the number of segments found. 2205 // 2206 // This method takes a string argument rather than a char array so that 2207 // this test can be performed without invoking path.toCharArray(). 
2208 // 2209 private static int needsNormalization(String path) { 2210 boolean normal = true; 2211 int ns = 0; // Number of segments 2212 int end = path.length() - 1; // Index of last char in path 2213 int p = 0; // Index of next char in path 2214 2215 // Skip initial slashes 2216 while (p <= end) { 2217 if (path.charAt(p) != '/') break; 2218 p++; 2219 } 2220 if (p > 1) normal = false; 2221 2222 // Scan segments 2223 while (p <= end) { 2224 2225 // Looking at "." or ".." ? 2226 if ((path.charAt(p) == '.') 2227 && ((p == end) 2228 || ((path.charAt(p + 1) == '/') 2229 || ((path.charAt(p + 1) == '.') 2230 && ((p + 1 == end) 2231 || (path.charAt(p + 2) == '/')))))) { 2232 normal = false; 2233 } 2234 ns++; 2235 2236 // Find beginning of next segment 2237 while (p <= end) { 2238 if (path.charAt(p++) != '/') 2239 continue; 2240 2241 // Skip redundant slashes 2242 while (p <= end) { 2243 if (path.charAt(p) != '/') break; 2244 normal = false; 2245 p++; 2246 } 2247 2248 break; 2249 } 2250 } 2251 2252 return normal ? -1 : ns; 2253 } 2254 2255 2256 // Split the given path into segments, replacing slashes with nulls and 2257 // filling in the given segment-index array. 
2258 // 2259 // Preconditions: 2260 // segs.length == Number of segments in path 2261 // 2262 // Postconditions: 2263 // All slashes in path replaced by '\0' 2264 // segs[i] == Index of first char in segment i (0 <= i < segs.length) 2265 // 2266 private static void split(char[] path, int[] segs) { 2267 int end = path.length - 1; // Index of last char in path 2268 int p = 0; // Index of next char in path 2269 int i = 0; // Index of current segment 2270 2271 // Skip initial slashes 2272 while (p <= end) { 2273 if (path[p] != '/') break; 2274 path[p] = '\0'; 2275 p++; 2276 } 2277 2278 while (p <= end) { 2279 2280 // Note start of segment 2281 segs[i++] = p++; 2282 2283 // Find beginning of next segment 2284 while (p <= end) { 2285 if (path[p++] != '/') 2286 continue; 2287 path[p - 1] = '\0'; 2288 2289 // Skip redundant slashes 2290 while (p <= end) { 2291 if (path[p] != '/') break; 2292 path[p++] = '\0'; 2293 } 2294 break; 2295 } 2296 } 2297 2298 if (i != segs.length) 2299 throw new InternalError(); // ASSERT 2300 } 2301 2302 2303 // Join the segments in the given path according to the given segment-index 2304 // array, ignoring those segments whose index entries have been set to -1, 2305 // and inserting slashes as needed. Return the length of the resulting 2306 // path. 2307 // 2308 // Preconditions: 2309 // segs[i] == -1 implies segment i is to be ignored 2310 // path computed by split, as above, with '\0' having replaced '/' 2311 // 2312 // Postconditions: 2313 // path[0] .. 
path[return value] == Resulting path 2314 // 2315 private static int join(char[] path, int[] segs) { 2316 int ns = segs.length; // Number of segments 2317 int end = path.length - 1; // Index of last char in path 2318 int p = 0; // Index of next path char to write 2319 2320 if (path[p] == '\0') { 2321 // Restore initial slash for absolute paths 2322 path[p++] = '/'; 2323 } 2324 2325 for (int i = 0; i < ns; i++) { 2326 int q = segs[i]; // Current segment 2327 if (q == -1) 2328 // Ignore this segment 2329 continue; 2330 2331 if (p == q) { 2332 // We're already at this segment, so just skip to its end 2333 while ((p <= end) && (path[p] != '\0')) 2334 p++; 2335 if (p <= end) { 2336 // Preserve trailing slash 2337 path[p++] = '/'; 2338 } 2339 } else if (p < q) { 2340 // Copy q down to p 2341 while ((q <= end) && (path[q] != '\0')) 2342 path[p++] = path[q++]; 2343 if (q <= end) { 2344 // Preserve trailing slash 2345 path[p++] = '/'; 2346 } 2347 } else 2348 throw new InternalError(); // ASSERT false 2349 } 2350 2351 return p; 2352 } 2353 2354 2355 // Remove "." segments from the given path, and remove segment pairs 2356 // consisting of a non-".." segment followed by a ".." segment. 2357 // 2358 private static void removeDots(char[] path, int[] segs) { 2359 int ns = segs.length; 2360 int end = path.length - 1; 2361 2362 for (int i = 0; i < ns; i++) { 2363 int dots = 0; // Number of dots found (0, 1, or 2) 2364 2365 // Find next occurrence of "." or ".." 2366 do { 2367 int p = segs[i]; 2368 if (path[p] == '.') { 2369 if (p == end) { 2370 dots = 1; 2371 break; 2372 } else if (path[p + 1] == '\0') { 2373 dots = 1; 2374 break; 2375 } else if ((path[p + 1] == '.') 2376 && ((p + 1 == end) 2377 || (path[p + 2] == '\0'))) { 2378 dots = 2; 2379 break; 2380 } 2381 } 2382 i++; 2383 } while (i < ns); 2384 if ((i > ns) || (dots == 0)) 2385 break; 2386 2387 if (dots == 1) { 2388 // Remove this occurrence of "." 2389 segs[i] = -1; 2390 } else { 2391 // If there is a preceding non-".." 
segment, remove both that 2392 // segment and this occurrence of ".."; otherwise, leave this 2393 // ".." segment as-is. 2394 int j; 2395 for (j = i - 1; j >= 0; j--) { 2396 if (segs[j] != -1) break; 2397 } 2398 if (j >= 0) { 2399 int q = segs[j]; 2400 if (!((path[q] == '.') 2401 && (path[q + 1] == '.') 2402 && (path[q + 2] == '\0'))) { 2403 segs[i] = -1; 2404 segs[j] = -1; 2405 } 2406 } 2407 } 2408 } 2409 } 2410 2411 2412 // DEVIATION: If the normalized path is relative, and if the first 2413 // segment could be parsed as a scheme name, then prepend a "." segment 2414 // 2415 private static void maybeAddLeadingDot(char[] path, int[] segs) { 2416 2417 if (path[0] == '\0') 2418 // The path is absolute 2419 return; 2420 2421 int ns = segs.length; 2422 int f = 0; // Index of first segment 2423 while (f < ns) { 2424 if (segs[f] >= 0) 2425 break; 2426 f++; 2427 } 2428 if ((f >= ns) || (f == 0)) 2429 // The path is empty, or else the original first segment survived, 2430 // in which case we already know that no leading "." is needed 2431 return; 2432 2433 int p = segs[f]; 2434 while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++; 2435 if (p >= path.length || path[p] == '\0') 2436 // No colon in first segment, so no "." needed 2437 return; 2438 2439 // At this point we know that the first segment is unused, 2440 // hence we can insert a "." segment at that position 2441 path[0] = '.'; 2442 path[1] = '\0'; 2443 segs[0] = 0; 2444 } 2445 2446 2447 // Normalize the given path string. A normal path string has no empty 2448 // segments (i.e., occurrences of "//"), no segments equal to ".", and no 2449 // segments equal to ".." that are preceded by a segment not equal to "..". 2450 // In contrast to Unix-style pathname normalization, for URI paths we 2451 // always retain trailing slashes. 2452 // 2453 private static String normalize(String ps) { 2454 2455 // Does this path need normalization? 
2456 int ns = needsNormalization(ps); // Number of segments 2457 if (ns < 0) 2458 // Nope -- just return it 2459 return ps; 2460 2461 char[] path = ps.toCharArray(); // Path in char-array form 2462 2463 // Split path into segments 2464 int[] segs = new int[ns]; // Segment-index array 2465 split(path, segs); 2466 2467 // Remove dots 2468 removeDots(path, segs); 2469 2470 // Prevent scheme-name confusion 2471 maybeAddLeadingDot(path, segs); 2472 2473 // Join the remaining segments and return the result 2474 String s = new String(path, 0, join(path, segs)); 2475 if (s.equals(ps)) { 2476 // string was already normalized 2477 return ps; 2478 } 2479 return s; 2480 } 2481 2482 2483 2484 // -- Character classes for parsing -- 2485 2486 // RFC2396 precisely specifies which characters in the US-ASCII charset are 2487 // permissible in the various components of a URI reference. We here 2488 // define a set of mask pairs to aid in enforcing these restrictions. Each 2489 // mask pair consists of two longs, a low mask and a high mask. Taken 2490 // together they represent a 128-bit mask, where bit i is set iff the 2491 // character with value i is permitted. 2492 // 2493 // This approach is more efficient than sequentially searching arrays of 2494 // permitted characters. It could be made still more efficient by 2495 // precompiling the mask information so that a character's presence in a 2496 // given mask could be determined by a single table lookup. 
2497 2498 // Compute the low-order mask for the characters in the given string 2499 private static long lowMask(String chars) { 2500 int n = chars.length(); 2501 long m = 0; 2502 for (int i = 0; i < n; i++) { 2503 char c = chars.charAt(i); 2504 if (c < 64) 2505 m |= (1L << c); 2506 } 2507 return m; 2508 } 2509 2510 // Compute the high-order mask for the characters in the given string 2511 private static long highMask(String chars) { 2512 int n = chars.length(); 2513 long m = 0; 2514 for (int i = 0; i < n; i++) { 2515 char c = chars.charAt(i); 2516 if ((c >= 64) && (c < 128)) 2517 m |= (1L << (c - 64)); 2518 } 2519 return m; 2520 } 2521 2522 // Compute a low-order mask for the characters 2523 // between first and last, inclusive 2524 private static long lowMask(char first, char last) { 2525 long m = 0; 2526 int f = Math.max(Math.min(first, 63), 0); 2527 int l = Math.max(Math.min(last, 63), 0); 2528 for (int i = f; i <= l; i++) 2529 m |= 1L << i; 2530 return m; 2531 } 2532 2533 // Compute a high-order mask for the characters 2534 // between first and last, inclusive 2535 private static long highMask(char first, char last) { 2536 long m = 0; 2537 int f = Math.max(Math.min(first, 127), 64) - 64; 2538 int l = Math.max(Math.min(last, 127), 64) - 64; 2539 for (int i = f; i <= l; i++) 2540 m |= 1L << i; 2541 return m; 2542 } 2543 2544 // Tell whether the given character is permitted by the given mask pair 2545 private static boolean match(char c, long lowMask, long highMask) { 2546 if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches. 2547 return false; 2548 if (c < 64) 2549 return ((1L << c) & lowMask) != 0; 2550 if (c < 128) 2551 return ((1L << (c - 64)) & highMask) != 0; 2552 return false; 2553 } 2554 2555 // Character-class masks, in reverse order from RFC2396 because 2556 // initializers for static fields cannot make forward references. 
2557 2558 // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | 2559 // "8" | "9" 2560 private static final long L_DIGIT = lowMask('0', '9'); 2561 private static final long H_DIGIT = 0L; 2562 2563 // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | 2564 // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | 2565 // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" 2566 private static final long L_UPALPHA = 0L; 2567 private static final long H_UPALPHA = highMask('A', 'Z'); 2568 2569 // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | 2570 // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | 2571 // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" 2572 private static final long L_LOWALPHA = 0L; 2573 private static final long H_LOWALPHA = highMask('a', 'z'); 2574 2575 // alpha = lowalpha | upalpha 2576 private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA; 2577 private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA; 2578 2579 // alphanum = alpha | digit 2580 private static final long L_ALPHANUM = L_DIGIT | L_ALPHA; 2581 private static final long H_ALPHANUM = H_DIGIT | H_ALPHA; 2582 2583 // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | 2584 // "a" | "b" | "c" | "d" | "e" | "f" 2585 private static final long L_HEX = L_DIGIT; 2586 private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f'); 2587 2588 // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | 2589 // "(" | ")" 2590 private static final long L_MARK = lowMask("-_.!~*'()"); 2591 private static final long H_MARK = highMask("-_.!~*'()"); 2592 2593 // unreserved = alphanum | mark 2594 private static final long L_UNRESERVED = L_ALPHANUM | L_MARK; 2595 private static final long H_UNRESERVED = H_ALPHANUM | H_MARK; 2596 2597 // reserved = ";" | "/" | "?" 
| ":" | "@" | "&" | "=" | "+" | 2598 // "$" | "," | "[" | "]" 2599 // Added per RFC2732: "[", "]" 2600 private static final long L_RESERVED = lowMask(";/?:@&=+$,[]"); 2601 private static final long H_RESERVED = highMask(";/?:@&=+$,[]"); 2602 2603 // The zero'th bit is used to indicate that escape pairs and non-US-ASCII 2604 // characters are allowed; this is handled by the scanEscape method below. 2605 private static final long L_ESCAPED = 1L; 2606 private static final long H_ESCAPED = 0L; 2607 2608 // uric = reserved | unreserved | escaped 2609 private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED; 2610 private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED; 2611 2612 // pchar = unreserved | escaped | 2613 // ":" | "@" | "&" | "=" | "+" | "$" | "," 2614 private static final long L_PCHAR 2615 = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,"); 2616 private static final long H_PCHAR 2617 = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,"); 2618 2619 // All valid path characters 2620 private static final long L_PATH = L_PCHAR | lowMask(";/"); 2621 private static final long H_PATH = H_PCHAR | highMask(";/"); 2622 2623 // Dash, for use in domainlabel and toplabel 2624 private static final long L_DASH = lowMask("-"); 2625 private static final long H_DASH = highMask("-"); 2626 2627 // Dot, for use in hostnames 2628 private static final long L_DOT = lowMask("."); 2629 private static final long H_DOT = highMask("."); 2630 2631 // userinfo = *( unreserved | escaped | 2632 // ";" | ":" | "&" | "=" | "+" | "$" | "," ) 2633 private static final long L_USERINFO 2634 = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,"); 2635 private static final long H_USERINFO 2636 = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,"); 2637 2638 // reg_name = 1*( unreserved | escaped | "$" | "," | 2639 // ";" | ":" | "@" | "&" | "=" | "+" ) 2640 private static final long L_REG_NAME 2641 = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+"); 2642 private static final long 
H_REG_NAME 2643 = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+"); 2644 2645 // All valid characters for server-based authorities 2646 private static final long L_SERVER 2647 = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]"); 2648 private static final long H_SERVER 2649 = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]"); 2650 2651 // Special case of server authority that represents an IPv6 address 2652 // In this case, a % does not signify an escape sequence 2653 private static final long L_SERVER_PERCENT 2654 = L_SERVER | lowMask("%"); 2655 private static final long H_SERVER_PERCENT 2656 = H_SERVER | highMask("%"); 2657 private static final long L_LEFT_BRACKET = lowMask("["); 2658 private static final long H_LEFT_BRACKET = highMask("["); 2659 2660 // scheme = alpha *( alpha | digit | "+" | "-" | "." ) 2661 private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-."); 2662 private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-."); 2663 2664 // scope_id = alpha | digit | "_" | "." 
2665 private static final long L_SCOPE_ID 2666 = L_ALPHANUM | lowMask("_."); 2667 private static final long H_SCOPE_ID 2668 = H_ALPHANUM | highMask("_."); 2669 2670 // -- Escaping and encoding -- 2671 2672 private static final char[] hexDigits = { 2673 '0', '1', '2', '3', '4', '5', '6', '7', 2674 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' 2675 }; 2676 2677 private static void appendEscape(StringBuilder sb, byte b) { 2678 sb.append('%'); 2679 sb.append(hexDigits[(b >> 4) & 0x0f]); 2680 sb.append(hexDigits[(b >> 0) & 0x0f]); 2681 } 2682 2683 private static void appendEncoded(StringBuilder sb, char c) { 2684 ByteBuffer bb = null; 2685 try { 2686 bb = ThreadLocalCoders.encoderFor("UTF-8") 2687 .encode(CharBuffer.wrap("" + c)); 2688 } catch (CharacterCodingException x) { 2689 assert false; 2690 } 2691 while (bb.hasRemaining()) { 2692 int b = bb.get() & 0xff; 2693 if (b >= 0x80) 2694 appendEscape(sb, (byte)b); 2695 else 2696 sb.append((char)b); 2697 } 2698 } 2699 2700 // Quote any characters in s that are not permitted 2701 // by the given mask pair 2702 // 2703 private static String quote(String s, long lowMask, long highMask) { 2704 StringBuilder sb = null; 2705 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0); 2706 for (int i = 0; i < s.length(); i++) { 2707 char c = s.charAt(i); 2708 if (c < '\u0080') { 2709 if (!match(c, lowMask, highMask)) { 2710 if (sb == null) { 2711 sb = new StringBuilder(); 2712 sb.append(s, 0, i); 2713 } 2714 appendEscape(sb, (byte)c); 2715 } else { 2716 if (sb != null) 2717 sb.append(c); 2718 } 2719 } else if (allowNonASCII 2720 && (Character.isSpaceChar(c) 2721 || Character.isISOControl(c))) { 2722 if (sb == null) { 2723 sb = new StringBuilder(); 2724 sb.append(s, 0, i); 2725 } 2726 appendEncoded(sb, c); 2727 } else { 2728 if (sb != null) 2729 sb.append(c); 2730 } 2731 } 2732 return (sb == null) ? 
s : sb.toString(); 2733 } 2734 2735 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets, 2736 // assuming that s is otherwise legal 2737 // 2738 private static String encode(String s) { 2739 int n = s.length(); 2740 if (n == 0) 2741 return s; 2742 2743 // First check whether we actually need to encode 2744 for (int i = 0;;) { 2745 if (s.charAt(i) >= '\u0080') 2746 break; 2747 if (++i >= n) 2748 return s; 2749 } 2750 2751 String ns = Normalizer.normalize(s, Normalizer.Form.NFC); 2752 ByteBuffer bb = null; 2753 try { 2754 bb = ThreadLocalCoders.encoderFor("UTF-8") 2755 .encode(CharBuffer.wrap(ns)); 2756 } catch (CharacterCodingException x) { 2757 assert false; 2758 } 2759 2760 StringBuilder sb = new StringBuilder(); 2761 while (bb.hasRemaining()) { 2762 int b = bb.get() & 0xff; 2763 if (b >= 0x80) 2764 appendEscape(sb, (byte)b); 2765 else 2766 sb.append((char)b); 2767 } 2768 return sb.toString(); 2769 } 2770 2771 private static int decode(char c) { 2772 if ((c >= '0') && (c <= '9')) 2773 return c - '0'; 2774 if ((c >= 'a') && (c <= 'f')) 2775 return c - 'a' + 10; 2776 if ((c >= 'A') && (c <= 'F')) 2777 return c - 'A' + 10; 2778 assert false; 2779 return -1; 2780 } 2781 2782 private static byte decode(char c1, char c2) { 2783 return (byte)( ((decode(c1) & 0xf) << 4) 2784 | ((decode(c2) & 0xf) << 0)); 2785 } 2786 2787 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes 2788 // that escapes are well-formed syntactically, i.e., of the form %XX. If a 2789 // sequence of escaped octets is not valid UTF-8 then the erroneous octets 2790 // are replaced with '\uFFFD'. 2791 // Exception: any "%" found between "[]" is left alone. 
It is an IPv6 literal 2792 // with a scope_id 2793 // 2794 private static String decode(String s) { 2795 return decode(s, true); 2796 } 2797 2798 // This method was introduced as a generalization of URI.decode method 2799 // to provide a fix for JDK-8037396 2800 private static String decode(String s, boolean ignorePercentInBrackets) { 2801 if (s == null) 2802 return s; 2803 int n = s.length(); 2804 if (n == 0) 2805 return s; 2806 if (s.indexOf('%') < 0) 2807 return s; 2808 2809 StringBuilder sb = new StringBuilder(n); 2810 ByteBuffer bb = ByteBuffer.allocate(n); 2811 CharBuffer cb = CharBuffer.allocate(n); 2812 CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8") 2813 .onMalformedInput(CodingErrorAction.REPLACE) 2814 .onUnmappableCharacter(CodingErrorAction.REPLACE); 2815 2816 // This is not horribly efficient, but it will do for now 2817 char c = s.charAt(0); 2818 boolean betweenBrackets = false; 2819 2820 for (int i = 0; i < n;) { 2821 assert c == s.charAt(i); // Loop invariant 2822 if (c == '[') { 2823 betweenBrackets = true; 2824 } else if (betweenBrackets && c == ']') { 2825 betweenBrackets = false; 2826 } 2827 if (c != '%' || (betweenBrackets && ignorePercentInBrackets)) { 2828 sb.append(c); 2829 if (++i >= n) 2830 break; 2831 c = s.charAt(i); 2832 continue; 2833 } 2834 bb.clear(); 2835 int ui = i; 2836 for (;;) { 2837 assert (n - i >= 2); 2838 bb.put(decode(s.charAt(++i), s.charAt(++i))); 2839 if (++i >= n) 2840 break; 2841 c = s.charAt(i); 2842 if (c != '%') 2843 break; 2844 } 2845 bb.flip(); 2846 cb.clear(); 2847 dec.reset(); 2848 CoderResult cr = dec.decode(bb, cb, true); 2849 assert cr.isUnderflow(); 2850 cr = dec.flush(cb); 2851 assert cr.isUnderflow(); 2852 sb.append(cb.flip().toString()); 2853 } 2854 2855 return sb.toString(); 2856 } 2857 2858 2859 // -- Parsing -- 2860 2861 // For convenience we wrap the input URI string in a new instance of the 2862 // following internal class. 
This saves always having to pass the input 2863 // string as an argument to each internal scan/parse method. 2864 2865 private class Parser { 2866 2867 private String input; // URI input string 2868 private boolean requireServerAuthority = false; 2869 2870 Parser(String s) { 2871 input = s; 2872 string = s; 2873 } 2874 2875 // -- Methods for throwing URISyntaxException in various ways -- 2876 2877 private void fail(String reason) throws URISyntaxException { 2878 throw new URISyntaxException(input, reason); 2879 } 2880 2881 private void fail(String reason, int p) throws URISyntaxException { 2882 throw new URISyntaxException(input, reason, p); 2883 } 2884 2885 private void failExpecting(String expected, int p) 2886 throws URISyntaxException 2887 { 2888 fail("Expected " + expected, p); 2889 } 2890 2891 2892 // -- Simple access to the input string -- 2893 2894 // Tells whether start < end and, if so, whether charAt(start) == c 2895 // 2896 private boolean at(int start, int end, char c) { 2897 return (start < end) && (input.charAt(start) == c); 2898 } 2899 2900 // Tells whether start + s.length() < end and, if so, 2901 // whether the chars at the start position match s exactly 2902 // 2903 private boolean at(int start, int end, String s) { 2904 int p = start; 2905 int sn = s.length(); 2906 if (sn > end - p) 2907 return false; 2908 int i = 0; 2909 while (i < sn) { 2910 if (input.charAt(p++) != s.charAt(i)) { 2911 break; 2912 } 2913 i++; 2914 } 2915 return (i == sn); 2916 } 2917 2918 2919 // -- Scanning -- 2920 2921 // The various scan and parse methods that follow use a uniform 2922 // convention of taking the current start position and end index as 2923 // their first two arguments. The start is inclusive while the end is 2924 // exclusive, just as in the String class, i.e., a start/end pair 2925 // denotes the left-open interval [start, end) of the input string. 2926 // 2927 // These methods never proceed past the end position. 
They may return 2928 // -1 to indicate outright failure, but more often they simply return 2929 // the position of the first char after the last char scanned. Thus 2930 // a typical idiom is 2931 // 2932 // int p = start; 2933 // int q = scan(p, end, ...); 2934 // if (q > p) 2935 // // We scanned something 2936 // ...; 2937 // else if (q == p) 2938 // // We scanned nothing 2939 // ...; 2940 // else if (q == -1) 2941 // // Something went wrong 2942 // ...; 2943 2944 2945 // Scan a specific char: If the char at the given start position is 2946 // equal to c, return the index of the next char; otherwise, return the 2947 // start position. 2948 // 2949 private int scan(int start, int end, char c) { 2950 if ((start < end) && (input.charAt(start) == c)) 2951 return start + 1; 2952 return start; 2953 } 2954 2955 // Scan forward from the given start position. Stop at the first char 2956 // in the err string (in which case -1 is returned), or the first char 2957 // in the stop string (in which case the index of the preceding char is 2958 // returned), or the end of the input string (in which case the length 2959 // of the input string is returned). May return the start position if 2960 // nothing matches. 2961 // 2962 private int scan(int start, int end, String err, String stop) { 2963 int p = start; 2964 while (p < end) { 2965 char c = input.charAt(p); 2966 if (err.indexOf(c) >= 0) 2967 return -1; 2968 if (stop.indexOf(c) >= 0) 2969 break; 2970 p++; 2971 } 2972 return p; 2973 } 2974 2975 // Scan forward from the given start position. Stop at the first char 2976 // in the stop string (in which case the index of the preceding char is 2977 // returned), or the end of the input string (in which case the length 2978 // of the input string is returned). May return the start position if 2979 // nothing matches. 
2980 // 2981 private int scan(int start, int end, String stop) { 2982 int p = start; 2983 while (p < end) { 2984 char c = input.charAt(p); 2985 if (stop.indexOf(c) >= 0) 2986 break; 2987 p++; 2988 } 2989 return p; 2990 } 2991 2992 // Scan a potential escape sequence, starting at the given position, 2993 // with the given first char (i.e., charAt(start) == c). 2994 // 2995 // This method assumes that if escapes are allowed then visible 2996 // non-US-ASCII chars are also allowed. 2997 // 2998 private int scanEscape(int start, int n, char first) 2999 throws URISyntaxException 3000 { 3001 int p = start; 3002 char c = first; 3003 if (c == '%') { 3004 // Process escape pair 3005 if ((p + 3 <= n) 3006 && match(input.charAt(p + 1), L_HEX, H_HEX) 3007 && match(input.charAt(p + 2), L_HEX, H_HEX)) { 3008 return p + 3; 3009 } 3010 fail("Malformed escape pair", p); 3011 } else if ((c > 128) 3012 && !Character.isSpaceChar(c) 3013 && !Character.isISOControl(c)) { 3014 // Allow unescaped but visible non-US-ASCII chars 3015 return p + 1; 3016 } 3017 return p; 3018 } 3019 3020 // Scan chars that match the given mask pair 3021 // 3022 private int scan(int start, int n, long lowMask, long highMask) 3023 throws URISyntaxException 3024 { 3025 int p = start; 3026 while (p < n) { 3027 char c = input.charAt(p); 3028 if (match(c, lowMask, highMask)) { 3029 p++; 3030 continue; 3031 } 3032 if ((lowMask & L_ESCAPED) != 0) { 3033 int q = scanEscape(p, n, c); 3034 if (q > p) { 3035 p = q; 3036 continue; 3037 } 3038 } 3039 break; 3040 } 3041 return p; 3042 } 3043 3044 // Check that each of the chars in [start, end) matches the given mask 3045 // 3046 private void checkChars(int start, int end, 3047 long lowMask, long highMask, 3048 String what) 3049 throws URISyntaxException 3050 { 3051 int p = scan(start, end, lowMask, highMask); 3052 if (p < end) 3053 fail("Illegal character in " + what, p); 3054 } 3055 3056 // Check that the char at position p matches the given mask 3057 // 3058 private void 
checkChar(int p, 3059 long lowMask, long highMask, 3060 String what) 3061 throws URISyntaxException 3062 { 3063 checkChars(p, p + 1, lowMask, highMask, what); 3064 } 3065 3066 3067 // -- Parsing -- 3068 3069 // [<scheme>:]<scheme-specific-part>[#<fragment>] 3070 // 3071 void parse(boolean rsa) throws URISyntaxException { 3072 requireServerAuthority = rsa; 3073 int n = input.length(); 3074 int p = scan(0, n, "/?#", ":"); 3075 if ((p >= 0) && at(p, n, ':')) { 3076 if (p == 0) 3077 failExpecting("scheme name", 0); 3078 checkChar(0, L_ALPHA, H_ALPHA, "scheme name"); 3079 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name"); 3080 scheme = input.substring(0, p); 3081 p++; // Skip ':' 3082 if (at(p, n, '/')) { 3083 p = parseHierarchical(p, n); 3084 } else { 3085 // opaque; need to create the schemeSpecificPart 3086 int q = scan(p, n, "#"); 3087 if (q <= p) 3088 failExpecting("scheme-specific part", p); 3089 checkChars(p, q, L_URIC, H_URIC, "opaque part"); 3090 schemeSpecificPart = input.substring(p, q); 3091 p = q; 3092 } 3093 } else { 3094 p = parseHierarchical(0, n); 3095 } 3096 if (at(p, n, '#')) { 3097 checkChars(p + 1, n, L_URIC, H_URIC, "fragment"); 3098 fragment = input.substring(p + 1, n); 3099 p = n; 3100 } 3101 if (p < n) 3102 fail("end of URI", p); 3103 } 3104 3105 // [//authority]<path>[?<query>] 3106 // 3107 // DEVIATION from RFC2396: We allow an empty authority component as 3108 // long as it's followed by a non-empty path, query component, or 3109 // fragment component. This is so that URIs such as "file:///foo/bar" 3110 // will parse. This seems to be the intent of RFC2396, though the 3111 // grammar does not permit it. If the authority is empty then the 3112 // userInfo, host, and port components are undefined. 3113 // 3114 // DEVIATION from RFC2396: We allow empty relative paths. This seems 3115 // to be the intent of RFC2396, but the grammar does not permit it. 
3116 // The primary consequence of this deviation is that "#f" parses as a 3117 // relative URI with an empty path. 3118 // 3119 private int parseHierarchical(int start, int n) 3120 throws URISyntaxException 3121 { 3122 int p = start; 3123 if (at(p, n, '/') && at(p + 1, n, '/')) { 3124 p += 2; 3125 int q = scan(p, n, "/?#"); 3126 if (q > p) { 3127 p = parseAuthority(p, q); 3128 } else if (q < n) { 3129 // DEVIATION: Allow empty authority prior to non-empty 3130 // path, query component or fragment identifier 3131 } else 3132 failExpecting("authority", p); 3133 } 3134 int q = scan(p, n, "?#"); // DEVIATION: May be empty 3135 checkChars(p, q, L_PATH, H_PATH, "path"); 3136 path = input.substring(p, q); 3137 p = q; 3138 if (at(p, n, '?')) { 3139 p++; 3140 q = scan(p, n, "#"); 3141 checkChars(p, q, L_URIC, H_URIC, "query"); 3142 query = input.substring(p, q); 3143 p = q; 3144 } 3145 return p; 3146 } 3147 3148 // authority = server | reg_name 3149 // 3150 // Ambiguity: An authority that is a registry name rather than a server 3151 // might have a prefix that parses as a server. We use the fact that 3152 // the authority component is always followed by '/' or the end of the 3153 // input string to resolve this: If the complete authority did not 3154 // parse as a server then we try to parse it as a registry name. 
3155 // 3156 private int parseAuthority(int start, int n) 3157 throws URISyntaxException 3158 { 3159 int p = start; 3160 int q = p; 3161 URISyntaxException ex = null; 3162 3163 boolean serverChars; 3164 boolean regChars; 3165 3166 if (scan(p, n, "]") > p) { 3167 // contains a literal IPv6 address, therefore % is allowed 3168 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n); 3169 } else { 3170 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n); 3171 } 3172 regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n); 3173 3174 if (regChars && !serverChars) { 3175 // Must be a registry-based authority 3176 authority = input.substring(p, n); 3177 return n; 3178 } 3179 3180 if (serverChars) { 3181 // Might be (probably is) a server-based authority, so attempt 3182 // to parse it as such. If the attempt fails, try to treat it 3183 // as a registry-based authority. 3184 try { 3185 q = parseServer(p, n); 3186 if (q < n) 3187 failExpecting("end of authority", q); 3188 authority = input.substring(p, n); 3189 } catch (URISyntaxException x) { 3190 // Undo results of failed parse 3191 userInfo = null; 3192 host = null; 3193 port = -1; 3194 if (requireServerAuthority) { 3195 // If we're insisting upon a server-based authority, 3196 // then just re-throw the exception 3197 throw x; 3198 } else { 3199 // Save the exception in case it doesn't parse as a 3200 // registry either 3201 ex = x; 3202 q = p; 3203 } 3204 } 3205 } 3206 3207 if (q < n) { 3208 if (regChars) { 3209 // Registry-based authority 3210 authority = input.substring(p, n); 3211 } else if (ex != null) { 3212 // Re-throw exception; it was probably due to 3213 // a malformed IPv6 address 3214 throw ex; 3215 } else { 3216 fail("Illegal character in authority", q); 3217 } 3218 } 3219 3220 return n; 3221 } 3222 3223 3224 // [<userinfo>@]<host>[:<port>] 3225 // 3226 private int parseServer(int start, int n) 3227 throws URISyntaxException 3228 { 3229 int p = start; 3230 int q; 3231 3232 // userinfo 3233 q = 
scan(p, n, "/?#", "@"); 3234 if ((q >= p) && at(q, n, '@')) { 3235 checkChars(p, q, L_USERINFO, H_USERINFO, "user info"); 3236 userInfo = input.substring(p, q); 3237 p = q + 1; // Skip '@' 3238 } 3239 3240 // hostname, IPv4 address, or IPv6 address 3241 if (at(p, n, '[')) { 3242 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732 3243 p++; 3244 q = scan(p, n, "/?#", "]"); 3245 if ((q > p) && at(q, n, ']')) { 3246 // look for a "%" scope id 3247 int r = scan (p, q, "%"); 3248 if (r > p) { 3249 parseIPv6Reference(p, r); 3250 if (r+1 == q) { 3251 fail ("scope id expected"); 3252 } 3253 checkChars (r+1, q, L_SCOPE_ID, H_SCOPE_ID, 3254 "scope id"); 3255 } else { 3256 parseIPv6Reference(p, q); 3257 } 3258 host = input.substring(p-1, q+1); 3259 p = q + 1; 3260 } else { 3261 failExpecting("closing bracket for IPv6 address", q); 3262 } 3263 } else { 3264 q = parseIPv4Address(p, n); 3265 if (q <= p) 3266 q = parseHostname(p, n); 3267 p = q; 3268 } 3269 3270 // port 3271 if (at(p, n, ':')) { 3272 p++; 3273 q = scan(p, n, "/"); 3274 if (q > p) { 3275 checkChars(p, q, L_DIGIT, H_DIGIT, "port number"); 3276 try { 3277 port = Integer.parseInt(input, p, q, 10); 3278 } catch (NumberFormatException x) { 3279 fail("Malformed port number", p); 3280 } 3281 p = q; 3282 } 3283 } 3284 if (p < n) 3285 failExpecting("port number", p); 3286 3287 return p; 3288 } 3289 3290 // Scan a string of decimal digits whose value fits in a byte 3291 // 3292 private int scanByte(int start, int n) 3293 throws URISyntaxException 3294 { 3295 int p = start; 3296 int q = scan(p, n, L_DIGIT, H_DIGIT); 3297 if (q <= p) return q; 3298 if (Integer.parseInt(input, p, q, 10) > 255) return p; 3299 return q; 3300 } 3301 3302 // Scan an IPv4 address. 3303 // 3304 // If the strict argument is true then we require that the given 3305 // interval contain nothing besides an IPv4 address; if it is false 3306 // then we only require that it start with an IPv4 address. 
3307 // 3308 // If the interval does not contain or start with (depending upon the 3309 // strict argument) a legal IPv4 address characters then we return -1 3310 // immediately; otherwise we insist that these characters parse as a 3311 // legal IPv4 address and throw an exception on failure. 3312 // 3313 // We assume that any string of decimal digits and dots must be an IPv4 3314 // address. It won't parse as a hostname anyway, so making that 3315 // assumption here allows more meaningful exceptions to be thrown. 3316 // 3317 private int scanIPv4Address(int start, int n, boolean strict) 3318 throws URISyntaxException 3319 { 3320 int p = start; 3321 int q; 3322 int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT); 3323 if ((m <= p) || (strict && (m != n))) 3324 return -1; 3325 for (;;) { 3326 // Per RFC2732: At most three digits per byte 3327 // Further constraint: Each element fits in a byte 3328 if ((q = scanByte(p, m)) <= p) break; p = q; 3329 if ((q = scan(p, m, '.')) <= p) break; p = q; 3330 if ((q = scanByte(p, m)) <= p) break; p = q; 3331 if ((q = scan(p, m, '.')) <= p) break; p = q; 3332 if ((q = scanByte(p, m)) <= p) break; p = q; 3333 if ((q = scan(p, m, '.')) <= p) break; p = q; 3334 if ((q = scanByte(p, m)) <= p) break; p = q; 3335 if (q < m) break; 3336 return q; 3337 } 3338 fail("Malformed IPv4 address", q); 3339 return -1; 3340 } 3341 3342 // Take an IPv4 address: Throw an exception if the given interval 3343 // contains anything except an IPv4 address 3344 // 3345 private int takeIPv4Address(int start, int n, String expected) 3346 throws URISyntaxException 3347 { 3348 int p = scanIPv4Address(start, n, true); 3349 if (p <= start) 3350 failExpecting(expected, start); 3351 return p; 3352 } 3353 3354 // Attempt to parse an IPv4 address, returning -1 on failure but 3355 // allowing the given interval to contain [:<characters>] after 3356 // the IPv4 address. 
3357 // 3358 private int parseIPv4Address(int start, int n) { 3359 int p; 3360 3361 try { 3362 p = scanIPv4Address(start, n, false); 3363 } catch (URISyntaxException x) { 3364 return -1; 3365 } catch (NumberFormatException nfe) { 3366 return -1; 3367 } 3368 3369 if (p > start && p < n) { 3370 // IPv4 address is followed by something - check that 3371 // it's a ":" as this is the only valid character to 3372 // follow an address. 3373 if (input.charAt(p) != ':') { 3374 p = -1; 3375 } 3376 } 3377 3378 if (p > start) 3379 host = input.substring(start, p); 3380 3381 return p; 3382 } 3383 3384 // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ] 3385 // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum 3386 // toplabel = alpha | alpha *( alphanum | "-" ) alphanum 3387 // 3388 private int parseHostname(int start, int n) 3389 throws URISyntaxException 3390 { 3391 int p = start; 3392 int q; 3393 int l = -1; // Start of last parsed label 3394 3395 do { 3396 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ] 3397 q = scan(p, n, L_ALPHANUM, H_ALPHANUM); 3398 if (q <= p) 3399 break; 3400 l = p; 3401 if (q > p) { 3402 p = q; 3403 q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH); 3404 if (q > p) { 3405 if (input.charAt(q - 1) == '-') 3406 fail("Illegal character in hostname", q - 1); 3407 p = q; 3408 } 3409 } 3410 q = scan(p, n, '.'); 3411 if (q <= p) 3412 break; 3413 p = q; 3414 } while (p < n); 3415 3416 if ((p < n) && !at(p, n, ':')) 3417 fail("Illegal character in hostname", p); 3418 3419 if (l < 0) 3420 failExpecting("hostname", start); 3421 3422 // for a fully qualified hostname check that the rightmost 3423 // label starts with an alpha character. 
3424 if (l > start && !match(input.charAt(l), L_ALPHA, H_ALPHA)) { 3425 fail("Illegal character in hostname", l); 3426 } 3427 3428 host = input.substring(start, p); 3429 return p; 3430 } 3431 3432 3433 // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture 3434 // 3435 // Bug: The grammar in RFC2373 Appendix B does not allow addresses of 3436 // the form ::12.34.56.78, which are clearly shown in the examples 3437 // earlier in the document. Here is the original grammar: 3438 // 3439 // IPv6address = hexpart [ ":" IPv4address ] 3440 // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ] 3441 // hexseq = hex4 *( ":" hex4) 3442 // hex4 = 1*4HEXDIG 3443 // 3444 // We therefore use the following revised grammar: 3445 // 3446 // IPv6address = hexseq [ ":" IPv4address ] 3447 // | hexseq [ "::" [ hexpost ] ] 3448 // | "::" [ hexpost ] 3449 // hexpost = hexseq | hexseq ":" IPv4address | IPv4address 3450 // hexseq = hex4 *( ":" hex4) 3451 // hex4 = 1*4HEXDIG 3452 // 3453 // This covers all and only the following cases: 3454 // 3455 // hexseq 3456 // hexseq : IPv4address 3457 // hexseq :: 3458 // hexseq :: hexseq 3459 // hexseq :: hexseq : IPv4address 3460 // hexseq :: IPv4address 3461 // :: hexseq 3462 // :: hexseq : IPv4address 3463 // :: IPv4address 3464 // :: 3465 // 3466 // Additionally we constrain the IPv6 address as follows :- 3467 // 3468 // i. IPv6 addresses without compressed zeros should contain 3469 // exactly 16 bytes. 3470 // 3471 // ii. IPv6 addresses with compressed zeros should contain 3472 // less than 16 bytes. 
3473 3474 private int ipv6byteCount = 0; 3475 3476 private int parseIPv6Reference(int start, int n) 3477 throws URISyntaxException 3478 { 3479 int p = start; 3480 int q; 3481 boolean compressedZeros = false; 3482 3483 q = scanHexSeq(p, n); 3484 3485 if (q > p) { 3486 p = q; 3487 if (at(p, n, "::")) { 3488 compressedZeros = true; 3489 p = scanHexPost(p + 2, n); 3490 } else if (at(p, n, ':')) { 3491 p = takeIPv4Address(p + 1, n, "IPv4 address"); 3492 ipv6byteCount += 4; 3493 } 3494 } else if (at(p, n, "::")) { 3495 compressedZeros = true; 3496 p = scanHexPost(p + 2, n); 3497 } 3498 if (p < n) 3499 fail("Malformed IPv6 address", start); 3500 if (ipv6byteCount > 16) 3501 fail("IPv6 address too long", start); 3502 if (!compressedZeros && ipv6byteCount < 16) 3503 fail("IPv6 address too short", start); 3504 if (compressedZeros && ipv6byteCount == 16) 3505 fail("Malformed IPv6 address", start); 3506 3507 return p; 3508 } 3509 3510 private int scanHexPost(int start, int n) 3511 throws URISyntaxException 3512 { 3513 int p = start; 3514 int q; 3515 3516 if (p == n) 3517 return p; 3518 3519 q = scanHexSeq(p, n); 3520 if (q > p) { 3521 p = q; 3522 if (at(p, n, ':')) { 3523 p++; 3524 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3525 ipv6byteCount += 4; 3526 } 3527 } else { 3528 p = takeIPv4Address(p, n, "hex digits or IPv4 address"); 3529 ipv6byteCount += 4; 3530 } 3531 return p; 3532 } 3533 3534 // Scan a hex sequence; return -1 if one could not be scanned 3535 // 3536 private int scanHexSeq(int start, int n) 3537 throws URISyntaxException 3538 { 3539 int p = start; 3540 int q; 3541 3542 q = scan(p, n, L_HEX, H_HEX); 3543 if (q <= p) 3544 return -1; 3545 if (at(q, n, '.')) // Beginning of IPv4 address 3546 return -1; 3547 if (q > p + 4) 3548 fail("IPv6 hexadecimal digit sequence too long", p); 3549 ipv6byteCount += 2; 3550 p = q; 3551 while (p < n) { 3552 if (!at(p, n, ':')) 3553 break; 3554 if (at(p + 1, n, ':')) 3555 break; // "::" 3556 p++; 3557 q = scan(p, n, 
L_HEX, H_HEX); 3558 if (q <= p) 3559 failExpecting("digits for an IPv6 address", p); 3560 if (at(q, n, '.')) { // Beginning of IPv4 address 3561 p--; 3562 break; 3563 } 3564 if (q > p + 4) 3565 fail("IPv6 hexadecimal digit sequence too long", p); 3566 ipv6byteCount += 2; 3567 p = q; 3568 } 3569 3570 return p; 3571 } 3572 3573 } 3574 3575 }