Print this page
Split |
Close |
Expand all |
Collapse all |
--- old/src/share/classes/java/net/URI.java
+++ new/src/share/classes/java/net/URI.java
1 1 /*
2 2 * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
3 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 4 *
5 5 * This code is free software; you can redistribute it and/or modify it
6 6 * under the terms of the GNU General Public License version 2 only, as
7 7 * published by the Free Software Foundation. Oracle designates this
8 8 * particular file as subject to the "Classpath" exception as provided
9 9 * by Oracle in the LICENSE file that accompanied this code.
10 10 *
11 11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 14 * version 2 for more details (a copy is included in the LICENSE file that
15 15 * accompanied this code).
16 16 *
17 17 * You should have received a copy of the GNU General Public License version
18 18 * 2 along with this work; if not, write to the Free Software Foundation,
19 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 20 *
21 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 22 * or visit www.oracle.com if you need additional information or have any
23 23 * questions.
24 24 */
25 25
26 26 package java.net;
27 27
28 28 import java.io.IOException;
29 29 import java.io.InvalidObjectException;
30 30 import java.io.ObjectInputStream;
31 31 import java.io.ObjectOutputStream;
32 32 import java.io.Serializable;
33 33 import java.nio.ByteBuffer;
34 34 import java.nio.CharBuffer;
35 35 import java.nio.charset.CharsetDecoder;
36 36 import java.nio.charset.CharsetEncoder;
37 37 import java.nio.charset.CoderResult;
38 38 import java.nio.charset.CodingErrorAction;
39 39 import java.nio.charset.CharacterCodingException;
40 40 import java.text.Normalizer;
41 41 import sun.nio.cs.ThreadLocalCoders;
42 42
43 43 import java.lang.Character; // for javadoc
44 44 import java.lang.NullPointerException; // for javadoc
45 45
46 46
47 47 /**
48 48 * Represents a Uniform Resource Identifier (URI) reference.
49 49 *
50 50 * <p> Aside from some minor deviations noted below, an instance of this
51 51 * class represents a URI reference as defined by
52 52 * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform
53 53 * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
54 54 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for
55 55 * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
56 56 * also supports scope_ids. The syntax and usage of scope_ids is described
57 57 * <a href="Inet6Address.html#scoped">here</a>.
58 58 * This class provides constructors for creating URI instances from
59 59 * their components or by parsing their string forms, methods for accessing the
60 60 * various components of an instance, and methods for normalizing, resolving,
61 61 * and relativizing URI instances. Instances of this class are immutable.
62 62 *
63 63 *
64 64 * <h4> URI syntax and components </h4>
65 65 *
66 66 * At the highest level a URI reference (hereinafter simply "URI") in string
67 67 * form has the syntax
68 68 *
69 69 * <blockquote>
70 70 * [<i>scheme</i><tt><b>:</b></tt>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]
71 71 * </blockquote>
72 72 *
73 73 * where square brackets [...] delineate optional components and the characters
74 74 * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.
75 75 *
76 76 * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
77 77 * said to be <i>relative</i>. URIs are also classified according to whether
78 78 * they are <i>opaque</i> or <i>hierarchical</i>.
79 79 *
80 80 * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
81 81 * not begin with a slash character (<tt>'/'</tt>). Opaque URIs are not
82 82 * subject to further parsing. Some examples of opaque URIs are:
83 83 *
84 84 * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
85 85 * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>
86 86 * <tr><td><tt>news:comp.lang.java</tt></td></tr>
87 87 * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>
88 88 * </table></blockquote>
89 89 *
90 90 * <p> A <i>hierarchical</i> URI is either an absolute URI whose
91 91 * scheme-specific part begins with a slash character, or a relative URI, that
92 92 * is, a URI that does not specify a scheme. Some examples of hierarchical
93 93 * URIs are:
94 94 *
95 95 * <blockquote>
96 96 * <tt>http://java.sun.com/j2se/1.3/</tt><br>
97 97 * <tt>docs/guide/collections/designfaq.html#28</tt><br>
98 98 * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>
99 99 * <tt>file:///~/calendar</tt>
100 100 * </blockquote>
101 101 *
102 102 * <p> A hierarchical URI is subject to further parsing according to the syntax
103 103 *
104 104 * <blockquote>
105 105 * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]
106 106 * </blockquote>
107 107 *
108 108 * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,
109 109 * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves. The
110 110 * scheme-specific part of a hierarchical URI consists of the characters
111 111 * between the scheme and fragment components.
112 112 *
113 113 * <p> The authority component of a hierarchical URI is, if specified, either
114 114 * <i>server-based</i> or <i>registry-based</i>. A server-based authority
115 115 * parses according to the familiar syntax
116 116 *
117 117 * <blockquote>
118 118 * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]
119 119 * </blockquote>
120 120 *
121 121 * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for
122 122 * themselves. Nearly all URI schemes currently in use are server-based. An
123 123 * authority component that does not parse in this way is considered to be
124 124 * registry-based.
125 125 *
126 126 * <p> The path component of a hierarchical URI is itself said to be absolute
127 127 * if it begins with a slash character (<tt>'/'</tt>); otherwise it is
128 128 * relative. The path of a hierarchical URI that is either absolute or
129 129 * specifies an authority is always absolute.
130 130 *
131 131 * <p> All told, then, a URI instance has the following nine components:
132 132 *
133 133 * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
134 134 * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
135 135 * <tr><td>scheme</td><td><tt>String</tt></td></tr>
136 136 * <tr><td>scheme-specific-part </td><td><tt>String</tt></td></tr>
137 137 * <tr><td>authority</td><td><tt>String</tt></td></tr>
138 138 * <tr><td>user-info</td><td><tt>String</tt></td></tr>
139 139 * <tr><td>host</td><td><tt>String</tt></td></tr>
140 140 * <tr><td>port</td><td><tt>int</tt></td></tr>
141 141 * <tr><td>path</td><td><tt>String</tt></td></tr>
142 142 * <tr><td>query</td><td><tt>String</tt></td></tr>
143 143 * <tr><td>fragment</td><td><tt>String</tt></td></tr>
144 144 * </table></blockquote>
145 145 *
146 146 * In a given instance any particular component is either <i>undefined</i> or
147 147 * <i>defined</i> with a distinct value. Undefined string components are
148 148 * represented by <tt>null</tt>, while undefined integer components are
149 149 * represented by <tt>-1</tt>. A string component may be defined to have the
150 150 * empty string as its value; this is not equivalent to that component being
151 151 * undefined.
152 152 *
153 153 * <p> Whether a particular component is or is not defined in an instance
154 154 * depends upon the type of the URI being represented. An absolute URI has a
155 155 * scheme component. An opaque URI has a scheme, a scheme-specific part, and
156 156 * possibly a fragment, but has no other components. A hierarchical URI always
157 157 * has a path (though it may be empty) and a scheme-specific-part (which at
158 158 * least contains the path), and may have any of the other components. If the
159 159 * authority component is present and is server-based then the host component
160 160 * will be defined and the user-information and port components may be defined.
161 161 *
162 162 *
163 163 * <h4> Operations on URI instances </h4>
164 164 *
165 165 * The key operations supported by this class are those of
166 166 * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
167 167 *
168 168 * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>
169 169 * and <tt>".."</tt> segments from the path component of a hierarchical URI.
170 170 * Each <tt>"."</tt> segment is simply removed. A <tt>".."</tt> segment is
171 171 * removed only if it is preceded by a non-<tt>".."</tt> segment.
172 172 * Normalization has no effect upon opaque URIs.
173 173 *
174 174 * <p> <i>Resolution</i> is the process of resolving one URI against another,
175 175 * <i>base</i> URI. The resulting URI is constructed from components of both
176 176 * URIs in the manner specified by RFC 2396, taking components from the
177 177 * base URI for those not specified in the original. For hierarchical URIs,
178 178 * the path of the original is resolved against the path of the base and then
179 179 * normalized. The result, for example, of resolving
180 180 *
181 181 * <blockquote>
182 182 * <tt>docs/guide/collections/designfaq.html#28 </tt>(1)
183 183 * </blockquote>
184 184 *
185 185 * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result
186 186 * URI
187 187 *
188 188 * <blockquote>
189 189 * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>
190 190 * </blockquote>
191 191 *
192 192 * Resolving the relative URI
193 193 *
194 194 * <blockquote>
195 195 * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java </tt>(2)
196 196 * </blockquote>
197 197 *
198 198 * against this result yields, in turn,
199 199 *
200 200 * <blockquote>
201 201 * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>
202 202 * </blockquote>
203 203 *
204 204 * Resolution of both absolute and relative URIs, and of both absolute and
205 205 * relative paths in the case of hierarchical URIs, is supported. Resolving
206 206 * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the
207 207 * original URI, since it is absolute. Resolving the relative URI (2) above
208 208 * against the relative base URI (1) yields the normalized, but still relative,
209 209 * URI
210 210 *
211 211 * <blockquote>
212 212 * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>
213 213 * </blockquote>
214 214 *
215 215 * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
216 216 * two normalized URIs <i>u</i> and <i>v</i>,
217 217 *
218 218 * <blockquote>
219 219 * <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt> and<br>
220 220 * <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt> .<br>
221 221 * </blockquote>
222 222 *
223 223 * This operation is often useful when constructing a document containing URIs
224 224 * that must be made relative to the base URI of the document wherever
225 225 * possible. For example, relativizing the URI
226 226 *
227 227 * <blockquote>
228 228 * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>
229 229 * </blockquote>
230 230 *
231 231 * against the base URI
232 232 *
233 233 * <blockquote>
234 234 * <tt>http://java.sun.com/j2se/1.3</tt>
235 235 * </blockquote>
236 236 *
237 237 * yields the relative URI <tt>docs/guide/index.html</tt>.
238 238 *
239 239 *
240 240 * <h4> Character categories </h4>
241 241 *
242 242 * RFC 2396 specifies precisely which characters are permitted in the
243 243 * various components of a URI reference. The following categories, most of
244 244 * which are taken from that specification, are used below to describe these
245 245 * constraints:
246 246 *
247 247 * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
248 248 * <tr><th valign=top><i>alpha</i></th>
249 249 * <td>The US-ASCII alphabetic characters,
250 250 * <tt>'A'</tt> through <tt>'Z'</tt>
251 251 * and <tt>'a'</tt> through <tt>'z'</tt></td></tr>
252 252 * <tr><th valign=top><i>digit</i></th>
253 253 * <td>The US-ASCII decimal digit characters,
254 254 * <tt>'0'</tt> through <tt>'9'</tt></td></tr>
255 255 * <tr><th valign=top><i>alphanum</i></th>
256 256 * <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
257 257 * <tr><th valign=top><i>unreserved</i> </th>
258 258 * <td>All <i>alphanum</i> characters together with those in the string
259 259 * <tt>"_-!.~'()*"</tt></td></tr>
260 260 * <tr><th valign=top><i>punct</i></th>
261 261 * <td>The characters in the string <tt>",;:$&+="</tt></td></tr>
262 262 * <tr><th valign=top><i>reserved</i></th>
263 263 * <td>All <i>punct</i> characters together with those in the string
264 264 * <tt>"?/[]@"</tt></td></tr>
265 265 * <tr><th valign=top><i>escaped</i></th>
266 266 * <td>Escaped octets, that is, triplets consisting of the percent
267 267 * character (<tt>'%'</tt>) followed by two hexadecimal digits
268 268 * (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and
269 269 * <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>
270 270 * <tr><th valign=top><i>other</i></th>
271 271 * <td>The Unicode characters that are not in the US-ASCII character set,
272 272 * are not control characters (according to the {@link
273 273 * java.lang.Character#isISOControl(char) Character.isISOControl}
274 274 * method), and are not space characters (according to the {@link
275 275 * java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
276 276 * method) <i>(<b>Deviation from RFC 2396</b>, which is
277 277 * limited to US-ASCII)</i></td></tr>
278 278 * </table></blockquote>
279 279 *
280 280 * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
281 281 * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
282 282 * characters.
283 283 *
284 284 *
285 285 * <h4> Escaped octets, quotation, encoding, and decoding </h4>
286 286 *
287 287 * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
288 288 * fragment components. Escaping serves two purposes in URIs:
289 289 *
290 290 * <ul>
291 291 *
292 292 * <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
293 293 * conform strictly to RFC 2396 by not containing any <i>other</i>
294 294 * characters. </p></li>
295 295 *
296 296 * <li><p> To <i>quote</i> characters that are otherwise illegal in a
297 297 * component. The user-info, path, query, and fragment components differ
298 298 * slightly in terms of which characters are considered legal and illegal.
299 299 * </p></li>
300 300 *
301 301 * </ul>
302 302 *
303 303 * These purposes are served in this class by three related operations:
304 304 *
305 305 * <ul>
306 306 *
307 307 * <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
308 308 * with the sequence of escaped octets that represent that character in the
309 309 * UTF-8 character set. The Euro currency symbol (<tt>'\u20AC'</tt>),
310 310 * for example, is encoded as <tt>"%E2%82%AC"</tt>. <i>(<b>Deviation from
311 311 * RFC 2396</b>, which does not specify any particular character
312 312 * set.)</i> </p></li>
313 313 *
314 314 * <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
315 315 * encoding it. The space character, for example, is quoted by replacing it
316 316 * with <tt>"%20"</tt>. UTF-8 contains US-ASCII, hence for US-ASCII
317 317 * characters this transformation has exactly the effect required by
318 318 * RFC 2396. </p></li>
319 319 *
320 320 * <li><p><a name="decode"></a>
321 321 * A sequence of escaped octets is <i>decoded</i> by
322 322 * replacing it with the sequence of characters that it represents in the
323 323 * UTF-8 character set. UTF-8 contains US-ASCII, hence decoding has the
324 324 * effect of de-quoting any quoted US-ASCII characters as well as that of
325 325 * decoding any encoded non-US-ASCII characters. If a <a
326 326 * href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
327 327 * when decoding the escaped octets then the erroneous octets are replaced by
328 328 * <tt>'\uFFFD'</tt>, the Unicode replacement character. </p></li>
329 329 *
330 330 * </ul>
331 331 *
332 332 * These operations are exposed in the constructors and methods of this class
333 333 * as follows:
334 334 *
335 335 * <ul>
336 336 *
337 337 * <li><p> The {@link #URI(java.lang.String) <code>single-argument
338 338 * constructor</code>} requires any illegal characters in its argument to be
339 339 * quoted and preserves any escaped octets and <i>other</i> characters that
340 340 * are present. </p></li>
341 341 *
342 342 * <li><p> The {@link
343 343 * #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
344 344 * <code>multi-argument constructors</code>} quote illegal characters as
345 345 * required by the components in which they appear. The percent character
346 346 * (<tt>'%'</tt>) is always quoted by these constructors. Any <i>other</i>
347 347 * characters are preserved. </p></li>
348 348 *
349 349 * <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
350 350 * getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
351 351 * getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
352 352 * #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
353 353 * values of their corresponding components in raw form, without interpreting
354 354 * any escaped octets. The strings returned by these methods may contain
355 355 * both escaped octets and <i>other</i> characters, and will not contain any
356 356 * illegal characters. </p></li>
357 357 *
358 358 * <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
359 359 * getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
360 360 * getFragment}, {@link #getAuthority() getAuthority}, and {@link
361 361 * #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
362 362 * octets in their corresponding components. The strings returned by these
363 363 * methods may contain both <i>other</i> characters and illegal characters,
364 364 * and will not contain any escaped octets. </p></li>
365 365 *
366 366 * <li><p> The {@link #toString() toString} method returns a URI string with
367 367 * all necessary quotation but which may contain <i>other</i> characters.
368 368 * </p></li>
369 369 *
370 370 * <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
371 371 * quoted and encoded URI string that does not contain any <i>other</i>
372 372 * characters. </p></li>
373 373 *
374 374 * </ul>
375 375 *
376 376 *
377 377 * <h4> Identities </h4>
378 378 *
379 379 * For any URI <i>u</i>, it is always the case that
380 380 *
381 381 * <blockquote>
382 382 * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt> .
383 383 * </blockquote>
384 384 *
385 385 * For any URI <i>u</i> that does not contain redundant syntax such as two
386 386 * slashes before an empty authority (as in <tt>file:///tmp/</tt> ) or a
387 387 * colon following a host name but no port (as in
388 388 * <tt>http://java.sun.com:</tt> ), and that does not encode characters
389 389 * except those that must be quoted, the following identities also hold:
390 390 *
391 391 * <blockquote>
392 392 * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
393 393 * </tt><i>u</i><tt>.getSchemeSpecificPart(),<br>
394 394 * </tt><i>u</i><tt>.getFragment())<br>
395 395 * .equals(</tt><i>u</i><tt>)</tt>
396 396 * </blockquote>
397 397 *
398 398 * in all cases,
399 399 *
400 400 * <blockquote>
401 401 * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
402 402 * </tt><i>u</i><tt>.getAuthority(),<br>
403 403 * </tt><i>u</i><tt>.getPath(), </tt><i>u</i><tt>.getQuery(),<br>
404 404 * </tt><i>u</i><tt>.getFragment())<br>
405 405 * .equals(</tt><i>u</i><tt>)</tt>
406 406 * </blockquote>
407 407 *
408 408 * if <i>u</i> is hierarchical, and
409 409 *
410 410 * <blockquote>
411 411 * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
412 412 * </tt><i>u</i><tt>.getUserInfo(), </tt><i>u</i><tt>.getHost(), </tt><i>u</i><tt>.getPort(),<br>
413 413 * </tt><i>u</i><tt>.getPath(), </tt><i>u</i><tt>.getQuery(),<br>
414 414 * </tt><i>u</i><tt>.getFragment())<br>
415 415 * .equals(</tt><i>u</i><tt>)</tt>
416 416 * </blockquote>
417 417 *
418 418 * if <i>u</i> is hierarchical and has either no authority or a server-based
419 419 * authority.
420 420 *
421 421 *
422 422 * <h4> URIs, URLs, and URNs </h4>
423 423 *
424 424 * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
425 425 * resource <i>locator</i>. Hence every URL is a URI, abstractly speaking, but
426 426 * not every URI is a URL. This is because there is another subcategory of
427 427 * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
428 428 * specify how to locate them. The <tt>mailto</tt>, <tt>news</tt>, and
429 429 * <tt>isbn</tt> URIs shown above are examples of URNs.
430 430 *
431 431 * <p> The conceptual distinction between URIs and URLs is reflected in the
432 432 * differences between this class and the {@link URL} class.
433 433 *
434 434 * <p> An instance of this class represents a URI reference in the syntactic
435 435 * sense defined by RFC 2396. A URI may be either absolute or relative.
436 436 * A URI string is parsed according to the generic syntax without regard to the
437 437 * scheme, if any, that it specifies. No lookup of the host, if any, is
438 438 * performed, and no scheme-dependent stream handler is constructed. Equality,
439 439 * hashing, and comparison are defined strictly in terms of the character
440 440 * content of the instance. In other words, a URI instance is little more than
441 441 * a structured string that supports the syntactic, scheme-independent
442 442 * operations of comparison, normalization, resolution, and relativization.
443 443 *
444 444 * <p> An instance of the {@link URL} class, by contrast, represents the
445 445 * syntactic components of a URL together with some of the information required
446 446 * to access the resource that it describes. A URL must be absolute, that is,
447 447 * it must always specify a scheme. A URL string is parsed according to its
448 448 * scheme. A stream handler is always established for a URL, and in fact it is
449 449 * impossible to create a URL instance for a scheme for which no handler is
450 450 * available. Equality and hashing depend upon both the scheme and the
451 451 * Internet address of the host, if any; comparison is not defined. In other
452 452 * words, a URL is a structured string that supports the syntactic operation of
453 453 * resolution as well as the network I/O operations of looking up the host and
454 454 * opening a connection to the specified resource.
455 455 *
456 456 *
457 457 * @author Mark Reinhold
458 458 * @since 1.4
459 459 *
460 460 * @see <a href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279: UTF-8, a
461 461 * transformation format of ISO 10646</i></a>, <br><a
462 462 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6 Addressing
463 463 * Architecture</i></a>, <br><a
464 464 * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC 2396: Uniform
465 465 * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
466 466 * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC 2732: Format for
467 467 * Literal IPv6 Addresses in URLs</i></a>, <br><a
468 468 * href="URISyntaxException.html">URISyntaxException</a>
469 469 */
470 470
471 471 public final class URI
472 472 implements Comparable<URI>, Serializable
473 473 {
474 474
475 475 // Note: Comments containing the word "ASSERT" indicate places where a
476 476 // throw of an InternalError should be replaced by an appropriate assertion
477 477 // statement once asserts are enabled in the build.
478 478
479 479 static final long serialVersionUID = -6052424284110960213L;
480 480
481 481
482 482 // -- Properties and components of this instance --
483 483
484 484 // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
485 485 private transient String scheme; // null ==> relative URI
486 486 private transient String fragment;
487 487
488 488 // Hierarchical URI components: [//<authority>]<path>[?<query>]
489 489 private transient String authority; // Registry or server
490 490
491 491 // Server-based authority: [<userInfo>@]<host>[:<port>]
492 492 private transient String userInfo;
493 493 private transient String host; // null ==> registry-based
494 494 private transient int port = -1; // -1 ==> undefined
495 495
496 496 // Remaining components of hierarchical URIs
497 497 private transient String path; // null ==> opaque
498 498 private transient String query;
499 499
500 500 // The remaining fields may be computed on demand; each is declared volatile and transient
501 501
502 502 private volatile transient String schemeSpecificPart;
503 503 private volatile transient int hash; // Zero ==> undefined
504 504
505 505 private volatile transient String decodedUserInfo = null;
506 506 private volatile transient String decodedAuthority = null;
507 507 private volatile transient String decodedPath = null;
508 508 private volatile transient String decodedQuery = null;
509 509 private volatile transient String decodedFragment = null;
510 510 private volatile transient String decodedSchemeSpecificPart = null;
511 511
512 512 /**
513 513 * The string form of this URI.
514 514 *
515 515 * @serial
516 516 */
517 517 private volatile String string; // The only serializable field
518 518
519 519
520 520
521 521 // -- Constructors and factories --
522 522
523 523 private URI() { } // Used internally
524 524
525 525 /**
526 526 * Constructs a URI by parsing the given string.
527 527 *
528 528 * <p> This constructor parses the given string exactly as specified by the
529 529 * grammar in <a
530 530 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
531 531 * Appendix A, <b><i>except for the following deviations:</i></b> </p>
532 532 *
533 533 * <ul type=disc>
534 534 *
535 535 * <li><p> An empty authority component is permitted as long as it is
536 536 * followed by a non-empty path, a query component, or a fragment
537 537 * component. This allows the parsing of URIs such as
538 538 * <tt>"file:///foo/bar"</tt>, which seems to be the intent of
539 539 * RFC 2396 although the grammar does not permit it. If the
540 540 * authority component is empty then the user-information, host, and port
541 541 * components are undefined. </p></li>
542 542 *
543 543 * <li><p> Empty relative paths are permitted; this seems to be the
544 544 * intent of RFC 2396 although the grammar does not permit it. The
545 545 * primary consequence of this deviation is that a standalone fragment
546 546 * such as <tt>"#foo"</tt> parses as a relative URI with an empty path
547 547 * and the given fragment, and can be usefully <a
548 548 * href="#resolve-frag">resolved</a> against a base URI. </p></li>
549 549 *
550 550 * <li><p> IPv4 addresses in host components are parsed rigorously, as
551 551 * specified by <a
552 552 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>: Each
553 553 * element of a dotted-quad address must contain no more than three
554 554 * decimal digits. Each element is further constrained to have a value
555 555 * no greater than 255. </p></li>
556 556 *
557 557 * <li> <p> Hostnames in host components that comprise only a single
558 558 * domain label are permitted to start with an <i>alphanum</i>
559 559 * character. This seems to be the intent of <a
560 560 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
561 561 * section 3.2.2 although the grammar does not permit it. The
562 562 * consequence of this deviation is that the authority component of a
563 563 * hierarchical URI such as <tt>s://123</tt>, will parse as a server-based
564 564 * authority. </p></li>
565 565 *
566 566 * <li><p> IPv6 addresses are permitted for the host component. An IPv6
567 567 * address must be enclosed in square brackets (<tt>'['</tt> and
568 568 * <tt>']'</tt>) as specified by <a
569 569 * href="http://www.ietf.org/rfc/rfc2732.txt">RFC 2732</a>. The
570 570 * IPv6 address itself must parse according to <a
571 571 * href="http://www.ietf.org/rfc/rfc2373.txt">RFC 2373</a>. IPv6
572 572 * addresses are further constrained to describe no more than sixteen
573 573 * bytes of address information, a constraint implicit in RFC 2373
574 574 * but not expressible in the grammar. </p></li>
575 575 *
576 576 * <li><p> Characters in the <i>other</i> category are permitted wherever
577 577 * RFC 2396 permits <i>escaped</i> octets, that is, in the
578 578 * user-information, path, query, and fragment components, as well as in
579 579 * the authority component if the authority is registry-based. This
580 580 * allows URIs to contain Unicode characters beyond those in the US-ASCII
581 581 * character set. </p></li>
582 582 *
583 583 * </ul>
584 584 *
585 585 * @param str The string to be parsed into a URI
586 586 *
587 587 * @throws NullPointerException
588 588 * If <tt>str</tt> is <tt>null</tt>
589 589 *
590 590 * @throws URISyntaxException
591 591 * If the given string violates RFC 2396, as augmented
592 592 * by the above deviations
593 593 */
594 594 public URI(String str) throws URISyntaxException {
595 595 new Parser(str).parse(false); // Parser is defined outside this view; the false flag presumably means "do not require a server-based authority" -- TODO confirm
596 596 }
597 597
598 598 /**
599 599 * Constructs a hierarchical URI from the given components.
600 600 *
601 601 * <p> If a scheme is given then the path, if also given, must either be
602 602 * empty or begin with a slash character (<tt>'/'</tt>). Otherwise a
603 603 * component of the new URI may be left undefined by passing <tt>null</tt>
604 604 * for the corresponding parameter or, in the case of the <tt>port</tt>
605 605 * parameter, by passing <tt>-1</tt>.
606 606 *
607 607 * <p> This constructor first builds a URI string from the given components
608 608 * according to the rules specified in <a
609 609 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
610 610 * section 5.2, step 7: </p>
611 611 *
612 612 * <ol>
613 613 *
614 614 * <li><p> Initially, the result string is empty. </p></li>
615 615 *
616 616 * <li><p> If a scheme is given then it is appended to the result,
617 617 * followed by a colon character (<tt>':'</tt>). </p></li>
618 618 *
619 619 * <li><p> If user information, a host, or a port are given then the
620 620 * string <tt>"//"</tt> is appended. </p></li>
621 621 *
622 622 * <li><p> If user information is given then it is appended, followed by
623 623 * a commercial-at character (<tt>'@'</tt>). Any character not in the
624 624 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
625 625 * categories is <a href="#quote">quoted</a>. </p></li>
626 626 *
627 627 * <li><p> If a host is given then it is appended. If the host is a
628 628 * literal IPv6 address but is not enclosed in square brackets
629 629 * (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.
630 630 * </p></li>
631 631 *
632 632 * <li><p> If a port number is given then a colon character
633 633 * (<tt>':'</tt>) is appended, followed by the port number in decimal.
634 634 * </p></li>
635 635 *
636 636 * <li><p> If a path is given then it is appended. Any character not in
637 637 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
638 638 * categories, and not equal to the slash character (<tt>'/'</tt>) or the
639 639 * commercial-at character (<tt>'@'</tt>), is quoted. </p></li>
640 640 *
641 641 * <li><p> If a query is given then a question-mark character
642 642 * (<tt>'?'</tt>) is appended, followed by the query. Any character that
643 643 * is not a <a href="#legal-chars">legal URI character</a> is quoted.
644 644 * </p></li>
645 645 *
646 646 * <li><p> Finally, if a fragment is given then a hash character
647 647 * (<tt>'#'</tt>) is appended, followed by the fragment. Any character
648 648 * that is not a legal URI character is quoted. </p></li>
649 649 *
650 650 * </ol>
651 651 *
652 652 * <p> The resulting URI string is then parsed as if by invoking the {@link
653 653 * #URI(String)} constructor and then invoking the {@link
654 654 * #parseServerAuthority()} method upon the result; this may cause a {@link
655 655 * URISyntaxException} to be thrown. </p>
656 656 *
657 657 * @param scheme Scheme name
658 658 * @param userInfo User name and authorization information
659 659 * @param host Host name
660 660 * @param port Port number
661 661 * @param path Path
662 662 * @param query Query
663 663 * @param fragment Fragment
664 664 *
665 665 * @throws URISyntaxException
666 666 * If both a scheme and a path are given but the path is relative,
667 667 * if the URI string constructed from the given components violates
668 668 * RFC 2396, or if the authority component of the string is
669 669 * present but cannot be parsed as a server-based authority
670 670 */
671 671 public URI(String scheme,
672 672 String userInfo, String host, int port,
673 673 String path, String query, String fragment)
674 674 throws URISyntaxException
675 675 {
676 676 String s = toString(scheme, null,
677 677 null, userInfo, host, port,
678 678 path, query, fragment);
679 679 checkPath(s, scheme, path);
680 680 new Parser(s).parse(true);
681 681 }
682 682
683 683 /**
684 684 * Constructs a hierarchical URI from the given components.
685 685 *
686 686 * <p> If a scheme is given then the path, if also given, must either be
687 687 * empty or begin with a slash character (<tt>'/'</tt>). Otherwise a
688 688 * component of the new URI may be left undefined by passing <tt>null</tt>
689 689 * for the corresponding parameter.
690 690 *
691 691 * <p> This constructor first builds a URI string from the given components
692 692 * according to the rules specified in <a
693 693 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
694 694 * section 5.2, step 7: </p>
695 695 *
696 696 * <ol>
697 697 *
698 698 * <li><p> Initially, the result string is empty. </p></li>
699 699 *
700 700 * <li><p> If a scheme is given then it is appended to the result,
701 701 * followed by a colon character (<tt>':'</tt>). </p></li>
702 702 *
703 703 * <li><p> If an authority is given then the string <tt>"//"</tt> is
704 704 * appended, followed by the authority. If the authority contains a
705 705 * literal IPv6 address then the address must be enclosed in square
706 706 * brackets (<tt>'['</tt> and <tt>']'</tt>). Any character not in the
707 707 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
708 708 * categories, and not equal to the commercial-at character
709 709 * (<tt>'@'</tt>), is <a href="#quote">quoted</a>. </p></li>
710 710 *
711 711 * <li><p> If a path is given then it is appended. Any character not in
712 712 * the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
713 713 * categories, and not equal to the slash character (<tt>'/'</tt>) or the
714 714 * commercial-at character (<tt>'@'</tt>), is quoted. </p></li>
715 715 *
716 716 * <li><p> If a query is given then a question-mark character
717 717 * (<tt>'?'</tt>) is appended, followed by the query. Any character that
718 718 * is not a <a href="#legal-chars">legal URI character</a> is quoted.
719 719 * </p></li>
720 720 *
721 721 * <li><p> Finally, if a fragment is given then a hash character
722 722 * (<tt>'#'</tt>) is appended, followed by the fragment. Any character
723 723 * that is not a legal URI character is quoted. </p></li>
724 724 *
725 725 * </ol>
726 726 *
727 727 * <p> The resulting URI string is then parsed as if by invoking the {@link
728 728 * #URI(String)} constructor and then invoking the {@link
729 729 * #parseServerAuthority()} method upon the result; this may cause a {@link
730 730 * URISyntaxException} to be thrown. </p>
731 731 *
732 732 * @param scheme Scheme name
733 733 * @param authority Authority
734 734 * @param path Path
735 735 * @param query Query
736 736 * @param fragment Fragment
737 737 *
738 738 * @throws URISyntaxException
739 739 * If both a scheme and a path are given but the path is relative,
740 740 * if the URI string constructed from the given components violates
741 741 * RFC 2396, or if the authority component of the string is
742 742 * present but cannot be parsed as a server-based authority
743 743 */
744 744 public URI(String scheme,
745 745 String authority,
746 746 String path, String query, String fragment)
747 747 throws URISyntaxException
748 748 {
749 749 String s = toString(scheme, null,
750 750 authority, null, null, -1,
751 751 path, query, fragment);
752 752 checkPath(s, scheme, path);
753 753 new Parser(s).parse(false);
754 754 }
755 755
756 756 /**
757 757 * Constructs a hierarchical URI from the given components.
758 758 *
759 759 * <p> A component may be left undefined by passing <tt>null</tt>.
760 760 *
761 761 * <p> This convenience constructor works as if by invoking the
762 762 * seven-argument constructor as follows:
763 763 *
764 764 * <blockquote><tt>
765 765 * new {@link #URI(String, String, String, int, String, String, String)
766 766 * URI}(scheme, null, host, -1, path, null, fragment);
767 767 * </tt></blockquote>
768 768 *
769 769 * @param scheme Scheme name
770 770 * @param host Host name
771 771 * @param path Path
772 772 * @param fragment Fragment
773 773 *
774 774 * @throws URISyntaxException
775 775 * If the URI string constructed from the given components
776 776 * violates RFC 2396
777 777 */
public URI(String scheme, String host, String path, String fragment)
    throws URISyntaxException
{
    // Pure delegation to the seven-argument constructor.
    this(scheme,
         null,      // no user information
         host,
         -1,        // port undefined
         path,
         null,      // no query
         fragment);
}
783 783
784 784 /**
785 785 * Constructs a URI from the given components.
786 786 *
787 787 * <p> A component may be left undefined by passing <tt>null</tt>.
788 788 *
789 789 * <p> This constructor first builds a URI in string form using the given
790 790 * components as follows: </p>
791 791 *
792 792 * <ol>
793 793 *
794 794 * <li><p> Initially, the result string is empty. </p></li>
795 795 *
796 796 * <li><p> If a scheme is given then it is appended to the result,
797 797 * followed by a colon character (<tt>':'</tt>). </p></li>
798 798 *
799 799 * <li><p> If a scheme-specific part is given then it is appended. Any
800 800 * character that is not a <a href="#legal-chars">legal URI character</a>
801 801 * is <a href="#quote">quoted</a>. </p></li>
802 802 *
803 803 * <li><p> Finally, if a fragment is given then a hash character
804 804 * (<tt>'#'</tt>) is appended to the string, followed by the fragment.
805 805 * Any character that is not a legal URI character is quoted. </p></li>
806 806 *
807 807 * </ol>
808 808 *
809 809 * <p> The resulting URI string is then parsed in order to create the new
810 810 * URI instance as if by invoking the {@link #URI(String)} constructor;
811 811 * this may cause a {@link URISyntaxException} to be thrown. </p>
812 812 *
813 813 * @param scheme Scheme name
814 814 * @param ssp Scheme-specific part
815 815 * @param fragment Fragment
816 816 *
817 817 * @throws URISyntaxException
818 818 * If the URI string constructed from the given components
819 819 * violates RFC 2396
820 820 */
821 821 public URI(String scheme, String ssp, String fragment)
822 822 throws URISyntaxException
823 823 {
824 824 new Parser(toString(scheme, ssp,
825 825 null, null, null, -1,
826 826 null, null, fragment))
827 827 .parse(false);
828 828 }
829 829
830 830 /**
831 831 * Creates a URI by parsing the given string.
832 832 *
833 833 * <p> This convenience factory method works as if by invoking the {@link
834 834 * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
835 835 * constructor is caught and wrapped in a new {@link
836 836 * IllegalArgumentException} object, which is then thrown.
837 837 *
838 838 * <p> This method is provided for use in situations where it is known that
839 839 * the given string is a legal URI, for example for URI constants declared
840 840 * within a program, and so it would be considered a programming error
841 841 * for the string not to parse as such. The constructors, which throw
842 842 * {@link URISyntaxException} directly, should be used in situations where a
843 843 * URI is being constructed from user input or from some other source that
844 844 * may be prone to errors. </p>
845 845 *
846 846 * @param str The string to be parsed into a URI
847 847 * @return The new URI
848 848 *
849 849 * @throws NullPointerException
850 850 * If <tt>str</tt> is <tt>null</tt>
851 851 *
852 852 * @throws IllegalArgumentException
853 853 * If the given string violates RFC 2396
854 854 */
855 855 public static URI create(String str) {
856 856 try {
857 857 return new URI(str);
858 858 } catch (URISyntaxException x) {
859 859 throw new IllegalArgumentException(x.getMessage(), x);
860 860 }
861 861 }
862 862
863 863
864 864 // -- Operations --
865 865
866 866 /**
867 867 * Attempts to parse this URI's authority component, if defined, into
868 868 * user-information, host, and port components.
869 869 *
870 870 * <p> If this URI's authority component has already been recognized as
871 871 * being server-based then it will already have been parsed into
872 872 * user-information, host, and port components. In this case, or if this
873 873 * URI has no authority component, this method simply returns this URI.
874 874 *
875 875 * <p> Otherwise this method attempts once more to parse the authority
876 876 * component into user-information, host, and port components, and throws
877 877 * an exception describing why the authority component could not be parsed
878 878 * in that way.
879 879 *
880 880 * <p> This method is provided because the generic URI syntax specified in
881 881 * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>
882 882 * cannot always distinguish a malformed server-based authority from a
883 883 * legitimate registry-based authority. It must therefore treat some
884 884 * instances of the former as instances of the latter. The authority
885 885 * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a
886 886 * legal server-based authority but it is legal as a registry-based
887 887 * authority.
888 888 *
889 889 * <p> In many common situations, for example when working with URIs that are
890 890 * known to be either URNs or URLs, the hierarchical URIs being used will
891 891 * always be server-based. They therefore must either be parsed as such or
892 892 * treated as an error. In these cases a statement such as
893 893 *
894 894 * <blockquote>
895 895 * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>
896 896 * </blockquote>
897 897 *
898 898 * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
899 899 * it has an authority component, has a server-based authority with proper
900 900 * user-information, host, and port components. Invoking this method also
901 901 * ensures that if the authority could not be parsed in that way then an
902 902 * appropriate diagnostic message can be issued based upon the exception
903 903 * that is thrown. </p>
904 904 *
905 905 * @return A URI whose authority field has been parsed
906 906 * as a server-based authority
907 907 *
908 908 * @throws URISyntaxException
909 909 * If the authority component of this URI is defined
910 910 * but cannot be parsed as a server-based authority
911 911 * according to RFC 2396
912 912 */
913 913 public URI parseServerAuthority()
914 914 throws URISyntaxException
915 915 {
916 916 // We could be clever and cache the error message and index from the
917 917 // exception thrown during the original parse, but that would require
918 918 // either more fields or a more-obscure representation.
919 919 if ((host != null) || (authority == null))
920 920 return this;
921 921 defineString();
922 922 new Parser(string).parse(true);
923 923 return this;
924 924 }
925 925
926 926 /**
927 927 * Normalizes this URI's path.
928 928 *
929 929 * <p> If this URI is opaque, or if its path is already in normal form,
930 930 * then this URI is returned. Otherwise a new URI is constructed that is
931 931 * identical to this URI except that its path is computed by normalizing
932 932 * this URI's path in a manner consistent with <a
933 933 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
934 934 * section 5.2, step 6, sub-steps c through f; that is:
935 935 * </p>
936 936 *
937 937 * <ol>
938 938 *
939 939 * <li><p> All <tt>"."</tt> segments are removed. </p></li>
940 940 *
941 941 * <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>
942 942 * segment then both of these segments are removed. This step is
943 943 * repeated until it is no longer applicable. </p></li>
944 944 *
945 945 * <li><p> If the path is relative, and if its first segment contains a
946 946 * colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is
947 947 * prepended. This prevents a relative URI with a path such as
948 948 * <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a
949 949 * scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.
950 950 * <b><i>(Deviation from RFC 2396)</i></b> </p></li>
951 951 *
952 952 * </ol>
953 953 *
954 954 * <p> A normalized path will begin with one or more <tt>".."</tt> segments
955 955 * if there were insufficient non-<tt>".."</tt> segments preceding them to
956 956 * allow their removal. A normalized path will begin with a <tt>"."</tt>
957 957 * segment if one was inserted by step 3 above. Otherwise, a normalized
958 958 * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>
959 959 *
960 960 * @return A URI equivalent to this URI,
961 961 * but whose path is in normal form
962 962 */
963 963 public URI normalize() {
964 964 return normalize(this);
965 965 }
966 966
967 967 /**
968 968 * Resolves the given URI against this URI.
969 969 *
970 970 * <p> If the given URI is already absolute, or if this URI is opaque, then
971 971 * the given URI is returned.
972 972 *
973 973 * <p><a name="resolve-frag"></a> If the given URI's fragment component is
974 974 * defined, its path component is empty, and its scheme, authority, and
975 975 * query components are undefined, then a URI with the given fragment but
976 976 * with all other components equal to those of this URI is returned. This
977 977 * allows a URI representing a standalone fragment reference, such as
978 978 * <tt>"#foo"</tt>, to be usefully resolved against a base URI.
979 979 *
980 980 * <p> Otherwise this method constructs a new hierarchical URI in a manner
981 981 * consistent with <a
982 982 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
983 983 * section 5.2; that is: </p>
984 984 *
985 985 * <ol>
986 986 *
987 987 * <li><p> A new URI is constructed with this URI's scheme and the given
988 988 * URI's query and fragment components. </p></li>
989 989 *
990 990 * <li><p> If the given URI has an authority component then the new URI's
991 991 * authority and path are taken from the given URI. </p></li>
992 992 *
993 993 * <li><p> Otherwise the new URI's authority component is copied from
994 994 * this URI, and its path is computed as follows: </p>
995 995 *
996 996 * <ol type=a>
997 997 *
998 998 * <li><p> If the given URI's path is absolute then the new URI's path
999 999 * is taken from the given URI. </p></li>
1000 1000 *
1001 1001 * <li><p> Otherwise the given URI's path is relative, and so the new
1002 1002 * URI's path is computed by resolving the path of the given URI
1003 1003 * against the path of this URI. This is done by concatenating all but
1004 1004 * the last segment of this URI's path, if any, with the given URI's
1005 1005 * path and then normalizing the result as if by invoking the {@link
1006 1006 * #normalize() normalize} method. </p></li>
1007 1007 *
1008 1008 * </ol></li>
1009 1009 *
1010 1010 * </ol>
1011 1011 *
1012 1012 * <p> The result of this method is absolute if, and only if, either this
1013 1013 * URI is absolute or the given URI is absolute. </p>
1014 1014 *
1015 1015 * @param uri The URI to be resolved against this URI
1016 1016 * @return The resulting URI
1017 1017 *
1018 1018 * @throws NullPointerException
1019 1019 * If <tt>uri</tt> is <tt>null</tt>
1020 1020 */
1021 1021 public URI resolve(URI uri) {
1022 1022 return resolve(this, uri);
1023 1023 }
1024 1024
1025 1025 /**
1026 1026 * Constructs a new URI by parsing the given string and then resolving it
1027 1027 * against this URI.
1028 1028 *
1029 1029 * <p> This convenience method works as if invoking it were equivalent to
1030 1030 * evaluating the expression <tt>{@link #resolve(java.net.URI)
1031 1031 * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>
1032 1032 *
1033 1033 * @param str The string to be parsed into a URI
1034 1034 * @return The resulting URI
1035 1035 *
1036 1036 * @throws NullPointerException
1037 1037 * If <tt>str</tt> is <tt>null</tt>
1038 1038 *
1039 1039 * @throws IllegalArgumentException
1040 1040 * If the given string violates RFC 2396
1041 1041 */
1042 1042 public URI resolve(String str) {
1043 1043 return resolve(URI.create(str));
1044 1044 }
1045 1045
1046 1046 /**
1047 1047 * Relativizes the given URI against this URI.
1048 1048 *
1049 1049 * <p> The relativization of the given URI against this URI is computed as
1050 1050 * follows: </p>
1051 1051 *
1052 1052 * <ol>
1053 1053 *
1054 1054 * <li><p> If either this URI or the given URI are opaque, or if the
1055 1055 * scheme and authority components of the two URIs are not identical, or
1056 1056 * if the path of this URI is not a prefix of the path of the given URI,
1057 1057 * then the given URI is returned. </p></li>
1058 1058 *
1059 1059 * <li><p> Otherwise a new relative hierarchical URI is constructed with
1060 1060 * query and fragment components taken from the given URI and with a path
1061 1061 * component computed by removing this URI's path from the beginning of
1062 1062 * the given URI's path. </p></li>
1063 1063 *
1064 1064 * </ol>
1065 1065 *
1066 1066 * @param uri The URI to be relativized against this URI
1067 1067 * @return The resulting URI
1068 1068 *
1069 1069 * @throws NullPointerException
1070 1070 * If <tt>uri</tt> is <tt>null</tt>
1071 1071 */
1072 1072 public URI relativize(URI uri) {
1073 1073 return relativize(this, uri);
1074 1074 }
1075 1075
1076 1076 /**
1077 1077 * Constructs a URL from this URI.
1078 1078 *
1079 1079 * <p> This convenience method works as if invoking it were equivalent to
1080 1080 * evaluating the expression <tt>new URL(this.toString())</tt> after
1081 1081 * first checking that this URI is absolute. </p>
1082 1082 *
1083 1083 * @return A URL constructed from this URI
1084 1084 *
1085 1085 * @throws IllegalArgumentException
1086 1086 * If this URI is not absolute
1087 1087 *
1088 1088 * @throws MalformedURLException
1089 1089 * If a protocol handler for the URL could not be found,
1090 1090 * or if some other error occurred while constructing the URL
1091 1091 */
1092 1092 public URL toURL()
1093 1093 throws MalformedURLException {
1094 1094 if (!isAbsolute())
1095 1095 throw new IllegalArgumentException("URI is not absolute");
1096 1096 return new URL(toString());
1097 1097 }
1098 1098
1099 1099 // -- Component access methods --
1100 1100
1101 1101 /**
1102 1102 * Returns the scheme component of this URI.
1103 1103 *
1104 1104 * <p> The scheme component of a URI, if defined, only contains characters
1105 1105 * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>. A
1106 1106 * scheme always starts with an <i>alpha</i> character. <p>
1107 1107 *
1108 1108 * The scheme component of a URI cannot contain escaped octets, hence this
1109 1109 * method does not perform any decoding.
1110 1110 *
1111 1111 * @return The scheme component of this URI,
1112 1112 * or <tt>null</tt> if the scheme is undefined
1113 1113 */
1114 1114 public String getScheme() {
1115 1115 return scheme;
1116 1116 }
1117 1117
1118 1118 /**
1119 1119 * Tells whether or not this URI is absolute.
1120 1120 *
1121 1121 * <p> A URI is absolute if, and only if, it has a scheme component. </p>
1122 1122 *
1123 1123 * @return <tt>true</tt> if, and only if, this URI is absolute
1124 1124 */
1125 1125 public boolean isAbsolute() {
1126 1126 return scheme != null;
1127 1127 }
1128 1128
1129 1129 /**
1130 1130 * Tells whether or not this URI is opaque.
1131 1131 *
1132 1132 * <p> A URI is opaque if, and only if, it is absolute and its
1133 1133 * scheme-specific part does not begin with a slash character ('/').
1134 1134 * An opaque URI has a scheme, a scheme-specific part, and possibly
1135 1135 * a fragment; all other components are undefined. </p>
1136 1136 *
1137 1137 * @return <tt>true</tt> if, and only if, this URI is opaque
1138 1138 */
1139 1139 public boolean isOpaque() {
1140 1140 return path == null;
1141 1141 }
1142 1142
1143 1143 /**
1144 1144 * Returns the raw scheme-specific part of this URI. The scheme-specific
1145 1145 * part is never undefined, though it may be empty.
1146 1146 *
1147 1147 * <p> The scheme-specific part of a URI only contains legal URI
1148 1148 * characters. </p>
1149 1149 *
1150 1150 * @return The raw scheme-specific part of this URI
1151 1151 * (never <tt>null</tt>)
1152 1152 */
1153 1153 public String getRawSchemeSpecificPart() {
1154 1154 defineSchemeSpecificPart();
1155 1155 return schemeSpecificPart;
1156 1156 }
1157 1157
1158 1158 /**
1159 1159 * Returns the decoded scheme-specific part of this URI.
1160 1160 *
1161 1161 * <p> The string returned by this method is equal to that returned by the
1162 1162 * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
1163 1163 * except that all sequences of escaped octets are <a
1164 1164 * href="#decode">decoded</a>. </p>
1165 1165 *
1166 1166 * @return The decoded scheme-specific part of this URI
1167 1167 * (never <tt>null</tt>)
1168 1168 */
1169 1169 public String getSchemeSpecificPart() {
1170 1170 if (decodedSchemeSpecificPart == null)
1171 1171 decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());
1172 1172 return decodedSchemeSpecificPart;
1173 1173 }
1174 1174
1175 1175 /**
1176 1176 * Returns the raw authority component of this URI.
1177 1177 *
1178 1178 * <p> The authority component of a URI, if defined, only contains the
1179 1179 * commercial-at character (<tt>'@'</tt>) and characters in the
1180 1180 * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
1181 1181 * categories. If the authority is server-based then it is further
1182 1182 * constrained to have valid user-information, host, and port
1183 1183 * components. </p>
1184 1184 *
1185 1185 * @return The raw authority component of this URI,
1186 1186 * or <tt>null</tt> if the authority is undefined
1187 1187 */
1188 1188 public String getRawAuthority() {
1189 1189 return authority;
1190 1190 }
1191 1191
1192 1192 /**
1193 1193 * Returns the decoded authority component of this URI.
1194 1194 *
1195 1195 * <p> The string returned by this method is equal to that returned by the
1196 1196 * {@link #getRawAuthority() getRawAuthority} method except that all
1197 1197 * sequences of escaped octets are <a href="#decode">decoded</a>. </p>
1198 1198 *
1199 1199 * @return The decoded authority component of this URI,
1200 1200 * or <tt>null</tt> if the authority is undefined
1201 1201 */
1202 1202 public String getAuthority() {
1203 1203 if (decodedAuthority == null)
1204 1204 decodedAuthority = decode(authority);
1205 1205 return decodedAuthority;
1206 1206 }
1207 1207
1208 1208 /**
1209 1209 * Returns the raw user-information component of this URI.
1210 1210 *
1211 1211 * <p> The user-information component of a URI, if defined, only contains
1212 1212 * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
1213 1213 * <i>other</i> categories. </p>
1214 1214 *
1215 1215 * @return The raw user-information component of this URI,
1216 1216 * or <tt>null</tt> if the user information is undefined
1217 1217 */
1218 1218 public String getRawUserInfo() {
1219 1219 return userInfo;
1220 1220 }
1221 1221
1222 1222 /**
1223 1223 * Returns the decoded user-information component of this URI.
1224 1224 *
1225 1225 * <p> The string returned by this method is equal to that returned by the
1226 1226 * {@link #getRawUserInfo() getRawUserInfo} method except that all
1227 1227 * sequences of escaped octets are <a href="#decode">decoded</a>. </p>
1228 1228 *
1229 1229 * @return The decoded user-information component of this URI,
1230 1230 * or <tt>null</tt> if the user information is undefined
1231 1231 */
1232 1232 public String getUserInfo() {
1233 1233 if ((decodedUserInfo == null) && (userInfo != null))
1234 1234 decodedUserInfo = decode(userInfo);
1235 1235 return decodedUserInfo;
1236 1236 }
1237 1237
1238 1238 /**
1239 1239 * Returns the host component of this URI.
1240 1240 *
1241 1241 * <p> The host component of a URI, if defined, will have one of the
1242 1242 * following forms: </p>
1243 1243 *
1244 1244 * <ul type=disc>
1245 1245 *
1246 1246 * <li><p> A domain name consisting of one or more <i>labels</i>
1247 1247 * separated by period characters (<tt>'.'</tt>), optionally followed by
1248 1248 * a period character. Each label consists of <i>alphanum</i> characters
1249 1249 * as well as hyphen characters (<tt>'-'</tt>), though hyphens never
1250 1250 * occur as the first or last characters in a label. The rightmost
1251 1251 * label of a domain name consisting of two or more labels, begins
1252 1252 * with an <i>alpha</i> character. </p></li>
1253 1253 *
1254 1254 * <li><p> A dotted-quad IPv4 address of the form
1255 1255 * <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,
1256 1256 * where no <i>digit</i> sequence is longer than three characters and no
1257 1257 * sequence has a value larger than 255. </p></li>
1258 1258 *
1259 1259 * <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and
1260 1260 * <tt>']'</tt>) and consisting of hexadecimal digits, colon characters
1261 1261 * (<tt>':'</tt>), and possibly an embedded IPv4 address. The full
1262 1262 * syntax of IPv6 addresses is specified in <a
1263 1263 * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC 2373: IPv6
1264 1264 * Addressing Architecture</i></a>. </p></li>
1265 1265 *
1266 1266 * </ul>
1267 1267 *
1268 1268 * The host component of a URI cannot contain escaped octets, hence this
1269 1269 * method does not perform any decoding.
1270 1270 *
1271 1271 * @return The host component of this URI,
1272 1272 * or <tt>null</tt> if the host is undefined
1273 1273 */
1274 1274 public String getHost() {
1275 1275 return host;
1276 1276 }
1277 1277
1278 1278 /**
1279 1279 * Returns the port number of this URI.
1280 1280 *
1281 1281 * <p> The port component of a URI, if defined, is a non-negative
1282 1282 * integer. </p>
1283 1283 *
1284 1284 * @return The port component of this URI,
1285 1285 * or <tt>-1</tt> if the port is undefined
1286 1286 */
1287 1287 public int getPort() {
1288 1288 return port;
1289 1289 }
1290 1290
1291 1291 /**
1292 1292 * Returns the raw path component of this URI.
1293 1293 *
1294 1294 * <p> The path component of a URI, if defined, only contains the slash
1295 1295 * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),
1296 1296 * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
1297 1297 * and <i>other</i> categories. </p>
1298 1298 *
1299 1299 * @return The path component of this URI,
1300 1300 * or <tt>null</tt> if the path is undefined
1301 1301 */
1302 1302 public String getRawPath() {
1303 1303 return path;
1304 1304 }
1305 1305
1306 1306 /**
1307 1307 * Returns the decoded path component of this URI.
1308 1308 *
1309 1309 * <p> The string returned by this method is equal to that returned by the
1310 1310 * {@link #getRawPath() getRawPath} method except that all sequences of
1311 1311 * escaped octets are <a href="#decode">decoded</a>. </p>
1312 1312 *
1313 1313 * @return The decoded path component of this URI,
1314 1314 * or <tt>null</tt> if the path is undefined
1315 1315 */
1316 1316 public String getPath() {
1317 1317 if ((decodedPath == null) && (path != null))
1318 1318 decodedPath = decode(path);
1319 1319 return decodedPath;
1320 1320 }
1321 1321
1322 1322 /**
1323 1323 * Returns the raw query component of this URI.
1324 1324 *
1325 1325 * <p> The query component of a URI, if defined, only contains legal URI
1326 1326 * characters. </p>
1327 1327 *
1328 1328 * @return The raw query component of this URI,
1329 1329 * or <tt>null</tt> if the query is undefined
1330 1330 */
1331 1331 public String getRawQuery() {
1332 1332 return query;
1333 1333 }
1334 1334
1335 1335 /**
1336 1336 * Returns the decoded query component of this URI.
1337 1337 *
1338 1338 * <p> The string returned by this method is equal to that returned by the
1339 1339 * {@link #getRawQuery() getRawQuery} method except that all sequences of
1340 1340 * escaped octets are <a href="#decode">decoded</a>. </p>
1341 1341 *
1342 1342 * @return The decoded query component of this URI,
1343 1343 * or <tt>null</tt> if the query is undefined
1344 1344 */
1345 1345 public String getQuery() {
1346 1346 if ((decodedQuery == null) && (query != null))
1347 1347 decodedQuery = decode(query);
1348 1348 return decodedQuery;
1349 1349 }
1350 1350
1351 1351 /**
1352 1352 * Returns the raw fragment component of this URI.
1353 1353 *
1354 1354 * <p> The fragment component of a URI, if defined, only contains legal URI
1355 1355 * characters. </p>
1356 1356 *
1357 1357 * @return The raw fragment component of this URI,
1358 1358 * or <tt>null</tt> if the fragment is undefined
1359 1359 */
1360 1360 public String getRawFragment() {
1361 1361 return fragment;
1362 1362 }
1363 1363
1364 1364 /**
1365 1365 * Returns the decoded fragment component of this URI.
1366 1366 *
1367 1367 * <p> The string returned by this method is equal to that returned by the
1368 1368 * {@link #getRawFragment() getRawFragment} method except that all
1369 1369 * sequences of escaped octets are <a href="#decode">decoded</a>. </p>
1370 1370 *
1371 1371 * @return The decoded fragment component of this URI,
1372 1372 * or <tt>null</tt> if the fragment is undefined
1373 1373 */
1374 1374 public String getFragment() {
1375 1375 if ((decodedFragment == null) && (fragment != null))
1376 1376 decodedFragment = decode(fragment);
1377 1377 return decodedFragment;
1378 1378 }
1379 1379
1380 1380
1381 1381 // -- Equality, comparison, hash code, toString, and serialization --
1382 1382
1383 1383 /**
1384 1384 * Tests this URI for equality with another object.
1385 1385 *
1386 1386 * <p> If the given object is not a URI then this method immediately
1387 1387 * returns <tt>false</tt>.
1388 1388 *
1389 1389 * <p> For two URIs to be considered equal requires that either both are
1390 1390 * opaque or both are hierarchical. Their schemes must either both be
1391 1391 * undefined or else be equal without regard to case. Their fragments
1392 1392 * must either both be undefined or else be equal.
1393 1393 *
1394 1394 * <p> For two opaque URIs to be considered equal, their scheme-specific
1395 1395 * parts must be equal.
1396 1396 *
1397 1397 * <p> For two hierarchical URIs to be considered equal, their paths must
1398 1398 * be equal and their queries must either both be undefined or else be
1399 1399 * equal. Their authorities must either both be undefined, or both be
1400 1400 * registry-based, or both be server-based. If their authorities are
1401 1401 * defined and are registry-based, then they must be equal. If their
1402 1402 * authorities are defined and are server-based, then their hosts must be
1403 1403 * equal without regard to case, their port numbers must be equal, and
1404 1404 * their user-information components must be equal.
1405 1405 *
1406 1406 * <p> When testing the user-information, path, query, fragment, authority,
1407 1407 * or scheme-specific parts of two URIs for equality, the raw forms rather
1408 1408 * than the encoded forms of these components are compared and the
1409 1409 * hexadecimal digits of escaped octets are compared without regard to
1410 1410 * case.
1411 1411 *
1412 1412 * <p> This method satisfies the general contract of the {@link
1413 1413 * java.lang.Object#equals(Object) Object.equals} method. </p>
1414 1414 *
1415 1415 * @param ob The object to which this object is to be compared
1416 1416 *
1417 1417 * @return <tt>true</tt> if, and only if, the given object is a URI that
1418 1418 * is identical to this URI
1419 1419 */
1420 1420 public boolean equals(Object ob) {
1421 1421 if (ob == this)
1422 1422 return true;
1423 1423 if (!(ob instanceof URI))
1424 1424 return false;
1425 1425 URI that = (URI)ob;
1426 1426 if (this.isOpaque() != that.isOpaque()) return false;
1427 1427 if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
1428 1428 if (!equal(this.fragment, that.fragment)) return false;
1429 1429
1430 1430 // Opaque
1431 1431 if (this.isOpaque())
1432 1432 return equal(this.schemeSpecificPart, that.schemeSpecificPart);
1433 1433
1434 1434 // Hierarchical
1435 1435 if (!equal(this.path, that.path)) return false;
1436 1436 if (!equal(this.query, that.query)) return false;
1437 1437
1438 1438 // Authorities
1439 1439 if (this.authority == that.authority) return true;
1440 1440 if (this.host != null) {
1441 1441 // Server-based
1442 1442 if (!equal(this.userInfo, that.userInfo)) return false;
1443 1443 if (!equalIgnoringCase(this.host, that.host)) return false;
1444 1444 if (this.port != that.port) return false;
1445 1445 } else if (this.authority != null) {
1446 1446 // Registry-based
1447 1447 if (!equal(this.authority, that.authority)) return false;
1448 1448 } else if (this.authority != that.authority) {
1449 1449 return false;
1450 1450 }
1451 1451
1452 1452 return true;
1453 1453 }
1454 1454
1455 1455 /**
1456 1456 * Returns a hash-code value for this URI. The hash code is based upon all
1457 1457 * of the URI's components, and satisfies the general contract of the
1458 1458 * {@link java.lang.Object#hashCode() Object.hashCode} method.
1459 1459 *
1460 1460 * @return A hash-code value for this URI
1461 1461 */
1462 1462 public int hashCode() {
1463 1463 if (hash != 0)
1464 1464 return hash;
1465 1465 int h = hashIgnoringCase(0, scheme);
1466 1466 h = hash(h, fragment);
1467 1467 if (isOpaque()) {
1468 1468 h = hash(h, schemeSpecificPart);
1469 1469 } else {
1470 1470 h = hash(h, path);
1471 1471 h = hash(h, query);
1472 1472 if (host != null) {
1473 1473 h = hash(h, userInfo);
1474 1474 h = hashIgnoringCase(h, host);
1475 1475 h += 1949 * port;
1476 1476 } else {
1477 1477 h = hash(h, authority);
1478 1478 }
1479 1479 }
1480 1480 hash = h;
1481 1481 return h;
1482 1482 }
1483 1483
1484 1484 /**
1485 1485 * Compares this URI to another object, which must be a URI.
1486 1486 *
1487 1487 * <p> When comparing corresponding components of two URIs, if one
1488 1488 * component is undefined but the other is defined then the first is
1489 1489 * considered to be less than the second. Unless otherwise noted, string
1490 1490 * components are ordered according to their natural, case-sensitive
1491 1491 * ordering as defined by the {@link java.lang.String#compareTo(Object)
1492 1492 * String.compareTo} method. String components that are subject to
1493 1493 * encoding are compared by comparing their raw forms rather than their
1494 1494 * encoded forms.
1495 1495 *
1496 1496 * <p> The ordering of URIs is defined as follows: </p>
1497 1497 *
1498 1498 * <ul type=disc>
1499 1499 *
1500 1500 * <li><p> Two URIs with different schemes are ordered according the
1501 1501 * ordering of their schemes, without regard to case. </p></li>
1502 1502 *
1503 1503 * <li><p> A hierarchical URI is considered to be less than an opaque URI
1504 1504 * with an identical scheme. </p></li>
1505 1505 *
1506 1506 * <li><p> Two opaque URIs with identical schemes are ordered according
1507 1507 * to the ordering of their scheme-specific parts. </p></li>
1508 1508 *
1509 1509 * <li><p> Two opaque URIs with identical schemes and scheme-specific
1510 1510 * parts are ordered according to the ordering of their
1511 1511 * fragments. </p></li>
1512 1512 *
1513 1513 * <li><p> Two hierarchical URIs with identical schemes are ordered
1514 1514 * according to the ordering of their authority components: </p>
1515 1515 *
1516 1516 * <ul type=disc>
1517 1517 *
1518 1518 * <li><p> If both authority components are server-based then the URIs
1519 1519 * are ordered according to their user-information components; if these
1520 1520 * components are identical then the URIs are ordered according to the
1521 1521 * ordering of their hosts, without regard to case; if the hosts are
1522 1522 * identical then the URIs are ordered according to the ordering of
1523 1523 * their ports. </p></li>
1524 1524 *
1525 1525 * <li><p> If one or both authority components are registry-based then
1526 1526 * the URIs are ordered according to the ordering of their authority
1527 1527 * components. </p></li>
1528 1528 *
1529 1529 * </ul></li>
1530 1530 *
1531 1531 * <li><p> Finally, two hierarchical URIs with identical schemes and
1532 1532 * authority components are ordered according to the ordering of their
1533 1533 * paths; if their paths are identical then they are ordered according to
1534 1534 * the ordering of their queries; if the queries are identical then they
1535 1535 * are ordered according to the order of their fragments. </p></li>
1536 1536 *
1537 1537 * </ul>
1538 1538 *
1539 1539 * <p> This method satisfies the general contract of the {@link
1540 1540 * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
1541 1541 * method. </p>
1542 1542 *
1543 1543 * @param that
1544 1544 * The object to which this URI is to be compared
1545 1545 *
1546 1546 * @return A negative integer, zero, or a positive integer as this URI is
1547 1547 * less than, equal to, or greater than the given URI
1548 1548 *
1549 1549 * @throws ClassCastException
1550 1550 * If the given object is not a URI
1551 1551 */
1552 1552 public int compareTo(URI that) {
1553 1553 int c;
1554 1554
1555 1555 if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
1556 1556 return c;
1557 1557
1558 1558 if (this.isOpaque()) {
1559 1559 if (that.isOpaque()) {
1560 1560 // Both opaque
1561 1561 if ((c = compare(this.schemeSpecificPart,
1562 1562 that.schemeSpecificPart)) != 0)
1563 1563 return c;
1564 1564 return compare(this.fragment, that.fragment);
1565 1565 }
1566 1566 return +1; // Opaque > hierarchical
1567 1567 } else if (that.isOpaque()) {
1568 1568 return -1; // Hierarchical < opaque
1569 1569 }
1570 1570
1571 1571 // Hierarchical
1572 1572 if ((this.host != null) && (that.host != null)) {
1573 1573 // Both server-based
1574 1574 if ((c = compare(this.userInfo, that.userInfo)) != 0)
1575 1575 return c;
1576 1576 if ((c = compareIgnoringCase(this.host, that.host)) != 0)
1577 1577 return c;
1578 1578 if ((c = this.port - that.port) != 0)
1579 1579 return c;
1580 1580 } else {
1581 1581 // If one or both authorities are registry-based then we simply
1582 1582 // compare them in the usual, case-sensitive way. If one is
1583 1583 // registry-based and one is server-based then the strings are
1584 1584 // guaranteed to be unequal, hence the comparison will never return
1585 1585 // zero and the compareTo and equals methods will remain
1586 1586 // consistent.
1587 1587 if ((c = compare(this.authority, that.authority)) != 0) return c;
1588 1588 }
1589 1589
1590 1590 if ((c = compare(this.path, that.path)) != 0) return c;
1591 1591 if ((c = compare(this.query, that.query)) != 0) return c;
1592 1592 return compare(this.fragment, that.fragment);
1593 1593 }
1594 1594
1595 1595 /**
1596 1596 * Returns the content of this URI as a string.
1597 1597 *
1598 1598 * <p> If this URI was created by invoking one of the constructors in this
1599 1599 * class then a string equivalent to the original input string, or to the
1600 1600 * string computed from the originally-given components, as appropriate, is
1601 1601 * returned. Otherwise this URI was created by normalization, resolution,
1602 1602 * or relativization, and so a string is constructed from this URI's
1603 1603 * components according to the rules specified in <a
1604 1604 * href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>,
1605 1605 * section 5.2, step 7. </p>
1606 1606 *
1607 1607 * @return The string form of this URI
1608 1608 */
1609 1609 public String toString() {
1610 1610 defineString();
1611 1611 return string;
1612 1612 }
1613 1613
1614 1614 /**
1615 1615 * Returns the content of this URI as a US-ASCII string.
1616 1616 *
1617 1617 * <p> If this URI does not contain any characters in the <i>other</i>
1618 1618 * category then an invocation of this method will return the same value as
1619 1619 * an invocation of the {@link #toString() toString} method. Otherwise
1620 1620 * this method works as if by invoking that method and then <a
1621 1621 * href="#encode">encoding</a> the result. </p>
1622 1622 *
1623 1623 * @return The string form of this URI, encoded as needed
1624 1624 * so that it only contains characters in the US-ASCII
1625 1625 * charset
1626 1626 */
1627 1627 public String toASCIIString() {
1628 1628 defineString();
1629 1629 return encode(string);
1630 1630 }
1631 1631
1632 1632
1633 1633 // -- Serialization support --
1634 1634
1635 1635 /**
1636 1636 * Saves the content of this URI to the given serial stream.
1637 1637 *
1638 1638 * <p> The only serializable field of a URI instance is its <tt>string</tt>
1639 1639 * field. That field is given a value, if it does not have one already,
1640 1640 * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
1641 1641 * method of the given object-output stream is invoked. </p>
1642 1642 *
1643 1643 * @param os The object-output stream to which this object
1644 1644 * is to be written
1645 1645 */
1646 1646 private void writeObject(ObjectOutputStream os)
1647 1647 throws IOException
1648 1648 {
1649 1649 defineString();
1650 1650 os.defaultWriteObject(); // Writes the string field only
1651 1651 }
1652 1652
1653 1653 /**
1654 1654 * Reconstitutes a URI from the given serial stream.
1655 1655 *
1656 1656 * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
1657 1657 * invoked to read the value of the <tt>string</tt> field. The result is
1658 1658 * then parsed in the usual way.
1659 1659 *
1660 1660 * @param is The object-input stream from which this object
1661 1661 * is being read
1662 1662 */
1663 1663 private void readObject(ObjectInputStream is)
1664 1664 throws ClassNotFoundException, IOException
1665 1665 {
1666 1666 port = -1; // Argh
1667 1667 is.defaultReadObject();
1668 1668 try {
1669 1669 new Parser(string).parse(false);
1670 1670 } catch (URISyntaxException x) {
1671 1671 IOException y = new InvalidObjectException("Invalid URI");
1672 1672 y.initCause(x);
1673 1673 throw y;
1674 1674 }
1675 1675 }
1676 1676
1677 1677
1678 1678 // -- End of public methods --
1679 1679
1680 1680
1681 1681 // -- Utility methods for string-field comparison and hashing --
1682 1682
1683 1683 // These methods return appropriate values for null string arguments,
1684 1684 // thereby simplifying the equals, hashCode, and compareTo methods.
1685 1685 //
1686 1686 // The case-ignoring methods should only be applied to strings whose
1687 1687 // characters are all known to be US-ASCII. Because of this restriction,
1688 1688 // these methods are faster than the similar methods in the String class.
1689 1689
1690 1690 // US-ASCII only
1691 1691 private static int toLower(char c) {
1692 1692 if ((c >= 'A') && (c <= 'Z'))
1693 1693 return c + ('a' - 'A');
1694 1694 return c;
1695 1695 }
1696 1696
1697 1697 private static boolean equal(String s, String t) {
1698 1698 if (s == t) return true;
1699 1699 if ((s != null) && (t != null)) {
1700 1700 if (s.length() != t.length())
1701 1701 return false;
1702 1702 if (s.indexOf('%') < 0)
1703 1703 return s.equals(t);
↓ open down ↓ |
1703 lines elided |
↑ open up ↑ |
1704 1704 int n = s.length();
1705 1705 for (int i = 0; i < n;) {
1706 1706 char c = s.charAt(i);
1707 1707 char d = t.charAt(i);
1708 1708 if (c != '%') {
1709 1709 if (c != d)
1710 1710 return false;
1711 1711 i++;
1712 1712 continue;
1713 1713 }
1714 + if (d != '%')
1715 + return false;
1714 1716 i++;
1715 1717 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1716 1718 return false;
1717 1719 i++;
1718 1720 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1719 1721 return false;
1720 1722 i++;
1721 1723 }
1722 1724 return true;
1723 1725 }
1724 1726 return false;
1725 1727 }
1726 1728
1727 1729 // US-ASCII only
1728 1730 private static boolean equalIgnoringCase(String s, String t) {
1729 1731 if (s == t) return true;
1730 1732 if ((s != null) && (t != null)) {
1731 1733 int n = s.length();
1732 1734 if (t.length() != n)
1733 1735 return false;
1734 1736 for (int i = 0; i < n; i++) {
1735 1737 if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
1736 1738 return false;
1737 1739 }
1738 1740 return true;
1739 1741 }
1740 1742 return false;
1741 1743 }
1742 1744
1743 1745 private static int hash(int hash, String s) {
1744 1746 if (s == null) return hash;
1745 1747 return hash * 127 + s.hashCode();
1746 1748 }
1747 1749
1748 1750 // US-ASCII only
1749 1751 private static int hashIgnoringCase(int hash, String s) {
1750 1752 if (s == null) return hash;
1751 1753 int h = hash;
1752 1754 int n = s.length();
1753 1755 for (int i = 0; i < n; i++)
1754 1756 h = 31 * h + toLower(s.charAt(i));
1755 1757 return h;
1756 1758 }
1757 1759
1758 1760 private static int compare(String s, String t) {
1759 1761 if (s == t) return 0;
1760 1762 if (s != null) {
1761 1763 if (t != null)
1762 1764 return s.compareTo(t);
1763 1765 else
1764 1766 return +1;
1765 1767 } else {
1766 1768 return -1;
1767 1769 }
1768 1770 }
1769 1771
1770 1772 // US-ASCII only
1771 1773 private static int compareIgnoringCase(String s, String t) {
1772 1774 if (s == t) return 0;
1773 1775 if (s != null) {
1774 1776 if (t != null) {
1775 1777 int sn = s.length();
1776 1778 int tn = t.length();
1777 1779 int n = sn < tn ? sn : tn;
1778 1780 for (int i = 0; i < n; i++) {
1779 1781 int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
1780 1782 if (c != 0)
1781 1783 return c;
1782 1784 }
1783 1785 return sn - tn;
1784 1786 }
1785 1787 return +1;
1786 1788 } else {
1787 1789 return -1;
1788 1790 }
1789 1791 }
1790 1792
1791 1793
1792 1794 // -- String construction --
1793 1795
1794 1796 // If a scheme is given then the path, if given, must be absolute
1795 1797 //
1796 1798 private static void checkPath(String s, String scheme, String path)
1797 1799 throws URISyntaxException
1798 1800 {
1799 1801 if (scheme != null) {
1800 1802 if ((path != null)
1801 1803 && ((path.length() > 0) && (path.charAt(0) != '/')))
1802 1804 throw new URISyntaxException(s,
1803 1805 "Relative path in absolute URI");
1804 1806 }
1805 1807 }
1806 1808
1807 1809 private void appendAuthority(StringBuffer sb,
1808 1810 String authority,
1809 1811 String userInfo,
1810 1812 String host,
1811 1813 int port)
1812 1814 {
1813 1815 if (host != null) {
1814 1816 sb.append("//");
1815 1817 if (userInfo != null) {
1816 1818 sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
1817 1819 sb.append('@');
1818 1820 }
1819 1821 boolean needBrackets = ((host.indexOf(':') >= 0)
1820 1822 && !host.startsWith("[")
1821 1823 && !host.endsWith("]"));
1822 1824 if (needBrackets) sb.append('[');
1823 1825 sb.append(host);
1824 1826 if (needBrackets) sb.append(']');
1825 1827 if (port != -1) {
1826 1828 sb.append(':');
1827 1829 sb.append(port);
1828 1830 }
1829 1831 } else if (authority != null) {
1830 1832 sb.append("//");
1831 1833 if (authority.startsWith("[")) {
1832 1834 // authority should (but may not) contain an embedded IPv6 address
1833 1835 int end = authority.indexOf("]");
1834 1836 String doquote = authority, dontquote = "";
1835 1837 if (end != -1 && authority.indexOf(":") != -1) {
1836 1838 // the authority contains an IPv6 address
1837 1839 if (end == authority.length()) {
1838 1840 dontquote = authority;
1839 1841 doquote = "";
1840 1842 } else {
1841 1843 dontquote = authority.substring(0 , end + 1);
1842 1844 doquote = authority.substring(end + 1);
1843 1845 }
1844 1846 }
1845 1847 sb.append(dontquote);
1846 1848 sb.append(quote(doquote,
1847 1849 L_REG_NAME | L_SERVER,
1848 1850 H_REG_NAME | H_SERVER));
1849 1851 } else {
1850 1852 sb.append(quote(authority,
1851 1853 L_REG_NAME | L_SERVER,
1852 1854 H_REG_NAME | H_SERVER));
1853 1855 }
1854 1856 }
1855 1857 }
1856 1858
1857 1859 private void appendSchemeSpecificPart(StringBuffer sb,
1858 1860 String opaquePart,
1859 1861 String authority,
1860 1862 String userInfo,
1861 1863 String host,
1862 1864 int port,
1863 1865 String path,
1864 1866 String query)
1865 1867 {
1866 1868 if (opaquePart != null) {
1867 1869 /* check if SSP begins with an IPv6 address
1868 1870 * because we must not quote a literal IPv6 address
1869 1871 */
1870 1872 if (opaquePart.startsWith("//[")) {
1871 1873 int end = opaquePart.indexOf("]");
1872 1874 if (end != -1 && opaquePart.indexOf(":")!=-1) {
1873 1875 String doquote, dontquote;
1874 1876 if (end == opaquePart.length()) {
1875 1877 dontquote = opaquePart;
1876 1878 doquote = "";
1877 1879 } else {
1878 1880 dontquote = opaquePart.substring(0,end+1);
1879 1881 doquote = opaquePart.substring(end+1);
1880 1882 }
1881 1883 sb.append (dontquote);
1882 1884 sb.append(quote(doquote, L_URIC, H_URIC));
1883 1885 }
1884 1886 } else {
1885 1887 sb.append(quote(opaquePart, L_URIC, H_URIC));
1886 1888 }
1887 1889 } else {
1888 1890 appendAuthority(sb, authority, userInfo, host, port);
1889 1891 if (path != null)
1890 1892 sb.append(quote(path, L_PATH, H_PATH));
1891 1893 if (query != null) {
1892 1894 sb.append('?');
1893 1895 sb.append(quote(query, L_URIC, H_URIC));
1894 1896 }
1895 1897 }
1896 1898 }
1897 1899
1898 1900 private void appendFragment(StringBuffer sb, String fragment) {
1899 1901 if (fragment != null) {
1900 1902 sb.append('#');
1901 1903 sb.append(quote(fragment, L_URIC, H_URIC));
1902 1904 }
1903 1905 }
1904 1906
1905 1907 private String toString(String scheme,
1906 1908 String opaquePart,
1907 1909 String authority,
1908 1910 String userInfo,
1909 1911 String host,
1910 1912 int port,
1911 1913 String path,
1912 1914 String query,
1913 1915 String fragment)
1914 1916 {
1915 1917 StringBuffer sb = new StringBuffer();
1916 1918 if (scheme != null) {
1917 1919 sb.append(scheme);
1918 1920 sb.append(':');
1919 1921 }
1920 1922 appendSchemeSpecificPart(sb, opaquePart,
1921 1923 authority, userInfo, host, port,
1922 1924 path, query);
1923 1925 appendFragment(sb, fragment);
1924 1926 return sb.toString();
1925 1927 }
1926 1928
1927 1929 private void defineSchemeSpecificPart() {
1928 1930 if (schemeSpecificPart != null) return;
1929 1931 StringBuffer sb = new StringBuffer();
1930 1932 appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
1931 1933 host, port, getPath(), getQuery());
1932 1934 if (sb.length() == 0) return;
1933 1935 schemeSpecificPart = sb.toString();
1934 1936 }
1935 1937
1936 1938 private void defineString() {
1937 1939 if (string != null) return;
1938 1940
1939 1941 StringBuffer sb = new StringBuffer();
1940 1942 if (scheme != null) {
1941 1943 sb.append(scheme);
1942 1944 sb.append(':');
1943 1945 }
1944 1946 if (isOpaque()) {
1945 1947 sb.append(schemeSpecificPart);
1946 1948 } else {
1947 1949 if (host != null) {
1948 1950 sb.append("//");
1949 1951 if (userInfo != null) {
1950 1952 sb.append(userInfo);
1951 1953 sb.append('@');
1952 1954 }
1953 1955 boolean needBrackets = ((host.indexOf(':') >= 0)
1954 1956 && !host.startsWith("[")
1955 1957 && !host.endsWith("]"));
1956 1958 if (needBrackets) sb.append('[');
1957 1959 sb.append(host);
1958 1960 if (needBrackets) sb.append(']');
1959 1961 if (port != -1) {
1960 1962 sb.append(':');
1961 1963 sb.append(port);
1962 1964 }
1963 1965 } else if (authority != null) {
1964 1966 sb.append("//");
1965 1967 sb.append(authority);
1966 1968 }
1967 1969 if (path != null)
1968 1970 sb.append(path);
1969 1971 if (query != null) {
1970 1972 sb.append('?');
1971 1973 sb.append(query);
1972 1974 }
1973 1975 }
1974 1976 if (fragment != null) {
1975 1977 sb.append('#');
1976 1978 sb.append(fragment);
1977 1979 }
1978 1980 string = sb.toString();
1979 1981 }
1980 1982
1981 1983
1982 1984 // -- Normalization, resolution, and relativization --
1983 1985
1984 1986 // RFC2396 5.2 (6)
1985 1987 private static String resolvePath(String base, String child,
1986 1988 boolean absolute)
1987 1989 {
1988 1990 int i = base.lastIndexOf('/');
1989 1991 int cn = child.length();
1990 1992 String path = "";
1991 1993
1992 1994 if (cn == 0) {
1993 1995 // 5.2 (6a)
1994 1996 if (i >= 0)
1995 1997 path = base.substring(0, i + 1);
1996 1998 } else {
1997 1999 StringBuffer sb = new StringBuffer(base.length() + cn);
1998 2000 // 5.2 (6a)
1999 2001 if (i >= 0)
2000 2002 sb.append(base.substring(0, i + 1));
2001 2003 // 5.2 (6b)
2002 2004 sb.append(child);
2003 2005 path = sb.toString();
2004 2006 }
2005 2007
2006 2008 // 5.2 (6c-f)
2007 2009 String np = normalize(path);
2008 2010
2009 2011 // 5.2 (6g): If the result is absolute but the path begins with "../",
2010 2012 // then we simply leave the path as-is
2011 2013
2012 2014 return np;
2013 2015 }
2014 2016
2015 2017 // RFC2396 5.2
2016 2018 private static URI resolve(URI base, URI child) {
2017 2019 // check if child if opaque first so that NPE is thrown
2018 2020 // if child is null.
2019 2021 if (child.isOpaque() || base.isOpaque())
2020 2022 return child;
2021 2023
2022 2024 // 5.2 (2): Reference to current document (lone fragment)
2023 2025 if ((child.scheme == null) && (child.authority == null)
2024 2026 && child.path.equals("") && (child.fragment != null)
2025 2027 && (child.query == null)) {
2026 2028 if ((base.fragment != null)
2027 2029 && child.fragment.equals(base.fragment)) {
2028 2030 return base;
2029 2031 }
2030 2032 URI ru = new URI();
2031 2033 ru.scheme = base.scheme;
2032 2034 ru.authority = base.authority;
2033 2035 ru.userInfo = base.userInfo;
2034 2036 ru.host = base.host;
2035 2037 ru.port = base.port;
2036 2038 ru.path = base.path;
2037 2039 ru.fragment = child.fragment;
2038 2040 ru.query = base.query;
2039 2041 return ru;
2040 2042 }
2041 2043
2042 2044 // 5.2 (3): Child is absolute
2043 2045 if (child.scheme != null)
2044 2046 return child;
2045 2047
2046 2048 URI ru = new URI(); // Resolved URI
2047 2049 ru.scheme = base.scheme;
2048 2050 ru.query = child.query;
2049 2051 ru.fragment = child.fragment;
2050 2052
2051 2053 // 5.2 (4): Authority
2052 2054 if (child.authority == null) {
2053 2055 ru.authority = base.authority;
2054 2056 ru.host = base.host;
2055 2057 ru.userInfo = base.userInfo;
2056 2058 ru.port = base.port;
2057 2059
2058 2060 String cp = (child.path == null) ? "" : child.path;
2059 2061 if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
2060 2062 // 5.2 (5): Child path is absolute
2061 2063 ru.path = child.path;
2062 2064 } else {
2063 2065 // 5.2 (6): Resolve relative path
2064 2066 ru.path = resolvePath(base.path, cp, base.isAbsolute());
2065 2067 }
2066 2068 } else {
2067 2069 ru.authority = child.authority;
2068 2070 ru.host = child.host;
2069 2071 ru.userInfo = child.userInfo;
2070 2072 ru.host = child.host;
2071 2073 ru.port = child.port;
2072 2074 ru.path = child.path;
2073 2075 }
2074 2076
2075 2077 // 5.2 (7): Recombine (nothing to do here)
2076 2078 return ru;
2077 2079 }
2078 2080
2079 2081 // If the given URI's path is normal then return the URI;
2080 2082 // o.w., return a new URI containing the normalized path.
2081 2083 //
2082 2084 private static URI normalize(URI u) {
2083 2085 if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
2084 2086 return u;
2085 2087
2086 2088 String np = normalize(u.path);
2087 2089 if (np == u.path)
2088 2090 return u;
2089 2091
2090 2092 URI v = new URI();
2091 2093 v.scheme = u.scheme;
2092 2094 v.fragment = u.fragment;
2093 2095 v.authority = u.authority;
2094 2096 v.userInfo = u.userInfo;
2095 2097 v.host = u.host;
2096 2098 v.port = u.port;
2097 2099 v.path = np;
2098 2100 v.query = u.query;
2099 2101 return v;
2100 2102 }
2101 2103
2102 2104 // If both URIs are hierarchical, their scheme and authority components are
2103 2105 // identical, and the base path is a prefix of the child's path, then
2104 2106 // return a relative URI that, when resolved against the base, yields the
2105 2107 // child; otherwise, return the child.
2106 2108 //
2107 2109 private static URI relativize(URI base, URI child) {
2108 2110 // check if child if opaque first so that NPE is thrown
2109 2111 // if child is null.
2110 2112 if (child.isOpaque() || base.isOpaque())
2111 2113 return child;
2112 2114 if (!equalIgnoringCase(base.scheme, child.scheme)
2113 2115 || !equal(base.authority, child.authority))
2114 2116 return child;
2115 2117
2116 2118 String bp = normalize(base.path);
2117 2119 String cp = normalize(child.path);
2118 2120 if (!bp.equals(cp)) {
2119 2121 if (!bp.endsWith("/"))
2120 2122 bp = bp + "/";
2121 2123 if (!cp.startsWith(bp))
2122 2124 return child;
2123 2125 }
2124 2126
2125 2127 URI v = new URI();
2126 2128 v.path = cp.substring(bp.length());
2127 2129 v.query = child.query;
2128 2130 v.fragment = child.fragment;
2129 2131 return v;
2130 2132 }
2131 2133
2132 2134
2133 2135
2134 2136 // -- Path normalization --
2135 2137
2136 2138 // The following algorithm for path normalization avoids the creation of a
2137 2139 // string object for each segment, as well as the use of a string buffer to
2138 2140 // compute the final result, by using a single char array and editing it in
2139 2141 // place. The array is first split into segments, replacing each slash
2140 2142 // with '\0' and creating a segment-index array, each element of which is
2141 2143 // the index of the first char in the corresponding segment. We then walk
2142 2144 // through both arrays, removing ".", "..", and other segments as necessary
2143 2145 // by setting their entries in the index array to -1. Finally, the two
2144 2146 // arrays are used to rejoin the segments and compute the final result.
2145 2147 //
2146 2148 // This code is based upon src/solaris/native/java/io/canonicalize_md.c
2147 2149
2148 2150
2149 2151 // Check the given path to see if it might need normalization. A path
2150 2152 // might need normalization if it contains duplicate slashes, a "."
2151 2153 // segment, or a ".." segment. Return -1 if no further normalization is
2152 2154 // possible, otherwise return the number of segments found.
2153 2155 //
2154 2156 // This method takes a string argument rather than a char array so that
2155 2157 // this test can be performed without invoking path.toCharArray().
2156 2158 //
2157 2159 static private int needsNormalization(String path) {
2158 2160 boolean normal = true;
2159 2161 int ns = 0; // Number of segments
2160 2162 int end = path.length() - 1; // Index of last char in path
2161 2163 int p = 0; // Index of next char in path
2162 2164
2163 2165 // Skip initial slashes
2164 2166 while (p <= end) {
2165 2167 if (path.charAt(p) != '/') break;
2166 2168 p++;
2167 2169 }
2168 2170 if (p > 1) normal = false;
2169 2171
2170 2172 // Scan segments
2171 2173 while (p <= end) {
2172 2174
2173 2175 // Looking at "." or ".." ?
2174 2176 if ((path.charAt(p) == '.')
2175 2177 && ((p == end)
2176 2178 || ((path.charAt(p + 1) == '/')
2177 2179 || ((path.charAt(p + 1) == '.')
2178 2180 && ((p + 1 == end)
2179 2181 || (path.charAt(p + 2) == '/')))))) {
2180 2182 normal = false;
2181 2183 }
2182 2184 ns++;
2183 2185
2184 2186 // Find beginning of next segment
2185 2187 while (p <= end) {
2186 2188 if (path.charAt(p++) != '/')
2187 2189 continue;
2188 2190
2189 2191 // Skip redundant slashes
2190 2192 while (p <= end) {
2191 2193 if (path.charAt(p) != '/') break;
2192 2194 normal = false;
2193 2195 p++;
2194 2196 }
2195 2197
2196 2198 break;
2197 2199 }
2198 2200 }
2199 2201
2200 2202 return normal ? -1 : ns;
2201 2203 }
2202 2204
2203 2205
2204 2206 // Split the given path into segments, replacing slashes with nulls and
2205 2207 // filling in the given segment-index array.
2206 2208 //
2207 2209 // Preconditions:
2208 2210 // segs.length == Number of segments in path
2209 2211 //
2210 2212 // Postconditions:
2211 2213 // All slashes in path replaced by '\0'
2212 2214 // segs[i] == Index of first char in segment i (0 <= i < segs.length)
2213 2215 //
2214 2216 static private void split(char[] path, int[] segs) {
2215 2217 int end = path.length - 1; // Index of last char in path
2216 2218 int p = 0; // Index of next char in path
2217 2219 int i = 0; // Index of current segment
2218 2220
2219 2221 // Skip initial slashes
2220 2222 while (p <= end) {
2221 2223 if (path[p] != '/') break;
2222 2224 path[p] = '\0';
2223 2225 p++;
2224 2226 }
2225 2227
2226 2228 while (p <= end) {
2227 2229
2228 2230 // Note start of segment
2229 2231 segs[i++] = p++;
2230 2232
2231 2233 // Find beginning of next segment
2232 2234 while (p <= end) {
2233 2235 if (path[p++] != '/')
2234 2236 continue;
2235 2237 path[p - 1] = '\0';
2236 2238
2237 2239 // Skip redundant slashes
2238 2240 while (p <= end) {
2239 2241 if (path[p] != '/') break;
2240 2242 path[p++] = '\0';
2241 2243 }
2242 2244 break;
2243 2245 }
2244 2246 }
2245 2247
2246 2248 if (i != segs.length)
2247 2249 throw new InternalError(); // ASSERT
2248 2250 }
2249 2251
2250 2252
2251 2253 // Join the segments in the given path according to the given segment-index
2252 2254 // array, ignoring those segments whose index entries have been set to -1,
2253 2255 // and inserting slashes as needed. Return the length of the resulting
2254 2256 // path.
2255 2257 //
2256 2258 // Preconditions:
2257 2259 // segs[i] == -1 implies segment i is to be ignored
2258 2260 // path computed by split, as above, with '\0' having replaced '/'
2259 2261 //
2260 2262 // Postconditions:
2261 2263 // path[0] .. path[return value] == Resulting path
2262 2264 //
2263 2265 static private int join(char[] path, int[] segs) {
2264 2266 int ns = segs.length; // Number of segments
2265 2267 int end = path.length - 1; // Index of last char in path
2266 2268 int p = 0; // Index of next path char to write
2267 2269
2268 2270 if (path[p] == '\0') {
2269 2271 // Restore initial slash for absolute paths
2270 2272 path[p++] = '/';
2271 2273 }
2272 2274
2273 2275 for (int i = 0; i < ns; i++) {
2274 2276 int q = segs[i]; // Current segment
2275 2277 if (q == -1)
2276 2278 // Ignore this segment
2277 2279 continue;
2278 2280
2279 2281 if (p == q) {
2280 2282 // We're already at this segment, so just skip to its end
2281 2283 while ((p <= end) && (path[p] != '\0'))
2282 2284 p++;
2283 2285 if (p <= end) {
2284 2286 // Preserve trailing slash
2285 2287 path[p++] = '/';
2286 2288 }
2287 2289 } else if (p < q) {
2288 2290 // Copy q down to p
2289 2291 while ((q <= end) && (path[q] != '\0'))
2290 2292 path[p++] = path[q++];
2291 2293 if (q <= end) {
2292 2294 // Preserve trailing slash
2293 2295 path[p++] = '/';
2294 2296 }
2295 2297 } else
2296 2298 throw new InternalError(); // ASSERT false
2297 2299 }
2298 2300
2299 2301 return p;
2300 2302 }
2301 2303
2302 2304
2303 2305 // Remove "." segments from the given path, and remove segment pairs
2304 2306 // consisting of a non-".." segment followed by a ".." segment.
2305 2307 //
2306 2308 private static void removeDots(char[] path, int[] segs) {
// path is only read here; removal is recorded by storing -1 in segs.
2307 2309 int ns = segs.length;
2308 2310 int end = path.length - 1;
2309 2311
// NOTE: the outer for loop and the inner do-while share the index i.
2310 2312 for (int i = 0; i < ns; i++) {
2311 2313 int dots = 0; // Number of dots found (0, 1, or 2)
2312 2314
2313 2315 // Find next occurrence of "." or ".."
// A segment counts as "." or ".." only if the dots are immediately
// followed by a '\0' separator or by the end of the array.
2314 2316 do {
2315 2317 int p = segs[i];
2316 2318 if (path[p] == '.') {
2317 2319 if (p == end) {
2318 2320 dots = 1;
2319 2321 break;
2320 2322 } else if (path[p + 1] == '\0') {
2321 2323 dots = 1;
2322 2324 break;
2323 2325 } else if ((path[p + 1] == '.')
2324 2326 && ((p + 1 == end)
2325 2327 || (path[p + 2] == '\0'))) {
2326 2328 dots = 2;
2327 2329 break;
2328 2330 }
2329 2331 }
2330 2332 i++;
2331 2333 } while (i < ns);
// dots == 0 means the scan ran off the end without finding a dot segment,
// so there is nothing left to remove.
2332 2334 if ((i > ns) || (dots == 0))
2333 2335 break;
2334 2336
2335 2337 if (dots == 1) {
2336 2338 // Remove this occurrence of "."
2337 2339 segs[i] = -1;
2338 2340 } else {
2339 2341 // If there is a preceding non-".." segment, remove both that
2340 2342 // segment and this occurrence of ".."; otherwise, leave this
2341 2343 // ".." segment as-is.
2342 2344 int j;
// Walk backwards past segments that have already been removed.
2343 2345 for (j = i - 1; j >= 0; j--) {
2344 2346 if (segs[j] != -1) break;
2345 2347 }
2346 2348 if (j >= 0) {
2347 2349 int q = segs[j];
// Segment j precedes segment i, so it is terminated by a '\0' within
// the array; the && chain short-circuits before reading past that.
2348 2350 if (!((path[q] == '.')
2349 2351 && (path[q + 1] == '.')
2350 2352 && (path[q + 2] == '\0'))) {
2351 2353 segs[i] = -1;
2352 2354 segs[j] = -1;
2353 2355 }
2354 2356 }
2355 2357 }
2356 2358 }
2357 2359 }
2358 2360
2359 2361
2360 2362 // DEVIATION: If the normalized path is relative, and if the first
2361 2363 // segment could be parsed as a scheme name, then prepend a "." segment
2362 2364 //
2363 2365 private static void maybeAddLeadingDot(char[] path, int[] segs) {
2364 2366
2365 2367 if (path[0] == '\0')
2366 2368 // The path is absolute
2367 2369 return;
2368 2370
2369 2371 int ns = segs.length;
2370 2372 int f = 0; // Index of first segment
2371 2373 while (f < ns) {
2372 2374 if (segs[f] >= 0)
2373 2375 break;
2374 2376 f++;
2375 2377 }
2376 2378 if ((f >= ns) || (f == 0))
2377 2379 // The path is empty, or else the original first segment survived,
2378 2380 // in which case we already know that no leading "." is needed
2379 2381 return;
2380 2382
2381 2383 int p = segs[f];
2382 2384 while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
2383 2385 if (p >= path.length || path[p] == '\0')
2384 2386 // No colon in first segment, so no "." needed
2385 2387 return;
2386 2388
2387 2389 // At this point we know that the first segment is unused,
2388 2390 // hence we can insert a "." segment at that position
2389 2391 path[0] = '.';
2390 2392 path[1] = '\0';
2391 2393 segs[0] = 0;
2392 2394 }
2393 2395
2394 2396
2395 2397 // Normalize the given path string. A normal path string has no empty
2396 2398 // segments (i.e., occurrences of "//"), no segments equal to ".", and no
2397 2399 // segments equal to ".." that are preceded by a segment not equal to "..".
2398 2400 // In contrast to Unix-style pathname normalization, for URI paths we
2399 2401 // always retain trailing slashes.
2400 2402 //
2401 2403 private static String normalize(String ps) {
2402 2404
2403 2405 // Does this path need normalization?
2404 2406 int ns = needsNormalization(ps); // Number of segments
2405 2407 if (ns < 0)
2406 2408 // Nope -- just return it
2407 2409 return ps;
2408 2410
2409 2411 char[] path = ps.toCharArray(); // Path in char-array form
2410 2412
2411 2413 // Split path into segments
2412 2414 int[] segs = new int[ns]; // Segment-index array
2413 2415 split(path, segs);
2414 2416
2415 2417 // Remove dots
2416 2418 removeDots(path, segs);
2417 2419
2418 2420 // Prevent scheme-name confusion
2419 2421 maybeAddLeadingDot(path, segs);
2420 2422
2421 2423 // Join the remaining segments and return the result
2422 2424 String s = new String(path, 0, join(path, segs));
2423 2425 if (s.equals(ps)) {
2424 2426 // string was already normalized
2425 2427 return ps;
2426 2428 }
2427 2429 return s;
2428 2430 }
2429 2431
2430 2432
2431 2433
2432 2434 // -- Character classes for parsing --
2433 2435
2434 2436 // RFC2396 precisely specifies which characters in the US-ASCII charset are
2435 2437 // permissible in the various components of a URI reference. We here
2436 2438 // define a set of mask pairs to aid in enforcing these restrictions. Each
2437 2439 // mask pair consists of two longs, a low mask and a high mask. Taken
2438 2440 // together they represent a 128-bit mask, where bit i is set iff the
2439 2441 // character with value i is permitted.
2440 2442 //
2441 2443 // This approach is more efficient than sequentially searching arrays of
2442 2444 // permitted characters. It could be made still more efficient by
2443 2445 // precompiling the mask information so that a character's presence in a
2444 2446 // given mask could be determined by a single table lookup.
2445 2447
2446 2448 // Compute the low-order mask for the characters in the given string
2447 2449 private static long lowMask(String chars) {
2448 2450 int n = chars.length();
2449 2451 long m = 0;
2450 2452 for (int i = 0; i < n; i++) {
2451 2453 char c = chars.charAt(i);
2452 2454 if (c < 64)
2453 2455 m |= (1L << c);
2454 2456 }
2455 2457 return m;
2456 2458 }
2457 2459
2458 2460 // Compute the high-order mask for the characters in the given string
2459 2461 private static long highMask(String chars) {
2460 2462 int n = chars.length();
2461 2463 long m = 0;
2462 2464 for (int i = 0; i < n; i++) {
2463 2465 char c = chars.charAt(i);
2464 2466 if ((c >= 64) && (c < 128))
2465 2467 m |= (1L << (c - 64));
2466 2468 }
2467 2469 return m;
2468 2470 }
2469 2471
2470 2472 // Compute a low-order mask for the characters
2471 2473 // between first and last, inclusive
2472 2474 private static long lowMask(char first, char last) {
2473 2475 long m = 0;
2474 2476 int f = Math.max(Math.min(first, 63), 0);
2475 2477 int l = Math.max(Math.min(last, 63), 0);
2476 2478 for (int i = f; i <= l; i++)
2477 2479 m |= 1L << i;
2478 2480 return m;
2479 2481 }
2480 2482
2481 2483 // Compute a high-order mask for the characters
2482 2484 // between first and last, inclusive
2483 2485 private static long highMask(char first, char last) {
2484 2486 long m = 0;
2485 2487 int f = Math.max(Math.min(first, 127), 64) - 64;
2486 2488 int l = Math.max(Math.min(last, 127), 64) - 64;
2487 2489 for (int i = f; i <= l; i++)
2488 2490 m |= 1L << i;
2489 2491 return m;
2490 2492 }
2491 2493
2492 2494 // Tell whether the given character is permitted by the given mask pair
2493 2495 private static boolean match(char c, long lowMask, long highMask) {
2494 2496 if (c == 0) // 0 doesn't have a slot in the mask. So, it never matches.
2495 2497 return false;
2496 2498 if (c < 64)
2497 2499 return ((1L << c) & lowMask) != 0;
2498 2500 if (c < 128)
2499 2501 return ((1L << (c - 64)) & highMask) != 0;
2500 2502 return false;
2501 2503 }
2502 2504
2503 2505 // Character-class masks, in reverse order from RFC2396 because
2504 2506 // initializers for static fields cannot make forward references.
2505 2507
2506 2508 // digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
2507 2509 // "8" | "9"
// Naming convention: each character class is a pair of longs. L_xxx covers
// chars 0..63 and H_xxx covers chars 64..127; bit (c mod 64) stands for
// char c. A class with no members in one half uses 0L for that half.
2508 2510 private static final long L_DIGIT = lowMask('0', '9');
2509 2511 private static final long H_DIGIT = 0L;
2510 2512
2511 2513 // upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
2512 2514 // "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
2513 2515 // "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
2514 2516 private static final long L_UPALPHA = 0L;
2515 2517 private static final long H_UPALPHA = highMask('A', 'Z');
2516 2518
2517 2519 // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
2518 2520 // "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
2519 2521 // "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
2520 2522 private static final long L_LOWALPHA = 0L;
2521 2523 private static final long H_LOWALPHA = highMask('a', 'z');
2522 2524
2523 2525 // alpha = lowalpha | upalpha
2524 2526 private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
2525 2527 private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
2526 2528
2527 2529 // alphanum = alpha | digit
2528 2530 private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
2529 2531 private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
2530 2532
2531 2533 // hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
2532 2534 // "a" | "b" | "c" | "d" | "e" | "f"
2533 2535 private static final long L_HEX = L_DIGIT;
2534 2536 private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');
2535 2537
2536 2538 // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
2537 2539 // "(" | ")"
2538 2540 private static final long L_MARK = lowMask("-_.!~*'()");
2539 2541 private static final long H_MARK = highMask("-_.!~*'()");
2540 2542
2541 2543 // unreserved = alphanum | mark
2542 2544 private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
2543 2545 private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
2544 2546
2545 2547 // reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
2546 2548 // "$" | "," | "[" | "]"
2547 2549 // Added per RFC2732: "[", "]"
2548 2550 private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
2549 2551 private static final long H_RESERVED = highMask(";/?:@&=+$,[]");
2550 2552
2551 2553 // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
2552 2554 // characters are allowed; this is handled by the scanEscape method below.
// Bit 0 is free for this purpose because match() never reports a match
// for the char with value 0.
2553 2555 private static final long L_ESCAPED = 1L;
2554 2556 private static final long H_ESCAPED = 0L;
2555 2557
2556 2558 // uric = reserved | unreserved | escaped
2557 2559 private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
2558 2560 private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
2559 2561
2560 2562 // pchar = unreserved | escaped |
2561 2563 // ":" | "@" | "&" | "=" | "+" | "$" | ","
2562 2564 private static final long L_PCHAR
2563 2565 = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
2564 2566 private static final long H_PCHAR
2565 2567 = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");
2566 2568
2567 2569 // All valid path characters
2568 2570 private static final long L_PATH = L_PCHAR | lowMask(";/");
2569 2571 private static final long H_PATH = H_PCHAR | highMask(";/");
2570 2572
2571 2573 // Dash, for use in domainlabel and toplabel
2572 2574 private static final long L_DASH = lowMask("-");
2573 2575 private static final long H_DASH = highMask("-");
2574 2576
2575 2577 // Dot, for use in hostnames
2576 2578 private static final long L_DOT = lowMask(".");
2577 2579 private static final long H_DOT = highMask(".");
2578 2580
2579 2581 // userinfo = *( unreserved | escaped |
2580 2582 // ";" | ":" | "&" | "=" | "+" | "$" | "," )
2581 2583 private static final long L_USERINFO
2582 2584 = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
2583 2585 private static final long H_USERINFO
2584 2586 = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");
2585 2587
2586 2588 // reg_name = 1*( unreserved | escaped | "$" | "," |
2587 2589 // ";" | ":" | "@" | "&" | "=" | "+" )
2588 2590 private static final long L_REG_NAME
2589 2591 = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
2590 2592 private static final long H_REG_NAME
2591 2593 = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");
2592 2594
2593 2595 // All valid characters for server-based authorities
// Because this includes L_USERINFO, the L_ESCAPED flag is set here too.
2594 2596 private static final long L_SERVER
2595 2597 = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
2596 2598 private static final long H_SERVER
2597 2599 = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");
2598 2600
2599 2601 // Special case of server authority that represents an IPv6 address
2600 2602 // In this case, a % does not signify an escape sequence
2601 2603 private static final long L_SERVER_PERCENT
2602 2604 = L_SERVER | lowMask("%");
2603 2605 private static final long H_SERVER_PERCENT
2604 2606 = H_SERVER | highMask("%");
2605 2607 private static final long L_LEFT_BRACKET = lowMask("[");
2606 2608 private static final long H_LEFT_BRACKET = highMask("[");
2607 2609
2608 2610 // scheme = alpha *( alpha | digit | "+" | "-" | "." )
// This mask covers the chars after the first; the leading alpha has to be
// checked separately against L_ALPHA/H_ALPHA.
2609 2611 private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
2610 2612 private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");
2611 2613
2612 2614 // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
2613 2615 // "&" | "=" | "+" | "$" | ","
2614 2616 private static final long L_URIC_NO_SLASH
2615 2617 = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
2616 2618 private static final long H_URIC_NO_SLASH
2617 2619 = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");
2618 2620
2619 2621
2620 2622 // -- Escaping and encoding --
2621 2623
2622 2624 private final static char[] hexDigits = {
2623 2625 '0', '1', '2', '3', '4', '5', '6', '7',
2624 2626 '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
2625 2627 };
2626 2628
2627 2629 private static void appendEscape(StringBuffer sb, byte b) {
2628 2630 sb.append('%');
2629 2631 sb.append(hexDigits[(b >> 4) & 0x0f]);
2630 2632 sb.append(hexDigits[(b >> 0) & 0x0f]);
2631 2633 }
2632 2634
2633 2635 private static void appendEncoded(StringBuffer sb, char c) {
2634 2636 ByteBuffer bb = null;
2635 2637 try {
2636 2638 bb = ThreadLocalCoders.encoderFor("UTF-8")
2637 2639 .encode(CharBuffer.wrap("" + c));
2638 2640 } catch (CharacterCodingException x) {
2639 2641 assert false;
2640 2642 }
2641 2643 while (bb.hasRemaining()) {
2642 2644 int b = bb.get() & 0xff;
2643 2645 if (b >= 0x80)
2644 2646 appendEscape(sb, (byte)b);
2645 2647 else
2646 2648 sb.append((char)b);
2647 2649 }
2648 2650 }
2649 2651
2650 2652 // Quote any characters in s that are not permitted
2651 2653 // by the given mask pair
2652 2654 //
2653 2655 private static String quote(String s, long lowMask, long highMask) {
2654 2656 int n = s.length();
2655 2657 StringBuffer sb = null;
2656 2658 boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
2657 2659 for (int i = 0; i < s.length(); i++) {
2658 2660 char c = s.charAt(i);
2659 2661 if (c < '\u0080') {
2660 2662 if (!match(c, lowMask, highMask)) {
2661 2663 if (sb == null) {
2662 2664 sb = new StringBuffer();
2663 2665 sb.append(s.substring(0, i));
2664 2666 }
2665 2667 appendEscape(sb, (byte)c);
2666 2668 } else {
2667 2669 if (sb != null)
2668 2670 sb.append(c);
2669 2671 }
2670 2672 } else if (allowNonASCII
2671 2673 && (Character.isSpaceChar(c)
2672 2674 || Character.isISOControl(c))) {
2673 2675 if (sb == null) {
2674 2676 sb = new StringBuffer();
2675 2677 sb.append(s.substring(0, i));
2676 2678 }
2677 2679 appendEncoded(sb, c);
2678 2680 } else {
2679 2681 if (sb != null)
2680 2682 sb.append(c);
2681 2683 }
2682 2684 }
2683 2685 return (sb == null) ? s : sb.toString();
2684 2686 }
2685 2687
2686 2688 // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
2687 2689 // assuming that s is otherwise legal
2688 2690 //
2689 2691 private static String encode(String s) {
2690 2692 int n = s.length();
2691 2693 if (n == 0)
2692 2694 return s;
2693 2695
2694 2696 // First check whether we actually need to encode
2695 2697 for (int i = 0;;) {
2696 2698 if (s.charAt(i) >= '\u0080')
2697 2699 break;
2698 2700 if (++i >= n)
2699 2701 return s;
2700 2702 }
2701 2703
2702 2704 String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
2703 2705 ByteBuffer bb = null;
2704 2706 try {
2705 2707 bb = ThreadLocalCoders.encoderFor("UTF-8")
2706 2708 .encode(CharBuffer.wrap(ns));
2707 2709 } catch (CharacterCodingException x) {
2708 2710 assert false;
2709 2711 }
2710 2712
2711 2713 StringBuffer sb = new StringBuffer();
2712 2714 while (bb.hasRemaining()) {
2713 2715 int b = bb.get() & 0xff;
2714 2716 if (b >= 0x80)
2715 2717 appendEscape(sb, (byte)b);
2716 2718 else
2717 2719 sb.append((char)b);
2718 2720 }
2719 2721 return sb.toString();
2720 2722 }
2721 2723
2722 2724 private static int decode(char c) {
2723 2725 if ((c >= '0') && (c <= '9'))
2724 2726 return c - '0';
2725 2727 if ((c >= 'a') && (c <= 'f'))
2726 2728 return c - 'a' + 10;
2727 2729 if ((c >= 'A') && (c <= 'F'))
2728 2730 return c - 'A' + 10;
2729 2731 assert false;
2730 2732 return -1;
2731 2733 }
2732 2734
2733 2735 private static byte decode(char c1, char c2) {
2734 2736 return (byte)( ((decode(c1) & 0xf) << 4)
2735 2737 | ((decode(c2) & 0xf) << 0));
2736 2738 }
2737 2739
2738 2740 // Evaluates all escapes in s, applying UTF-8 decoding if needed. Assumes
2739 2741 // that escapes are well-formed syntactically, i.e., of the form %XX. If a
2740 2742 // sequence of escaped octets is not valid UTF-8 then the erroneous octets
2741 2743 // are replaced with '\uFFFD'.
2742 2744 // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
2743 2745 // with a scope_id
2744 2746 //
2745 2747 private static String decode(String s) {
// Null, empty, and escape-free strings are returned unchanged.
2746 2748 if (s == null)
2747 2749 return s;
2748 2750 int n = s.length();
2749 2751 if (n == 0)
2750 2752 return s;
2751 2753 if (s.indexOf('%') < 0)
2752 2754 return s;
2753 2755
// Every buffer is sized n: the decoded output cannot be longer than the
// input, since each %XX triplet contributes a single octet.
2754 2756 StringBuffer sb = new StringBuffer(n);
2755 2757 ByteBuffer bb = ByteBuffer.allocate(n);
2756 2758 CharBuffer cb = CharBuffer.allocate(n);
// REPLACE makes the decoder substitute its replacement for bad octets
// instead of reporting an error.
2757 2759 CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
2758 2760 .onMalformedInput(CodingErrorAction.REPLACE)
2759 2761 .onUnmappableCharacter(CodingErrorAction.REPLACE);
2760 2762
2761 2763 // This is not horribly efficient, but it will do for now
2762 2764 char c = s.charAt(0);
2763 2765 boolean betweenBrackets = false;
2764 2766
2765 2767 for (int i = 0; i < n;) {
2766 2768 assert c == s.charAt(i); // Loop invariant
// NOTE(review): bracket tracking applies to any '[' anywhere in s, and an
// unmatched '[' leaves every later '%' undecoded. Confirm that callers only
// rely on this for components that can hold an IPv6 literal.
2767 2769 if (c == '[') {
2768 2770 betweenBrackets = true;
2769 2771 } else if (betweenBrackets && c == ']') {
2770 2772 betweenBrackets = false;
2771 2773 }
// Ordinary char, or a '%' inside brackets: copy it through verbatim.
2772 2774 if (c != '%' || betweenBrackets) {
2773 2775 sb.append(c);
2774 2776 if (++i >= n)
2775 2777 break;
2776 2778 c = s.charAt(i);
2777 2779 continue;
2778 2780 }
// Start of a run of one or more consecutive %XX escapes. The whole run is
// gathered into bb and decoded in one go, so that a multi-octet UTF-8
// sequence is seen as a unit.
2779 2781 bb.clear();
2780 2782 int ui = i;
2781 2783 for (;;) {
2782 2784 assert (n - i >= 2);
// The two ++i step over '%' to the first hex digit, then to the second.
2783 2785 bb.put(decode(s.charAt(++i), s.charAt(++i)));
2784 2786 if (++i >= n)
2785 2787 break;
2786 2788 c = s.charAt(i);
2787 2789 if (c != '%')
2788 2790 break;
2789 2791 }
2790 2792 bb.flip();
2791 2793 cb.clear();
2792 2794 dec.reset();
2793 2795 CoderResult cr = dec.decode(bb, cb, true);
2794 2796 assert cr.isUnderflow();
2795 2797 cr = dec.flush(cb);
2796 2798 assert cr.isUnderflow();
2797 2799 sb.append(cb.flip().toString());
2798 2800 }
2799 2801
2800 2802 return sb.toString();
2801 2803 }
2802 2804
2803 2805
2804 2806 // -- Parsing --
2805 2807
2806 2808 // For convenience we wrap the input URI string in a new instance of the
2807 2809 // following internal class. This saves always having to pass the input
2808 2810 // string as an argument to each internal scan/parse method.
2809 2811
2810 2812 private class Parser {
2811 2813
2812 2814 private String input; // URI input string
2813 2815 private boolean requireServerAuthority = false;
2814 2816
2815 2817 Parser(String s) {
2816 2818 input = s;
2817 2819 string = s;
2818 2820 }
2819 2821
2820 2822 // -- Methods for throwing URISyntaxException in various ways --
2821 2823
2822 2824 private void fail(String reason) throws URISyntaxException {
2823 2825 throw new URISyntaxException(input, reason);
2824 2826 }
2825 2827
2826 2828 private void fail(String reason, int p) throws URISyntaxException {
2827 2829 throw new URISyntaxException(input, reason, p);
2828 2830 }
2829 2831
2830 2832 private void failExpecting(String expected, int p)
2831 2833 throws URISyntaxException
2832 2834 {
2833 2835 fail("Expected " + expected, p);
2834 2836 }
2835 2837
2836 2838 private void failExpecting(String expected, String prior, int p)
2837 2839 throws URISyntaxException
2838 2840 {
2839 2841 fail("Expected " + expected + " following " + prior, p);
2840 2842 }
2841 2843
2842 2844
2843 2845 // -- Simple access to the input string --
2844 2846
2845 2847 // Return a substring of the input string
2846 2848 //
2847 2849 private String substring(int start, int end) {
2848 2850 return input.substring(start, end);
2849 2851 }
2850 2852
2851 2853 // Return the char at position p,
2852 2854 // assuming that p < input.length()
2853 2855 //
2854 2856 private char charAt(int p) {
2855 2857 return input.charAt(p);
2856 2858 }
2857 2859
2858 2860 // Tells whether start < end and, if so, whether charAt(start) == c
2859 2861 //
2860 2862 private boolean at(int start, int end, char c) {
2861 2863 return (start < end) && (charAt(start) == c);
2862 2864 }
2863 2865
2864 2866 // Tells whether start + s.length() <= end and, if so,
2865 2867 // whether the chars at the start position match s exactly
2866 2868 //
2867 2869 private boolean at(int start, int end, String s) {
2868 2870 int p = start;
2869 2871 int sn = s.length();
2870 2872 if (sn > end - p)
2871 2873 return false;
2872 2874 int i = 0;
2873 2875 while (i < sn) {
2874 2876 if (charAt(p++) != s.charAt(i)) {
2875 2877 break;
2876 2878 }
2877 2879 i++;
2878 2880 }
2879 2881 return (i == sn);
2880 2882 }
2881 2883
2882 2884
2883 2885 // -- Scanning --
2884 2886
2885 2887 // The various scan and parse methods that follow use a uniform
2886 2888 // convention of taking the current start position and end index as
2887 2889 // their first two arguments. The start is inclusive while the end is
2888 2890 // exclusive, just as in the String class, i.e., a start/end pair
2889 2891 // denotes the half-open interval [start, end) of the input string.
2890 2892 //
2891 2893 // These methods never proceed past the end position. They may return
2892 2894 // -1 to indicate outright failure, but more often they simply return
2893 2895 // the position of the first char after the last char scanned. Thus
2894 2896 // a typical idiom is
2895 2897 //
2896 2898 // int p = start;
2897 2899 // int q = scan(p, end, ...);
2898 2900 // if (q > p)
2899 2901 // // We scanned something
2900 2902 // ...;
2901 2903 // else if (q == p)
2902 2904 // // We scanned nothing
2903 2905 // ...;
2904 2906 // else if (q == -1)
2905 2907 // // Something went wrong
2906 2908 // ...;
2907 2909
2908 2910
2909 2911 // Scan a specific char: If the char at the given start position is
2910 2912 // equal to c, return the index of the next char; otherwise, return the
2911 2913 // start position.
2912 2914 //
2913 2915 private int scan(int start, int end, char c) {
2914 2916 if ((start < end) && (charAt(start) == c))
2915 2917 return start + 1;
2916 2918 return start;
2917 2919 }
2918 2920
2919 2921 // Scan forward from the given start position. Stop at the first char
2920 2922 // in the err string (in which case -1 is returned), or the first char
2921 2923 // in the stop string (in which case the index of that char is
2922 2924 // returned), or the given end position (in which case the end
2923 2925 // position is returned). May return the start position if
2924 2926 // nothing matches.
2925 2927 //
2926 2928 private int scan(int start, int end, String err, String stop) {
2927 2929 int p = start;
2928 2930 while (p < end) {
2929 2931 char c = charAt(p);
2930 2932 if (err.indexOf(c) >= 0)
2931 2933 return -1;
2932 2934 if (stop.indexOf(c) >= 0)
2933 2935 break;
2934 2936 p++;
2935 2937 }
2936 2938 return p;
2937 2939 }
2938 2940
2939 2941 // Scan a potential escape sequence, starting at the given position,
2940 2942 // with the given first char (i.e., charAt(start) == c).
2941 2943 //
2942 2944 // This method assumes that if escapes are allowed then visible
2943 2945 // non-US-ASCII chars are also allowed.
2944 2946 //
2945 2947 private int scanEscape(int start, int n, char first)
2946 2948 throws URISyntaxException
2947 2949 {
2948 2950 int p = start;
2949 2951 char c = first;
2950 2952 if (c == '%') {
2951 2953 // Process escape pair
2952 2954 if ((p + 3 <= n)
2953 2955 && match(charAt(p + 1), L_HEX, H_HEX)
2954 2956 && match(charAt(p + 2), L_HEX, H_HEX)) {
2955 2957 return p + 3;
2956 2958 }
2957 2959 fail("Malformed escape pair", p);
2958 2960 } else if ((c > 128)
2959 2961 && !Character.isSpaceChar(c)
2960 2962 && !Character.isISOControl(c)) {
2961 2963 // Allow unescaped but visible non-US-ASCII chars
2962 2964 return p + 1;
2963 2965 }
2964 2966 return p;
2965 2967 }
2966 2968
2967 2969 // Scan chars that match the given mask pair
2968 2970 //
2969 2971 private int scan(int start, int n, long lowMask, long highMask)
2970 2972 throws URISyntaxException
2971 2973 {
2972 2974 int p = start;
2973 2975 while (p < n) {
2974 2976 char c = charAt(p);
2975 2977 if (match(c, lowMask, highMask)) {
2976 2978 p++;
2977 2979 continue;
2978 2980 }
2979 2981 if ((lowMask & L_ESCAPED) != 0) {
2980 2982 int q = scanEscape(p, n, c);
2981 2983 if (q > p) {
2982 2984 p = q;
2983 2985 continue;
2984 2986 }
2985 2987 }
2986 2988 break;
2987 2989 }
2988 2990 return p;
2989 2991 }
2990 2992
2991 2993 // Check that each of the chars in [start, end) matches the given mask
2992 2994 //
2993 2995 private void checkChars(int start, int end,
2994 2996 long lowMask, long highMask,
2995 2997 String what)
2996 2998 throws URISyntaxException
2997 2999 {
2998 3000 int p = scan(start, end, lowMask, highMask);
2999 3001 if (p < end)
3000 3002 fail("Illegal character in " + what, p);
3001 3003 }
3002 3004
3003 3005 // Check that the char at position p matches the given mask
3004 3006 //
3005 3007 private void checkChar(int p,
3006 3008 long lowMask, long highMask,
3007 3009 String what)
3008 3010 throws URISyntaxException
3009 3011 {
3010 3012 checkChars(p, p + 1, lowMask, highMask, what);
3011 3013 }
3012 3014
3013 3015
3014 3016 // -- Parsing --
3015 3017
3016 3018 // [<scheme>:]<scheme-specific-part>[#<fragment>]
3017 3019 //
// Parse the whole input string, storing the components found in scheme,
// schemeSpecificPart and fragment (plus whatever parseHierarchical stores).
// rsa: whether the authority, if any, must parse as server-based.
3018 3020 void parse(boolean rsa) throws URISyntaxException {
3019 3021 requireServerAuthority = rsa;
3020 3022 int ssp; // Start of scheme-specific part
3021 3023 int n = input.length();
// A ':' that comes before any '/', '?' or '#' ends a scheme name; if one
// of those chars comes first the scan yields -1 and there is no scheme.
3022 3024 int p = scan(0, n, "/?#", ":");
3023 3025 if ((p >= 0) && at(p, n, ':')) {
3024 3026 if (p == 0)
3025 3027 failExpecting("scheme name", 0);
// First char must be alpha; the remaining chars use the wider scheme mask.
3026 3028 checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
3027 3029 checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
3028 3030 scheme = substring(0, p);
3029 3031 p++; // Skip ':'
3030 3032 ssp = p;
3031 3033 if (at(p, n, '/')) {
3032 3034 p = parseHierarchical(p, n);
3033 3035 } else {
// Opaque URI: everything up to an optional '#' is the scheme-specific part
// and must be non-empty.
3034 3036 int q = scan(p, n, "", "#");
3035 3037 if (q <= p)
3036 3038 failExpecting("scheme-specific part", p);
3037 3039 checkChars(p, q, L_URIC, H_URIC, "opaque part");
3038 3040 p = q;
3039 3041 }
3040 3042 } else {
// No scheme: the whole input is a relative, hierarchical reference.
3041 3043 ssp = 0;
3042 3044 p = parseHierarchical(0, n);
3043 3045 }
3044 3046 schemeSpecificPart = substring(ssp, p);
3045 3047 if (at(p, n, '#')) {
3046 3048 checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
3047 3049 fragment = substring(p + 1, n);
3048 3050 p = n;
3049 3051 }
// Anything still unconsumed at this point is an error.
3050 3052 if (p < n)
3051 3053 fail("end of URI", p);
3052 3054 }
3053 3055
3054 3056 // [//authority]<path>[?<query>]
3055 3057 //
3056 3058 // DEVIATION from RFC2396: We allow an empty authority component as
3057 3059 // long as it's followed by a non-empty path, query component, or
3058 3060 // fragment component. This is so that URIs such as "file:///foo/bar"
3059 3061 // will parse. This seems to be the intent of RFC2396, though the
3060 3062 // grammar does not permit it. If the authority is empty then the
3061 3063 // userInfo, host, and port components are undefined.
3062 3064 //
3063 3065 // DEVIATION from RFC2396: We allow empty relative paths. This seems
3064 3066 // to be the intent of RFC2396, but the grammar does not permit it.
3065 3067 // The primary consequence of this deviation is that "#f" parses as a
3066 3068 // relative URI with an empty path.
3067 3069 //
// Parse [start, n) as a hierarchical part, storing path and (if present)
// query; the authority, if any, is stored by parseAuthority. Returns the
// index of the first char not consumed, i.e. n or the index of a '#'.
3068 3070 private int parseHierarchical(int start, int n)
3069 3071 throws URISyntaxException
3070 3072 {
3071 3073 int p = start;
3072 3074 if (at(p, n, '/') && at(p + 1, n, '/')) {
3073 3075 p += 2;
// The authority runs up to the next '/', '?' or '#', or to the end.
3074 3076 int q = scan(p, n, "", "/?#");
3075 3077 if (q > p) {
3076 3078 p = parseAuthority(p, q);
3077 3079 } else if (q < n) {
3078 3080 // DEVIATION: Allow empty authority prior to non-empty
3079 3081 // path, query component or fragment identifier
3080 3082 } else
// "//" with nothing at all after it
3081 3083 failExpecting("authority", p);
3082 3084 }
3083 3085 int q = scan(p, n, "", "?#"); // DEVIATION: May be empty
3084 3086 checkChars(p, q, L_PATH, H_PATH, "path");
3085 3087 path = substring(p, q);
3086 3088 p = q;
3087 3089 if (at(p, n, '?')) {
3088 3090 p++;
// The query runs up to an optional '#'.
3089 3091 q = scan(p, n, "", "#");
3090 3092 checkChars(p, q, L_URIC, H_URIC, "query");
3091 3093 query = substring(p, q);
3092 3094 p = q;
3093 3095 }
3094 3096 return p;
3095 3097 }
3096 3098
3097 3099 // authority = server | reg_name
3098 3100 //
3099 3101 // Ambiguity: An authority that is a registry name rather than a server
3100 3102 // might have a prefix that parses as a server. We use the fact that
3101 3103 // the authority component is always followed by '/' or the end of the
3102 3104 // input string to resolve this: If the complete authority did not
3103 3105 // parse as a server then we try to parse it as a registry name.
3104 3106 //
// Parse [start, n) as an authority, storing it in authority. On success
// the whole interval has been consumed and n is returned.
3105 3107 private int parseAuthority(int start, int n)
3106 3108 throws URISyntaxException
3107 3109 {
3108 3110 int p = start;
3109 3111 int q = p;
3110 3112 URISyntaxException ex = null;
3111 3113
// Whether every char is acceptable for a server-based authority and for a
// registry-based one, respectively.
3112 3114 boolean serverChars;
3113 3115 boolean regChars;
3114 3116
// NOTE(review): this scan yields a value > p whenever p < n and the first
// char is not ']', whether or not a ']' occurs later. So the '%'-tolerant
// mask is used for almost every authority, not only IPv6 literals.
3115 3117 if (scan(p, n, "", "]") > p) {
3116 3118 // contains a literal IPv6 address, therefore % is allowed
3117 3119 serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
3118 3120 } else {
3119 3121 serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
3120 3122 }
3121 3123 regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
3122 3124
3123 3125 if (regChars && !serverChars) {
3124 3126 // Must be a registry-based authority
3125 3127 authority = substring(p, n);
3126 3128 return n;
3127 3129 }
3128 3130
3129 3131 if (serverChars) {
3130 3132 // Might be (probably is) a server-based authority, so attempt
3131 3133 // to parse it as such. If the attempt fails, try to treat it
3132 3134 // as a registry-based authority.
3133 3135 try {
3134 3136 q = parseServer(p, n);
3135 3137 if (q < n)
3136 3138 failExpecting("end of authority", q);
3137 3139 authority = substring(p, n);
3138 3140 } catch (URISyntaxException x) {
3139 3141 // Undo results of failed parse
3140 3142 userInfo = null;
3141 3143 host = null;
3142 3144 port = -1;
3143 3145 if (requireServerAuthority) {
3144 3146 // If we're insisting upon a server-based authority,
3145 3147 // then just re-throw the exception
3146 3148 throw x;
3147 3149 } else {
3148 3150 // Save the exception in case it doesn't parse as a
3149 3151 // registry either
3150 3152 ex = x;
// Resetting q makes the q < n test below take the fallback path.
3151 3153 q = p;
3152 3154 }
3153 3155 }
3154 3156 }
3155 3157
// q < n here means no server-based parse consumed the whole authority,
// either because it failed or because it was never attempted.
3156 3158 if (q < n) {
3157 3159 if (regChars) {
3158 3160 // Registry-based authority
3159 3161 authority = substring(p, n);
3160 3162 } else if (ex != null) {
3161 3163 // Re-throw exception; it was probably due to
3162 3164 // a malformed IPv6 address
3163 3165 throw ex;
3164 3166 } else {
3165 3167 fail("Illegal character in authority", q);
3166 3168 }
3167 3169 }
3168 3170
3169 3171 return n;
3170 3172 }
3171 3173
3172 3174
3173 3175 // [<userinfo>@]<host>[:<port>]
3174 3176 //
// Parse [start, n) as a server-based authority, storing userInfo, host and
// port as they are found. Returns n on success; throws if anything is left
// over after the optional port.
3175 3177 private int parseServer(int start, int n)
3176 3178 throws URISyntaxException
3177 3179 {
3178 3180 int p = start;
3179 3181 int q;
3180 3182
3181 3183 // userinfo
// Present only if an '@' occurs before any '/', '?' or '#'.
3182 3184 q = scan(p, n, "/?#", "@");
3183 3185 if ((q >= p) && at(q, n, '@')) {
3184 3186 checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
3185 3187 userInfo = substring(p, q);
3186 3188 p = q + 1; // Skip '@'
3187 3189 }
3188 3190
3189 3191 // hostname, IPv4 address, or IPv6 address
3190 3192 if (at(p, n, '[')) {
3191 3193 // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
3192 3194 p++;
3193 3195 q = scan(p, n, "/?#", "]");
3194 3196 if ((q > p) && at(q, n, ']')) {
3195 3197 // look for a "%" scope id
// r is the index of the '%', or q when the literal has none.
3196 3198 int r = scan (p, q, "", "%");
3197 3199 if (r > p) {
3198 3200 parseIPv6Reference(p, r);
3199 3201 if (r+1 == q) {
3200 3202 fail ("scope id expected");
3201 3203 }
3202 3204 checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,
3203 3205 "scope id");
3204 3206 } else {
3205 3207 parseIPv6Reference(p, q);
3206 3208 }
// The stored host keeps the enclosing brackets.
3207 3209 host = substring(p-1, q+1);
3208 3210 p = q + 1;
3209 3211 } else {
3210 3212 failExpecting("closing bracket for IPv6 address", q);
3211 3213 }
3212 3214 } else {
// Try an IPv4 address first; fall back to a hostname if nothing matched.
3213 3215 q = parseIPv4Address(p, n);
3214 3216 if (q <= p)
3215 3217 q = parseHostname(p, n);
3216 3218 p = q;
3217 3219 }
3218 3220
3219 3221 // port
3220 3222 if (at(p, n, ':')) {
3221 3223 p++;
3222 3224 q = scan(p, n, "", "/");
// A ':' with nothing after it is accepted and leaves port untouched.
3223 3225 if (q > p) {
3224 3226 checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
3225 3227 try {
3226 3228 port = Integer.parseInt(substring(p, q));
3227 3229 } catch (NumberFormatException x) {
// All chars are digits here, so this is an overflow of int.
3228 3230 fail("Malformed port number", p);
3229 3231 }
3230 3232 p = q;
3231 3233 }
3232 3234 }
3233 3235 if (p < n)
3234 3236 failExpecting("port number", p);
3235 3237
3236 3238 return p;
3237 3239 }
3238 3240
3239 3241 // Scan a string of decimal digits whose value fits in a byte
3240 3242 //
3241 3243 private int scanByte(int start, int n)
3242 3244 throws URISyntaxException
3243 3245 {
3244 3246 int p = start;
3245 3247 int q = scan(p, n, L_DIGIT, H_DIGIT);
3246 3248 if (q <= p) return q;
3247 3249 if (Integer.parseInt(substring(p, q)) > 255) return p;
3248 3250 return q;
3249 3251 }
3250 3252
3251 3253 // Scan an IPv4 address.
3252 3254 //
3253 3255 // If the strict argument is true then we require that the given
3254 3256 // interval contain nothing besides an IPv4 address; if it is false
3255 3257 // then we only require that it start with an IPv4 address.
3256 3258 //
3257 3259 // If the interval does not contain or start with (depending upon the
3258 3260 // strict argument) a legal IPv4 address characters then we return -1
3259 3261 // immediately; otherwise we insist that these characters parse as a
3260 3262 // legal IPv4 address and throw an exception on failure.
3261 3263 //
3262 3264 // We assume that any string of decimal digits and dots must be an IPv4
3263 3265 // address. It won't parse as a hostname anyway, so making that
3264 3266 // assumption here allows more meaningful exceptions to be thrown.
3265 3267 //
3266 3268 private int scanIPv4Address(int start, int n, boolean strict)
3267 3269 throws URISyntaxException
3268 3270 {
3269 3271 int p = start;
3270 3272 int q;
3271 3273 int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
3272 3274 if ((m <= p) || (strict && (m != n)))
3273 3275 return -1;
3274 3276 for (;;) {
3275 3277 // Per RFC2732: At most three digits per byte
3276 3278 // Further constraint: Each element fits in a byte
3277 3279 if ((q = scanByte(p, m)) <= p) break; p = q;
3278 3280 if ((q = scan(p, m, '.')) <= p) break; p = q;
3279 3281 if ((q = scanByte(p, m)) <= p) break; p = q;
3280 3282 if ((q = scan(p, m, '.')) <= p) break; p = q;
3281 3283 if ((q = scanByte(p, m)) <= p) break; p = q;
3282 3284 if ((q = scan(p, m, '.')) <= p) break; p = q;
3283 3285 if ((q = scanByte(p, m)) <= p) break; p = q;
3284 3286 if (q < m) break;
3285 3287 return q;
3286 3288 }
3287 3289 fail("Malformed IPv4 address", q);
3288 3290 return -1;
3289 3291 }
3290 3292
3291 3293 // Take an IPv4 address: Throw an exception if the given interval
3292 3294 // contains anything except an IPv4 address
3293 3295 //
3294 3296 private int takeIPv4Address(int start, int n, String expected)
3295 3297 throws URISyntaxException
3296 3298 {
3297 3299 int p = scanIPv4Address(start, n, true);
3298 3300 if (p <= start)
3299 3301 failExpecting(expected, start);
3300 3302 return p;
3301 3303 }
3302 3304
3303 3305 // Attempt to parse an IPv4 address, returning -1 on failure but
3304 3306 // allowing the given interval to contain [:<characters>] after
3305 3307 // the IPv4 address.
3306 3308 //
3307 3309 private int parseIPv4Address(int start, int n) {
3308 3310 int p;
3309 3311
3310 3312 try {
3311 3313 p = scanIPv4Address(start, n, false);
3312 3314 } catch (URISyntaxException x) {
3313 3315 return -1;
3314 3316 } catch (NumberFormatException nfe) {
3315 3317 return -1;
3316 3318 }
3317 3319
3318 3320 if (p > start && p < n) {
3319 3321 // IPv4 address is followed by something - check that
3320 3322 // it's a ":" as this is the only valid character to
3321 3323 // follow an address.
3322 3324 if (charAt(p) != ':') {
3323 3325 p = -1;
3324 3326 }
3325 3327 }
3326 3328
3327 3329 if (p > start)
3328 3330 host = substring(start, p);
3329 3331
3330 3332 return p;
3331 3333 }
3332 3334
3333 3335 // hostname = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
3334 3336 // domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
3335 3337 // toplabel = alpha | alpha *( alphanum | "-" ) alphanum
3336 3338 //
3337 3339 private int parseHostname(int start, int n)
3338 3340 throws URISyntaxException
3339 3341 {
3340 3342 int p = start;
3341 3343 int q;
3342 3344 int l = -1; // Start of last parsed label
3343 3345
3344 3346 do {
3345 3347 // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
3346 3348 q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
3347 3349 if (q <= p)
3348 3350 break;
3349 3351 l = p;
3350 3352 if (q > p) {
3351 3353 p = q;
3352 3354 q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
3353 3355 if (q > p) {
3354 3356 if (charAt(q - 1) == '-')
3355 3357 fail("Illegal character in hostname", q - 1);
3356 3358 p = q;
3357 3359 }
3358 3360 }
3359 3361 q = scan(p, n, '.');
3360 3362 if (q <= p)
3361 3363 break;
3362 3364 p = q;
3363 3365 } while (p < n);
3364 3366
3365 3367 if ((p < n) && !at(p, n, ':'))
3366 3368 fail("Illegal character in hostname", p);
3367 3369
3368 3370 if (l < 0)
3369 3371 failExpecting("hostname", start);
3370 3372
3371 3373 // for a fully qualified hostname check that the rightmost
3372 3374 // label starts with an alpha character.
3373 3375 if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {
3374 3376 fail("Illegal character in hostname", l);
3375 3377 }
3376 3378
3377 3379 host = substring(start, p);
3378 3380 return p;
3379 3381 }
3380 3382
3381 3383
3382 3384 // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
3383 3385 //
3384 3386 // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
3385 3387 // the form ::12.34.56.78, which are clearly shown in the examples
3386 3388 // earlier in the document. Here is the original grammar:
3387 3389 //
3388 3390 // IPv6address = hexpart [ ":" IPv4address ]
3389 3391 // hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
3390 3392 // hexseq = hex4 *( ":" hex4)
3391 3393 // hex4 = 1*4HEXDIG
3392 3394 //
3393 3395 // We therefore use the following revised grammar:
3394 3396 //
3395 3397 // IPv6address = hexseq [ ":" IPv4address ]
3396 3398 // | hexseq [ "::" [ hexpost ] ]
3397 3399 // | "::" [ hexpost ]
3398 3400 // hexpost = hexseq | hexseq ":" IPv4address | IPv4address
3399 3401 // hexseq = hex4 *( ":" hex4)
3400 3402 // hex4 = 1*4HEXDIG
3401 3403 //
3402 3404 // This covers all and only the following cases:
3403 3405 //
3404 3406 // hexseq
3405 3407 // hexseq : IPv4address
3406 3408 // hexseq ::
3407 3409 // hexseq :: hexseq
3408 3410 // hexseq :: hexseq : IPv4address
3409 3411 // hexseq :: IPv4address
3410 3412 // :: hexseq
3411 3413 // :: hexseq : IPv4address
3412 3414 // :: IPv4address
3413 3415 // ::
3414 3416 //
3415 3417 // Additionally we constrain the IPv6 address as follows :-
3416 3418 //
3417 3419 // i. IPv6 addresses without compressed zeros should contain
3418 3420 // exactly 16 bytes.
3419 3421 //
3420 3422 // ii. IPv6 addresses with compressed zeros should contain
3421 3423 // less than 16 bytes.
3422 3424
3423 3425 private int ipv6byteCount = 0;
3424 3426
3425 3427 private int parseIPv6Reference(int start, int n)
3426 3428 throws URISyntaxException
3427 3429 {
3428 3430 int p = start;
3429 3431 int q;
3430 3432 boolean compressedZeros = false;
3431 3433
3432 3434 q = scanHexSeq(p, n);
3433 3435
3434 3436 if (q > p) {
3435 3437 p = q;
3436 3438 if (at(p, n, "::")) {
3437 3439 compressedZeros = true;
3438 3440 p = scanHexPost(p + 2, n);
3439 3441 } else if (at(p, n, ':')) {
3440 3442 p = takeIPv4Address(p + 1, n, "IPv4 address");
3441 3443 ipv6byteCount += 4;
3442 3444 }
3443 3445 } else if (at(p, n, "::")) {
3444 3446 compressedZeros = true;
3445 3447 p = scanHexPost(p + 2, n);
3446 3448 }
3447 3449 if (p < n)
3448 3450 fail("Malformed IPv6 address", start);
3449 3451 if (ipv6byteCount > 16)
3450 3452 fail("IPv6 address too long", start);
3451 3453 if (!compressedZeros && ipv6byteCount < 16)
3452 3454 fail("IPv6 address too short", start);
3453 3455 if (compressedZeros && ipv6byteCount == 16)
3454 3456 fail("Malformed IPv6 address", start);
3455 3457
3456 3458 return p;
3457 3459 }
3458 3460
3459 3461 private int scanHexPost(int start, int n)
3460 3462 throws URISyntaxException
3461 3463 {
3462 3464 int p = start;
3463 3465 int q;
3464 3466
3465 3467 if (p == n)
3466 3468 return p;
3467 3469
3468 3470 q = scanHexSeq(p, n);
3469 3471 if (q > p) {
3470 3472 p = q;
3471 3473 if (at(p, n, ':')) {
3472 3474 p++;
3473 3475 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3474 3476 ipv6byteCount += 4;
3475 3477 }
3476 3478 } else {
3477 3479 p = takeIPv4Address(p, n, "hex digits or IPv4 address");
3478 3480 ipv6byteCount += 4;
3479 3481 }
3480 3482 return p;
3481 3483 }
3482 3484
3483 3485 // Scan a hex sequence; return -1 if one could not be scanned
3484 3486 //
3485 3487 private int scanHexSeq(int start, int n)
3486 3488 throws URISyntaxException
3487 3489 {
3488 3490 int p = start;
3489 3491 int q;
3490 3492
3491 3493 q = scan(p, n, L_HEX, H_HEX);
3492 3494 if (q <= p)
3493 3495 return -1;
3494 3496 if (at(q, n, '.')) // Beginning of IPv4 address
3495 3497 return -1;
3496 3498 if (q > p + 4)
3497 3499 fail("IPv6 hexadecimal digit sequence too long", p);
3498 3500 ipv6byteCount += 2;
3499 3501 p = q;
3500 3502 while (p < n) {
3501 3503 if (!at(p, n, ':'))
3502 3504 break;
3503 3505 if (at(p + 1, n, ':'))
3504 3506 break; // "::"
3505 3507 p++;
3506 3508 q = scan(p, n, L_HEX, H_HEX);
3507 3509 if (q <= p)
3508 3510 failExpecting("digits for an IPv6 address", p);
3509 3511 if (at(q, n, '.')) { // Beginning of IPv4 address
3510 3512 p--;
3511 3513 break;
3512 3514 }
3513 3515 if (q > p + 4)
3514 3516 fail("IPv6 hexadecimal digit sequence too long", p);
3515 3517 ipv6byteCount += 2;
3516 3518 p = q;
3517 3519 }
3518 3520
3519 3521 return p;
3520 3522 }
3521 3523
3522 3524 }
3523 3525
3524 3526 }
↓ open down ↓ |
1801 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX