1 /*
   2  * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 package java.net;
  26 
  27 import java.io.InputStream;
  28 import java.io.IOException;
  29 import java.security.AccessController;
  30 import java.security.PrivilegedAction;
  31 
  32 import sun.net.idn.StringPrep;
  33 import sun.net.idn.Punycode;
  34 import sun.text.normalizer.UCharacterIterator;
  35 
  36 /**
  37  * Provides methods to convert internationalized domain names (IDNs) between
  38  * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
  39  * Internationalized domain names can use characters from the entire range of
  40  * Unicode, while traditional domain names are restricted to ASCII characters.
  41  * ACE is an encoding of Unicode strings that uses only ASCII characters and
  42  * can be used with software (such as the Domain Name System) that only
  43  * understands traditional domain names.
  44  *
  45  * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
  46  * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ
  47  * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
  48  * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and
  49  * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
  50  * domain name string back and forth.
  51  *
  52  * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
  53  *   <ul>
  54  *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
  55  *         can contain code points that are unassigned in Unicode 3.2, which is the
  56  *         Unicode version on which IDN conversion is based. If the flag is not used,
  57  *         the presence of such unassigned code points is treated as an error.
  58  *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
  59  *         It is an error if they don't meet the requirements.
  60  *   </ul>
  61  * These flags can be logically OR'ed together.
  62  *
 * <p>Security considerations are important with respect to internationalized
 * domain name support. For example, English domain names may be <i>homographed</i>
 * - maliciously misspelled by substitution of non-Latin letters.
  66  * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
  67  * discusses security issues of IDN support as well as possible solutions.
  68  * Applications are responsible for taking adequate security measures when using
  69  * international domain names.
  70  *
  71  * @author Edward Wang
  72  * @since 1.6
  73  *
  74  */
  75 public final class IDN {
    /**
     * Flag to allow processing of unassigned code points.
     * When this flag is not set, the presence of code points that are
     * unassigned in Unicode 3.2 in the input is treated as an error.
     */
    public static final int ALLOW_UNASSIGNED = 0x01;

    /**
     * Flag to turn on the check against STD-3 ASCII rules.
     * When set, {@code toASCII} rejects labels that contain non-LDH
     * (letter/digit/hyphen) ASCII characters or that begin or end with
     * a hyphen.
     */
    public static final int USE_STD3_ASCII_RULES = 0x02;
  85 
  86 
  87     /**
  88      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
  89      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
  90      *
  91      * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
  92      * If ToASCII operation fails, an IllegalArgumentException will be thrown.
  93      * In this case, the input string should not be used in an internationalized domain name.
  94      *
  95      * <p> A label is an individual part of a domain name. The original ToASCII operation,
  96      * as defined in RFC 3490, only operates on a single label. This method can handle
  97      * both label and entire domain name, by assuming that labels in a domain name are
  98      * always separated by dots. The following characters are recognized as dots:
  99      * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
     * and \uFF61 (halfwidth ideographic full stop). If dots are
 101      * used as label separators, this method also changes all of them to \u002E (full stop)
 102      * in output translated string.
 103      *
 104      * @param input     the string to be processed
 105      * @param flag      process flag; can be 0 or any logical OR of possible flags
 106      *
 107      * @return          the translated {@code String}
 108      *
 109      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
 110      */
 111     public static String toASCII(String input, int flag)
 112     {
 113         int p = 0, q = 0;
 114         StringBuilder out = new StringBuilder();
 115 
 116         if (isRootLabel(input)) {
 117             return ".";
 118         }
 119 
 120         while (p < input.length()) {
 121             q = searchDots(input, p);
 122             out.append(toASCIIInternal(input.substring(p, q),  flag));
 123             if (q != (input.length())) {
 124                // has more labels, or keep the trailing dot as at present
 125                out.append('.');
 126             }
 127             p = q + 1;
 128         }
 129 
 130         return out.toString();
 131     }
 132 
 133 
 134     /**
 135      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
 136      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
 137      *
 138      * <p> This convenience method works as if by invoking the
 139      * two-argument counterpart as follows:
 140      * <blockquote>
 141      * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
 142      * </blockquote>
 143      *
 144      * @param input     the string to be processed
 145      *
 146      * @return          the translated {@code String}
 147      *
 148      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
 149      */
 150     public static String toASCII(String input) {
 151         return toASCII(input, 0);
 152     }
 153 
 154 
 155     /**
 156      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
 157      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
 158      *
 159      * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
 160      *
 161      * <p> A label is an individual part of a domain name. The original ToUnicode operation,
 162      * as defined in RFC 3490, only operates on a single label. This method can handle
 163      * both label and entire domain name, by assuming that labels in a domain name are
 164      * always separated by dots. The following characters are recognized as dots:
 165      * \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
 166      * and \uFF61 (halfwidth ideographic full stop).
 167      *
 168      * @param input     the string to be processed
 169      * @param flag      process flag; can be 0 or any logical OR of possible flags
 170      *
 171      * @return          the translated {@code String}
 172      */
 173     public static String toUnicode(String input, int flag) {
 174         int p = 0, q = 0;
 175         StringBuilder out = new StringBuilder();
 176 
 177         if (isRootLabel(input)) {
 178             return ".";
 179         }
 180 
 181         while (p < input.length()) {
 182             q = searchDots(input, p);
 183             out.append(toUnicodeInternal(input.substring(p, q),  flag));
 184             if (q != (input.length())) {
 185                // has more labels, or keep the trailing dot as at present
 186                out.append('.');
 187             }
 188             p = q + 1;
 189         }
 190 
 191         return out.toString();
 192     }
 193 
 194 
 195     /**
 196      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
 197      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
 198      *
 199      * <p> This convenience method works as if by invoking the
 200      * two-argument counterpart as follows:
 201      * <blockquote>
 202      * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
 203      * </blockquote>
 204      *
 205      * @param input     the string to be processed
 206      *
 207      * @return          the translated {@code String}
 208      */
 209     public static String toUnicode(String input) {
 210         return toUnicode(input, 0);
 211     }
 212 
 213 
 214     /* ---------------- Private members -------------- */
 215 
 216     // ACE Prefix is "xn--"
 217     private static final String ACE_PREFIX = "xn--";
 218     private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
 219 
 220     private static final int MAX_LABEL_LENGTH   = 63;
 221 
 222     // single instance of nameprep
 223     private static StringPrep namePrep = null;
 224 
 225     static {
 226         InputStream stream = null;
 227 
 228         try {
 229             final String IDN_PROFILE = "uidna.spp";
 230             if (System.getSecurityManager() != null) {
 231                 stream = AccessController.doPrivileged(new PrivilegedAction<>() {
 232                     public InputStream run() {
 233                         return StringPrep.class.getResourceAsStream(IDN_PROFILE);
 234                     }
 235                 });
 236             } else {
 237                 stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
 238             }
 239 
 240             namePrep = new StringPrep(stream);
 241             stream.close();
 242         } catch (IOException e) {
 243             // should never reach here
 244             assert false;
 245         }
 246     }
 247 
 248 
 249     /* ---------------- Private operations -------------- */
 250 
 251 
    //
    // Suppresses the default zero-argument constructor: every operation of
    // this class is a static method, so instances are never needed.
    //
    private IDN() {}
 256 
 257     //
 258     // toASCII operation; should only apply to a single label
 259     //
 260     private static String toASCIIInternal(String label, int flag)
 261     {
 262         // step 1
 263         // Check if the string contains code points outside the ASCII range 0..0x7c.
 264         boolean isASCII  = isAllASCII(label);
 265         StringBuffer dest;
 266 
 267         // step 2
 268         // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
 269         if (!isASCII) {
 270             UCharacterIterator iter = UCharacterIterator.getInstance(label);
 271             try {
 272                 dest = namePrep.prepare(iter, flag);
 273             } catch (java.text.ParseException e) {
 274                 throw new IllegalArgumentException(e);
 275             }
 276         } else {
 277             dest = new StringBuffer(label);
 278         }
 279 
 280         // step 8, move forward to check the smallest number of the code points
 281         // the length must be inside 1..63
 282         if (dest.length() == 0) {
 283             throw new IllegalArgumentException(
 284                         "Empty label is not a legal name");
 285         }
 286 
 287         // step 3
 288         // Verify the absence of non-LDH ASCII code points
 289         //   0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
 290         // Verify the absence of leading and trailing hyphen
 291         boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
 292         if (useSTD3ASCIIRules) {
 293             for (int i = 0; i < dest.length(); i++) {
 294                 int c = dest.charAt(i);
 295                 if (isNonLDHAsciiCodePoint(c)) {
 296                     throw new IllegalArgumentException(
 297                         "Contains non-LDH ASCII characters");
 298                 }
 299             }
 300 
 301             if (dest.charAt(0) == '-' ||
 302                 dest.charAt(dest.length() - 1) == '-') {
 303 
 304                 throw new IllegalArgumentException(
 305                         "Has leading or trailing hyphen");
 306             }
 307         }
 308 
 309         if (!isASCII) {
 310             // step 4
 311             // If all code points are inside 0..0x7f, skip to step 8
 312             if (!isAllASCII(dest.toString())) {
 313                 // step 5
 314                 // verify the sequence does not begin with ACE prefix
 315                 if(!startsWithACEPrefix(dest)){
 316 
 317                     // step 6
 318                     // encode the sequence with punycode
 319                     try {
 320                         dest = Punycode.encode(dest, null);
 321                     } catch (java.text.ParseException e) {
 322                         throw new IllegalArgumentException(e);
 323                     }
 324 
 325                     dest = toASCIILower(dest);
 326 
 327                     // step 7
 328                     // prepend the ACE prefix
 329                     dest.insert(0, ACE_PREFIX);
 330                 } else {
 331                     throw new IllegalArgumentException("The input starts with the ACE Prefix");
 332                 }
 333 
 334             }
 335         }
 336 
 337         // step 8
 338         // the length must be inside 1..63
 339         if (dest.length() > MAX_LABEL_LENGTH) {
 340             throw new IllegalArgumentException("The label in the input is too long");
 341         }
 342 
 343         return dest.toString();
 344     }
 345 
 346     //
 347     // toUnicode operation; should only apply to a single label
 348     //
 349     private static String toUnicodeInternal(String label, int flag) {
 350         boolean[] caseFlags = null;
 351         StringBuffer dest;
 352 
 353         // step 1
 354         // find out if all the codepoints in input are ASCII
 355         boolean isASCII = isAllASCII(label);
 356 
 357         if(!isASCII){
 358             // step 2
 359             // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
 360             try {
 361                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
 362                 dest = namePrep.prepare(iter, flag);
 363             } catch (Exception e) {
 364                 // toUnicode never fails; if any step fails, return the input string
 365                 return label;
 366             }
 367         } else {
 368             dest = new StringBuffer(label);
 369         }
 370 
 371         // step 3
 372         // verify ACE Prefix
 373         if(startsWithACEPrefix(dest)) {
 374 
 375             // step 4
 376             // Remove the ACE Prefix
 377             String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
 378 
 379             try {
 380                 // step 5
 381                 // Decode using punycode
 382                 StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);
 383 
 384                 // step 6
 385                 // Apply toASCII
 386                 String toASCIIOut = toASCII(decodeOut.toString(), flag);
 387 
 388                 // step 7
 389                 // verify
 390                 if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
 391                     // step 8
 392                     // return output of step 5
 393                     return decodeOut.toString();
 394                 }
 395             } catch (Exception ignored) {
 396                 // no-op
 397             }
 398         }
 399 
 400         // just return the input
 401         return label;
 402     }
 403 
 404 
 405     //
 406     // LDH stands for "letter/digit/hyphen", with characters restricted to the
 407     // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
 408     // <->.
 409     // Non LDH refers to characters in the ASCII range, but which are not
 410     // letters, digits or the hypen.
 411     //
 412     // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F
 413     //
 414     private static boolean isNonLDHAsciiCodePoint(int ch){
 415         return (0x0000 <= ch && ch <= 0x002C) ||
 416                (0x002E <= ch && ch <= 0x002F) ||
 417                (0x003A <= ch && ch <= 0x0040) ||
 418                (0x005B <= ch && ch <= 0x0060) ||
 419                (0x007B <= ch && ch <= 0x007F);
 420     }
 421 
 422     //
 423     // search dots in a string and return the index of that character;
 424     // or if there is no dots, return the length of input string
 425     // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
 426     // and \uFF61 (halfwidth ideographic full stop).
 427     //
 428     private static int searchDots(String s, int start) {
 429         int i;
 430         for (i = start; i < s.length(); i++) {
 431             if (isLabelSeparator(s.charAt(i))) {
 432                 break;
 433             }
 434         }
 435 
 436         return i;
 437     }
 438 
 439     //
 440     // to check if a string is a root label, ".".
 441     //
 442     private static boolean isRootLabel(String s) {
 443         return (s.length() == 1 && isLabelSeparator(s.charAt(0)));
 444     }
 445 
 446     //
 447     // to check if a character is a label separator, i.e. a dot character.
 448     //
 449     private static boolean isLabelSeparator(char c) {
 450         return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61');
 451     }
 452 
 453     //
 454     // to check if a string only contains US-ASCII code point
 455     //
 456     private static boolean isAllASCII(String input) {
 457         boolean isASCII = true;
 458         for (int i = 0; i < input.length(); i++) {
 459             int c = input.charAt(i);
 460             if (c > 0x7F) {
 461                 isASCII = false;
 462                 break;
 463             }
 464         }
 465         return isASCII;
 466     }
 467 
 468     //
 469     // to check if a string starts with ACE-prefix
 470     //
 471     private static boolean startsWithACEPrefix(StringBuffer input){
 472         boolean startsWithPrefix = true;
 473 
 474         if(input.length() < ACE_PREFIX_LENGTH){
 475             return false;
 476         }
 477         for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
 478             if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
 479                 startsWithPrefix = false;
 480             }
 481         }
 482         return startsWithPrefix;
 483     }
 484 
 485     private static char toASCIILower(char ch){
 486         if('A' <= ch && ch <= 'Z'){
 487             return (char)(ch + 'a' - 'A');
 488         }
 489         return ch;
 490     }
 491 
 492     private static StringBuffer toASCIILower(StringBuffer input){
 493         StringBuffer dest = new StringBuffer();
 494         for(int i = 0; i < input.length();i++){
 495             dest.append(toASCIILower(input.charAt(i)));
 496         }
 497         return dest;
 498     }
 499 }