1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   3  *
   4  * This code is free software; you can redistribute it and/or modify it
   5  * under the terms of the GNU General Public License version 2 only, as
   6  * published by the Free Software Foundation.  Oracle designates this
   7  * particular file as subject to the "Classpath" exception as provided
   8  * by Oracle in the LICENSE file that accompanied this code.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  */
  24 /*
  25  *******************************************************************************
  26  * Copyright (C) 2003-2004, International Business Machines Corporation and    *
  27  * others. All Rights Reserved.                                                *
  28  *******************************************************************************
  29  */
  30 //
  31 // CHANGELOG
  32 //      2005-05-19 Edward Wang
  33 //          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
  34 //          - move from package com.ibm.icu.text to package sun.net.idn
  35 //          - use ParseException instead of StringPrepParseException
  36 //      2007-08-14 Martin Buchholz
  37 //          - remove redundant casts
  38 //
  39 package sun.net.idn;
  40 
  41 import java.text.ParseException;
  42 import sun.text.normalizer.UCharacter;
  43 import sun.text.normalizer.UTF16;
  44 
  45 /**
  46  * Ported code from ICU punycode.c
  47  * @author ram
  48  */
  49 
  50 /* Package Private class */
  51 public final class Punycode {
  52 
  53     /* Punycode parameters for Bootstring */
  54     private static final int BASE           = 36;
  55     private static final int TMIN           = 1;
  56     private static final int TMAX           = 26;
  57     private static final int SKEW           = 38;
  58     private static final int DAMP           = 700;
  59     private static final int INITIAL_BIAS   = 72;
  60     private static final int INITIAL_N      = 0x80;
  61 
  62     /* "Basic" Unicode/ASCII code points */
  63     private static final int HYPHEN         = 0x2d;
  64     private static final int DELIMITER      = HYPHEN;
  65 
  66     private static final int ZERO           = 0x30;
  67     private static final int NINE           = 0x39;
  68 
  69     private static final int SMALL_A        = 0x61;
  70     private static final int SMALL_Z        = 0x7a;
  71 
  72     private static final int CAPITAL_A      = 0x41;
  73     private static final int CAPITAL_Z      = 0x5a;
  74 
  75     //  TODO: eliminate the 256 limitation
  76     private static final int MAX_CP_COUNT   = 256;
  77 
  78     private static final int UINT_MAGIC     = 0x80000000;
  79     private static final long ULONG_MAGIC   = 0x8000000000000000L;
  80 
  81     private static int adaptBias(int delta, int length, boolean firstTime){
  82         if(firstTime){
  83             delta /=DAMP;
  84         }else{
  85             delta /=  2;
  86         }
  87         delta += delta/length;
  88 
  89         int count=0;
  90         for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
  91             delta/=(BASE-TMIN);
  92         }
  93 
  94         return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
  95     }
  96 
  97     /**
  98      * basicToDigit[] contains the numeric value of a basic code
  99      * point (for use in representing integers) in the range 0 to
 100      * BASE-1, or -1 if b is does not represent a value.
 101      */
 102     static final int[]    basicToDigit= new int[]{
 103         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 104         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 105 
 106         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 107         26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
 108 
 109         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 110         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 111 
 112         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 113         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 114 
 115         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 116         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 117 
 118         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 119         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 120 
 121         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 122         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 123 
 124         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 125         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 126     };
 127 
 128     private static char asciiCaseMap(char b, boolean uppercase) {
 129         if(uppercase) {
 130             if(SMALL_A<=b && b<=SMALL_Z) {
 131                 b-=(SMALL_A-CAPITAL_A);
 132             }
 133         } else {
 134             if(CAPITAL_A<=b && b<=CAPITAL_Z) {
 135                 b+=(SMALL_A-CAPITAL_A);
 136             }
 137         }
 138         return b;
 139     }
 140 
 141     /**
 142      * digitToBasic() returns the basic code point whose value
 143      * (when used for representing integers) is d, which must be in the
 144      * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
 145      * nonzero, in which case the uppercase form is used.
 146      */
 147     private static char digitToBasic(int digit, boolean uppercase) {
 148         /*  0..25 map to ASCII a..z or A..Z */
 149         /* 26..35 map to ASCII 0..9         */
 150         if(digit<26) {
 151             if(uppercase) {
 152                 return (char)(CAPITAL_A+digit);
 153             } else {
 154                 return (char)(SMALL_A+digit);
 155             }
 156         } else {
 157             return (char)((ZERO-26)+digit);
 158         }
 159     }
 160     /**
 161      * Converts Unicode to Punycode.
 162      * The input string must not contain single, unpaired surrogates.
 163      * The output will be represented as an array of ASCII code points.
 164      *
 165      * @param src
 166      * @param caseFlags
 167      * @return
 168      * @throws ParseException
 169      */
 170     public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
 171 
 172         int[] cpBuffer = new int[MAX_CP_COUNT];
 173         int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
 174         char c, c2;
 175         int srcLength = src.length();
 176         int destCapacity = MAX_CP_COUNT;
 177         char[] dest = new char[destCapacity];
 178         StringBuffer result = new StringBuffer();
 179         /*
 180          * Handle the basic code points and
 181          * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
 182          */
 183         srcCPCount=destLength=0;
 184 
 185         for(j=0; j<srcLength; ++j) {
 186             if(srcCPCount==MAX_CP_COUNT) {
 187                 /* too many input code points */
 188                 throw new IndexOutOfBoundsException();
 189             }
 190             c=src.charAt(j);
 191             if(isBasic(c)) {
 192                 if(destLength<destCapacity) {
 193                     cpBuffer[srcCPCount++]=0;
 194                     dest[destLength]=
 195                         caseFlags!=null ?
 196                             asciiCaseMap(c, caseFlags[j]) :
 197                             c;
 198                 }
 199                 ++destLength;
 200             } else {
 201                 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
 202                 if(!UTF16.isSurrogate(c)) {
 203                     n|=c;
 204                 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
 205                     ++j;
 206 
 207                     n|=UCharacter.getCodePoint(c, c2);
 208                 } else {
 209                     /* error: unmatched surrogate */
 210                     throw new ParseException("Illegal char found", -1);
 211                 }
 212                 cpBuffer[srcCPCount++]=n;
 213             }
 214         }
 215 
 216         /* Finish the basic string - if it is not empty - with a delimiter. */
 217         basicLength=destLength;
 218         if(basicLength>0) {
 219             if(destLength<destCapacity) {
 220                 dest[destLength]=DELIMITER;
 221             }
 222             ++destLength;
 223         }
 224 
 225         /*
 226          * handledCPCount is the number of code points that have been handled
 227          * basicLength is the number of basic code points
 228          * destLength is the number of chars that have been output
 229          */
 230 
 231         /* Initialize the state: */
 232         n=INITIAL_N;
 233         delta=0;
 234         bias=INITIAL_BIAS;
 235 
 236         /* Main encoding loop: */
 237         for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
 238             /*
 239              * All non-basic code points < n have been handled already.
 240              * Find the next larger one:
 241              */
 242             for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
 243                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 244                 if(n<=q && q<m) {
 245                     m=q;
 246                 }
 247             }
 248 
 249             /*
 250              * Increase delta enough to advance the decoder's
 251              * <n,i> state to <m,0>, but guard against overflow:
 252              */
 253             if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
 254                 throw new RuntimeException("Internal program error");
 255             }
 256             delta+=(m-n)*(handledCPCount+1);
 257             n=m;
 258 
 259             /* Encode a sequence of same code points n */
 260             for(j=0; j<srcCPCount; ++j) {
 261                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 262                 if(q<n) {
 263                     ++delta;
 264                 } else if(q==n) {
 265                     /* Represent delta as a generalized variable-length integer: */
 266                     for(q=delta, k=BASE; /* no condition */; k+=BASE) {
 267 
 268                         /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
 269 
 270                         t=k-bias;
 271                         if(t<TMIN) {
 272                             t=TMIN;
 273                         } else if(t>TMAX) {
 274                             t=TMAX;
 275                         }
 276                         */
 277 
 278                         t=k-bias;
 279                         if(t<TMIN) {
 280                             t=TMIN;
 281                         } else if(k>=(bias+TMAX)) {
 282                             t=TMAX;
 283                         }
 284 
 285                         if(q<t) {
 286                             break;
 287                         }
 288 
 289                         if(destLength<destCapacity) {
 290                             dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
 291                         }
 292                         q=(q-t)/(BASE-t);
 293                     }
 294 
 295                     if(destLength<destCapacity) {
 296                         dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
 297                     }
 298                     bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
 299                     delta=0;
 300                     ++handledCPCount;
 301                 }
 302             }
 303 
 304             ++delta;
 305             ++n;
 306         }
 307 
 308         return result.append(dest, 0, destLength);
 309     }
 310 
 311     private static boolean isBasic(int ch){
 312         return (ch < INITIAL_N);
 313     }
 314 
 315     private static boolean isBasicUpperCase(int ch){
 316         return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
 317     }
 318 
 319     private static boolean isSurrogate(int ch){
 320         return (((ch)&0xfffff800)==0xd800);
 321     }
 322     /**
 323      * Converts Punycode to Unicode.
 324      * The Unicode string will be at most as long as the Punycode string.
 325      *
 326      * @param src
 327      * @param caseFlags
 328      * @return
 329      * @throws ParseException
 330      */
 331     public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
 332                                throws ParseException{
 333         int srcLength = src.length();
 334         StringBuffer result = new StringBuffer();
 335         int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
 336                 destCPCount, firstSupplementaryIndex, cpLength;
 337         char b;
 338         int destCapacity = MAX_CP_COUNT;
 339         char[] dest = new char[destCapacity];
 340 
 341         /*
 342          * Handle the basic code points:
 343          * Let basicLength be the number of input code points
 344          * before the last delimiter, or 0 if there is none,
 345          * then copy the first basicLength code points to the output.
 346          *
 347          * The two following loops iterate backward.
 348          */
 349         for(j=srcLength; j>0;) {
 350             if(src.charAt(--j)==DELIMITER) {
 351                 break;
 352             }
 353         }
 354         destLength=basicLength=destCPCount=j;
 355 
 356         while(j>0) {
 357             b=src.charAt(--j);
 358             if(!isBasic(b)) {
 359                 throw new ParseException("Illegal char found", -1);
 360             }
 361 
 362             if(j<destCapacity) {
 363                 dest[j]= b;
 364 
 365                 if(caseFlags!=null) {
 366                     caseFlags[j]=isBasicUpperCase(b);
 367                 }
 368             }
 369         }
 370 
 371         /* Initialize the state: */
 372         n=INITIAL_N;
 373         i=0;
 374         bias=INITIAL_BIAS;
 375         firstSupplementaryIndex=1000000000;
 376 
 377         /*
 378          * Main decoding loop:
 379          * Start just after the last delimiter if any
 380          * basic code points were copied; start at the beginning otherwise.
 381          */
 382         for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
 383             /*
 384              * in is the index of the next character to be consumed, and
 385              * destCPCount is the number of code points in the output array.
 386              *
 387              * Decode a generalized variable-length integer into delta,
 388              * which gets added to i.  The overflow checking is easier
 389              * if we increase i as we go, then subtract off its starting
 390              * value at the end to obtain delta.
 391              */
 392             for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
 393                 if(in>=srcLength) {
 394                     throw new ParseException("Illegal char found", -1);
 395                 }
 396 
 397                 digit=basicToDigit[(byte)src.charAt(in++)];
 398                 if(digit<0) {
 399                     throw new ParseException("Invalid char found", -1);
 400                 }
 401                 if(digit>(0x7fffffff-i)/w) {
 402                     /* integer overflow */
 403                     throw new ParseException("Illegal char found", -1);
 404                 }
 405 
 406                 i+=digit*w;
 407                 t=k-bias;
 408                 if(t<TMIN) {
 409                     t=TMIN;
 410                 } else if(k>=(bias+TMAX)) {
 411                     t=TMAX;
 412                 }
 413                 if(digit<t) {
 414                     break;
 415                 }
 416 
 417                 if(w>0x7fffffff/(BASE-t)) {
 418                     /* integer overflow */
 419                     throw new ParseException("Illegal char found", -1);
 420                 }
 421                 w*=BASE-t;
 422             }
 423 
 424             /*
 425              * Modification from sample code:
 426              * Increments destCPCount here,
 427              * where needed instead of in for() loop tail.
 428              */
 429             ++destCPCount;
 430             bias=adaptBias(i-oldi, destCPCount, (oldi==0));
 431 
 432             /*
 433              * i was supposed to wrap around from (incremented) destCPCount to 0,
 434              * incrementing n each time, so we'll fix that now:
 435              */
 436             if(i/destCPCount>(0x7fffffff-n)) {
 437                 /* integer overflow */
 438                 throw new ParseException("Illegal char found", -1);
 439             }
 440 
 441             n+=i/destCPCount;
 442             i%=destCPCount;
 443             /* not needed for Punycode: */
 444             /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
 445 
 446             if(n>0x10ffff || isSurrogate(n)) {
 447                 /* Unicode code point overflow */
 448                 throw new ParseException("Illegal char found", -1);
 449             }
 450 
 451             /* Insert n at position i of the output: */
 452             cpLength=UTF16.getCharCount(n);
 453             if((destLength+cpLength)<destCapacity) {
 454                 int codeUnitIndex;
 455 
 456                 /*
 457                  * Handle indexes when supplementary code points are present.
 458                  *
 459                  * In almost all cases, there will be only BMP code points before i
 460                  * and even in the entire string.
 461                  * This is handled with the same efficiency as with UTF-32.
 462                  *
 463                  * Only the rare cases with supplementary code points are handled
 464                  * more slowly - but not too bad since this is an insertion anyway.
 465                  */
 466                 if(i<=firstSupplementaryIndex) {
 467                     codeUnitIndex=i;
 468                     if(cpLength>1) {
 469                         firstSupplementaryIndex=codeUnitIndex;
 470                     } else {
 471                         ++firstSupplementaryIndex;
 472                     }
 473                 } else {
 474                     codeUnitIndex=firstSupplementaryIndex;
 475                     codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
 476                 }
 477 
 478                 /* use the UChar index codeUnitIndex instead of the code point index i */
 479                 if(codeUnitIndex<destLength) {
 480                     System.arraycopy(dest, codeUnitIndex,
 481                                      dest, codeUnitIndex+cpLength,
 482                                     (destLength-codeUnitIndex));
 483                     if(caseFlags!=null) {
 484                         System.arraycopy(caseFlags, codeUnitIndex,
 485                                          caseFlags, codeUnitIndex+cpLength,
 486                                          destLength-codeUnitIndex);
 487                     }
 488                 }
 489                 if(cpLength==1) {
 490                     /* BMP, insert one code unit */
 491                     dest[codeUnitIndex]=(char)n;
 492                 } else {
 493                     /* supplementary character, insert two code units */
 494                     dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
 495                     dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
 496                 }
 497                 if(caseFlags!=null) {
 498                     /* Case of last character determines uppercase flag: */
 499                     caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
 500                     if(cpLength==2) {
 501                         caseFlags[codeUnitIndex+1]=false;
 502                     }
 503                 }
 504             }
 505             destLength+=cpLength;
 506             ++i;
 507         }
 508         result.append(dest, 0, destLength);
 509         return result;
 510     }
 511 }