New src/java.base/share/classes/jdk/internal/icu/impl/Punycode.java

   1 /*
   2  * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * Copyright (C) 2003-2004, International Business Machines Corporation and    *
  28  * others. All Rights Reserved.                                                *
  29  *******************************************************************************
  30  */
  31 //
  32 // CHANGELOG
  33 //      2005-05-19 Edward Wang
  34 //          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
  35 //          - move from package com.ibm.icu.text to package sun.net.idn
  36 //          - use ParseException instead of StringPrepParseException
  37 //      2007-08-14 Martin Buchholz
  38 //          - remove redundant casts
  39 //
  40 package jdk.internal.icu.impl;
  41 
  42 import java.text.ParseException;
  43 import jdk.internal.icu.lang.UCharacter;
  44 import jdk.internal.icu.text.UTF16;
  45 
  46 /**
  47  * Ported code from ICU punycode.c
  48  * @author ram
  49  */
  50 
  51 /* Package Private class */
  52 public final class Punycode {
  53 
  54     /* Punycode parameters for Bootstring */
  55     private static final int BASE           = 36;
  56     private static final int TMIN           = 1;
  57     private static final int TMAX           = 26;
  58     private static final int SKEW           = 38;
  59     private static final int DAMP           = 700;
  60     private static final int INITIAL_BIAS   = 72;
  61     private static final int INITIAL_N      = 0x80;
  62 
  63     /* "Basic" Unicode/ASCII code points */
  64     private static final int HYPHEN         = 0x2d;
  65     private static final int DELIMITER      = HYPHEN;
  66 
  67     private static final int ZERO           = 0x30;
  68     private static final int NINE           = 0x39;
  69 
  70     private static final int SMALL_A        = 0x61;
  71     private static final int SMALL_Z        = 0x7a;
  72 
  73     private static final int CAPITAL_A      = 0x41;
  74     private static final int CAPITAL_Z      = 0x5a;
  75 
  76     //  TODO: eliminate the 256 limitation
  77     private static final int MAX_CP_COUNT   = 256;
  78 
  79     private static final int UINT_MAGIC     = 0x80000000;
  80     private static final long ULONG_MAGIC   = 0x8000000000000000L;
  81 
  82     private static int adaptBias(int delta, int length, boolean firstTime){
  83         if(firstTime){
  84             delta /=DAMP;
  85         }else{
  86             delta /=  2;
  87         }
  88         delta += delta/length;
  89 
  90         int count=0;
  91         for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
  92             delta/=(BASE-TMIN);
  93         }
  94 
  95         return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
  96     }
  97 
  98     /**
  99      * basicToDigit[] contains the numeric value of a basic code
 100      * point (for use in representing integers) in the range 0 to
 101      * BASE-1, or -1 if b is does not represent a value.
 102      */
 103     static final int[]    basicToDigit= new int[]{
 104         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 105         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 106 
 107         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 108         26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
 109 
 110         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 111         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 112 
 113         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
 114         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
 115 
 116         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 117         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 118 
 119         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 120         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 121 
 122         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 123         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 124 
 125         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 126         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
 127     };
 128 
 129     private static char asciiCaseMap(char b, boolean uppercase) {
 130         if(uppercase) {
 131             if(SMALL_A<=b && b<=SMALL_Z) {
 132                 b-=(SMALL_A-CAPITAL_A);
 133             }
 134         } else {
 135             if(CAPITAL_A<=b && b<=CAPITAL_Z) {
 136                 b+=(SMALL_A-CAPITAL_A);
 137             }
 138         }
 139         return b;
 140     }
 141 
 142     /**
 143      * digitToBasic() returns the basic code point whose value
 144      * (when used for representing integers) is d, which must be in the
 145      * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
 146      * nonzero, in which case the uppercase form is used.
 147      */
 148     private static char digitToBasic(int digit, boolean uppercase) {
 149         /*  0..25 map to ASCII a..z or A..Z */
 150         /* 26..35 map to ASCII 0..9         */
 151         if(digit<26) {
 152             if(uppercase) {
 153                 return (char)(CAPITAL_A+digit);
 154             } else {
 155                 return (char)(SMALL_A+digit);
 156             }
 157         } else {
 158             return (char)((ZERO-26)+digit);
 159         }
 160     }
 161     /**
 162      * Converts Unicode to Punycode.
 163      * The input string must not contain single, unpaired surrogates.
 164      * The output will be represented as an array of ASCII code points.
 165      *
 166      * @param src
 167      * @param caseFlags
 168      * @return
 169      * @throws ParseException
 170      */
 171     public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
 172 
 173         int[] cpBuffer = new int[MAX_CP_COUNT];
 174         int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
 175         char c, c2;
 176         int srcLength = src.length();
 177         int destCapacity = MAX_CP_COUNT;
 178         char[] dest = new char[destCapacity];
 179         StringBuffer result = new StringBuffer();
 180         /*
 181          * Handle the basic code points and
 182          * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
 183          */
 184         srcCPCount=destLength=0;
 185 
 186         for(j=0; j<srcLength; ++j) {
 187             if(srcCPCount==MAX_CP_COUNT) {
 188                 /* too many input code points */
 189                 throw new IndexOutOfBoundsException();
 190             }
 191             c=src.charAt(j);
 192             if(isBasic(c)) {
 193                 if(destLength<destCapacity) {
 194                     cpBuffer[srcCPCount++]=0;
 195                     dest[destLength]=
 196                         caseFlags!=null ?
 197                             asciiCaseMap(c, caseFlags[j]) :
 198                             c;
 199                 }
 200                 ++destLength;
 201             } else {
 202                 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
 203                 if(!UTF16.isSurrogate(c)) {
 204                     n|=c;
 205                 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
 206                     ++j;
 207 
 208                     n|=UCharacter.getCodePoint(c, c2);
 209                 } else {
 210                     /* error: unmatched surrogate */
 211                     throw new ParseException("Illegal char found", -1);
 212                 }
 213                 cpBuffer[srcCPCount++]=n;
 214             }
 215         }
 216 
 217         /* Finish the basic string - if it is not empty - with a delimiter. */
 218         basicLength=destLength;
 219         if(basicLength>0) {
 220             if(destLength<destCapacity) {
 221                 dest[destLength]=DELIMITER;
 222             }
 223             ++destLength;
 224         }
 225 
 226         /*
 227          * handledCPCount is the number of code points that have been handled
 228          * basicLength is the number of basic code points
 229          * destLength is the number of chars that have been output
 230          */
 231 
 232         /* Initialize the state: */
 233         n=INITIAL_N;
 234         delta=0;
 235         bias=INITIAL_BIAS;
 236 
 237         /* Main encoding loop: */
 238         for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
 239             /*
 240              * All non-basic code points < n have been handled already.
 241              * Find the next larger one:
 242              */
 243             for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
 244                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 245                 if(n<=q && q<m) {
 246                     m=q;
 247                 }
 248             }
 249 
 250             /*
 251              * Increase delta enough to advance the decoder's
 252              * <n,i> state to <m,0>, but guard against overflow:
 253              */
 254             if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
 255                 throw new RuntimeException("Internal program error");
 256             }
 257             delta+=(m-n)*(handledCPCount+1);
 258             n=m;
 259 
 260             /* Encode a sequence of same code points n */
 261             for(j=0; j<srcCPCount; ++j) {
 262                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
 263                 if(q<n) {
 264                     ++delta;
 265                 } else if(q==n) {
 266                     /* Represent delta as a generalized variable-length integer: */
 267                     for(q=delta, k=BASE; /* no condition */; k+=BASE) {
 268 
 269                         /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
 270 
 271                         t=k-bias;
 272                         if(t<TMIN) {
 273                             t=TMIN;
 274                         } else if(t>TMAX) {
 275                             t=TMAX;
 276                         }
 277                         */
 278 
 279                         t=k-bias;
 280                         if(t<TMIN) {
 281                             t=TMIN;
 282                         } else if(k>=(bias+TMAX)) {
 283                             t=TMAX;
 284                         }
 285 
 286                         if(q<t) {
 287                             break;
 288                         }
 289 
 290                         if(destLength<destCapacity) {
 291                             dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
 292                         }
 293                         q=(q-t)/(BASE-t);
 294                     }
 295 
 296                     if(destLength<destCapacity) {
 297                         dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
 298                     }
 299                     bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
 300                     delta=0;
 301                     ++handledCPCount;
 302                 }
 303             }
 304 
 305             ++delta;
 306             ++n;
 307         }
 308 
 309         return result.append(dest, 0, destLength);
 310     }
 311 
 312     private static boolean isBasic(int ch){
 313         return (ch < INITIAL_N);
 314     }
 315 
 316     private static boolean isBasicUpperCase(int ch){
 317         return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
 318     }
 319 
 320     private static boolean isSurrogate(int ch){
 321         return (((ch)&0xfffff800)==0xd800);
 322     }
 323     /**
 324      * Converts Punycode to Unicode.
 325      * The Unicode string will be at most as long as the Punycode string.
 326      *
 327      * @param src
 328      * @param caseFlags
 329      * @return
 330      * @throws ParseException
 331      */
 332     public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
 333                                throws ParseException{
 334         int srcLength = src.length();
 335         StringBuffer result = new StringBuffer();
 336         int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
 337                 destCPCount, firstSupplementaryIndex, cpLength;
 338         char b;
 339         int destCapacity = MAX_CP_COUNT;
 340         char[] dest = new char[destCapacity];
 341 
 342         /*
 343          * Handle the basic code points:
 344          * Let basicLength be the number of input code points
 345          * before the last delimiter, or 0 if there is none,
 346          * then copy the first basicLength code points to the output.
 347          *
 348          * The two following loops iterate backward.
 349          */
 350         for(j=srcLength; j>0;) {
 351             if(src.charAt(--j)==DELIMITER) {
 352                 break;
 353             }
 354         }
 355         destLength=basicLength=destCPCount=j;
 356 
 357         while(j>0) {
 358             b=src.charAt(--j);
 359             if(!isBasic(b)) {
 360                 throw new ParseException("Illegal char found", -1);
 361             }
 362 
 363             if(j<destCapacity) {
 364                 dest[j]= b;
 365 
 366                 if(caseFlags!=null) {
 367                     caseFlags[j]=isBasicUpperCase(b);
 368                 }
 369             }
 370         }
 371 
 372         /* Initialize the state: */
 373         n=INITIAL_N;
 374         i=0;
 375         bias=INITIAL_BIAS;
 376         firstSupplementaryIndex=1000000000;
 377 
 378         /*
 379          * Main decoding loop:
 380          * Start just after the last delimiter if any
 381          * basic code points were copied; start at the beginning otherwise.
 382          */
 383         for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
 384             /*
 385              * in is the index of the next character to be consumed, and
 386              * destCPCount is the number of code points in the output array.
 387              *
 388              * Decode a generalized variable-length integer into delta,
 389              * which gets added to i.  The overflow checking is easier
 390              * if we increase i as we go, then subtract off its starting
 391              * value at the end to obtain delta.
 392              */
 393             for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
 394                 if(in>=srcLength) {
 395                     throw new ParseException("Illegal char found", -1);
 396                 }
 397 
 398                 digit=basicToDigit[(byte)src.charAt(in++)];
 399                 if(digit<0) {
 400                     throw new ParseException("Invalid char found", -1);
 401                 }
 402                 if(digit>(0x7fffffff-i)/w) {
 403                     /* integer overflow */
 404                     throw new ParseException("Illegal char found", -1);
 405                 }
 406 
 407                 i+=digit*w;
 408                 t=k-bias;
 409                 if(t<TMIN) {
 410                     t=TMIN;
 411                 } else if(k>=(bias+TMAX)) {
 412                     t=TMAX;
 413                 }
 414                 if(digit<t) {
 415                     break;
 416                 }
 417 
 418                 if(w>0x7fffffff/(BASE-t)) {
 419                     /* integer overflow */
 420                     throw new ParseException("Illegal char found", -1);
 421                 }
 422                 w*=BASE-t;
 423             }
 424 
 425             /*
 426              * Modification from sample code:
 427              * Increments destCPCount here,
 428              * where needed instead of in for() loop tail.
 429              */
 430             ++destCPCount;
 431             bias=adaptBias(i-oldi, destCPCount, (oldi==0));
 432 
 433             /*
 434              * i was supposed to wrap around from (incremented) destCPCount to 0,
 435              * incrementing n each time, so we'll fix that now:
 436              */
 437             if(i/destCPCount>(0x7fffffff-n)) {
 438                 /* integer overflow */
 439                 throw new ParseException("Illegal char found", -1);
 440             }
 441 
 442             n+=i/destCPCount;
 443             i%=destCPCount;
 444             /* not needed for Punycode: */
 445             /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
 446 
 447             if(n>0x10ffff || isSurrogate(n)) {
 448                 /* Unicode code point overflow */
 449                 throw new ParseException("Illegal char found", -1);
 450             }
 451 
 452             /* Insert n at position i of the output: */
 453             cpLength=UTF16.getCharCount(n);
 454             if((destLength+cpLength)<destCapacity) {
 455                 int codeUnitIndex;
 456 
 457                 /*
 458                  * Handle indexes when supplementary code points are present.
 459                  *
 460                  * In almost all cases, there will be only BMP code points before i
 461                  * and even in the entire string.
 462                  * This is handled with the same efficiency as with UTF-32.
 463                  *
 464                  * Only the rare cases with supplementary code points are handled
 465                  * more slowly - but not too bad since this is an insertion anyway.
 466                  */
 467                 if(i<=firstSupplementaryIndex) {
 468                     codeUnitIndex=i;
 469                     if(cpLength>1) {
 470                         firstSupplementaryIndex=codeUnitIndex;
 471                     } else {
 472                         ++firstSupplementaryIndex;
 473                     }
 474                 } else {
 475                     codeUnitIndex=firstSupplementaryIndex;
 476                     codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
 477                 }
 478 
 479                 /* use the UChar index codeUnitIndex instead of the code point index i */
 480                 if(codeUnitIndex<destLength) {
 481                     System.arraycopy(dest, codeUnitIndex,
 482                                      dest, codeUnitIndex+cpLength,
 483                                     (destLength-codeUnitIndex));
 484                     if(caseFlags!=null) {
 485                         System.arraycopy(caseFlags, codeUnitIndex,
 486                                          caseFlags, codeUnitIndex+cpLength,
 487                                          destLength-codeUnitIndex);
 488                     }
 489                 }
 490                 if(cpLength==1) {
 491                     /* BMP, insert one code unit */
 492                     dest[codeUnitIndex]=(char)n;
 493                 } else {
 494                     /* supplementary character, insert two code units */
 495                     dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
 496                     dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
 497                 }
 498                 if(caseFlags!=null) {
 499                     /* Case of last character determines uppercase flag: */
 500                     caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
 501                     if(cpLength==2) {
 502                         caseFlags[codeUnitIndex+1]=false;
 503                     }
 504                 }
 505             }
 506             destLength+=cpLength;
 507             ++i;
 508         }
 509         result.append(dest, 0, destLength);
 510         return result;
 511     }
 512 }