/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 2003-2004, International Business Machines Corporation and
 * others. All Rights Reserved.
*
 *******************************************************************************
 */
//
// CHANGELOG
//      2005-05-19 Edward Wang
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
//          - move from package com.ibm.icu.text to package sun.net.idn
//          - use ParseException instead of StringPrepParseException
//      2007-08-14 Martin Buchholz
//          - remove redundant casts
//
package jdk.internal.icu.impl;

import java.text.ParseException;
import jdk.internal.icu.lang.UCharacter;
import jdk.internal.icu.text.UTF16;

/**
 * Punycode (Bootstring) encoder and decoder working on UTF-16 text.
 * Ported code from ICU punycode.c
 *
 * @author ram
 */
public final class Punycode {

    /* Punycode parameters for Bootstring */
    private static final int BASE           = 36;
    private static final int TMIN           = 1;
    private static final int TMAX           = 26;
    private static final int SKEW           = 38;
    private static final int DAMP           = 700;
    private static final int INITIAL_BIAS   = 72;
    private static final int INITIAL_N      = 0x80;

    /* "Basic" Unicode/ASCII code points */
    private static final int HYPHEN         = 0x2d;
    private static final int DELIMITER      = HYPHEN;

    private static final int ZERO           = 0x30;
    private static final int NINE           = 0x39;

    private static final int SMALL_A        = 0x61;
    private static final int SMALL_Z        = 0x7a;

    private static final int CAPITAL_A      = 0x41;
    private static final int CAPITAL_Z      = 0x5a;

    // TODO: eliminate the 256 limitation (it still bounds the number of
    // input code points accepted by encode())
    private static final int MAX_CP_COUNT   = 256;

    private static final int UINT_MAGIC     = 0x80000000;
    private static final long ULONG_MAGIC   = 0x8000000000000000L;

    /**
     * Bias adaptation function of the Bootstring algorithm.
     *
     * @param delta     the delta that was just encoded or decoded
     * @param length    the number of code points handled so far, including
     *                  the current one (must be positive)
     * @param firstTime true for the first delta of a string, which is
     *                  damped by DAMP instead of being halved
     * @return the new bias
     */
    private static int adaptBias(int delta, int length, boolean firstTime){
        if(firstTime){
            delta /= DAMP;
        }else{
            delta /= 2;
        }
        delta += delta/length;

        int count=0;
        for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
            delta/=(BASE-TMIN);
        }

        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    }

    /**
     * basicToDigit[] contains the numeric value of a basic code
     * point (for use in representing integers) in the range 0 to
     * BASE-1, or -1 if the code point does not represent a value.
     * Letters map to 0..25 regardless of case, digits '0'..'9' map to 26..35.
     */
    static final int[] basicToDigit= new int[]{
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    /**
     * Forces the case of an ASCII letter; any other char is returned as is.
     *
     * @param b         the char to map
     * @param uppercase true to map a..z to A..Z, false to map A..Z to a..z
     * @return the case-mapped char
     */
    private static char asciiCaseMap(char b, boolean uppercase) {
        if(uppercase) {
            if(SMALL_A<=b && b<=SMALL_Z) {
                b-=(SMALL_A-CAPITAL_A);
            }
        } else {
            if(CAPITAL_A<=b && b<=CAPITAL_Z) {
                b+=(SMALL_A-CAPITAL_A);
            }
        }
        return b;
    }

    /**
     * digitToBasic() returns the basic code point whose value
     * (when used for representing integers) is d, which must be in the
     * range 0 to BASE-1. The lowercase form is used unless the uppercase flag
     * is true, in which case the uppercase form is used.
     */
    private static char digitToBasic(int digit, boolean uppercase) {
        /*  0..25 map to ASCII a..z or A..Z */
        /* 26..35 map to ASCII 0..9         */
        if(digit<26) {
            if(uppercase) {
                return (char)(CAPITAL_A+digit);
            } else {
                return (char)(SMALL_A+digit);
            }
        } else {
            return (char)((ZERO-26)+digit);
        }
    }

    /**
     * Converts Unicode to Punycode.
     * The input string must not contain single, unpaired surrogates.
     * The output consists of ASCII code points only.
     *
     * @param src       the UTF-16 text to encode; at most 256 code points
     * @param caseFlags may be null; otherwise indexed by char position in
     *                  {@code src} and at least as long as {@code src}.
     *                  A basic (ASCII) char is upper-cased when its flag is
     *                  true and lower-cased when it is false; for a non-basic
     *                  code point the flag selects the case of the last digit
     *                  that encodes it.
     * @return a new StringBuffer holding the Punycode form of {@code src}
     * @throws ParseException if {@code src} contains an unpaired surrogate
     * @throws IndexOutOfBoundsException if {@code src} holds more than 256
     *         code points
     * @throws RuntimeException if the delta computation would overflow
     */
    public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{

        int[] cpBuffer = new int[MAX_CP_COUNT];
        int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
        char c, c2;
        int srcLength = src.length();
        /*
         * The output is collected in a growable buffer: up to MAX_CP_COUNT
         * input code points can need more than MAX_CP_COUNT output chars, and
         * a fixed-size array would silently truncate the result.
         */
        StringBuffer dest = new StringBuffer(MAX_CP_COUNT);

        /*
         * Handle the basic code points and
         * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
         */
        srcCPCount=0;

        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                throw new IndexOutOfBoundsException();
            }
            c=src.charAt(j);
            if(isBasic(c)) {
                /* basic code points are recorded as 0 so that they are always < n */
                cpBuffer[srcCPCount++]=0;
                dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c);
            } else {
                n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31;
                if(!UTF16.isSurrogate(c)) {
                    n|=c;
                } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
                    ++j;

                    n|=UCharacter.getCodePoint(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    throw new ParseException("Illegal char found", -1);
                }
                cpBuffer[srcCPCount++]=n;
            }
        }

        /* Finish the basic string - if it is not empty - with a delimiter. */
        basicLength=dest.length();
        if(basicLength>0) {
            /* the cast matters: append(int) would append the decimal text "45" */
            dest.append((char)DELIMITER);
        }

        /*
         * handledCPCount is the number of code points that have been handled
         * basicLength is the number of basic code points
         */

        /* Initialize the state: */
        n=INITIAL_N;
        delta=0;
        bias=INITIAL_BIAS;

        /* Main encoding loop: */
        for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
            /*
             * All non-basic code points < n have been handled already.
             * Find the next larger one:
             */
            for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
                if(n<=q && q<m) {
                    m=q;
                }
            }

            /*
             * Increase delta enough to advance the decoder's
             * <n,i> state to <m,0>, but guard against overflow:
             */
            if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
                throw new RuntimeException("Internal program error");
            }
            delta+=(m-n)*(handledCPCount+1);
            n=m;

            /* Encode a sequence of same code points n */
            for(j=0; j<srcCPCount; ++j) {
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
                if(q<n) {
                    ++delta;
                } else if(q==n) {
                    /* Represent delta as a generalized variable-length integer: */
                    for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                        /*
                         * Threshold clamped to [TMIN, TMAX], written in the
                         * form used by draft-ietf-idn-punycode-03.txt.
                         */
                        t=k-bias;
                        if(t<TMIN) {
                            t=TMIN;
                        } else if(k>=(bias+TMAX)) {
                            t=TMAX;
                        }

                        if(q<t) {
                            break;
                        }

                        dest.append(digitToBasic(t+(q-t)%(BASE-t), false));
                        q=(q-t)/(BASE-t);
                    }

                    /* last digit; its case carries the case flag of the code point */
                    dest.append(digitToBasic(q, (cpBuffer[j]<0)));
                    bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
                    delta=0;
                    ++handledCPCount;
                }
            }

            ++delta;
            ++n;
        }

        return dest;
    }

    /** Returns true if ch is a basic (ASCII) code point, i.e. below 0x80. */
    private static boolean isBasic(int ch){
        return (ch < INITIAL_N);
    }

    /** Returns true if ch is an ASCII uppercase letter A..Z. */
    private static boolean isBasicUpperCase(int ch){
        return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
    }

    /** Returns true if ch is a surrogate code point (U+D800..U+DFFF). */
    private static boolean isSurrogate(int ch){
        return (((ch)&0xfffff800)==0xd800);
    }

    /**
     * Converts Punycode to Unicode.
     * The Unicode string will have at most as many code points as the
     * Punycode string has chars.
     *
     * @param src       the Punycode text to decode; must be ASCII only
     * @param caseFlags may be null; otherwise it receives, per output char,
     *                  whether the input char that determined it was an
     *                  uppercase ASCII letter. It must be large enough for
     *                  the decoded output.
     * @return a new StringBuffer holding the decoded UTF-16 text
     * @throws ParseException if {@code src} contains a non-ASCII char or a
     *         char that is not a valid digit, is truncated, or decodes to a
     *         value that overflows or is not a valid Unicode scalar value
     */
    public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
                               throws ParseException{
        int srcLength = src.length();
        StringBuffer result = new StringBuffer();
        int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
                destCPCount, firstSupplementaryIndex, cpLength;
        char b;
        /*
         * Every input char yields at most one output code point (a copied
         * basic char, or at least one digit per decoded code point) and a code
         * point takes at most two chars, so 2*srcLength chars always suffice.
         * A fixed 256-char buffer silently dropped code points near its end.
         */
        char[] dest = new char[Math.max(MAX_CP_COUNT, 2*srcLength)];

        /*
         * Handle the basic code points:
         * Let basicLength be the number of input code points
         * before the last delimiter, or 0 if there is none,
         * then copy the first basicLength code points to the output.
         *
         * The two following loops iterate backward.
         */
        for(j=srcLength; j>0;) {
            if(src.charAt(--j)==DELIMITER) {
                break;
            }
        }
        destLength=basicLength=destCPCount=j;

        while(j>0) {
            b=src.charAt(--j);
            if(!isBasic(b)) {
                throw new ParseException("Illegal char found", -1);
            }

            dest[j]= b;

            if(caseFlags!=null) {
                caseFlags[j]=isBasicUpperCase(b);
            }
        }

        /* Initialize the state: */
        n=INITIAL_N;
        i=0;
        bias=INITIAL_BIAS;
        firstSupplementaryIndex=1000000000;

        /*
         * Main decoding loop:
         * Start just after the last delimiter if any
         * basic code points were copied; start at the beginning otherwise.
         */
        for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
            /*
             * in is the index of the next character to be consumed, and
             * destCPCount is the number of code points in the output array.
             *
             * Decode a generalized variable-length integer into delta,
             * which gets added to i.  The overflow checking is easier
             * if we increase i as we go, then subtract off its starting
             * value at the end to obtain delta.
             */
            for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
                if(in>=srcLength) {
                    throw new ParseException("Illegal char found", -1);
                }

                /*
                 * Only basic (ASCII) chars can be digits. Narrowing the char
                 * to a byte would index the table with a negative value or
                 * alias a non-ASCII char to the ASCII char sharing its low byte.
                 */
                b=src.charAt(in++);
                digit=isBasic(b) ? basicToDigit[b] : -1;
                if(digit<0) {
                    throw new ParseException("Invalid char found", -1);
                }
                if(digit>(0x7fffffff-i)/w) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }

                i+=digit*w;
                t=k-bias;
                if(t<TMIN) {
                    t=TMIN;
                } else if(k>=(bias+TMAX)) {
                    t=TMAX;
                }
                if(digit<t) {
                    break;
                }

                if(w>0x7fffffff/(BASE-t)) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }
                w*=BASE-t;
            }

            /*
             * Modification from sample code:
             * Increments destCPCount here,
             * where needed instead of in for() loop tail.
             */
            ++destCPCount;
            bias=adaptBias(i-oldi, destCPCount, (oldi==0));

            /*
             * i was supposed to wrap around from (incremented) destCPCount to 0,
             * incrementing n each time, so we'll fix that now:
             */
            if(i/destCPCount>(0x7fffffff-n)) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }

            n+=i/destCPCount;
            i%=destCPCount;
            /* not needed for Punycode: */
            /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

            if(n>0x10ffff || isSurrogate(n)) {
                /* Unicode code point overflow */
                throw new ParseException("Illegal char found", -1);
            }

            /* Insert n at position i of the output: */
            cpLength=UTF16.getCharCount(n);
            int codeUnitIndex;

            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
            }

            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                /* open a gap of cpLength chars for the new code point */
                System.arraycopy(dest, codeUnitIndex,
                                 dest, codeUnitIndex+cpLength,
                                (destLength-codeUnitIndex));
                if(caseFlags!=null) {
                    System.arraycopy(caseFlags, codeUnitIndex,
                                     caseFlags, codeUnitIndex+cpLength,
                                     destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(char)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
                dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
            }
            if(caseFlags!=null) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=false;
                }
            }
            destLength+=cpLength;
            ++i;
        }
        result.append(dest, 0, destLength);
        return result;
    }
}