1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3 * 4 * This code is free software; you can redistribute it and/or modify it 5 * under the terms of the GNU General Public License version 2 only, as 6 * published by the Free Software Foundation. Oracle designates this 7 * particular file as subject to the "Classpath" exception as provided 8 * by Oracle in the LICENSE file that accompanied this code. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 */ 24 /* 25 /* 26 ******************************************************************************* 27 * Copyright (C) 2003-2004, International Business Machines Corporation and * 28 * others. All Rights Reserved. * 29 ******************************************************************************* 30 */ 31 // 32 // CHANGELOG 33 // 2005-05-19 Edward Wang 34 // - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java 35 // - move from package com.ibm.icu.text to package sun.net.idn 36 // - use ParseException instead of StringPrepParseException 37 // - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()' 38 // - remove all @deprecated tag to make compiler happy 39 // 2007-08-14 Martin Buchholz 40 // - remove redundant casts 41 // 42 package sun.net.idn; 43 44 import java.io.BufferedInputStream; 45 import java.io.ByteArrayInputStream; 46 import java.io.IOException; 47 import java.io.InputStream; 48 import java.text.ParseException; 49 50 import sun.text.Normalizer; 51 import sun.text.normalizer.CharTrie; 52 import sun.text.normalizer.Trie; 53 import sun.text.normalizer.NormalizerImpl; 54 import sun.text.normalizer.VersionInfo; 55 import sun.text.normalizer.UCharacter; 56 import sun.text.normalizer.UCharacterIterator; 57 import sun.text.normalizer.UTF16; 58 import sun.net.idn.UCharacterDirection; 59 import sun.net.idn.StringPrepDataReader; 60 61 /** 62 * StringPrep API implements the StingPrep framework as described by 63 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. 64 * StringPrep prepares Unicode strings for use in network protocols. 65 * Profiles of StingPrep are set of rules and data according to which the 66 * Unicode Strings are prepared. Each profiles contains tables which describe 67 * how a code point should be treated. The tables are broadly classied into 68 * <ul> 69 * <li> Unassigned Table: Contains code points that are unassigned 70 * in the Unicode Version supported by StringPrep. Currently 71 * RFC 3454 supports Unicode 3.2. </li> 72 * <li> Prohibited Table: Contains code points that are prohibted from 73 * the output of the StringPrep processing function. </li> 74 * <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li> 75 * </ul> 76 * 77 * The procedure for preparing Unicode strings: 78 * <ol> 79 * <li> Map: For each character in the input, check if it has a mapping 80 * and, if so, replace it with its mapping. </li> 81 * <li> Normalize: Possibly normalize the result of step 1 using Unicode 82 * normalization. </li> 83 * <li> Prohibit: Check for any characters that are not allowed in the 84 * output. If any are found, return an error.</li> 85 * <li> Check bidi: Possibly check for right-to-left characters, and if 86 * any are found, make sure that the whole string satisfies the 87 * requirements for bidirectional strings. If the string does not 88 * satisfy the requirements for bidirectional strings, return an 89 * error. </li> 90 * </ol> 91 * @author Ram Viswanadha 92 * @draft ICU 2.8 93 */ 94 public final class StringPrep { 95 /** 96 * Option to prohibit processing of unassigned code points in the input 97 * 98 * @see #prepare 99 * @draft ICU 2.8 100 */ 101 public static final int DEFAULT = 0x0000; 102 103 /** 104 * Option to allow processing of unassigned code points in the input 105 * 106 * @see #prepare 107 * @draft ICU 2.8 108 */ 109 public static final int ALLOW_UNASSIGNED = 0x0001; 110 111 private static final int UNASSIGNED = 0x0000; 112 private static final int MAP = 0x0001; 113 private static final int PROHIBITED = 0x0002; 114 private static final int DELETE = 0x0003; 115 private static final int TYPE_LIMIT = 0x0004; 116 117 private static final int NORMALIZATION_ON = 0x0001; 118 private static final int CHECK_BIDI_ON = 0x0002; 119 120 private static final int TYPE_THRESHOLD = 0xFFF0; 121 private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/ 122 private static final int MAX_INDEX_TOP_LENGTH = 0x0003; 123 124 /* indexes[] value names */ 125 private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ 126 private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */ 127 private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */ 128 private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */ 129 private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */ 130 private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; 131 private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; 132 private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */ 133 private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */ 134 135 136 /** 137 * Default buffer size of datafile 138 */ 139 private static final int DATA_BUFFER_SIZE = 25000; 140 141 /* Wrappers for Trie implementations */ 142 private static final class StringPrepTrieImpl implements Trie.DataManipulate{ 143 private CharTrie sprepTrie = null; 144 /** 145 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 146 * data the index array offset of the indexes for that lead surrogate. 147 * @param property data value for a surrogate from the trie, including 148 * the folding offset 149 * @return data offset or 0 if there is no data for the lead surrogate 150 */ 151 public int getFoldingOffset(int value){ 152 return value; 153 } 154 } 155 156 // CharTrie implementation for reading the trie data 157 private StringPrepTrieImpl sprepTrieImpl; 158 // Indexes read from the data file 159 private int[] indexes; 160 // mapping data read from the data file 161 private char[] mappingData; 162 // format version of the data file 163 private byte[] formatVersion; 164 // the version of Unicode supported by the data file 165 private VersionInfo sprepUniVer; 166 // the Unicode version of last entry in the 167 // NormalizationCorrections.txt file if normalization 168 // is turned on 169 private VersionInfo normCorrVer; 170 // Option to turn on Normalization 171 private boolean doNFKC; 172 // Option to turn on checking for BiDi rules 173 private boolean checkBiDi; 174 175 176 private char getCodePointValue(int ch){ 177 return sprepTrieImpl.sprepTrie.getCodePointValue(ch); 178 } 179 180 private static VersionInfo getVersionInfo(int comp){ 181 int micro = comp & 0xFF; 182 int milli =(comp >> 8) & 0xFF; 183 int minor =(comp >> 16) & 0xFF; 184 int major =(comp >> 24) & 0xFF; 185 return VersionInfo.getInstance(major,minor,milli,micro); 186 } 187 private static VersionInfo getVersionInfo(byte[] version){ 188 if(version.length != 4){ 189 return null; 190 } 191 return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]); 192 } 193 /** 194 * Creates an StringPrep object after reading the input stream. 195 * The object does not hold a reference to the input steam, so the stream can be 196 * closed after the method returns. 197 * 198 * @param inputStream The stream for reading the StringPrep profile binarySun 199 * @throws IOException 200 * @draft ICU 2.8 201 */ 202 public StringPrep(InputStream inputStream) throws IOException{ 203 204 BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE); 205 206 StringPrepDataReader reader = new StringPrepDataReader(b); 207 208 // read the indexes 209 indexes = reader.readIndexes(INDEX_TOP); 210 211 byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]]; 212 213 214 //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes 215 mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2]; 216 // load the rest of the data data and initialize the data members 217 reader.read(sprepBytes,mappingData); 218 219 sprepTrieImpl = new StringPrepTrieImpl(); 220 sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl ); 221 222 // get the data format version 223 formatVersion = reader.getDataFormatVersion(); 224 225 // get the options 226 doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0); 227 checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); 228 sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); 229 normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); 230 VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion(); 231 if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ 232 normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ 233 ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ 234 ){ 235 throw new IOException("Normalization Correction version not supported"); 236 } 237 b.close(); 238 } 239 240 private static final class Values{ 241 boolean isIndex; 242 int value; 243 int type; 244 public void reset(){ 245 isIndex = false; 246 value = 0; 247 type = -1; 248 } 249 } 250 251 private static final void getValues(char trieWord,Values values){ 252 values.reset(); 253 if(trieWord == 0){ 254 /* 255 * Initial value stored in the mapping table 256 * just return TYPE_LIMIT .. so that 257 * the source codepoint is copied to the destination 258 */ 259 values.type = TYPE_LIMIT; 260 }else if(trieWord >= TYPE_THRESHOLD){ 261 values.type = (trieWord - TYPE_THRESHOLD); 262 }else{ 263 /* get the type */ 264 values.type = MAP; 265 /* ascertain if the value is index or delta */ 266 if((trieWord & 0x02)>0){ 267 values.isIndex = true; 268 values.value = trieWord >> 2; //mask off the lower 2 bits and shift 269 270 }else{ 271 values.isIndex = false; 272 values.value = (trieWord<<16)>>16; 273 values.value = (values.value >> 2); 274 275 } 276 277 if((trieWord>>2) == MAX_INDEX_VALUE){ 278 values.type = DELETE; 279 values.isIndex = false; 280 values.value = 0; 281 } 282 } 283 } 284 285 286 287 private StringBuffer map( UCharacterIterator iter, int options) 288 throws ParseException { 289 290 Values val = new Values(); 291 char result = 0; 292 int ch = UCharacterIterator.DONE; 293 StringBuffer dest = new StringBuffer(); 294 boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0); 295 296 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 297 298 result = getCodePointValue(ch); 299 getValues(result,val); 300 301 // check if the source codepoint is unassigned 302 if(val.type == UNASSIGNED && allowUnassigned == false){ 303 throw new ParseException("An unassigned code point was found in the input " + 304 iter.getText(), iter.getIndex()); 305 }else if((val.type == MAP)){ 306 int index, length; 307 308 if(val.isIndex){ 309 index = val.value; 310 if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] && 311 index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){ 312 length = 1; 313 }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] && 314 index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){ 315 length = 2; 316 }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] && 317 index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){ 318 length = 3; 319 }else{ 320 length = mappingData[index++]; 321 } 322 /* copy mapping to destination */ 323 dest.append(mappingData,index,length); 324 continue; 325 326 }else{ 327 ch -= val.value; 328 } 329 }else if(val.type == DELETE){ 330 // just consume the codepoint and contine 331 continue; 332 } 333 //copy the source into destination 334 UTF16.append(dest,ch); 335 } 336 337 return dest; 338 } 339 340 341 private StringBuffer normalize(StringBuffer src){ 342 /* 343 * Option UNORM_BEFORE_PRI_29: 344 * 345 * IDNA as interpreted by IETF members (see unicode mailing list 2004H1) 346 * requires strict adherence to Unicode 3.2 normalization, 347 * including buggy composition from before fixing Public Review Issue #29. 348 * Note that this results in some valid but nonsensical text to be 349 * either corrupted or rejected, depending on the text. 350 * See http://www.unicode.org/review/resolved-pri.html#pri29 351 * See unorm.cpp and cnormtst.c 352 */ 353 return new StringBuffer( 354 Normalizer.normalize( 355 src.toString(), 356 java.text.Normalizer.Form.NFKC, 357 Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29)); 358 } 359 /* 360 boolean isLabelSeparator(int ch){ 361 int result = getCodePointValue(ch); 362 if( (result & 0x07) == LABEL_SEPARATOR){ 363 return true; 364 } 365 return false; 366 } 367 */ 368 /* 369 1) Map -- For each character in the input, check if it has a mapping 370 and, if so, replace it with its mapping. 371 372 2) Normalize -- Possibly normalize the result of step 1 using Unicode 373 normalization. 374 375 3) Prohibit -- Check for any characters that are not allowed in the 376 output. If any are found, return an error. 377 378 4) Check bidi -- Possibly check for right-to-left characters, and if 379 any are found, make sure that the whole string satisfies the 380 requirements for bidirectional strings. If the string does not 381 satisfy the requirements for bidirectional strings, return an 382 error. 383 [Unicode3.2] defines several bidirectional categories; each character 384 has one bidirectional category assigned to it. For the purposes of 385 the requirements below, an "RandALCat character" is a character that 386 has Unicode bidirectional categories "R" or "AL"; an "LCat character" 387 is a character that has Unicode bidirectional category "L". Note 388 389 390 that there are many characters which fall in neither of the above 391 definitions; Latin digits (<U+0030> through <U+0039>) are examples of 392 this because they have bidirectional category "EN". 393 394 In any profile that specifies bidirectional character handling, all 395 three of the following requirements MUST be met: 396 397 1) The characters in section 5.8 MUST be prohibited. 398 399 2) If a string contains any RandALCat character, the string MUST NOT 400 contain any LCat character. 401 402 3) If a string contains any RandALCat character, a RandALCat 403 character MUST be the first character of the string, and a 404 RandALCat character MUST be the last character of the string. 405 */ 406 /** 407 * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), 408 * checks for prohited and BiDi characters in the order defined by RFC 3454 409 * depending on the options specified in the profile. 410 * 411 * @param src A UCharacterIterator object containing the source string 412 * @param options A bit set of options: 413 * 414 * - StringPrep.NONE Prohibit processing of unassigned code points in the input 415 * 416 * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input 417 * as normal Unicode code points. 418 * 419 * @return StringBuffer A StringBuffer containing the output 420 * @throws ParseException 421 * @draft ICU 2.8 422 */ 423 public StringBuffer prepare(UCharacterIterator src, int options) 424 throws ParseException{ 425 426 // map 427 StringBuffer mapOut = map(src,options); 428 StringBuffer normOut = mapOut;// initialize 429 430 if(doNFKC){ 431 // normalize 432 normOut = normalize(mapOut); 433 } 434 435 int ch; 436 char result; 437 UCharacterIterator iter = UCharacterIterator.getInstance(normOut); 438 Values val = new Values(); 439 int direction=UCharacterDirection.CHAR_DIRECTION_COUNT, 440 firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT; 441 int rtlPos=-1, ltrPos=-1; 442 boolean rightToLeft=false, leftToRight=false; 443 444 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 445 result = getCodePointValue(ch); 446 getValues(result,val); 447 448 if(val.type == PROHIBITED ){ 449 throw new ParseException("A prohibited code point was found in the input" + 450 iter.getText(), val.value); 451 } 452 453 direction = UCharacter.getDirection(ch); 454 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){ 455 firstCharDir = direction; 456 } 457 if(direction == UCharacterDirection.LEFT_TO_RIGHT){ 458 leftToRight = true; 459 ltrPos = iter.getIndex()-1; 460 } 461 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){ 462 rightToLeft = true; 463 rtlPos = iter.getIndex()-1; 464 } 465 } 466 if(checkBiDi == true){ 467 // satisfy 2 468 if( leftToRight == true && rightToLeft == true){ 469 throw new ParseException("The input does not conform to the rules for BiDi code points." + 470 iter.getText(), 471 (rtlPos>ltrPos) ? rtlPos : ltrPos); 472 } 473 474 //satisfy 3 475 if( rightToLeft == true && 476 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && 477 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)) 478 ){ 479 throw new ParseException("The input does not conform to the rules for BiDi code points." + 480 iter.getText(), 481 (rtlPos>ltrPos) ? rtlPos : ltrPos); 482 } 483 } 484 return normOut; 485 486 } 487 }