1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3 * 4 * This code is free software; you can redistribute it and/or modify it 5 * under the terms of the GNU General Public License version 2 only, as 6 * published by the Free Software Foundation. Oracle designates this 7 * particular file as subject to the "Classpath" exception as provided 8 * by Oracle in the LICENSE file that accompanied this code. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 */ 24 /* 25 /* 26 ******************************************************************************* 27 * Copyright (C) 2003-2004, International Business Machines Corporation and * 28 * others. All Rights Reserved. * 29 ******************************************************************************* 30 */ 31 // 32 // CHANGELOG 33 // 2005-05-19 Edward Wang 34 // - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java 35 // - move from package com.ibm.icu.text to package sun.net.idn 36 // - use ParseException instead of StringPrepParseException 37 // - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()' 38 // - remove all @deprecated tag to make compiler happy 39 // 2007-08-14 Martin Buchholz 40 // - remove redundant casts 41 // 42 package sun.net.idn; 43 44 import java.io.BufferedInputStream; 45 import java.io.ByteArrayInputStream; 46 import java.io.IOException; 47 import java.io.InputStream; 48 import java.text.ParseException; 49 50 import sun.text.Normalizer; 51 import sun.text.normalizer.CharTrie; 52 import sun.text.normalizer.Trie; 53 import sun.text.normalizer.VersionInfo; 54 import sun.text.normalizer.UCharacter; 55 import sun.text.normalizer.UCharacterIterator; 56 import sun.text.normalizer.UTF16; 57 import sun.net.idn.UCharacterDirection; 58 import sun.net.idn.StringPrepDataReader; 59 60 /** 61 * StringPrep API implements the StingPrep framework as described by 62 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. 63 * StringPrep prepares Unicode strings for use in network protocols. 64 * Profiles of StingPrep are set of rules and data according to which the 65 * Unicode Strings are prepared. Each profiles contains tables which describe 66 * how a code point should be treated. The tables are broadly classied into 67 * <ul> 68 * <li> Unassigned Table: Contains code points that are unassigned 69 * in the Unicode Version supported by StringPrep. Currently 70 * RFC 3454 supports Unicode 3.2. </li> 71 * <li> Prohibited Table: Contains code points that are prohibted from 72 * the output of the StringPrep processing function. </li> 73 * <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li> 74 * </ul> 75 * 76 * The procedure for preparing Unicode strings: 77 * <ol> 78 * <li> Map: For each character in the input, check if it has a mapping 79 * and, if so, replace it with its mapping. </li> 80 * <li> Normalize: Possibly normalize the result of step 1 using Unicode 81 * normalization. </li> 82 * <li> Prohibit: Check for any characters that are not allowed in the 83 * output. If any are found, return an error.</li> 84 * <li> Check bidi: Possibly check for right-to-left characters, and if 85 * any are found, make sure that the whole string satisfies the 86 * requirements for bidirectional strings. If the string does not 87 * satisfy the requirements for bidirectional strings, return an 88 * error. </li> 89 * </ol> 90 * @author Ram Viswanadha 91 * @draft ICU 2.8 92 */ 93 public final class StringPrep { 94 /** 95 * Option to prohibit processing of unassigned code points in the input 96 * 97 * @see #prepare 98 * @draft ICU 2.8 99 */ 100 public static final int DEFAULT = 0x0000; 101 102 /** 103 * Option to allow processing of unassigned code points in the input 104 * 105 * @see #prepare 106 * @draft ICU 2.8 107 */ 108 public static final int ALLOW_UNASSIGNED = 0x0001; 109 110 private static final int UNASSIGNED = 0x0000; 111 private static final int MAP = 0x0001; 112 private static final int PROHIBITED = 0x0002; 113 private static final int DELETE = 0x0003; 114 private static final int TYPE_LIMIT = 0x0004; 115 116 private static final int NORMALIZATION_ON = 0x0001; 117 private static final int CHECK_BIDI_ON = 0x0002; 118 119 private static final int TYPE_THRESHOLD = 0xFFF0; 120 private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/ 121 private static final int MAX_INDEX_TOP_LENGTH = 0x0003; 122 123 /* indexes[] value names */ 124 private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ 125 private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */ 126 private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */ 127 private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */ 128 private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */ 129 private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; 130 private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; 131 private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */ 132 private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */ 133 134 135 /** 136 * Default buffer size of datafile 137 */ 138 private static final int DATA_BUFFER_SIZE = 25000; 139 140 /* Wrappers for Trie implementations */ 141 private static final class StringPrepTrieImpl implements Trie.DataManipulate{ 142 private CharTrie sprepTrie = null; 143 /** 144 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's 145 * data the index array offset of the indexes for that lead surrogate. 146 * @param property data value for a surrogate from the trie, including 147 * the folding offset 148 * @return data offset or 0 if there is no data for the lead surrogate 149 */ 150 public int getFoldingOffset(int value){ 151 return value; 152 } 153 } 154 155 // CharTrie implementation for reading the trie data 156 private StringPrepTrieImpl sprepTrieImpl; 157 // Indexes read from the data file 158 private int[] indexes; 159 // mapping data read from the data file 160 private char[] mappingData; 161 // format version of the data file 162 private byte[] formatVersion; 163 // the version of Unicode supported by the data file 164 private VersionInfo sprepUniVer; 165 // the Unicode version of last entry in the 166 // NormalizationCorrections.txt file if normalization 167 // is turned on 168 private VersionInfo normCorrVer; 169 // Option to turn on Normalization 170 private boolean doNFKC; 171 // Option to turn on checking for BiDi rules 172 private boolean checkBiDi; 173 174 175 private char getCodePointValue(int ch){ 176 return sprepTrieImpl.sprepTrie.getCodePointValue(ch); 177 } 178 179 private static VersionInfo getVersionInfo(int comp){ 180 int micro = comp & 0xFF; 181 int milli =(comp >> 8) & 0xFF; 182 int minor =(comp >> 16) & 0xFF; 183 int major =(comp >> 24) & 0xFF; 184 return VersionInfo.getInstance(major,minor,milli,micro); 185 } 186 private static VersionInfo getVersionInfo(byte[] version){ 187 if(version.length != 4){ 188 return null; 189 } 190 return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]); 191 } 192 /** 193 * Creates an StringPrep object after reading the input stream. 194 * The object does not hold a reference to the input steam, so the stream can be 195 * closed after the method returns. 196 * 197 * @param inputStream The stream for reading the StringPrep profile binarySun 198 * @throws IOException 199 * @draft ICU 2.8 200 */ 201 public StringPrep(InputStream inputStream) throws IOException{ 202 203 BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE); 204 205 StringPrepDataReader reader = new StringPrepDataReader(b); 206 207 // read the indexes 208 indexes = reader.readIndexes(INDEX_TOP); 209 210 byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]]; 211 212 213 //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes 214 mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2]; 215 // load the rest of the data data and initialize the data members 216 reader.read(sprepBytes,mappingData); 217 218 sprepTrieImpl = new StringPrepTrieImpl(); 219 sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl ); 220 221 // get the data format version 222 formatVersion = reader.getDataFormatVersion(); 223 224 // get the options 225 doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0); 226 checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); 227 sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); 228 normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); 229 VersionInfo normUniVer = UCharacter.getUnicodeVersion(); 230 if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ 231 normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ 232 ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ 233 ){ 234 throw new IOException("Normalization Correction version not supported"); 235 } 236 b.close(); 237 } 238 239 private static final class Values{ 240 boolean isIndex; 241 int value; 242 int type; 243 public void reset(){ 244 isIndex = false; 245 value = 0; 246 type = -1; 247 } 248 } 249 250 private static final void getValues(char trieWord,Values values){ 251 values.reset(); 252 if(trieWord == 0){ 253 /* 254 * Initial value stored in the mapping table 255 * just return TYPE_LIMIT .. so that 256 * the source codepoint is copied to the destination 257 */ 258 values.type = TYPE_LIMIT; 259 }else if(trieWord >= TYPE_THRESHOLD){ 260 values.type = (trieWord - TYPE_THRESHOLD); 261 }else{ 262 /* get the type */ 263 values.type = MAP; 264 /* ascertain if the value is index or delta */ 265 if((trieWord & 0x02)>0){ 266 values.isIndex = true; 267 values.value = trieWord >> 2; //mask off the lower 2 bits and shift 268 269 }else{ 270 values.isIndex = false; 271 values.value = (trieWord<<16)>>16; 272 values.value = (values.value >> 2); 273 274 } 275 276 if((trieWord>>2) == MAX_INDEX_VALUE){ 277 values.type = DELETE; 278 values.isIndex = false; 279 values.value = 0; 280 } 281 } 282 } 283 284 285 286 private StringBuffer map( UCharacterIterator iter, int options) 287 throws ParseException { 288 289 Values val = new Values(); 290 char result = 0; 291 int ch = UCharacterIterator.DONE; 292 StringBuffer dest = new StringBuffer(); 293 boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0); 294 295 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 296 297 result = getCodePointValue(ch); 298 getValues(result,val); 299 300 // check if the source codepoint is unassigned 301 if(val.type == UNASSIGNED && allowUnassigned == false){ 302 throw new ParseException("An unassigned code point was found in the input " + 303 iter.getText(), iter.getIndex()); 304 }else if((val.type == MAP)){ 305 int index, length; 306 307 if(val.isIndex){ 308 index = val.value; 309 if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] && 310 index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){ 311 length = 1; 312 }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] && 313 index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){ 314 length = 2; 315 }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] && 316 index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){ 317 length = 3; 318 }else{ 319 length = mappingData[index++]; 320 } 321 /* copy mapping to destination */ 322 dest.append(mappingData,index,length); 323 continue; 324 325 }else{ 326 ch -= val.value; 327 } 328 }else if(val.type == DELETE){ 329 // just consume the codepoint and contine 330 continue; 331 } 332 //copy the source into destination 333 UTF16.append(dest,ch); 334 } 335 336 return dest; 337 } 338 339 340 private StringBuffer normalize(StringBuffer src){ 341 /* 342 * Option UNORM_BEFORE_PRI_29: 343 * 344 * IDNA as interpreted by IETF members (see unicode mailing list 2004H1) 345 * requires strict adherence to Unicode 3.2 normalization, 346 * including buggy composition from before fixing Public Review Issue #29. 347 * Note that this results in some valid but nonsensical text to be 348 * either corrupted or rejected, depending on the text. 349 * See http://www.unicode.org/review/resolved-pri.html#pri29 350 * See unorm.cpp and cnormtst.c 351 */ 352 return new StringBuffer( 353 Normalizer.normalize( 354 src.toString(), 355 java.text.Normalizer.Form.NFKC, 356 Normalizer.UNICODE_3_2)); 357 } 358 /* 359 boolean isLabelSeparator(int ch){ 360 int result = getCodePointValue(ch); 361 if( (result & 0x07) == LABEL_SEPARATOR){ 362 return true; 363 } 364 return false; 365 } 366 */ 367 /* 368 1) Map -- For each character in the input, check if it has a mapping 369 and, if so, replace it with its mapping. 370 371 2) Normalize -- Possibly normalize the result of step 1 using Unicode 372 normalization. 373 374 3) Prohibit -- Check for any characters that are not allowed in the 375 output. If any are found, return an error. 376 377 4) Check bidi -- Possibly check for right-to-left characters, and if 378 any are found, make sure that the whole string satisfies the 379 requirements for bidirectional strings. If the string does not 380 satisfy the requirements for bidirectional strings, return an 381 error. 382 [Unicode3.2] defines several bidirectional categories; each character 383 has one bidirectional category assigned to it. For the purposes of 384 the requirements below, an "RandALCat character" is a character that 385 has Unicode bidirectional categories "R" or "AL"; an "LCat character" 386 is a character that has Unicode bidirectional category "L". Note 387 388 389 that there are many characters which fall in neither of the above 390 definitions; Latin digits (<U+0030> through <U+0039>) are examples of 391 this because they have bidirectional category "EN". 392 393 In any profile that specifies bidirectional character handling, all 394 three of the following requirements MUST be met: 395 396 1) The characters in section 5.8 MUST be prohibited. 397 398 2) If a string contains any RandALCat character, the string MUST NOT 399 contain any LCat character. 400 401 3) If a string contains any RandALCat character, a RandALCat 402 character MUST be the first character of the string, and a 403 RandALCat character MUST be the last character of the string. 404 */ 405 /** 406 * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), 407 * checks for prohited and BiDi characters in the order defined by RFC 3454 408 * depending on the options specified in the profile. 409 * 410 * @param src A UCharacterIterator object containing the source string 411 * @param options A bit set of options: 412 * 413 * - StringPrep.NONE Prohibit processing of unassigned code points in the input 414 * 415 * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input 416 * as normal Unicode code points. 417 * 418 * @return StringBuffer A StringBuffer containing the output 419 * @throws ParseException 420 * @draft ICU 2.8 421 */ 422 public StringBuffer prepare(UCharacterIterator src, int options) 423 throws ParseException{ 424 425 // map 426 StringBuffer mapOut = map(src,options); 427 StringBuffer normOut = mapOut;// initialize 428 429 if(doNFKC){ 430 // normalize 431 normOut = normalize(mapOut); 432 } 433 434 int ch; 435 char result; 436 UCharacterIterator iter = UCharacterIterator.getInstance(normOut); 437 Values val = new Values(); 438 int direction=UCharacterDirection.CHAR_DIRECTION_COUNT, 439 firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT; 440 int rtlPos=-1, ltrPos=-1; 441 boolean rightToLeft=false, leftToRight=false; 442 443 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 444 result = getCodePointValue(ch); 445 getValues(result,val); 446 447 if(val.type == PROHIBITED ){ 448 throw new ParseException("A prohibited code point was found in the input" + 449 iter.getText(), val.value); 450 } 451 452 direction = UCharacter.getDirection(ch); 453 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){ 454 firstCharDir = direction; 455 } 456 if(direction == UCharacterDirection.LEFT_TO_RIGHT){ 457 leftToRight = true; 458 ltrPos = iter.getIndex()-1; 459 } 460 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){ 461 rightToLeft = true; 462 rtlPos = iter.getIndex()-1; 463 } 464 } 465 if(checkBiDi == true){ 466 // satisfy 2 467 if( leftToRight == true && rightToLeft == true){ 468 throw new ParseException("The input does not conform to the rules for BiDi code points." + 469 iter.getText(), 470 (rtlPos>ltrPos) ? rtlPos : ltrPos); 471 } 472 473 //satisfy 3 474 if( rightToLeft == true && 475 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && 476 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)) 477 ){ 478 throw new ParseException("The input does not conform to the rules for BiDi code points." + 479 iter.getText(), 480 (rtlPos>ltrPos) ? rtlPos : ltrPos); 481 } 482 } 483 return normOut; 484 485 } 486 }