1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   3  *
   4  * This code is free software; you can redistribute it and/or modify it
   5  * under the terms of the GNU General Public License version 2 only, as
   6  * published by the Free Software Foundation.  Oracle designates this
   7  * particular file as subject to the "Classpath" exception as provided
   8  * by Oracle in the LICENSE file that accompanied this code.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  */
  24 /*
  25 /*
  26  *******************************************************************************
  27  * Copyright (C) 2003-2004, International Business Machines Corporation and         *
  28  * others. All Rights Reserved.                                                *
  29  *******************************************************************************
  30  */
  31 //
  32 // CHANGELOG
  33 //      2005-05-19 Edward Wang
  34 //          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
  35 //          - move from package com.ibm.icu.text to package sun.net.idn
  36 //          - use ParseException instead of StringPrepParseException
  37 //          - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
  38 //          - remove all @deprecated tag to make compiler happy
  39 //      2007-08-14 Martin Buchholz
  40 //          - remove redundant casts
  41 //
  42 package sun.net.idn;
  43 
  44 import java.io.BufferedInputStream;
  45 import java.io.ByteArrayInputStream;
  46 import java.io.IOException;
  47 import java.io.InputStream;
  48 import java.text.ParseException;
  49 
  50 import sun.text.Normalizer;
  51 import sun.text.normalizer.CharTrie;
  52 import sun.text.normalizer.Trie;
  53 import sun.text.normalizer.VersionInfo;
  54 import sun.text.normalizer.UCharacter;
  55 import sun.text.normalizer.UCharacterIterator;
  56 import sun.text.normalizer.UTF16;
  57 import sun.net.idn.UCharacterDirection;
  58 import sun.net.idn.StringPrepDataReader;
  59 
  60 /**
  61  * StringPrep API implements the StingPrep framework as described by
  62  * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
  63  * StringPrep prepares Unicode strings for use in network protocols.
  64  * Profiles of StingPrep are set of rules and data according to which the
  65  * Unicode Strings are prepared. Each profiles contains tables which describe
  66  * how a code point should be treated. The tables are broadly classied into
  67  * <ul>
  68  *     <li> Unassigned Table: Contains code points that are unassigned
  69  *          in the Unicode Version supported by StringPrep. Currently
  70  *          RFC 3454 supports Unicode 3.2. </li>
  71  *     <li> Prohibited Table: Contains code points that are prohibted from
  72  *          the output of the StringPrep processing function. </li>
  73  *     <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
  74  * </ul>
  75  *
  76  * The procedure for preparing Unicode strings:
  77  * <ol>
  78  *      <li> Map: For each character in the input, check if it has a mapping
  79  *           and, if so, replace it with its mapping. </li>
  80  *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
  81  *           normalization. </li>
  82  *      <li> Prohibit: Check for any characters that are not allowed in the
  83  *           output.  If any are found, return an error.</li>
  84  *      <li> Check bidi: Possibly check for right-to-left characters, and if
  85  *           any are found, make sure that the whole string satisfies the
  86  *           requirements for bidirectional strings.  If the string does not
  87  *           satisfy the requirements for bidirectional strings, return an
  88  *           error.  </li>
  89  * </ol>
  90  * @author Ram Viswanadha
  91  * @draft ICU 2.8
  92  */
  93 public final class StringPrep {
  94     /**
  95      * Option to prohibit processing of unassigned code points in the input
  96      *
  97      * @see   #prepare
  98      * @draft ICU 2.8
  99      */
 100     public static final int DEFAULT = 0x0000;
 101 
 102     /**
 103      * Option to allow processing of unassigned code points in the input
 104      *
 105      * @see   #prepare
 106      * @draft ICU 2.8
 107      */
 108     public static final int ALLOW_UNASSIGNED = 0x0001;
 109 
 110     private static final int UNASSIGNED        = 0x0000;
 111     private static final int MAP               = 0x0001;
 112     private static final int PROHIBITED        = 0x0002;
 113     private static final int DELETE            = 0x0003;
 114     private static final int TYPE_LIMIT        = 0x0004;
 115 
 116     private static final int NORMALIZATION_ON  = 0x0001;
 117     private static final int CHECK_BIDI_ON     = 0x0002;
 118 
 119     private static final int TYPE_THRESHOLD       = 0xFFF0;
 120     private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
 121     private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
 122 
 123     /* indexes[] value names */
 124     private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
 125     private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
 126     private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
 127     private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
 128     private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
 129     private static final int THREE_UCHARS_MAPPING_INDEX_START =  5;
 130     private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6;
 131     private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
 132     private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */
 133 
 134 
 135     /**
 136      * Default buffer size of datafile
 137      */
 138     private static final int DATA_BUFFER_SIZE = 25000;
 139 
 140     /* Wrappers for Trie implementations */
 141     private static final class StringPrepTrieImpl implements Trie.DataManipulate{
 142         private CharTrie sprepTrie = null;
 143        /**
 144         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
 145         * data the index array offset of the indexes for that lead surrogate.
 146         * @param property data value for a surrogate from the trie, including
 147         *        the folding offset
 148         * @return data offset or 0 if there is no data for the lead surrogate
 149         */
 150          public int getFoldingOffset(int value){
 151             return value;
 152         }
 153     }
 154 
 155     // CharTrie implementation for reading the trie data
 156     private StringPrepTrieImpl sprepTrieImpl;
 157     // Indexes read from the data file
 158     private int[] indexes;
 159     // mapping data read from the data file
 160     private char[] mappingData;
 161     // format version of the data file
 162     private byte[] formatVersion;
 163     // the version of Unicode supported by the data file
 164     private VersionInfo sprepUniVer;
 165     // the Unicode version of last entry in the
 166     // NormalizationCorrections.txt file if normalization
 167     // is turned on
 168     private VersionInfo normCorrVer;
 169     // Option to turn on Normalization
 170     private boolean doNFKC;
 171     // Option to turn on checking for BiDi rules
 172     private boolean checkBiDi;
 173 
 174 
 175     private char getCodePointValue(int ch){
 176         return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
 177     }
 178 
 179     private static VersionInfo getVersionInfo(int comp){
 180         int micro = comp & 0xFF;
 181         int milli =(comp >> 8)  & 0xFF;
 182         int minor =(comp >> 16) & 0xFF;
 183         int major =(comp >> 24) & 0xFF;
 184         return VersionInfo.getInstance(major,minor,milli,micro);
 185     }
 186     private static VersionInfo getVersionInfo(byte[] version){
 187         if(version.length != 4){
 188             return null;
 189         }
 190         return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
 191     }
 192     /**
 193      * Creates an StringPrep object after reading the input stream.
 194      * The object does not hold a reference to the input steam, so the stream can be
 195      * closed after the method returns.
 196      *
 197      * @param inputStream The stream for reading the StringPrep profile binarySun
 198      * @throws IOException
 199      * @draft ICU 2.8
 200      */
 201     public StringPrep(InputStream inputStream) throws IOException{
 202 
 203         BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
 204 
 205         StringPrepDataReader reader = new StringPrepDataReader(b);
 206 
 207         // read the indexes
 208         indexes = reader.readIndexes(INDEX_TOP);
 209 
 210         byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
 211 
 212 
 213         //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
 214         mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
 215         // load the rest of the data and initialize the data members
 216         reader.read(sprepBytes,mappingData);
 217 
 218         sprepTrieImpl           = new StringPrepTrieImpl();
 219         sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl  );
 220 
 221         // get the data format version
 222         formatVersion = reader.getDataFormatVersion();
 223 
 224         // get the options
 225         doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
 226         checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
 227         sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
 228         normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
 229         VersionInfo normUniVer = UCharacter.getUnicodeVersion();
 230         if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
 231            normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
 232            ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
 233            ){
 234             throw new IOException("Normalization Correction version not supported");
 235         }
 236         b.close();
 237     }
 238 
 239     private static final class Values{
 240         boolean isIndex;
 241         int value;
 242         int type;
 243         public void reset(){
 244             isIndex = false;
 245             value = 0;
 246             type = -1;
 247         }
 248     }
 249 
 250     private static final void getValues(char trieWord,Values values){
 251         values.reset();
 252         if(trieWord == 0){
 253             /*
 254              * Initial value stored in the mapping table
 255              * just return TYPE_LIMIT .. so that
 256              * the source codepoint is copied to the destination
 257              */
 258             values.type = TYPE_LIMIT;
 259         }else if(trieWord >= TYPE_THRESHOLD){
 260             values.type = (trieWord - TYPE_THRESHOLD);
 261         }else{
 262             /* get the type */
 263             values.type = MAP;
 264             /* ascertain if the value is index or delta */
 265             if((trieWord & 0x02)>0){
 266                 values.isIndex = true;
 267                 values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
 268 
 269             }else{
 270                 values.isIndex = false;
 271                 values.value = (trieWord<<16)>>16;
 272                 values.value =  (values.value >> 2);
 273 
 274             }
 275 
 276             if((trieWord>>2) == MAX_INDEX_VALUE){
 277                 values.type = DELETE;
 278                 values.isIndex = false;
 279                 values.value = 0;
 280             }
 281         }
 282     }
 283 
 284 
 285 
 286     private StringBuffer map( UCharacterIterator iter, int options)
 287                             throws ParseException {
 288 
 289         Values val = new Values();
 290         char result = 0;
 291         int ch  = UCharacterIterator.DONE;
 292         StringBuffer dest = new StringBuffer();
 293         boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
 294 
 295         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
 296 
 297             result = getCodePointValue(ch);
 298             getValues(result,val);
 299 
 300             // check if the source codepoint is unassigned
 301             if(val.type == UNASSIGNED && allowUnassigned == false){
 302                  throw new ParseException("An unassigned code point was found in the input " +
 303                                           iter.getText(), iter.getIndex());
 304             }else if((val.type == MAP)){
 305                 int index, length;
 306 
 307                 if(val.isIndex){
 308                     index = val.value;
 309                     if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
 310                              index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
 311                         length = 1;
 312                     }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
 313                              index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
 314                         length = 2;
 315                     }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
 316                              index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
 317                         length = 3;
 318                     }else{
 319                         length = mappingData[index++];
 320                     }
 321                     /* copy mapping to destination */
 322                     dest.append(mappingData,index,length);
 323                     continue;
 324 
 325                 }else{
 326                     ch -= val.value;
 327                 }
 328             }else if(val.type == DELETE){
 329                 // just consume the codepoint and contine
 330                 continue;
 331             }
 332             //copy the source into destination
 333             UTF16.append(dest,ch);
 334         }
 335 
 336         return dest;
 337     }
 338 
 339 
 340     private StringBuffer normalize(StringBuffer src){
 341         /*
 342          * Option UNORM_BEFORE_PRI_29:
 343          *
 344          * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
 345          * requires strict adherence to Unicode 3.2 normalization,
 346          * including buggy composition from before fixing Public Review Issue #29.
 347          * Note that this results in some valid but nonsensical text to be
 348          * either corrupted or rejected, depending on the text.
 349          * See http://www.unicode.org/review/resolved-pri.html#pri29
 350          * See unorm.cpp and cnormtst.c
 351          */
 352         return new StringBuffer(
 353             Normalizer.normalize(
 354                 src.toString(),
 355                 java.text.Normalizer.Form.NFKC,
 356                 Normalizer.UNICODE_3_2));
 357     }
 358     /*
 359     boolean isLabelSeparator(int ch){
 360         int result = getCodePointValue(ch);
 361         if( (result & 0x07)  == LABEL_SEPARATOR){
 362             return true;
 363         }
 364         return false;
 365     }
 366     */
 367      /*
 368        1) Map -- For each character in the input, check if it has a mapping
 369           and, if so, replace it with its mapping.
 370 
 371        2) Normalize -- Possibly normalize the result of step 1 using Unicode
 372           normalization.
 373 
 374        3) Prohibit -- Check for any characters that are not allowed in the
 375           output.  If any are found, return an error.
 376 
 377        4) Check bidi -- Possibly check for right-to-left characters, and if
 378           any are found, make sure that the whole string satisfies the
 379           requirements for bidirectional strings.  If the string does not
 380           satisfy the requirements for bidirectional strings, return an
 381           error.
 382           [Unicode3.2] defines several bidirectional categories; each character
 383            has one bidirectional category assigned to it.  For the purposes of
 384            the requirements below, an "RandALCat character" is a character that
 385            has Unicode bidirectional categories "R" or "AL"; an "LCat character"
 386            is a character that has Unicode bidirectional category "L".  Note
 387 
 388 
 389            that there are many characters which fall in neither of the above
 390            definitions; Latin digits (<U+0030> through <U+0039>) are examples of
 391            this because they have bidirectional category "EN".
 392 
 393            In any profile that specifies bidirectional character handling, all
 394            three of the following requirements MUST be met:
 395 
 396            1) The characters in section 5.8 MUST be prohibited.
 397 
 398            2) If a string contains any RandALCat character, the string MUST NOT
 399               contain any LCat character.
 400 
 401            3) If a string contains any RandALCat character, a RandALCat
 402               character MUST be the first character of the string, and a
 403               RandALCat character MUST be the last character of the string.
 404     */
 405     /**
 406      * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
 407      * checks for prohited and BiDi characters in the order defined by RFC 3454
 408      * depending on the options specified in the profile.
 409      *
 410      * @param src           A UCharacterIterator object containing the source string
 411      * @param options       A bit set of options:
 412      *
 413      *  - StringPrep.NONE               Prohibit processing of unassigned code points in the input
 414      *
 415      *  - StringPrep.ALLOW_UNASSIGNED   Treat the unassigned code points are in the input
 416      *                                  as normal Unicode code points.
 417      *
 418      * @return StringBuffer A StringBuffer containing the output
 419      * @throws ParseException
 420      * @draft ICU 2.8
 421      */
 422     public StringBuffer prepare(UCharacterIterator src, int options)
 423                         throws ParseException{
 424 
 425         // map
 426         StringBuffer mapOut = map(src,options);
 427         StringBuffer normOut = mapOut;// initialize
 428 
 429         if(doNFKC){
 430             // normalize
 431             normOut = normalize(mapOut);
 432         }
 433 
 434         int ch;
 435         char result;
 436         UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
 437         Values val = new Values();
 438         int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
 439             firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
 440         int rtlPos=-1, ltrPos=-1;
 441         boolean rightToLeft=false, leftToRight=false;
 442 
 443         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
 444             result = getCodePointValue(ch);
 445             getValues(result,val);
 446 
 447             if(val.type == PROHIBITED ){
 448                 throw new ParseException("A prohibited code point was found in the input" +
 449                                          iter.getText(), val.value);
 450             }
 451 
 452             direction = UCharacter.getDirection(ch);
 453             if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
 454                 firstCharDir = direction;
 455             }
 456             if(direction == UCharacterDirection.LEFT_TO_RIGHT){
 457                 leftToRight = true;
 458                 ltrPos = iter.getIndex()-1;
 459             }
 460             if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
 461                 rightToLeft = true;
 462                 rtlPos = iter.getIndex()-1;
 463             }
 464         }
 465         if(checkBiDi == true){
 466             // satisfy 2
 467             if( leftToRight == true && rightToLeft == true){
 468                 throw new ParseException("The input does not conform to the rules for BiDi code points." +
 469                                          iter.getText(),
 470                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
 471              }
 472 
 473             //satisfy 3
 474             if( rightToLeft == true &&
 475                 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
 476                 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
 477               ){
 478                 throw new ParseException("The input does not conform to the rules for BiDi code points." +
 479                                          iter.getText(),
 480                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
 481             }
 482         }
 483         return normOut;
 484 
 485       }
 486 }