1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   3  *
   4  * This code is free software; you can redistribute it and/or modify it
   5  * under the terms of the GNU General Public License version 2 only, as
   6  * published by the Free Software Foundation.  Oracle designates this
   7  * particular file as subject to the "Classpath" exception as provided
   8  * by Oracle in the LICENSE file that accompanied this code.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  */
  24 /*
  25 /*
  26  *******************************************************************************
  27  * Copyright (C) 2003-2004, International Business Machines Corporation and         *
  28  * others. All Rights Reserved.                                                *
  29  *******************************************************************************
  30  */
  31 //
  32 // CHANGELOG
  33 //      2005-05-19 Edward Wang
  34 //          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
  35 //          - move from package com.ibm.icu.text to package sun.net.idn
  36 //          - use ParseException instead of StringPrepParseException
  37 //          - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
  38 //          - remove all @deprecated tag to make compiler happy
  39 //      2007-08-14 Martin Buchholz
  40 //          - remove redundant casts
  41 //
  42 package sun.net.idn;
  43 
  44 import java.io.BufferedInputStream;
  45 import java.io.ByteArrayInputStream;
  46 import java.io.IOException;
  47 import java.io.InputStream;
  48 import java.text.ParseException;
  49 
  50 import sun.text.Normalizer;
  51 import sun.text.normalizer.CharTrie;
  52 import sun.text.normalizer.Trie;
  53 import sun.text.normalizer.NormalizerImpl;
  54 import sun.text.normalizer.VersionInfo;
  55 import sun.text.normalizer.UCharacter;
  56 import sun.text.normalizer.UCharacterIterator;
  57 import sun.text.normalizer.UTF16;
  58 import sun.net.idn.UCharacterDirection;
  59 import sun.net.idn.StringPrepDataReader;
  60 
  61 /**
 * StringPrep API implements the StringPrep framework as described by
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
 * StringPrep prepares Unicode strings for use in network protocols.
 * Profiles of StringPrep are sets of rules and data according to which the
 * Unicode Strings are prepared. Each profile contains tables which describe
 * how a code point should be treated. The tables are broadly classified into
  68  * <ul>
  69  *     <li> Unassigned Table: Contains code points that are unassigned
  70  *          in the Unicode Version supported by StringPrep. Currently
  71  *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
  75  * </ul>
  76  *
  77  * The procedure for preparing Unicode strings:
  78  * <ol>
  79  *      <li> Map: For each character in the input, check if it has a mapping
  80  *           and, if so, replace it with its mapping. </li>
  81  *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
  82  *           normalization. </li>
  83  *      <li> Prohibit: Check for any characters that are not allowed in the
  84  *           output.  If any are found, return an error.</li>
  85  *      <li> Check bidi: Possibly check for right-to-left characters, and if
  86  *           any are found, make sure that the whole string satisfies the
  87  *           requirements for bidirectional strings.  If the string does not
  88  *           satisfy the requirements for bidirectional strings, return an
  89  *           error.  </li>
  90  * </ol>
  91  * @author Ram Viswanadha
  92  * @draft ICU 2.8
  93  */
  94 public final class StringPrep {
    /**
     * Option to prohibit processing of unassigned code points in the input
     *
     * @see   #prepare
     * @draft ICU 2.8
     */
    public static final int DEFAULT = 0x0000;

    /**
     * Option to allow processing of unassigned code points in the input
     *
     * @see   #prepare
     * @draft ICU 2.8
     */
    public static final int ALLOW_UNASSIGNED = 0x0001;

    /*
     * Code point types produced by getValues(). For a trie word at or above
     * TYPE_THRESHOLD the type is (word - TYPE_THRESHOLD); TYPE_LIMIT is the
     * type assigned to a zero trie word, for which the code point is copied
     * to the output unchanged.
     */
    private static final int UNASSIGNED        = 0x0000;
    private static final int MAP               = 0x0001;
    private static final int PROHIBITED        = 0x0002;
    private static final int DELETE            = 0x0003;
    private static final int TYPE_LIMIT        = 0x0004;

    /* Bit flags tested against indexes[OPTIONS] by the constructor */
    private static final int NORMALIZATION_ON  = 0x0001;
    private static final int CHECK_BIDI_ON     = 0x0002;

    /* Trie words >= TYPE_THRESHOLD carry a type instead of mapping information */
    private static final int TYPE_THRESHOLD       = 0xFFF0;
    /* A trie word whose upper 14 bits (word >> 2) equal this value marks a deleted code point */
    private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16319*/
    private static final int MAX_INDEX_TOP_LENGTH = 0x0003;

    /* indexes[] value names */
    private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
    private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
    private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
    private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
    private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
    private static final int THREE_UCHARS_MAPPING_INDEX_START =  5; /* The starting index of 3 UChars mapping index in the mapping data array */
    private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6; /* The starting index of 4 UChars mapping index in the mapping data array */
    private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
    private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */


    /**
     * Default buffer size of datafile; used as the size of the
     * BufferedInputStream created by the constructor.
     */
    private static final int DATA_BUFFER_SIZE = 25000;
 140 
 141     /* Wrappers for Trie implementations */
 142     private static final class StringPrepTrieImpl implements Trie.DataManipulate{
 143         private CharTrie sprepTrie = null;
 144        /**
 145         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
 146         * data the index array offset of the indexes for that lead surrogate.
 147         * @param property data value for a surrogate from the trie, including
 148         *        the folding offset
 149         * @return data offset or 0 if there is no data for the lead surrogate
 150         */
 151          public int getFoldingOffset(int value){
 152             return value;
 153         }
 154     }
 155 
    // Wrapper around the CharTrie that is used for reading the trie data
    private StringPrepTrieImpl sprepTrieImpl;
    // Indexes read from the data file (the constructor asks the reader for INDEX_TOP entries)
    private int[] indexes;
    // mapping data read from the data file
    private char[] mappingData;
    // format version of the data file
    private byte[] formatVersion;
    // the version of Unicode supported by the data file
    private VersionInfo sprepUniVer;
    // the Unicode version of last entry in the
    // NormalizationCorrections.txt file if normalization
    // is turned on
    private VersionInfo normCorrVer;
    // Option to turn on Normalization; set from the NORMALIZATION_ON bit of indexes[OPTIONS]
    private boolean doNFKC;
    // Option to turn on checking for BiDi rules; set from the CHECK_BIDI_ON bit of indexes[OPTIONS]
    private boolean checkBiDi;
 174 
 175 
 176     private char getCodePointValue(int ch){
 177         return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
 178     }
 179 
 180     private static VersionInfo getVersionInfo(int comp){
 181         int micro = comp & 0xFF;
 182         int milli =(comp >> 8)  & 0xFF;
 183         int minor =(comp >> 16) & 0xFF;
 184         int major =(comp >> 24) & 0xFF;
 185         return VersionInfo.getInstance(major,minor,milli,micro);
 186     }
 187     private static VersionInfo getVersionInfo(byte[] version){
 188         if(version.length != 4){
 189             return null;
 190         }
 191         return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
 192     }
 193     /**
 194      * Creates an StringPrep object after reading the input stream.
 195      * The object does not hold a reference to the input steam, so the stream can be
 196      * closed after the method returns.
 197      *
 198      * @param inputStream The stream for reading the StringPrep profile binarySun
 199      * @throws IOException
 200      * @draft ICU 2.8
 201      */
 202     public StringPrep(InputStream inputStream) throws IOException{
 203 
 204         BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
 205 
 206         StringPrepDataReader reader = new StringPrepDataReader(b);
 207 
 208         // read the indexes
 209         indexes = reader.readIndexes(INDEX_TOP);
 210 
 211         byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
 212 
 213 
 214         //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
 215         mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
 216         // load the rest of the data data and initialize the data members
 217         reader.read(sprepBytes,mappingData);
 218 
 219         sprepTrieImpl           = new StringPrepTrieImpl();
 220         sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl  );
 221 
 222         // get the data format version
 223         formatVersion = reader.getDataFormatVersion();
 224 
 225         // get the options
 226         doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
 227         checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
 228         sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
 229         normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
 230         VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion();
 231         if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
 232            normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
 233            ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
 234            ){
 235             throw new IOException("Normalization Correction version not supported");
 236         }
 237         b.close();
 238     }
 239 
 240     private static final class Values{
 241         boolean isIndex;
 242         int value;
 243         int type;
 244         public void reset(){
 245             isIndex = false;
 246             value = 0;
 247             type = -1;
 248         }
 249     }
 250 
 251     private static final void getValues(char trieWord,Values values){
 252         values.reset();
 253         if(trieWord == 0){
 254             /*
 255              * Initial value stored in the mapping table
 256              * just return TYPE_LIMIT .. so that
 257              * the source codepoint is copied to the destination
 258              */
 259             values.type = TYPE_LIMIT;
 260         }else if(trieWord >= TYPE_THRESHOLD){
 261             values.type = (trieWord - TYPE_THRESHOLD);
 262         }else{
 263             /* get the type */
 264             values.type = MAP;
 265             /* ascertain if the value is index or delta */
 266             if((trieWord & 0x02)>0){
 267                 values.isIndex = true;
 268                 values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
 269 
 270             }else{
 271                 values.isIndex = false;
 272                 values.value = (trieWord<<16)>>16;
 273                 values.value =  (values.value >> 2);
 274 
 275             }
 276 
 277             if((trieWord>>2) == MAX_INDEX_VALUE){
 278                 values.type = DELETE;
 279                 values.isIndex = false;
 280                 values.value = 0;
 281             }
 282         }
 283     }
 284 
 285 
 286 
 287     private StringBuffer map( UCharacterIterator iter, int options)
 288                             throws ParseException {
 289 
 290         Values val = new Values();
 291         char result = 0;
 292         int ch  = UCharacterIterator.DONE;
 293         StringBuffer dest = new StringBuffer();
 294         boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
 295 
 296         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
 297 
 298             result = getCodePointValue(ch);
 299             getValues(result,val);
 300 
 301             // check if the source codepoint is unassigned
 302             if(val.type == UNASSIGNED && allowUnassigned == false){
 303                  throw new ParseException("An unassigned code point was found in the input " +
 304                                           iter.getText(), iter.getIndex());
 305             }else if((val.type == MAP)){
 306                 int index, length;
 307 
 308                 if(val.isIndex){
 309                     index = val.value;
 310                     if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
 311                              index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
 312                         length = 1;
 313                     }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
 314                              index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
 315                         length = 2;
 316                     }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
 317                              index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
 318                         length = 3;
 319                     }else{
 320                         length = mappingData[index++];
 321                     }
 322                     /* copy mapping to destination */
 323                     dest.append(mappingData,index,length);
 324                     continue;
 325 
 326                 }else{
 327                     ch -= val.value;
 328                 }
 329             }else if(val.type == DELETE){
 330                 // just consume the codepoint and contine
 331                 continue;
 332             }
 333             //copy the source into destination
 334             UTF16.append(dest,ch);
 335         }
 336 
 337         return dest;
 338     }
 339 
 340 
 341     private StringBuffer normalize(StringBuffer src){
 342         /*
 343          * Option UNORM_BEFORE_PRI_29:
 344          *
 345          * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
 346          * requires strict adherence to Unicode 3.2 normalization,
 347          * including buggy composition from before fixing Public Review Issue #29.
 348          * Note that this results in some valid but nonsensical text to be
 349          * either corrupted or rejected, depending on the text.
 350          * See http://www.unicode.org/review/resolved-pri.html#pri29
 351          * See unorm.cpp and cnormtst.c
 352          */
 353         return new StringBuffer(
 354             Normalizer.normalize(
 355                 src.toString(),
 356                 java.text.Normalizer.Form.NFKC,
 357                 Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
 358     }
 359     /*
 360     boolean isLabelSeparator(int ch){
 361         int result = getCodePointValue(ch);
 362         if( (result & 0x07)  == LABEL_SEPARATOR){
 363             return true;
 364         }
 365         return false;
 366     }
 367     */
 368      /*
 369        1) Map -- For each character in the input, check if it has a mapping
 370           and, if so, replace it with its mapping.
 371 
 372        2) Normalize -- Possibly normalize the result of step 1 using Unicode
 373           normalization.
 374 
 375        3) Prohibit -- Check for any characters that are not allowed in the
 376           output.  If any are found, return an error.
 377 
 378        4) Check bidi -- Possibly check for right-to-left characters, and if
 379           any are found, make sure that the whole string satisfies the
 380           requirements for bidirectional strings.  If the string does not
 381           satisfy the requirements for bidirectional strings, return an
 382           error.
 383           [Unicode3.2] defines several bidirectional categories; each character
 384            has one bidirectional category assigned to it.  For the purposes of
 385            the requirements below, an "RandALCat character" is a character that
 386            has Unicode bidirectional categories "R" or "AL"; an "LCat character"
 387            is a character that has Unicode bidirectional category "L".  Note
 388 
 389 
 390            that there are many characters which fall in neither of the above
 391            definitions; Latin digits (<U+0030> through <U+0039>) are examples of
 392            this because they have bidirectional category "EN".
 393 
 394            In any profile that specifies bidirectional character handling, all
 395            three of the following requirements MUST be met:
 396 
 397            1) The characters in section 5.8 MUST be prohibited.
 398 
 399            2) If a string contains any RandALCat character, the string MUST NOT
 400               contain any LCat character.
 401 
 402            3) If a string contains any RandALCat character, a RandALCat
 403               character MUST be the first character of the string, and a
 404               RandALCat character MUST be the last character of the string.
 405     */
 406     /**
 407      * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
 408      * checks for prohited and BiDi characters in the order defined by RFC 3454
 409      * depending on the options specified in the profile.
 410      *
 411      * @param src           A UCharacterIterator object containing the source string
 412      * @param options       A bit set of options:
 413      *
 414      *  - StringPrep.NONE               Prohibit processing of unassigned code points in the input
 415      *
 416      *  - StringPrep.ALLOW_UNASSIGNED   Treat the unassigned code points are in the input
 417      *                                  as normal Unicode code points.
 418      *
 419      * @return StringBuffer A StringBuffer containing the output
 420      * @throws ParseException
 421      * @draft ICU 2.8
 422      */
 423     public StringBuffer prepare(UCharacterIterator src, int options)
 424                         throws ParseException{
 425 
 426         // map
 427         StringBuffer mapOut = map(src,options);
 428         StringBuffer normOut = mapOut;// initialize
 429 
 430         if(doNFKC){
 431             // normalize
 432             normOut = normalize(mapOut);
 433         }
 434 
 435         int ch;
 436         char result;
 437         UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
 438         Values val = new Values();
 439         int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
 440             firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
 441         int rtlPos=-1, ltrPos=-1;
 442         boolean rightToLeft=false, leftToRight=false;
 443 
 444         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
 445             result = getCodePointValue(ch);
 446             getValues(result,val);
 447 
 448             if(val.type == PROHIBITED ){
 449                 throw new ParseException("A prohibited code point was found in the input" +
 450                                          iter.getText(), val.value);
 451             }
 452 
 453             direction = UCharacter.getDirection(ch);
 454             if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
 455                 firstCharDir = direction;
 456             }
 457             if(direction == UCharacterDirection.LEFT_TO_RIGHT){
 458                 leftToRight = true;
 459                 ltrPos = iter.getIndex()-1;
 460             }
 461             if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
 462                 rightToLeft = true;
 463                 rtlPos = iter.getIndex()-1;
 464             }
 465         }
 466         if(checkBiDi == true){
 467             // satisfy 2
 468             if( leftToRight == true && rightToLeft == true){
 469                 throw new ParseException("The input does not conform to the rules for BiDi code points." +
 470                                          iter.getText(),
 471                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
 472              }
 473 
 474             //satisfy 3
 475             if( rightToLeft == true &&
 476                 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
 477                 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
 478               ){
 479                 throw new ParseException("The input does not conform to the rules for BiDi code points." +
 480                                          iter.getText(),
 481                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
 482             }
 483         }
 484         return normOut;
 485 
 486       }
 487 }