1 /*
   2  * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27 *******************************************************************************
  28 *   Copyright (C) 2009-2014, International Business Machines
  29 *   Corporation and others.  All Rights Reserved.
  30 *******************************************************************************
  31 */
  32 package jdk.internal.icu.text;
  33 
  34 import java.io.IOException;
  35 
  36 /**
  37  * Normalization filtered by a UnicodeSet.
  38  * Normalizes portions of the text contained in the filter set and leaves
  39  * portions not contained in the filter set unchanged.
  40  * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
  41  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
  42  * This class implements all of (and only) the Normalizer2 API.
  43  * An instance of this class is unmodifiable/immutable.
  44  * @stable ICU 4.4
  45  * @author Markus W. Scherer
  46  */
  47 class FilteredNormalizer2 extends Normalizer2 {
  48 
  49     /**
  50      * Constructs a filtered normalizer wrapping any Normalizer2 instance
  51      * and a filter set.
  52      * Both are aliased and must not be modified or deleted while this object
  53      * is used.
  54      * The filter set should be frozen; otherwise the performance will suffer greatly.
  55      * @param n2 wrapped Normalizer2 instance
  56      * @param filterSet UnicodeSet which determines the characters to be normalized
  57      * @stable ICU 4.4
  58      */
  59     public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
  60         norm2=n2;
  61         set=filterSet;
  62     }
  63 
  64     /**
  65      * {@inheritDoc}
  66      * @stable ICU 4.4
  67      */
  68     @Override
  69     public StringBuilder normalize(CharSequence src, StringBuilder dest) {
  70         if(dest==src) {
  71             throw new IllegalArgumentException();
  72         }
  73         dest.setLength(0);
  74         normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
  75         return dest;
  76     }
  77 
  78     /**
  79      * {@inheritDoc}
  80      * @stable ICU 4.6
  81      */
  82     @Override
  83     public Appendable normalize(CharSequence src, Appendable dest) {
  84         if(dest==src) {
  85             throw new IllegalArgumentException();
  86         }
  87         return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
  88     }
  89 
  90     /**
  91      * {@inheritDoc}
  92      * @stable ICU 4.4
  93      */
  94     @Override
  95     public StringBuilder normalizeSecondAndAppend(
  96             StringBuilder first, CharSequence second) {
  97         return normalizeSecondAndAppend(first, second, true);
  98     }
  99 
 100     /**
 101      * {@inheritDoc}
 102      * @stable ICU 4.4
 103      */
 104     @Override
 105     public StringBuilder append(StringBuilder first, CharSequence second) {
 106         return normalizeSecondAndAppend(first, second, false);
 107     }
 108 
 109     /**
 110      * {@inheritDoc}
 111      * @stable ICU 4.6
 112      */
 113     @Override
 114     public String getDecomposition(int c) {
 115         return set.contains(c) ? norm2.getDecomposition(c) : null;
 116     }
 117 
 118     /**
 119      * {@inheritDoc}
 120      * @stable ICU 49
 121      */
 122     @Override
 123     public int getCombiningClass(int c) {
 124         return set.contains(c) ? norm2.getCombiningClass(c) : 0;
 125     }
 126 
 127     /**
 128      * {@inheritDoc}
 129      * @stable ICU 4.4
 130      */
 131     @Override
 132     public boolean isNormalized(CharSequence s) {
 133         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
 134         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
 135             int spanLimit=set.span(s, prevSpanLimit, spanCondition);
 136             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
 137                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
 138             } else {
 139                 if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
 140                     return false;
 141                 }
 142                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
 143             }
 144             prevSpanLimit=spanLimit;
 145         }
 146         return true;
 147     }
 148 
 149     /**
 150      * {@inheritDoc}
 151      * @stable ICU 4.4
 152      */
 153     @Override
 154     public int spanQuickCheckYes(CharSequence s) {
 155         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
 156         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
 157             int spanLimit=set.span(s, prevSpanLimit, spanCondition);
 158             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
 159                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
 160             } else {
 161                 int yesLimit=
 162                     prevSpanLimit+
 163                     norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
 164                 if(yesLimit<spanLimit) {
 165                     return yesLimit;
 166                 }
 167                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
 168             }
 169             prevSpanLimit=spanLimit;
 170         }
 171         return s.length();
 172     }
 173 
 174     /**
 175      * {@inheritDoc}
 176      * @stable ICU 4.4
 177      */
 178     @Override
 179     public boolean hasBoundaryBefore(int c) {
 180         return !set.contains(c) || norm2.hasBoundaryBefore(c);
 181     }
 182 
 183     // Internal: No argument checking, and appends to dest.
 184     // Pass as input spanCondition the one that is likely to yield a non-zero
 185     // span length at the start of src.
 186     // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
 187     // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
 188     // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
 189     // an in-filter prefix.
 190     private Appendable normalize(CharSequence src, Appendable dest,
 191                                  UnicodeSet.SpanCondition spanCondition) {
 192         // Don't throw away destination buffer between iterations.
 193         StringBuilder tempDest=new StringBuilder();
 194         try {
 195             for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
 196                 int spanLimit=set.span(src, prevSpanLimit, spanCondition);
 197                 int spanLength=spanLimit-prevSpanLimit;
 198                 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
 199                     if(spanLength!=0) {
 200                         dest.append(src, prevSpanLimit, spanLimit);
 201                     }
 202                     spanCondition=UnicodeSet.SpanCondition.SIMPLE;
 203                 } else {
 204                     if(spanLength!=0) {
 205                         // Not norm2.normalizeSecondAndAppend() because we do not want
 206                         // to modify the non-filter part of dest.
 207                         dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
 208                     }
 209                     spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
 210                 }
 211                 prevSpanLimit=spanLimit;
 212             }
 213         } catch(IOException e) {
 214             throw new InternalError(e.toString(), e);
 215         }
 216         return dest;
 217     }
 218 
 219     private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
 220                                                    boolean doNormalize) {
 221         if(first==second) {
 222             throw new IllegalArgumentException();
 223         }
 224         if(first.length()==0) {
 225             if(doNormalize) {
 226                 return normalize(second, first);
 227             } else {
 228                 return first.append(second);
 229             }
 230         }
 231         // merge the in-filter suffix of the first string with the in-filter prefix of the second
 232         int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
 233         if(prefixLimit!=0) {
 234             CharSequence prefix=second.subSequence(0, prefixLimit);
 235             int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
 236             if(suffixStart==0) {
 237                 if(doNormalize) {
 238                     norm2.normalizeSecondAndAppend(first, prefix);
 239                 } else {
 240                     norm2.append(first, prefix);
 241                 }
 242             } else {
 243                 StringBuilder middle=new StringBuilder(
 244                         first.subSequence(suffixStart, first.length()));
 245                 if(doNormalize) {
 246                     norm2.normalizeSecondAndAppend(middle, prefix);
 247                 } else {
 248                     norm2.append(middle, prefix);
 249                 }
 250                 first.delete(suffixStart, 0x7fffffff).append(middle);
 251             }
 252         }
 253         if(prefixLimit<second.length()) {
 254             CharSequence rest=second.subSequence(prefixLimit, second.length());
 255             if(doNormalize) {
 256                 normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
 257             } else {
 258                 first.append(rest);
 259             }
 260         }
 261         return first;
 262     }
 263 
 264     private Normalizer2 norm2;
 265     private UnicodeSet set;
 266 };