/*
 * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 *******************************************************************************
 *   Copyright (C) 2009-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 */
package jdk.internal.icu.text;

import java.io.IOException;

/**
 * Normalization filtered by a UnicodeSet.
 * Normalizes portions of the text contained in the filter set and leaves
 * portions not contained in the filter set unchanged.
 * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
 * This class implements all of (and only) the Normalizer2 API.
 * An instance of this class is unmodifiable/immutable.
 * @stable ICU 4.4
 * @author Markus W. Scherer
 */
class FilteredNormalizer2 extends Normalizer2 {

    /** Wrapped normalizer; applied only to the in-filter portions of the text. */
    private final Normalizer2 norm2;

    /** Filter set deciding which portions of the text are handed to {@link #norm2}. */
    private final UnicodeSet set;

    /**
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
     * and a filter set.
     * Both are aliased and must not be modified or deleted while this object
     * is used.
     * The filter set should be frozen; otherwise the performance will suffer greatly.
     * @param n2 wrapped Normalizer2 instance
     * @param filterSet UnicodeSet which determines the characters to be normalized
     * @stable ICU 4.4
     */
    public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
        norm2 = n2;
        set = filterSet;
    }

    /**
     * {@inheritDoc}
     * <p>The previous contents of {@code dest} are discarded before the
     * filtered result is written to it.
     * @throws IllegalArgumentException if {@code dest} and {@code src} are the same object
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalize(CharSequence src, StringBuilder dest) {
        if (dest == src) {
            throw new IllegalArgumentException();
        }
        dest.setLength(0);
        normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
        return dest;
    }

    /**
     * {@inheritDoc}
     * <p>The filtered result is appended to {@code dest}.
     * @throws IllegalArgumentException if {@code dest} and {@code src} are the same object
     * @stable ICU 4.6
     */
    @Override
    public Appendable normalize(CharSequence src, Appendable dest) {
        if (dest == src) {
            throw new IllegalArgumentException();
        }
        return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
    }

    /**
     * {@inheritDoc}
     * @throws IllegalArgumentException if {@code first} and {@code second} are the same object
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalizeSecondAndAppend(
            StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, true);
    }

    /**
     * {@inheritDoc}
     * @throws IllegalArgumentException if {@code first} and {@code second} are the same object
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder append(StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, false);
    }

    /**
     * {@inheritDoc}
     * <p>Returns {@code null} for any code point that is not in the filter set;
     * otherwise delegates to the wrapped normalizer.
     * @stable ICU 4.6
     */
    @Override
    public String getDecomposition(int c) {
        return set.contains(c) ? norm2.getDecomposition(c) : null;
    }

    /**
     * {@inheritDoc}
     * <p>Returns 0 for any code point that is not in the filter set;
     * otherwise delegates to the wrapped normalizer.
     * @stable ICU 49
     */
    @Override
    public int getCombiningClass(int c) {
        return set.contains(c) ? norm2.getCombiningClass(c) : 0;
    }

    /**
     * {@inheritDoc}
     * <p>Only the in-filter spans of {@code s} are checked with the wrapped
     * normalizer; out-of-filter spans are skipped.
     * @stable ICU 4.4
     */
    @Override
    public boolean isNormalized(CharSequence s) {
        // Alternate between in-filter (SIMPLE) and out-of-filter (NOT_CONTAINED)
        // spans, starting with an in-filter span (which may be empty).
        UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
        for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
            int spanLimit = set.span(s, prevSpanLimit, spanCondition);
            if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                // Out-of-filter text is not examined.
                spanCondition = UnicodeSet.SpanCondition.SIMPLE;
            } else {
                if (!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
                    return false;
                }
                spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
            }
            prevSpanLimit = spanLimit;
        }
        return true;
    }

    /**
     * {@inheritDoc}
     * <p>Only the in-filter spans of {@code s} are checked with the wrapped
     * normalizer; out-of-filter spans are skipped. Returns {@code s.length()}
     * if no in-filter span stops the wrapped normalizer early.
     * @stable ICU 4.4
     */
    @Override
    public int spanQuickCheckYes(CharSequence s) {
        UnicodeSet.SpanCondition spanCondition = UnicodeSet.SpanCondition.SIMPLE;
        for (int prevSpanLimit = 0; prevSpanLimit < s.length();) {
            int spanLimit = set.span(s, prevSpanLimit, spanCondition);
            if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                spanCondition = UnicodeSet.SpanCondition.SIMPLE;
            } else {
                // The wrapped normalizer reports an index relative to the
                // sub-sequence; rebase it onto s.
                int yesLimit =
                    prevSpanLimit
                    + norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
                if (yesLimit < spanLimit) {
                    return yesLimit;
                }
                spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
            }
            prevSpanLimit = spanLimit;
        }
        return s.length();
    }

    /**
     * {@inheritDoc}
     * <p>Returns {@code true} for any code point that is not in the filter set;
     * otherwise delegates to the wrapped normalizer.
     * @stable ICU 4.4
     */
    @Override
    public boolean hasBoundaryBefore(int c) {
        return !set.contains(c) || norm2.hasBoundaryBefore(c);
    }

    // Internal: No argument checking, and appends to dest.
    // Pass as input spanCondition the one that is likely to yield a non-zero
    // span length at the start of src.
    // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
    // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
    // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
    // an in-filter prefix.
    // An IOException raised by dest.append() is rethrown as an InternalError
    // with the IOException as its cause.
    private Appendable normalize(CharSequence src, Appendable dest,
                                 UnicodeSet.SpanCondition spanCondition) {
        // Don't throw away destination buffer between iterations.
        // NOTE(review): relies on norm2.normalize(CharSequence, StringBuilder)
        // overwriting tempDest rather than appending to it, as this class's
        // own override does -- confirm against the Normalizer2 contract.
        StringBuilder tempDest = new StringBuilder();
        try {
            for (int prevSpanLimit = 0; prevSpanLimit < src.length();) {
                int spanLimit = set.span(src, prevSpanLimit, spanCondition);
                int spanLength = spanLimit - prevSpanLimit;
                if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                    // Out-of-filter text is copied through unchanged.
                    if (spanLength != 0) {
                        dest.append(src, prevSpanLimit, spanLimit);
                    }
                    spanCondition = UnicodeSet.SpanCondition.SIMPLE;
                } else {
                    if (spanLength != 0) {
                        // Not norm2.normalizeSecondAndAppend() because we do not want
                        // to modify the non-filter part of dest.
                        dest.append(norm2.normalize(
                                src.subSequence(prevSpanLimit, spanLimit), tempDest));
                    }
                    spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
                }
                prevSpanLimit = spanLimit;
            }
        } catch (IOException e) {
            throw new InternalError(e.toString(), e);
        }
        return dest;
    }

    // Internal: shared implementation of normalizeSecondAndAppend() (doNormalize=true)
    // and append() (doNormalize=false). Modifies and returns first.
    private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
                                                   boolean doNormalize) {
        if (first == second) {
            throw new IllegalArgumentException();
        }
        if (first.length() == 0) {
            // Nothing to merge with: plain normalize or plain append.
            if (doNormalize) {
                return normalize(second, first);
            } else {
                return first.append(second);
            }
        }
        // merge the in-filter suffix of the first string with the in-filter prefix of the second
        int prefixLimit = set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
        if (prefixLimit != 0) {
            CharSequence prefix = second.subSequence(0, prefixLimit);
            int suffixStart =
                set.spanBack(first, Integer.MAX_VALUE, UnicodeSet.SpanCondition.SIMPLE);
            if (suffixStart == 0) {
                // All of first is in-filter: let the wrapped normalizer work on it directly.
                if (doNormalize) {
                    norm2.normalizeSecondAndAppend(first, prefix);
                } else {
                    norm2.append(first, prefix);
                }
            } else {
                // Work on a copy of the in-filter suffix so that the wrapped
                // normalizer never sees the text before suffixStart.
                StringBuilder middle = new StringBuilder(
                        first.subSequence(suffixStart, first.length()));
                if (doNormalize) {
                    norm2.normalizeSecondAndAppend(middle, prefix);
                } else {
                    norm2.append(middle, prefix);
                }
                // Replace the old suffix with the merged result.
                first.delete(suffixStart, Integer.MAX_VALUE).append(middle);
            }
        }
        if (prefixLimit < second.length()) {
            CharSequence rest = second.subSequence(prefixLimit, second.length());
            if (doNormalize) {
                // rest starts right after the in-filter prefix, so begin with
                // an out-of-filter span.
                normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
            } else {
                first.append(rest);
            }
        }
        return first;
    }
}