1 /* 2 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 /* 26 ******************************************************************************* 27 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * 28 * * 29 * The original version of this source code and documentation is copyrighted * 30 * and owned by IBM, These materials are provided under terms of a License * 31 * Agreement between IBM and Sun. This technology is protected by multiple * 32 * US and International patents. This notice and attribution to IBM may not * 33 * to removed. * 34 ******************************************************************************* 35 */ 36 37 package sun.text.normalizer; 38 39 public final class Utility { 40 41 /** 42 * Convenience utility to compare two Object[]s 43 * Ought to be in System. 44 * @param len the length to compare. 45 * The start indices and start+len must be valid. 46 */ 47 public final static boolean arrayRegionMatches(char[] source, int sourceStart, 48 char[] target, int targetStart, 49 int len) 50 { 51 int sourceEnd = sourceStart + len; 52 int delta = targetStart - sourceStart; 53 for (int i = sourceStart; i < sourceEnd; i++) { 54 if (source[i]!=target[i + delta]) 55 return false; 56 } 57 return true; 58 } 59 60 /** 61 * Convert characters outside the range U+0020 to U+007F to 62 * Unicode escapes, and convert backslash to a double backslash. 63 */ 64 public static final String escape(String s) { 65 StringBuffer buf = new StringBuffer(); 66 for (int i=0; i<s.length(); ) { 67 int c = UTF16.charAt(s, i); 68 i += UTF16.getCharCount(c); 69 if (c >= ' ' && c <= 0x007F) { 70 if (c == '\\') { 71 buf.append("\\\\"); // That is, "\\" 72 } else { 73 buf.append((char)c); 74 } 75 } else { 76 boolean four = c <= 0xFFFF; 77 buf.append(four ? "\\u" : "\\U"); 78 hex(c, four ? 4 : 8, buf); 79 } 80 } 81 return buf.toString(); 82 } 83 84 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 85 static private final char[] UNESCAPE_MAP = { 86 /*" 0x22, 0x22 */ 87 /*' 0x27, 0x27 */ 88 /*? 0x3F, 0x3F */ 89 /*\ 0x5C, 0x5C */ 90 /*a*/ 0x61, 0x07, 91 /*b*/ 0x62, 0x08, 92 /*e*/ 0x65, 0x1b, 93 /*f*/ 0x66, 0x0c, 94 /*n*/ 0x6E, 0x0a, 95 /*r*/ 0x72, 0x0d, 96 /*t*/ 0x74, 0x09, 97 /*v*/ 0x76, 0x0b 98 }; 99 100 /** 101 * Convert an escape to a 32-bit code point value. We attempt 102 * to parallel the icu4c unescapeAt() function. 103 * @param offset16 an array containing offset to the character 104 * <em>after</em> the backslash. Upon return offset16[0] will 105 * be updated to point after the escape sequence. 106 * @return character value from 0 to 10FFFF, or -1 on error. 107 */ 108 public static int unescapeAt(String s, int[] offset16) { 109 int c; 110 int result = 0; 111 int n = 0; 112 int minDig = 0; 113 int maxDig = 0; 114 int bitsPerDigit = 4; 115 int dig; 116 int i; 117 boolean braces = false; 118 119 /* Check that offset is in range */ 120 int offset = offset16[0]; 121 int length = s.length(); 122 if (offset < 0 || offset >= length) { 123 return -1; 124 } 125 126 /* Fetch first UChar after '\\' */ 127 c = UTF16.charAt(s, offset); 128 offset += UTF16.getCharCount(c); 129 130 /* Convert hexadecimal and octal escapes */ 131 switch (c) { 132 case 'u': 133 minDig = maxDig = 4; 134 break; 135 case 'U': 136 minDig = maxDig = 8; 137 break; 138 case 'x': 139 minDig = 1; 140 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { 141 ++offset; 142 braces = true; 143 maxDig = 8; 144 } else { 145 maxDig = 2; 146 } 147 break; 148 default: 149 dig = UCharacter.digit(c, 8); 150 if (dig >= 0) { 151 minDig = 1; 152 maxDig = 3; 153 n = 1; /* Already have first octal digit */ 154 bitsPerDigit = 3; 155 result = dig; 156 } 157 break; 158 } 159 if (minDig != 0) { 160 while (offset < length && n < maxDig) { 161 c = UTF16.charAt(s, offset); 162 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); 163 if (dig < 0) { 164 break; 165 } 166 result = (result << bitsPerDigit) | dig; 167 offset += UTF16.getCharCount(c); 168 ++n; 169 } 170 if (n < minDig) { 171 return -1; 172 } 173 if (braces) { 174 if (c != 0x7D /*}*/) { 175 return -1; 176 } 177 ++offset; 178 } 179 if (result < 0 || result >= 0x110000) { 180 return -1; 181 } 182 // If an escape sequence specifies a lead surrogate, see 183 // if there is a trail surrogate after it, either as an 184 // escape or as a literal. If so, join them up into a 185 // supplementary. 186 if (offset < length && 187 UTF16.isLeadSurrogate((char) result)) { 188 int ahead = offset+1; 189 c = s.charAt(offset); // [sic] get 16-bit code unit 190 if (c == '\\' && ahead < length) { 191 int o[] = new int[] { ahead }; 192 c = unescapeAt(s, o); 193 ahead = o[0]; 194 } 195 if (UTF16.isTrailSurrogate((char) c)) { 196 offset = ahead; 197 result = UCharacterProperty.getRawSupplementary( 198 (char) result, (char) c); 199 } 200 } 201 offset16[0] = offset; 202 return result; 203 } 204 205 /* Convert C-style escapes in table */ 206 for (i=0; i<UNESCAPE_MAP.length; i+=2) { 207 if (c == UNESCAPE_MAP[i]) { 208 offset16[0] = offset; 209 return UNESCAPE_MAP[i+1]; 210 } else if (c < UNESCAPE_MAP[i]) { 211 break; 212 } 213 } 214 215 /* Map \cX to control-X: X & 0x1F */ 216 if (c == 'c' && offset < length) { 217 c = UTF16.charAt(s, offset); 218 offset16[0] = offset + UTF16.getCharCount(c); 219 return 0x1F & c; 220 } 221 222 /* If no special forms are recognized, then consider 223 * the backslash to generically escape the next character. */ 224 offset16[0] = offset; 225 return c; 226 } 227 228 /** 229 * Convert a integer to size width hex uppercase digits. 230 * E.g., {@code hex('a', 4, str) => "0041"}. 231 * Append the output to the given StringBuffer. 232 * If width is too small to fit, nothing will be appended to output. 233 */ 234 public static StringBuffer hex(int ch, int width, StringBuffer output) { 235 return appendNumber(output, ch, 16, width); 236 } 237 238 /** 239 * Convert a integer to size width (minimum) hex uppercase digits. 240 * E.g., {@code hex('a', 4, str) => "0041"}. If the integer requires more 241 * than width digits, more will be used. 242 */ 243 public static String hex(int ch, int width) { 244 StringBuffer buf = new StringBuffer(); 245 return appendNumber(buf, ch, 16, width).toString(); 246 } 247 248 /** 249 * Skip over a sequence of zero or more white space characters 250 * at pos. Return the index of the first non-white-space character 251 * at or after pos, or str.length(), if there is none. 252 */ 253 public static int skipWhitespace(String str, int pos) { 254 while (pos < str.length()) { 255 int c = UTF16.charAt(str, pos); 256 if (!UCharacterProperty.isRuleWhiteSpace(c)) { 257 break; 258 } 259 pos += UTF16.getCharCount(c); 260 } 261 return pos; 262 } 263 264 static final char DIGITS[] = { 265 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 266 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 267 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 268 'U', 'V', 'W', 'X', 'Y', 'Z' 269 }; 270 271 /** 272 * Append the digits of a positive integer to the given 273 * <code>StringBuffer</code> in the given radix. This is 274 * done recursively since it is easiest to generate the low- 275 * order digit first, but it must be appended last. 276 * 277 * @param result is the <code>StringBuffer</code> to append to 278 * @param n is the positive integer 279 * @param radix is the radix, from 2 to 36 inclusive 280 * @param minDigits is the minimum number of digits to append. 281 */ 282 private static void recursiveAppendNumber(StringBuffer result, int n, 283 int radix, int minDigits) 284 { 285 int digit = n % radix; 286 287 if (n >= radix || minDigits > 1) { 288 recursiveAppendNumber(result, n / radix, radix, minDigits - 1); 289 } 290 291 result.append(DIGITS[digit]); 292 } 293 294 /** 295 * Append a number to the given StringBuffer in the given radix. 296 * Standard digits '0'-'9' are used and letters 'A'-'Z' for 297 * radices 11 through 36. 298 * @param result the digits of the number are appended here 299 * @param n the number to be converted to digits; may be negative. 300 * If negative, a '-' is prepended to the digits. 301 * @param radix a radix from 2 to 36 inclusive. 302 * @param minDigits the minimum number of digits, not including 303 * any '-', to produce. Values less than 2 have no effect. One 304 * digit is always emitted regardless of this parameter. 305 * @return a reference to result 306 */ 307 public static StringBuffer appendNumber(StringBuffer result, int n, 308 int radix, int minDigits) 309 throws IllegalArgumentException 310 { 311 if (radix < 2 || radix > 36) { 312 throw new IllegalArgumentException("Illegal radix " + radix); 313 } 314 315 316 int abs = n; 317 318 if (n < 0) { 319 abs = -n; 320 result.append("-"); 321 } 322 323 recursiveAppendNumber(result, abs, radix, minDigits); 324 325 return result; 326 } 327 328 /** 329 * Return true if the character is NOT printable ASCII. The tab, 330 * newline and linefeed characters are considered unprintable. 331 */ 332 public static boolean isUnprintable(int c) { 333 return !(c >= 0x20 && c <= 0x7E); 334 } 335 336 /** 337 * Escape unprintable characters using {@code <backslash>uxxxx} notation 338 * for U+0000 to U+FFFF and {@code <backslash>Uxxxxxxxx} for U+10000 and 339 * above. If the character is printable ASCII, then do nothing 340 * and return FALSE. Otherwise, append the escaped notation and 341 * return TRUE. 342 */ 343 public static boolean escapeUnprintable(StringBuffer result, int c) { 344 if (isUnprintable(c)) { 345 result.append('\\'); 346 if ((c & ~0xFFFF) != 0) { 347 result.append('U'); 348 result.append(DIGITS[0xF&(c>>28)]); 349 result.append(DIGITS[0xF&(c>>24)]); 350 result.append(DIGITS[0xF&(c>>20)]); 351 result.append(DIGITS[0xF&(c>>16)]); 352 } else { 353 result.append('u'); 354 } 355 result.append(DIGITS[0xF&(c>>12)]); 356 result.append(DIGITS[0xF&(c>>8)]); 357 result.append(DIGITS[0xF&(c>>4)]); 358 result.append(DIGITS[0xF&c]); 359 return true; 360 } 361 return false; 362 } 363 364 /** 365 * Similar to StringBuffer.getChars, version 1.3. 366 * Since JDK 1.2 implements StringBuffer.getChars differently, this method 367 * is here to provide consistent results. 368 * To be removed after JDK 1.2 ceased to be the reference platform. 369 * @param src source string buffer 370 * @param srcBegin offset to the start of the src to retrieve from 371 * @param srcEnd offset to the end of the src to retrieve from 372 * @param dst char array to store the retrieved chars 373 * @param dstBegin offset to the start of the destination char array to 374 * store the retrieved chars 375 */ 376 public static void getChars(StringBuffer src, int srcBegin, int srcEnd, 377 char dst[], int dstBegin) 378 { 379 if (srcBegin == srcEnd) { 380 return; 381 } 382 src.getChars(srcBegin, srcEnd, dst, dstBegin); 383 } 384 385 }