/*
 * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 *   Copyright (C) 1996-2011, International Business Machines Corporation and  *
 *   others. All Rights Reserved.                                              *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.IOException;
import java.util.Locale;

/**
 * Static helpers for escaping and unescaping text and for formatting
 * integers as hexadecimal.
 */
final class Utility {

    /**
     * Convert characters outside the range U+0020 to U+007F to
     * Unicode escapes, and convert backslash to a double backslash.
     * Code points up to U+FFFF are written as backslash, lowercase u and
     * four hex digits; larger code points are written as backslash,
     * uppercase U and eight hex digits.
     *
     * @param s the string to escape
     * @return the escaped string
     */
    public static final String escape(String s) {
        StringBuilder buf = new StringBuilder();
        for (int i=0; i<s.length(); ) {
            // Walk by code point so that surrogate pairs are escaped as one unit.
            int c = Character.codePointAt(s, i);
            i += UTF16.getCharCount(c);
            if (c >= ' ' && c <= 0x007F) {
                if (c == '\\') {
                    buf.append("\\\\"); // That is, "\\"
                } else {
                    buf.append((char)c);
                }
            } else {
                boolean four = c <= 0xFFFF;
                buf.append(four ? "\\u" : "\\U");
                buf.append(hex(c, four ? 4 : 8));
            }
        }
        return buf.toString();
    }

    /*
     * Pairs of (escape letter, replacement character) for the C-style
     * single-letter escapes.
     * This map must be in ASCENDING ORDER OF THE ESCAPE CODE
     * because the lookup loop stops at the first larger entry.
     */
    private static final char[] UNESCAPE_MAP = {
        /*"   0x22, 0x22 */
        /*'   0x27, 0x27 */
        /*?   0x3F, 0x3F */
        /*\   0x5C, 0x5C */
        /*a*/ 0x61, 0x07,
        /*b*/ 0x62, 0x08,
        /*e*/ 0x65, 0x1b,
        /*f*/ 0x66, 0x0c,
        /*n*/ 0x6E, 0x0a,
        /*r*/ 0x72, 0x0d,
        /*t*/ 0x74, 0x09,
        /*v*/ 0x76, 0x0b
    };

    /**
     * Convert an escape to a 32-bit code point value.  We attempt
     * to parallel the icu4c unescapeAt() function.
     *
     * <p>If the escape denotes a lead surrogate and is immediately followed
     * by a trail surrogate (literal or escaped), the two are joined into a
     * single supplementary code point.
     *
     * @param s the string containing the escape sequence
     * @param offset16 an array containing offset to the character
     * <em>after</em> the backslash.  Upon return offset16[0] will
     * be updated to point after the escape sequence.  It is left
     * unchanged when -1 is returned.
     * @return character value from 0 to 10FFFF, or -1 on error.
     */
    public static int unescapeAt(String s, int[] offset16) {
        return unescapeAt(s, offset16, true);
    }

    /**
     * Implementation of {@link #unescapeAt(String, int[])}.
     *
     * @param joinSurrogates whether an escaped lead surrogate may be joined
     * with a following trail surrogate.  The nested parse of an escaped
     * trail passes {@code false}: a nested result can only be a trail
     * surrogate when no joining took place, so disabling joining there does
     * not change any result but keeps the recursion depth at two instead of
     * one level per consecutive escaped lead surrogate.
     */
    private static int unescapeAt(String s, int[] offset16, boolean joinSurrogates) {
        int c;
        int result = 0;
        int n = 0;            // number of digits consumed so far
        int minDig = 0;       // stays 0 when the escape is not numeric
        int maxDig = 0;
        int bitsPerDigit = 4; // 4 for hexadecimal, 3 for octal
        int dig;
        boolean braces = false;

        /* Check that offset is in range */
        int offset = offset16[0];
        int length = s.length();
        if (offset < 0 || offset >= length) {
            return -1;
        }

        /* Fetch first UChar after '\\' */
        c = Character.codePointAt(s, offset);
        offset += UTF16.getCharCount(c);

        /* Convert hexadecimal and octal escapes */
        switch (c) {
        case 'u':
            minDig = maxDig = 4;
            break;
        case 'U':
            minDig = maxDig = 8;
            break;
        case 'x':
            minDig = 1;
            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
                ++offset;
                braces = true;
                maxDig = 8;
            } else {
                maxDig = 2;
            }
            break;
        default:
            dig = UCharacter.digit(c, 8);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
                n = 1; /* Already have first octal digit */
                bitsPerDigit = 3;
                result = dig;
            }
            break;
        }
        if (minDig != 0) {
            while (offset < length && n < maxDig) {
                c = UTF16.charAt(s, offset);
                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
                if (dig < 0) {
                    break;
                }
                result = (result << bitsPerDigit) | dig;
                offset += UTF16.getCharCount(c);
                ++n;
            }
            if (n < minDig) {
                return -1;
            }
            if (braces) {
                // c is the character that stopped the digit loop.
                // NOTE(review): when exactly maxDig digits were consumed, c is
                // still the last digit, so a closing brace after 8 digits is
                // rejected.  Left as is to keep the existing behavior.
                if (c != 0x7D /*}*/) {
                    return -1;
                }
                ++offset;
            }
            // A negative value means eight hex digits overflowed the int.
            if (result < 0 || result >= 0x110000) {
                return -1;
            }
            // If an escape sequence specifies a lead surrogate, see
            // if there is a trail surrogate after it, either as an
            // escape or as a literal.  If so, join them up into a
            // supplementary.
            // The range guards matter: casting a supplementary value such as
            // 0x1D800 to char would truncate it to 0xD800 and make it look
            // like a surrogate.
            if (joinSurrogates && offset < length &&
                    result <= 0xFFFF &&
                    UTF16.isLeadSurrogate((char) result)) {
                int ahead = offset+1;
                c = s.charAt(offset); // [sic] get 16-bit code unit
                if (c == '\\' && ahead < length) {
                    int[] o = new int[] { ahead };
                    c = unescapeAt(s, o, false);
                    ahead = o[0];
                }
                // c may be -1 (nested error) or a supplementary code point.
                if (c >= 0 && c <= 0xFFFF &&
                        UTF16.isTrailSurrogate((char) c)) {
                    offset = ahead;
                    result = UCharacterProperty.getRawSupplementary(
                                      (char) result, (char) c);
                }
            }
            offset16[0] = offset;
            return result;
        }

        /* Convert C-style escapes in table */
        for (int i=0; i<UNESCAPE_MAP.length; i+=2) {
            if (c == UNESCAPE_MAP[i]) {
                offset16[0] = offset;
                return UNESCAPE_MAP[i+1];
            } else if (c < UNESCAPE_MAP[i]) {
                // The map is sorted, so no later entry can match.
                break;
            }
        }

        /* Map \cX to control-X: X & 0x1F */
        if (c == 'c' && offset < length) {
            c = UTF16.charAt(s, offset);
            offset16[0] = offset + UTF16.getCharCount(c);
            return 0x1F & c;
        }

        /* If no special forms are recognized, then consider
         * the backslash to generically escape the next character. */
        offset16[0] = offset;
        return c;
    }

    /**
     * Supplies a zero-padded hex representation of an integer (without 0x).
     * Negative values are rendered as a minus sign followed by the padded
     * magnitude; {@code Long.MIN_VALUE} is always rendered as
     * "-8000000000000000".
     *
     * @param i the value to format
     * @param places the minimum number of hex digits; shorter results are
     * left-padded with zeros, longer results are not truncated
     * @return the upper-case hex string
     */
    public static String hex(long i, int places) {
        // -Long.MIN_VALUE overflows, so this value is handled up front.
        if (i == Long.MIN_VALUE) return "-8000000000000000";
        boolean negative = i < 0;
        if (negative) {
            i = -i;
        }
        String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
        if (result.length() < places) {
            // Pad with a loop so that widths above 16 are supported too.
            StringBuilder padded = new StringBuilder(places);
            for (int k = result.length(); k < places; ++k) {
                padded.append('0');
            }
            result = padded.append(result).toString();
        }
        if (negative) {
            return '-' + result;
        }
        return result;
    }

    /** Digit characters indexed by value; the escaping code uses the first 16. */
    static final char[] DIGITS = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z'
    };

    /**
     * Return true if the character is NOT printable ASCII, that is, if it
     * lies outside U+0020 to U+007E.  The tab, carriage return and line
     * feed characters are considered unprintable.
     *
     * @param c the code point to test
     * @return true if c is outside the printable ASCII range
     */
    public static boolean isUnprintable(int c) {
        //0x20 = 32 and 0x7E = 126
        return !(c >= 0x20 && c <= 0x7E);
    }

    /**
     * Escape unprintable characters using <backslash>uxxxx notation
     * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
     * above.  If the character is printable ASCII, then do nothing
     * and return FALSE.  Otherwise, append the escaped notation and
     * return TRUE.
     *
     * @param result the destination the escape is appended to
     * @param c the code point to escape
     * @return true if an escape was appended, false if c is printable ASCII
     * @throws IllegalArgumentException wrapping any IOException thrown by
     * the destination
     */
    public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
        try {
            if (isUnprintable(c)) {
                result.append('\\');
                if ((c & ~0xFFFF) != 0) {
                    // Any bit above the low 16 is set: emit the four high nibbles first.
                    result.append('U');
                    result.append(DIGITS[0xF&(c>>28)]);
                    result.append(DIGITS[0xF&(c>>24)]);
                    result.append(DIGITS[0xF&(c>>20)]);
                    result.append(DIGITS[0xF&(c>>16)]);
                } else {
                    result.append('u');
                }
                result.append(DIGITS[0xF&(c>>12)]);
                result.append(DIGITS[0xF&(c>>8)]);
                result.append(DIGITS[0xF&(c>>4)]);
                result.append(DIGITS[0xF&c]);
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }
}