/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 1996-2011, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package jdk.internal.icu.impl;

import jdk.internal.icu.lang.UCharacter;
import jdk.internal.icu.text.UTF16;

import java.io.IOException;
import java.util.Locale;

/**
 * Static helpers for escaping text into ASCII-only notation, parsing
 * backslash escapes back into code points, and formatting hex numbers.
 */
public final class Utility {

    /**
     * Convert characters outside the range U+0020 to U+007F to
     * Unicode escapes, and convert backslash to a double backslash.
     * Code points up to U+FFFF use the 4-digit <backslash>u form; larger
     * code points use the 8-digit <backslash>U form.
     *
     * @param s the text to escape; must not be null
     * @return the escaped text
     */
    public static final String escape(String s) {
        StringBuilder buf = new StringBuilder();
        for (int i = 0; i < s.length(); ) {
            int c = Character.codePointAt(s, i);
            i += UTF16.getCharCount(c);
            if (c >= ' ' && c <= 0x007F) {
                if (c == '\\') {
                    buf.append("\\\\"); // That is, "\\"
                } else {
                    buf.append((char) c);
                }
            } else {
                boolean four = c <= 0xFFFF;
                buf.append(four ? "\\u" : "\\U");
                buf.append(hex(c, four ? 4 : 8));
            }
        }
        return buf.toString();
    }

    /*
     * Pairs of (escape letter, resulting character).
     * This map must be in ASCENDING ORDER OF THE ESCAPE CODE: the lookup
     * loop stops as soon as it passes the letter it is looking for.
     */
    private static final char[] UNESCAPE_MAP = {
        /*"   0x22, 0x22 */
        /*'   0x27, 0x27 */
        /*?   0x3F, 0x3F */
        /*\   0x5C, 0x5C */
        /*a*/ 0x61, 0x07,
        /*b*/ 0x62, 0x08,
        /*e*/ 0x65, 0x1b,
        /*f*/ 0x66, 0x0c,
        /*n*/ 0x6E, 0x0a,
        /*r*/ 0x72, 0x0d,
        /*t*/ 0x74, 0x09,
        /*v*/ 0x76, 0x0b
    };

    /**
     * Convert an escape to a 32-bit code point value. We attempt
     * to parallel the icu4c unescapeAt() function.
     *
     * <p>Recognized forms after the backslash: {@code u} + 4 hex digits,
     * {@code U} + 8 hex digits, {@code x} + 1-2 hex digits,
     * <code>x{</code> + 1-8 hex digits + <code>}</code>, 1-3 octal digits,
     * the letters listed in {@code UNESCAPE_MAP}, and {@code cX}
     * (control-X). Any other character stands for itself.
     *
     * <p>If a numeric escape yields a lead surrogate and is directly
     * followed by a trail surrogate (literal or escaped), the two are
     * combined into one supplementary code point.
     *
     * @param s the text containing the escape
     * @param offset16 an array containing offset to the character
     * <em>after</em> the backslash. Upon return offset16[0] will
     * be updated to point after the escape sequence. It is left
     * untouched when -1 is returned.
     * @return character value from 0 to 10FFFF, or -1 on error.
     */
    public static int unescapeAt(String s, int[] offset16) {
        return unescapeAt(s, offset16, true);
    }

    /**
     * Implementation of {@link #unescapeAt(String, int[])}.
     *
     * @param joinSurrogates whether a lead surrogate produced by a numeric
     * escape may be combined with a following trail surrogate. The
     * look-ahead for the trail passes {@code false}, which keeps the
     * recursion depth at one no matter how many escaped lead surrogates
     * follow each other in the input.
     */
    private static int unescapeAt(String s, int[] offset16, boolean joinSurrogates) {
        int c;
        int result = 0;
        int n = 0;
        int minDig = 0;
        int maxDig = 0;
        int bitsPerDigit = 4;
        int dig;
        boolean braces = false;

        /* Check that offset is in range */
        int offset = offset16[0];
        int length = s.length();
        if (offset < 0 || offset >= length) {
            return -1;
        }

        /* Fetch first UChar after '\\' */
        c = Character.codePointAt(s, offset);
        offset += UTF16.getCharCount(c);

        /* Convert hexadecimal and octal escapes */
        switch (c) {
        case 'u':
            minDig = maxDig = 4;
            break;
        case 'U':
            minDig = maxDig = 8;
            break;
        case 'x':
            minDig = 1;
            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
                ++offset;
                braces = true;
                maxDig = 8;
            } else {
                maxDig = 2;
            }
            break;
        default:
            dig = UCharacter.digit(c, 8);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
                n = 1; /* Already have first octal digit */
                bitsPerDigit = 3;
                result = dig;
            }
            break;
        }
        if (minDig != 0) {
            while (offset < length && n < maxDig) {
                c = UTF16.charAt(s, offset);
                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
                if (dig < 0) {
                    break;
                }
                result = (result << bitsPerDigit) | dig;
                offset += UTF16.getCharCount(c);
                ++n;
            }
            if (n < minDig) {
                return -1;
            }
            if (braces) {
                // Look at the character at the current offset rather than
                // the last one read: after exactly maxDig digits the loop
                // stops without having fetched the closing brace.
                if (offset >= length || s.charAt(offset) != 0x7D /*}*/) {
                    return -1;
                }
                ++offset;
            }
            // result < 0 catches 8 hex digits with the top bit set.
            if (result < 0 || result >= 0x110000) {
                return -1;
            }
            // If an escape sequence specifies a lead surrogate, see
            // if there is a trail surrogate after it, either as an
            // escape or as a literal. If so, join them up into a
            // supplementary.
            // The range checks must precede the narrowing casts: without
            // them a supplementary value such as 0x1D800 would be
            // truncated to 0xD800 and mistaken for a surrogate.
            if (joinSurrogates && offset < length && result <= 0xFFFF &&
                    UTF16.isLeadSurrogate((char) result)) {
                int ahead = offset + 1;
                int trail = s.charAt(offset); // [sic] get 16-bit code unit
                if (trail == '\\' && ahead < length) {
                    int[] o = { ahead };
                    trail = unescapeAt(s, o, false);
                    ahead = o[0];
                }
                if (trail >= 0 && trail <= 0xFFFF &&
                        UTF16.isTrailSurrogate((char) trail)) {
                    offset = ahead;
                    result = UCharacterProperty.getRawSupplementary(
                                 (char) result, (char) trail);
                }
            }
            offset16[0] = offset;
            return result;
        }

        /* Convert C-style escapes in table */
        for (int i = 0; i < UNESCAPE_MAP.length; i += 2) {
            if (c == UNESCAPE_MAP[i]) {
                offset16[0] = offset;
                return UNESCAPE_MAP[i + 1];
            } else if (c < UNESCAPE_MAP[i]) {
                break; // table is sorted, so c cannot appear further on
            }
        }

        /* Map \cX to control-X: X & 0x1F */
        if (c == 'c' && offset < length) {
            c = UTF16.charAt(s, offset);
            offset16[0] = offset + UTF16.getCharCount(c);
            return 0x1F & c;
        }

        /* If no special forms are recognized, then consider
         * the backslash to generically escape the next character. */
        offset16[0] = offset;
        return c;
    }

    /**
     * Supplies a zero-padded hex representation of an integer (without 0x).
     * Digits are upper case. A negative value is rendered as a minus sign
     * followed by the padded magnitude.
     *
     * @param i the value to format
     * @param places the minimum number of hex digits; shorter results are
     * left-padded with zeros, longer results are not truncated
     * @return the formatted value
     */
    public static String hex(long i, int places) {
        boolean negative = i < 0;
        // For Long.MIN_VALUE the negation overflows back to itself; reading
        // it as unsigned still gives the right magnitude, 8000000000000000.
        long magnitude = negative ? -i : i;
        String digits = Long.toUnsignedString(magnitude, 16).toUpperCase(Locale.ENGLISH);
        StringBuilder result = new StringBuilder();
        if (negative) {
            result.append('-');
        }
        // Pad one zero at a time so that any value of places works, not
        // only those up to 16.
        for (int pad = places - digits.length(); pad > 0; --pad) {
            result.append('0');
        }
        return result.append(digits).toString();
    }

    /** Digit characters for radixes up to 36, indexed by digit value. */
    static final char[] DIGITS = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z'
    };

    /**
     * Return true if the character is NOT printable ASCII, that is, if it
     * lies outside U+0020 to U+007E. The tab, newline and carriage return
     * characters are therefore considered unprintable.
     *
     * @param c the code point to test
     * @return true if c is outside the printable ASCII range
     */
    public static boolean isUnprintable(int c) {
        //0x20 = 32 and 0x7E = 126
        return !(c >= 0x20 && c <= 0x7E);
    }

    /**
     * Escape unprintable characters using <backslash>uxxxx notation
     * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
     * above. If the character is printable ASCII, then do nothing
     * and return FALSE. Otherwise, append the escaped notation and
     * return TRUE.
     *
     * @param result the destination to append to
     * @param c the code point to escape
     * @return true if an escape was appended, false if c is printable
     * @throws IllegalArgumentException wrapping any IOException thrown by
     * the destination
     */
    public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
        try {
            if (isUnprintable(c)) {
                result.append('\\');
                if ((c & ~0xFFFF) != 0) {
                    // Above U+FFFF: emit the upper four hex digits first.
                    result.append('U');
                    result.append(DIGITS[0xF & (c >> 28)]);
                    result.append(DIGITS[0xF & (c >> 24)]);
                    result.append(DIGITS[0xF & (c >> 20)]);
                    result.append(DIGITS[0xF & (c >> 16)]);
                } else {
                    result.append('u');
                }
                result.append(DIGITS[0xF & (c >> 12)]);
                result.append(DIGITS[0xF & (c >> 8)]);
                result.append(DIGITS[0xF & (c >> 4)]);
                result.append(DIGITS[0xF & c]);
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }
}