New src/java.base/share/classes/jdk/internal/icu/impl/Utility.java

   1 /*
   2  * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * Copyright (C) 1996-2011, International Business Machines Corporation and    *
  28  * others. All Rights Reserved.                                                *
  29  *******************************************************************************
  30  */
  31 
  32 package jdk.internal.icu.impl;
  33 
  34 import jdk.internal.icu.lang.UCharacter;
  35 import jdk.internal.icu.text.UTF16;
  36 
  37 import java.io.IOException;
  38 import java.util.Locale;
  39 
  40 public final class Utility {
  41 
  42     /**
  43      * Convert characters outside the range U+0020 to U+007F to
  44      * Unicode escapes, and convert backslash to a double backslash.
  45      */
  46     public static final String escape(String s) {
  47         StringBuilder buf = new StringBuilder();
  48         for (int i=0; i<s.length(); ) {
  49             int c = Character.codePointAt(s, i);
  50             i += UTF16.getCharCount(c);
  51             if (c >= ' ' && c <= 0x007F) {
  52                 if (c == '\\') {
  53                     buf.append("\\\\"); // That is, "\\"
  54                 } else {
  55                     buf.append((char)c);
  56                 }
  57             } else {
  58                 boolean four = c <= 0xFFFF;
  59                 buf.append(four ? "\\u" : "\\U");
  60                 buf.append(hex(c, four ? 4 : 8));
  61             }
  62         }
  63         return buf.toString();
  64     }
  65 
  66     /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
  67     private static final char[] UNESCAPE_MAP = {
  68         /*"   0x22, 0x22 */
  69         /*'   0x27, 0x27 */
  70         /*?   0x3F, 0x3F */
  71         /*\   0x5C, 0x5C */
  72         /*a*/ 0x61, 0x07,
  73         /*b*/ 0x62, 0x08,
  74         /*e*/ 0x65, 0x1b,
  75         /*f*/ 0x66, 0x0c,
  76         /*n*/ 0x6E, 0x0a,
  77         /*r*/ 0x72, 0x0d,
  78         /*t*/ 0x74, 0x09,
  79         /*v*/ 0x76, 0x0b
  80     };
  81 
  82     /**
  83      * Convert an escape to a 32-bit code point value.  We attempt
  84      * to parallel the icu4c unescapeAt() function.
  85      * @param offset16 an array containing offset to the character
  86      * <em>after</em> the backslash.  Upon return offset16[0] will
  87      * be updated to point after the escape sequence.
  88      * @return character value from 0 to 10FFFF, or -1 on error.
  89      */
  90     public static int unescapeAt(String s, int[] offset16) {
  91         int c;
  92         int result = 0;
  93         int n = 0;
  94         int minDig = 0;
  95         int maxDig = 0;
  96         int bitsPerDigit = 4;
  97         int dig;
  98         int i;
  99         boolean braces = false;
 100 
 101         /* Check that offset is in range */
 102         int offset = offset16[0];
 103         int length = s.length();
 104         if (offset < 0 || offset >= length) {
 105             return -1;
 106         }
 107 
 108         /* Fetch first UChar after '\\' */
 109         c = Character.codePointAt(s, offset);
 110         offset += UTF16.getCharCount(c);
 111 
 112         /* Convert hexadecimal and octal escapes */
 113         switch (c) {
 114         case 'u':
 115             minDig = maxDig = 4;
 116             break;
 117         case 'U':
 118             minDig = maxDig = 8;
 119             break;
 120         case 'x':
 121             minDig = 1;
 122             if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
 123                 ++offset;
 124                 braces = true;
 125                 maxDig = 8;
 126             } else {
 127                 maxDig = 2;
 128           }
 129             break;
 130         default:
 131             dig = UCharacter.digit(c, 8);
 132             if (dig >= 0) {
 133                 minDig = 1;
 134                 maxDig = 3;
 135                 n = 1; /* Already have first octal digit */
 136                 bitsPerDigit = 3;
 137                 result = dig;
 138             }
 139             break;
 140         }
 141         if (minDig != 0) {
 142             while (offset < length && n < maxDig) {
 143                 c = UTF16.charAt(s, offset);
 144                 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
 145                 if (dig < 0) {
 146                     break;
 147                 }
 148                 result = (result << bitsPerDigit) | dig;
 149                 offset += UTF16.getCharCount(c);
 150                 ++n;
 151             }
 152             if (n < minDig) {
 153                 return -1;
 154             }
 155             if (braces) {
 156                 if (c != 0x7D /*}*/) {
 157                     return -1;
 158                 }
 159                 ++offset;
 160           }
 161             if (result < 0 || result >= 0x110000) {
 162                 return -1;
 163             }
 164             // If an escape sequence specifies a lead surrogate, see
 165             // if there is a trail surrogate after it, either as an
 166             // escape or as a literal.  If so, join them up into a
 167             // supplementary.
 168             if (offset < length &&
 169                     UTF16.isLeadSurrogate((char) result)) {
 170                 int ahead = offset+1;
 171                 c = s.charAt(offset); // [sic] get 16-bit code unit
 172                 if (c == '\\' && ahead < length) {
 173                     int o[] = new int[] { ahead };
 174                     c = unescapeAt(s, o);
 175                     ahead = o[0];
 176                 }
 177                 if (UTF16.isTrailSurrogate((char) c)) {
 178                     offset = ahead;
 179                     result = UCharacterProperty.getRawSupplementary(
 180                             (char) result, (char) c);
 181                 }
 182             }
 183             offset16[0] = offset;
 184             return result;
 185         }
 186 
 187         /* Convert C-style escapes in table */
 188         for (i=0; i<UNESCAPE_MAP.length; i+=2) {
 189             if (c == UNESCAPE_MAP[i]) {
 190                 offset16[0] = offset;
 191                 return UNESCAPE_MAP[i+1];
 192             } else if (c < UNESCAPE_MAP[i]) {
 193                 break;
 194             }
 195         }
 196 
 197         /* Map \cX to control-X: X & 0x1F */
 198         if (c == 'c' && offset < length) {
 199             c = UTF16.charAt(s, offset);
 200             offset16[0] = offset + UTF16.getCharCount(c);
 201             return 0x1F & c;
 202         }
 203 
 204         /* If no special forms are recognized, then consider
 205          * the backslash to generically escape the next character. */
 206         offset16[0] = offset;
 207         return c;
 208     }
 209 
 210     /**
 211      * Supplies a zero-padded hex representation of an integer (without 0x)
 212      */
 213     public static String hex(long i, int places) {
 214         if (i == Long.MIN_VALUE) return "-8000000000000000";
 215         boolean negative = i < 0;
 216         if (negative) {
 217             i = -i;
 218         }
 219         String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
 220         if (result.length() < places) {
 221             result = "0000000000000000".substring(result.length(),places) + result;
 222         }
 223         if (negative) {
 224             return '-' + result;
 225         }
 226         return result;
 227     }
 228 
 229     static final char DIGITS[] = {
 230         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 231         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
 232         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
 233         'U', 'V', 'W', 'X', 'Y', 'Z'
 234     };
 235 
 236     /**
 237      * Return true if the character is NOT printable ASCII.  The tab,
 238      * newline and linefeed characters are considered unprintable.
 239      */
 240     public static boolean isUnprintable(int c) {
 241         //0x20 = 32 and 0x7E = 126
 242         return !(c >= 0x20 && c <= 0x7E);
 243     }
 244 
 245     /**
 246      * Escape unprintable characters using <backslash>uxxxx notation
 247      * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
 248      * above.  If the character is printable ASCII, then do nothing
 249      * and return FALSE.  Otherwise, append the escaped notation and
 250      * return TRUE.
 251      */
 252     public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
 253         try {
 254             if (isUnprintable(c)) {
 255                 result.append('\\');
 256                 if ((c & ~0xFFFF) != 0) {
 257                     result.append('U');
 258                     result.append(DIGITS[0xF&(c>>28)]);
 259                     result.append(DIGITS[0xF&(c>>24)]);
 260                     result.append(DIGITS[0xF&(c>>20)]);
 261                     result.append(DIGITS[0xF&(c>>16)]);
 262                 } else {
 263                     result.append('u');
 264                 }
 265                 result.append(DIGITS[0xF&(c>>12)]);
 266                 result.append(DIGITS[0xF&(c>>8)]);
 267                 result.append(DIGITS[0xF&(c>>4)]);
 268                 result.append(DIGITS[0xF&c]);
 269                 return true;
 270             }
 271             return false;
 272         } catch (IOException e) {
 273             throw new IllegalArgumentException(e);
 274         }
 275     }
 276 }