New src/java.base/share/classes/sun/text/normalizer/Utility.java

   1 /*
   2  * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 /*
  26  *******************************************************************************
  27  * Copyright (C) 1996-2011, International Business Machines Corporation and    *
  28  * others. All Rights Reserved.                                                *
  29  *******************************************************************************
  30  */
  31 
  32 package sun.text.normalizer;
  33 
  34 import java.io.IOException;
  35 import java.util.Locale;
  36 
  37 final class Utility {
  38 
  39     /**
  40      * Convert characters outside the range U+0020 to U+007F to
  41      * Unicode escapes, and convert backslash to a double backslash.
  42      */
  43     public static final String escape(String s) {
  44         StringBuilder buf = new StringBuilder();
  45         for (int i=0; i<s.length(); ) {
  46             int c = Character.codePointAt(s, i);
  47             i += UTF16.getCharCount(c);
  48             if (c >= ' ' && c <= 0x007F) {
  49                 if (c == '\\') {
  50                     buf.append("\\\\"); // That is, "\\"
  51                 } else {
  52                     buf.append((char)c);
  53                 }
  54             } else {
  55                 boolean four = c <= 0xFFFF;
  56                 buf.append(four ? "\\u" : "\\U");
  57                 buf.append(hex(c, four ? 4 : 8));
  58             }
  59         }
  60         return buf.toString();
  61     }
  62 
  63     /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
  64     private static final char[] UNESCAPE_MAP = {
  65         /*"   0x22, 0x22 */
  66         /*'   0x27, 0x27 */
  67         /*?   0x3F, 0x3F */
  68         /*\   0x5C, 0x5C */
  69         /*a*/ 0x61, 0x07,
  70         /*b*/ 0x62, 0x08,
  71         /*e*/ 0x65, 0x1b,
  72         /*f*/ 0x66, 0x0c,
  73         /*n*/ 0x6E, 0x0a,
  74         /*r*/ 0x72, 0x0d,
  75         /*t*/ 0x74, 0x09,
  76         /*v*/ 0x76, 0x0b
  77     };
  78 
  79     /**
  80      * Convert an escape to a 32-bit code point value.  We attempt
  81      * to parallel the icu4c unescapeAt() function.
  82      * @param offset16 an array containing offset to the character
  83      * <em>after</em> the backslash.  Upon return offset16[0] will
  84      * be updated to point after the escape sequence.
  85      * @return character value from 0 to 10FFFF, or -1 on error.
  86      */
  87     public static int unescapeAt(String s, int[] offset16) {
  88         int c;
  89         int result = 0;
  90         int n = 0;
  91         int minDig = 0;
  92         int maxDig = 0;
  93         int bitsPerDigit = 4;
  94         int dig;
  95         int i;
  96         boolean braces = false;
  97 
  98         /* Check that offset is in range */
  99         int offset = offset16[0];
 100         int length = s.length();
 101         if (offset < 0 || offset >= length) {
 102             return -1;
 103         }
 104 
 105         /* Fetch first UChar after '\\' */
 106         c = Character.codePointAt(s, offset);
 107         offset += UTF16.getCharCount(c);
 108 
 109         /* Convert hexadecimal and octal escapes */
 110         switch (c) {
 111         case 'u':
 112             minDig = maxDig = 4;
 113             break;
 114         case 'U':
 115             minDig = maxDig = 8;
 116             break;
 117         case 'x':
 118             minDig = 1;
 119             if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
 120                 ++offset;
 121                 braces = true;
 122                 maxDig = 8;
 123             } else {
 124                 maxDig = 2;
 125           }
 126             break;
 127         default:
 128             dig = UCharacter.digit(c, 8);
 129             if (dig >= 0) {
 130                 minDig = 1;
 131                 maxDig = 3;
 132                 n = 1; /* Already have first octal digit */
 133                 bitsPerDigit = 3;
 134                 result = dig;
 135             }
 136             break;
 137         }
 138         if (minDig != 0) {
 139             while (offset < length && n < maxDig) {
 140                 c = UTF16.charAt(s, offset);
 141                 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
 142                 if (dig < 0) {
 143                     break;
 144                 }
 145                 result = (result << bitsPerDigit) | dig;
 146                 offset += UTF16.getCharCount(c);
 147                 ++n;
 148             }
 149             if (n < minDig) {
 150                 return -1;
 151             }
 152             if (braces) {
 153                 if (c != 0x7D /*}*/) {
 154                     return -1;
 155                 }
 156                 ++offset;
 157           }
 158             if (result < 0 || result >= 0x110000) {
 159                 return -1;
 160             }
 161             // If an escape sequence specifies a lead surrogate, see
 162             // if there is a trail surrogate after it, either as an
 163             // escape or as a literal.  If so, join them up into a
 164             // supplementary.
 165             if (offset < length &&
 166                     UTF16.isLeadSurrogate((char) result)) {
 167                 int ahead = offset+1;
 168                 c = s.charAt(offset); // [sic] get 16-bit code unit
 169                 if (c == '\\' && ahead < length) {
 170                     int[] o = new int[] { ahead };
 171                     c = unescapeAt(s, o);
 172                     ahead = o[0];
 173                 }
 174                 if (UTF16.isTrailSurrogate((char) c)) {
 175                     offset = ahead;
 176                     result = UCharacterProperty.getRawSupplementary(
 177                             (char) result, (char) c);
 178                 }
 179             }
 180             offset16[0] = offset;
 181             return result;
 182         }
 183 
 184         /* Convert C-style escapes in table */
 185         for (i=0; i<UNESCAPE_MAP.length; i+=2) {
 186             if (c == UNESCAPE_MAP[i]) {
 187                 offset16[0] = offset;
 188                 return UNESCAPE_MAP[i+1];
 189             } else if (c < UNESCAPE_MAP[i]) {
 190                 break;
 191             }
 192         }
 193 
 194         /* Map \cX to control-X: X & 0x1F */
 195         if (c == 'c' && offset < length) {
 196             c = UTF16.charAt(s, offset);
 197             offset16[0] = offset + UTF16.getCharCount(c);
 198             return 0x1F & c;
 199         }
 200 
 201         /* If no special forms are recognized, then consider
 202          * the backslash to generically escape the next character. */
 203         offset16[0] = offset;
 204         return c;
 205     }
 206 
 207     /**
 208      * Supplies a zero-padded hex representation of an integer (without 0x)
 209      */
 210     public static String hex(long i, int places) {
 211         if (i == Long.MIN_VALUE) return "-8000000000000000";
 212         boolean negative = i < 0;
 213         if (negative) {
 214             i = -i;
 215         }
 216         String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
 217         if (result.length() < places) {
 218             result = "0000000000000000".substring(result.length(),places) + result;
 219         }
 220         if (negative) {
 221             return '-' + result;
 222         }
 223         return result;
 224     }
 225 
 226     static final char[] DIGITS = {
 227         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
 228         'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
 229         'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
 230         'U', 'V', 'W', 'X', 'Y', 'Z'
 231     };
 232 
 233     /**
 234      * Return true if the character is NOT printable ASCII.  The tab,
 235      * newline and linefeed characters are considered unprintable.
 236      */
 237     public static boolean isUnprintable(int c) {
 238         //0x20 = 32 and 0x7E = 126
 239         return !(c >= 0x20 && c <= 0x7E);
 240     }
 241 
 242     /**
 243      * Escape unprintable characters using <backslash>uxxxx notation
 244      * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
 245      * above.  If the character is printable ASCII, then do nothing
 246      * and return FALSE.  Otherwise, append the escaped notation and
 247      * return TRUE.
 248      */
 249     public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
 250         try {
 251             if (isUnprintable(c)) {
 252                 result.append('\\');
 253                 if ((c & ~0xFFFF) != 0) {
 254                     result.append('U');
 255                     result.append(DIGITS[0xF&(c>>28)]);
 256                     result.append(DIGITS[0xF&(c>>24)]);
 257                     result.append(DIGITS[0xF&(c>>20)]);
 258                     result.append(DIGITS[0xF&(c>>16)]);
 259                 } else {
 260                     result.append('u');
 261                 }
 262                 result.append(DIGITS[0xF&(c>>12)]);
 263                 result.append(DIGITS[0xF&(c>>8)]);
 264                 result.append(DIGITS[0xF&(c>>4)]);
 265                 result.append(DIGITS[0xF&c]);
 266                 return true;
 267             }
 268             return false;
 269         } catch (IOException e) {
 270             throw new IllegalArgumentException(e);
 271         }
 272     }
 273 }