New src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java

   1 /*
   2  * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.tools.javac.parser;
  27 
  28 import java.nio.CharBuffer;
  29 import java.util.Arrays;
  30 
  31 import com.sun.tools.javac.file.JavacFileManager;
  32 import com.sun.tools.javac.resources.CompilerProperties.Errors;
  33 import com.sun.tools.javac.util.ArrayUtils;
  34 import com.sun.tools.javac.util.Log;
  35 import com.sun.tools.javac.util.Name;
  36 import com.sun.tools.javac.util.Names;
  37 
  38 import static com.sun.tools.javac.util.LayoutCharacters.*;
  39 
  40 /** The char reader used by the javac lexer/tokenizer. Returns the sequence of
  41  * characters contained in the input buffer, translating unicode escapes as they
  42  * are encountered. Additionally, it provides a side buffer ({@code sbuf}) into
  43  * which chars can be saved and later retrieved as a Name or a String.
  44  *
  45  *  <p><b>This is NOT part of any supported API.
  46  *  If you write code that depends on this, you do so at your own risk.
  47  *  This code and its internal interfaces are subject to change or
  48  *  deletion without notice.</b>
  49  */
  50 public class UnicodeReader {
  51 
  52     /** The input buffer. The constructor stores the EOI sentinel at
  53      *  {@code buf[buflen]}, so it always holds at least {@code buflen + 1} chars.
  54      */
  55     protected char[] buf;
         /** Index in {@code buf} of the current character. Starts at -1 and is advanced
          *  by {@link #scanChar()}; after a unicode escape has been converted it is the
          *  index of the escape's last hex digit.
          */
  56     protected int bp;
         /** Index of the EOI sentinel, i.e. one past the last character that is scanned. */
  57     protected final int buflen;
  58 
  59     /** The current character: either the raw char at {@code buf[bp]} or the value
          *  decoded from a unicode escape whose last hex digit is at {@code bp}.
  60      */
  61     protected char ch;
  62 
  63     /** The value of {@code bp} at the most recent successful unicode escape
          *  conversion, or -1 if none has happened yet. Comparing it with {@code bp}
          *  tells whether the current character was produced by an escape, which also
          *  keeps a decoded backslash from being converted a second time.
  64      */
  65     protected int unicodeConversionBp = -1;
  66 
  67     /** Whether {@link #convertUnicode()} translates escapes. When false, a
          *  backslash is left exactly as it appears in the buffer.
  68      */
  69     protected boolean unicodeConversion = true;
  70 
         /** Error sink, copied from the scanner factory in the constructor. */
  71     protected Log log;
         /** Name table used by {@link #name()}, copied from the scanner factory. */
  72     protected Names names;
  73 
  74     /** A character buffer for saved chars. {@code putChar} passes it through
  75      *  {@code ArrayUtils.ensureCapacity} before every store.
          */
  76     protected char[] sbuf = new char[128];
         /** The {@code inputLength} handed to the constructor, recorded before the
          *  constructor possibly drops a trailing whitespace char. Not read in this class.
          */
  77     protected int realLength;
         /** Number of chars currently saved in {@code sbuf} (index of the next free slot). */
  78     protected int sp;
  79 
  80     /**
  81      * Create a reader over the contents of a {@code CharBuffer}. The chars are
  82      * obtained with {@code JavacFileManager.toArray(buffer)}, the input length is
  83      * {@code buffer.limit()}, and the rest of the work is delegated to
  84      * {@link #UnicodeReader(ScannerFactory, char[], int)}.
  85      *
  86      * @param sf the factory which created this Scanner
  87      * @param buffer the input; the array obtained from it might be modified
  88      *        (NOTE(review): whether toArray copies is not visible here, confirm)
  89      */
  90     protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
  91         this(sf, JavacFileManager.toArray(buffer), buffer.limit());
  92     }
  93 
         /**
          * Create a reader from the input array.  This constructor might
          * modify the array: the EOI sentinel is written at index {@code inputLength}
          * when {@code inputLength < input.length}, or over the last element when the
          * array is full and that element is a white space character (that character
          * is then no longer scanned). Otherwise the array is copied into one that is
          * one element longer. To avoid the copy, ensure
          * that {@code inputLength < input.length} or
          * {@code input[input.length - 1]} is a white space character.
          * The constructor finishes by reading the first character into {@code ch}.
          *
          * @param sf the factory supplying the log and the name table
          * @param input the input, might be modified
          * @param inputLength the number of valid chars in {@code input};
          *        must be less than or equal to {@code input.length}
          */
  94     protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
  95         log = sf.log;
  96         names = sf.names;
  97         realLength = inputLength;
  98         if (inputLength == input.length) {
  99             if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
 100                 inputLength--; // reuse the trailing whitespace slot for the sentinel
 101             } else {
 102                 input = Arrays.copyOf(input, inputLength + 1); // make room for the sentinel
 103             }
 104         }
 105         buf = input;
 106         buflen = inputLength;
 107         buf[buflen] = EOI;
 108         bp = -1; // scanChar() pre-increments, so the first char read is buf[0]
 109         scanChar();
 110     }
 111 
 112     /** Read next character. Unless {@code bp} has already reached {@code buflen},
          *  advances {@code bp}, loads {@code ch} from the buffer and, if it is a
          *  backslash, attempts unicode escape conversion. Once {@code bp == buflen}
          *  this method does nothing.
 113      */
 114     protected void scanChar() {
 115         if (bp < buflen) {
 116             ch = buf[++bp];
 117             if (ch == '\\') {
 118                 convertUnicode();
 119             }
 120         }
 121     }
 122 
 123     /** Read next character in comment, skipping over double '\' characters.
          *  When the character read is a backslash that did not come from a unicode
          *  escape and the next raw char is a backslash too, the second one is skipped
          *  by advancing {@code bp}; in every other backslash case
          *  {@link #convertUnicode()} is invoked once more.
 124      */
 125     protected void scanCommentChar() {
 126         scanChar();
 127         if (ch == '\\') {
 128             if (peekChar() == '\\' && !isUnicode()) {
 129                 skipChar();
 130             } else {
 131                 convertUnicode();
 132             }
 133         }
 134     }
 135 
 136     /** Append a character to sbuf.
          *
          *  @param ch the char stored at {@code sbuf[sp]} (this parameter hides the
          *         field of the same name); {@code sp} is incremented afterwards
          *  @param scan if true, the input is advanced with {@link #scanChar()}
          *         after the char has been stored
 137      */
 138     protected void putChar(char ch, boolean scan) {
 139         sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
 140         sbuf[sp++] = ch;
 141         if (scan)
 142             scanChar();
 143     }
 144 
         /** Append {@code ch} to sbuf without advancing the input. */
 145     protected void putChar(char ch) {
 146         putChar(ch, false);
 147     }
 148 
         /** Append the current character to sbuf, then advance the input if
          *  {@code scan} is true.
          */
 149     protected void putChar(boolean scan) {
 150         putChar(ch, scan);
 151     }
 152 
         /** Returns the Name built from the {@code sp} chars currently saved in sbuf.
          *  {@code sp} is not reset here.
          */
 153     Name name() {
 154         return names.fromChars(sbuf, 0, sp);
 155     }
 156 
         /** Returns a String holding the {@code sp} chars currently saved in sbuf.
          *  {@code sp} is not reset here.
          */
 157     String chars() {
 158         return new String(sbuf, 0, sp);
 159     }
 160 
         /** Switch unicode escape conversion on or off.
          *
          *  @param newState the new value of {@code unicodeConversion}
          *  @return the previous value, so that a caller can restore it later
          */
 161     protected boolean setUnicodeConversion(boolean newState) {
 162         boolean oldState = unicodeConversion;
 163         unicodeConversion = newState;
 164         return oldState;
 165     }
 166 
 167     /** Convert unicode escape; bp points to initial '\' character
 168      *  (Spec 3.3).
          *
          *  <p>Does nothing unless {@code ch} is a backslash, conversion is enabled and
          *  the current position is not itself the result of a conversion. If the
          *  backslash is not followed by 'u', {@code bp} and {@code ch} are put back on
          *  the backslash. Otherwise every 'u' is skipped and four hex digits are
          *  required: on success {@code ch} becomes the decoded char, {@code bp} is left
          *  on the last hex digit and that index is stored in
          *  {@code unicodeConversionBp}. If a digit is invalid or the fourth digit
          *  would not lie before {@code buflen}, IllegalUnicodeEsc is logged at
          *  {@code bp}, and {@code bp} and {@code ch} stay where scanning stopped.
 169      */
 170     protected void convertUnicode() {
 171         if (ch == '\\' && unicodeConversion && unicodeConversionBp != bp ) {
 172             bp++; ch = buf[bp]; // look at the char after the backslash
 173             if (ch == 'u') {
 174                 do {
 175                     bp++; ch = buf[bp];
 176                 } while (ch == 'u'); // any number of 'u's may follow the backslash
 177                 int limit = bp + 3; // index of the fourth hex digit
 178                 if (limit < buflen) {
 179                     int d = digit(bp, 16);
 180                     int code = d;
 181                     while (bp < limit && d >= 0) {
 182                         bp++; ch = buf[bp];
 183                         d = digit(bp, 16);
 184                         code = (code << 4) + d; // meaningless when d < 0, but then discarded below
 185                     }
 186                     if (d >= 0) {
 187                         ch = (char)code;
 188                         unicodeConversionBp = bp; // marks the current char as escape-derived
 189                         return;
 190                     }
 191                 }
 192                 log.error(bp, Errors.IllegalUnicodeEsc);
 193             } else {
 194                 bp--; // not an escape: step back onto the backslash
 195                 ch = '\\';
 196             }
 197         }
 198     }
 199 
 200     /** Are surrogates supported? Computed once, when the class is initialized,
          *  by probing for {@code Character.isHighSurrogate}.
 201      */
 202     final static boolean surrogatesSupported = surrogatesSupported();
         /** Returns true unless calling {@code Character.isHighSurrogate} raises
          *  NoSuchMethodError.
          */
 203     private static boolean surrogatesSupported() {
 204         try {
 205             Character.isHighSurrogate('a');
 206             return true;
 207         } catch (NoSuchMethodError ex) {
 208             return false;
 209         }
 210     }
 211 
 212     /** Scan surrogate pairs.  If 'ch' is a high surrogate and
 213      *  the next character is a low surrogate, returns the code point
 214      *  constructed from these surrogates. Otherwise, returns -1.
 215      *  This method will not consume any of the characters: the next character is
          *  read with {@link #scanChar()} (so an escaped low surrogate is decoded) and
          *  {@code ch} and {@code bp} are then restored.
          *  NOTE(review): {@code unicodeConversionBp} is not restored if the peeked
          *  character was an escape; confirm that no caller depends on it.
 216      */
 217     protected int peekSurrogates() {
 218         if (surrogatesSupported && Character.isHighSurrogate(ch)) {
 219             char high = ch;
 220             int prevBP = bp;
 221 
 222             scanChar();
 223 
 224             char low = ch;
 225 
 226             ch = high;
 227             bp = prevBP;
 228 
 229             if (Character.isLowSurrogate(low)) {
 230                 return Character.toCodePoint(high, low);
 231             }
 232         }
 233 
 234         return -1;
 235     }
 236 
 237     /** Convert an ASCII digit from its base (8, 10, or 16)
 238      *  to its value. The digit is taken from the current character {@code ch}
          *  (combined with a following low surrogate when there is one), not from
          *  {@code buf[pos]}. A character above 0x7f that nevertheless is a digit in
          *  {@code base} is reported as IllegalNonasciiDigit; its low surrogate, if
          *  any, is consumed and {@code ch} is replaced by the equivalent ASCII digit.
          *
          *  @param pos buffer position used for error reporting only; the error is
          *         logged at {@code pos + 1}
          *  @param base the radix passed to {@code Character.digit}
          *  @return the value of the digit, or -1 if the character is not a digit
          *          in {@code base}
 239      */
 240     protected int digit(int pos, int base) {
 241         char c = ch;
 242         if ('0' <= c && c <= '9')
 243             return Character.digit(c, base); //a fast common case
 244         int codePoint = peekSurrogates();
 245         int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
 246         if (result >= 0 && c > 0x7f) {
 247             log.error(pos + 1, Errors.IllegalNonasciiDigit);
 248             if (codePoint >= 0)
 249                 scanChar(); // consume the low surrogate of the supplementary digit
 250             ch = "0123456789abcdef".charAt(result);
 251         }
 252         return result;
 253     }
 254 
         /** Returns true if the current character was produced by a unicode escape,
          *  i.e. the last conversion ended at the current {@code bp}.
          */
 255     protected boolean isUnicode() {
 256         return unicodeConversionBp == bp;
 257     }
 258 
         /** Advances {@code bp} by one. {@code ch} is not reloaded and no bounds
          *  check is made.
          */
 259     protected void skipChar() {
 260         bp++;
 261     }
 262 
         /** Returns the raw buffer char following the current position, without
          *  moving and without unicode escape translation. No bounds check is made.
          */
 263     protected char peekChar() {
 264         return buf[bp + 1];
 265     }
 266 
         /** Returns the raw buffer char at the current position, {@code buf[bp]}.
          *  This differs from {@code ch} when {@code ch} came from a unicode escape.
          *  NOTE(review): despite the name, nothing before {@code bp} is read; confirm
          *  the intent against the callers.
          */
 267     protected char peekBack() {
 268         return buf[bp];
 269     }
 270 
 271     /**
 272      * Skips consecutive occurrences of the raw char at the current position,
 273      * leaving bp positioned at the last occurrence. {@code ch} is not touched.
          *
          * @return the number of positions bp was advanced, which is one less than
          *         the length of the run
 274      */
 275     protected int skipRepeats() {
 276         int start = bp;
 277         while (bp < buflen) {
 278             if (buf[bp] != buf[bp + 1])
 279                 break;
 280             bp++;
 281         }
 282         return bp - start;
 283     }
 284 
 285     /**
 286      * Returns a copy of the first {@code buflen} chars of the input buffer.
 287      * Unicode escape sequences are not translated. When the constructor dropped
          * a trailing whitespace char to make room for the sentinel, the result is one
          * char shorter than the {@code inputLength} that was passed in.
 288      */
 289     public char[] getRawCharacters() {
 290         char[] chars = new char[buflen];
 291         System.arraycopy(buf, 0, chars, 0, buflen);
 292         return chars;
 293     }
 294 
 295     /**
 296      * Returns a copy of a character array subset of the input buffer.
 297      * The returned array begins at the {@code beginIndex} and
 298      * extends to the character at index {@code endIndex - 1}.
 299      * Thus the length of the substring is {@code endIndex-beginIndex}.
 300      * This behavior is like
 301      * {@code String.substring(beginIndex, endIndex)}.
 302      * Unicode escape sequences are not translated.
 303      *
 304      * @param beginIndex the beginning index, inclusive.
 305      * @param endIndex the ending index, exclusive.
          * @return a newly allocated array holding the requested chars
 306      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
 307      *         array bounds
          * @throws NegativeArraySizeException if {@code endIndex < beginIndex}
 308      */
 309     public char[] getRawCharacters(int beginIndex, int endIndex) {
 310         int length = endIndex - beginIndex;
 311         char[] chars = new char[length];
 312         System.arraycopy(buf, beginIndex, chars, 0, length);
 313         return chars;
 314     }
 315 }