1 /*
   2  * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.tools.javac.parser;
  27 
  28 import java.nio.CharBuffer;
  29 import java.util.Arrays;
  30 
  31 import com.sun.tools.javac.file.JavacFileManager;
  32 import com.sun.tools.javac.resources.CompilerProperties.Errors;
  33 import com.sun.tools.javac.util.ArrayUtils;
  34 import com.sun.tools.javac.util.Log;
  35 import com.sun.tools.javac.util.Name;
  36 import com.sun.tools.javac.util.Names;
  37 
  38 import static com.sun.tools.javac.util.LayoutCharacters.*;
  39 
  40 /** The char reader used by the javac lexer/tokenizer. Returns the sequence of
  41  * characters contained in the input stream, handling unicode escape accordingly.
  42  * Additionally, it provides features for saving chars into a buffer and to retrieve
  43  * them at a later stage.
  44  *
  45  *  <p><b>This is NOT part of any supported API.
  46  *  If you write code that depends on this, you do so at your own risk.
  47  *  This code and its internal interfaces are subject to change or
  48  *  deletion without notice.</b>
  49  */
  50 public class UnicodeReader {
  51 
  52     /** The input buffer, index of next character to be read,
  53      *  index of one past last character in buffer.
  54      */
  55     protected char[] buf;
  56     protected int bp;
  57     protected final int buflen;
  58 
  59     /** The current character.
  60      */
  61     protected char ch;
  62 
  63     /** The buffer index of the last converted unicode character
  64      */
  65     protected int unicodeConversionBp = -1;
  66 
  67     protected Log log;
  68     protected Names names;
  69 
  70     /** A character buffer for saved chars.
  71      */
  72     protected char[] sbuf = new char[128];
  73     protected int realLength;
  74     protected int sp;
  75 
  76     /**
  77      * Create a scanner from the input array.  This method might
  78      * modify the array.  To avoid copying the input array, ensure
  79      * that {@code inputLength < input.length} or
  80      * {@code input[input.length -1]} is a white space character.
  81      *
  82      * @param sf the factory which created this Scanner
  83      * @param buffer the input, might be modified
  84      * Must be positive and less than or equal to input.length.
  85      */
  86     protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
  87         this(sf, JavacFileManager.toArray(buffer), buffer.limit());
  88     }
  89 
  90     protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
  91         log = sf.log;
  92         names = sf.names;
  93         realLength = inputLength;
  94         if (inputLength == input.length) {
  95             if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
  96                 inputLength--;
  97             } else {
  98                 input = Arrays.copyOf(input, inputLength + 1);
  99             }
 100         }
 101         buf = input;
 102         buflen = inputLength;
 103         buf[buflen] = EOI;
 104         bp = -1;
 105         scanChar();
 106     }
 107 
 108     /** Read next character.
 109      */
 110     protected void scanChar() {
 111         if (bp < buflen) {
 112             ch = buf[++bp];
 113             if (ch == '\\') {
 114                 convertUnicode();
 115             }
 116         }
 117     }
 118 
 119     /** Read next character in comment, skipping over double '\' characters.
 120      */
 121     protected void scanCommentChar() {
 122         scanChar();
 123         if (ch == '\\') {
 124             if (peekChar() == '\\' && !isUnicode()) {
 125                 skipChar();
 126             } else {
 127                 convertUnicode();
 128             }
 129         }
 130     }
 131 
 132     /** Append a character to sbuf.
 133      */
 134     protected void putChar(char ch, boolean scan) {
 135         sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
 136         sbuf[sp++] = ch;
 137         if (scan)
 138             scanChar();
 139     }
 140 
 141     protected void putChar(char ch) {
 142         putChar(ch, false);
 143     }
 144 
 145     protected void putChar(boolean scan) {
 146         putChar(ch, scan);
 147     }
 148 
 149     protected void nextChar(boolean skip) {
 150         if (!skip) {
 151             sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
 152             sbuf[sp++] = ch;
 153         }
 154 
 155         scanChar();
 156     }
 157 
 158     Name name() {
 159         return names.fromChars(sbuf, 0, sp);
 160     }
 161 
 162     String chars() {
 163         return new String(sbuf, 0, sp);
 164     }
 165 
 166     /** Add 'count' copies of the character 'ch' to the string buffer.
 167      */
 168     protected void repeat(char ch, int count) {
 169         for ( ; 0 < count; count--) {
 170             putChar(ch, false);
 171         }
 172     }
 173 
 174     /** Reset the scan buffer pointer to 'pos'.
 175      */
 176     protected void reset(int pos) {
 177         bp = pos - 1;
 178         scanChar();
 179     }
 180 
 181     /** Convert unicode escape; bp points to initial '\' character
 182      *  (Spec 3.3).
 183      */
 184     protected void convertUnicode() {
 185         if (ch == '\\' && unicodeConversionBp != bp ) {
 186             bp++; ch = buf[bp];
 187             if (ch == 'u') {
 188                 do {
 189                     bp++; ch = buf[bp];
 190                 } while (ch == 'u');
 191                 int limit = bp + 3;
 192                 if (limit < buflen) {
 193                     int d = digit(bp, 16);
 194                     int code = d;
 195                     while (bp < limit && d >= 0) {
 196                         bp++; ch = buf[bp];
 197                         d = digit(bp, 16);
 198                         code = (code << 4) + d;
 199                     }
 200                     if (d >= 0) {
 201                         ch = (char)code;
 202                         unicodeConversionBp = bp;
 203                         return;
 204                     }
 205                 }
 206                 log.error(bp, Errors.IllegalUnicodeEsc);
 207             } else {
 208                 bp--;
 209                 ch = '\\';
 210             }
 211         }
 212     }
 213 
 214     /** Are surrogates supported?
 215      */
 216     final static boolean surrogatesSupported = surrogatesSupported();
 217     private static boolean surrogatesSupported() {
 218         try {
 219             Character.isHighSurrogate('a');
 220             return true;
 221         } catch (NoSuchMethodError ex) {
 222             return false;
 223         }
 224     }
 225 
 226     /** Scan surrogate pairs.  If 'ch' is a high surrogate and
 227      *  the next character is a low surrogate, returns the code point
 228      *  constructed from these surrogates. Otherwise, returns -1.
 229      *  This method will not consume any of the characters.
 230      */
 231     protected int peekSurrogates() {
 232         if (surrogatesSupported && Character.isHighSurrogate(ch)) {
 233             char high = ch;
 234             int prevBP = bp;
 235 
 236             scanChar();
 237 
 238             char low = ch;
 239 
 240             ch = high;
 241             bp = prevBP;
 242 
 243             if (Character.isLowSurrogate(low)) {
 244                 return Character.toCodePoint(high, low);
 245             }
 246         }
 247 
 248         return -1;
 249     }
 250 
 251     /** Convert an ASCII digit from its base (8, 10, or 16)
 252      *  to its value.
 253      */
 254     protected int digit(int pos, int base) {
 255         char c = ch;
 256         if ('0' <= c && c <= '9')
 257             return Character.digit(c, base); //a fast common case
 258         int codePoint = peekSurrogates();
 259         int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
 260         if (result >= 0 && c > 0x7f) {
 261             log.error(pos + 1, Errors.IllegalNonasciiDigit);
 262             if (codePoint >= 0)
 263                 scanChar();
 264             ch = "0123456789abcdef".charAt(result);
 265         }
 266         return result;
 267     }
 268 
 269     protected boolean isUnicode() {
 270         return unicodeConversionBp == bp;
 271     }
 272 
 273     protected void skipChar() {
 274         bp++;
 275     }
 276 
 277     protected char peekChar() {
 278         return buf[bp + 1];
 279     }
 280 
 281     /**
 282      * Returns a copy of the input buffer, up to its inputLength.
 283      * Unicode escape sequences are not translated.
 284      */
 285     public char[] getRawCharacters() {
 286         char[] chars = new char[buflen];
 287         System.arraycopy(buf, 0, chars, 0, buflen);
 288         return chars;
 289     }
 290 
 291     /**
 292      * Returns a copy of a character array subset of the input buffer.
 293      * The returned array begins at the {@code beginIndex} and
 294      * extends to the character at index {@code endIndex - 1}.
 295      * Thus the length of the substring is {@code endIndex-beginIndex}.
 296      * This behavior is like
 297      * {@code String.substring(beginIndex, endIndex)}.
 298      * Unicode escape sequences are not translated.
 299      *
 300      * @param beginIndex the beginning index, inclusive.
 301      * @param endIndex the ending index, exclusive.
 302      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
 303      *         array bounds
 304      */
 305     public char[] getRawCharacters(int beginIndex, int endIndex) {
 306         int length = endIndex - beginIndex;
 307         char[] chars = new char[length];
 308         System.arraycopy(buf, beginIndex, chars, 0, length);
 309         return chars;
 310     }
 311 }