/*
 * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.tools.javac.parser;

import java.nio.CharBuffer;
import java.util.Arrays;

import com.sun.tools.javac.file.JavacFileManager;
import com.sun.tools.javac.resources.CompilerProperties.Errors;
import com.sun.tools.javac.util.ArrayUtils;
import com.sun.tools.javac.util.Log;
import com.sun.tools.javac.util.Name;
import com.sun.tools.javac.util.Names;

import static com.sun.tools.javac.util.LayoutCharacters.*;

/** The char reader used by the javac lexer/tokenizer. Returns the sequence of
 *  characters contained in the input stream, handling unicode escape accordingly.
 *  Additionally, it provides features for saving chars into a buffer and to retrieve
 *  them at a later stage.
 *
 *  <p><b>This is NOT part of any supported API.
 *  If you write code that depends on this, you do so at your own risk.
 *  This code and its internal interfaces are subject to change or
 *  deletion without notice.</b>
 */
public class UnicodeReader {

    /** The input buffer. The slot at index {@code buflen} always holds the
     *  {@code EOI} sentinel written by the constructor.
     */
    protected char[] buf;

    /** Index into {@link #buf} of the character currently held in {@link #ch}
     *  (after a converted unicode escape: index of the escape's last hex digit).
     */
    protected int bp;

    /** Index of the {@code EOI} sentinel, i.e. one past the last input character
     *  that is actually scanned.
     */
    protected final int buflen;

    /** The current character.
     */
    protected char ch;

    /** The buffer index of the last converted unicode character,
     *  or -1 if no unicode escape has been converted yet.
     */
    protected int unicodeConversionBp = -1;

    /** Control conversion of unicode characters.
     */
    protected boolean unicodeConversion = true;

    protected Log log;
    protected Names names;

    /** A character buffer for saved chars.
     */
    protected char[] sbuf = new char[128];

    /** The {@code inputLength} passed to the constructor, recorded before the
     *  constructor possibly shortens the scanned length by one.
     */
    protected int realLength;

    /** Number of characters currently saved in {@link #sbuf}.
     */
    protected int sp;

    /**
     * Create a reader from a character buffer. Delegates to
     * {@link #UnicodeReader(ScannerFactory, char[], int)}, passing
     * {@code JavacFileManager.toArray(buffer)} as the input array and
     * {@code buffer.limit()} as the input length.
     *
     * @param sf the factory which created this Scanner
     * @param buffer the input; the array obtained from it might be modified
     */
    protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
        this(sf, JavacFileManager.toArray(buffer), buffer.limit());
    }

    /**
     * Create a reader from the input array. This constructor might
     * modify the array: an {@code EOI} sentinel is stored at the end of the
     * scanned region. To avoid copying the input array, ensure
     * that {@code inputLength < input.length} or
     * {@code input[input.length - 1]} is a white space character (in which
     * case that trailing white space is overwritten by the sentinel).
     *
     * <p>The constructor finishes by calling {@link #scanChar()} so that
     * {@link #ch} holds the first character. Note that {@code scanChar} is
     * overridable and therefore runs before a subclass constructor body.
     *
     * @param sf the factory which created this Scanner
     * @param input the input, might be modified
     * @param inputLength the number of characters of {@code input} to read.
     *        Must be between 0 and {@code input.length} inclusive; any other
     *        value makes the sentinel store fail with an
     *        {@code ArrayIndexOutOfBoundsException}.
     */
    protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
        log = sf.log;
        names = sf.names;
        realLength = inputLength;
        if (inputLength == input.length) {
            // No spare slot for the sentinel: either sacrifice a trailing
            // white space character or grow the array by one.
            if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
                inputLength--;
            } else {
                input = Arrays.copyOf(input, inputLength + 1);
            }
        }
        buf = input;
        buflen = inputLength;
        buf[buflen] = EOI;
        bp = -1;
        scanChar();
    }

    /** Read next character. Does nothing once {@code bp} has reached
     *  {@code buflen}, i.e. the sentinel position.
     */
    protected void scanChar() {
        if (bp < buflen) {
            ch = buf[++bp];
            if (ch == '\\') {
                convertUnicode();
            }
        }
    }

    /** Read next character in comment, skipping over double '\' characters.
     *  A backslash that did not come from a unicode escape and is directly
     *  followed by another backslash causes that second backslash to be skipped.
     */
    protected void scanCommentChar() {
        scanChar();
        if (ch == '\\') {
            if (peekChar() == '\\' && !isUnicode()) {
                skipChar();
            } else {
                convertUnicode();
            }
        }
    }

    /** Append a character to sbuf.
     *
     *  @param ch the character to save
     *  @param scan if true, advance to the next input character afterwards
     */
    protected void putChar(char ch, boolean scan) {
        sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
        sbuf[sp++] = ch;
        if (scan) {
            scanChar();
        }
    }

    /** Append a character to sbuf without advancing the input.
     */
    protected void putChar(char ch) {
        putChar(ch, false);
    }

    /** Append the current character to sbuf.
     *
     *  @param scan if true, advance to the next input character afterwards
     */
    protected void putChar(boolean scan) {
        putChar(ch, scan);
    }

    /** Returns the characters saved so far in sbuf as a {@code Name}.
     */
    Name name() {
        return names.fromChars(sbuf, 0, sp);
    }

    /** Returns the characters saved so far in sbuf as a {@code String}.
     */
    String chars() {
        return new String(sbuf, 0, sp);
    }

    /** Enable or disable unicode escape conversion.
     *
     *  @param newState the new setting
     *  @return the previous setting
     */
    protected boolean setUnicodeConversion(boolean newState) {
        boolean oldState = unicodeConversion;
        unicodeConversion = newState;
        return oldState;
    }

    /** Convert unicode escape; bp points to initial '\' character
     *  (Spec 3.3).
     *
     *  <p>Does nothing unless the current character is a backslash, conversion
     *  is enabled, and the character at {@code bp} is not itself the result of
     *  a previous conversion. On success {@code ch} holds the decoded character
     *  and {@code bp} indexes the last hex digit of the escape. A malformed
     *  escape is reported through {@code log}.
     */
    protected void convertUnicode() {
        if (ch != '\\' || !unicodeConversion || unicodeConversionBp == bp) {
            return;
        }
        ch = buf[++bp];
        if (ch != 'u') {
            // Not a unicode escape after all: step back onto the backslash.
            bp--;
            ch = '\\';
            return;
        }
        // Any number of 'u' characters may follow the backslash.
        do {
            ch = buf[++bp];
        } while (ch == 'u');
        int limit = bp + 3; // index of the fourth (last) hex digit
        if (limit < buflen) {
            int d = digit(bp, 16);
            int code = d;
            while (bp < limit && d >= 0) {
                ch = buf[++bp];
                d = digit(bp, 16);
                code = (code << 4) + d;
            }
            if (d >= 0) {
                ch = (char) code;
                unicodeConversionBp = bp;
                return;
            }
        }
        log.error(bp, Errors.IllegalUnicodeEsc);
    }

    /** Are surrogates supported?
     */
    static final boolean surrogatesSupported = surrogatesSupported();

    /** Probes for {@code Character.isHighSurrogate}; reports false if calling
     *  it raises {@code NoSuchMethodError}.
     */
    private static boolean surrogatesSupported() {
        try {
            Character.isHighSurrogate('a');
            return true;
        } catch (NoSuchMethodError ex) {
            return false;
        }
    }

    /** Scan surrogate pairs.  If 'ch' is a high surrogate and
     *  the next character is a low surrogate, returns the code point
     *  constructed from these surrogates.  Otherwise, returns -1.
     *  This method will not consume any of the characters: {@code ch},
     *  {@code bp} and {@code unicodeConversionBp} are restored before returning.
     */
    protected int peekSurrogates() {
        if (!surrogatesSupported || !Character.isHighSurrogate(ch)) {
            return -1;
        }

        char high = ch;
        int prevBP = bp;
        // The look-ahead below may decode a unicode escape, which overwrites
        // unicodeConversionBp; remember it so the peek leaves no trace.
        int prevUnicodeConversionBp = unicodeConversionBp;

        scanChar();

        char low = ch;

        ch = high;
        bp = prevBP;
        unicodeConversionBp = prevUnicodeConversionBp;

        return Character.isLowSurrogate(low) ? Character.toCodePoint(high, low) : -1;
    }

    /** Convert an ASCII digit from its base (8, 10, or 16)
     *  to its value.
     *
     *  <p>Works on the current character {@code ch}. A non-ASCII character
     *  (or surrogate pair) that {@code Character.digit} accepts is reported as
     *  an error at {@code pos + 1}; in that case a low surrogate is consumed
     *  and {@code ch} is replaced by the equivalent ASCII digit.
     *
     *  @param pos position used for error reporting
     *  @param base the radix
     *  @return the digit value, or a negative number if {@code ch} is not a
     *          digit in the given base
     */
    protected int digit(int pos, int base) {
        char c = ch;
        if ('0' <= c && c <= '9') {
            return Character.digit(c, base); //a fast common case
        }
        int codePoint = peekSurrogates();
        int result = codePoint >= 0
                ? Character.digit(codePoint, base)
                : Character.digit(c, base);
        if (result >= 0 && c > 0x7f) {
            log.error(pos + 1, Errors.IllegalNonasciiDigit);
            if (codePoint >= 0) {
                scanChar();
            }
            ch = "0123456789abcdef".charAt(result);
        }
        return result;
    }

    /** Is the current character the result of a unicode escape conversion?
     */
    protected boolean isUnicode() {
        return unicodeConversionBp == bp;
    }

    /** Advances {@code bp} by one without updating {@code ch}.
     */
    protected void skipChar() {
        bp++;
    }

    /** Returns the raw buffer character following {@code bp}; unicode escapes
     *  are not translated. Only indices up to {@code buflen} are guaranteed to
     *  exist, so callers should not peek once {@code bp} has reached {@code buflen}.
     */
    protected char peekChar() {
        return buf[bp + 1];
    }

    /** Returns the raw buffer character at {@code bp}. This can differ from
     *  {@code ch} when {@code ch} is the result of a unicode escape conversion.
     */
    protected char peekBack() {
        return buf[bp];
    }

    /**
     * Skips consecutive occurrences of the raw buffer character at bp, leaving bp
     * positioned at the last occurrence. {@code ch} is not updated.
     *
     * @return the number of positions bp was advanced, i.e. the number of
     *         additional occurrences following the one bp started on
     */
    protected int skipRepeats() {
        int start = bp;
        while (bp < buflen && buf[bp] == buf[bp + 1]) {
            bp++;
        }
        return bp - start;
    }

    /**
     * Returns a copy of the input buffer, up to (not including) the
     * {@code EOI} sentinel position {@code buflen}.
     * Unicode escape sequences are not translated.
     */
    public char[] getRawCharacters() {
        return Arrays.copyOf(buf, buflen);
    }

    /**
     * Returns a copy of a character array subset of the input buffer.
     * The returned array begins at the {@code beginIndex} and
     * extends to the character at index {@code endIndex - 1}.
     * Thus the length of the substring is {@code endIndex-beginIndex}.
     * This behavior is like
     * {@code String.substring(beginIndex, endIndex)}.
     * Unicode escape sequences are not translated.
     *
     * @param beginIndex the beginning index, inclusive.
     * @param endIndex the ending index, exclusive.
     * @throws IndexOutOfBoundsException if either offset is outside of the
     *         bounds of the underlying array
     * @throws NegativeArraySizeException if {@code endIndex} is smaller than
     *         {@code beginIndex}
     */
    public char[] getRawCharacters(int beginIndex, int endIndex) {
        int length = endIndex - beginIndex;
        char[] chars = new char[length];
        System.arraycopy(buf, beginIndex, chars, 0, length);
        return chars;
    }
}