1 /* 2 * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package com.sun.tools.javac.parser; 27 28 import java.nio.CharBuffer; 29 import java.util.Arrays; 30 31 import com.sun.tools.javac.file.JavacFileManager; 32 import com.sun.tools.javac.resources.CompilerProperties.Errors; 33 import com.sun.tools.javac.util.ArrayUtils; 34 import com.sun.tools.javac.util.Log; 35 import com.sun.tools.javac.util.Name; 36 import com.sun.tools.javac.util.Names; 37 38 import static com.sun.tools.javac.util.LayoutCharacters.*; 39 40 /** The char reader used by the javac lexer/tokenizer. Returns the sequence of 41 * characters contained in the input stream, handling unicode escape accordingly. 42 * Additionally, it provides features for saving chars into a buffer and to retrieve 43 * them at a later stage. 44 * 45 * <p><b>This is NOT part of any supported API. 46 * If you write code that depends on this, you do so at your own risk. 47 * This code and its internal interfaces are subject to change or 48 * deletion without notice.</b> 49 */ 50 public class UnicodeReader { 51 52 /** The input buffer, index of next character to be read, 53 * index of one past last character in buffer. 54 */ 55 protected char[] buf; 56 protected int bp; 57 protected final int buflen; 58 59 /** The current character. 60 */ 61 protected char ch; 62 63 /** The buffer index of the last converted unicode character 64 */ 65 protected int unicodeConversionBp = -1; 66 67 protected Log log; 68 protected Names names; 69 70 /** A character buffer for saved chars. 71 */ 72 protected char[] sbuf = new char[128]; 73 protected int realLength; 74 protected int sp; 75 76 /** 77 * Create a scanner from the input array. This method might 78 * modify the array. To avoid copying the input array, ensure 79 * that {@code inputLength < input.length} or 80 * {@code input[input.length -1]} is a white space character. 81 * 82 * @param sf the factory which created this Scanner 83 * @param buffer the input, might be modified 84 * Must be positive and less than or equal to input.length. 85 */ 86 protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) { 87 this(sf, JavacFileManager.toArray(buffer), buffer.limit()); 88 } 89 90 protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) { 91 log = sf.log; 92 names = sf.names; 93 realLength = inputLength; 94 if (inputLength == input.length) { 95 if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) { 96 inputLength--; 97 } else { 98 input = Arrays.copyOf(input, inputLength + 1); 99 } 100 } 101 buf = input; 102 buflen = inputLength; 103 buf[buflen] = EOI; 104 bp = -1; 105 scanChar(); 106 } 107 108 /** Read next character. 109 */ 110 protected void scanChar() { 111 if (bp < buflen) { 112 ch = buf[++bp]; 113 if (ch == '\\') { 114 convertUnicode(); 115 } 116 } 117 } 118 119 /** Read next character in comment, skipping over double '\' characters. 120 */ 121 protected void scanCommentChar() { 122 scanChar(); 123 if (ch == '\\') { 124 if (peekChar() == '\\' && !isUnicode()) { 125 skipChar(); 126 } else { 127 convertUnicode(); 128 } 129 } 130 } 131 132 /** Append a character to sbuf. 133 */ 134 protected void putChar(char ch, boolean scan) { 135 sbuf = ArrayUtils.ensureCapacity(sbuf, sp); 136 sbuf[sp++] = ch; 137 if (scan) 138 scanChar(); 139 } 140 141 protected void putChar(char ch) { 142 putChar(ch, false); 143 } 144 145 protected void putChar(boolean scan) { 146 putChar(ch, scan); 147 } 148 149 protected void nextChar(boolean skip) { 150 if (!skip) { 151 sbuf = ArrayUtils.ensureCapacity(sbuf, sp); 152 sbuf[sp++] = ch; 153 } 154 155 scanChar(); 156 } 157 158 Name name() { 159 return names.fromChars(sbuf, 0, sp); 160 } 161 162 String chars() { 163 return new String(sbuf, 0, sp); 164 } 165 166 /** Add 'count' copies of the character 'ch' to the string buffer. 167 */ 168 protected void repeat(char ch, int count) { 169 for ( ; 0 < count; count--) { 170 putChar(ch, false); 171 } 172 } 173 174 /** Reset the scan buffer pointer to 'pos'. 175 */ 176 protected void reset(int pos) { 177 bp = pos - 1; 178 scanChar(); 179 } 180 181 /** Convert unicode escape; bp points to initial '\' character 182 * (Spec 3.3). 183 */ 184 protected void convertUnicode() { 185 if (ch == '\\' && unicodeConversionBp != bp ) { 186 bp++; ch = buf[bp]; 187 if (ch == 'u') { 188 do { 189 bp++; ch = buf[bp]; 190 } while (ch == 'u'); 191 int limit = bp + 3; 192 if (limit < buflen) { 193 int d = digit(bp, 16); 194 int code = d; 195 while (bp < limit && d >= 0) { 196 bp++; ch = buf[bp]; 197 d = digit(bp, 16); 198 code = (code << 4) + d; 199 } 200 if (d >= 0) { 201 ch = (char)code; 202 unicodeConversionBp = bp; 203 return; 204 } 205 } 206 log.error(bp, Errors.IllegalUnicodeEsc); 207 } else { 208 bp--; 209 ch = '\\'; 210 } 211 } 212 } 213 214 /** Are surrogates supported? 215 */ 216 final static boolean surrogatesSupported = surrogatesSupported(); 217 private static boolean surrogatesSupported() { 218 try { 219 Character.isHighSurrogate('a'); 220 return true; 221 } catch (NoSuchMethodError ex) { 222 return false; 223 } 224 } 225 226 /** Scan surrogate pairs. If 'ch' is a high surrogate and 227 * the next character is a low surrogate, returns the code point 228 * constructed from these surrogates. Otherwise, returns -1. 229 * This method will not consume any of the characters. 230 */ 231 protected int peekSurrogates() { 232 if (surrogatesSupported && Character.isHighSurrogate(ch)) { 233 char high = ch; 234 int prevBP = bp; 235 236 scanChar(); 237 238 char low = ch; 239 240 ch = high; 241 bp = prevBP; 242 243 if (Character.isLowSurrogate(low)) { 244 return Character.toCodePoint(high, low); 245 } 246 } 247 248 return -1; 249 } 250 251 /** Convert an ASCII digit from its base (8, 10, or 16) 252 * to its value. 253 */ 254 protected int digit(int pos, int base) { 255 char c = ch; 256 if ('0' <= c && c <= '9') 257 return Character.digit(c, base); //a fast common case 258 int codePoint = peekSurrogates(); 259 int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base); 260 if (result >= 0 && c > 0x7f) { 261 log.error(pos + 1, Errors.IllegalNonasciiDigit); 262 if (codePoint >= 0) 263 scanChar(); 264 ch = "0123456789abcdef".charAt(result); 265 } 266 return result; 267 } 268 269 protected boolean isUnicode() { 270 return unicodeConversionBp == bp; 271 } 272 273 protected void skipChar() { 274 bp++; 275 } 276 277 protected char peekChar() { 278 return buf[bp + 1]; 279 } 280 281 /** 282 * Returns a copy of the input buffer, up to its inputLength. 283 * Unicode escape sequences are not translated. 284 */ 285 public char[] getRawCharacters() { 286 char[] chars = new char[buflen]; 287 System.arraycopy(buf, 0, chars, 0, buflen); 288 return chars; 289 } 290 291 /** 292 * Returns a copy of a character array subset of the input buffer. 293 * The returned array begins at the {@code beginIndex} and 294 * extends to the character at index {@code endIndex - 1}. 295 * Thus the length of the substring is {@code endIndex-beginIndex}. 296 * This behavior is like 297 * {@code String.substring(beginIndex, endIndex)}. 298 * Unicode escape sequences are not translated. 299 * 300 * @param beginIndex the beginning index, inclusive. 301 * @param endIndex the ending index, exclusive. 302 * @throws ArrayIndexOutOfBoundsException if either offset is outside of the 303 * array bounds 304 */ 305 public char[] getRawCharacters(int beginIndex, int endIndex) { 306 int length = endIndex - beginIndex; 307 char[] chars = new char[length]; 308 System.arraycopy(buf, beginIndex, chars, 0, length); 309 return chars; 310 } 311 }