1 /* 2 * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package com.sun.tools.javac.parser; 27 28 import java.util.Arrays; 29 30 import com.sun.tools.javac.resources.CompilerProperties.Errors; 31 import com.sun.tools.javac.util.Log; 32 33 import static com.sun.tools.javac.util.LayoutCharacters.EOI; 34 import static com.sun.tools.javac.util.LayoutCharacters.tabulate; 35 36 /** 37 * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters 38 * one by one as contained in the input stream, handling unicode escape sequences accordingly. 39 * 40 * <p><b>This is NOT part of any supported API. 41 * If you write code that depends on this, you do so at your own risk. 42 * This code and its internal interfaces are subject to change or 43 * deletion without notice.</b></p> 44 */ 45 public class UnicodeReader { 46 /** 47 * Buffer containing characters from source file. May contain extraneous characters 48 * beyond this.length. 49 */ 50 private final char[] buffer; 51 52 /** 53 * Length of meaningful content in buffer. 54 */ 55 private final int length; 56 57 /** 58 * Character buffer index of character currently being observed. 59 */ 60 private int position; 61 62 /** 63 * Number of characters combined to provide character currently being observed. Typically 64 * one, but may be more when combinations of surrogate pairs and unicode escape sequences 65 * are read. 66 */ 67 private int width; 68 69 /** 70 * Character currently being observed. If a surrogate pair is read then will be the high 71 * member of the pair. 72 */ 73 private char character; 74 75 /** 76 * Codepoint of character currently being observed. Typically equivalent to the character 77 * but will have a value greater that 0xFFFF when a surrogate pair. 78 */ 79 private int codepoint; 80 81 /** 82 * true if the last character was a backslash. This is used to handle the special case 83 * when a backslash precedes a unicode escape sequence. In that case, the second backslash 84 * is treated as a backslash and not part of a unicode escape sequence. 85 */ 86 private boolean wasBackslash; 87 88 /** 89 * Log for error reporting. 90 */ 91 private final Log log; 92 93 /** 94 * Constructor. 95 * 96 * @param sf scan factory. 97 * @param array array containing contents of source. 98 * @param length length of meaningful content in buffer. 99 */ 100 protected UnicodeReader(ScannerFactory sf, char[] array, int length) { 101 this.buffer = array; 102 this.length = length; 103 this.position = 0; 104 this.width = 0; 105 this.character = '\0'; 106 this.codepoint = 0; 107 this.wasBackslash = false; 108 this.log = sf.log; 109 110 nextCodePoint(); 111 } 112 113 /** 114 * Returns the length of the buffer. This is length of meaningful content in buffer and 115 * not the length of the buffer array. 116 * 117 * @return length of the buffer. 118 */ 119 protected int length() { 120 return length; 121 } 122 123 /** 124 * Return true if current position is past the end of the meaningful part of the buffer. 125 * 126 * @return true if current position is past the end of the meaningful part of the buffer. 127 */ 128 protected boolean isEOF() { 129 return position >= length; 130 } 131 132 /** 133 * Fetches the next 16-bit character from the buffer and places it in this.character. 134 */ 135 private void nextCharacter() { 136 // Index of next character in buffer. 137 int index = position + width; 138 139 // If past end of buffer. 140 if (length <= index) { 141 // End of file is marked with EOI. 142 character = EOI; 143 } else { 144 // Next character in buffer. 145 character = buffer[index]; 146 // Increment length of codepoint. 147 width++; 148 } 149 } 150 151 /** 152 * Fetches the next 16-bit character from the buffer. If an unicode escape sequence 153 * is detected then converts the unicode escape sequence to a character. 154 */ 155 private void nextUnicode() { 156 // Position to next codepoint. 157 position += width; 158 // Codepoint has no characters yet. 159 width = 0; 160 161 // Fetch next character. 162 nextCharacter(); 163 164 // If second backslash is detected. 165 if (wasBackslash) { 166 // Treat like a normal character (not part of unicode escape sequence.) 167 wasBackslash = false; 168 } else if (character == '\\') { 169 // May be a unicode escape sequence. 170 wasBackslash = !unicodeEscape(); 171 } 172 173 // Codepoint and character match if not surrogate. 174 codepoint = (int)character; 175 } 176 177 /** 178 * Fetches the nextcode point from the buffer. If an unicode escape sequence is recognized 179 * then converts unicode escape sequence to a character. If two characters are a surrogate pair 180 * then converts to a codepoint. 181 */ 182 private void nextCodePoint() { 183 // Next unicode character. 184 nextUnicode(); 185 186 // Return early if ASCII or not a surrogate pair. 187 if (isASCII() || !Character.isHighSurrogate(character)) { 188 return; 189 } 190 191 // Capture high surrogate and position. 192 char hi = character; 193 int savePosition = position; 194 int saveWidth = width; 195 196 // Get potential low surrogate. 197 nextUnicode(); 198 char lo = character; 199 200 if (Character.isLowSurrogate(lo)) { 201 // Start codepoint at start of high surrogate. 202 position = savePosition; 203 width += saveWidth; 204 // Compute codepoint. 205 codepoint = Character.toCodePoint(hi, lo); 206 } else { 207 // Restore to treat high surrogate as just a character. 208 position = savePosition; 209 width = saveWidth; 210 character = hi; 211 codepoint = (int)hi; 212 // Could potential report an error here (old code did not.) 213 } 214 } 215 216 /** 217 * Converts an unicode escape sequence into a character. 218 * 219 * @return true if was a valid escape sequence. 220 */ 221 private boolean unicodeEscape() { 222 // Start of unicode escape sequence (past backslash.) 223 int start = position + width; 224 int index; 225 226 // Skip multiple 'u'. 227 for (index = start; index < length; index++) { 228 if (buffer[index] != 'u') { 229 break; 230 } 231 } 232 233 // Needs to be at least backslash-u. 234 if (index != start) { 235 // If enough characters available. 236 if (index + 4 < length) { 237 // Convert four hex digits to codepoint. If any digit is invalid then the 238 // result is negative. 239 int code = (Character.digit(buffer[index++], 16) << 12) | 240 (Character.digit(buffer[index++], 16) << 8) | 241 (Character.digit(buffer[index++], 16) << 4) | 242 Character.digit(buffer[index++], 16); 243 244 // If all digits are good. 245 if (code >= 0) { 246 width = index - position; 247 character = (char)code; 248 249 return true; 250 } 251 } 252 253 // Did not work out. 254 log.error(position, Errors.IllegalUnicodeEsc); 255 width = index - position; 256 257 return true; 258 } 259 260 // Must be just a backslash. 261 character = '\\'; 262 width = 1; 263 264 return false; 265 } 266 267 /** 268 * Return the current position in the character buffer. 269 * 270 * @return current position in the character buffer. 271 */ 272 protected int position() { 273 return position; 274 } 275 276 277 /** 278 * Reset the reader to the specified position. 279 * Warning: Do not use when previous character was an ASCII or unicode backslash. 280 * @param pos 281 */ 282 protected void reset(int pos) { 283 position = pos; 284 width = 0; 285 wasBackslash = false; 286 nextCodePoint(); 287 } 288 289 /** 290 * Return the current character in at the current position. 291 * 292 * @return current character in at the current position. 293 */ 294 protected char get() { 295 return character; 296 } 297 298 /** 299 * Return the current codepoint in at the current position. 300 * 301 * @return current codepoint in at the current position. 302 */ 303 protected int getCodepoint() { 304 return codepoint; 305 } 306 307 /** 308 * Returns true if the current codepoint is a surrogate. 309 * 310 * @return true if the current codepoint is a surrogate. 311 */ 312 protected boolean isSurrogate() { 313 return 0xFFFF < codepoint; 314 } 315 316 /** 317 * Returns true if the current character is ASCII. 318 * 319 * @return true if the current character is ASCII. 320 */ 321 protected boolean isASCII() { 322 return character <= 0x7F; 323 } 324 325 /** 326 * Advances the current character to the next character. 327 * 328 * @return next character. 329 */ 330 protected char next() { 331 nextCodePoint(); 332 333 return character; 334 } 335 336 /** 337 * Compare character. Returns true if a match. 338 * 339 * @param ch character to match. 340 * 341 * @return true if a match. 342 */ 343 protected boolean is(char ch) { 344 return character == ch; 345 } 346 347 /** 348 * Match one of the arguments. Returns true if a match. 349 */ 350 protected boolean isOneOf(char ch1, char ch2) { 351 return is(ch1) || is(ch2); 352 } 353 protected boolean isOneOf(char ch1, char ch2, char ch3) { 354 return is(ch1) || is(ch2) || is(ch3); 355 } 356 protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) { 357 return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6); 358 } 359 360 /** 361 * Tests to see if current character is in the range of lo to hi characters (inclusive). 362 * 363 * @param lo lowest character in range. 364 * @param hi highest character in range. 365 * 366 * @return true if the current character is in range. 367 */ 368 protected boolean inRange(char lo, char hi) { 369 return lo <= character && character <= hi; 370 } 371 372 /** 373 * Compare character and advance if a match. Returns true if a match. 374 * 375 * @param ch character to match. 376 * 377 * @return true if a match. 378 */ 379 protected boolean accept(char ch) { 380 if (is(ch)) { 381 next(); 382 383 return true; 384 } 385 386 return false; 387 } 388 389 /** 390 * Match one of the arguments and advance if a match. Returns true if a match. 391 */ 392 protected boolean acceptOneOf(char ch1, char ch2) { 393 if (isOneOf(ch1, ch2)) { 394 next(); 395 396 return true; 397 } 398 399 return false; 400 } 401 402 protected boolean acceptOneOf(char ch1, char ch2, char ch3) { 403 if (isOneOf(ch1, ch2, ch3)) { 404 next(); 405 406 return true; 407 } 408 409 return false; 410 } 411 412 /** 413 * Skip over all occurances of character. 414 * 415 * @param ch character to accept. 416 */ 417 protected void skip(char ch) { 418 while (accept(ch)) { 419 // next 420 } 421 } 422 423 /** 424 * Skip over ASCII white space characters. 425 */ 426 protected void skipWhitespace() { 427 while (acceptOneOf(' ', '\t', '\f')) { 428 // next 429 } 430 } 431 432 /** 433 * Skip to end of line. 434 */ 435 protected void skipToEOLN() { 436 while (!isEOF()) { 437 if (isOneOf('\r', '\n')) { 438 break; 439 } 440 441 next(); 442 } 443 444 } 445 446 /** 447 * Compare string and advance if a match. Returns true if a match. 448 * Warning: Do not use when previous character was a backslash 449 * (confuses state of wasBackslash.) 450 * 451 * @param string string to match character for character. 452 * 453 * @return true if a match. 454 */ 455 protected boolean accept(String string) { 456 // Quick test. 457 if (string.length() == 0 || !is(string.charAt(0))) { 458 return false; 459 } 460 461 // Be prepared to retreat if not a match. 462 int savedPosition = position; 463 464 nextCodePoint(); 465 466 // Check each character. 467 for (int i = 1; i < string.length(); i++) { 468 if (!is(string.charAt(i))) { 469 // Restart if not a match. 470 reset(savedPosition); 471 472 return false; 473 } 474 475 nextCodePoint(); 476 } 477 478 return true; 479 } 480 481 /** 482 * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not 483 * advance character. 484 * 485 * @param pos starting position. 486 * @param digitRadix base of number being converted. 487 * 488 * @return value of digit. 489 */ 490 protected int digit(int pos, int digitRadix) { 491 int result; 492 493 // Just an ASCII digit. 494 if (inRange('0', '9')) { 495 // Fast common case. 496 result = character - '0'; 497 498 return result < digitRadix ? result : -1; 499 } 500 501 // Handle other digits. 502 result = isSurrogate() ? Character.digit(codepoint, digitRadix) : 503 Character.digit(character, digitRadix); 504 505 if (result >= 0 && !isASCII()) { 506 log.error(position(), Errors.IllegalNonasciiDigit); 507 character = "0123456789abcdef".charAt(result); 508 } 509 510 return result; 511 } 512 513 /** 514 * Returns the input buffer. Unicode escape sequences are not translated. 515 * 516 * @return the input buffer. 517 */ 518 public char[] getRawCharacters() { 519 return length == buffer.length ? buffer : Arrays.copyOf(buffer, length); 520 } 521 522 /** 523 * Returns a copy of a character array subset of the input buffer. 524 * The returned array begins at the {@code beginIndex} and 525 * extends to the character at index {@code endIndex - 1}. 526 * Thus the length of the substring is {@code endIndex-beginIndex}. 527 * This behavior is like 528 * {@code String.substring(beginIndex, endIndex)}. 529 * Unicode escape sequences are not translated. 530 * 531 * @param beginIndex the beginning index, inclusive. 532 * @param endIndex the ending index, exclusive. 533 * 534 * @throws ArrayIndexOutOfBoundsException if either offset is outside of the 535 * array bounds 536 */ 537 public char[] getRawCharacters(int beginIndex, int endIndex) { 538 return Arrays.copyOfRange(buffer, beginIndex, endIndex); 539 } 540 541 /** 542 * This is a specialized version of UnicodeReader that keeps track of the 543 * column position within a given character stream. Used for Javadoc 544 * processing to build a table for mapping positions in the comment string 545 * to positions in the source file. 546 */ 547 static class PositionTrackingReader extends UnicodeReader { 548 /** 549 * Offset from the beginning of the original reader buffer. 550 */ 551 private int offset; 552 553 /** 554 * Current column in the comment. 555 */ 556 private int column; 557 558 /** 559 * Constructor. 560 * 561 * @param sf Scan factory. 562 * @param array Array containing contents of source. 563 * @param offset Position offset in original source buffer. 564 */ 565 protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) { 566 super(sf, array, array.length); 567 this.offset = offset; 568 this.column = 0; 569 } 570 571 /** 572 * Advances the current character to the next character. Tracks column. 573 * 574 * @return next character. 575 */ 576 @Override 577 protected char next() { 578 super.next(); 579 580 if (isOneOf('\n', '\r', '\f')) { 581 column = 0; 582 } else if (is('\t')) { 583 column = tabulate(column); 584 } else { 585 column++; 586 } 587 588 return get(); 589 } 590 591 /** 592 * Returns the current column. 593 * 594 * @return the current column. 595 */ 596 protected int column() { 597 return column; 598 } 599 600 /** 601 * Returns position relative to the original source buffer. 602 * 603 * @return 604 */ 605 protected int offsetPosition() { 606 return position() + offset; 607 } 608 } 609 610 }