1 /*
   2  * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.tools.javac.parser;
  27 
  28 import java.util.Arrays;
  29 
  30 import com.sun.tools.javac.resources.CompilerProperties.Errors;
  31 import com.sun.tools.javac.util.Log;
  32 
  33 import static com.sun.tools.javac.util.LayoutCharacters.EOI;
  34 import static com.sun.tools.javac.util.LayoutCharacters.tabulate;
  35 
  36 /**
  37  * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters
  38  * one by one as contained in the input stream, handling unicode escape sequences accordingly.
  39  *
  40  *  <p><b>This is NOT part of any supported API.
  41  *  If you write code that depends on this, you do so at your own risk.
  42  *  This code and its internal interfaces are subject to change or
  43  *  deletion without notice.</b></p>
  44  */
  45 public class UnicodeReader {
  46     /**
  47      * Buffer containing characters from source file. May contain extraneous characters
  48      * beyond this.length.
  49      */
  50     private final char[] buffer;
  51 
  52     /**
  53      * Length of meaningful content in buffer.
  54      */
  55     private final int length;
  56 
  57     /**
  58      * Character buffer index of character currently being observed.
  59      */
  60     private int position;
  61 
  62     /**
  63      * Number of characters combined to provide character currently being observed. Typically
  64      * one, but may be more when combinations of surrogate pairs and unicode escape sequences
  65      * are read.
  66      */
  67     private int width;
  68 
  69     /**
  70      * Character currently being observed. If a surrogate pair is read then will be the high
  71      * member of the pair.
  72      */
  73     private char character;
  74 
  75     /**
  76      * Codepoint of character currently being observed. Typically equivalent to the character
  77      * but will have a value greater that 0xFFFF when a surrogate pair.
  78      */
  79     private int codepoint;
  80 
  81     /**
  82      * true if the last character was a backslash. This is used to handle the special case
  83      * when a backslash precedes a unicode escape sequence. In that case, the second backslash
  84      * is treated as a backslash and not part of a unicode escape sequence.
  85      */
  86     private boolean wasBackslash;
  87 
  88     /**
  89      * Log for error reporting.
  90      */
  91     private final Log log;
  92 
  93     /**
  94      * Constructor.
  95      *
  96      * @param sf      scan factory.
  97      * @param array   array containing contents of source.
  98      * @param length  length of meaningful content in buffer.
  99      */
 100     protected UnicodeReader(ScannerFactory sf, char[] array, int length) {
 101         this.buffer = array;
 102         this.length = length;
 103         this.position = 0;
 104         this.width = 0;
 105         this.character = '\0';
 106         this.codepoint = 0;
 107         this.wasBackslash = false;
 108         this.log = sf.log;
 109 
 110         nextCodePoint();
 111     }
 112 
 113     /**
 114      * Returns the length of the buffer. This is length of meaningful content in buffer and
 115      * not the length of the buffer array.
 116      *
 117      * @return length of the buffer.
 118      */
 119     protected int length() {
 120         return length;
 121     }
 122 
 123     /**
 124      * Return true if current position is past the end of the meaningful part of the buffer.
 125      *
 126      * @return true if current position is past the end of the meaningful part of the buffer.
 127      */
 128     protected boolean isEOF() {
 129         return position >= length;
 130     }
 131 
 132     /**
 133      * Fetches the next 16-bit character from the buffer and places it in this.character.
 134      */
 135     private void nextCharacter() {
 136         // Index of next character in buffer.
 137         int index = position + width;
 138 
 139         // If past end of buffer.
 140         if (length <= index) {
 141             // End of file is marked with EOI.
 142             character = EOI;
 143         } else {
 144             // Next character in buffer.
 145             character = buffer[index];
 146             // Increment length of codepoint.
 147             width++;
 148         }
 149     }
 150 
 151     /**
 152      * Fetches the next 16-bit character from the buffer. If an unicode escape sequence
 153      * is detected then converts the unicode escape sequence to a character.
 154      */
 155     private void nextUnicode() {
 156         // Position to next codepoint.
 157         position += width;
 158         // Codepoint has no characters yet.
 159         width = 0;
 160 
 161         // Fetch next character.
 162         nextCharacter();
 163 
 164         // If second backslash is detected.
 165         if (wasBackslash) {
 166             // Treat like a normal character (not part of unicode escape sequence.)
 167             wasBackslash = false;
 168         } else if (character == '\\') {
 169             // May be a unicode escape sequence.
 170             wasBackslash = !unicodeEscape();
 171         }
 172 
 173         // Codepoint and character match if not surrogate.
 174         codepoint = (int)character;
 175     }
 176 
 177     /**
 178      * Fetches the nextcode point from the buffer. If an unicode escape sequence is recognized
 179      * then converts unicode escape sequence to a character. If two characters are a surrogate pair
 180      * then converts to a codepoint.
 181      */
 182     private void nextCodePoint() {
 183         // Next unicode character.
 184         nextUnicode();
 185 
 186         // Return early if ASCII or not a surrogate pair.
 187         if (isASCII() || !Character.isHighSurrogate(character)) {
 188             return;
 189         }
 190 
 191         // Capture high surrogate and position.
 192         char hi = character;
 193         int savePosition = position;
 194         int saveWidth = width;
 195 
 196         // Get potential low surrogate.
 197         nextUnicode();
 198         char lo = character;
 199 
 200         if (Character.isLowSurrogate(lo)) {
 201             // Start codepoint at start of high surrogate.
 202             position = savePosition;
 203             width += saveWidth;
 204             // Compute codepoint.
 205             codepoint = Character.toCodePoint(hi, lo);
 206         } else {
 207             // Restore to treat high surrogate as just a character.
 208             position = savePosition;
 209             width = saveWidth;
 210             character = hi;
 211             codepoint = (int)hi;
 212             // Could potential report an error here (old code did not.)
 213         }
 214     }
 215 
 216     /**
 217      * Converts an unicode escape sequence into a character.
 218      *
 219      * @return true if was a valid escape sequence.
 220      */
 221     private boolean unicodeEscape() {
 222         // Start of unicode escape sequence (past backslash.)
 223         int start = position + width;
 224         int index;
 225 
 226         // Skip multiple 'u'.
 227         for (index = start; index < length; index++) {
 228             if (buffer[index] != 'u') {
 229                 break;
 230             }
 231         }
 232 
 233         // Needs to be at least backslash-u.
 234         if (index != start) {
 235             // If enough characters available.
 236             if (index + 4 < length) {
 237                 // Convert four hex digits to codepoint. If any digit is invalid then the
 238                 // result is negative.
 239                 int code = (Character.digit(buffer[index++], 16) << 12) |
 240                            (Character.digit(buffer[index++], 16) << 8) |
 241                            (Character.digit(buffer[index++], 16) << 4) |
 242                             Character.digit(buffer[index++], 16);
 243 
 244                 // If all digits are good.
 245                 if (code >= 0) {
 246                     width = index - position;
 247                     character = (char)code;
 248 
 249                     return true;
 250                 }
 251             }
 252 
 253             // Did not work out.
 254             log.error(position, Errors.IllegalUnicodeEsc);
 255             width = index - position;
 256 
 257             return true;
 258         }
 259 
 260         // Must be just a backslash.
 261         character = '\\';
 262         width = 1;
 263 
 264         return false;
 265     }
 266 
 267     /**
 268      * Return the current position in the character buffer.
 269      *
 270      * @return  current position in the character buffer.
 271      */
 272     protected int position() {
 273         return position;
 274     }
 275 
 276 
 277     /**
 278      * Reset the reader to the specified position.
 279      * Warning: Do not use when previous character was an ASCII or unicode backslash.
 280      * @param pos
 281      */
 282     protected void reset(int pos) {
 283         position = pos;
 284         width = 0;
 285         wasBackslash = false;
 286         nextCodePoint();
 287     }
 288 
 289     /**
 290      * Return the current character in at the current position.
 291      *
 292      * @return current character in at the current position.
 293      */
 294     protected char get() {
 295         return character;
 296     }
 297 
 298     /**
 299      * Return the current codepoint in at the current position.
 300      *
 301      * @return current codepoint in at the current position.
 302      */
 303     protected int getCodepoint() {
 304         return codepoint;
 305     }
 306 
 307     /**
 308      * Returns true if the current codepoint is a surrogate.
 309      *
 310      * @return true if the current codepoint is a surrogate.
 311      */
 312     protected boolean isSurrogate() {
 313         return 0xFFFF < codepoint;
 314     }
 315 
 316     /**
 317      * Returns true if the current character is ASCII.
 318      *
 319      * @return true if the current character is ASCII.
 320      */
 321     protected boolean isASCII() {
 322         return character <= 0x7F;
 323     }
 324 
 325     /**
 326      * Advances the current character to the next character.
 327      *
 328      * @return next character.
 329      */
 330     protected char next() {
 331         nextCodePoint();
 332 
 333         return character;
 334     }
 335 
 336     /**
 337      * Compare character. Returns true if a match.
 338      *
 339      * @param ch  character to match.
 340      *
 341      * @return true if a match.
 342      */
 343     protected boolean is(char ch) {
 344         return character == ch;
 345     }
 346 
 347     /**
 348      * Match one of the arguments. Returns true if a match.
 349      */
 350     protected boolean isOneOf(char ch1, char ch2) {
 351         return is(ch1) || is(ch2);
 352     }
 353     protected boolean isOneOf(char ch1, char ch2, char ch3) {
 354         return is(ch1) || is(ch2) || is(ch3);
 355     }
 356     protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
 357         return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
 358     }
 359 
 360     /**
 361      * Tests to see if current character is in the range of lo to hi characters (inclusive).
 362      *
 363      * @param lo  lowest character in range.
 364      * @param hi  highest character in range.
 365      *
 366      * @return true if the current character is in range.
 367      */
 368     protected boolean inRange(char lo, char hi) {
 369         return lo <= character && character <= hi;
 370     }
 371 
 372     /**
 373      * Compare character and advance if a match. Returns true if a match.
 374      *
 375      * @param ch  character to match.
 376      *
 377      * @return true if a match.
 378      */
 379     protected boolean accept(char ch) {
 380         if (is(ch)) {
 381             next();
 382 
 383             return true;
 384         }
 385 
 386         return false;
 387     }
 388 
 389     /**
 390      * Match one of the arguments and advance if a match. Returns true if a match.
 391      */
 392     protected boolean acceptOneOf(char ch1, char ch2) {
 393         if (isOneOf(ch1, ch2)) {
 394             next();
 395 
 396             return true;
 397         }
 398 
 399         return false;
 400     }
 401 
 402     protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
 403         if (isOneOf(ch1, ch2, ch3)) {
 404             next();
 405 
 406             return true;
 407         }
 408 
 409         return false;
 410     }
 411 
 412     /**
 413      * Skip over all occurances of character.
 414      *
 415      * @param ch character to accept.
 416      */
 417     protected void skip(char ch) {
 418         while (accept(ch)) {
 419             // next
 420         }
 421     }
 422 
 423     /**
 424      * Skip over ASCII white space characters.
 425      */
 426     protected void skipWhitespace() {
 427         while (acceptOneOf(' ', '\t', '\f')) {
 428             // next
 429         }
 430     }
 431 
 432     /**
 433      * Skip to end of line.
 434      */
 435     protected void skipToEOLN() {
 436         while (!isEOF()) {
 437             if (isOneOf('\r', '\n')) {
 438                 break;
 439             }
 440 
 441             next();
 442         }
 443 
 444     }
 445 
 446     /**
 447      * Compare string and advance if a match. Returns true if a match.
 448      * Warning: Do not use when previous character was a backslash
 449      * (confuses state of wasBackslash.)
 450      *
 451      * @param string string to match character for character.
 452      *
 453      * @return true if a match.
 454      */
 455     protected boolean accept(String string) {
 456         // Quick test.
 457         if (string.length() == 0 || !is(string.charAt(0))) {
 458             return false;
 459         }
 460 
 461         // Be prepared to retreat if not a match.
 462         int savedPosition = position;
 463 
 464         nextCodePoint();
 465 
 466         // Check each character.
 467         for (int i = 1; i < string.length(); i++) {
 468             if (!is(string.charAt(i))) {
 469                 // Restart if not a match.
 470                 reset(savedPosition);
 471 
 472                 return false;
 473             }
 474 
 475             nextCodePoint();
 476         }
 477 
 478         return true;
 479     }
 480 
 481     /**
 482      * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not
 483      * advance character.
 484      *
 485      * @param pos         starting position.
 486      * @param digitRadix  base of number being converted.
 487      *
 488      * @return value of digit.
 489      */
 490     protected int digit(int pos, int digitRadix) {
 491         int result;
 492 
 493         // Just an ASCII digit.
 494         if (inRange('0', '9')) {
 495             // Fast common case.
 496             result = character - '0';
 497 
 498             return result < digitRadix ? result : -1;
 499         }
 500 
 501         // Handle other digits.
 502         result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
 503                                  Character.digit(character, digitRadix);
 504 
 505         if (result >= 0 && !isASCII()) {
 506             log.error(position(), Errors.IllegalNonasciiDigit);
 507             character = "0123456789abcdef".charAt(result);
 508         }
 509 
 510         return result;
 511     }
 512 
 513     /**
 514      * Returns the input buffer. Unicode escape sequences are not translated.
 515      *
 516      * @return the input buffer.
 517      */
 518     public char[] getRawCharacters() {
 519         return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
 520     }
 521 
 522     /**
 523      * Returns a copy of a character array subset of the input buffer.
 524      * The returned array begins at the {@code beginIndex} and
 525      * extends to the character at index {@code endIndex - 1}.
 526      * Thus the length of the substring is {@code endIndex-beginIndex}.
 527      * This behavior is like
 528      * {@code String.substring(beginIndex, endIndex)}.
 529      * Unicode escape sequences are not translated.
 530      *
 531      * @param  beginIndex the beginning index, inclusive.
 532      * @param  endIndex the ending index, exclusive.
 533      *
 534      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
 535      *         array bounds
 536      */
 537     public char[] getRawCharacters(int beginIndex, int endIndex) {
 538         return Arrays.copyOfRange(buffer, beginIndex, endIndex);
 539     }
 540 
 541     /**
 542      * This is a specialized version of UnicodeReader that keeps track of the
 543      * column position within a given character stream. Used for Javadoc
 544      * processing to build a table for mapping positions in the comment string
 545      * to positions in the source file.
 546      */
 547     static class PositionTrackingReader extends UnicodeReader {
 548         /**
 549          * Offset from the beginning of the original reader buffer.
 550          */
 551         private int offset;
 552 
 553         /**
 554          * Current column in the comment.
 555          */
 556         private int column;
 557 
 558         /**
 559          * Constructor.
 560          *
 561          * @param sf      Scan factory.
 562          * @param array   Array containing contents of source.
 563          * @param offset  Position offset in original source buffer.
 564          */
 565         protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) {
 566             super(sf, array, array.length);
 567             this.offset = offset;
 568             this.column = 0;
 569         }
 570 
 571         /**
 572          * Advances the current character to the next character. Tracks column.
 573          *
 574          * @return next character.
 575          */
 576         @Override
 577         protected char next() {
 578             super.next();
 579 
 580             if (isOneOf('\n', '\r', '\f')) {
 581                 column = 0;
 582             } else if (is('\t')) {
 583                 column = tabulate(column);
 584             } else {
 585                 column++;
 586             }
 587 
 588             return get();
 589         }
 590 
 591         /**
 592          * Returns the current column.
 593          *
 594          * @return  the current column.
 595          */
 596         protected int column() {
 597             return column;
 598         }
 599 
 600         /**
 601          * Returns position relative to the original source buffer.
 602          *
 603          * @return
 604          */
 605         protected int offsetPosition() {
 606             return position() + offset;
 607         }
 608     }
 609 
 610 }