open Cdiff src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java

src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java

rev 60227 : 8224225: Tokenizer improvements
Reviewed-by: jlaskey


*** 1,7 ****
  /*
!  * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.  Oracle designates this
--- 1,7 ----
  /*
!  * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.  Oracle designates this
*** 23,293 ****
   * questions.
   */
  
  package com.sun.tools.javac.parser;
  
- import java.nio.CharBuffer;
  import java.util.Arrays;
  
- import com.sun.tools.javac.file.JavacFileManager;
  import com.sun.tools.javac.resources.CompilerProperties.Errors;
- import com.sun.tools.javac.util.ArrayUtils;
  import com.sun.tools.javac.util.Log;
- import com.sun.tools.javac.util.Name;
- import com.sun.tools.javac.util.Names;
  
! import static com.sun.tools.javac.util.LayoutCharacters.*;
  
! /** The char reader used by the javac lexer/tokenizer. Returns the sequence of
!  * characters contained in the input stream, handling unicode escape accordingly.
!  * Additionally, it provides features for saving chars into a buffer and to retrieve
!  * them at a later stage.
   *
   *  <p><b>This is NOT part of any supported API.
   *  If you write code that depends on this, you do so at your own risk.
   *  This code and its internal interfaces are subject to change or
!  *  deletion without notice.</b>
   */
  public class UnicodeReader {
  
!     /** The input buffer, index of next character to be read,
!      *  index of one past last character in buffer.
       */
!     protected char[] buf;
!     protected int bp;
!     protected final int buflen;
  
!     /** The current character.
       */
!     protected char ch;
  
!     /** The buffer index of the last converted unicode character
       */
!     protected int unicodeConversionBp = -1;
  
!     protected Log log;
!     protected Names names;
  
!     /** A character buffer for saved chars.
       */
!     protected char[] sbuf = new char[128];
!     protected int realLength;
!     protected int sp;
  
      /**
!      * Create a scanner from the input array.  This method might
!      * modify the array.  To avoid copying the input array, ensure
!      * that {@code inputLength < input.length} or
!      * {@code input[input.length -1]} is a white space character.
       *
!      * @param sf the factory which created this Scanner
!      * @param buffer the input, might be modified
!      * Must be positive and less than or equal to input.length.
       */
!     protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
!         this(sf, JavacFileManager.toArray(buffer), buffer.limit());
      }
  
!     protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
!         log = sf.log;
!         names = sf.names;
!         realLength = inputLength;
!         if (inputLength == input.length) {
!             if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
!                 inputLength--;
!             } else {
!                 input = Arrays.copyOf(input, inputLength + 1);
              }
          }
-         buf = input;
-         buflen = inputLength;
-         buf[buflen] = EOI;
-         bp = -1;
-         scanChar();
      }
  
!     /** Read next character.
       */
!     protected void scanChar() {
!         if (bp < buflen) {
!             ch = buf[++bp];
!             if (ch == '\\') {
!                 convertUnicode();
              }
          }
      }
  
!     /** Read next character in comment, skipping over double '\' characters.
!      */
!     protected void scanCommentChar() {
!         scanChar();
!         if (ch == '\\') {
!             if (peekChar() == '\\' && !isUnicode()) {
!                 skipChar();
              } else {
!                 convertUnicode();
!             }
          }
      }
  
!     /** Append a character to sbuf.
       */
!     protected void putChar(char ch, boolean scan) {
!         sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
!         sbuf[sp++] = ch;
!         if (scan)
!             scanChar();
      }
  
!     protected void putChar(char ch) {
!         putChar(ch, false);
      }
  
!     protected void putChar(boolean scan) {
!         putChar(ch, scan);
      }
  
!     protected void nextChar(boolean skip) {
!         if (!skip) {
!             sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
!             sbuf[sp++] = ch;
          }
  
!         scanChar();
      }
  
!     Name name() {
!         return names.fromChars(sbuf, 0, sp);
      }
  
!     String chars() {
!         return new String(sbuf, 0, sp);
      }
  
!     /** Add 'count' copies of the character 'ch' to the string buffer.
       */
!     protected void repeat(char ch, int count) {
!         for ( ; 0 < count; count--) {
!             putChar(ch, false);
          }
      }
  
!     /** Reset the scan buffer pointer to 'pos'.
       */
!     protected void reset(int pos) {
!         bp = pos - 1;
!         scanChar();
      }
  
!     /** Convert unicode escape; bp points to initial '\' character
!      *  (Spec 3.3).
       */
!     protected void convertUnicode() {
!         if (ch == '\\' && unicodeConversionBp != bp ) {
!             bp++; ch = buf[bp];
!             if (ch == 'u') {
!                 do {
!                     bp++; ch = buf[bp];
!                 } while (ch == 'u');
!                 int limit = bp + 3;
!                 if (limit < buflen) {
!                     int d = digit(bp, 16);
!                     int code = d;
!                     while (bp < limit && d >= 0) {
!                         bp++; ch = buf[bp];
!                         d = digit(bp, 16);
!                         code = (code << 4) + d;
!                     }
!                     if (d >= 0) {
!                         ch = (char)code;
!                         unicodeConversionBp = bp;
!                         return;
                      }
                  }
!                 log.error(bp, Errors.IllegalUnicodeEsc);
!             } else {
!                 bp--;
!                 ch = '\\';
              }
          }
      }
  
!     /** Are surrogates supported?
       */
!     final static boolean surrogatesSupported = surrogatesSupported();
!     private static boolean surrogatesSupported() {
!         try {
!             Character.isHighSurrogate('a');
              return true;
-         } catch (NoSuchMethodError ex) {
-             return false;
          }
      }
  
!     /** Scan surrogate pairs.  If 'ch' is a high surrogate and
!      *  the next character is a low surrogate, returns the code point
!      *  constructed from these surrogates. Otherwise, returns -1.
!      *  This method will not consume any of the characters.
       */
!     protected int peekSurrogates() {
!         if (surrogatesSupported && Character.isHighSurrogate(ch)) {
!             char high = ch;
!             int prevBP = bp;
  
!             scanChar();
  
!             char low = ch;
  
!             ch = high;
!             bp = prevBP;
  
!             if (Character.isLowSurrogate(low)) {
!                 return Character.toCodePoint(high, low);
              }
          }
  
!         return -1;
      }
  
!     /** Convert an ASCII digit from its base (8, 10, or 16)
!      *  to its value.
       */
!     protected int digit(int pos, int base) {
!         char c = ch;
!         if ('0' <= c && c <= '9')
!             return Character.digit(c, base); //a fast common case
!         int codePoint = peekSurrogates();
!         int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
!         if (result >= 0 && c > 0x7f) {
!             log.error(pos + 1, Errors.IllegalNonasciiDigit);
!             if (codePoint >= 0)
!                 scanChar();
!             ch = "0123456789abcdef".charAt(result);
          }
-         return result;
      }
  
!     protected boolean isUnicode() {
!         return unicodeConversionBp == bp;
      }
  
!     protected void skipChar() {
!         bp++;
      }
  
-     protected char peekChar() {
-         return buf[bp + 1];
      }
  
      /**
!      * Returns a copy of the input buffer, up to its inputLength.
!      * Unicode escape sequences are not translated.
       */
      public char[] getRawCharacters() {
!         char[] chars = new char[buflen];
!         System.arraycopy(buf, 0, chars, 0, buflen);
!         return chars;
      }
  
      /**
       * Returns a copy of a character array subset of the input buffer.
       * The returned array begins at the {@code beginIndex} and
--- 23,524 ----
   * questions.
   */
  
  package com.sun.tools.javac.parser;
  
  import java.util.Arrays;
  
  import com.sun.tools.javac.resources.CompilerProperties.Errors;
  import com.sun.tools.javac.util.Log;
  
! import static com.sun.tools.javac.util.LayoutCharacters.EOI;
! import static com.sun.tools.javac.util.LayoutCharacters.tabulate;
  
! /**
!  * The unicode character reader used by the javac/javadoc lexer/tokenizer. Returns characters
!  * one by one as contained in the input stream, handling unicode escape sequences accordingly.
   *
   *  <p><b>This is NOT part of any supported API.
   *  If you write code that depends on this, you do so at your own risk.
   *  This code and its internal interfaces are subject to change or
!  *  deletion without notice.</b></p>
   */
  public class UnicodeReader {
+     /**
+      * Buffer containing characters from source file. May contain extraneous characters
+      * beyond this.length.
+      */
+     private final char[] buffer;
  
!     /**
!      * Length of meaningful content in buffer.
       */
!     private final int length;
  
!     /**
!      * Character buffer index of character currently being observed.
       */
!     private int position;
  
!     /**
!      * Number of characters combined to provide character currently being observed. Typically
!      * one, but may be more when combinations of surrogate pairs and unicode escape sequences
!      * are read.
!      */
!     private int width;
! 
!     /**
!      * Character currently being observed. If a surrogate pair is read then will be the high
!      * member of the pair.
!      */
!     private char character;
! 
!     /**
!      * Codepoint of character currently being observed. Typically equivalent to the character
!      * but will have a value greater than 0xFFFF when a surrogate pair.
       */
!     private int codepoint;
  
!     /**
!      * true if the last character was a backslash. This is used to handle the special case
!      * when a backslash precedes a unicode escape sequence. In that case, the second backslash
!      * is treated as a backslash and not part of a unicode escape sequence.
!      */
!     private boolean wasBackslash;
  
!     /**
!      * Log for error reporting.
       */
!     private final Log log;
  
      /**
!      * Constructor.
       *
!      * @param sf      scan factory.
!      * @param array   array containing contents of source.
!      * @param length  length of meaningful content in buffer.
!      */
!     protected UnicodeReader(ScannerFactory sf, char[] array, int length) {
!         this.buffer = array;
!         this.length = length;
!         this.position = 0;
!         this.width = 0;
!         this.character = '\0';
!         this.codepoint = 0;
!         this.wasBackslash = false;
!         this.log = sf.log;
! 
!         nextCodePoint();
!     }
! 
!     /**
!      * Returns the length of the buffer. This is length of meaningful content in buffer and
!      * not the length of the buffer array.
!      *
!      * @return length of the buffer.
       */
!     protected int length() {
!         return length;
      }
  
!     /**
!      * Return true if current position is past the end of the meaningful part of the buffer.
!      *
!      * @return true if current position is past the end of the meaningful part of the buffer.
!      */
!     protected boolean isEOF() {
!         return position >= length;
      }
+ 
+     /**
+      * Fetches the next 16-bit character from the buffer and places it in this.character.
+      */
+     private void nextCharacter() {
+         // Index of next character in buffer.
+         int index = position + width;
+ 
+         // If past end of buffer.
+         if (length <= index) {
+             // End of file is marked with EOI.
+             character = EOI;
+         } else {
+             // Next character in buffer.
+             character = buffer[index];
+             // Increment length of codepoint.
+             width++;
          }
      }
  
!     /**
!      * Fetches the next 16-bit character from the buffer. If a unicode escape sequence
!      * is detected then converts the unicode escape sequence to a character.
       */
!     private void nextUnicode() {
!         // Position to next codepoint.
!         position += width;
!         // Codepoint has no characters yet.
!         width = 0;
! 
!         // Fetch next character.
!         nextCharacter();
! 
!         // If second backslash is detected.
!         if (wasBackslash) {
!             // Treat like a normal character (not part of unicode escape sequence.)
!             wasBackslash = false;
!         } else if (character == '\\') {
!             // May be a unicode escape sequence.
!             wasBackslash = !unicodeEscape();
          }
+ 
+         // Codepoint and character match if not surrogate.
+         codepoint = (int)character;
      }
+ 
+     /**
+      * Fetches the next code point from the buffer. If a unicode escape sequence is recognized
+      * then converts the unicode escape sequence to a character. If two characters are a surrogate
+      * pair then converts them to a codepoint.
+      */
+     private void nextCodePoint() {
+         // Next unicode character.
+         nextUnicode();
+ 
+         // Return early if ASCII or not a surrogate pair.
+         if (isASCII() || !Character.isHighSurrogate(character)) {
+             return;
          }
  
!         // Capture high surrogate and position.
!         char hi = character;
!         int savePosition = position;
!         int saveWidth = width;
! 
!         // Get potential low surrogate.
!         nextUnicode();
!         char lo = character;
! 
!         if (Character.isLowSurrogate(lo)) {
!             // Start codepoint at start of high surrogate.
!             position = savePosition;
!             width += saveWidth;
!             // Compute codepoint.
!             codepoint = Character.toCodePoint(hi, lo);
          } else {
!             // Restore to treat high surrogate as just a character.
!             position = savePosition;
!             width = saveWidth;
!             character = hi;
!             codepoint = (int)hi;
!             // Could potentially report an error here (old code did not).
          }
      }
  
!     /**
!      * Converts a unicode escape sequence into a character.
!      *
!      * @return true if was a valid escape sequence.
       */
!     private boolean unicodeEscape() {
!         // Start of unicode escape sequence (past backslash.)
!         int start = position + width;
!         int index;
! 
!         // Skip multiple 'u'.
!         for (index = start; index < length; index++) {
!             if (buffer[index] != 'u') {
!                 break;
!             }
          }
  
!         // Needs to be at least backslash-u.
!         if (index != start) {
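!             // If enough characters available: the four hex digits must end strictly
!             // before length. NOTE(review): an escape whose last digit is the final
!             // character (index + 4 == length) is reported as illegal; confirm whether
!             // '<=' was intended.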
!             if (index + 4 < length) {
!                 // Convert four hex digits to codepoint. If any digit is invalid then the
!                 // result is negative.
!                 int code = (Character.digit(buffer[index++], 16) << 12) |
!                            (Character.digit(buffer[index++], 16) << 8) |
!                            (Character.digit(buffer[index++], 16) << 4) |
!                             Character.digit(buffer[index++], 16);
! 
!                 // If all digits are good.
!                 if (code >= 0) {
!                     width = index - position;
!                     character = (char)code;
! 
!                     return true;
!                 }
              }
  
!             // Did not work out.
!             log.error(position, Errors.IllegalUnicodeEsc);
!             width = index - position;
! 
!             return true;
          }
  
!         // Must be just a backslash.
!         character = '\\';
!         width = 1;
! 
!         return false;
      }
  
!     /**
!      * Return the current position in the character buffer.
!      *
!      * @return  current position in the character buffer.
!      */
!     protected int position() {
!         return position;
      }
  
! 
!     /**
!      * Reset the reader to the specified position.
!      * Warning: Do not use when previous character was an ASCII or unicode backslash.
!      * @param pos  position in the character buffer to resume reading from.
!      */
!     protected void reset(int pos) {
!         position = pos;
!         width = 0;
!         wasBackslash = false;
!         nextCodePoint();
      }
  
!     /**
!      * Return the current character at the current position.
!      *
!      * @return current character at the current position.
!      */
!     protected char get() {
!         return character;
      }
  
!     /**
!      * Return the current codepoint at the current position.
!      *
!      * @return current codepoint at the current position.
       */
!     protected int getCodepoint() {
!         return codepoint;
      }
+ 
+     /**
+      * Returns true if the current codepoint is greater than 0xFFFF, i.e. was formed from a
+      * surrogate pair.
+      *
+      * @return true if the current codepoint was formed from a surrogate pair.
+      */
+     protected boolean isSurrogate() {
+         return 0xFFFF < codepoint;
      }
  
!     /**
!      * Returns true if the current character is ASCII.
!      *
!      * @return true if the current character is ASCII.
       */
!     protected boolean isASCII() {
!         return character <= 0x7F;
      }
  
!     /**
!      * Advances the current character to the next character.
!      *
!      * @return next character.
       */
!     protected char next() {
!         nextCodePoint();
! 
!         return character;
      }
+ 
+     /**
+      * Compare character. Returns true if a match.
+      *
+      * @param ch  character to match.
+      *
+      * @return true if a match.
+      */
+     protected boolean is(char ch) {
+         return character == ch;
      }
! 
!     /**
!      * Match one of the arguments. Returns true if a match.
!      */
!     protected boolean isOneOf(char ch1, char ch2) {
!         return is(ch1) || is(ch2);
      }
+     protected boolean isOneOf(char ch1, char ch2, char ch3) {
+         return is(ch1) || is(ch2) || is(ch3);
      }
+     protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
+         return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
      }
  
!     /**
!      * Tests to see if current character is in the range of lo to hi characters (inclusive).
!      *
!      * @param lo  lowest character in range.
!      * @param hi  highest character in range.
!      *
!      * @return true if the current character is in range.
!      */
!     protected boolean inRange(char lo, char hi) {
!         return lo <= character && character <= hi;
!     }
! 
!     /**
!      * Compare character and advance if a match. Returns true if a match.
!      *
!      * @param ch  character to match.
!      *
!      * @return true if a match.
       */
!     protected boolean accept(char ch) {
!         if (is(ch)) {
!             next();
! 
              return true;
          }
+ 
+         return false;
      }
  
!     /**
!      * Match one of the arguments and advance if a match. Returns true if a match.
       */
!     protected boolean acceptOneOf(char ch1, char ch2) {
!         if (isOneOf(ch1, ch2)) {
!             next();
  
!             return true;
!         }
  
!         return false;
!     }
  
!     protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
!         if (isOneOf(ch1, ch2, ch3)) {
!             next();
  
!             return true;
          }
+ 
+         return false;
      }
  
!     /**
!      * Skip over all occurrences of the character.
!      *
!      * @param ch character to accept.
!      */
!     protected void skip(char ch) {
!         while (accept(ch)) {
!             // next
!         }
      }
  
!     /**
!      * Skip over ASCII white space characters (space, tab and form feed).
       */
!     protected void skipWhitespace() {
!         while (acceptOneOf(' ', '\t', '\f')) {
!             // next
          }
      }
  
!     /**
!      * Skip to end of line.
!      */
!     protected void skipToEOLN() {
!         while (!isEOF()) {
!             if (isOneOf('\r', '\n')) {
!                 break;
              }
  
!             next();
          }
  
      }
  
      /**
!      * Compare string and advance if a match. Returns true if a match.
!      * Warning: Do not use when previous character was a backslash
!      * (confuses state of wasBackslash.)
!      *
!      * @param string string to match character for character.
!      *
!      * @return true if a match.
!      */
!     protected boolean accept(String string) {
!         // Quick test.
!         if (string.length() == 0 || !is(string.charAt(0))) {
!             return false;
!         }
! 
!         // Be prepared to retreat if not a match.
!         int savedPosition = position;
! 
!         nextCodePoint();
! 
!         // Check each character.
!         for (int i = 1; i < string.length(); i++) {
!             if (!is(string.charAt(i))) {
!                 // Restart if not a match.
!                 reset(savedPosition);
! 
!                 return false;
!             }
! 
!             nextCodePoint();
!         }
! 
!         return true;
!     }
! 
!     /**
!      * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not
!      * advance character.
!      *
!      * @param pos         starting position.
!      * @param digitRadix  base of number being converted.
!      *
!      * @return value of digit.
!      */
!     protected int digit(int pos, int digitRadix) {
!         int result;
! 
!         // Just an ASCII digit.
!         if (inRange('0', '9')) {
!             // Fast common case.
!             result = character - '0';
! 
!             return result < digitRadix ? result : -1;
!         }
! 
!         // Handle other digits.
!         result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
!                                  Character.digit(character, digitRadix);
! 
!         if (result >= 0 && !isASCII()) {
!             log.error(position(), Errors.IllegalNonasciiDigit);
!             character = "0123456789abcdef".charAt(result);
!         }
! 
!         return result;
!     }
! 
!     /**
!      * Returns the input buffer. Unicode escape sequences are not translated.
!      *
!      * @return the input buffer.
       */
      public char[] getRawCharacters() {
!         return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
      }
  
      /**
       * Returns a copy of a character array subset of the input buffer.
       * The returned array begins at the {@code beginIndex} and
*** 297,311 ****
       * {@code String.substring(beginIndex, endIndex)}.
       * Unicode escape sequences are not translated.
       *
       * @param beginIndex the beginning index, inclusive.
       * @param endIndex the ending index, exclusive.
       * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
       *         array bounds
       */
      public char[] getRawCharacters(int beginIndex, int endIndex) {
!         int length = endIndex - beginIndex;
!         char[] chars = new char[length];
!         System.arraycopy(buf, beginIndex, chars, 0, length);
!         return chars;
      }
  }
--- 528,610 ----
       * {@code String.substring(beginIndex, endIndex)}.
       * Unicode escape sequences are not translated.
       *
       * @param  beginIndex the beginning index, inclusive.
       * @param  endIndex the ending index, exclusive.
+      *
       * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
       *         array bounds
       */
      public char[] getRawCharacters(int beginIndex, int endIndex) {
!         return Arrays.copyOfRange(buffer, beginIndex, endIndex);
      }
+ 
+     /**
+      * This is a specialized version of UnicodeReader that keeps track of the
+      * column position within a given character stream. Used for Javadoc
+      * processing to build a table for mapping positions in the comment string
+      * to positions in the source file.
+      */
+     static class PositionTrackingReader extends UnicodeReader {
+         /**
+          * Offset from the beginning of the original reader buffer.
+          */
+         private int offset;
+ 
+         /**
+          * Current column in the comment.
+          */
+         private int column;
+ 
+         /**
+          * Constructor.
+          *
+          * @param sf      Scan factory.
+          * @param array   Array containing contents of source.
+          * @param offset  Position offset in original source buffer.
+          */
+         protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) {
+             super(sf, array, array.length);
+             this.offset = offset;
+             this.column = 0;
+         }
+ 
+         /**
+          * Advances the current character to the next character. Tracks column.
+          *
+          * @return next character.
+          */
+         @Override
+         protected char next() {
+             super.next();
+ 
+             if (isOneOf('\n', '\r', '\f')) {
+                 column = 0;
+             } else if (is('\t')) {
+                 column = tabulate(column);
+             } else {
+                 column++;
+             }
+ 
+             return get();
+         }
+ 
+         /**
+          * Returns the current column.
+          *
+          * @return  the current column.
+          */
+         protected int column() {
+             return column;
+         }
+ 
+         /**
+          * Returns position relative to the original source buffer.
+          *
+          * @return position relative to the original source buffer.
+          */
+         protected int offsetPosition() {
+             return position() + offset;
+         }
+     }
+ 
  }

< prev index next >