open Udiff src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java

src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java

rev 60227 : 8224225: Tokenizer improvements
Reviewed-by: jlaskey

@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.  Oracle designates this

@@ -23,271 +23,502 @@
  * questions.
  */
 
 package com.sun.tools.javac.parser;
 
-import java.nio.CharBuffer;
 import java.util.Arrays;
 
-import com.sun.tools.javac.file.JavacFileManager;
 import com.sun.tools.javac.resources.CompilerProperties.Errors;
-import com.sun.tools.javac.util.ArrayUtils;
 import com.sun.tools.javac.util.Log;
-import com.sun.tools.javac.util.Name;
-import com.sun.tools.javac.util.Names;
 
-import static com.sun.tools.javac.util.LayoutCharacters.*;
+import static com.sun.tools.javac.util.LayoutCharacters.EOI;
+import static com.sun.tools.javac.util.LayoutCharacters.tabulate;
 
-/** The char reader used by the javac lexer/tokenizer. Returns the sequence of
- * characters contained in the input stream, handling unicode escape accordingly.
- * Additionally, it provides features for saving chars into a buffer and to retrieve
- * them at a later stage.
+/**
+ * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters
+ * one by one as contained in the input stream, handling unicode escape sequences accordingly.
  *
  *  <p><b>This is NOT part of any supported API.
  *  If you write code that depends on this, you do so at your own risk.
  *  This code and its internal interfaces are subject to change or
- *  deletion without notice.</b>
+ *  deletion without notice.</b></p>
  */
 public class UnicodeReader {
+    /**
+     * Buffer containing characters from source file. May contain extraneous characters
+     * beyond this.length.
+     */
+    private final char[] buffer;
 
-    /** The input buffer, index of next character to be read,
-     *  index of one past last character in buffer.
+    /**
+     * Length of meaningful content in buffer.
      */
-    protected char[] buf;
-    protected int bp;
-    protected final int buflen;
+    private final int length;
 
-    /** The current character.
+    /**
+     * Character buffer index of character currently being observed.
      */
-    protected char ch;
+    private int position;
 
-    /** The buffer index of the last converted unicode character
+    /**
+     * Number of characters combined to provide character currently being observed. Typically
+     * one, but may be more when combinations of surrogate pairs and unicode escape sequences
+     * are read.
+     */
+    private int width;
+
+    /**
+     * Character currently being observed. If a surrogate pair is read then will be the high
+     * member of the pair.
+     */
+    private char character;
+
+    /**
+     * Codepoint of character currently being observed. Typically equivalent to the character
+     * but will have a value greater than 0xFFFF when a surrogate pair.
      */
-    protected int unicodeConversionBp = -1;
+    private int codepoint;
 
-    protected Log log;
-    protected Names names;
+    /**
+     * true if the last character was a backslash. This is used to handle the special case
+     * when a backslash precedes a unicode escape sequence. In that case, the second backslash
+     * is treated as a backslash and not part of a unicode escape sequence.
+     */
+    private boolean wasBackslash;
 
-    /** A character buffer for saved chars.
+    /**
+     * Log for error reporting.
      */
-    protected char[] sbuf = new char[128];
-    protected int realLength;
-    protected int sp;
+    private final Log log;
 
     /**
-     * Create a scanner from the input array.  This method might
-     * modify the array.  To avoid copying the input array, ensure
-     * that {@code inputLength < input.length} or
-     * {@code input[input.length -1]} is a white space character.
+     * Constructor.
      *
-     * @param sf the factory which created this Scanner
-     * @param buffer the input, might be modified
-     * Must be positive and less than or equal to input.length.
+     * @param sf      scan factory.
+     * @param array   array containing contents of source.
+     * @param length  length of meaningful content in buffer.
+     */
+    protected UnicodeReader(ScannerFactory sf, char[] array, int length) {
+        this.buffer = array;
+        this.length = length;
+        this.position = 0;
+        this.width = 0;
+        this.character = '\0';
+        this.codepoint = 0;
+        this.wasBackslash = false;
+        this.log = sf.log;
+
+        nextCodePoint();
+    }
+
+    /**
+     * Returns the length of the buffer. This is length of meaningful content in buffer and
+     * not the length of the buffer array.
+     *
+     * @return length of the buffer.
      */
-    protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
-        this(sf, JavacFileManager.toArray(buffer), buffer.limit());
+    protected int length() {
+        return length;
     }
 
-    protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
-        log = sf.log;
-        names = sf.names;
-        realLength = inputLength;
-        if (inputLength == input.length) {
-            if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
-                inputLength--;
-            } else {
-                input = Arrays.copyOf(input, inputLength + 1);
+    /**
+     * Return true if current position is past the end of the meaningful part of the buffer.
+     *
+     * @return true if current position is past the end of the meaningful part of the buffer.
+     */
+    protected boolean isEOF() {
+        return position >= length;
             }
+
+    /**
+     * Fetches the next 16-bit character from the buffer and places it in this.character.
+     */
+    private void nextCharacter() {
+        // Index of next character in buffer.
+        int index = position + width;
+
+        // If past end of buffer.
+        if (length <= index) {
+            // End of file is marked with EOI.
+            character = EOI;
+        } else {
+            // Next character in buffer.
+            character = buffer[index];
+            // Increment length of codepoint.
+            width++;
         }
-        buf = input;
-        buflen = inputLength;
-        buf[buflen] = EOI;
-        bp = -1;
-        scanChar();
     }
 
-    /** Read next character.
+    /**
+     * Fetches the next 16-bit character from the buffer. If a unicode escape sequence
+     * is detected then converts the unicode escape sequence to a character.
      */
-    protected void scanChar() {
-        if (bp < buflen) {
-            ch = buf[++bp];
-            if (ch == '\\') {
-                convertUnicode();
+    private void nextUnicode() {
+        // Position to next codepoint.
+        position += width;
+        // Codepoint has no characters yet.
+        width = 0;
+
+        // Fetch next character.
+        nextCharacter();
+
+        // If second backslash is detected.
+        if (wasBackslash) {
+            // Treat like a normal character (not part of unicode escape sequence.)
+            wasBackslash = false;
+        } else if (character == '\\') {
+            // May be a unicode escape sequence.
+            wasBackslash = !unicodeEscape();
             }
+
+        // Codepoint and character match if not surrogate.
+        codepoint = (int)character;
         }
+
+    /**
+     * Fetches the next code point from the buffer. If a unicode escape sequence is recognized
+     * then converts unicode escape sequence to a character. If two characters are a surrogate pair
+     * then converts to a codepoint.
+     */
+    private void nextCodePoint() {
+        // Next unicode character.
+        nextUnicode();
+
+        // Return early if ASCII or not a surrogate pair.
+        if (isASCII() || !Character.isHighSurrogate(character)) {
+            return;
     }
 
-    /** Read next character in comment, skipping over double '\' characters.
-     */
-    protected void scanCommentChar() {
-        scanChar();
-        if (ch == '\\') {
-            if (peekChar() == '\\' && !isUnicode()) {
-                skipChar();
+        // Capture high surrogate and position.
+        char hi = character;
+        int savePosition = position;
+        int saveWidth = width;
+
+        // Get potential low surrogate.
+        nextUnicode();
+        char lo = character;
+
+        if (Character.isLowSurrogate(lo)) {
+            // Start codepoint at start of high surrogate.
+            position = savePosition;
+            width += saveWidth;
+            // Compute codepoint.
+            codepoint = Character.toCodePoint(hi, lo);
             } else {
-                convertUnicode();
-            }
+            // Restore to treat high surrogate as just a character.
+            position = savePosition;
+            width = saveWidth;
+            character = hi;
+            codepoint = (int)hi;
+            // Could potentially report an error here (old code did not.)
         }
     }
 
-    /** Append a character to sbuf.
+    /**
+     * Converts a unicode escape sequence into a character.
+     *
+     * @return true if was a valid escape sequence.
      */
-    protected void putChar(char ch, boolean scan) {
-        sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
-        sbuf[sp++] = ch;
-        if (scan)
-            scanChar();
+    private boolean unicodeEscape() {
+        // Start of unicode escape sequence (past backslash.)
+        int start = position + width;
+        int index;
+
+        // Skip multiple 'u'.
+        for (index = start; index < length; index++) {
+            if (buffer[index] != 'u') {
+                break;
+            }
     }
 
-    protected void putChar(char ch) {
-        putChar(ch, false);
+        // Needs to be at least backslash-u.
+        if (index != start) {
+            // If enough characters available.
+            if (index + 4 < length) {
+                // Convert four hex digits to codepoint. If any digit is invalid then the
+                // result is negative.
+                int code = (Character.digit(buffer[index++], 16) << 12) |
+                           (Character.digit(buffer[index++], 16) << 8) |
+                           (Character.digit(buffer[index++], 16) << 4) |
+                            Character.digit(buffer[index++], 16);
+
+                // If all digits are good.
+                if (code >= 0) {
+                    width = index - position;
+                    character = (char)code;
+
+                    return true;
+                }
     }
 
-    protected void putChar(boolean scan) {
-        putChar(ch, scan);
+            // Did not work out.
+            log.error(position, Errors.IllegalUnicodeEsc);
+            width = index - position;
+
+            return true;
     }
 
-    protected void nextChar(boolean skip) {
-        if (!skip) {
-            sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
-            sbuf[sp++] = ch;
+        // Must be just a backslash.
+        character = '\\';
+        width = 1;
+
+        return false;
         }
 
-        scanChar();
+    /**
+     * Return the current position in the character buffer.
+     *
+     * @return  current position in the character buffer.
+     */
+    protected int position() {
+        return position;
     }
 
-    Name name() {
-        return names.fromChars(sbuf, 0, sp);
+
+    /**
+     * Reset the reader to the specified position.
+     * Warning: Do not use when previous character was an ASCII or unicode backslash.
+     * @param pos  character buffer position to resume reading from.
+     */
+    protected void reset(int pos) {
+        position = pos;
+        width = 0;
+        wasBackslash = false;
+        nextCodePoint();
     }
 
-    String chars() {
-        return new String(sbuf, 0, sp);
+    /**
+     * Return the current character at the current position.
+     *
+     * @return current character at the current position.
+     */
+    protected char get() {
+        return character;
     }
 
-    /** Add 'count' copies of the character 'ch' to the string buffer.
+    /**
+     * Return the current codepoint at the current position.
+     *
+     * @return current codepoint at the current position.
      */
-    protected void repeat(char ch, int count) {
-        for ( ; 0 < count; count--) {
-            putChar(ch, false);
+    protected int getCodepoint() {
+        return codepoint;
         }
+
+    /**
+     * Returns true if the current codepoint is a surrogate.
+     *
+     * @return true if the current codepoint is a surrogate.
+     */
+    protected boolean isSurrogate() {
+        return 0xFFFF < codepoint;
     }
 
-    /** Reset the scan buffer pointer to 'pos'.
+    /**
+     * Returns true if the current character is ASCII.
+     *
+     * @return true if the current character is ASCII.
      */
-    protected void reset(int pos) {
-        bp = pos - 1;
-        scanChar();
+    protected boolean isASCII() {
+        return character <= 0x7F;
     }
 
-    /** Convert unicode escape; bp points to initial '\' character
-     *  (Spec 3.3).
+    /**
+     * Advances the current character to the next character.
+     *
+     * @return next character.
      */
-    protected void convertUnicode() {
-        if (ch == '\\' && unicodeConversionBp != bp ) {
-            bp++; ch = buf[bp];
-            if (ch == 'u') {
-                do {
-                    bp++; ch = buf[bp];
-                } while (ch == 'u');
-                int limit = bp + 3;
-                if (limit < buflen) {
-                    int d = digit(bp, 16);
-                    int code = d;
-                    while (bp < limit && d >= 0) {
-                        bp++; ch = buf[bp];
-                        d = digit(bp, 16);
-                        code = (code << 4) + d;
-                    }
-                    if (d >= 0) {
-                        ch = (char)code;
-                        unicodeConversionBp = bp;
-                        return;
+    protected char next() {
+        nextCodePoint();
+
+        return character;
                     }
+
+    /**
+     * Compare character. Returns true if a match.
+     *
+     * @param ch  character to match.
+     *
+     * @return true if a match.
+     */
+    protected boolean is(char ch) {
+        return character == ch;
                 }
-                log.error(bp, Errors.IllegalUnicodeEsc);
-            } else {
-                bp--;
-                ch = '\\';
+
+    /**
+     * Match one of the arguments. Returns true if a match.
+     */
+    protected boolean isOneOf(char ch1, char ch2) {
+        return is(ch1) || is(ch2);
             }
+    protected boolean isOneOf(char ch1, char ch2, char ch3) {
+        return is(ch1) || is(ch2) || is(ch3);
         }
+    protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
+        return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
     }
 
-    /** Are surrogates supported?
+    /**
+     * Tests to see if current character is in the range of lo to hi characters (inclusive).
+     *
+     * @param lo  lowest character in range.
+     * @param hi  highest character in range.
+     *
+     * @return true if the current character is in range.
+     */
+    protected boolean inRange(char lo, char hi) {
+        return lo <= character && character <= hi;
+    }
+
+    /**
+     * Compare character and advance if a match. Returns true if a match.
+     *
+     * @param ch  character to match.
+     *
+     * @return true if a match.
      */
-    final static boolean surrogatesSupported = surrogatesSupported();
-    private static boolean surrogatesSupported() {
-        try {
-            Character.isHighSurrogate('a');
+    protected boolean accept(char ch) {
+        if (is(ch)) {
+            next();
+
             return true;
-        } catch (NoSuchMethodError ex) {
-            return false;
         }
+
+        return false;
     }
 
-    /** Scan surrogate pairs.  If 'ch' is a high surrogate and
-     *  the next character is a low surrogate, returns the code point
-     *  constructed from these surrogates. Otherwise, returns -1.
-     *  This method will not consume any of the characters.
+    /**
+     * Match one of the arguments and advance if a match. Returns true if a match.
      */
-    protected int peekSurrogates() {
-        if (surrogatesSupported && Character.isHighSurrogate(ch)) {
-            char high = ch;
-            int prevBP = bp;
+    protected boolean acceptOneOf(char ch1, char ch2) {
+        if (isOneOf(ch1, ch2)) {
+            next();
 
-            scanChar();
+            return true;
+        }
 
-            char low = ch;
+        return false;
+    }
 
-            ch = high;
-            bp = prevBP;
+    protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
+        if (isOneOf(ch1, ch2, ch3)) {
+            next();
 
-            if (Character.isLowSurrogate(low)) {
-                return Character.toCodePoint(high, low);
+            return true;
             }
+
+        return false;
         }
 
-        return -1;
+    /**
+     * Skip over all occurrences of character.
+     *
+     * @param ch character to accept.
+     */
+    protected void skip(char ch) {
+        while (accept(ch)) {
+            // next
+        }
     }
 
-    /** Convert an ASCII digit from its base (8, 10, or 16)
-     *  to its value.
+    /**
+     * Skip over ASCII white space characters.
      */
-    protected int digit(int pos, int base) {
-        char c = ch;
-        if ('0' <= c && c <= '9')
-            return Character.digit(c, base); //a fast common case
-        int codePoint = peekSurrogates();
-        int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
-        if (result >= 0 && c > 0x7f) {
-            log.error(pos + 1, Errors.IllegalNonasciiDigit);
-            if (codePoint >= 0)
-                scanChar();
-            ch = "0123456789abcdef".charAt(result);
+    protected void skipWhitespace() {
+        while (acceptOneOf(' ', '\t', '\f')) {
+            // next
         }
-        return result;
     }
 
-    protected boolean isUnicode() {
-        return unicodeConversionBp == bp;
+    /**
+     * Skip to end of line.
+     */
+    protected void skipToEOLN() {
+        while (!isEOF()) {
+            if (isOneOf('\r', '\n')) {
+                break;
     }
 
-    protected void skipChar() {
-        bp++;
+            next();
     }
 
-    protected char peekChar() {
-        return buf[bp + 1];
     }
 
     /**
-     * Returns a copy of the input buffer, up to its inputLength.
-     * Unicode escape sequences are not translated.
+     * Compare string and advance if a match. Returns true if a match.
+     * Warning: Do not use when previous character was a backslash
+     * (confuses state of wasBackslash.)
+     *
+     * @param string string to match character for character.
+     *
+     * @return true if a match.
+     */
+    protected boolean accept(String string) {
+        // Quick test.
+        if (string.length() == 0 || !is(string.charAt(0))) {
+            return false;
+        }
+
+        // Be prepared to retreat if not a match.
+        int savedPosition = position;
+
+        nextCodePoint();
+
+        // Check each character.
+        for (int i = 1; i < string.length(); i++) {
+            if (!is(string.charAt(i))) {
+                // Restart if not a match.
+                reset(savedPosition);
+
+                return false;
+            }
+
+            nextCodePoint();
+        }
+
+        return true;
+    }
+
+    /**
+     * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not
+     * advance character.
+     *
+     * @param pos         starting position.
+     * @param digitRadix  base of number being converted.
+     *
+     * @return value of digit.
+     */
+    protected int digit(int pos, int digitRadix) {
+        int result;
+
+        // Just an ASCII digit.
+        if (inRange('0', '9')) {
+            // Fast common case.
+            result = character - '0';
+
+            return result < digitRadix ? result : -1;
+        }
+
+        // Handle other digits.
+        result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
+                                 Character.digit(character, digitRadix);
+
+        if (result >= 0 && !isASCII()) {
+            log.error(position(), Errors.IllegalNonasciiDigit);
+            character = "0123456789abcdef".charAt(result);
+        }
+
+        return result;
+    }
+
+    /**
+     * Returns the input buffer. Unicode escape sequences are not translated.
+     *
+     * @return the input buffer.
      */
     public char[] getRawCharacters() {
-        char[] chars = new char[buflen];
-        System.arraycopy(buf, 0, chars, 0, buflen);
-        return chars;
+        return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
     }
 
     /**
      * Returns a copy of a character array subset of the input buffer.
      * The returned array begins at the {@code beginIndex} and

@@ -297,15 +528,83 @@
      * {@code String.substring(beginIndex, endIndex)}.
      * Unicode escape sequences are not translated.
      *
      * @param beginIndex the beginning index, inclusive.
      * @param endIndex the ending index, exclusive.
+     *
      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
      *         array bounds
      */
     public char[] getRawCharacters(int beginIndex, int endIndex) {
-        int length = endIndex - beginIndex;
-        char[] chars = new char[length];
-        System.arraycopy(buf, beginIndex, chars, 0, length);
-        return chars;
+        return Arrays.copyOfRange(buffer, beginIndex, endIndex);
     }
+
+    /**
+     * This is a specialized version of UnicodeReader that keeps track of the
+     * column position within a given character stream. Used for Javadoc
+     * processing to build a table for mapping positions in the comment string
+     * to positions in the source file.
+     */
+    static class PositionTrackingReader extends UnicodeReader {
+        /**
+         * Offset from the beginning of the original reader buffer.
+         */
+        private int offset;
+
+        /**
+         * Current column in the comment.
+         */
+        private int column;
+
+        /**
+         * Constructor.
+         *
+         * @param sf      Scan factory.
+         * @param array   Array containing contents of source.
+         * @param offset  Position offset in original source buffer.
+         */
+        protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) {
+            super(sf, array, array.length);
+            this.offset = offset;
+            this.column = 0;
+        }
+
+        /**
+         * Advances the current character to the next character. Tracks column.
+         *
+         * @return next character.
+         */
+        @Override
+        protected char next() {
+            super.next();
+
+            if (isOneOf('\n', '\r', '\f')) {
+                column = 0;
+            } else if (is('\t')) {
+                column = tabulate(column);
+            } else {
+                column++;
+            }
+
+            return get();
+        }
+
+        /**
+         * Returns the current column.
+         *
+         * @return  the current column.
+         */
+        protected int column() {
+            return column;
+        }
+
+        /**
+         * Returns position relative to the original source buffer.
+         *
+         * @return position relative to the original source buffer.
+         */
+        protected int offsetPosition() {
+            return position() + offset;
+        }
+    }
+
 }

< prev index next >