< prev index next >
src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
Print this page
rev 60227 : 8224225: Tokenizer improvements
Reviewed-by: jlaskey
@@ -1,7 +1,7 @@
/*
- * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
@@ -23,271 +23,502 @@
* questions.
*/
package com.sun.tools.javac.parser;
-import java.nio.CharBuffer;
import java.util.Arrays;
-import com.sun.tools.javac.file.JavacFileManager;
import com.sun.tools.javac.resources.CompilerProperties.Errors;
-import com.sun.tools.javac.util.ArrayUtils;
import com.sun.tools.javac.util.Log;
-import com.sun.tools.javac.util.Name;
-import com.sun.tools.javac.util.Names;
-import static com.sun.tools.javac.util.LayoutCharacters.*;
+import static com.sun.tools.javac.util.LayoutCharacters.EOI;
+import static com.sun.tools.javac.util.LayoutCharacters.tabulate;
-/** The char reader used by the javac lexer/tokenizer. Returns the sequence of
- * characters contained in the input stream, handling unicode escape accordingly.
- * Additionally, it provides features for saving chars into a buffer and to retrieve
- * them at a later stage.
+/**
+ * The unicode character reader used by the javac/javadoc lexer/tokenizer. Returns characters
+ * one by one as contained in the input stream, handling unicode escape sequences accordingly.
*
* <p><b>This is NOT part of any supported API.
* If you write code that depends on this, you do so at your own risk.
* This code and its internal interfaces are subject to change or
- * deletion without notice.</b>
+ * deletion without notice.</b></p>
*/
public class UnicodeReader {
+ /**
+ * Buffer containing characters from source file. May contain extraneous characters
+ * beyond this.length.
+ */
+ private final char[] buffer;
- /** The input buffer, index of next character to be read,
- * index of one past last character in buffer.
+ /**
+ * Length of meaningful content in buffer.
*/
- protected char[] buf;
- protected int bp;
- protected final int buflen;
+ private final int length;
- /** The current character.
+ /**
+ * Character buffer index of character currently being observed.
*/
- protected char ch;
+ private int position;
- /** The buffer index of the last converted unicode character
+ /**
+ * Number of characters combined to provide character currently being observed. Typically
+ * one, but may be more when combinations of surrogate pairs and unicode escape sequences
+ * are read.
+ */
+ private int width;
+
+ /**
+ * Character currently being observed. If a surrogate pair is read then will be the high
+ * member of the pair.
+ */
+ private char character;
+
+ /**
+ * Codepoint of character currently being observed. Typically equivalent to the character
+ * but will have a value greater than 0xFFFF when a surrogate pair.
*/
- protected int unicodeConversionBp = -1;
+ private int codepoint;
- protected Log log;
- protected Names names;
+ /**
+ * true if the last character was a backslash. This is used to handle the special case
+ * when a backslash precedes a unicode escape sequence. In that case, the second backslash
+ * is treated as a backslash and not part of a unicode escape sequence.
+ */
+ private boolean wasBackslash;
- /** A character buffer for saved chars.
+ /**
+ * Log for error reporting.
*/
- protected char[] sbuf = new char[128];
- protected int realLength;
- protected int sp;
+ private final Log log;
/**
- * Create a scanner from the input array. This method might
- * modify the array. To avoid copying the input array, ensure
- * that {@code inputLength < input.length} or
- * {@code input[input.length -1]} is a white space character.
+ * Constructor.
*
- * @param sf the factory which created this Scanner
- * @param buffer the input, might be modified
- * Must be positive and less than or equal to input.length.
+ * @param sf scan factory.
+ * @param array array containing contents of source.
+ * @param length length of meaningful content in buffer.
+ */
+ protected UnicodeReader(ScannerFactory sf, char[] array, int length) {
+ this.buffer = array;
+ this.length = length;
+ this.position = 0;
+ this.width = 0;
+ this.character = '\0';
+ this.codepoint = 0;
+ this.wasBackslash = false;
+ this.log = sf.log;
+
+ nextCodePoint();
+ }
+
+ /**
+ * Returns the length of the buffer. This is length of meaningful content in buffer and
+ * not the length of the buffer array.
+ *
+ * @return length of the buffer.
*/
- protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
- this(sf, JavacFileManager.toArray(buffer), buffer.limit());
+ protected int length() {
+ return length;
}
- protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
- log = sf.log;
- names = sf.names;
- realLength = inputLength;
- if (inputLength == input.length) {
- if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
- inputLength--;
- } else {
- input = Arrays.copyOf(input, inputLength + 1);
+ /**
+ * Return true if current position is past the end of the meaningful part of the buffer.
+ *
+ * @return true if current position is past the end of the meaningful part of the buffer.
+ */
+ protected boolean isEOF() {
+ return position >= length;
}
+
+ /**
+ * Fetches the next 16-bit character from the buffer and places it in this.character.
+ */
+ private void nextCharacter() {
+ // Index of next character in buffer.
+ int index = position + width;
+
+ // If past end of buffer.
+ if (length <= index) {
+ // End of file is marked with EOI.
+ character = EOI;
+ } else {
+ // Next character in buffer.
+ character = buffer[index];
+ // Increment length of codepoint.
+ width++;
}
- buf = input;
- buflen = inputLength;
- buf[buflen] = EOI;
- bp = -1;
- scanChar();
}
- /** Read next character.
+ /**
+ * Fetches the next 16-bit character from the buffer. If a unicode escape sequence
+ * is detected then converts the unicode escape sequence to a character.
*/
- protected void scanChar() {
- if (bp < buflen) {
- ch = buf[++bp];
- if (ch == '\\') {
- convertUnicode();
+ private void nextUnicode() {
+ // Position to next codepoint.
+ position += width;
+ // Codepoint has no characters yet.
+ width = 0;
+
+ // Fetch next character.
+ nextCharacter();
+
+ // If second backslash is detected.
+ if (wasBackslash) {
+ // Treat like a normal character (not part of unicode escape sequence.)
+ wasBackslash = false;
+ } else if (character == '\\') {
+ // May be a unicode escape sequence.
+ wasBackslash = !unicodeEscape();
}
+
+ // Codepoint and character match if not surrogate.
+ codepoint = (int)character;
}
+
+ /**
+ * Fetches the next code point from the buffer. If a unicode escape sequence is recognized
+ * then converts the unicode escape sequence to a character. If two characters are a surrogate pair
+ * then converts them to a codepoint.
+ */
+ private void nextCodePoint() {
+ // Next unicode character.
+ nextUnicode();
+
+ // Return early if ASCII or not a surrogate pair.
+ if (isASCII() || !Character.isHighSurrogate(character)) {
+ return;
}
- /** Read next character in comment, skipping over double '\' characters.
- */
- protected void scanCommentChar() {
- scanChar();
- if (ch == '\\') {
- if (peekChar() == '\\' && !isUnicode()) {
- skipChar();
+ // Capture high surrogate and position.
+ char hi = character;
+ int savePosition = position;
+ int saveWidth = width;
+
+ // Get potential low surrogate.
+ nextUnicode();
+ char lo = character;
+
+ if (Character.isLowSurrogate(lo)) {
+ // Start codepoint at start of high surrogate.
+ position = savePosition;
+ width += saveWidth;
+ // Compute codepoint.
+ codepoint = Character.toCodePoint(hi, lo);
} else {
- convertUnicode();
- }
+ // Restore to treat high surrogate as just a character.
+ position = savePosition;
+ width = saveWidth;
+ character = hi;
+ codepoint = (int)hi;
+ // Could potentially report an error here (old code did not.)
}
}
- /** Append a character to sbuf.
+ /**
+ * Converts a unicode escape sequence into a character. On entry the current
+ * character is the introducing backslash. On a successful conversion, width covers
+ * the backslash, every 'u' and the four hex digits, and character holds the decoded
+ * value. A malformed escape (too few characters left, or a non-hex digit) is
+ * reported as an error and still consumed.
+ *
+ * @return true if the backslash was followed by at least one 'u' (whether or not
+ * the escape was well formed), false if it is just a backslash.
*/
- protected void putChar(char ch, boolean scan) {
- sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
- sbuf[sp++] = ch;
- if (scan)
- scanChar();
+ private boolean unicodeEscape() {
+ // Start of unicode escape sequence (past backslash.)
+ int start = position + width;
+ int index;
+
+ // Skip multiple 'u'.
+ for (index = start; index < length; index++) {
+ if (buffer[index] != 'u') {
+ break;
+ }
}
- protected void putChar(char ch) {
- putChar(ch, false);
+ // Needs to be at least backslash-u.
+ if (index != start) {
+ // If enough characters available. The four digits occupy index..index+3, so
+ // the escape may end exactly at the last meaningful character (index + 4 == length).
+ if (index + 4 <= length) {
+ // Convert four hex digits to codepoint. If any digit is invalid then the
+ // result is negative.
+ int code = (Character.digit(buffer[index++], 16) << 12) |
+ (Character.digit(buffer[index++], 16) << 8) |
+ (Character.digit(buffer[index++], 16) << 4) |
+ Character.digit(buffer[index++], 16);
+
+ // If all digits are good.
+ if (code >= 0) {
+ width = index - position;
+ character = (char)code;
+
+ return true;
+ }
}
- protected void putChar(boolean scan) {
- putChar(ch, scan);
+ // Did not work out. Report the error at the backslash and consume everything
+ // scanned so far so that the malformed escape is not read again.
+ log.error(position, Errors.IllegalUnicodeEsc);
+ width = index - position;
+
+ return true;
}
- protected void nextChar(boolean skip) {
- if (!skip) {
- sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
- sbuf[sp++] = ch;
+ // Must be just a backslash.
+ character = '\\';
+ width = 1;
+
+ return false;
}
- scanChar();
+ /**
+ * Return the current position in the character buffer.
+ *
+ * @return current position in the character buffer.
+ */
+ protected int position() {
+ return position;
}
- Name name() {
- return names.fromChars(sbuf, 0, sp);
+
+ /**
+ * Reset the reader to the specified position.
+ * Warning: Do not use when previous character was an ASCII or unicode backslash.
+ * @param pos position in the character buffer to resume reading from.
+ */
+ protected void reset(int pos) {
+ position = pos;
+ width = 0;
+ wasBackslash = false;
+ nextCodePoint();
}
- String chars() {
- return new String(sbuf, 0, sp);
+ /**
+ * Return the character at the current position.
+ *
+ * @return character at the current position.
+ */
+ protected char get() {
+ return character;
}
- /** Add 'count' copies of the character 'ch' to the string buffer.
+ /**
+ * Return the codepoint at the current position.
+ *
+ * @return codepoint at the current position.
*/
- protected void repeat(char ch, int count) {
- for ( ; 0 < count; count--) {
- putChar(ch, false);
+ protected int getCodepoint() {
+ return codepoint;
}
+
+ /**
+ * Returns true if the current codepoint is greater than 0xFFFF, i.e. it was
+ * combined from a surrogate pair.
+ *
+ * @return true if the current codepoint was combined from a surrogate pair.
+ */
+ protected boolean isSurrogate() {
+ return 0xFFFF < codepoint;
}
- /** Reset the scan buffer pointer to 'pos'.
+ /**
+ * Returns true if the current character is ASCII.
+ *
+ * @return true if the current character is ASCII.
*/
- protected void reset(int pos) {
- bp = pos - 1;
- scanChar();
+ protected boolean isASCII() {
+ return character <= 0x7F;
}
- /** Convert unicode escape; bp points to initial '\' character
- * (Spec 3.3).
+ /**
+ * Advances the current character to the next character.
+ *
+ * @return next character.
*/
- protected void convertUnicode() {
- if (ch == '\\' && unicodeConversionBp != bp ) {
- bp++; ch = buf[bp];
- if (ch == 'u') {
- do {
- bp++; ch = buf[bp];
- } while (ch == 'u');
- int limit = bp + 3;
- if (limit < buflen) {
- int d = digit(bp, 16);
- int code = d;
- while (bp < limit && d >= 0) {
- bp++; ch = buf[bp];
- d = digit(bp, 16);
- code = (code << 4) + d;
- }
- if (d >= 0) {
- ch = (char)code;
- unicodeConversionBp = bp;
- return;
+ protected char next() {
+ nextCodePoint();
+
+ return character;
}
+
+ /**
+ * Compare character. Returns true if a match.
+ *
+ * @param ch character to match.
+ *
+ * @return true if a match.
+ */
+ protected boolean is(char ch) {
+ return character == ch;
}
- log.error(bp, Errors.IllegalUnicodeEsc);
- } else {
- bp--;
- ch = '\\';
+
+ /**
+ * Match one of the arguments. Returns true if a match.
+ */
+ protected boolean isOneOf(char ch1, char ch2) {
+ return is(ch1) || is(ch2);
}
+ protected boolean isOneOf(char ch1, char ch2, char ch3) {
+ return is(ch1) || is(ch2) || is(ch3);
}
+ protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
+ return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
}
- /** Are surrogates supported?
+ /**
+ * Tests to see if current character is in the range of lo to hi characters (inclusive).
+ *
+ * @param lo lowest character in range.
+ * @param hi highest character in range.
+ *
+ * @return true if the current character is in range.
+ */
+ protected boolean inRange(char lo, char hi) {
+ return lo <= character && character <= hi;
+ }
+
+ /**
+ * Compare character and advance if a match. Returns true if a match.
+ *
+ * @param ch character to match.
+ *
+ * @return true if a match.
*/
- final static boolean surrogatesSupported = surrogatesSupported();
- private static boolean surrogatesSupported() {
- try {
- Character.isHighSurrogate('a');
+ protected boolean accept(char ch) {
+ if (is(ch)) {
+ next();
+
return true;
- } catch (NoSuchMethodError ex) {
- return false;
}
+
+ return false;
}
- /** Scan surrogate pairs. If 'ch' is a high surrogate and
- * the next character is a low surrogate, returns the code point
- * constructed from these surrogates. Otherwise, returns -1.
- * This method will not consume any of the characters.
+ /**
+ * Match one of the arguments and advance if a match. Returns true if a match.
*/
- protected int peekSurrogates() {
- if (surrogatesSupported && Character.isHighSurrogate(ch)) {
- char high = ch;
- int prevBP = bp;
+ protected boolean acceptOneOf(char ch1, char ch2) {
+ if (isOneOf(ch1, ch2)) {
+ next();
- scanChar();
+ return true;
+ }
- char low = ch;
+ return false;
+ }
- ch = high;
- bp = prevBP;
+ protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
+ if (isOneOf(ch1, ch2, ch3)) {
+ next();
- if (Character.isLowSurrogate(low)) {
- return Character.toCodePoint(high, low);
+ return true;
}
+
+ return false;
}
- return -1;
+ /**
+ * Skip over all occurrences of character.
+ *
+ * @param ch character to accept.
+ */
+ protected void skip(char ch) {
+ while (accept(ch)) {
+ // next
+ }
}
- /** Convert an ASCII digit from its base (8, 10, or 16)
- * to its value.
+ /**
+ * Skip over ASCII white space characters.
*/
- protected int digit(int pos, int base) {
- char c = ch;
- if ('0' <= c && c <= '9')
- return Character.digit(c, base); //a fast common case
- int codePoint = peekSurrogates();
- int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
- if (result >= 0 && c > 0x7f) {
- log.error(pos + 1, Errors.IllegalNonasciiDigit);
- if (codePoint >= 0)
- scanChar();
- ch = "0123456789abcdef".charAt(result);
+ protected void skipWhitespace() {
+ while (acceptOneOf(' ', '\t', '\f')) {
+ // next
}
- return result;
}
- protected boolean isUnicode() {
- return unicodeConversionBp == bp;
+ /**
+ * Skip to end of line.
+ */
+ protected void skipToEOLN() {
+ while (!isEOF()) {
+ if (isOneOf('\r', '\n')) {
+ break;
}
- protected void skipChar() {
- bp++;
+ next();
}
- protected char peekChar() {
- return buf[bp + 1];
}
/**
- * Returns a copy of the input buffer, up to its inputLength.
- * Unicode escape sequences are not translated.
+ * Compare string and advance if a match. Returns true if a match.
+ * Warning: Do not use when previous character was a backslash
+ * (confuses state of wasBackslash.)
+ *
+ * @param string string to match character for character.
+ *
+ * @return true if a match.
+ */
+ protected boolean accept(String string) {
+ // Quick test.
+ if (string.length() == 0 || !is(string.charAt(0))) {
+ return false;
+ }
+
+ // Be prepared to retreat if not a match.
+ int savedPosition = position;
+
+ nextCodePoint();
+
+ // Check each character.
+ for (int i = 1; i < string.length(); i++) {
+ if (!is(string.charAt(i))) {
+ // Restart if not a match.
+ reset(savedPosition);
+
+ return false;
+ }
+
+ nextCodePoint();
+ }
+
+ return true;
+ }
+
+ /**
+ * Convert the current character, interpreted as a digit in the given base
+ * (8, 10, or 16), to its value. Does not advance to the next character. A
+ * non-ASCII digit is reported as an error and the current character is replaced
+ * by the equivalent ASCII digit.
+ *
+ * @param pos starting position (not read by this method; errors are reported at
+ * the reader's current position).
+ * @param digitRadix base of number being converted.
+ *
+ * @return value of digit, or -1 if the current character is not a digit in the
+ * given base.
+ */
+ protected int digit(int pos, int digitRadix) {
+ int result;
+
+ // Just an ASCII digit.
+ if (inRange('0', '9')) {
+ // Fast common case.
+ result = character - '0';
+
+ // An ASCII digit that is too large for the base (e.g. '8' in octal) is rejected.
+ return result < digitRadix ? result : -1;
+ }
+
+ // Handle other digits. Use the full codepoint when it came from a surrogate pair.
+ result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
+ Character.digit(character, digitRadix);
+
+ // A valid digit outside ASCII is an error; substitute the matching ASCII digit.
+ if (result >= 0 && !isASCII()) {
+ log.error(position(), Errors.IllegalNonasciiDigit);
+ character = "0123456789abcdef".charAt(result);
+ }
+
+ return result;
+ }
+
+ /**
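+ * Returns the input buffer. Unicode escape sequences are not translated. When the
+ * underlying array is exactly as long as its meaningful content the array itself is
+ * returned (not a copy); otherwise a copy trimmed to the meaningful length is returned.
+ *
+ * @return the input buffer.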
*/
public char[] getRawCharacters() {
- char[] chars = new char[buflen];
- System.arraycopy(buf, 0, chars, 0, buflen);
- return chars;
+ return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
}
/**
* Returns a copy of a character array subset of the input buffer.
* The returned array begins at the {@code beginIndex} and
@@ -297,15 +528,83 @@
* {@code String.substring(beginIndex, endIndex)}.
* Unicode escape sequences are not translated.
*
* @param beginIndex the beginning index, inclusive.
* @param endIndex the ending index, exclusive.
+ *
* @throws ArrayIndexOutOfBoundsException if either offset is outside of the
* array bounds
*/
public char[] getRawCharacters(int beginIndex, int endIndex) {
- int length = endIndex - beginIndex;
- char[] chars = new char[length];
- System.arraycopy(buf, beginIndex, chars, 0, length);
- return chars;
+ return Arrays.copyOfRange(buffer, beginIndex, endIndex);
}
+
+ /**
+ * This is a specialized version of UnicodeReader that keeps track of the
+ * column position within a given character stream. Used for Javadoc
+ * processing to build a table for mapping positions in the comment string
+ * to positions in the source file.
+ */
+ static class PositionTrackingReader extends UnicodeReader {
+ /**
+ * Offset from the beginning of the original reader buffer.
+ */
+ private int offset;
+
+ /**
+ * Current column in the comment.
+ */
+ private int column;
+
+ /**
+ * Constructor.
+ *
+ * @param sf Scan factory.
+ * @param array Array containing contents of source.
+ * @param offset Position offset in original source buffer.
+ */
+ protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) {
+ super(sf, array, array.length);
+ this.offset = offset;
+ this.column = 0;
+ }
+
+ /**
+ * Advances the current character to the next character. Tracks column.
+ *
+ * @return next character.
+ */
+ @Override
+ protected char next() {
+ super.next();
+
+ if (isOneOf('\n', '\r', '\f')) {
+ column = 0;
+ } else if (is('\t')) {
+ column = tabulate(column);
+ } else {
+ column++;
+ }
+
+ return get();
+ }
+
+ /**
+ * Returns the current column.
+ *
+ * @return the current column.
+ */
+ protected int column() {
+ return column;
+ }
+
+ /**
+ * Returns position relative to the original source buffer.
+ *
+ * @return position relative to the original source buffer.
+ */
+ protected int offsetPosition() {
+ return position() + offset;
+ }
+ }
+
}
< prev index next >