< prev index next >
src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
Print this page
rev 60227 : 8224225: Tokenizer improvements
Reviewed-by: jlaskey
*** 1,7 ****
/*
! * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
--- 1,7 ----
/*
! * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
*** 23,293 ****
* questions.
*/
package com.sun.tools.javac.parser;
- import java.nio.CharBuffer;
import java.util.Arrays;
- import com.sun.tools.javac.file.JavacFileManager;
import com.sun.tools.javac.resources.CompilerProperties.Errors;
- import com.sun.tools.javac.util.ArrayUtils;
import com.sun.tools.javac.util.Log;
- import com.sun.tools.javac.util.Name;
- import com.sun.tools.javac.util.Names;
! import static com.sun.tools.javac.util.LayoutCharacters.*;
! /** The char reader used by the javac lexer/tokenizer. Returns the sequence of
! * characters contained in the input stream, handling unicode escape accordingly.
! * Additionally, it provides features for saving chars into a buffer and to retrieve
! * them at a later stage.
*
* <p><b>This is NOT part of any supported API.
* If you write code that depends on this, you do so at your own risk.
* This code and its internal interfaces are subject to change or
! * deletion without notice.</b>
*/
public class UnicodeReader {
! /** The input buffer, index of next character to be read,
! * index of one past last character in buffer.
*/
! protected char[] buf;
! protected int bp;
! protected final int buflen;
! /** The current character.
*/
! protected char ch;
! /** The buffer index of the last converted unicode character
*/
! protected int unicodeConversionBp = -1;
! protected Log log;
! protected Names names;
! /** A character buffer for saved chars.
*/
! protected char[] sbuf = new char[128];
! protected int realLength;
! protected int sp;
/**
! * Create a scanner from the input array. This method might
! * modify the array. To avoid copying the input array, ensure
! * that {@code inputLength < input.length} or
! * {@code input[input.length -1]} is a white space character.
*
! * @param sf the factory which created this Scanner
! * @param buffer the input, might be modified
! * Must be positive and less than or equal to input.length.
*/
! protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
! this(sf, JavacFileManager.toArray(buffer), buffer.limit());
}
! protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
! log = sf.log;
! names = sf.names;
! realLength = inputLength;
! if (inputLength == input.length) {
! if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
! inputLength--;
! } else {
! input = Arrays.copyOf(input, inputLength + 1);
}
}
- buf = input;
- buflen = inputLength;
- buf[buflen] = EOI;
- bp = -1;
- scanChar();
}
! /** Read next character.
*/
! protected void scanChar() {
! if (bp < buflen) {
! ch = buf[++bp];
! if (ch == '\\') {
! convertUnicode();
}
}
}
! /** Read next character in comment, skipping over double '\' characters.
! */
! protected void scanCommentChar() {
! scanChar();
! if (ch == '\\') {
! if (peekChar() == '\\' && !isUnicode()) {
! skipChar();
} else {
! convertUnicode();
! }
}
}
! /** Append a character to sbuf.
*/
! protected void putChar(char ch, boolean scan) {
! sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
! sbuf[sp++] = ch;
! if (scan)
! scanChar();
}
! protected void putChar(char ch) {
! putChar(ch, false);
}
! protected void putChar(boolean scan) {
! putChar(ch, scan);
}
! protected void nextChar(boolean skip) {
! if (!skip) {
! sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
! sbuf[sp++] = ch;
}
! scanChar();
}
! Name name() {
! return names.fromChars(sbuf, 0, sp);
}
! String chars() {
! return new String(sbuf, 0, sp);
}
! /** Add 'count' copies of the character 'ch' to the string buffer.
*/
! protected void repeat(char ch, int count) {
! for ( ; 0 < count; count--) {
! putChar(ch, false);
}
}
! /** Reset the scan buffer pointer to 'pos'.
*/
! protected void reset(int pos) {
! bp = pos - 1;
! scanChar();
}
! /** Convert unicode escape; bp points to initial '\' character
! * (Spec 3.3).
*/
! protected void convertUnicode() {
! if (ch == '\\' && unicodeConversionBp != bp ) {
! bp++; ch = buf[bp];
! if (ch == 'u') {
! do {
! bp++; ch = buf[bp];
! } while (ch == 'u');
! int limit = bp + 3;
! if (limit < buflen) {
! int d = digit(bp, 16);
! int code = d;
! while (bp < limit && d >= 0) {
! bp++; ch = buf[bp];
! d = digit(bp, 16);
! code = (code << 4) + d;
! }
! if (d >= 0) {
! ch = (char)code;
! unicodeConversionBp = bp;
! return;
}
}
! log.error(bp, Errors.IllegalUnicodeEsc);
! } else {
! bp--;
! ch = '\\';
}
}
}
! /** Are surrogates supported?
*/
! final static boolean surrogatesSupported = surrogatesSupported();
! private static boolean surrogatesSupported() {
! try {
! Character.isHighSurrogate('a');
return true;
- } catch (NoSuchMethodError ex) {
- return false;
}
}
! /** Scan surrogate pairs. If 'ch' is a high surrogate and
! * the next character is a low surrogate, returns the code point
! * constructed from these surrogates. Otherwise, returns -1.
! * This method will not consume any of the characters.
*/
! protected int peekSurrogates() {
! if (surrogatesSupported && Character.isHighSurrogate(ch)) {
! char high = ch;
! int prevBP = bp;
! scanChar();
! char low = ch;
! ch = high;
! bp = prevBP;
! if (Character.isLowSurrogate(low)) {
! return Character.toCodePoint(high, low);
}
}
! return -1;
}
! /** Convert an ASCII digit from its base (8, 10, or 16)
! * to its value.
*/
! protected int digit(int pos, int base) {
! char c = ch;
! if ('0' <= c && c <= '9')
! return Character.digit(c, base); //a fast common case
! int codePoint = peekSurrogates();
! int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
! if (result >= 0 && c > 0x7f) {
! log.error(pos + 1, Errors.IllegalNonasciiDigit);
! if (codePoint >= 0)
! scanChar();
! ch = "0123456789abcdef".charAt(result);
}
- return result;
}
! protected boolean isUnicode() {
! return unicodeConversionBp == bp;
}
! protected void skipChar() {
! bp++;
}
- protected char peekChar() {
- return buf[bp + 1];
}
/**
! * Returns a copy of the input buffer, up to its inputLength.
! * Unicode escape sequences are not translated.
*/
public char[] getRawCharacters() {
! char[] chars = new char[buflen];
! System.arraycopy(buf, 0, chars, 0, buflen);
! return chars;
}
/**
* Returns a copy of a character array subset of the input buffer.
* The returned array begins at the {@code beginIndex} and
--- 23,524 ----
* questions.
*/
package com.sun.tools.javac.parser;
import java.util.Arrays;
import com.sun.tools.javac.resources.CompilerProperties.Errors;
import com.sun.tools.javac.util.Log;
! import static com.sun.tools.javac.util.LayoutCharacters.EOI;
! import static com.sun.tools.javac.util.LayoutCharacters.tabulate;
! /**
! * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters
! * one by one as contained in the input stream, handling unicode escape sequences accordingly.
*
* <p><b>This is NOT part of any supported API.
* If you write code that depends on this, you do so at your own risk.
* This code and its internal interfaces are subject to change or
! * deletion without notice.</b></p>
*/
public class UnicodeReader {
+ /**
+ * Buffer containing characters from source file. May contain extraneous characters
+ * beyond this.length.
+ */
+ private final char[] buffer;
! /**
! * Length of meaningful content in buffer.
*/
! private final int length;
! /**
! * Character buffer index of character currently being observed.
*/
! private int position;
! /**
! * Number of characters combined to provide character currently being observed. Typically
! * one, but may be more when combinations of surrogate pairs and unicode escape sequences
! * are read.
! */
! private int width;
!
! /**
! * Character currently being observed. If a surrogate pair is read then will be the high
! * member of the pair.
! */
! private char character;
!
! /**
! * Codepoint of character currently being observed. Typically equivalent to the character
! * but will have a value greater that 0xFFFF when a surrogate pair.
*/
! private int codepoint;
! /**
! * true if the last character was a backslash. This is used to handle the special case
! * when a backslash precedes a unicode escape sequence. In that case, the second backslash
! * is treated as a backslash and not part of a unicode escape sequence.
! */
! private boolean wasBackslash;
! /**
! * Log for error reporting.
*/
! private final Log log;
/**
 * Creates a reader over the supplied source characters and positions it on the
 * first code point.
 *
 * @param sf     scanner factory; its log is used for error reporting.
 * @param array  array holding the contents of the source.
 * @param length number of meaningful characters in {@code array}.
 */
protected UnicodeReader(ScannerFactory sf, char[] array, int length) {
    // Error reporting and source content.
    this.log = sf.log;
    this.buffer = array;
    this.length = length;

    // Nothing has been observed yet.
    this.wasBackslash = false;
    this.codepoint = 0;
    this.character = '\0';
    this.width = 0;
    this.position = 0;

    // Load the first code point so that get() is valid immediately.
    nextCodePoint();
}
!
! /**
! * Returns the length of the buffer. This is length of meaningful content in buffer and
! * not the length of the buffer array.
! *
! * @return length of the buffer.
*/
! protected int length() {
! return length;
}
! /**
! * Return true if current position is past the end of the meaningful part of the buffer.
! *
! * @return true if current position is past the end of the meaningful part of the buffer.
! */
! protected boolean isEOF() {
! return position >= length;
}
+
+ /**
+ * Fetches the next 16-bit character from the buffer and places it in this.character.
+ */
+ private void nextCharacter() {
+ // Index of next character in buffer.
+ int index = position + width;
+
+ // If past end of buffer.
+ if (length <= index) {
+ // End of file is marked with EOI.
+ character = EOI;
+ } else {
+ // Next character in buffer.
+ character = buffer[index];
+ // Increment length of codepoint.
+ width++;
}
}
! /**
! * Fetches the next 16-bit character from the buffer. If an unicode escape sequence
! * is detected then converts the unicode escape sequence to a character.
*/
! private void nextUnicode() {
! // Position to next codepoint.
! position += width;
! // Codepoint has no characters yet.
! width = 0;
!
! // Fetch next character.
! nextCharacter();
!
! // If second backslash is detected.
! if (wasBackslash) {
! // Treat like a normal character (not part of unicode escape sequence.)
! wasBackslash = false;
! } else if (character == '\\') {
! // May be a unicode escape sequence.
! wasBackslash = !unicodeEscape();
}
+
+ // Codepoint and character match if not surrogate.
+ codepoint = (int)character;
}
+
+ /**
+ * Fetches the nextcode point from the buffer. If an unicode escape sequence is recognized
+ * then converts unicode escape sequence to a character. If two characters are a surrogate pair
+ * then converts to a codepoint.
+ */
+ private void nextCodePoint() {
+ // Next unicode character.
+ nextUnicode();
+
+ // Return early if ASCII or not a surrogate pair.
+ if (isASCII() || !Character.isHighSurrogate(character)) {
+ return;
}
! // Capture high surrogate and position.
! char hi = character;
! int savePosition = position;
! int saveWidth = width;
!
! // Get potential low surrogate.
! nextUnicode();
! char lo = character;
!
! if (Character.isLowSurrogate(lo)) {
! // Start codepoint at start of high surrogate.
! position = savePosition;
! width += saveWidth;
! // Compute codepoint.
! codepoint = Character.toCodePoint(hi, lo);
} else {
! // Restore to treat high surrogate as just a character.
! position = savePosition;
! width = saveWidth;
! character = hi;
! codepoint = (int)hi;
! // Could potential report an error here (old code did not.)
}
}
! /**
! * Converts an unicode escape sequence into a character.
! *
! * @return true if was a valid escape sequence.
*/
! private boolean unicodeEscape() {
! // Start of unicode escape sequence (past backslash.)
! int start = position + width;
! int index;
!
! // Skip multiple 'u'.
! for (index = start; index < length; index++) {
! if (buffer[index] != 'u') {
! break;
! }
}
! // Needs to be at least backslash-u.
! if (index != start) {
! // If enough characters available.
! if (index + 4 < length) {
! // Convert four hex digits to codepoint. If any digit is invalid then the
! // result is negative.
! int code = (Character.digit(buffer[index++], 16) << 12) |
! (Character.digit(buffer[index++], 16) << 8) |
! (Character.digit(buffer[index++], 16) << 4) |
! Character.digit(buffer[index++], 16);
!
! // If all digits are good.
! if (code >= 0) {
! width = index - position;
! character = (char)code;
!
! return true;
! }
}
! // Did not work out.
! log.error(position, Errors.IllegalUnicodeEsc);
! width = index - position;
!
! return true;
}
! // Must be just a backslash.
! character = '\\';
! width = 1;
!
! return false;
}
! /**
! * Return the current position in the character buffer.
! *
! * @return current position in the character buffer.
! */
! protected int position() {
! return position;
}
!
! /**
! * Reset the reader to the specified position.
! * Warning: Do not use when previous character was an ASCII or unicode backslash.
! * @param pos
! */
! protected void reset(int pos) {
! position = pos;
! width = 0;
! wasBackslash = false;
! nextCodePoint();
}
! /**
! * Return the current character in at the current position.
! *
! * @return current character in at the current position.
! */
! protected char get() {
! return character;
}
! /**
! * Return the current codepoint in at the current position.
! *
! * @return current codepoint in at the current position.
*/
! protected int getCodepoint() {
! return codepoint;
}
+
+ /**
+ * Returns true if the current codepoint is a surrogate.
+ *
+ * @return true if the current codepoint is a surrogate.
+ */
+ protected boolean isSurrogate() {
+ return 0xFFFF < codepoint;
}
! /**
! * Returns true if the current character is ASCII.
! *
! * @return true if the current character is ASCII.
*/
! protected boolean isASCII() {
! return character <= 0x7F;
}
! /**
! * Advances the current character to the next character.
! *
! * @return next character.
*/
! protected char next() {
! nextCodePoint();
!
! return character;
}
+
+ /**
+ * Compare character. Returns true if a match.
+ *
+ * @param ch character to match.
+ *
+ * @return true if a match.
+ */
+ protected boolean is(char ch) {
+ return character == ch;
}
!
! /**
! * Match one of the arguments. Returns true if a match.
! */
! protected boolean isOneOf(char ch1, char ch2) {
! return is(ch1) || is(ch2);
}
+ protected boolean isOneOf(char ch1, char ch2, char ch3) {
+ return is(ch1) || is(ch2) || is(ch3);
}
+ protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
+ return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
}
! /**
! * Tests to see if current character is in the range of lo to hi characters (inclusive).
! *
! * @param lo lowest character in range.
! * @param hi highest character in range.
! *
! * @return true if the current character is in range.
! */
! protected boolean inRange(char lo, char hi) {
! return lo <= character && character <= hi;
! }
!
! /**
! * Compare character and advance if a match. Returns true if a match.
! *
! * @param ch character to match.
! *
! * @return true if a match.
*/
! protected boolean accept(char ch) {
! if (is(ch)) {
! next();
!
return true;
}
+
+ return false;
}
! /**
! * Match one of the arguments and advance if a match. Returns true if a match.
*/
! protected boolean acceptOneOf(char ch1, char ch2) {
! if (isOneOf(ch1, ch2)) {
! next();
! return true;
! }
! return false;
! }
! protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
! if (isOneOf(ch1, ch2, ch3)) {
! next();
! return true;
}
+
+ return false;
}
! /**
! * Skip over all occurances of character.
! *
! * @param ch character to accept.
! */
! protected void skip(char ch) {
! while (accept(ch)) {
! // next
! }
}
! /**
! * Skip over ASCII white space characters.
*/
! protected void skipWhitespace() {
! while (acceptOneOf(' ', '\t', '\f')) {
! // next
}
}
! /**
! * Skip to end of line.
! */
! protected void skipToEOLN() {
! while (!isEOF()) {
! if (isOneOf('\r', '\n')) {
! break;
}
! next();
}
}
/**
! * Compare string and advance if a match. Returns true if a match.
! * Warning: Do not use when previous character was a backslash
! * (confuses state of wasBackslash.)
! *
! * @param string string to match character for character.
! *
! * @return true if a match.
! */
! protected boolean accept(String string) {
! // Quick test.
! if (string.length() == 0 || !is(string.charAt(0))) {
! return false;
! }
!
! // Be prepared to retreat if not a match.
! int savedPosition = position;
!
! nextCodePoint();
!
! // Check each character.
! for (int i = 1; i < string.length(); i++) {
! if (!is(string.charAt(i))) {
! // Restart if not a match.
! reset(savedPosition);
!
! return false;
! }
!
! nextCodePoint();
! }
!
! return true;
! }
!
! /**
! * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not
! * advance character.
! *
! * @param pos starting position.
! * @param digitRadix base of number being converted.
! *
! * @return value of digit.
! */
! protected int digit(int pos, int digitRadix) {
! int result;
!
! // Just an ASCII digit.
! if (inRange('0', '9')) {
! // Fast common case.
! result = character - '0';
!
! return result < digitRadix ? result : -1;
! }
!
! // Handle other digits.
! result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
! Character.digit(character, digitRadix);
!
! if (result >= 0 && !isASCII()) {
! log.error(position(), Errors.IllegalNonasciiDigit);
! character = "0123456789abcdef".charAt(result);
! }
!
! return result;
! }
!
! /**
! * Returns the input buffer. Unicode escape sequences are not translated.
! *
! * @return the input buffer.
*/
public char[] getRawCharacters() {
! return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
}
/**
* Returns a copy of a character array subset of the input buffer.
* The returned array begins at the {@code beginIndex} and
*** 297,311 ****
* {@code String.substring(beginIndex, endIndex)}.
* Unicode escape sequences are not translated.
*
* @param beginIndex the beginning index, inclusive.
* @param endIndex the ending index, exclusive.
* @throws ArrayIndexOutOfBoundsException if either offset is outside of the
* array bounds
*/
public char[] getRawCharacters(int beginIndex, int endIndex) {
! int length = endIndex - beginIndex;
! char[] chars = new char[length];
! System.arraycopy(buf, beginIndex, chars, 0, length);
! return chars;
}
}
--- 528,610 ----
* {@code String.substring(beginIndex, endIndex)}.
* Unicode escape sequences are not translated.
*
* @param beginIndex the beginning index, inclusive.
* @param endIndex the ending index, exclusive.
+ *
* @throws ArrayIndexOutOfBoundsException if either offset is outside of the
* array bounds
*/
public char[] getRawCharacters(int beginIndex, int endIndex) {
! return Arrays.copyOfRange(buffer, beginIndex, endIndex);
}
+
+ /**
+ * This is a specialized version of UnicodeReader that keeps track of the
+ * column position within a given character stream. Used for Javadoc
+ * processing to build a table for mapping positions in the comment string
+ * to positions in the source file.
+ */
+ static class PositionTrackingReader extends UnicodeReader {
+ /**
+ * Offset from the beginning of the original reader buffer.
+ */
+ private int offset;
+
+ /**
+ * Current column in the comment.
+ */
+ private int column;
+
+ /**
+ * Constructor.
+ *
+ * @param sf Scan factory.
+ * @param array Array containing contents of source.
+ * @param offset Position offset in original source buffer.
+ */
+ protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) {
+ super(sf, array, array.length);
+ this.offset = offset;
+ this.column = 0;
+ }
+
+ /**
+ * Advances the current character to the next character. Tracks column.
+ *
+ * @return next character.
+ */
+ @Override
+ protected char next() {
+ super.next();
+
+ if (isOneOf('\n', '\r', '\f')) {
+ column = 0;
+ } else if (is('\t')) {
+ column = tabulate(column);
+ } else {
+ column++;
+ }
+
+ return get();
+ }
+
+ /**
+ * Returns the current column.
+ *
+ * @return the current column.
+ */
+ protected int column() {
+ return column;
+ }
+
+ /**
+ * Returns position relative to the original source buffer.
+ *
+ * @return
+ */
+ protected int offsetPosition() {
+ return position() + offset;
+ }
+ }
+
}
< prev index next >