< prev index next >

src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java

Print this page
rev 60227 : 8224225: Tokenizer improvements
Reviewed-by: jlaskey

*** 1,7 **** /* ! * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 23,293 **** * questions. */ package com.sun.tools.javac.parser; - import java.nio.CharBuffer; import java.util.Arrays; - import com.sun.tools.javac.file.JavacFileManager; import com.sun.tools.javac.resources.CompilerProperties.Errors; - import com.sun.tools.javac.util.ArrayUtils; import com.sun.tools.javac.util.Log; - import com.sun.tools.javac.util.Name; - import com.sun.tools.javac.util.Names; ! import static com.sun.tools.javac.util.LayoutCharacters.*; ! /** The char reader used by the javac lexer/tokenizer. Returns the sequence of ! * characters contained in the input stream, handling unicode escape accordingly. ! * Additionally, it provides features for saving chars into a buffer and to retrieve ! * them at a later stage. * * <p><b>This is NOT part of any supported API. * If you write code that depends on this, you do so at your own risk. * This code and its internal interfaces are subject to change or ! * deletion without notice.</b> */ public class UnicodeReader { ! /** The input buffer, index of next character to be read, ! * index of one past last character in buffer. */ ! protected char[] buf; ! protected int bp; ! protected final int buflen; ! /** The current character. */ ! protected char ch; ! /** The buffer index of the last converted unicode character */ ! protected int unicodeConversionBp = -1; ! protected Log log; ! protected Names names; ! /** A character buffer for saved chars. */ ! protected char[] sbuf = new char[128]; ! protected int realLength; ! protected int sp; /** ! * Create a scanner from the input array. This method might ! * modify the array. To avoid copying the input array, ensure ! * that {@code inputLength < input.length} or ! * {@code input[input.length -1]} is a white space character. * ! * @param sf the factory which created this Scanner ! * @param buffer the input, might be modified ! * Must be positive and less than or equal to input.length. */ ! protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) { ! 
this(sf, JavacFileManager.toArray(buffer), buffer.limit()); } ! protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) { ! log = sf.log; ! names = sf.names; ! realLength = inputLength; ! if (inputLength == input.length) { ! if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) { ! inputLength--; ! } else { ! input = Arrays.copyOf(input, inputLength + 1); } } - buf = input; - buflen = inputLength; - buf[buflen] = EOI; - bp = -1; - scanChar(); } ! /** Read next character. */ ! protected void scanChar() { ! if (bp < buflen) { ! ch = buf[++bp]; ! if (ch == '\\') { ! convertUnicode(); } } } ! /** Read next character in comment, skipping over double '\' characters. ! */ ! protected void scanCommentChar() { ! scanChar(); ! if (ch == '\\') { ! if (peekChar() == '\\' && !isUnicode()) { ! skipChar(); } else { ! convertUnicode(); ! } } } ! /** Append a character to sbuf. */ ! protected void putChar(char ch, boolean scan) { ! sbuf = ArrayUtils.ensureCapacity(sbuf, sp); ! sbuf[sp++] = ch; ! if (scan) ! scanChar(); } ! protected void putChar(char ch) { ! putChar(ch, false); } ! protected void putChar(boolean scan) { ! putChar(ch, scan); } ! protected void nextChar(boolean skip) { ! if (!skip) { ! sbuf = ArrayUtils.ensureCapacity(sbuf, sp); ! sbuf[sp++] = ch; } ! scanChar(); } ! Name name() { ! return names.fromChars(sbuf, 0, sp); } ! String chars() { ! return new String(sbuf, 0, sp); } ! /** Add 'count' copies of the character 'ch' to the string buffer. */ ! protected void repeat(char ch, int count) { ! for ( ; 0 < count; count--) { ! putChar(ch, false); } } ! /** Reset the scan buffer pointer to 'pos'. */ ! protected void reset(int pos) { ! bp = pos - 1; ! scanChar(); } ! /** Convert unicode escape; bp points to initial '\' character ! * (Spec 3.3). */ ! protected void convertUnicode() { ! if (ch == '\\' && unicodeConversionBp != bp ) { ! bp++; ch = buf[bp]; ! if (ch == 'u') { ! do { ! bp++; ch = buf[bp]; ! } while (ch == 'u'); ! 
int limit = bp + 3; ! if (limit < buflen) { ! int d = digit(bp, 16); ! int code = d; ! while (bp < limit && d >= 0) { ! bp++; ch = buf[bp]; ! d = digit(bp, 16); ! code = (code << 4) + d; ! } ! if (d >= 0) { ! ch = (char)code; ! unicodeConversionBp = bp; ! return; } } ! log.error(bp, Errors.IllegalUnicodeEsc); ! } else { ! bp--; ! ch = '\\'; } } } ! /** Are surrogates supported? */ ! final static boolean surrogatesSupported = surrogatesSupported(); ! private static boolean surrogatesSupported() { ! try { ! Character.isHighSurrogate('a'); return true; - } catch (NoSuchMethodError ex) { - return false; } } ! /** Scan surrogate pairs. If 'ch' is a high surrogate and ! * the next character is a low surrogate, returns the code point ! * constructed from these surrogates. Otherwise, returns -1. ! * This method will not consume any of the characters. */ ! protected int peekSurrogates() { ! if (surrogatesSupported && Character.isHighSurrogate(ch)) { ! char high = ch; ! int prevBP = bp; ! scanChar(); ! char low = ch; ! ch = high; ! bp = prevBP; ! if (Character.isLowSurrogate(low)) { ! return Character.toCodePoint(high, low); } } ! return -1; } ! /** Convert an ASCII digit from its base (8, 10, or 16) ! * to its value. */ ! protected int digit(int pos, int base) { ! char c = ch; ! if ('0' <= c && c <= '9') ! return Character.digit(c, base); //a fast common case ! int codePoint = peekSurrogates(); ! int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base); ! if (result >= 0 && c > 0x7f) { ! log.error(pos + 1, Errors.IllegalNonasciiDigit); ! if (codePoint >= 0) ! scanChar(); ! ch = "0123456789abcdef".charAt(result); } - return result; } ! protected boolean isUnicode() { ! return unicodeConversionBp == bp; } ! protected void skipChar() { ! bp++; } - protected char peekChar() { - return buf[bp + 1]; } /** ! * Returns a copy of the input buffer, up to its inputLength. ! * Unicode escape sequences are not translated. 
*/ public char[] getRawCharacters() { ! char[] chars = new char[buflen]; ! System.arraycopy(buf, 0, chars, 0, buflen); ! return chars; } /** * Returns a copy of a character array subset of the input buffer. * The returned array begins at the {@code beginIndex} and --- 23,524 ---- * questions. */ package com.sun.tools.javac.parser; import java.util.Arrays; import com.sun.tools.javac.resources.CompilerProperties.Errors; import com.sun.tools.javac.util.Log; ! import static com.sun.tools.javac.util.LayoutCharacters.EOI; ! import static com.sun.tools.javac.util.LayoutCharacters.tabulate; ! /** ! * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters ! * one by one as contained in the input stream, handling unicode escape sequences accordingly. * * <p><b>This is NOT part of any supported API. * If you write code that depends on this, you do so at your own risk. * This code and its internal interfaces are subject to change or ! * deletion without notice.</b></p> */ public class UnicodeReader { + /** + * Buffer containing characters from source file. May contain extraneous characters + * beyond this.length. + */ + private final char[] buffer; ! /** ! * Length of meaningful content in buffer. */ ! private final int length; ! /** ! * Character buffer index of character currently being observed. */ ! private int position; ! /** ! * Number of characters combined to provide character currently being observed. Typically ! * one, but may be more when combinations of surrogate pairs and unicode escape sequences ! * are read. ! */ ! private int width; ! ! /** ! * Character currently being observed. If a surrogate pair is read then will be the high ! * member of the pair. ! */ ! private char character; ! ! /** ! * Codepoint of character currently being observed. Typically equivalent to the character ! * but will have a value greater than 0xFFFF when a surrogate pair. */ ! private int codepoint; ! /** ! 
* true if the last character was a backslash. This is used to handle the special case ! * when a backslash precedes a unicode escape sequence. In that case, the second backslash ! * is treated as a backslash and not part of a unicode escape sequence. ! */ ! private boolean wasBackslash; ! /** ! * Log for error reporting. */ ! private final Log log; /** ! * Constructor. * ! * @param sf scan factory. ! * @param array array containing contents of source. ! * @param length length of meaningful content in buffer. ! */ ! protected UnicodeReader(ScannerFactory sf, char[] array, int length) { ! this.buffer = array; ! this.length = length; ! this.position = 0; ! this.width = 0; ! this.character = '\0'; ! this.codepoint = 0; ! this.wasBackslash = false; ! this.log = sf.log; ! ! nextCodePoint(); ! } ! ! /** ! * Returns the length of the buffer. This is the length of meaningful content in buffer and ! * not the length of the buffer array. ! * ! * @return length of the buffer. */ ! protected int length() { ! return length; } ! /** ! * Return true if current position is past the end of the meaningful part of the buffer. ! * ! * @return true if current position is past the end of the meaningful part of the buffer. ! */ ! protected boolean isEOF() { ! return position >= length; } + + /** + * Fetches the next 16-bit character from the buffer and places it in this.character. + */ + private void nextCharacter() { + // Index of next character in buffer. + int index = position + width; + + // If past end of buffer. + if (length <= index) { + // End of file is marked with EOI. + character = EOI; + } else { + // Next character in buffer. + character = buffer[index]; + // Increment length of codepoint. + width++; } } ! /** ! * Fetches the next 16-bit character from the buffer. If a unicode escape sequence ! * is detected then converts the unicode escape sequence to a character. */ ! private void nextUnicode() { ! // Position to next codepoint. ! position += width; ! 
// Codepoint has no characters yet. ! width = 0; ! ! // Fetch next character. ! nextCharacter(); ! ! // If second backslash is detected. ! if (wasBackslash) { ! // Treat like a normal character (not part of unicode escape sequence.) ! wasBackslash = false; ! } else if (character == '\\') { ! // May be a unicode escape sequence. ! wasBackslash = !unicodeEscape(); } + + // Codepoint and character match if not surrogate. + codepoint = (int)character; } + + /** + * Fetches the next code point from the buffer. If a unicode escape sequence is recognized + * then converts unicode escape sequence to a character. If two characters are a surrogate pair + * then converts to a codepoint. + */ + private void nextCodePoint() { + // Next unicode character. + nextUnicode(); + + // Return early if ASCII or not a surrogate pair. + if (isASCII() || !Character.isHighSurrogate(character)) { + return; } ! // Capture high surrogate and position. ! char hi = character; ! int savePosition = position; ! int saveWidth = width; ! ! // Get potential low surrogate. ! nextUnicode(); ! char lo = character; ! ! if (Character.isLowSurrogate(lo)) { ! // Start codepoint at start of high surrogate. ! position = savePosition; ! width += saveWidth; ! // Compute codepoint. ! codepoint = Character.toCodePoint(hi, lo); } else { ! // Restore to treat high surrogate as just a character. ! position = savePosition; ! width = saveWidth; ! character = hi; ! codepoint = (int)hi; ! // Could potentially report an error here (old code did not.) } } ! /** ! * Converts a unicode escape sequence into a character. ! * ! * @return true if the backslash began a unicode escape sequence (an error is logged if it was malformed.) */ ! private boolean unicodeEscape() { ! // Start of unicode escape sequence (past backslash.) ! int start = position + width; ! int index; ! ! // Skip multiple 'u'. ! for (index = start; index < length; index++) { ! if (buffer[index] != 'u') { ! break; ! } } ! // Needs to be at least backslash-u. ! if (index != start) { ! // If enough characters available. ! 
if (index + 4 < length) { ! // Convert four hex digits to codepoint. If any digit is invalid then the ! // result is negative. ! int code = (Character.digit(buffer[index++], 16) << 12) | ! (Character.digit(buffer[index++], 16) << 8) | ! (Character.digit(buffer[index++], 16) << 4) | ! Character.digit(buffer[index++], 16); ! ! // If all digits are good. ! if (code >= 0) { ! width = index - position; ! character = (char)code; ! ! return true; ! } } ! // Did not work out. ! log.error(position, Errors.IllegalUnicodeEsc); ! width = index - position; ! ! return true; } ! // Must be just a backslash. ! character = '\\'; ! width = 1; ! ! return false; } ! /** ! * Return the current position in the character buffer. ! * ! * @return current position in the character buffer. ! */ ! protected int position() { ! return position; } ! ! /** ! * Reset the reader to the specified position. ! * Warning: Do not use when previous character was an ASCII or unicode backslash. ! * @param pos position in the character buffer to reset to. ! */ ! protected void reset(int pos) { ! position = pos; ! width = 0; ! wasBackslash = false; ! nextCodePoint(); } ! /** ! * Return the current character at the current position. ! * ! * @return current character at the current position. ! */ ! protected char get() { ! return character; } ! /** ! * Return the current codepoint at the current position. ! * ! * @return current codepoint at the current position. */ ! protected int getCodepoint() { ! return codepoint; } + + /** + * Returns true if the current codepoint is a surrogate. + * + * @return true if the current codepoint is a surrogate. + */ + protected boolean isSurrogate() { + return 0xFFFF < codepoint; } ! /** ! * Returns true if the current character is ASCII. ! * ! * @return true if the current character is ASCII. */ ! protected boolean isASCII() { ! return character <= 0x7F; } ! /** ! * Advances the current character to the next character. ! * ! * @return next character. */ ! protected char next() { ! nextCodePoint(); ! ! 
return character; } + + /** + * Compare character. Returns true if a match. + * + * @param ch character to match. + * + * @return true if a match. + */ + protected boolean is(char ch) { + return character == ch; } ! ! /** ! * Match one of the arguments. Returns true if a match. ! */ ! protected boolean isOneOf(char ch1, char ch2) { ! return is(ch1) || is(ch2); } + protected boolean isOneOf(char ch1, char ch2, char ch3) { + return is(ch1) || is(ch2) || is(ch3); } + protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) { + return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6); } ! /** ! * Tests to see if current character is in the range of lo to hi characters (inclusive). ! * ! * @param lo lowest character in range. ! * @param hi highest character in range. ! * ! * @return true if the current character is in range. ! */ ! protected boolean inRange(char lo, char hi) { ! return lo <= character && character <= hi; ! } ! ! /** ! * Compare character and advance if a match. Returns true if a match. ! * ! * @param ch character to match. ! * ! * @return true if a match. */ ! protected boolean accept(char ch) { ! if (is(ch)) { ! next(); ! return true; } + + return false; } ! /** ! * Match one of the arguments and advance if a match. Returns true if a match. */ ! protected boolean acceptOneOf(char ch1, char ch2) { ! if (isOneOf(ch1, ch2)) { ! next(); ! return true; ! } ! return false; ! } ! protected boolean acceptOneOf(char ch1, char ch2, char ch3) { ! if (isOneOf(ch1, ch2, ch3)) { ! next(); ! return true; } + + return false; } ! /** ! * Skip over all occurrences of character. ! * ! * @param ch character to accept. ! */ ! protected void skip(char ch) { ! while (accept(ch)) { ! // next ! } } ! /** ! * Skip over ASCII white space characters. */ ! protected void skipWhitespace() { ! while (acceptOneOf(' ', '\t', '\f')) { ! // next } } ! /** ! * Skip to end of line. ! */ ! protected void skipToEOLN() { ! while (!isEOF()) { ! 
if (isOneOf('\r', '\n')) { ! break; } ! next(); } } /** ! * Compare string and advance if a match. Returns true if a match. ! * Warning: Do not use when previous character was a backslash ! * (confuses state of wasBackslash.) ! * ! * @param string string to match character for character. ! * ! * @return true if a match. ! */ ! protected boolean accept(String string) { ! // Quick test. ! if (string.length() == 0 || !is(string.charAt(0))) { ! return false; ! } ! ! // Be prepared to retreat if not a match. ! int savedPosition = position; ! ! nextCodePoint(); ! ! // Check each character. ! for (int i = 1; i < string.length(); i++) { ! if (!is(string.charAt(i))) { ! // Restart if not a match. ! reset(savedPosition); ! ! return false; ! } ! ! nextCodePoint(); ! } ! ! return true; ! } ! ! /** ! * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not ! * advance character. ! * ! * @param pos starting position. ! * @param digitRadix base of number being converted. ! * ! * @return value of digit. ! */ ! protected int digit(int pos, int digitRadix) { ! int result; ! ! // Just an ASCII digit. ! if (inRange('0', '9')) { ! // Fast common case. ! result = character - '0'; ! ! return result < digitRadix ? result : -1; ! } ! ! // Handle other digits. ! result = isSurrogate() ? Character.digit(codepoint, digitRadix) : ! Character.digit(character, digitRadix); ! ! if (result >= 0 && !isASCII()) { ! log.error(position(), Errors.IllegalNonasciiDigit); ! character = "0123456789abcdef".charAt(result); ! } ! ! return result; ! } ! ! /** ! * Returns the input buffer. Unicode escape sequences are not translated. ! * ! * @return the input buffer. */ public char[] getRawCharacters() { ! return length == buffer.length ? buffer : Arrays.copyOf(buffer, length); } /** * Returns a copy of a character array subset of the input buffer. * The returned array begins at the {@code beginIndex} and
*** 297,311 **** * {@code String.substring(beginIndex, endIndex)}. * Unicode escape sequences are not translated. * * @param beginIndex the beginning index, inclusive. * @param endIndex the ending index, exclusive. * @throws ArrayIndexOutOfBoundsException if either offset is outside of the * array bounds */ public char[] getRawCharacters(int beginIndex, int endIndex) { ! int length = endIndex - beginIndex; ! char[] chars = new char[length]; ! System.arraycopy(buf, beginIndex, chars, 0, length); ! return chars; } } --- 528,610 ---- * {@code String.substring(beginIndex, endIndex)}. * Unicode escape sequences are not translated. * * @param beginIndex the beginning index, inclusive. * @param endIndex the ending index, exclusive. + * * @throws ArrayIndexOutOfBoundsException if either offset is outside of the * array bounds */ public char[] getRawCharacters(int beginIndex, int endIndex) { ! return Arrays.copyOfRange(buffer, beginIndex, endIndex); } + + /** + * This is a specialized version of UnicodeReader that keeps track of the + * column position within a given character stream. Used for Javadoc + * processing to build a table for mapping positions in the comment string + * to positions in the source file. + */ + static class PositionTrackingReader extends UnicodeReader { + /** + * Offset from the beginning of the original reader buffer. + */ + private int offset; + + /** + * Current column in the comment. + */ + private int column; + + /** + * Constructor. + * + * @param sf Scan factory. + * @param array Array containing contents of source. + * @param offset Position offset in original source buffer. + */ + protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) { + super(sf, array, array.length); + this.offset = offset; + this.column = 0; + } + + /** + * Advances the current character to the next character. Tracks column. + * + * @return next character. 
+ */ + @Override + protected char next() { + super.next(); + + if (isOneOf('\n', '\r', '\f')) { + column = 0; + } else if (is('\t')) { + column = tabulate(column); + } else { + column++; + } + + return get(); + } + + /** + * Returns the current column. + * + * @return the current column. + */ + protected int column() { + return column; + } + + /** + * Returns position relative to the original source buffer. + * + * @return position relative to the original source buffer. + */ + protected int offsetPosition() { + return position() + offset; + } + } + }
< prev index next >