/* * Copyright (c) 2010, 2014, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. 
*/

package javafx.css;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import com.sun.javafx.css.parser.LexerState;
import com.sun.javafx.css.parser.Recognizer;
import com.sun.javafx.css.parser.Token;

/**
 * Table-driven lexer for CSS text.
 *
 * <p>Identifiers, hash names, functions, urls and numbers (with optional
 * units) are recognized by walking a small state machine whose transitions
 * are built in {@link #createStateMap()}. Everything else (punctuation,
 * strings, comments, white space, {@code !important}) is handled directly in
 * {@link #getToken()}.
 *
 * <p>Usage: call {@link #setReader(Reader)} once per input, then call
 * {@link #nextToken()} repeatedly. The lexer keeps mutable scanning state in
 * instance fields.
 */
final class CssLexer {

    /* Token types produced by this lexer. The values are part of the
     * package-level contract and must not be renumbered. */
    final static int STRING = 10;
    final static int IDENT = 11;
    final static int FUNCTION = 12;
    final static int NUMBER = 13;
    final static int CM = 14;
    final static int EMS = 15;
    final static int EXS = 16;
    final static int IN = 17;
    final static int MM = 18;
    final static int PC = 19;
    final static int PT = 20;
    final static int PX = 21;
    final static int PERCENTAGE = 22;
    final static int DEG = 23;
    final static int GRAD = 24;
    final static int RAD = 25;
    final static int TURN = 26;
    final static int GREATER = 27;
    final static int LBRACE = 28;
    final static int RBRACE = 29;
    final static int SEMI = 30;
    final static int COLON = 31;
    final static int SOLIDUS = 32;
    final static int STAR = 33;
    final static int LPAREN = 34;
    final static int RPAREN = 35;
    final static int COMMA = 36;
    final static int HASH = 37;
    final static int DOT = 38;
    final static int IMPORTANT_SYM = 39;
    final static int WS = 40;
    final static int NL = 41;
    final static int FONT_FACE = 42;
    final static int URL = 43;
    final static int IMPORT = 44;
    final static int SECONDS = 45;
    final static int MS = 46;
    final static int AT_KEYWORD = 47;

    /* Case-insensitive single-letter recognizers. */
    private final Recognizer A = (c) -> c == 'a' || c == 'A';
    private final Recognizer B = (c) -> c == 'b' || c == 'B';
    private final Recognizer C = (c) -> c == 'c' || c == 'C';
    private final Recognizer D = (c) -> c == 'd' || c == 'D';
    private final Recognizer E = (c) -> c == 'e' || c == 'E';
    private final Recognizer F = (c) -> c == 'f' || c == 'F';
    private final Recognizer G = (c) -> c == 'g' || c == 'G';
    private final Recognizer H = (c) -> c == 'h' || c == 'H';
    private final Recognizer I = (c) -> c == 'i' || c == 'I';
    private final Recognizer J = (c) -> c == 'j' || c == 'J';
    private final Recognizer K = (c) -> c == 'k' || c == 'K';
    private final Recognizer L = (c) -> c == 'l' || c == 'L';
    private final Recognizer M = (c) -> c == 'm' || c == 'M';
    private final Recognizer N = (c) -> c == 'n' || c == 'N';
    private final Recognizer O = (c) -> c == 'o' || c == 'O';
    private final Recognizer P = (c) -> c == 'p' || c == 'P';
    private final Recognizer Q = (c) -> c == 'q' || c == 'Q';
    private final Recognizer R = (c) -> c == 'r' || c == 'R';
    private final Recognizer S = (c) -> c == 's' || c == 'S';
    private final Recognizer T = (c) -> c == 't' || c == 'T';
    private final Recognizer U = (c) -> c == 'u' || c == 'U';
    private final Recognizer V = (c) -> c == 'v' || c == 'V';
    private final Recognizer W = (c) -> c == 'w' || c == 'W';
    private final Recognizer X = (c) -> c == 'x' || c == 'X';
    private final Recognizer Y = (c) -> c == 'y' || c == 'Y';
    private final Recognizer Z = (c) -> c == 'z' || c == 'Z';

    /* Character-class recognizers. */
    private final Recognizer ALPHA = (c) -> ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    private final Recognizer NON_ASCII = (c) -> '\u0080' <= c && c <= '\uFFFF';
    private final Recognizer DOT_CHAR = (c) -> c == '.';
    private final Recognizer GREATER_CHAR = (c) -> c == '>';
    private final Recognizer LBRACE_CHAR = (c) -> c == '{';
    private final Recognizer RBRACE_CHAR = (c) -> c == '}';
    private final Recognizer SEMI_CHAR = (c) -> c == ';';
    private final Recognizer COLON_CHAR = (c) -> c == ':';
    private final Recognizer SOLIDUS_CHAR = (c) -> c == '/';
    private final Recognizer MINUS_CHAR = (c) -> c == '-';
    private final Recognizer PLUS_CHAR = (c) -> c == '+';
    private final Recognizer STAR_CHAR = (c) -> c == '*';
    private final Recognizer LPAREN_CHAR = (c) -> c == '(';
    private final Recognizer RPAREN_CHAR = (c) -> c == ')';
    private final Recognizer COMMA_CHAR = (c) -> c == ',';
    private final Recognizer UNDERSCORE_CHAR = (c) -> c == '_';
    private final Recognizer HASH_CHAR = (c) -> c == '#';
    private final Recognizer WS_CHARS =
            (c) -> c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f';
    private final Recognizer NL_CHARS = (c) -> (c == '\r' || c == '\n');
    private final Recognizer DIGIT = (c) -> '0' <= c && c <= '9';
    private final Recognizer HEX_DIGIT =
            (c) -> ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');

    // The initial state accepts any character
    final LexerState initState = new LexerState("initState", null) {
        @Override
        public boolean accepts(int c) {
            return true;
        }
    };

    final LexerState hashState = new LexerState("hashState", HASH_CHAR);

    final LexerState minusState = new LexerState("minusState", MINUS_CHAR);

    final LexerState plusState = new LexerState("plusState", PLUS_CHAR);

    // The dot char is either just a dot or may be the start of a number
    final LexerState dotState = new LexerState(DOT, "dotState", DOT_CHAR);

    // nmstart [_a-z]|{nonascii}|{escape}
    final LexerState nmStartState =
            new LexerState(IDENT, "nmStartState", UNDERSCORE_CHAR, ALPHA);

    // nmchar [_a-z0-9-]|{nonascii}|{escape}
    final LexerState nmCharState =
            new LexerState(IDENT, "nmCharState", UNDERSCORE_CHAR, ALPHA, DIGIT, MINUS_CHAR);

    // same as nmchar, but need to differentiate between nmchar in an ident
    // and nmchar in a hash name, since the resulting token type differs
    final LexerState hashNameCharState =
            new LexerState(HASH, "hashNameCharState", UNDERSCORE_CHAR, ALPHA, DIGIT, MINUS_CHAR);

    // lparen after ident implies function; "url(" is special-cased so that
    // the whole url is consumed as a single URL token
    final LexerState lparenState = new LexerState(FUNCTION, "lparenState", LPAREN_CHAR) {
        @Override
        public int getType() {
            if (text.indexOf("url(") == 0) {
                try {
                    return consumeUrl();
                } catch (IOException ioe) {
                    return Token.INVALID;
                }
            }
            return super.getType();
        }
    };

    // initial digits in a number
    final LexerState leadingDigitsState =
            new LexerState(NUMBER, "leadingDigitsState", DIGIT);

    // If the dot char follows leading digits, a plus or a minus, then it is
    // a decimal mark
    final LexerState decimalMarkState = new LexerState("decimalMarkState", DOT_CHAR);

    // digits following decimal mark
    final LexerState trailingDigitsState =
            new LexerState(NUMBER, "trailingDigitsState", DIGIT);

    // http://www.w3.org/TR/css3-values/
    final LexerState unitsState = new UnitsState();

    /**
     * Builds the transition table: for each state, the ordered list of states
     * that may follow it. {@link #getToken()} tries the candidates in array
     * order and takes the first one that accepts the current character.
     */
    private Map<LexerState, LexerState[]> createStateMap() {

        final Map<LexerState, LexerState[]> map = new HashMap<LexerState, LexerState[]>();

        // initState -- [#]    --> hashState
        // initState -- [-]    --> minusState
        // initState -- [_a-z] --> nmStartState
        // initState -- [+]    --> plusState
        // initState -- [0-9]  --> leadingDigitsState
        // initState -- [.]    --> dotState
        // (The original listed minusState twice; the second entry could never
        // be selected because the first accepting state wins.)
        map.put(initState, new LexerState[] {
            hashState,
            minusState,
            nmStartState,
            plusState,
            leadingDigitsState,
            dotState
        });

        // minus could be the start of an ident or a number
        // minusState -- [_a-z] --> nmStartState
        // minusState -- [0-9]  --> leadingDigitsState
        // minusState -- [.]    --> decimalMarkState
        map.put(minusState, new LexerState[] {
            nmStartState,
            leadingDigitsState,
            decimalMarkState
        });

        //
        // # {name}
        // hash {nmchar}+
        // hashState         -- [_a-z0-9-] --> hashNameCharState
        // hashNameCharState -- [_a-z0-9-] --> hashNameCharState
        //
        map.put(hashState, new LexerState[] {
            hashNameCharState
        });

        map.put(hashNameCharState, new LexerState[] {
            hashNameCharState
        });

        //
        // {ident}
        // ident '-'? {nmchar}+
        // nmStartState -- [_a-z0-9-] --> nmCharState
        // nmCharState  -- [_a-z0-9-] --> nmCharState
        // nmCharState  -- [(]        --> lparenState
        //
        map.put(nmStartState, new LexerState[] {
            nmCharState
        });

        map.put(nmCharState, new LexerState[] {
            nmCharState,
            lparenState
        });

        // from + state, next state must be a digit or a dot
        map.put(plusState, new LexerState[] {
            leadingDigitsState,
            decimalMarkState
        });

        // from leadingDigitsState, next state must be
        // another digit, a decimal mark, or units
        map.put(leadingDigitsState, new LexerState[] {
            leadingDigitsState,
            decimalMarkState,
            unitsState
        });

        // from decimal mark, next state must be a digit.
        // Need to map both dotState and decimalMarkState
        // since dot might be the first character and would
        // not be seen as a decimal point.
        map.put(dotState, new LexerState[] {
            trailingDigitsState
        });

        map.put(decimalMarkState, new LexerState[] {
            trailingDigitsState
        });

        // from trailingDigitsState, next state must be another digit or units
        map.put(trailingDigitsState, new LexerState[] {
            trailingDigitsState,
            unitsState
        });

        // UnitsState stays in UnitsState
        map.put(unitsState, new LexerState[] {
            unitsState
        });

        return map;
    }

    /** Creates a lexer with no input; call {@link #setReader(Reader)} before use. */
    CssLexer() {
        this.stateMap = createStateMap();
        this.text = new StringBuilder(64);
        this.currentState = initState;
    }

    /**
     * Resets all scanning state and primes the lexer with the first character
     * of {@code reader}. If that first read fails, the pending token is set to
     * EOF so that {@link #nextToken()} reports end of input.
     */
    void setReader(Reader reader) {
        this.reader = reader;
        lastc = -1;
        pos = offset = 0;
        line = 1;
        this.currentState = initState;
        this.token = null;
        try {
            this.ch = readChar();
        } catch (IOException ioe) {
            token = Token.EOF_TOKEN;
        }
    }

    /**
     * Scans an {@code !important} symbol. Called with {@code ch == '!'}.
     *
     * <p>White space and comments between the '!' and the keyword are skipped.
     * If the keyword does not match, input is discarded up to (not including)
     * the next ';' or '}' and a SKIP token is returned.
     */
    private Token scanImportant() throws IOException {
        // CSS 2.1 grammar for important_sym
        // "!"({w}|{comment})*{I}{M}{P}{O}{R}{T}{A}{N}{T}
        final Recognizer[] important_sym =
                new Recognizer[] { I, M, P, O, R, T, A, N, T };
        int current = 0;

        text.append((char) ch);

        // get past the '!'
        ch = readChar();

        while (true) {

            switch (ch) {

                case Token.EOF:
                    token = Token.EOF_TOKEN;
                    return token;

                case '/':
                    ch = readChar();
                    if (ch == '*') {
                        skipComment();
                    } else if (ch == '/') {
                        skipEOL();
                    } else {
                        text.append('/').append((char) ch);
                        int temp = offset;
                        offset = pos;
                        return new Token(Token.INVALID, text.toString(), line, temp);
                    }
                    break;

                case ' ':
                case '\t':
                case '\r':
                case '\n':
                case '\f':
                    ch = readChar();
                    break;

                default:
                    boolean accepted = true;
                    while (accepted && current < important_sym.length) {
                        accepted = important_sym[current++].recognize(ch);
                        text.append((char) ch);
                        ch = readChar();
                    }
                    if (accepted) {
                        final int temp = offset;
                        offset = pos - 1; // will have read one char too many
                        return new Token(IMPORTANT_SYM, "!important", line, temp);
                    } else {
                        while (ch != ';' && ch != '}' && ch != Token.EOF) {
                            ch = readChar();
                        }
                        if (ch != Token.EOF) {
                            final int temp = offset;
                            offset = pos - 1; // will have read one char too many
                            return new Token(Token.SKIP, text.toString(), line, temp);
                        } else {
                            return Token.EOF_TOKEN;
                        }
                    }
            }
        }
    }

    /**
     * Handles a backslash escape inside a url. Called with {@code ch == '\\'}.
     * An escaped newline run is dropped; any other escaped character is
     * appended literally; a backslash followed by EOF appends nothing.
     * Note: this does not implement the hex-digit escape algorithm.
     */
    private void consumeUrlEscape() throws IOException {
        ch = readChar();
        if (NL_CHARS.recognize(ch)) {
            // consume newline
            while (NL_CHARS.recognize(ch)) {
                ch = readChar();
            }
        } else if (ch != Token.EOF) {
            // if EOF, do nothing
            text.append((char) ch);
            ch = readChar();
        }
    }

    // http://www.ietf.org/rfc/rfc3986
    // http://www.w3.org/TR/2011/REC-CSS2-20110607/syndata.html#uri
    // http://www.w3.org/TR/css3-syntax/#consume-a-url-token
    /**
     * Consumes the body of a {@code url(...)} after "url(" has been read,
     * leaving the url text (without quotes or the closing paren) in
     * {@code text}.
     *
     * @return {@link #URL} on success, {@code Token.INVALID} if the url is
     *         malformed (input is consumed through the next unescaped ')'),
     *         or {@code Token.EOF} if input ends before a url or while
     *         recovering from a malformed one
     */
    private int consumeUrl() throws IOException {

        text.delete(0, text.length());

        // skip initial white space
        while (WS_CHARS.recognize(ch)) {
            ch = readChar();
        }

        if (ch == Token.EOF) {
            return Token.EOF;
        }

        if (ch == '\'' || ch == '"') {

            int endQuote = ch;

            ch = readChar();

            // consume the string
            while (ch != endQuote) {

                if (ch == Token.EOF) {
                    break;
                }

                // un-escaped newline is an error
                if (NL_CHARS.recognize(ch)) {
                    break;
                }

                if (ch == '\\') {
                    consumeUrlEscape();
                    continue;
                }

                text.append((char) ch);
                ch = readChar();
            }

            if (ch == endQuote) {

                ch = readChar();
                while (WS_CHARS.recognize(ch)) {
                    ch = readChar();
                }

                // After consuming white-space, the char has to be rparen or EOF.
                // Error otherwise.
                if (ch == ')') {
                    // consume the rparen
                    ch = readChar();
                    return URL;
                }

                if (ch == Token.EOF) {
                    return URL;
                }
            }

        } else {

            // The first character is taken verbatim, except for ')': an
            // empty "url()" must terminate here instead of swallowing the
            // paren as url text and reading on to the next ')'.
            if (ch != ')') {
                text.append((char) ch);
                ch = readChar();
            }

            while (true) {

                while (WS_CHARS.recognize(ch)) {
                    ch = readChar();
                }

                if (ch == ')') {
                    // consume the rparen
                    ch = readChar();
                    return URL;
                }

                if (ch == Token.EOF) {
                    return URL;
                }

                if (ch == '\\') {
                    consumeUrlEscape();
                    continue;
                }

                if (ch == '\'' || ch == '"' || ch == '(') {
                    break;
                }

                text.append((char) ch);
                ch = readChar();
            }
        }

        // if we get to here, then the token is bad
        // consume up to rparen or eof
        // lastCh must live outside the loop; otherwise it always equals ch
        // and an escaped "\)" would wrongly end the recovery.
        int lastCh = -1;
        while (true) {

            if (ch == Token.EOF) {
                return Token.EOF;
            } else if (ch == ')' && lastCh != '\\') {
                ch = readChar();
                return Token.INVALID;
            }

            lastCh = ch;
            ch = readChar();
        }
    }

    /**
     * State that consumes the unit suffix of a number (for example "px" or
     * "%"). Every candidate unit has one bit in {@code unitsMask}; bits are
     * cleared as characters rule candidates out.
     */
    private class UnitsState extends LexerState {

        private final Recognizer[][] units = {

            // TODO: all units from http://www.w3.org/TR/css3-values/
            // If units are added, unitTypes and ALL_UNITS must be updated!
            { C, M },
            { D, E, G },
            { E, M },
            { E, X },
            { G, R, A, D },
            { I, N },
            { M, M },
            { M, S },
            { P, C },
            { P, T },
            { P, X },
            { R, A, D },
            { S },
            { T, U, R, N },
            { (c) -> c == '%' }
        };

        // Token type for each row of units; must stay in the same order.
        private final int[] unitTypes = {
            CM, DEG, EMS, EXS, GRAD, IN, MM, MS, PC, PT, PX, RAD, SECONDS, TURN, PERCENTAGE
        };

        // One bit per unit, all set
        private static final int ALL_UNITS = 0x7FFF;

        // One bit per unit still considered a candidate
        private int unitsMask = ALL_UNITS;

        // Offset into inner array of units
        private int index = -1;

        UnitsState() {
            super(-1, "UnitsState", null);
        }

        /**
         * Returns the token type of the unit that was consumed, or
         * {@code Token.INVALID} if the characters do not spell exactly one
         * known unit. Resets this state for the next number as a side effect.
         */
        @Override
        public int getType() {

            int type = Token.INVALID;

            // A unit is valid only if exactly one candidate survived AND it
            // was spelled out completely; a surviving prefix such as the "c"
            // in "10c" must not be reported as CM.
            if (Integer.bitCount(unitsMask) == 1) {
                final int n = Integer.numberOfTrailingZeros(unitsMask);
                if (n < units.length && index == units[n].length - 1) {
                    type = unitTypes[n];
                }
            }

            // reset
            unitsMask = ALL_UNITS;
            index = -1;

            return type;
        }

        @Override
        public boolean accepts(int c) {

            // Ensure that something bogus like '10xyzzy' is
            // consumed as a token by only returning false
            // if the char is not alpha or %
            if (!ALPHA.recognize(c) && c != '%') {
                return false;
            }

            // If unitsMask is zero, then we've already figured out that
            // this is an invalid token, but we want to accept c so that
            // '10xyzzy' is consumed as a token, albeit an invalid one.
            if (unitsMask == 0) {
                return true;
            }

            index += 1;

            for (int n = 0; n < units.length; n++) {

                final int u = 1 << n;

                // the unit at this index already failed. Move on.
                if ((unitsMask & u) == 0) {
                    continue;
                }

                if ((index >= units[n].length) || !(units[n][index].recognize(c))) {
                    // not a match, turn off this bit
                    unitsMask &= ~u;
                }
            }

            return true;
        }
    }

    /**
     * Skips the remainder of a block comment. Called after the opening
     * slash-star has been read; on return {@code ch} is the character after
     * the closing star-slash, or -1 at end of input.
     */
    private void skipComment() throws IOException {
        while (ch != -1) {
            if (ch == '*') {
                ch = readChar();
                if (ch == '/') {
                    offset = pos;
                    ch = readChar();
                    break;
                }
            } else {
                ch = readChar();
            }
        }
    }

    /**
     * Skips the remainder of a line comment. EOL is cr, lf, or crlf. On
     * return {@code ch} is either the '\n' that ended the line or, for a
     * bare '\r', the first character after it.
     */
    private void skipEOL() throws IOException {
        while (ch != -1) {
            // Track the previous char on every iteration; a value captured
            // once before the loop can never be '\r', so a lone CR would not
            // end the comment.
            final int prev = ch;
            ch = readChar();
            if ((ch == '\n') || (prev == '\r' && ch != '\n')) {
                break;
            }
        }
    }

    private int pos = 0;
    private int offset = 0;
    private int line = 1;
    private int lastc = -1;

    /**
     * Reads one character from the reader and maintains the line and column
     * counters.
     *
     * @return the character read, or -1 at end of input
     */
    private int readChar() throws IOException {

        int c = reader.read();

        // only reset line and pos counters after having read a NL since
        // a NL token is created after the readChar
        if (lastc == '\n' || (lastc == '\r' && c != '\n')) {
            // set pos to 1 since we've already read the first char of the new line
            pos = 1;
            offset = 0;
            line++;
        } else {
            pos++;
        }

        lastc = c;
        return c;
    }

    /**
     * Returns the next token. A pending token, if any, is returned first (a
     * pending EOF token stays pending); otherwise tokens are scanned until
     * one is found that does not equal {@code Token.SKIP_TOKEN}.
     */
    Token nextToken() {

        Token tok = null;
        if (token != null) {
            tok = token;
            if (token.getType() != Token.EOF) {
                token = null;
            }
        } else {
            do {
                tok = getToken();
            } while (tok != null && Token.SKIP_TOKEN.equals(tok));
        }

        // reset text buffer and currentState
        text.delete(0, text.length());
        currentState = initState;

        return tok;
    }

    /**
     * Scans one token starting at the current character.
     *
     * <p>NOTE(review): in the copy of this file that was edited, the text
     * between the '&lt;' of the state-scan loop condition and the '&gt;' of
     * {@code case '>'} had been lost. The scan loop, the emission of tokens
     * for non-initial states, and the EOF / quote / solidus cases below were
     * reconstructed from the surrounding code; confirm against the upstream
     * revision before relying on exact offsets.
     */
    private Token getToken() {

        try {
            while (true) {
                charNotConsumed = false;

                final LexerState[] reachableStates =
                        currentState != null ? stateMap.get(currentState) : null;

                final int max = reachableStates != null ? reachableStates.length : 0;

                // first reachable state that accepts ch wins
                LexerState newState = null;
                for (int n = 0; n < max && newState == null; n++) {
                    final LexerState reachableState = reachableStates[n];
                    if (reachableState.accepts(ch)) {
                        newState = reachableState;
                    }
                }

                if (newState != null) {
                    // Some reachable state was found. Keep going...
                    currentState = newState;
                    text.append((char) ch);
                    ch = readChar();
                    continue;
                }

                // No reachable state. Unless we are still in the init state
                // with nothing recognized, emit whatever has been collected.
                final int type =
                        currentState != null ? currentState.getType() : Token.INVALID;

                if (type != Token.INVALID || !currentState.equals(initState)) {
                    final Token result = new Token(type, text.toString(), line, offset);
                    // because the next char has already been read,
                    // the next token starts at pos-1
                    offset = pos - 1;
                    return result;
                }

                // The current state is the init state, so if we get here
                // then ch was not recognized by any of the states
                switch (ch) {

                    case -1:
                        token = Token.EOF_TOKEN;
                        return token;

                    case '"':
                    case '\'':
                        text.append((char) ch);
                        final int endq = ch;
                        while ((ch = readChar()) != -1) {
                            text.append((char) ch);
                            if (ch == endq) {
                                break;
                            }
                        }
                        if (ch != -1) {
                            token = new Token(STRING, text.toString(), line, offset);
                            offset = pos;
                        } else {
                            // unterminated string
                            token = new Token(Token.INVALID, text.toString(), line, offset);
                            offset = pos;
                        }
                        break;

                    case '/':
                        ch = readChar();
                        if (ch == '*') {
                            skipComment();
                            if (ch != -1) {
                                continue;
                            } else {
                                token = Token.EOF_TOKEN;
                                return token;
                            }
                        } else if (ch == '/') {
                            skipEOL();
                            if (ch != -1) {
                                continue;
                            } else {
                                token = Token.EOF_TOKEN;
                                return token;
                            }
                        } else {
                            // not a comment - a SOLIDUS. The char after the
                            // slash has been read but belongs to the next token.
                            token = new Token(SOLIDUS, "/", line, offset);
                            offset = pos;
                            charNotConsumed = true;
                        }
                        break;

                    case '>':
                        token = new Token(GREATER, ">", line, offset);
                        offset = pos;
                        break;

                    case '{':
                        token = new Token(LBRACE, "{", line, offset);
                        offset = pos;
                        break;

                    case '}':
                        token = new Token(RBRACE, "}", line, offset);
                        offset = pos;
                        break;

                    case ';':
                        token = new Token(SEMI, ";", line, offset);
                        offset = pos;
                        break;

                    case ':':
                        token = new Token(COLON, ":", line, offset);
                        offset = pos;
                        break;

                    case '*':
                        token = new Token(STAR, "*", line, offset);
                        offset = pos;
                        break;

                    case '(':
                        token = new Token(LPAREN, "(", line, offset);
                        offset = pos;
                        break;

                    case ')':
                        token = new Token(RPAREN, ")", line, offset);
                        offset = pos;
                        break;

                    case ',':
                        token = new Token(COMMA, ",", line, offset);
                        offset = pos;
                        break;

                    case '.':
                        token = new Token(DOT, ".", line, offset);
                        offset = pos;
                        break;

                    case ' ':
                    case '\t':
                    case '\f':
                        token = new Token(WS, Character.toString((char) ch), line, offset);
                        offset = pos;
                        break;

                    case '\r':
                        token = new Token(NL, "\\r", line, offset);
                        // offset and pos are reset on next readChar

                        ch = readChar();
                        if (ch == '\n') {
                            token = new Token(NL, "\\r\\n", line, offset);
                            // offset and pos are reset on next readChar
                        } else {
                            // already read the next character, so
                            // return the NL token here (avoid the readChar
                            // at the end of the loop below)
                            final Token nl = token;
                            token = (ch == -1) ? Token.EOF_TOKEN : null;
                            return nl;
                        }
                        break;

                    case '\n':
                        token = new Token(NL, "\\n", line, offset);
                        // offset and pos are reset on next readChar
                        break;

                    case '!':
                        return scanImportant();

                    case '@':
                        token = new Token(AT_KEYWORD, "@", line, offset);
                        offset = pos;
                        break;

                    default:
                        token = new Token(Token.INVALID, Character.toString((char) ch),
                                line, offset);
                        offset = pos;
                        break;
                }

                if (token == null) {
                    token = new Token(Token.INVALID, null, line, offset);
                    offset = pos;
                } else if (token.getType() == Token.EOF) {
                    return token;
                }

                // advance past the char that produced the token, unless the
                // next char has already been read
                if (ch != -1 && !charNotConsumed) {
                    ch = readChar();
                }

                final Token tok = token;
                token = null;
                return tok;
            }
        } catch (IOException ioe) {
            token = Token.EOF_TOKEN;
            return token;
        }
    }

    // current character; -1 at end of input
    private int ch;
    // set when the char after a token has been read but not consumed
    private boolean charNotConsumed = false;
    private Reader reader;
    // pending token to be returned by the next call to nextToken, if any
    private Token token;
    private final Map<LexerState, LexerState[]> stateMap;
    private LexerState currentState;
    // characters collected for the token being scanned
    private final StringBuilder text;
}