/* * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package jdk.nashorn.internal.runtime.regexp; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.regex.PatternSyntaxException; import jdk.nashorn.internal.parser.Lexer; import jdk.nashorn.internal.parser.Scanner; import jdk.nashorn.internal.runtime.BitVector; /** * Scan a JavaScript regexp, converting to Java regex if necessary. * */ final class RegExpScanner extends Scanner { /** * String builder used to rewrite the pattern for the currently used regexp factory. */ private final StringBuilder sb; /** Expected token table */ private final Map expected = new HashMap<>(); /** Capturing parenthesis that have been found so far. */ private final List caps = new LinkedList<>(); /** Forward references to capturing parenthesis to be resolved later.*/ private final LinkedList forwardReferences = new LinkedList<>(); /** Current level of zero-width negative lookahead assertions. */ private int negLookaheadLevel; /** Sequential id of current top-level zero-width negative lookahead assertion. */ private int negLookaheadGroup; /** Are we currently inside a character class? */ private boolean inCharClass = false; /** Are we currently inside a negated character class? */ private boolean inNegativeClass = false; private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-"; private static class Capture { /** Zero-width negative lookaheads enclosing the capture. */ private final int negLookaheadLevel; /** Sequential id of top-level negative lookaheads containing the capture. */ private final int negLookaheadGroup; Capture(final int negLookaheadGroup, final int negLookaheadLevel) { this.negLookaheadGroup = negLookaheadGroup; this.negLookaheadLevel = negLookaheadLevel; } /** * Returns true if this Capture can be referenced from the position specified by the * group and level parameters. This is the case if either the group is not within * a negative lookahead, or the position of the referrer is in the same negative lookahead. * * @param group current negative lookahead group * @param level current negative lokahead level * @return true if this capture group can be referenced from the given position */ boolean canBeReferencedFrom(final int group, final int level) { return this.negLookaheadLevel == 0 || (group == this.negLookaheadGroup && level >= this.negLookaheadLevel); } } /** * Constructor * @param string the JavaScript regexp to parse */ private RegExpScanner(final String string) { super(string); sb = new StringBuilder(limit); reset(0); expected.put(']', 0); expected.put('}', 0); } private void processForwardReferences() { final Iterator iterator = forwardReferences.descendingIterator(); while (iterator.hasNext()) { final int pos = iterator.next(); final int num = iterator.next(); if (num > caps.size()) { // Non-existing backreference. If the number begins with a valid octal convert it to // Unicode escape and append the rest to a literal character sequence. final StringBuilder buffer = new StringBuilder(); octalOrLiteral(Integer.toString(num), buffer); sb.insert(pos, buffer); } } forwardReferences.clear(); } /** * Scan a JavaScript regexp string returning a Java safe regex string. * * @param string * JavaScript regexp string. * @return Java safe regex string. */ public static RegExpScanner scan(final String string) { final RegExpScanner scanner = new RegExpScanner(string); try { scanner.disjunction(); } catch (final Exception e) { throw new PatternSyntaxException(e.getMessage(), string, scanner.position); } // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors if (scanner.position != string.length()) { final String p = scanner.getStringBuilder().toString(); throw new PatternSyntaxException(string, p, p.length() + 1); } scanner.processForwardReferences(); return scanner; } final StringBuilder getStringBuilder() { return sb; } String getJavaPattern() { return sb.toString(); } BitVector getGroupsInNegativeLookahead() { BitVector vec = null; for (int i = 0; i < caps.size(); i++) { final Capture cap = caps.get(i); if (cap.negLookaheadLevel > 0) { if (vec == null) { vec = new BitVector(caps.size() + 1); } vec.set(i + 1); } } return vec; } /** * Commit n characters to the builder and to a given token * @param n Number of characters. * @return Committed token */ private boolean commit(final int n) { switch (n) { case 1: sb.append(ch0); skip(1); break; case 2: sb.append(ch0); sb.append(ch1); skip(2); break; case 3: sb.append(ch0); sb.append(ch1); sb.append(ch2); skip(3); break; default: assert false : "Should not reach here"; } return true; } /** * Restart the buffers back at an earlier position. * * @param startIn * Position in the input stream. * @param startOut * Position in the output stream. */ private void restart(final int startIn, final int startOut) { reset(startIn); sb.setLength(startOut); } private void push(final char ch) { expected.put(ch, expected.get(ch) + 1); } private void pop(final char ch) { expected.put(ch, Math.min(0, expected.get(ch) - 1)); } /* * Recursive descent tokenizer starts below. */ /* * Disjunction :: * Alternative * Alternative | Disjunction */ private void disjunction() { while (true) { alternative(); if (ch0 == '|') { commit(1); } else { break; } } } /* * Alternative :: * [empty] * Alternative Term */ private void alternative() { while (term()) { // do nothing } } /* * Term :: * Assertion * Atom * Atom Quantifier */ private boolean term() { final int startIn = position; final int startOut = sb.length(); if (assertion()) { return true; } if (atom()) { quantifier(); return true; } restart(startIn, startOut); return false; } /* * Assertion :: * ^ * $ * \b * \B * ( ? = Disjunction ) * ( ? ! Disjunction ) */ private boolean assertion() { final int startIn = position; final int startOut = sb.length(); switch (ch0) { case '^': case '$': return commit(1); case '\\': if (ch1 == 'b' || ch1 == 'B') { return commit(2); } break; case '(': if (ch1 != '?') { break; } if (ch2 != '=' && ch2 != '!') { break; } final boolean isNegativeLookahead = (ch2 == '!'); commit(3); if (isNegativeLookahead) { if (negLookaheadLevel == 0) { negLookaheadGroup++; } negLookaheadLevel++; } disjunction(); if (isNegativeLookahead) { negLookaheadLevel--; } if (ch0 == ')') { return commit(1); } break; default: break; } restart(startIn, startOut); return false; } /* * Quantifier :: * QuantifierPrefix * QuantifierPrefix ? */ private boolean quantifier() { if (quantifierPrefix()) { if (ch0 == '?') { commit(1); } return true; } return false; } /* * QuantifierPrefix :: * * * + * ? * { DecimalDigits } * { DecimalDigits , } * { DecimalDigits , DecimalDigits } */ private boolean quantifierPrefix() { final int startIn = position; final int startOut = sb.length(); switch (ch0) { case '*': case '+': case '?': return commit(1); case '{': commit(1); if (!decimalDigits()) { break; // not a quantifier - back out } push('}'); if (ch0 == ',') { commit(1); decimalDigits(); } if (ch0 == '}') { pop('}'); commit(1); } else { // Bad quantifier should be rejected but is accepted by all major engines restart(startIn, startOut); return false; } return true; default: break; } restart(startIn, startOut); return false; } /* * Atom :: * PatternCharacter * . * \ AtomEscape * CharacterClass * ( Disjunction ) * ( ? : Disjunction ) * */ private boolean atom() { final int startIn = position; final int startOut = sb.length(); if (patternCharacter()) { return true; } if (ch0 == '.') { return commit(1); } if (ch0 == '\\') { commit(1); if (atomEscape()) { return true; } } if (characterClass()) { return true; } if (ch0 == '(') { commit(1); if (ch0 == '?' && ch1 == ':') { commit(2); } else { caps.add(new Capture(negLookaheadGroup, negLookaheadLevel)); } disjunction(); if (ch0 == ')') { commit(1); return true; } } restart(startIn, startOut); return false; } /* * PatternCharacter :: * SourceCharacter but not any of: ^$\.*+?()[]{}| */ @SuppressWarnings("fallthrough") private boolean patternCharacter() { if (atEOF()) { return false; } switch (ch0) { case '^': case '$': case '\\': case '.': case '*': case '+': case '?': case '(': case ')': case '[': case '|': return false; case '}': case ']': final int n = expected.get(ch0); if (n != 0) { return false; } case '{': // if not a valid quantifier escape curly brace to match itself // this ensures compatibility with other JS implementations if (!quantifierPrefix()) { sb.append('\\'); return commit(1); } return false; default: return commit(1); // SOURCECHARACTER } } /* * AtomEscape :: * DecimalEscape * CharacterEscape * CharacterClassEscape */ private boolean atomEscape() { // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape(); } /* * CharacterEscape :: * ControlEscape * c ControlLetter * HexEscapeSequence * UnicodeEscapeSequence * IdentityEscape */ private boolean characterEscape() { final int startIn = position; final int startOut = sb.length(); if (controlEscape()) { return true; } if (ch0 == 'c') { commit(1); if (controlLetter()) { return true; } restart(startIn, startOut); } if (hexEscapeSequence() || unicodeEscapeSequence()) { return true; } restart(startIn, startOut); return false; } private boolean scanEscapeSequence(final char leader, final int length) { final int startIn = position; final int startOut = sb.length(); if (ch0 != leader) { return false; } commit(1); for (int i = 0; i < length; i++) { final char ch0l = Character.toLowerCase(ch0); if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { commit(1); } else { restart(startIn, startOut); return false; } } return true; } private boolean hexEscapeSequence() { return scanEscapeSequence('x', 2); } private boolean unicodeEscapeSequence() { return scanEscapeSequence('u', 4); } /* * ControlEscape :: * one of fnrtv */ private boolean controlEscape() { switch (ch0) { case 'f': case 'n': case 'r': case 't': case 'v': return commit(1); default: return false; } } /* * ControlLetter :: * one of abcdefghijklmnopqrstuvwxyz * ABCDEFGHIJKLMNOPQRSTUVWXYZ */ private boolean controlLetter() { // To match other engines we also accept '0'..'9' and '_' as control letters inside a character class. if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z') || (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) { // for some reason java regexps don't like control characters on the // form "\\ca".match([string with ascii 1 at char0]). Translating // them to unicode does it though. sb.setLength(sb.length() - 1); unicode(ch0 % 32, sb); skip(1); return true; } return false; } /* * IdentityEscape :: * SourceCharacter but not IdentifierPart * (200c) * (200d) */ private boolean identityEscape() { if (atEOF()) { throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException } // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here. if (ch0 == 'c') { sb.append('\\'); // Treat invalid \c control sequence as \\c } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) { sb.setLength(sb.length() - 1); } return commit(1); } /* * DecimalEscape :: * DecimalIntegerLiteral [lookahead DecimalDigit] */ private boolean decimalEscape() { final int startIn = position; final int startOut = sb.length(); if (ch0 == '0' && !isOctalDigit(ch1)) { skip(1); // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a character (Unicodevalue0000); sb.append("\u0000"); return true; } if (isDecimalDigit(ch0)) { if (ch0 == '0') { // We know this is an octal escape. if (inCharClass) { // Convert octal escape to unicode escape if inside character class. int octalValue = 0; while (isOctalDigit(ch0)) { octalValue = octalValue * 8 + ch0 - '0'; skip(1); } unicode(octalValue, sb); } else { // Copy decimal escape as-is decimalDigits(); } } else { // This should be a backreference, but could also be an octal escape or even a literal string. int decimalValue = 0; while (isDecimalDigit(ch0)) { decimalValue = decimalValue * 10 + ch0 - '0'; skip(1); } if (inCharClass) { // No backreferences in character classes. Encode as unicode escape or literal char sequence sb.setLength(sb.length() - 1); octalOrLiteral(Integer.toString(decimalValue), sb); } else if (decimalValue <= caps.size()) { // Captures inside a negative lookahead are undefined when referenced from the outside. final Capture capture = caps.get(decimalValue - 1); if (!capture.canBeReferencedFrom(negLookaheadGroup, negLookaheadLevel)) { // Outside reference to capture in negative lookahead, omit from output buffer. sb.setLength(sb.length() - 1); } else { // Append backreference to output buffer. sb.append(decimalValue); } } else { // Forward references to a capture group are always undefined so we can omit it from the output buffer. // However, if the target capture does not exist, we need to rewrite the reference as hex escape // or literal string, so register the reference for later processing. sb.setLength(sb.length() - 1); forwardReferences.add(decimalValue); forwardReferences.add(sb.length()); } } return true; } restart(startIn, startOut); return false; } /* * CharacterClassEscape :: * one of dDsSwW */ private boolean characterClassEscape() { switch (ch0) { // java.util.regex requires translation of \s and \S to explicit character list case 's': if (RegExpFactory.usesJavaUtilRegex()) { sb.setLength(sb.length() - 1); // No nested class required if we already are inside a character class if (inCharClass) { sb.append(Lexer.getWhitespaceRegExp()); } else { sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']'); } skip(1); return true; } return commit(1); case 'S': if (RegExpFactory.usesJavaUtilRegex()) { sb.setLength(sb.length() - 1); // In negative class we must use intersection to get double negation ("not anything else than space") sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']'); skip(1); return true; } return commit(1); case 'd': case 'D': case 'w': case 'W': return commit(1); default: return false; } } /* * CharacterClass :: * [ [lookahead {^}] ClassRanges ] * [ ^ ClassRanges ] */ private boolean characterClass() { final int startIn = position; final int startOut = sb.length(); if (ch0 == '[') { try { inCharClass = true; push(']'); commit(1); if (ch0 == '^') { inNegativeClass = true; commit(1); } if (classRanges() && ch0 == ']') { pop(']'); commit(1); // Substitute empty character classes [] and [^] that never or always match if (position == startIn + 2) { sb.setLength(sb.length() - 1); sb.append("^\\s\\S]"); } else if (position == startIn + 3 && inNegativeClass) { sb.setLength(sb.length() - 2); sb.append("\\s\\S]"); } return true; } } finally { inCharClass = false; // no nested character classes in JavaScript inNegativeClass = false; } } restart(startIn, startOut); return false; } /* * ClassRanges :: * [empty] * NonemptyClassRanges */ private boolean classRanges() { nonemptyClassRanges(); return true; } /* * NonemptyClassRanges :: * ClassAtom * ClassAtom NonemptyClassRangesNoDash * ClassAtom - ClassAtom ClassRanges */ private boolean nonemptyClassRanges() { final int startIn = position; final int startOut = sb.length(); if (classAtom()) { if (ch0 == '-') { commit(1); if (classAtom() && classRanges()) { return true; } } nonemptyClassRangesNoDash(); return true; } restart(startIn, startOut); return false; } /* * NonemptyClassRangesNoDash :: * ClassAtom * ClassAtomNoDash NonemptyClassRangesNoDash * ClassAtomNoDash - ClassAtom ClassRanges */ private boolean nonemptyClassRangesNoDash() { final int startIn = position; final int startOut = sb.length(); if (classAtomNoDash()) { // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom if (ch0 == '-') { commit(1); if (classAtom() && classRanges()) { return true; } //fallthru } nonemptyClassRangesNoDash(); return true; // still a class atom } if (classAtom()) { return true; } restart(startIn, startOut); return false; } /* * ClassAtom : - ClassAtomNoDash */ private boolean classAtom() { if (ch0 == '-') { return commit(1); } return classAtomNoDash(); } /* * ClassAtomNoDash :: * SourceCharacter but not one of \ or ] or - * \ ClassEscape */ private boolean classAtomNoDash() { if (atEOF()) { return false; } final int startIn = position; final int startOut = sb.length(); switch (ch0) { case ']': case '-': return false; case '[': // unescaped left square bracket - add escape sb.append('\\'); return commit(1); case '\\': commit(1); if (classEscape()) { return true; } restart(startIn, startOut); return false; default: return commit(1); } } /* * ClassEscape :: * DecimalEscape * b * CharacterEscape * CharacterClassEscape */ private boolean classEscape() { if (decimalEscape()) { return true; } if (ch0 == 'b') { sb.setLength(sb.length() - 1); sb.append('\b'); skip(1); return true; } // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all return characterEscape() || characterClassEscape() || identityEscape(); } /* * DecimalDigits */ private boolean decimalDigits() { if (!isDecimalDigit(ch0)) { return false; } while (isDecimalDigit(ch0)) { commit(1); } return true; } private static void unicode(final int value, final StringBuilder buffer) { final String hex = Integer.toHexString(value); buffer.append('u'); for (int i = 0; i < 4 - hex.length(); i++) { buffer.append('0'); } buffer.append(hex); } // Convert what would have been a backreference into a unicode escape, or a number literal, or both. private static void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) { final int length = numberLiteral.length(); int octalValue = 0; int pos = 0; // Maximum value for octal escape is 0377 (255) so we stop the loop at 32 while (pos < length && octalValue < 0x20) { final char ch = numberLiteral.charAt(pos); if (isOctalDigit(ch)) { octalValue = octalValue * 8 + ch - '0'; } else { break; } pos++; } if (octalValue > 0) { buffer.append('\\'); unicode(octalValue, buffer); buffer.append(numberLiteral.substring(pos)); } else { buffer.append(numberLiteral); } } private static boolean isOctalDigit(final char ch) { return ch >= '0' && ch <= '7'; } private static boolean isDecimalDigit(final char ch) { return ch >= '0' && ch <= '9'; } }