1 /*
   2  * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package jdk.nashorn.internal.parser;
  27 
  28 import java.util.ArrayList;
  29 import java.util.HashMap;
  30 import java.util.Iterator;
  31 import java.util.LinkedHashMap;
  32 import java.util.LinkedList;
  33 import java.util.List;
  34 import java.util.Map;
  35 import java.util.regex.PatternSyntaxException;
  36 import jdk.nashorn.internal.runtime.BitVector;
  37 
  38 /**
  39  * Scan a JavaScript regexp, converting to Java regex if necessary.
  40  *
  41  */
  42 public class RegExpScanner extends Scanner {
  43 
  44     /**
  45      * String builder to accumulate the result - this contains verbatim parsed JavaScript.
  46      * to get the java equivalent we need to create a Pattern token and return its toString()
  47      */
  48     private final StringBuilder sb;
  49 
  50     /** An optional error message if one occurred during parse. */
  51     private String errorMessage;
  52 
  53     /** Is this the special case of a regexp that never matches anything */
  54     private boolean neverMatches;
  55 
  56     /** The resulting java.util.regex pattern string. */
  57     private String javaPattern;
  58 
  59     /** Expected token table */
  60     private final Map<Character, Integer> expected = new HashMap<>();
  61 
  62     /** Capturing parenthesis that have been found so far. */
  63     private final List<Capture> caps = new LinkedList<>();
  64 
  65     /** Forward references to capturing parenthesis to be resolved later.*/
  66     private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
  67 
  68     /** Current level of zero-width negative lookahead assertions. */
  69     private int negativeLookaheadLevel;
  70 
  71     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
  72 
  73     private static class Capture {
  74         /**
  75          * Zero-width negative lookaheads enclosing the capture.
  76          */
  77         private final int negativeLookaheadLevel;
  78         /**
  79          * Captures that live inside a negative lookahead are dead after the
  80          * lookahead and will be undefined if referenced from outside.
  81          */
  82         private boolean isDead;
  83 
  84         Capture(final int negativeLookaheadLevel) {
  85             this.negativeLookaheadLevel = negativeLookaheadLevel;
  86         }
  87 
  88         public int getNegativeLookaheadLevel() {
  89             return negativeLookaheadLevel;
  90         }
  91 
  92         public boolean isDead() {
  93             return isDead;
  94         }
  95 
  96         public void setDead() {
  97             this.isDead = true;
  98         }
  99     }
 100 
 101     /**
 102      * This is a token - the JavaScript regexp is scanned into a token tree
 103      * A token has other tokens as children as well as "atoms", i.e. Strings.
 104      *
 105      */
 106     private static class Token {
 107 
 108         private enum Type {
 109             PATTERN,
 110             DISJUNCTION,
 111             ALTERNATIVE,
 112             TERM,
 113             ASSERTION,
 114             QUANTIFIER,
 115             QUANTIFIER_PREFIX,
 116             ATOM,
 117             PATTERN_CHARACTER,
 118             ATOM_ESCAPE,
 119             CHARACTER_ESCAPE,
 120             CONTROL_ESCAPE,
 121             CONTROL_LETTER,
 122             IDENTITY_ESCAPE,
 123             DECIMAL_ESCAPE,
 124             CHARACTERCLASS_ESCAPE,
 125             CHARACTERCLASS,
 126             CLASSRANGES,
 127             NON_EMPTY_CLASSRANGES,
 128             NON_EMPTY_CLASSRANGES_NODASH,
 129             CLASSATOM,
 130             CLASSATOM_NODASH,
 131             CLASS_ESCAPE,
 132             DECIMALDIGITS,
 133             HEX_ESCAPESEQUENCE,
 134             UNICODE_ESCAPESEQUENCE,
 135         }
 136 
 137         /**
 138          * Token tyoe
 139          */
 140         private final Token.Type type;
 141 
 142         /**
 143          * Child nodes
 144          */
 145         private final List<Object> children;
 146 
 147         /**
 148          * Parent node
 149          */
 150         private Token parent;
 151 
 152         /**
 153          * Dead code flag
 154          */
 155         private boolean isDead;
 156 
 157         private static final Map<Type, ToString> toStringMap = new HashMap<>();
 158         private static final ToString DEFAULT_TOSTRING = new ToString();
 159 
 160         private static String unicode(final int value) {
 161             final StringBuilder sb = new StringBuilder();
 162             final String hex = Integer.toHexString(value);
 163             sb.append('u');
 164             for (int i = 0; i < 4 - hex.length(); i++) {
 165                 sb.append('0');
 166             }
 167             sb.append(hex);
 168 
 169             return sb.toString();
 170         }
 171 
 172         static {
 173             toStringMap.put(Type.CHARACTERCLASS, new ToString() {
 174                 @Override
 175                 public String toString(final Token token) {
 176                     return super.toString(token).replace("\\b", "\b");
 177                 }
 178             });
 179 
 180             // for some reason java regexps don't like control characters on the
 181             // form "\\ca".match([string with ascii 1 at char0]). Translating
 182             // them to unicode does it though.
 183             toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
 184                 @Override
 185                 public String toString(final Token token) {
 186                     final String str = super.toString(token);
 187                     if (str.length() == 2) {
 188                         return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
 189                     }
 190                     return str;
 191                 }
 192             });
 193 
 194             toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
 195                 @Override
 196                 public String toString(final Token token) {
 197                     final String str = super.toString(token);
 198 
 199                     if ("\0".equals(str)) {
 200                         return str;
 201                     }
 202 
 203                     int value;
 204 
 205                     if (!token.hasParentOfType(Type.CLASSRANGES)) {
 206                         return str;
 207                     }
 208 
 209                     value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
 210                     if (value > 0xff) {
 211                         throw new NumberFormatException(str);
 212                     }
 213 
 214                     return Token.unicode(value);
 215                 }
 216             });
 217 
 218         }
 219 
 220         /**
 221          * JavaScript Token to Java regex substring framework.
 222          *
 223          */
 224         private static class ToString {
 225             String toString(final Token token) {
 226                 final StringBuilder sb = new StringBuilder();
 227                 for (final Object child : token.getChildren()) {
 228                     sb.append(child);
 229                 }
 230 
 231                 //perform global substitutions that hold true for any evaluated form
 232                 String str = sb.toString();
 233                 switch (str) {
 234                 case "\\s":
 235                     str = "[" + Lexer.getWhitespaceRegExp() + "]";
 236                     break;
 237                 case "\\S":
 238                     str = "[^" + Lexer.getWhitespaceRegExp() + "]";
 239                     break;
 240                 case "[^]":
 241                     str = "[\\s\\S]";
 242                     break;
 243                 default:
 244                     break;
 245                 }
 246                 return str;
 247             }
 248         }
 249 
 250         /**
 251          * Token iterator. Doesn't return "atom" children. i.e. string representations,
 252          * just tokens
 253          *
 254          */
 255         private static class TokenIterator implements Iterator<Token> {
 256             private final List<Token> preorder;
 257 
 258             private void init(final Token root) {
 259                 preorder.add(root);
 260                 for (final Object child : root.getChildren()) {
 261                     if (child instanceof Token) {
 262                         init((Token)child);
 263                     }
 264                 }
 265             }
 266 
 267             TokenIterator(final Token root) {
 268                 preorder = new ArrayList<>();
 269                 init(root);
 270             }
 271 
 272             @Override
 273             public boolean hasNext() {
 274                 return !preorder.isEmpty();
 275             }
 276 
 277             @Override
 278             public Token next() {
 279                 return preorder.remove(0);
 280             }
 281 
 282             @Override
 283             public void remove() {
 284                 next();
 285             }
 286         }
 287 
 288         /**
 289          * Constructor
 290          * @param type the token type
 291          */
 292         Token(final Token.Type type) {
 293             this.type = type;
 294             children = new ArrayList<>();
 295         }
 296 
 297         /**
 298          * Add a an "atom" child to a token
 299          * @param child the child to add
 300          * @return the token (for chaining)
 301          */
 302         public Token add(final String child) {
 303             children.add(child);
 304             return this;
 305         }
 306 
 307         /**
 308          * Add a child to a token
 309          * @param child the child
 310          * @return the token (for chaining)
 311          */
 312         public Token add(final Token child) {
 313             if (child != null) {
 314                 children.add(child);
 315                 child.setParent(this);
 316             }
 317             return this;
 318         }
 319 
 320         /**
 321          * Remove a child from a token
 322          * @param child the child to remove
 323          * @return true if successful
 324          */
 325         public boolean remove(final Token child) {
 326             return children.remove(child);
 327         }
 328 
 329         /**
 330          * Remove the last child from a token
 331          * @return the removed child
 332          */
 333         public Object removeLast() {
 334             return children.remove(children.size() - 1);
 335         }
 336 
 337         /**
 338          * Flag this token as dead code
 339          * @param isDead is it dead or not
 340          */
 341         private void setIsDead(final boolean isDead) {
 342             this.isDead = isDead;
 343         }
 344 
 345         /**
 346          * Is this token dead code
 347          * @return boolean
 348          */
 349         private boolean getIsDead() {
 350             return isDead;
 351         }
 352 
 353         /**
 354          * Get the parent of this token
 355          * @return parent token
 356          */
 357         public Token getParent() {
 358             return parent;
 359         }
 360 
 361         public boolean hasParentOfType(final Token.Type parentType) {
 362             for (Token p = getParent(); p != null; p = p.getParent()) {
 363                 if (p.getType() == parentType) {
 364                     return true;
 365                 }
 366             }
 367             return false;
 368         }
 369 
 370         public boolean hasChildOfType(final Token.Type childType) {
 371             for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
 372                 if (iter.next().getType() == childType) {
 373                     return true;
 374                 }
 375             }
 376             return false;
 377         }
 378 
 379         /**
 380          * Set the parent of this token
 381          * @param parent
 382          */
 383         private void setParent(final Token parent) {
 384             this.parent = parent;
 385         }
 386 
 387         /**
 388          * Get the children of this token
 389          * @return an array of children, never null
 390          */
 391         public Object[] getChildren() {
 392             return children.toArray();
 393         }
 394 
 395         /**
 396          * Reset this token, remove all children
 397          */
 398         public void reset() {
 399             children.clear();
 400         }
 401 
 402         /**
 403          * Get a preorder token iterator with this token as root
 404          * @return an iterator
 405          */
 406         public Iterator<Token> iterator() {
 407             return new TokenIterator(this);
 408         }
 409 
 410         /**
 411          * Get the type of this token
 412          * @return type
 413          */
 414         public Type getType() {
 415             return type;
 416         }
 417 
 418         /**
 419          * Turn this token into Java regexp compatible text
 420          * @return part of a java regexp
 421          */
 422         @Override
 423         public String toString() {
 424             ToString t = toStringMap.get(getType());
 425             if (t == null) {
 426                 t = DEFAULT_TOSTRING;
 427             }
 428             return t.toString(this);
 429         }
 430     }
 431 
 432     /**
 433      * Constructor
 434      * @param string the JavaScript regexp to parse
 435      */
 436     private RegExpScanner(final String string) {
 437         super(string);
 438         sb = new StringBuilder(limit);
 439         reset(0);
 440         expected.put(']', 0);
 441         expected.put('}', 0);
 442     }
 443 
 444     private void processForwardReferences() {
 445         if (neverMatches()) {
 446             return;
 447         }
 448 
 449         for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
 450             if (fwdRef.getKey().intValue() > caps.size()) {
 451                 neverMatches = true;
 452                 break;
 453             }
 454 
 455             fwdRef.getValue().setIsDead(true);
 456         }
 457 
 458         forwardReferences.clear();
 459     }
 460 
 461     /**
 462      * Scan a JavaScript regexp string returning a Java safe regex string.
 463      *
 464      * @param string
 465      *            JavaScript regexp string.
 466      * @return Java safe regex string.
 467      */
 468     public static RegExpScanner scan(final String string) {
 469         final RegExpScanner scanner = new RegExpScanner(string);
 470 
 471         Token pattern;
 472 
 473         try {
 474             pattern = scanner.pattern();
 475         } catch (final Exception e) {
 476             throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
 477         }
 478 
 479         scanner.processForwardReferences();
 480         if (scanner.neverMatches()) {
 481             return null; // never matches
 482         }
 483 
 484         // go over the code and remove dead code
 485         final Iterator<Token> iter = pattern.iterator();
 486         while (iter.hasNext()) {
 487             final Token next = iter.next();
 488             if (next.getIsDead()) {
 489                 next.getParent().remove(next);
 490             }
 491         }
 492 
 493         // turn the pattern into a string, p, the java equivalent string for our js regexp
 494         final String p = pattern.toString();
 495         // if builder contains all tokens that were sent in, we know
 496         // we correctly parsed the entire JavaScript regexp without syntax errors
 497         if (!string.equals(scanner.getStringBuilder().toString())) {
 498             throw new PatternSyntaxException(string, p, p.length() + 1);
 499         }
 500 
 501         scanner.javaPattern = p;
 502         return scanner;
 503      }
 504 
 505     /**
 506      * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
 507      * is an example where we never match
 508      *
 509      * @return boolean
 510      */
 511     private boolean neverMatches() {
 512         return neverMatches;
 513     }
 514 
 515     /**
 516      * This is used to set better error messages that can be reused
 517      * in NativeRegExp for augmenting e.g. SyntaxErrors.
 518      *
 519      * @return an error message or null if no extra info
 520      */
 521     public String getErrorMessage() {
 522         return errorMessage;
 523     }
 524 
 525     final StringBuilder getStringBuilder() {
 526         return sb;
 527     }
 528 
 529     String getJavaPattern() {
 530         return javaPattern;
 531     }
 532 
 533     BitVector getGroupsInNegativeLookahead() {
 534         BitVector vec = null;
 535         for (int i = 0; i < caps.size(); i++) {
 536             final Capture cap = caps.get(i);
 537             if (cap.getNegativeLookaheadLevel() > 0) {
 538                 if (vec == null) {
 539                     vec = new BitVector(caps.size() + 1);
 540                 }
 541                 vec.set(i + 1);
 542             }
 543         }
 544         return vec;
 545     }
 546 
 547     /**
 548      * Commit n characters to the builder and to a given token
 549      * @param token Uncommitted token.
 550      * @param n     Number of characters.
 551      * @return Committed token
 552      */
 553     private Token commit(final Token token, final int n) {
 554         final int startIn = position;
 555 
 556         switch (n) {
 557         case 1:
 558             sb.append(ch0);
 559             skip(1);
 560             break;
 561         case 2:
 562             sb.append(ch0);
 563             sb.append(ch1);
 564             skip(2);
 565             break;
 566         case 3:
 567             sb.append(ch0);
 568             sb.append(ch1);
 569             sb.append(ch2);
 570             skip(3);
 571             break;
 572         default:
 573             assert false : "Should not reach here";
 574         }
 575 
 576         if (token == null) {
 577             return null;
 578         }
 579 
 580         return token.add(sb.substring(startIn, sb.length()));
 581     }
 582 
 583     /**
 584      * Restart the buffers back at an earlier position.
 585      *
 586      * @param startIn
 587      *            Position in the input stream.
 588      * @param startOut
 589      *            Position in the output stream.
 590      */
 591     private void restart(final int startIn, final int startOut) {
 592         reset(startIn);
 593         sb.setLength(startOut);
 594     }
 595 
 596     private void push(final char ch) {
 597         expected.put(ch, expected.get(ch) + 1);
 598     }
 599 
 600     private void pop(final char ch) {
 601         expected.put(ch, Math.min(0, expected.get(ch) - 1));
 602     }
 603 
 604     /*
 605      * Recursive descent tokenizer starts below.
 606      */
 607 
 608     /*
 609      * Pattern ::
 610      *      Disjunction
 611      */
 612     private Token pattern() {
 613         final Token token = new Token(Token.Type.PATTERN);
 614 
 615         final Token child = disjunction();
 616         if (child != null) {
 617             return token.add(child);
 618         }
 619 
 620         return null;
 621     }
 622 
 623     /*
 624      * Disjunction ::
 625      *      Alternative
 626      *      Alternative | Disjunction
 627      */
 628     private Token disjunction() {
 629         final Token token = new Token(Token.Type.DISJUNCTION);
 630 
 631         while (true) {
 632             token.add(alternative());
 633 
 634             if (ch0 == '|') {
 635                 commit(token, 1);
 636             } else {
 637                 break;
 638             }
 639         }
 640 
 641         return token;
 642     }
 643 
 644     /*
 645      * Alternative ::
 646      *      [empty]
 647      *      Alternative Term
 648      */
 649     private Token alternative() {
 650         final Token token = new Token(Token.Type.ALTERNATIVE);
 651 
 652         Token child;
 653         while ((child = term()) != null) {
 654             token.add(child);
 655         }
 656 
 657         return token;
 658     }
 659 
 660     /*
 661      * Term ::
 662      *      Assertion
 663      *      Atom
 664      *      Atom Quantifier
 665      */
 666     private Token term() {
 667         final int startIn  = position;
 668         final int startOut = sb.length();
 669         final Token token  = new Token(Token.Type.TERM);
 670         Token child;
 671 
 672         child = assertion();
 673         if (child != null) {
 674             return token.add(child);
 675         }
 676 
 677         child = atom();
 678         if (child != null) {
 679             boolean emptyCharacterClass = false;
 680             if ("[]".equals(child.toString())) {
 681                 emptyCharacterClass = true;
 682             }
 683 
 684             token.add(child);
 685 
 686             final Token quantifier = quantifier();
 687             if (quantifier != null) {
 688                 token.add(quantifier);
 689             }
 690 
 691             if (emptyCharacterClass) {
 692                 if (quantifier == null) {
 693                     neverMatches = true; //never matches ever.
 694                 } else {
 695                     //if we can get away with max zero, remove this entire token
 696                     final String qs = quantifier.toString();
 697                     if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
 698                         token.setIsDead(true);
 699                     }
 700                 }
 701             }
 702 
 703             return token;
 704         }
 705 
 706         restart(startIn, startOut);
 707         return null;
 708     }
 709 
 710     /*
 711      * Assertion ::
 712      *      ^
 713      *      $
 714      *      \b
 715      *      \B
 716      *      ( ? = Disjunction )
 717      *      ( ? ! Disjunction )
 718      */
 719     private Token assertion() {
 720         final int startIn  = position;
 721         final int startOut = sb.length();
 722         final Token token  = new Token(Token.Type.ASSERTION);
 723 
 724         switch (ch0) {
 725         case '^':
 726         case '$':
 727             return commit(token, 1);
 728 
 729         case '\\':
 730             if (ch1 == 'b' || ch1 == 'B') {
 731                 return commit(token, 2);
 732             }
 733             break;
 734 
 735         case '(':
 736             if (ch1 != '?') {
 737                 break;
 738             }
 739             if (ch2 != '=' && ch2 != '!') {
 740                 break;
 741             }
 742             final boolean isNegativeLookahead = (ch2 == '!');
 743             commit(token, 3);
 744 
 745             if (isNegativeLookahead) {
 746                 negativeLookaheadLevel++;
 747             }
 748             final Token disjunction = disjunction();
 749             if (isNegativeLookahead) {
 750                 for (final Capture cap : caps) {
 751                     if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
 752                         cap.setDead();
 753                     }
 754                 }
 755                 negativeLookaheadLevel--;
 756             }
 757 
 758             if (disjunction != null && ch0 == ')') {
 759                 token.add(disjunction);
 760                 return commit(token, 1);
 761             }
 762             break;
 763 
 764         default:
 765             break;
 766         }
 767 
 768         restart(startIn, startOut);
 769 
 770         return null;
 771     }
 772 
 773     /*
 774      * Quantifier ::
 775      *      QuantifierPrefix
 776      *      QuantifierPrefix ?
 777      */
 778     private Token quantifier() {
 779         final Token token = new Token(Token.Type.QUANTIFIER);
 780         final Token child = quantifierPrefix();
 781         if (child != null) {
 782             token.add(child);
 783             if (ch0 == '?') {
 784                 commit(token, 1);
 785             }
 786             return token;
 787         }
 788         return null;
 789     }
 790 
 791     /*
 792      * QuantifierPrefix ::
 793      *      *
 794      *      +
 795      *      ?
 796      *      { DecimalDigits }
 797      *      { DecimalDigits , }
 798      *      { DecimalDigits , DecimalDigits }
 799      */
 800     private Token quantifierPrefix() {
 801         final int startIn  = position;
 802         final int startOut = sb.length();
 803         final Token token  = new Token(Token.Type.QUANTIFIER_PREFIX);
 804 
 805         switch (ch0) {
 806         case '*':
 807         case '+':
 808         case '?':
 809             return commit(token, 1);
 810 
 811         case '{':
 812             commit(token, 1);
 813 
 814             final Token child = decimalDigits();
 815             if (child == null) {
 816                 break; // not a quantifier - back out
 817             }
 818             push('}');
 819             token.add(child);
 820 
 821             if (ch0 == ',') {
 822                 commit(token, 1);
 823                 token.add(decimalDigits());
 824             }
 825 
 826             if (ch0 == '}') {
 827                 pop('}');
 828                 commit(token, 1);
 829             }
 830 
 831             return token;
 832 
 833         default:
 834             break;
 835         }
 836 
 837         restart(startIn, startOut);
 838         return null;
 839     }
 840 
 841     /*
 842      * Atom ::
 843      *      PatternCharacter
 844      *      .
 845      *      \ AtomEscape
 846      *      CharacterClass
 847      *      ( Disjunction )
 848      *      ( ? : Disjunction )
 849      *
 850      */
 851     private Token atom() {
 852         final int startIn  = position;
 853         final int startOut = sb.length();
 854         final Token token  = new Token(Token.Type.ATOM);
 855         Token child;
 856 
 857         child = patternCharacter();
 858         if (child != null) {
 859             return token.add(child);
 860         }
 861 
 862         if (ch0 == '.') {
 863             return commit(token, 1);
 864         }
 865 
 866         if (ch0 == '\\') {
 867             commit(token, 1);
 868             child = atomEscape();
 869 
 870             if (child != null) {
 871                 if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
 872                     final char idEscape = child.toString().charAt(0);
 873                     if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
 874                         token.reset();
 875                     }
 876                 }
 877 
 878                 token.add(child);
 879 
 880                 // forward backreferences always match empty. JavaScript != Java
 881                 if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
 882                     final int refNum = Integer.parseInt(child.toString());
 883 
 884                     if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
 885                         // reference to dead in-negative-lookahead capture
 886                         token.setIsDead(true);
 887                     } else if (caps.size() < refNum) {
 888                         // forward reference: always matches against empty string (dead token).
 889                         // invalid reference (non-existant capture): pattern never matches.
 890                         forwardReferences.put(refNum, token);
 891                     }
 892                 }
 893 
 894                 return token;
 895             }
 896         }
 897 
 898         child = characterClass();
 899         if (child != null) {
 900             return token.add(child);
 901         }
 902 
 903         if (ch0 == '(') {
 904             boolean capturingParens = true;
 905             commit(token, 1);
 906             if (ch0 == '?' && ch1 == ':') {
 907                 capturingParens = false;
 908                 commit(token, 2);
 909             }
 910 
 911             child = disjunction();
 912             if (child != null) {
 913                 token.add(child);
 914                 if (ch0 == ')') {
 915                     final Token atom = commit(token, 1);
 916                     if (capturingParens) {
 917                         caps.add(new Capture(negativeLookaheadLevel));
 918                     }
 919                     return atom;
 920                 }
 921             }
 922         }
 923 
 924         restart(startIn, startOut);
 925         return null;
 926     }
 927 
 928     /*
 929      * PatternCharacter ::
 930      *      SourceCharacter but not any of: ^$\.*+?()[]{}|
 931      */
 932     @SuppressWarnings("fallthrough")
 933     private Token patternCharacter() {
 934         if (atEOF()) {
 935             return null;
 936         }
 937 
 938         switch (ch0) {
 939         case '^':
 940         case '$':
 941         case '\\':
 942         case '.':
 943         case '*':
 944         case '+':
 945         case '?':
 946         case '(':
 947         case ')':
 948         case '[':
 949         case '|':
 950             return null;
 951 
 952         case '}':
 953         case ']':
 954             final int n = expected.get(ch0);
 955             if (n != 0) {
 956                 return null;
 957             }
 958 
 959        case '{':
 960            // if not a valid quantifier escape curly brace to match itself
 961            // this ensures compatibility with other JS implementations
 962            final Token quant = quantifierPrefix();
 963            return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
 964 
 965         default:
 966             return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
 967         }
 968     }
 969 
 970     /*
 971      * AtomEscape ::
 972      *      DecimalEscape
 973      *      CharacterEscape
 974      *      CharacterClassEscape
 975      */
 976     private Token atomEscape() {
 977         final Token token = new Token(Token.Type.ATOM_ESCAPE);
 978         Token child;
 979 
 980         child = decimalEscape();
 981         if (child != null) {
 982             return token.add(child);
 983         }
 984 
 985         child = characterClassEscape();
 986         if (child != null) {
 987             return token.add(child);
 988         }
 989 
 990         child = characterEscape();
 991         if (child != null) {
 992             return token.add(child);
 993         }
 994 
 995 
 996         return null;
 997     }
 998 
 999     /*
1000      * CharacterEscape ::
1001      *      ControlEscape
1002      *      c ControlLetter
1003      *      HexEscapeSequence
1004      *      UnicodeEscapeSequence
1005      *      IdentityEscape
1006      */
1007     private Token characterEscape() {
1008         final int startIn  = position;
1009         final int startOut = sb.length();
1010 
1011         final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
1012         Token child;
1013 
1014         child = controlEscape();
1015         if (child != null) {
1016             return token.add(child);
1017         }
1018 
1019         if (ch0 == 'c') {
1020             commit(token, 1);
1021             child = controlLetter();
1022             if (child != null) {
1023                 return token.add(child);
1024             }
1025             restart(startIn, startOut);
1026         }
1027 
1028         child = hexEscapeSequence();
1029         if (child != null) {
1030             return token.add(child);
1031         }
1032 
1033         child = unicodeEscapeSequence();
1034         if (child != null) {
1035             return token.add(child);
1036         }
1037 
1038         child = identityEscape();
1039         if (child != null) {
1040             return token.add(child);
1041         }
1042 
1043         restart(startIn, startOut);
1044 
1045         return null;
1046     }
1047 
1048     private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
1049         final int startIn  = position;
1050         final int startOut = sb.length();
1051 
1052         if (ch0 != leader) {
1053             return false;
1054         }
1055 
1056         commit(token, 1);
1057         for (int i = 0; i < length; i++) {
1058             final char ch0l = Character.toLowerCase(ch0);
1059             if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
1060                 commit(token, 1);
1061             } else {
1062                 restart(startIn, startOut);
1063                 return false;
1064             }
1065         }
1066 
1067         return true;
1068     }
1069 
1070     private Token hexEscapeSequence() {
1071         final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
1072         if (scanEscapeSequence('x', 2, token)) {
1073             return token;
1074         }
1075         return null;
1076     }
1077 
1078     private Token unicodeEscapeSequence() {
1079         final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
1080         if (scanEscapeSequence('u', 4, token)) {
1081             return token;
1082         }
1083         return null;
1084     }
1085 
1086     /*
1087      * ControlEscape ::
1088      *      one of fnrtv
1089      */
1090     private Token controlEscape() {
1091         switch (ch0) {
1092         case 'f':
1093         case 'n':
1094         case 'r':
1095         case 't':
1096         case 'v':
1097             return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
1098 
1099         default:
1100             return null;
1101         }
1102     }
1103 
1104     /*
1105      * ControlLetter ::
1106      *      one of abcdefghijklmnopqrstuvwxyz
1107      *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
1108      */
1109     private Token controlLetter() {
1110         final char c = Character.toUpperCase(ch0);
1111         if (c >= 'A' && c <= 'Z') {
1112             final Token token = new Token(Token.Type.CONTROL_LETTER);
1113             commit(token, 1);
1114             return token;
1115         }
1116         return null;
1117         /*
1118         Token token = new Token(Token.Type.CONTROL_LETTER);
1119         commit(null, 1);//add original char to builder not to token
1120         this.neverMatches = c < 'A' || c > 'Z';
1121         return token.add(""+c);*/
1122     }
1123 
1124     /*
1125      * IdentityEscape ::
1126      *      SourceCharacter but not IdentifierPart
1127      *      <ZWJ>  (200c)
1128      *      <ZWNJ> (200d)
1129      */
1130     private Token identityEscape() {
1131         final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
1132         commit(token, 1);
1133         return token;
1134     }
1135 
1136     /*
1137      * DecimalEscape ::
1138      *      DecimalIntegerLiteral [lookahead DecimalDigit]
1139      */
1140     private Token decimalEscape() {
1141         final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
1142         final int startIn  = position;
1143         final int startOut = sb.length();
1144 
1145         if (ch0 == '0' && !isDecimalDigit(ch1)) {
1146             commit(token, 1);
1147             token.removeLast();
1148             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
1149             return token.add("\u0000");
1150         }
1151 
1152         if (isDecimalDigit(ch0)) {
1153             while (isDecimalDigit(ch0)) {
1154                 commit(token, 1);
1155             }
1156             return token;
1157         }
1158 
1159         restart(startIn, startOut);
1160 
1161         return null;
1162     }
1163 
1164     /*
1165      * CharacterClassEscape ::
1166      *  one of dDsSwW
1167      */
1168     private Token characterClassEscape() {
1169         switch (ch0) {
1170         case 's':
1171         case 'S':
1172         case 'd':
1173         case 'D':
1174         case 'w':
1175         case 'W':
1176             return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
1177 
1178         default:
1179             return null;
1180         }
1181     }
1182 
1183     /*
1184      * CharacterClass ::
1185      *      [ [lookahead {^}] ClassRanges ]
1186      *      [ ^ ClassRanges ]
1187      */
1188     private Token characterClass() {
1189         final int startIn  = position;
1190         final int startOut = sb.length();
1191         final Token token  = new Token(Token.Type.CHARACTERCLASS);
1192 
1193         if (ch0 == '[') {
1194             push(']');
1195             commit(token, 1);
1196 
1197             if (ch0 == '^') {
1198                 commit(token, 1);
1199             }
1200 
1201             final Token child = classRanges();
1202             if (child != null && ch0 == ']') {
1203                 pop(']');
1204                 token.add(child);
1205                 return commit(token, 1);
1206             }
1207         }
1208 
1209         restart(startIn, startOut);
1210         return null;
1211     }
1212 
1213     /*
1214      * ClassRanges ::
1215      *      [empty]
1216      *      NonemptyClassRanges
1217      */
1218     private Token classRanges() {
1219         return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
1220     }
1221 
1222     /*
1223      * NonemptyClassRanges ::
1224      *      ClassAtom
1225      *      ClassAtom NonemptyClassRangesNoDash
1226      *      ClassAtom - ClassAtom ClassRanges
1227      */
1228     private Token nonemptyClassRanges() {
1229         final int startIn  = position;
1230         final int startOut = sb.length();
1231         final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
1232         Token child;
1233 
1234         child = classAtom();
1235         if (child != null) {
1236             token.add(child);
1237 
1238             if (ch0 == '-') {
1239                 commit(token, 1);
1240 
1241                 final Token child1 = classAtom();
1242                 final Token child2 = classRanges();
1243                 if (child1 != null && child2 != null) {
1244                     token.add(child1);
1245                     token.add(child2);
1246 
1247                     return token;
1248                 }
1249             }
1250 
1251             child = nonemptyClassRangesNoDash();
1252             if (child != null) {
1253                 token.add(child);
1254                 return token;
1255             }
1256 
1257             return token;
1258         }
1259 
1260         restart(startIn, startOut);
1261         return null;
1262     }
1263 
1264     /*
1265      * NonemptyClassRangesNoDash ::
1266      *      ClassAtom
1267      *      ClassAtomNoDash NonemptyClassRangesNoDash
1268      *      ClassAtomNoDash - ClassAtom ClassRanges
1269      */
1270     private Token nonemptyClassRangesNoDash() {
1271         final int startIn  = position;
1272         final int startOut = sb.length();
1273         final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
1274         Token child;
1275 
1276         child = classAtomNoDash();
1277         if (child != null) {
1278             token.add(child);
1279 
1280             // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
1281             if (ch0 == '-') {
1282                commit(token, 1);
1283 
1284                final Token child1 = classAtom();
1285                final Token child2 = classRanges();
1286                if (child1 != null && child2 != null) {
1287                    token.add(child1);
1288                    return token.add(child2);
1289                }
1290                //fallthru
1291            }
1292 
1293             child = nonemptyClassRangesNoDash();
1294             if (child != null) {
1295                 token.add(child);
1296             }
1297             return token; // still a class atom
1298         }
1299 
1300         child = classAtom();
1301         if (child != null) {
1302             return token.add(child);
1303         }
1304 
1305         restart(startIn, startOut);
1306         return null;
1307     }
1308 
1309     /*
1310      * ClassAtom : - ClassAtomNoDash
1311      */
1312     private Token classAtom() {
1313         final Token token = new Token(Token.Type.CLASSATOM);
1314 
1315         if (ch0 == '-') {
1316             return commit(token, 1);
1317         }
1318 
1319         final Token child = classAtomNoDash();
1320         if (child != null) {
1321             return token.add(child);
1322         }
1323 
1324         return null;
1325     }
1326 
1327     /*
1328      * ClassAtomNoDash ::
1329      *      SourceCharacter but not one of \ or ] or -
1330      *      \ ClassEscape
1331      */
1332     private Token classAtomNoDash() {
1333         final int startIn  = position;
1334         final int startOut = sb.length();
1335         final Token token  = new Token(Token.Type.CLASSATOM_NODASH);
1336 
1337         switch (ch0) {
1338         case ']':
1339         case '-':
1340         case '\0':
1341             return null;
1342 
1343         case '[':
1344             // unescaped left square bracket - add escape
1345             return commit(token.add("\\"), 1);
1346 
1347         case '\\':
1348             commit(token, 1);
1349             final Token child = classEscape();
1350             if (child != null) {
1351                 return token.add(child);
1352             }
1353 
1354             restart(startIn, startOut);
1355             return null;
1356 
1357         default:
1358             return commit(token, 1);
1359         }
1360     }
1361 
1362     /*
1363      * ClassEscape ::
1364      *      DecimalEscape
1365      *      b
1366      *      CharacterEscape
1367      *      CharacterClassEscape
1368      */
1369     private Token classEscape() {
1370         final Token token = new Token(Token.Type.CLASS_ESCAPE);
1371         Token child;
1372 
1373         child = decimalEscape();
1374         if (child != null) {
1375             return token.add(child);
1376         }
1377 
1378         if (ch0 == 'b') {
1379             return commit(token, 1);
1380         }
1381 
1382         child = characterEscape();
1383         if (child != null) {
1384             return token.add(child);
1385         }
1386 
1387         child = characterClassEscape();
1388         if (child != null) {
1389             return token.add(child);
1390         }
1391 
1392         return null;
1393     }
1394 
1395     /*
1396      * DecimalDigits
1397      */
1398     private Token decimalDigits() {
1399         if (!isDecimalDigit(ch0)) {
1400             return null;
1401         }
1402 
1403         final Token token = new Token(Token.Type.DECIMALDIGITS);
1404         while (isDecimalDigit(ch0)) {
1405             commit(token, 1);
1406         }
1407 
1408         return token;
1409     }
1410 
1411     private static boolean isDecimalDigit(final char ch) {
1412         return ch >= '0' && ch <= '9';
1413     }
1414 }