1 /*
   2  * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package jdk.nashorn.internal.parser;
  27 
  28 import java.util.ArrayList;
  29 import java.util.HashMap;
  30 import java.util.Iterator;
  31 import java.util.LinkedHashMap;
  32 import java.util.LinkedList;
  33 import java.util.List;
  34 import java.util.Map;
  35 import java.util.regex.PatternSyntaxException;
  36 import jdk.nashorn.internal.runtime.BitVector;
  37 
  38 /**
  39  * Scan a JavaScript regexp, converting to Java regex if necessary.
  40  *
  41  */
  42 public class RegExpScanner extends Scanner {
  43 
  44     /**
  45      * String builder to accumulate the result - this contains verbatim parsed JavaScript.
  46      * to get the java equivalent we need to create a Pattern token and return its toString()
  47      */
  48     private final StringBuilder sb;
  49 
  50     /** An optional error message if one occurred during parse. */
  51     private String errorMessage;
  52 
  53     /** Is this the special case of a regexp that never matches anything */
  54     private boolean neverMatches;
  55 
  56     /** The resulting java.util.regex pattern string. */
  57     private String javaPattern;
  58 
  59     /** Expected token table */
  60     private final Map<Character, Integer> expected = new HashMap<>();
  61 
  62     /** Capturing parenthesis that have been found so far. */
  63     private final List<Capture> caps = new LinkedList<>();
  64 
  65     /** Forward references to capturing parenthesis to be resolved later.*/
  66     private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
  67 
  68     /** Current level of zero-width negative lookahead assertions. */
  69     private int negativeLookaheadLevel;
  70 
  71     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
  72 
  73     private static class Capture {
  74         /**
  75          * Zero-width negative lookaheads enclosing the capture.
  76          */
  77         private final int negativeLookaheadLevel;
  78         /**
  79          * Captures that live inside a negative lookahead are dead after the
  80          * lookahead and will be undefined if referenced from outside.
  81          */
  82         private boolean isDead;
  83 
  84         Capture(final int negativeLookaheadLevel) {
  85             this.negativeLookaheadLevel = negativeLookaheadLevel;
  86         }
  87 
  88         public int getNegativeLookaheadLevel() {
  89             return negativeLookaheadLevel;
  90         }
  91 
  92         public boolean isDead() {
  93             return isDead;
  94         }
  95 
  96         public void setDead() {
  97             this.isDead = true;
  98         }
  99     }
 100 
 101     /**
 102      * This is a token - the JavaScript regexp is scanned into a token tree
 103      * A token has other tokens as children as well as "atoms", i.e. Strings.
 104      *
 105      */
 106     private static class Token {
 107 
 108         private enum Type {
 109             PATTERN,
 110             DISJUNCTION,
 111             ALTERNATIVE,
 112             TERM,
 113             ASSERTION,
 114             QUANTIFIER,
 115             QUANTIFIER_PREFIX,
 116             ATOM,
 117             PATTERN_CHARACTER,
 118             ATOM_ESCAPE,
 119             CHARACTER_ESCAPE,
 120             CONTROL_ESCAPE,
 121             CONTROL_LETTER,
 122             IDENTITY_ESCAPE,
 123             DECIMAL_ESCAPE,
 124             CHARACTERCLASS_ESCAPE,
 125             CHARACTERCLASS,
 126             CLASSRANGES,
 127             NON_EMPTY_CLASSRANGES,
 128             NON_EMPTY_CLASSRANGES_NODASH,
 129             CLASSATOM,
 130             CLASSATOM_NODASH,
 131             CLASS_ESCAPE,
 132             DECIMALDIGITS,
 133             HEX_ESCAPESEQUENCE,
 134             UNICODE_ESCAPESEQUENCE,
 135         }
 136 
 137         /**
 138          * Token tyoe
 139          */
 140         private final Token.Type type;
 141 
 142         /**
 143          * Child nodes
 144          */
 145         private final List<Object> children;
 146 
 147         /**
 148          * Parent node
 149          */
 150         private Token parent;
 151 
 152         /**
 153          * Dead code flag
 154          */
 155         private boolean isDead;
 156 
 157         private static final Map<Type, ToString> toStringMap = new HashMap<>();
 158         private static final ToString DEFAULT_TOSTRING = new ToString();
 159 
 160         private static String unicode(final int value) {
 161             final StringBuilder sb = new StringBuilder();
 162             final String hex = Integer.toHexString(value);
 163             sb.append('u');
 164             for (int i = 0; i < 4 - hex.length(); i++) {
 165                 sb.append('0');
 166             }
 167             sb.append(hex);
 168 
 169             return sb.toString();
 170         }
 171 
 172         static {
 173             toStringMap.put(Type.CHARACTERCLASS, new ToString() {
 174                 @Override
 175                 public String toString(final Token token) {
 176                     return super.toString(token).replace("\\b", "\b");
 177                 }
 178             });
 179 
 180             // for some reason java regexps don't like control characters on the
 181             // form "\\ca".match([string with ascii 1 at char0]). Translating
 182             // them to unicode does it though.
 183             toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
 184                 @Override
 185                 public String toString(final Token token) {
 186                     final String str = super.toString(token);
 187                     if (str.length() == 2) {
 188                         return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
 189                     }
 190                     return str;
 191                 }
 192             });
 193 
 194             toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
 195                 @Override
 196                 public String toString(final Token token) {
 197                     final String str = super.toString(token);
 198 
 199                     if ("\0".equals(str)) {
 200                         return str;
 201                     }
 202 
 203                     int value;
 204 
 205                     if (!token.hasParentOfType(Type.CLASSRANGES)) {
 206                         return str;
 207                     }
 208 
 209                     value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
 210                     if (value > 0xff) {
 211                         throw new NumberFormatException(str);
 212                     }
 213 
 214                     return Token.unicode(value);
 215                 }
 216             });
 217 
 218         }
 219 
 220         /**
 221          * JavaScript Token to Java regex substring framework.
 222          *
 223          */
 224         private static class ToString {
 225             String toString(final Token token) {
 226                 final StringBuilder sb = new StringBuilder();
 227                 for (final Object child : token.getChildren()) {
 228                     sb.append(child);
 229                 }
 230 
 231                 //perform global substitutions that hold true for any evaluated form
 232                 String str = sb.toString();
 233                 if (str.equals("[^]")) {







 234                     str = "[\\s\\S]";



 235                 }
 236                 return str;
 237             }
 238         }
 239 
 240         /**
 241          * Token iterator. Doesn't return "atom" children. i.e. string representations,
 242          * just tokens
 243          *
 244          */
 245         private static class TokenIterator implements Iterator<Token> {
 246             private final List<Token> preorder;
 247 
 248             private void init(final Token root) {
 249                 preorder.add(root);
 250                 for (final Object child : root.getChildren()) {
 251                     if (child instanceof Token) {
 252                         init((Token)child);
 253                     }
 254                 }
 255             }
 256 
 257             TokenIterator(final Token root) {
 258                 preorder = new ArrayList<>();
 259                 init(root);
 260             }
 261 
 262             @Override
 263             public boolean hasNext() {
 264                 return !preorder.isEmpty();
 265             }
 266 
 267             @Override
 268             public Token next() {
 269                 return preorder.remove(0);
 270             }
 271 
 272             @Override
 273             public void remove() {
 274                 next();
 275             }
 276         }
 277 
 278         /**
 279          * Constructor
 280          * @param type the token type
 281          */
 282         Token(final Token.Type type) {
 283             this.type = type;
 284             children = new ArrayList<>();
 285         }
 286 
 287         /**
 288          * Add a an "atom" child to a token
 289          * @param child the child to add
 290          * @return the token (for chaining)
 291          */
 292         public Token add(final String child) {
 293             children.add(child);
 294             return this;
 295         }
 296 
 297         /**
 298          * Add a child to a token
 299          * @param child the child
 300          * @return the token (for chaining)
 301          */
 302         public Token add(final Token child) {
 303             if (child != null) {
 304                 children.add(child);
 305                 child.setParent(this);
 306             }
 307             return this;
 308         }
 309 
 310         /**
 311          * Remove a child from a token
 312          * @param child the child to remove
 313          * @return true if successful
 314          */
 315         public boolean remove(final Token child) {
 316             return children.remove(child);
 317         }
 318 
 319         /**
 320          * Remove the last child from a token
 321          * @return the removed child
 322          */
 323         public Object removeLast() {
 324             return children.remove(children.size() - 1);
 325         }
 326 
 327         /**
 328          * Flag this token as dead code
 329          * @param isDead is it dead or not
 330          */
 331         private void setIsDead(final boolean isDead) {
 332             this.isDead = isDead;
 333         }
 334 
 335         /**
 336          * Is this token dead code
 337          * @return boolean
 338          */
 339         private boolean getIsDead() {
 340             return isDead;
 341         }
 342 
 343         /**
 344          * Get the parent of this token
 345          * @return parent token
 346          */
 347         public Token getParent() {
 348             return parent;
 349         }
 350 
 351         public boolean hasParentOfType(final Token.Type parentType) {
 352             for (Token p = getParent(); p != null; p = p.getParent()) {
 353                 if (p.getType() == parentType) {
 354                     return true;
 355                 }
 356             }
 357             return false;
 358         }
 359 
 360         public boolean hasChildOfType(final Token.Type childType) {
 361             for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
 362                 if (iter.next().getType() == childType) {
 363                     return true;
 364                 }
 365             }
 366             return false;
 367         }
 368 
 369         /**
 370          * Set the parent of this token
 371          * @param parent
 372          */
 373         private void setParent(final Token parent) {
 374             this.parent = parent;
 375         }
 376 
 377         /**
 378          * Get the children of this token
 379          * @return an array of children, never null
 380          */
 381         public Object[] getChildren() {
 382             return children.toArray();
 383         }
 384 
 385         /**
 386          * Reset this token, remove all children
 387          */
 388         public void reset() {
 389             children.clear();
 390         }
 391 
 392         /**
 393          * Get a preorder token iterator with this token as root
 394          * @return an iterator
 395          */
 396         public Iterator<Token> iterator() {
 397             return new TokenIterator(this);
 398         }
 399 
 400         /**
 401          * Get the type of this token
 402          * @return type
 403          */
 404         public Type getType() {
 405             return type;
 406         }
 407 
 408         /**
 409          * Turn this token into Java regexp compatible text
 410          * @return part of a java regexp
 411          */
 412         @Override
 413         public String toString() {
 414             ToString t = toStringMap.get(getType());
 415             if (t == null) {
 416                 t = DEFAULT_TOSTRING;
 417             }
 418             return t.toString(this);
 419         }
 420     }
 421 
 422     /**
 423      * Constructor
 424      * @param string the JavaScript regexp to parse
 425      */
 426     private RegExpScanner(final String string) {
 427         super(string);
 428         sb = new StringBuilder(limit);
 429         reset(0);
 430         expected.put(']', 0);
 431         expected.put('}', 0);
 432     }
 433 
 434     private void processForwardReferences() {
 435         if (neverMatches()) {
 436             return;
 437         }
 438 
 439         for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
 440             if (fwdRef.getKey().intValue() > caps.size()) {
 441                 neverMatches = true;
 442                 break;
 443             }
 444 
 445             fwdRef.getValue().setIsDead(true);
 446         }
 447 
 448         forwardReferences.clear();
 449     }
 450 
 451     /**
 452      * Scan a JavaScript regexp string returning a Java safe regex string.
 453      *
 454      * @param string
 455      *            JavaScript regexp string.
 456      * @return Java safe regex string.
 457      */
 458     public static RegExpScanner scan(final String string) {
 459         final RegExpScanner scanner = new RegExpScanner(string);
 460 
 461         Token pattern;
 462 
 463         try {
 464             pattern = scanner.pattern();
 465         } catch (final Exception e) {
 466             throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
 467         }
 468 
 469         scanner.processForwardReferences();
 470         if (scanner.neverMatches()) {
 471             return null; // never matches
 472         }
 473 
 474         // go over the code and remove dead code
 475         final Iterator<Token> iter = pattern.iterator();
 476         while (iter.hasNext()) {
 477             final Token next = iter.next();
 478             if (next.getIsDead()) {
 479                 next.getParent().remove(next);
 480             }
 481         }
 482 
 483         // turn the pattern into a string, p, the java equivalent string for our js regexp
 484         final String p = pattern.toString();
 485         // if builder contains all tokens that were sent in, we know
 486         // we correctly parsed the entire JavaScript regexp without syntax errors
 487         if (!string.equals(scanner.getStringBuilder().toString())) {
 488             throw new PatternSyntaxException(string, p, p.length() + 1);
 489         }
 490 
 491         scanner.javaPattern = p;
 492         return scanner;
 493      }
 494 
 495     /**
 496      * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
 497      * is an example where we never match
 498      *
 499      * @return boolean
 500      */
 501     private boolean neverMatches() {
 502         return neverMatches;
 503     }
 504 
 505     /**
 506      * This is used to set better error messages that can be reused
 507      * in NativeRegExp for augmenting e.g. SyntaxErrors.
 508      *
 509      * @return an error message or null if no extra info
 510      */
 511     public String getErrorMessage() {
 512         return errorMessage;
 513     }
 514 
 515     final StringBuilder getStringBuilder() {
 516         return sb;
 517     }
 518 
 519     String getJavaPattern() {
 520         return javaPattern;
 521     }
 522 
 523     BitVector getGroupsInNegativeLookahead() {
 524         BitVector vec = null;
 525         for (int i = 0; i < caps.size(); i++) {
 526             final Capture cap = caps.get(i);
 527             if (cap.getNegativeLookaheadLevel() > 0) {
 528                 if (vec == null) {
 529                     vec = new BitVector(caps.size() + 1);
 530                 }
 531                 vec.set(i + 1);
 532             }
 533         }
 534         return vec;
 535     }
 536 
 537     /**
 538      * Commit n characters to the builder and to a given token
 539      * @param token Uncommitted token.
 540      * @param n     Number of characters.
 541      * @return Committed token
 542      */
 543     private Token commit(final Token token, final int n) {
 544         final int startIn = position;
 545 
 546         switch (n) {
 547         case 1:
 548             sb.append(ch0);
 549             skip(1);
 550             break;
 551         case 2:
 552             sb.append(ch0);
 553             sb.append(ch1);
 554             skip(2);
 555             break;
 556         case 3:
 557             sb.append(ch0);
 558             sb.append(ch1);
 559             sb.append(ch2);
 560             skip(3);
 561             break;
 562         default:
 563             assert false : "Should not reach here";
 564         }
 565 
 566         if (token == null) {
 567             return null;
 568         }
 569 
 570         return token.add(sb.substring(startIn, sb.length()));
 571     }
 572 
 573     /**
 574      * Restart the buffers back at an earlier position.
 575      *
 576      * @param startIn
 577      *            Position in the input stream.
 578      * @param startOut
 579      *            Position in the output stream.
 580      */
 581     private void restart(final int startIn, final int startOut) {
 582         reset(startIn);
 583         sb.setLength(startOut);
 584     }
 585 
 586     private void push(final char ch) {
 587         expected.put(ch, expected.get(ch) + 1);
 588     }
 589 
 590     private void pop(final char ch) {
 591         expected.put(ch, Math.min(0, expected.get(ch) - 1));
 592     }
 593 
 594     /*
 595      * Recursive descent tokenizer starts below.
 596      */
 597 
 598     /*
 599      * Pattern ::
 600      *      Disjunction
 601      */
 602     private Token pattern() {
 603         final Token token = new Token(Token.Type.PATTERN);
 604 
 605         final Token child = disjunction();
 606         if (child != null) {
 607             return token.add(child);
 608         }
 609 
 610         return null;
 611     }
 612 
 613     /*
 614      * Disjunction ::
 615      *      Alternative
 616      *      Alternative | Disjunction
 617      */
 618     private Token disjunction() {
 619         final Token token = new Token(Token.Type.DISJUNCTION);
 620 
 621         while (true) {
 622             token.add(alternative());
 623 
 624             if (ch0 == '|') {
 625                 commit(token, 1);
 626             } else {
 627                 break;
 628             }
 629         }
 630 
 631         return token;
 632     }
 633 
 634     /*
 635      * Alternative ::
 636      *      [empty]
 637      *      Alternative Term
 638      */
 639     private Token alternative() {
 640         final Token token = new Token(Token.Type.ALTERNATIVE);
 641 
 642         Token child;
 643         while ((child = term()) != null) {
 644             token.add(child);
 645         }
 646 
 647         return token;
 648     }
 649 
 650     /*
 651      * Term ::
 652      *      Assertion
 653      *      Atom
 654      *      Atom Quantifier
 655      */
 656     private Token term() {
 657         final int startIn  = position;
 658         final int startOut = sb.length();
 659         final Token token  = new Token(Token.Type.TERM);
 660         Token child;
 661 
 662         child = assertion();
 663         if (child != null) {
 664             return token.add(child);
 665         }
 666 
 667         child = atom();
 668         if (child != null) {
 669             boolean emptyCharacterClass = false;
 670             if ("[]".equals(child.toString())) {
 671                 emptyCharacterClass = true;
 672             }
 673 
 674             token.add(child);
 675 
 676             final Token quantifier = quantifier();
 677             if (quantifier != null) {
 678                 token.add(quantifier);
 679             }
 680 
 681             if (emptyCharacterClass) {
 682                 if (quantifier == null) {
 683                     neverMatches = true; //never matches ever.
 684                 } else {
 685                     //if we can get away with max zero, remove this entire token
 686                     final String qs = quantifier.toString();
 687                     if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
 688                         token.setIsDead(true);
 689                     }
 690                 }
 691             }
 692 
 693             return token;
 694         }
 695 
 696         restart(startIn, startOut);
 697         return null;
 698     }
 699 
 700     /*
 701      * Assertion ::
 702      *      ^
 703      *      $
 704      *      \b
 705      *      \B
 706      *      ( ? = Disjunction )
 707      *      ( ? ! Disjunction )
 708      */
 709     private Token assertion() {
 710         final int startIn  = position;
 711         final int startOut = sb.length();
 712         final Token token  = new Token(Token.Type.ASSERTION);
 713 
 714         switch (ch0) {
 715         case '^':
 716         case '$':
 717             return commit(token, 1);
 718 
 719         case '\\':
 720             if (ch1 == 'b' || ch1 == 'B') {
 721                 return commit(token, 2);
 722             }
 723             break;
 724 
 725         case '(':
 726             if (ch1 != '?') {
 727                 break;
 728             }
 729             if (ch2 != '=' && ch2 != '!') {
 730                 break;
 731             }
 732             final boolean isNegativeLookahead = (ch2 == '!');
 733             commit(token, 3);
 734 
 735             if (isNegativeLookahead) {
 736                 negativeLookaheadLevel++;
 737             }
 738             final Token disjunction = disjunction();
 739             if (isNegativeLookahead) {
 740                 for (final Capture cap : caps) {
 741                     if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
 742                         cap.setDead();
 743                     }
 744                 }
 745                 negativeLookaheadLevel--;
 746             }
 747 
 748             if (disjunction != null && ch0 == ')') {
 749                 token.add(disjunction);
 750                 return commit(token, 1);
 751             }
 752             break;
 753 
 754         default:
 755             break;
 756         }
 757 
 758         restart(startIn, startOut);
 759 
 760         return null;
 761     }
 762 
 763     /*
 764      * Quantifier ::
 765      *      QuantifierPrefix
 766      *      QuantifierPrefix ?
 767      */
 768     private Token quantifier() {
 769         final Token token = new Token(Token.Type.QUANTIFIER);
 770         final Token child = quantifierPrefix();
 771         if (child != null) {
 772             token.add(child);
 773             if (ch0 == '?') {
 774                 commit(token, 1);
 775             }
 776             return token;
 777         }
 778         return null;
 779     }
 780 
 781     /*
 782      * QuantifierPrefix ::
 783      *      *
 784      *      +
 785      *      ?
 786      *      { DecimalDigits }
 787      *      { DecimalDigits , }
 788      *      { DecimalDigits , DecimalDigits }
 789      */
 790     private Token quantifierPrefix() {
 791         final int startIn  = position;
 792         final int startOut = sb.length();
 793         final Token token  = new Token(Token.Type.QUANTIFIER_PREFIX);
 794 
 795         switch (ch0) {
 796         case '*':
 797         case '+':
 798         case '?':
 799             return commit(token, 1);
 800 
 801         case '{':
 802             commit(token, 1);
 803 
 804             final Token child = decimalDigits();
 805             if (child == null) {
 806                 break; // not a quantifier - back out
 807             }
 808             push('}');
 809             token.add(child);
 810 
 811             if (ch0 == ',') {
 812                 commit(token, 1);
 813                 token.add(decimalDigits());
 814             }
 815 
 816             if (ch0 == '}') {
 817                 pop('}');
 818                 commit(token, 1);
 819             }
 820 
 821             return token;
 822 
 823         default:
 824             break;
 825         }
 826 
 827         restart(startIn, startOut);
 828         return null;
 829     }
 830 
 831     /*
 832      * Atom ::
 833      *      PatternCharacter
 834      *      .
 835      *      \ AtomEscape
 836      *      CharacterClass
 837      *      ( Disjunction )
 838      *      ( ? : Disjunction )
 839      *
 840      */
 841     private Token atom() {
 842         final int startIn  = position;
 843         final int startOut = sb.length();
 844         final Token token  = new Token(Token.Type.ATOM);
 845         Token child;
 846 
 847         child = patternCharacter();
 848         if (child != null) {
 849             return token.add(child);
 850         }
 851 
 852         if (ch0 == '.') {
 853             return commit(token, 1);
 854         }
 855 
 856         if (ch0 == '\\') {
 857             commit(token, 1);
 858             child = atomEscape();
 859 
 860             if (child != null) {
 861                 if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
 862                     final char idEscape = child.toString().charAt(0);
 863                     if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
 864                         token.reset();
 865                     }
 866                 }
 867 
 868                 token.add(child);
 869 
 870                 // forward backreferences always match empty. JavaScript != Java
 871                 if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
 872                     final int refNum = Integer.parseInt(child.toString());
 873 
 874                     if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
 875                         // reference to dead in-negative-lookahead capture
 876                         token.setIsDead(true);
 877                     } else if (caps.size() < refNum) {
 878                         // forward reference: always matches against empty string (dead token).
 879                         // invalid reference (non-existant capture): pattern never matches.
 880                         forwardReferences.put(refNum, token);
 881                     }
 882                 }
 883 
 884                 return token;
 885             }
 886         }
 887 
 888         child = characterClass();
 889         if (child != null) {
 890             return token.add(child);
 891         }
 892 
 893         if (ch0 == '(') {
 894             boolean capturingParens = true;
 895             commit(token, 1);
 896             if (ch0 == '?' && ch1 == ':') {
 897                 capturingParens = false;
 898                 commit(token, 2);
 899             }
 900 
 901             child = disjunction();
 902             if (child != null) {
 903                 token.add(child);
 904                 if (ch0 == ')') {
 905                     final Token atom = commit(token, 1);
 906                     if (capturingParens) {
 907                         caps.add(new Capture(negativeLookaheadLevel));
 908                     }
 909                     return atom;
 910                 }
 911             }
 912         }
 913 
 914         restart(startIn, startOut);
 915         return null;
 916     }
 917 
 918     /*
 919      * PatternCharacter ::
 920      *      SourceCharacter but not any of: ^$\.*+?()[]{}|
 921      */
 922     @SuppressWarnings("fallthrough")
 923     private Token patternCharacter() {
 924         if (atEOF()) {
 925             return null;
 926         }
 927 
 928         switch (ch0) {
 929         case '^':
 930         case '$':
 931         case '\\':
 932         case '.':
 933         case '*':
 934         case '+':
 935         case '?':
 936         case '(':
 937         case ')':
 938         case '[':
 939         case '|':
 940             return null;
 941 
 942         case '}':
 943         case ']':
 944             final int n = expected.get(ch0);
 945             if (n != 0) {
 946                 return null;
 947             }
 948 
 949        case '{':
 950            // if not a valid quantifier escape curly brace to match itself
 951            // this ensures compatibility with other JS implementations
 952            final Token quant = quantifierPrefix();
 953            return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
 954 
 955         default:
 956             return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
 957         }
 958     }
 959 
 960     /*
 961      * AtomEscape ::
 962      *      DecimalEscape
 963      *      CharacterEscape
 964      *      CharacterClassEscape
 965      */
 966     private Token atomEscape() {
 967         final Token token = new Token(Token.Type.ATOM_ESCAPE);
 968         Token child;
 969 
 970         child = decimalEscape();
 971         if (child != null) {
 972             return token.add(child);
 973         }
 974 
 975         child = characterClassEscape();
 976         if (child != null) {
 977             return token.add(child);
 978         }
 979 
 980         child = characterEscape();
 981         if (child != null) {
 982             return token.add(child);
 983         }
 984 
 985 
 986         return null;
 987     }
 988 
 989     /*
 990      * CharacterEscape ::
 991      *      ControlEscape
 992      *      c ControlLetter
 993      *      HexEscapeSequence
 994      *      UnicodeEscapeSequence
 995      *      IdentityEscape
 996      */
 997     private Token characterEscape() {
 998         final int startIn  = position;
 999         final int startOut = sb.length();
1000 
1001         final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
1002         Token child;
1003 
1004         child = controlEscape();
1005         if (child != null) {
1006             return token.add(child);
1007         }
1008 
1009         if (ch0 == 'c') {
1010             commit(token, 1);
1011             child = controlLetter();
1012             if (child != null) {
1013                 return token.add(child);
1014             }
1015             restart(startIn, startOut);
1016         }
1017 
1018         child = hexEscapeSequence();
1019         if (child != null) {
1020             return token.add(child);
1021         }
1022 
1023         child = unicodeEscapeSequence();
1024         if (child != null) {
1025             return token.add(child);
1026         }
1027 
1028         child = identityEscape();
1029         if (child != null) {
1030             return token.add(child);
1031         }
1032 
1033         restart(startIn, startOut);
1034 
1035         return null;
1036     }
1037 
1038     private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
1039         final int startIn  = position;
1040         final int startOut = sb.length();
1041 
1042         if (ch0 != leader) {
1043             return false;
1044         }
1045 
1046         commit(token, 1);
1047         for (int i = 0; i < length; i++) {
1048             final char ch0l = Character.toLowerCase(ch0);
1049             if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
1050                 commit(token, 1);
1051             } else {
1052                 restart(startIn, startOut);
1053                 return false;
1054             }
1055         }
1056 
1057         return true;
1058     }
1059 
1060     private Token hexEscapeSequence() {
1061         final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
1062         if (scanEscapeSequence('x', 2, token)) {
1063             return token;
1064         }
1065         return null;
1066     }
1067 
1068     private Token unicodeEscapeSequence() {
1069         final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
1070         if (scanEscapeSequence('u', 4, token)) {
1071             return token;
1072         }
1073         return null;
1074     }
1075 
1076     /*
1077      * ControlEscape ::
1078      *      one of fnrtv
1079      */
1080     private Token controlEscape() {
1081         switch (ch0) {
1082         case 'f':
1083         case 'n':
1084         case 'r':
1085         case 't':
1086         case 'v':
1087             return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
1088 
1089         default:
1090             return null;
1091         }
1092     }
1093 
1094     /*
1095      * ControlLetter ::
1096      *      one of abcdefghijklmnopqrstuvwxyz
1097      *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
1098      */
1099     private Token controlLetter() {
1100         final char c = Character.toUpperCase(ch0);
1101         if (c >= 'A' && c <= 'Z') {
1102             final Token token = new Token(Token.Type.CONTROL_LETTER);
1103             commit(token, 1);
1104             return token;
1105         }
1106         return null;
1107         /*
1108         Token token = new Token(Token.Type.CONTROL_LETTER);
1109         commit(null, 1);//add original char to builder not to token
1110         this.neverMatches = c < 'A' || c > 'Z';
1111         return token.add(""+c);*/
1112     }
1113 
1114     /*
1115      * IdentityEscape ::
1116      *      SourceCharacter but not IdentifierPart
1117      *      <ZWJ>  (200c)
1118      *      <ZWNJ> (200d)
1119      */
1120     private Token identityEscape() {
1121         final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
1122         commit(token, 1);
1123         return token;
1124     }
1125 
1126     /*
1127      * DecimalEscape ::
1128      *      DecimalIntegerLiteral [lookahead DecimalDigit]
1129      */
1130     private Token decimalEscape() {
1131         final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
1132         final int startIn  = position;
1133         final int startOut = sb.length();
1134 
1135         if (ch0 == '0' && !isDecimalDigit(ch1)) {
1136             commit(token, 1);
1137             token.removeLast();
1138             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
1139             return token.add("\u0000");
1140         }
1141 
1142         if (isDecimalDigit(ch0)) {
1143             while (isDecimalDigit(ch0)) {
1144                 commit(token, 1);
1145             }
1146             return token;
1147         }
1148 
1149         restart(startIn, startOut);
1150 
1151         return null;
1152     }
1153 
1154     /*
1155      * CharacterClassEscape ::
1156      *  one of dDsSwW
1157      */
1158     private Token characterClassEscape() {
1159         switch (ch0) {
1160         case 's':
1161         case 'S':
1162         case 'd':
1163         case 'D':
1164         case 'w':
1165         case 'W':
1166             return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
1167 
1168         default:
1169             return null;
1170         }
1171     }
1172 
1173     /*
1174      * CharacterClass ::
1175      *      [ [lookahead {^}] ClassRanges ]
1176      *      [ ^ ClassRanges ]
1177      */
1178     private Token characterClass() {
1179         final int startIn  = position;
1180         final int startOut = sb.length();
1181         final Token token  = new Token(Token.Type.CHARACTERCLASS);
1182 
1183         if (ch0 == '[') {
1184             push(']');
1185             commit(token, 1);
1186 
1187             if (ch0 == '^') {
1188                 commit(token, 1);
1189             }
1190 
1191             final Token child = classRanges();
1192             if (child != null && ch0 == ']') {
1193                 pop(']');
1194                 token.add(child);
1195                 return commit(token, 1);
1196             }
1197         }
1198 
1199         restart(startIn, startOut);
1200         return null;
1201     }
1202 
1203     /*
1204      * ClassRanges ::
1205      *      [empty]
1206      *      NonemptyClassRanges
1207      */
1208     private Token classRanges() {
1209         return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
1210     }
1211 
1212     /*
1213      * NonemptyClassRanges ::
1214      *      ClassAtom
1215      *      ClassAtom NonemptyClassRangesNoDash
1216      *      ClassAtom - ClassAtom ClassRanges
1217      */
1218     private Token nonemptyClassRanges() {
1219         final int startIn  = position;
1220         final int startOut = sb.length();
1221         final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
1222         Token child;
1223 
1224         child = classAtom();
1225         if (child != null) {
1226             token.add(child);
1227 
1228             if (ch0 == '-') {
1229                 commit(token, 1);
1230 
1231                 final Token child1 = classAtom();
1232                 final Token child2 = classRanges();
1233                 if (child1 != null && child2 != null) {
1234                     token.add(child1);
1235                     token.add(child2);
1236 
1237                     return token;
1238                 }
1239             }
1240 
1241             child = nonemptyClassRangesNoDash();
1242             if (child != null) {
1243                 token.add(child);
1244                 return token;
1245             }
1246 
1247             return token;
1248         }
1249 
1250         restart(startIn, startOut);
1251         return null;
1252     }
1253 
1254     /*
1255      * NonemptyClassRangesNoDash ::
1256      *      ClassAtom
1257      *      ClassAtomNoDash NonemptyClassRangesNoDash
1258      *      ClassAtomNoDash - ClassAtom ClassRanges
1259      */
1260     private Token nonemptyClassRangesNoDash() {
1261         final int startIn  = position;
1262         final int startOut = sb.length();
1263         final Token token  = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
1264         Token child;
1265 
1266         child = classAtomNoDash();
1267         if (child != null) {
1268             token.add(child);
1269 
1270             // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
1271             if (ch0 == '-') {
1272                commit(token, 1);
1273 
1274                final Token child1 = classAtom();
1275                final Token child2 = classRanges();
1276                if (child1 != null && child2 != null) {
1277                    token.add(child1);
1278                    return token.add(child2);
1279                }
1280                //fallthru
1281            }
1282 
1283             child = nonemptyClassRangesNoDash();
1284             if (child != null) {
1285                 token.add(child);
1286             }
1287             return token; // still a class atom
1288         }
1289 
1290         child = classAtom();
1291         if (child != null) {
1292             return token.add(child);
1293         }
1294 
1295         restart(startIn, startOut);
1296         return null;
1297     }
1298 
1299     /*
1300      * ClassAtom : - ClassAtomNoDash
1301      */
1302     private Token classAtom() {
1303         final Token token = new Token(Token.Type.CLASSATOM);
1304 
1305         if (ch0 == '-') {
1306             return commit(token, 1);
1307         }
1308 
1309         final Token child = classAtomNoDash();
1310         if (child != null) {
1311             return token.add(child);
1312         }
1313 
1314         return null;
1315     }
1316 
1317     /*
1318      * ClassAtomNoDash ::
1319      *      SourceCharacter but not one of \ or ] or -
1320      *      \ ClassEscape
1321      */
1322     private Token classAtomNoDash() {
1323         final int startIn  = position;
1324         final int startOut = sb.length();
1325         final Token token  = new Token(Token.Type.CLASSATOM_NODASH);
1326 
1327         switch (ch0) {
1328         case ']':
1329         case '-':
1330         case '\0':
1331             return null;
1332 
1333         case '[':
1334             // unescaped left square bracket - add escape
1335             return commit(token.add("\\"), 1);
1336 
1337         case '\\':
1338             commit(token, 1);
1339             final Token child = classEscape();
1340             if (child != null) {
1341                 return token.add(child);
1342             }
1343 
1344             restart(startIn, startOut);
1345             return null;
1346 
1347         default:
1348             return commit(token, 1);
1349         }
1350     }
1351 
1352     /*
1353      * ClassEscape ::
1354      *      DecimalEscape
1355      *      b
1356      *      CharacterEscape
1357      *      CharacterClassEscape
1358      */
1359     private Token classEscape() {
1360         final Token token = new Token(Token.Type.CLASS_ESCAPE);
1361         Token child;
1362 
1363         child = decimalEscape();
1364         if (child != null) {
1365             return token.add(child);
1366         }
1367 
1368         if (ch0 == 'b') {
1369             return commit(token, 1);
1370         }
1371 
1372         child = characterEscape();
1373         if (child != null) {
1374             return token.add(child);
1375         }
1376 
1377         child = characterClassEscape();
1378         if (child != null) {
1379             return token.add(child);
1380         }
1381 
1382         return null;
1383     }
1384 
1385     /*
1386      * DecimalDigits
1387      */
1388     private Token decimalDigits() {
1389         if (!isDecimalDigit(ch0)) {
1390             return null;
1391         }
1392 
1393         final Token token = new Token(Token.Type.DECIMALDIGITS);
1394         while (isDecimalDigit(ch0)) {
1395             commit(token, 1);
1396         }
1397 
1398         return token;
1399     }
1400 
1401     private static boolean isDecimalDigit(final char ch) {
1402         return ch >= '0' && ch <= '9';
1403     }
1404 }
--- EOF ---