1 /*
   2  * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package jdk.nashorn.internal.runtime.regexp;
  27 
  28 import java.util.HashMap;
  29 import java.util.Iterator;
  30 import java.util.LinkedList;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.regex.PatternSyntaxException;
  34 import jdk.nashorn.internal.parser.Lexer;
  35 import jdk.nashorn.internal.parser.Scanner;
  36 import jdk.nashorn.internal.runtime.BitVector;
  37 
  38 /**
  39  * Scan a JavaScript regexp, converting to Java regex if necessary.
  40  *
  41  */
  42 final class RegExpScanner extends Scanner {
  43 
  44     /**
  45      * String builder used to rewrite the pattern for the currently used regexp factory.
  46      */
  47     private final StringBuilder sb;
  48 
  49     /** Expected token table */
  50     private final Map<Character, Integer> expected = new HashMap<>();
  51 
  52     /** Capturing parenthesis that have been found so far. */
  53     private final List<Capture> caps = new LinkedList<>();
  54 
  55     /** Forward references to capturing parenthesis to be resolved later.*/
  56     private final LinkedList<Integer> forwardReferences = new LinkedList<>();
  57 
  58     /** Current level of zero-width negative lookahead assertions. */
  59     private int negLookaheadLevel;
  60 
  61     /** Sequential id of current top-level zero-width negative lookahead assertion. */
  62     private int negLookaheadGroup;
  63 
  64     /** Are we currently inside a character class? */
  65     private boolean inCharClass = false;
  66 
  67     /** Are we currently inside a negated character class? */
  68     private boolean inNegativeClass = false;
  69 
  70     private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-";
  71 
  72     private static class Capture {
  73         /** Zero-width negative lookaheads enclosing the capture. */
  74         private final int negLookaheadLevel;
  75         /** Sequential id of top-level negative lookaheads containing the capture. */
  76         private  final int negLookaheadGroup;
  77 
  78         Capture(final int negLookaheadGroup, final int negLookaheadLevel) {
  79             this.negLookaheadGroup = negLookaheadGroup;
  80             this.negLookaheadLevel = negLookaheadLevel;
  81         }
  82 
  83         /**
  84          * Returns true if this Capture can be referenced from the position specified by the
  85          * group and level parameters. This is the case if either the group is not within
  86          * a negative lookahead, or the position of the referrer is in the same negative lookahead.
  87          *
  88          * @param group current negative lookahead group
  89          * @param level current negative lokahead level
  90          * @return true if this capture group can be referenced from the given position
  91          */
  92         boolean canBeReferencedFrom(final int group, final int level) {
  93             return this.negLookaheadLevel == 0 || (group == this.negLookaheadGroup && level >= this.negLookaheadLevel);
  94         }
  95 
  96     }
  97 
  98     /**
  99      * Constructor
 100      * @param string the JavaScript regexp to parse
 101      */
 102     private RegExpScanner(final String string) {
 103         super(string);
 104         sb = new StringBuilder(limit);
 105         reset(0);
 106         expected.put(']', 0);
 107         expected.put('}', 0);
 108     }
 109 
 110     private void processForwardReferences() {
 111 
 112         final Iterator<Integer> iterator = forwardReferences.descendingIterator();
 113         while (iterator.hasNext()) {
 114             final int pos = iterator.next();
 115             final int num = iterator.next();
 116             if (num > caps.size()) {
 117                 // Non-existing backreference. If the number begins with a valid octal convert it to
 118                 // Unicode escape and append the rest to a literal character sequence.
 119                 final StringBuilder buffer = new StringBuilder();
 120                 octalOrLiteral(Integer.toString(num), buffer);
 121                 sb.insert(pos, buffer);
 122             }
 123         }
 124 
 125         forwardReferences.clear();
 126     }
 127 
 128     /**
 129      * Scan a JavaScript regexp string returning a Java safe regex string.
 130      *
 131      * @param string
 132      *            JavaScript regexp string.
 133      * @return Java safe regex string.
 134      */
 135     public static RegExpScanner scan(final String string) {
 136         final RegExpScanner scanner = new RegExpScanner(string);
 137 
 138         try {
 139             scanner.disjunction();
 140         } catch (final Exception e) {
 141             throw new PatternSyntaxException(e.getMessage(), string, scanner.position);
 142         }
 143 
 144         // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors
 145         if (scanner.position != string.length()) {
 146             final String p = scanner.getStringBuilder().toString();
 147             throw new PatternSyntaxException(string, p, p.length() + 1);
 148         }
 149 
 150         scanner.processForwardReferences();
 151 
 152         return scanner;
 153     }
 154 
 155     final StringBuilder getStringBuilder() {
 156         return sb;
 157     }
 158 
 159     String getJavaPattern() {
 160         return sb.toString();
 161     }
 162 
 163     BitVector getGroupsInNegativeLookahead() {
 164         BitVector vec = null;
 165         for (int i = 0; i < caps.size(); i++) {
 166             final Capture cap = caps.get(i);
 167             if (cap.negLookaheadLevel > 0) {
 168                 if (vec == null) {
 169                     vec = new BitVector(caps.size() + 1);
 170                 }
 171                 vec.set(i + 1);
 172             }
 173         }
 174         return vec;
 175     }
 176 
 177     /**
 178      * Commit n characters to the builder and to a given token
 179      * @param n     Number of characters.
 180      * @return Committed token
 181      */
 182     private boolean commit(final int n) {
 183         switch (n) {
 184         case 1:
 185             sb.append(ch0);
 186             skip(1);
 187             break;
 188         case 2:
 189             sb.append(ch0);
 190             sb.append(ch1);
 191             skip(2);
 192             break;
 193         case 3:
 194             sb.append(ch0);
 195             sb.append(ch1);
 196             sb.append(ch2);
 197             skip(3);
 198             break;
 199         default:
 200             assert false : "Should not reach here";
 201         }
 202 
 203         return true;
 204     }
 205 
 206     /**
 207      * Restart the buffers back at an earlier position.
 208      *
 209      * @param startIn
 210      *            Position in the input stream.
 211      * @param startOut
 212      *            Position in the output stream.
 213      */
 214     private void restart(final int startIn, final int startOut) {
 215         reset(startIn);
 216         sb.setLength(startOut);
 217     }
 218 
 219     private void push(final char ch) {
 220         expected.put(ch, expected.get(ch) + 1);
 221     }
 222 
 223     private void pop(final char ch) {
 224         expected.put(ch, Math.min(0, expected.get(ch) - 1));
 225     }
 226 
 227     /*
 228      * Recursive descent tokenizer starts below.
 229      */
 230 
 231     /*
 232      * Disjunction ::
 233      *      Alternative
 234      *      Alternative | Disjunction
 235      */
 236     private void disjunction() {
 237         while (true) {
 238             alternative();
 239 
 240             if (ch0 == '|') {
 241                 commit(1);
 242             } else {
 243                 break;
 244             }
 245         }
 246     }
 247 
 248     /*
 249      * Alternative ::
 250      *      [empty]
 251      *      Alternative Term
 252      */
 253     private void alternative() {
 254         while (term()) {
 255             // do nothing
 256         }
 257     }
 258 
 259     /*
 260      * Term ::
 261      *      Assertion
 262      *      Atom
 263      *      Atom Quantifier
 264      */
 265     private boolean term() {
 266         final int startIn  = position;
 267         final int startOut = sb.length();
 268 
 269         if (assertion()) {
 270             return true;
 271         }
 272 
 273         if (atom()) {
 274             quantifier();
 275             return true;
 276         }
 277 
 278         restart(startIn, startOut);
 279         return false;
 280     }
 281 
 282     /*
 283      * Assertion ::
 284      *      ^
 285      *      $
 286      *      \b
 287      *      \B
 288      *      ( ? = Disjunction )
 289      *      ( ? ! Disjunction )
 290      */
 291     private boolean assertion() {
 292         final int startIn  = position;
 293         final int startOut = sb.length();
 294 
 295         switch (ch0) {
 296         case '^':
 297         case '$':
 298             return commit(1);
 299 
 300         case '\\':
 301             if (ch1 == 'b' || ch1 == 'B') {
 302                 return commit(2);
 303             }
 304             break;
 305 
 306         case '(':
 307             if (ch1 != '?') {
 308                 break;
 309             }
 310             if (ch2 != '=' && ch2 != '!') {
 311                 break;
 312             }
 313             final boolean isNegativeLookahead = (ch2 == '!');
 314             commit(3);
 315 
 316             if (isNegativeLookahead) {
 317                 if (negLookaheadLevel == 0) {
 318                     negLookaheadGroup++;
 319                 }
 320                 negLookaheadLevel++;
 321             }
 322             disjunction();
 323             if (isNegativeLookahead) {
 324                 negLookaheadLevel--;
 325             }
 326 
 327             if (ch0 == ')') {
 328                 return commit(1);
 329             }
 330             break;
 331 
 332         default:
 333             break;
 334         }
 335 
 336         restart(startIn, startOut);
 337         return false;
 338     }
 339 
 340     /*
 341      * Quantifier ::
 342      *      QuantifierPrefix
 343      *      QuantifierPrefix ?
 344      */
 345     private boolean quantifier() {
 346         if (quantifierPrefix()) {
 347             if (ch0 == '?') {
 348                 commit(1);
 349             }
 350             return true;
 351         }
 352         return false;
 353     }
 354 
 355     /*
 356      * QuantifierPrefix ::
 357      *      *
 358      *      +
 359      *      ?
 360      *      { DecimalDigits }
 361      *      { DecimalDigits , }
 362      *      { DecimalDigits , DecimalDigits }
 363      */
 364     private boolean quantifierPrefix() {
 365         final int startIn  = position;
 366         final int startOut = sb.length();
 367 
 368         switch (ch0) {
 369         case '*':
 370         case '+':
 371         case '?':
 372             return commit(1);
 373 
 374         case '{':
 375             commit(1);
 376 
 377             if (!decimalDigits()) {
 378                 break; // not a quantifier - back out
 379             }
 380             push('}');
 381 
 382             if (ch0 == ',') {
 383                 commit(1);
 384                 decimalDigits();
 385             }
 386 
 387             if (ch0 == '}') {
 388                 pop('}');
 389                 commit(1);
 390             } else {
 391                 // Bad quantifier should be rejected but is accepted by all major engines
 392                 restart(startIn, startOut);
 393                 return false;
 394             }
 395 
 396             return true;
 397 
 398         default:
 399             break;
 400         }
 401 
 402         restart(startIn, startOut);
 403         return false;
 404     }
 405 
 406     /*
 407      * Atom ::
 408      *      PatternCharacter
 409      *      .
 410      *      \ AtomEscape
 411      *      CharacterClass
 412      *      ( Disjunction )
 413      *      ( ? : Disjunction )
 414      *
 415      */
 416     private boolean atom() {
 417         final int startIn  = position;
 418         final int startOut = sb.length();
 419 
 420         if (patternCharacter()) {
 421             return true;
 422         }
 423 
 424         if (ch0 == '.') {
 425             return commit(1);
 426         }
 427 
 428         if (ch0 == '\\') {
 429             commit(1);
 430 
 431             if (atomEscape()) {
 432                 return true;
 433             }
 434         }
 435 
 436         if (characterClass()) {
 437             return true;
 438         }
 439 
 440         if (ch0 == '(') {
 441             commit(1);
 442             if (ch0 == '?' && ch1 == ':') {
 443                 commit(2);
 444             } else {
 445                 caps.add(new Capture(negLookaheadGroup, negLookaheadLevel));
 446             }
 447 
 448             disjunction();
 449 
 450             if (ch0 == ')') {
 451                 commit(1);
 452                 return true;
 453             }
 454         }
 455 
 456         restart(startIn, startOut);
 457         return false;
 458     }
 459 
 460     /*
 461      * PatternCharacter ::
 462      *      SourceCharacter but not any of: ^$\.*+?()[]{}|
 463      */
 464     @SuppressWarnings("fallthrough")
 465     private boolean patternCharacter() {
 466         if (atEOF()) {
 467             return false;
 468         }
 469 
 470         switch (ch0) {
 471         case '^':
 472         case '$':
 473         case '\\':
 474         case '.':
 475         case '*':
 476         case '+':
 477         case '?':
 478         case '(':
 479         case ')':
 480         case '[':
 481         case '|':
 482             return false;
 483 
 484         case '}':
 485         case ']':
 486             final int n = expected.get(ch0);
 487             if (n != 0) {
 488                 return false;
 489             }
 490 
 491        case '{':
 492            // if not a valid quantifier escape curly brace to match itself
 493            // this ensures compatibility with other JS implementations
 494            if (!quantifierPrefix()) {
 495                sb.append('\\');
 496                return commit(1);
 497            }
 498            return false;
 499 
 500         default:
 501             return commit(1); // SOURCECHARACTER
 502         }
 503     }
 504 
 505     /*
 506      * AtomEscape ::
 507      *      DecimalEscape
 508      *      CharacterEscape
 509      *      CharacterClassEscape
 510      */
 511     private boolean atomEscape() {
 512         // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
 513         return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape();
 514     }
 515 
 516     /*
 517      * CharacterEscape ::
 518      *      ControlEscape
 519      *      c ControlLetter
 520      *      HexEscapeSequence
 521      *      UnicodeEscapeSequence
 522      *      IdentityEscape
 523      */
 524     private boolean characterEscape() {
 525         final int startIn  = position;
 526         final int startOut = sb.length();
 527 
 528         if (controlEscape()) {
 529             return true;
 530         }
 531 
 532         if (ch0 == 'c') {
 533             commit(1);
 534             if (controlLetter()) {
 535                 return true;
 536             }
 537             restart(startIn, startOut);
 538         }
 539 
 540         if (hexEscapeSequence() || unicodeEscapeSequence()) {
 541             return true;
 542         }
 543 
 544         restart(startIn, startOut);
 545         return false;
 546     }
 547 
 548     private boolean scanEscapeSequence(final char leader, final int length) {
 549         final int startIn  = position;
 550         final int startOut = sb.length();
 551 
 552         if (ch0 != leader) {
 553             return false;
 554         }
 555 
 556         commit(1);
 557         for (int i = 0; i < length; i++) {
 558             final char ch0l = Character.toLowerCase(ch0);
 559             if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
 560                 commit(1);
 561             } else {
 562                 restart(startIn, startOut);
 563                 return false;
 564             }
 565         }
 566 
 567         return true;
 568     }
 569 
 570     private boolean hexEscapeSequence() {
 571         return scanEscapeSequence('x', 2);
 572     }
 573 
 574     private boolean unicodeEscapeSequence() {
 575         return scanEscapeSequence('u', 4);
 576     }
 577 
 578     /*
 579      * ControlEscape ::
 580      *      one of fnrtv
 581      */
 582     private boolean controlEscape() {
 583         switch (ch0) {
 584         case 'f':
 585         case 'n':
 586         case 'r':
 587         case 't':
 588         case 'v':
 589             return commit(1);
 590 
 591         default:
 592             return false;
 593         }
 594     }
 595 
 596     /*
 597      * ControlLetter ::
 598      *      one of abcdefghijklmnopqrstuvwxyz
 599      *      ABCDEFGHIJKLMNOPQRSTUVWXYZ
 600      */
 601     private boolean controlLetter() {
 602         // To match other engines we also accept '0'..'9' and '_' as control letters inside a character class.
 603         if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z')
 604                 || (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) {
 605             // for some reason java regexps don't like control characters on the
 606             // form "\\ca".match([string with ascii 1 at char0]). Translating
 607             // them to unicode does it though.
 608             sb.setLength(sb.length() - 1);
 609             unicode(ch0 % 32, sb);
 610             skip(1);
 611             return true;
 612         }
 613         return false;
 614     }
 615 
 616     /*
 617      * IdentityEscape ::
 618      *      SourceCharacter but not IdentifierPart
 619      *      <ZWJ>  (200c)
 620      *      <ZWNJ> (200d)
 621      */
 622     private boolean identityEscape() {
 623         if (atEOF()) {
 624             throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException
 625         }
 626         // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here.
 627         if (ch0 == 'c') {
 628             sb.append('\\'); // Treat invalid \c control sequence as \\c
 629         } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) {
 630             sb.setLength(sb.length() - 1);
 631         }
 632         return commit(1);
 633     }
 634 
 635     /*
 636      * DecimalEscape ::
 637      *      DecimalIntegerLiteral [lookahead DecimalDigit]
 638      */
 639     private boolean decimalEscape() {
 640         final int startIn  = position;
 641         final int startOut = sb.length();
 642 
 643         if (ch0 == '0' && !isOctalDigit(ch1)) {
 644             skip(1);
 645             //  DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
 646             sb.append("\u0000");
 647             return true;
 648         }
 649 
 650         if (isDecimalDigit(ch0)) {
 651 
 652             if (ch0 == '0') {
 653                 // We know this is an octal escape.
 654                 if (inCharClass) {
 655                     // Convert octal escape to unicode escape if inside character class.
 656                     int octalValue = 0;
 657                     while (isOctalDigit(ch0)) {
 658                         octalValue = octalValue * 8 + ch0 - '0';
 659                         skip(1);
 660                     }
 661 
 662                     unicode(octalValue, sb);
 663 
 664                 } else {
 665                     // Copy decimal escape as-is
 666                     decimalDigits();
 667                 }
 668             } else {
 669                 // This should be a backreference, but could also be an octal escape or even a literal string.
 670                 int decimalValue = 0;
 671                 while (isDecimalDigit(ch0)) {
 672                     decimalValue = decimalValue * 10 + ch0 - '0';
 673                     skip(1);
 674                 }
 675 
 676                 if (inCharClass) {
 677                     // No backreferences in character classes. Encode as unicode escape or literal char sequence
 678                     sb.setLength(sb.length() - 1);
 679                     octalOrLiteral(Integer.toString(decimalValue), sb);
 680 
 681                 } else if (decimalValue <= caps.size()) {
 682                     //  Captures inside a negative lookahead are undefined when referenced from the outside.
 683                     final Capture capture = caps.get(decimalValue - 1);
 684                     if (!capture.canBeReferencedFrom(negLookaheadGroup, negLookaheadLevel)) {
 685                         // Outside reference to capture in negative lookahead, omit from output buffer.
 686                         sb.setLength(sb.length() - 1);
 687                     } else {
 688                         // Append backreference to output buffer.
 689                         sb.append(decimalValue);
 690                     }
 691                 } else {
 692                     // Forward references to a capture group are always undefined so we can omit it from the output buffer.
 693                     // However, if the target capture does not exist, we need to rewrite the reference as hex escape
 694                     // or literal string, so register the reference for later processing.
 695                     sb.setLength(sb.length() - 1);
 696                     forwardReferences.add(decimalValue);
 697                     forwardReferences.add(sb.length());
 698                 }
 699 
 700             }
 701             return true;
 702         }
 703 
 704         restart(startIn, startOut);
 705         return false;
 706     }
 707 
 708     /*
 709      * CharacterClassEscape ::
 710      *  one of dDsSwW
 711      */
 712     private boolean characterClassEscape() {
 713         switch (ch0) {
 714         // java.util.regex requires translation of \s and \S to explicit character list
 715         case 's':
 716             if (RegExpFactory.usesJavaUtilRegex()) {
 717                 sb.setLength(sb.length() - 1);
 718                 // No nested class required if we already are inside a character class
 719                 if (inCharClass) {
 720                     sb.append(Lexer.getWhitespaceRegExp());
 721                 } else {
 722                     sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']');
 723                 }
 724                 skip(1);
 725                 return true;
 726             }
 727             return commit(1);
 728         case 'S':
 729             if (RegExpFactory.usesJavaUtilRegex()) {
 730                 sb.setLength(sb.length() - 1);
 731                 // In negative class we must use intersection to get double negation ("not anything else than space")
 732                 sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']');
 733                 skip(1);
 734                 return true;
 735             }
 736             return commit(1);
 737         case 'd':
 738         case 'D':
 739         case 'w':
 740         case 'W':
 741             return commit(1);
 742 
 743         default:
 744             return false;
 745         }
 746     }
 747 
 748     /*
 749      * CharacterClass ::
 750      *      [ [lookahead {^}] ClassRanges ]
 751      *      [ ^ ClassRanges ]
 752      */
 753     private boolean characterClass() {
 754         final int startIn  = position;
 755         final int startOut = sb.length();
 756 
 757         if (ch0 == '[') {
 758             try {
 759                 inCharClass = true;
 760                 push(']');
 761                 commit(1);
 762 
 763                 if (ch0 == '^') {
 764                     inNegativeClass = true;
 765                     commit(1);
 766                 }
 767 
 768                 if (classRanges() && ch0 == ']') {
 769                     pop(']');
 770                     commit(1);
 771 
 772                     // Substitute empty character classes [] and [^] that never or always match
 773                     if (position == startIn + 2) {
 774                         sb.setLength(sb.length() - 1);
 775                         sb.append("^\\s\\S]");
 776                     } else if (position == startIn + 3 && inNegativeClass) {
 777                         sb.setLength(sb.length() - 2);
 778                         sb.append("\\s\\S]");
 779                     }
 780 
 781                     return true;
 782                 }
 783             } finally {
 784                 inCharClass = false;  // no nested character classes in JavaScript
 785                 inNegativeClass = false;
 786             }
 787         }
 788 
 789         restart(startIn, startOut);
 790         return false;
 791     }
 792 
 793     /*
 794      * ClassRanges ::
 795      *      [empty]
 796      *      NonemptyClassRanges
 797      */
 798     private boolean classRanges() {
 799         nonemptyClassRanges();
 800         return true;
 801     }
 802 
 803     /*
 804      * NonemptyClassRanges ::
 805      *      ClassAtom
 806      *      ClassAtom NonemptyClassRangesNoDash
 807      *      ClassAtom - ClassAtom ClassRanges
 808      */
 809     private boolean nonemptyClassRanges() {
 810         final int startIn  = position;
 811         final int startOut = sb.length();
 812 
 813         if (classAtom()) {
 814 
 815             if (ch0 == '-') {
 816                 commit(1);
 817 
 818                 if (classAtom() && classRanges()) {
 819                     return true;
 820                 }
 821             }
 822 
 823             nonemptyClassRangesNoDash();
 824 
 825             return true;
 826         }
 827 
 828         restart(startIn, startOut);
 829         return false;
 830     }
 831 
 832     /*
 833      * NonemptyClassRangesNoDash ::
 834      *      ClassAtom
 835      *      ClassAtomNoDash NonemptyClassRangesNoDash
 836      *      ClassAtomNoDash - ClassAtom ClassRanges
 837      */
 838     private boolean nonemptyClassRangesNoDash() {
 839         final int startIn  = position;
 840         final int startOut = sb.length();
 841 
 842         if (classAtomNoDash()) {
 843 
 844             // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
 845             if (ch0 == '-') {
 846                commit(1);
 847 
 848                if (classAtom() && classRanges()) {
 849                    return true;
 850                }
 851                //fallthru
 852            }
 853 
 854             nonemptyClassRangesNoDash();
 855             return true; // still a class atom
 856         }
 857 
 858         if (classAtom()) {
 859             return true;
 860         }
 861 
 862         restart(startIn, startOut);
 863         return false;
 864     }
 865 
 866     /*
 867      * ClassAtom : - ClassAtomNoDash
 868      */
 869     private boolean classAtom() {
 870 
 871         if (ch0 == '-') {
 872             return commit(1);
 873         }
 874 
 875         return classAtomNoDash();
 876     }
 877 
 878     /*
 879      * ClassAtomNoDash ::
 880      *      SourceCharacter but not one of \ or ] or -
 881      *      \ ClassEscape
 882      */
 883     private boolean classAtomNoDash() {
 884         if (atEOF()) {
 885             return false;
 886         }
 887         final int startIn  = position;
 888         final int startOut = sb.length();
 889 
 890         switch (ch0) {
 891         case ']':
 892         case '-':
 893             return false;
 894 
 895         case '[':
 896             // unescaped left square bracket - add escape
 897             sb.append('\\');
 898             return commit(1);
 899 
 900         case '\\':
 901             commit(1);
 902             if (classEscape()) {
 903                 return true;
 904             }
 905 
 906             restart(startIn, startOut);
 907             return false;
 908 
 909         default:
 910             return commit(1);
 911         }
 912     }
 913 
 914     /*
 915      * ClassEscape ::
 916      *      DecimalEscape
 917      *      b
 918      *      CharacterEscape
 919      *      CharacterClassEscape
 920      */
 921     private boolean classEscape() {
 922 
 923         if (decimalEscape()) {
 924             return true;
 925         }
 926 
 927         if (ch0 == 'b') {
 928             sb.setLength(sb.length() - 1);
 929             sb.append('\b');
 930             skip(1);
 931             return true;
 932         }
 933 
 934         // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all
 935         return characterEscape() || characterClassEscape() || identityEscape();
 936     }
 937 
 938     /*
 939      * DecimalDigits
 940      */
 941     private boolean decimalDigits() {
 942         if (!isDecimalDigit(ch0)) {
 943             return false;
 944         }
 945 
 946         while (isDecimalDigit(ch0)) {
 947             commit(1);
 948         }
 949 
 950         return true;
 951     }
 952 
 953     private static void unicode(final int value, final StringBuilder buffer) {
 954         final String hex = Integer.toHexString(value);
 955         buffer.append('u');
 956         for (int i = 0; i < 4 - hex.length(); i++) {
 957             buffer.append('0');
 958         }
 959         buffer.append(hex);
 960     }
 961 
 962     // Convert what would have been a backreference into a unicode escape, or a number literal, or both.
 963     private static void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) {
 964         final int length = numberLiteral.length();
 965         int octalValue = 0;
 966         int pos = 0;
 967         // Maximum value for octal escape is 0377 (255) so we stop the loop at 32
 968         while (pos < length && octalValue < 0x20) {
 969             final char ch = numberLiteral.charAt(pos);
 970             if (isOctalDigit(ch)) {
 971                 octalValue = octalValue * 8 + ch - '0';
 972             } else {
 973                 break;
 974             }
 975             pos++;
 976         }
 977         if (octalValue > 0) {
 978             buffer.append('\\');
 979             unicode(octalValue, buffer);
 980             buffer.append(numberLiteral.substring(pos));
 981         } else {
 982             buffer.append(numberLiteral);
 983         }
 984     }
 985 
 986     private static boolean isOctalDigit(final char ch) {
 987         return ch >= '0' && ch <= '7';
 988     }
 989 
 990     private static boolean isDecimalDigit(final char ch) {
 991         return ch >= '0' && ch <= '9';
 992     }
 993 }