1 /* 2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package jdk.nashorn.internal.parser; 27 28 import java.util.ArrayList; 29 import java.util.HashMap; 30 import java.util.Iterator; 31 import java.util.LinkedHashMap; 32 import java.util.LinkedList; 33 import java.util.List; 34 import java.util.Map; 35 import java.util.regex.PatternSyntaxException; 36 import jdk.nashorn.internal.runtime.BitVector; 37 38 /** 39 * Scan a JavaScript regexp, converting to Java regex if necessary. 40 * 41 */ 42 public class RegExpScanner extends Scanner { 43 44 /** 45 * String builder to accumulate the result - this contains verbatim parsed JavaScript. 46 * to get the java equivalent we need to create a Pattern token and return its toString() 47 */ 48 private final StringBuilder sb; 49 50 /** An optional error message if one occurred during parse. */ 51 private String errorMessage; 52 53 /** Is this the special case of a regexp that never matches anything */ 54 private boolean neverMatches; 55 56 /** The resulting java.util.regex pattern string. */ 57 private String javaPattern; 58 59 /** Expected token table */ 60 private final Map<Character, Integer> expected = new HashMap<>(); 61 62 /** Capturing parenthesis that have been found so far. */ 63 private final List<Capture> caps = new LinkedList<>(); 64 65 /** Forward references to capturing parenthesis to be resolved later.*/ 66 private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>(); 67 68 /** Current level of zero-width negative lookahead assertions. */ 69 private int negativeLookaheadLevel; 70 71 private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; 72 73 private static class Capture { 74 /** 75 * Zero-width negative lookaheads enclosing the capture. 76 */ 77 private final int negativeLookaheadLevel; 78 /** 79 * Captures that live inside a negative lookahead are dead after the 80 * lookahead and will be undefined if referenced from outside. 81 */ 82 private boolean isDead; 83 84 Capture(final int negativeLookaheadLevel) { 85 this.negativeLookaheadLevel = negativeLookaheadLevel; 86 } 87 88 public int getNegativeLookaheadLevel() { 89 return negativeLookaheadLevel; 90 } 91 92 public boolean isDead() { 93 return isDead; 94 } 95 96 public void setDead() { 97 this.isDead = true; 98 } 99 } 100 101 /** 102 * This is a token - the JavaScript regexp is scanned into a token tree 103 * A token has other tokens as children as well as "atoms", i.e. Strings. 104 * 105 */ 106 private static class Token { 107 108 private enum Type { 109 PATTERN, 110 DISJUNCTION, 111 ALTERNATIVE, 112 TERM, 113 ASSERTION, 114 QUANTIFIER, 115 QUANTIFIER_PREFIX, 116 ATOM, 117 PATTERN_CHARACTER, 118 ATOM_ESCAPE, 119 CHARACTER_ESCAPE, 120 CONTROL_ESCAPE, 121 CONTROL_LETTER, 122 IDENTITY_ESCAPE, 123 DECIMAL_ESCAPE, 124 CHARACTERCLASS_ESCAPE, 125 CHARACTERCLASS, 126 CLASSRANGES, 127 NON_EMPTY_CLASSRANGES, 128 NON_EMPTY_CLASSRANGES_NODASH, 129 CLASSATOM, 130 CLASSATOM_NODASH, 131 CLASS_ESCAPE, 132 DECIMALDIGITS, 133 HEX_ESCAPESEQUENCE, 134 UNICODE_ESCAPESEQUENCE, 135 } 136 137 /** 138 * Token tyoe 139 */ 140 private final Token.Type type; 141 142 /** 143 * Child nodes 144 */ 145 private final List<Object> children; 146 147 /** 148 * Parent node 149 */ 150 private Token parent; 151 152 /** 153 * Dead code flag 154 */ 155 private boolean isDead; 156 157 private static final Map<Type, ToString> toStringMap = new HashMap<>(); 158 private static final ToString DEFAULT_TOSTRING = new ToString(); 159 160 private static String unicode(final int value) { 161 final StringBuilder sb = new StringBuilder(); 162 final String hex = Integer.toHexString(value); 163 sb.append('u'); 164 for (int i = 0; i < 4 - hex.length(); i++) { 165 sb.append('0'); 166 } 167 sb.append(hex); 168 169 return sb.toString(); 170 } 171 172 static { 173 toStringMap.put(Type.CHARACTERCLASS, new ToString() { 174 @Override 175 public String toString(final Token token) { 176 return super.toString(token).replace("\\b", "\b"); 177 } 178 }); 179 180 // for some reason java regexps don't like control characters on the 181 // form "\\ca".match([string with ascii 1 at char0]). Translating 182 // them to unicode does it though. 183 toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() { 184 @Override 185 public String toString(final Token token) { 186 final String str = super.toString(token); 187 if (str.length() == 2) { 188 return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1); 189 } 190 return str; 191 } 192 }); 193 194 toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() { 195 @Override 196 public String toString(final Token token) { 197 final String str = super.toString(token); 198 199 if ("\0".equals(str)) { 200 return str; 201 } 202 203 int value; 204 205 if (!token.hasParentOfType(Type.CLASSRANGES)) { 206 return str; 207 } 208 209 value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal 210 if (value > 0xff) { 211 throw new NumberFormatException(str); 212 } 213 214 return Token.unicode(value); 215 } 216 }); 217 218 } 219 220 /** 221 * JavaScript Token to Java regex substring framework. 222 * 223 */ 224 private static class ToString { 225 String toString(final Token token) { 226 final StringBuilder sb = new StringBuilder(); 227 for (final Object child : token.getChildren()) { 228 sb.append(child); 229 } 230 231 //perform global substitutions that hold true for any evaluated form 232 String str = sb.toString(); 233 if (str.equals("[^]")) { 234 str = "[\\s\\S]"; 235 } 236 return str; 237 } 238 } 239 240 /** 241 * Token iterator. Doesn't return "atom" children. i.e. string representations, 242 * just tokens 243 * 244 */ 245 private static class TokenIterator implements Iterator<Token> { 246 private final List<Token> preorder; 247 248 private void init(final Token root) { 249 preorder.add(root); 250 for (final Object child : root.getChildren()) { 251 if (child instanceof Token) { 252 init((Token)child); 253 } 254 } 255 } 256 257 TokenIterator(final Token root) { 258 preorder = new ArrayList<>(); 259 init(root); 260 } 261 262 @Override 263 public boolean hasNext() { 264 return !preorder.isEmpty(); 265 } 266 267 @Override 268 public Token next() { 269 return preorder.remove(0); 270 } 271 272 @Override 273 public void remove() { 274 next(); 275 } 276 } 277 278 /** 279 * Constructor 280 * @param type the token type 281 */ 282 Token(final Token.Type type) { 283 this.type = type; 284 children = new ArrayList<>(); 285 } 286 287 /** 288 * Add a an "atom" child to a token 289 * @param child the child to add 290 * @return the token (for chaining) 291 */ 292 public Token add(final String child) { 293 children.add(child); 294 return this; 295 } 296 297 /** 298 * Add a child to a token 299 * @param child the child 300 * @return the token (for chaining) 301 */ 302 public Token add(final Token child) { 303 if (child != null) { 304 children.add(child); 305 child.setParent(this); 306 } 307 return this; 308 } 309 310 /** 311 * Remove a child from a token 312 * @param child the child to remove 313 * @return true if successful 314 */ 315 public boolean remove(final Token child) { 316 return children.remove(child); 317 } 318 319 /** 320 * Remove the last child from a token 321 * @return the removed child 322 */ 323 public Object removeLast() { 324 return children.remove(children.size() - 1); 325 } 326 327 /** 328 * Flag this token as dead code 329 * @param isDead is it dead or not 330 */ 331 private void setIsDead(final boolean isDead) { 332 this.isDead = isDead; 333 } 334 335 /** 336 * Is this token dead code 337 * @return boolean 338 */ 339 private boolean getIsDead() { 340 return isDead; 341 } 342 343 /** 344 * Get the parent of this token 345 * @return parent token 346 */ 347 public Token getParent() { 348 return parent; 349 } 350 351 public boolean hasParentOfType(final Token.Type parentType) { 352 for (Token p = getParent(); p != null; p = p.getParent()) { 353 if (p.getType() == parentType) { 354 return true; 355 } 356 } 357 return false; 358 } 359 360 public boolean hasChildOfType(final Token.Type childType) { 361 for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) { 362 if (iter.next().getType() == childType) { 363 return true; 364 } 365 } 366 return false; 367 } 368 369 /** 370 * Set the parent of this token 371 * @param parent 372 */ 373 private void setParent(final Token parent) { 374 this.parent = parent; 375 } 376 377 /** 378 * Get the children of this token 379 * @return an array of children, never null 380 */ 381 public Object[] getChildren() { 382 return children.toArray(); 383 } 384 385 /** 386 * Reset this token, remove all children 387 */ 388 public void reset() { 389 children.clear(); 390 } 391 392 /** 393 * Get a preorder token iterator with this token as root 394 * @return an iterator 395 */ 396 public Iterator<Token> iterator() { 397 return new TokenIterator(this); 398 } 399 400 /** 401 * Get the type of this token 402 * @return type 403 */ 404 public Type getType() { 405 return type; 406 } 407 408 /** 409 * Turn this token into Java regexp compatible text 410 * @return part of a java regexp 411 */ 412 @Override 413 public String toString() { 414 ToString t = toStringMap.get(getType()); 415 if (t == null) { 416 t = DEFAULT_TOSTRING; 417 } 418 return t.toString(this); 419 } 420 } 421 422 /** 423 * Constructor 424 * @param string the JavaScript regexp to parse 425 */ 426 private RegExpScanner(final String string) { 427 super(string); 428 sb = new StringBuilder(limit); 429 reset(0); 430 expected.put(']', 0); 431 expected.put('}', 0); 432 } 433 434 private void processForwardReferences() { 435 if (neverMatches()) { 436 return; 437 } 438 439 for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) { 440 if (fwdRef.getKey().intValue() > caps.size()) { 441 neverMatches = true; 442 break; 443 } 444 445 fwdRef.getValue().setIsDead(true); 446 } 447 448 forwardReferences.clear(); 449 } 450 451 /** 452 * Scan a JavaScript regexp string returning a Java safe regex string. 453 * 454 * @param string 455 * JavaScript regexp string. 456 * @return Java safe regex string. 457 */ 458 public static RegExpScanner scan(final String string) { 459 final RegExpScanner scanner = new RegExpScanner(string); 460 461 Token pattern; 462 463 try { 464 pattern = scanner.pattern(); 465 } catch (final Exception e) { 466 throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length()); 467 } 468 469 scanner.processForwardReferences(); 470 if (scanner.neverMatches()) { 471 return null; // never matches 472 } 473 474 // go over the code and remove dead code 475 final Iterator<Token> iter = pattern.iterator(); 476 while (iter.hasNext()) { 477 final Token next = iter.next(); 478 if (next.getIsDead()) { 479 next.getParent().remove(next); 480 } 481 } 482 483 // turn the pattern into a string, p, the java equivalent string for our js regexp 484 final String p = pattern.toString(); 485 // if builder contains all tokens that were sent in, we know 486 // we correctly parsed the entire JavaScript regexp without syntax errors 487 if (!string.equals(scanner.getStringBuilder().toString())) { 488 throw new PatternSyntaxException(string, p, p.length() + 1); 489 } 490 491 scanner.javaPattern = p; 492 return scanner; 493 } 494 495 /** 496 * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript, 497 * is an example where we never match 498 * 499 * @return boolean 500 */ 501 private boolean neverMatches() { 502 return neverMatches; 503 } 504 505 /** 506 * This is used to set better error messages that can be reused 507 * in NativeRegExp for augmenting e.g. SyntaxErrors. 508 * 509 * @return an error message or null if no extra info 510 */ 511 public String getErrorMessage() { 512 return errorMessage; 513 } 514 515 final StringBuilder getStringBuilder() { 516 return sb; 517 } 518 519 String getJavaPattern() { 520 return javaPattern; 521 } 522 523 BitVector getGroupsInNegativeLookahead() { 524 BitVector vec = null; 525 for (int i = 0; i < caps.size(); i++) { 526 final Capture cap = caps.get(i); 527 if (cap.getNegativeLookaheadLevel() > 0) { 528 if (vec == null) { 529 vec = new BitVector(caps.size() + 1); 530 } 531 vec.set(i + 1); 532 } 533 } 534 return vec; 535 } 536 537 /** 538 * Commit n characters to the builder and to a given token 539 * @param token Uncommitted token. 540 * @param n Number of characters. 541 * @return Committed token 542 */ 543 private Token commit(final Token token, final int n) { 544 final int startIn = position; 545 546 switch (n) { 547 case 1: 548 sb.append(ch0); 549 skip(1); 550 break; 551 case 2: 552 sb.append(ch0); 553 sb.append(ch1); 554 skip(2); 555 break; 556 case 3: 557 sb.append(ch0); 558 sb.append(ch1); 559 sb.append(ch2); 560 skip(3); 561 break; 562 default: 563 assert false : "Should not reach here"; 564 } 565 566 if (token == null) { 567 return null; 568 } 569 570 return token.add(sb.substring(startIn, sb.length())); 571 } 572 573 /** 574 * Restart the buffers back at an earlier position. 575 * 576 * @param startIn 577 * Position in the input stream. 578 * @param startOut 579 * Position in the output stream. 580 */ 581 private void restart(final int startIn, final int startOut) { 582 reset(startIn); 583 sb.setLength(startOut); 584 } 585 586 private void push(final char ch) { 587 expected.put(ch, expected.get(ch) + 1); 588 } 589 590 private void pop(final char ch) { 591 expected.put(ch, Math.min(0, expected.get(ch) - 1)); 592 } 593 594 /* 595 * Recursive descent tokenizer starts below. 596 */ 597 598 /* 599 * Pattern :: 600 * Disjunction 601 */ 602 private Token pattern() { 603 final Token token = new Token(Token.Type.PATTERN); 604 605 final Token child = disjunction(); 606 if (child != null) { 607 return token.add(child); 608 } 609 610 return null; 611 } 612 613 /* 614 * Disjunction :: 615 * Alternative 616 * Alternative | Disjunction 617 */ 618 private Token disjunction() { 619 final Token token = new Token(Token.Type.DISJUNCTION); 620 621 while (true) { 622 token.add(alternative()); 623 624 if (ch0 == '|') { 625 commit(token, 1); 626 } else { 627 break; 628 } 629 } 630 631 return token; 632 } 633 634 /* 635 * Alternative :: 636 * [empty] 637 * Alternative Term 638 */ 639 private Token alternative() { 640 final Token token = new Token(Token.Type.ALTERNATIVE); 641 642 Token child; 643 while ((child = term()) != null) { 644 token.add(child); 645 } 646 647 return token; 648 } 649 650 /* 651 * Term :: 652 * Assertion 653 * Atom 654 * Atom Quantifier 655 */ 656 private Token term() { 657 final int startIn = position; 658 final int startOut = sb.length(); 659 final Token token = new Token(Token.Type.TERM); 660 Token child; 661 662 child = assertion(); 663 if (child != null) { 664 return token.add(child); 665 } 666 667 child = atom(); 668 if (child != null) { 669 boolean emptyCharacterClass = false; 670 if ("[]".equals(child.toString())) { 671 emptyCharacterClass = true; 672 } 673 674 token.add(child); 675 676 final Token quantifier = quantifier(); 677 if (quantifier != null) { 678 token.add(quantifier); 679 } 680 681 if (emptyCharacterClass) { 682 if (quantifier == null) { 683 neverMatches = true; //never matches ever. 684 } else { 685 //if we can get away with max zero, remove this entire token 686 final String qs = quantifier.toString(); 687 if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) { 688 token.setIsDead(true); 689 } 690 } 691 } 692 693 return token; 694 } 695 696 restart(startIn, startOut); 697 return null; 698 } 699 700 /* 701 * Assertion :: 702 * ^ 703 * $ 704 * \b 705 * \B 706 * ( ? = Disjunction ) 707 * ( ? ! Disjunction ) 708 */ 709 private Token assertion() { 710 final int startIn = position; 711 final int startOut = sb.length(); 712 final Token token = new Token(Token.Type.ASSERTION); 713 714 switch (ch0) { 715 case '^': 716 case '$': 717 return commit(token, 1); 718 719 case '\\': 720 if (ch1 == 'b' || ch1 == 'B') { 721 return commit(token, 2); 722 } 723 break; 724 725 case '(': 726 if (ch1 != '?') { 727 break; 728 } 729 if (ch2 != '=' && ch2 != '!') { 730 break; 731 } 732 final boolean isNegativeLookahead = (ch2 == '!'); 733 commit(token, 3); 734 735 if (isNegativeLookahead) { 736 negativeLookaheadLevel++; 737 } 738 final Token disjunction = disjunction(); 739 if (isNegativeLookahead) { 740 for (final Capture cap : caps) { 741 if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) { 742 cap.setDead(); 743 } 744 } 745 negativeLookaheadLevel--; 746 } 747 748 if (disjunction != null && ch0 == ')') { 749 token.add(disjunction); 750 return commit(token, 1); 751 } 752 break; 753 754 default: 755 break; 756 } 757 758 restart(startIn, startOut); 759 760 return null; 761 } 762 763 /* 764 * Quantifier :: 765 * QuantifierPrefix 766 * QuantifierPrefix ? 767 */ 768 private Token quantifier() { 769 final Token token = new Token(Token.Type.QUANTIFIER); 770 final Token child = quantifierPrefix(); 771 if (child != null) { 772 token.add(child); 773 if (ch0 == '?') { 774 commit(token, 1); 775 } 776 return token; 777 } 778 return null; 779 } 780 781 /* 782 * QuantifierPrefix :: 783 * * 784 * + 785 * ? 786 * { DecimalDigits } 787 * { DecimalDigits , } 788 * { DecimalDigits , DecimalDigits } 789 */ 790 private Token quantifierPrefix() { 791 final int startIn = position; 792 final int startOut = sb.length(); 793 final Token token = new Token(Token.Type.QUANTIFIER_PREFIX); 794 795 switch (ch0) { 796 case '*': 797 case '+': 798 case '?': 799 return commit(token, 1); 800 801 case '{': 802 commit(token, 1); 803 804 final Token child = decimalDigits(); 805 if (child == null) { 806 break; // not a quantifier - back out 807 } 808 push('}'); 809 token.add(child); 810 811 if (ch0 == ',') { 812 commit(token, 1); 813 token.add(decimalDigits()); 814 } 815 816 if (ch0 == '}') { 817 pop('}'); 818 commit(token, 1); 819 } 820 821 return token; 822 823 default: 824 break; 825 } 826 827 restart(startIn, startOut); 828 return null; 829 } 830 831 /* 832 * Atom :: 833 * PatternCharacter 834 * . 835 * \ AtomEscape 836 * CharacterClass 837 * ( Disjunction ) 838 * ( ? : Disjunction ) 839 * 840 */ 841 private Token atom() { 842 final int startIn = position; 843 final int startOut = sb.length(); 844 final Token token = new Token(Token.Type.ATOM); 845 Token child; 846 847 child = patternCharacter(); 848 if (child != null) { 849 return token.add(child); 850 } 851 852 if (ch0 == '.') { 853 return commit(token, 1); 854 } 855 856 if (ch0 == '\\') { 857 commit(token, 1); 858 child = atomEscape(); 859 860 if (child != null) { 861 if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) { 862 final char idEscape = child.toString().charAt(0); 863 if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) { 864 token.reset(); 865 } 866 } 867 868 token.add(child); 869 870 // forward backreferences always match empty. JavaScript != Java 871 if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) { 872 final int refNum = Integer.parseInt(child.toString()); 873 874 if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) { 875 // reference to dead in-negative-lookahead capture 876 token.setIsDead(true); 877 } else if (caps.size() < refNum) { 878 // forward reference: always matches against empty string (dead token). 879 // invalid reference (non-existant capture): pattern never matches. 880 forwardReferences.put(refNum, token); 881 } 882 } 883 884 return token; 885 } 886 } 887 888 child = characterClass(); 889 if (child != null) { 890 return token.add(child); 891 } 892 893 if (ch0 == '(') { 894 boolean capturingParens = true; 895 commit(token, 1); 896 if (ch0 == '?' && ch1 == ':') { 897 capturingParens = false; 898 commit(token, 2); 899 } 900 901 child = disjunction(); 902 if (child != null) { 903 token.add(child); 904 if (ch0 == ')') { 905 final Token atom = commit(token, 1); 906 if (capturingParens) { 907 caps.add(new Capture(negativeLookaheadLevel)); 908 } 909 return atom; 910 } 911 } 912 } 913 914 restart(startIn, startOut); 915 return null; 916 } 917 918 /* 919 * PatternCharacter :: 920 * SourceCharacter but not any of: ^$\.*+?()[]{}| 921 */ 922 @SuppressWarnings("fallthrough") 923 private Token patternCharacter() { 924 if (atEOF()) { 925 return null; 926 } 927 928 switch (ch0) { 929 case '^': 930 case '$': 931 case '\\': 932 case '.': 933 case '*': 934 case '+': 935 case '?': 936 case '(': 937 case ')': 938 case '[': 939 case '|': 940 return null; 941 942 case '}': 943 case ']': 944 final int n = expected.get(ch0); 945 if (n != 0) { 946 return null; 947 } 948 949 case '{': 950 // if not a valid quantifier escape curly brace to match itself 951 // this ensures compatibility with other JS implementations 952 final Token quant = quantifierPrefix(); 953 return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null; 954 955 default: 956 return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER 957 } 958 } 959 960 /* 961 * AtomEscape :: 962 * DecimalEscape 963 * CharacterEscape 964 * CharacterClassEscape 965 */ 966 private Token atomEscape() { 967 final Token token = new Token(Token.Type.ATOM_ESCAPE); 968 Token child; 969 970 child = decimalEscape(); 971 if (child != null) { 972 return token.add(child); 973 } 974 975 child = characterClassEscape(); 976 if (child != null) { 977 return token.add(child); 978 } 979 980 child = characterEscape(); 981 if (child != null) { 982 return token.add(child); 983 } 984 985 986 return null; 987 } 988 989 /* 990 * CharacterEscape :: 991 * ControlEscape 992 * c ControlLetter 993 * HexEscapeSequence 994 * UnicodeEscapeSequence 995 * IdentityEscape 996 */ 997 private Token characterEscape() { 998 final int startIn = position; 999 final int startOut = sb.length(); 1000 1001 final Token token = new Token(Token.Type.CHARACTER_ESCAPE); 1002 Token child; 1003 1004 child = controlEscape(); 1005 if (child != null) { 1006 return token.add(child); 1007 } 1008 1009 if (ch0 == 'c') { 1010 commit(token, 1); 1011 child = controlLetter(); 1012 if (child != null) { 1013 return token.add(child); 1014 } 1015 restart(startIn, startOut); 1016 } 1017 1018 child = hexEscapeSequence(); 1019 if (child != null) { 1020 return token.add(child); 1021 } 1022 1023 child = unicodeEscapeSequence(); 1024 if (child != null) { 1025 return token.add(child); 1026 } 1027 1028 child = identityEscape(); 1029 if (child != null) { 1030 return token.add(child); 1031 } 1032 1033 restart(startIn, startOut); 1034 1035 return null; 1036 } 1037 1038 private boolean scanEscapeSequence(final char leader, final int length, final Token token) { 1039 final int startIn = position; 1040 final int startOut = sb.length(); 1041 1042 if (ch0 != leader) { 1043 return false; 1044 } 1045 1046 commit(token, 1); 1047 for (int i = 0; i < length; i++) { 1048 final char ch0l = Character.toLowerCase(ch0); 1049 if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { 1050 commit(token, 1); 1051 } else { 1052 restart(startIn, startOut); 1053 return false; 1054 } 1055 } 1056 1057 return true; 1058 } 1059 1060 private Token hexEscapeSequence() { 1061 final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE); 1062 if (scanEscapeSequence('x', 2, token)) { 1063 return token; 1064 } 1065 return null; 1066 } 1067 1068 private Token unicodeEscapeSequence() { 1069 final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE); 1070 if (scanEscapeSequence('u', 4, token)) { 1071 return token; 1072 } 1073 return null; 1074 } 1075 1076 /* 1077 * ControlEscape :: 1078 * one of fnrtv 1079 */ 1080 private Token controlEscape() { 1081 switch (ch0) { 1082 case 'f': 1083 case 'n': 1084 case 'r': 1085 case 't': 1086 case 'v': 1087 return commit(new Token(Token.Type.CONTROL_ESCAPE), 1); 1088 1089 default: 1090 return null; 1091 } 1092 } 1093 1094 /* 1095 * ControlLetter :: 1096 * one of abcdefghijklmnopqrstuvwxyz 1097 * ABCDEFGHIJKLMNOPQRSTUVWXYZ 1098 */ 1099 private Token controlLetter() { 1100 final char c = Character.toUpperCase(ch0); 1101 if (c >= 'A' && c <= 'Z') { 1102 final Token token = new Token(Token.Type.CONTROL_LETTER); 1103 commit(token, 1); 1104 return token; 1105 } 1106 return null; 1107 /* 1108 Token token = new Token(Token.Type.CONTROL_LETTER); 1109 commit(null, 1);//add original char to builder not to token 1110 this.neverMatches = c < 'A' || c > 'Z'; 1111 return token.add(""+c);*/ 1112 } 1113 1114 /* 1115 * IdentityEscape :: 1116 * SourceCharacter but not IdentifierPart 1117 * <ZWJ> (200c) 1118 * <ZWNJ> (200d) 1119 */ 1120 private Token identityEscape() { 1121 final Token token = new Token(Token.Type.IDENTITY_ESCAPE); 1122 commit(token, 1); 1123 return token; 1124 } 1125 1126 /* 1127 * DecimalEscape :: 1128 * DecimalIntegerLiteral [lookahead DecimalDigit] 1129 */ 1130 private Token decimalEscape() { 1131 final Token token = new Token(Token.Type.DECIMAL_ESCAPE); 1132 final int startIn = position; 1133 final int startOut = sb.length(); 1134 1135 if (ch0 == '0' && !isDecimalDigit(ch1)) { 1136 commit(token, 1); 1137 token.removeLast(); 1138 // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); 1139 return token.add("\u0000"); 1140 } 1141 1142 if (isDecimalDigit(ch0)) { 1143 while (isDecimalDigit(ch0)) { 1144 commit(token, 1); 1145 } 1146 return token; 1147 } 1148 1149 restart(startIn, startOut); 1150 1151 return null; 1152 } 1153 1154 /* 1155 * CharacterClassEscape :: 1156 * one of dDsSwW 1157 */ 1158 private Token characterClassEscape() { 1159 switch (ch0) { 1160 case 's': 1161 case 'S': 1162 case 'd': 1163 case 'D': 1164 case 'w': 1165 case 'W': 1166 return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1); 1167 1168 default: 1169 return null; 1170 } 1171 } 1172 1173 /* 1174 * CharacterClass :: 1175 * [ [lookahead {^}] ClassRanges ] 1176 * [ ^ ClassRanges ] 1177 */ 1178 private Token characterClass() { 1179 final int startIn = position; 1180 final int startOut = sb.length(); 1181 final Token token = new Token(Token.Type.CHARACTERCLASS); 1182 1183 if (ch0 == '[') { 1184 push(']'); 1185 commit(token, 1); 1186 1187 if (ch0 == '^') { 1188 commit(token, 1); 1189 } 1190 1191 final Token child = classRanges(); 1192 if (child != null && ch0 == ']') { 1193 pop(']'); 1194 token.add(child); 1195 return commit(token, 1); 1196 } 1197 } 1198 1199 restart(startIn, startOut); 1200 return null; 1201 } 1202 1203 /* 1204 * ClassRanges :: 1205 * [empty] 1206 * NonemptyClassRanges 1207 */ 1208 private Token classRanges() { 1209 return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges()); 1210 } 1211 1212 /* 1213 * NonemptyClassRanges :: 1214 * ClassAtom 1215 * ClassAtom NonemptyClassRangesNoDash 1216 * ClassAtom - ClassAtom ClassRanges 1217 */ 1218 private Token nonemptyClassRanges() { 1219 final int startIn = position; 1220 final int startOut = sb.length(); 1221 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES); 1222 Token child; 1223 1224 child = classAtom(); 1225 if (child != null) { 1226 token.add(child); 1227 1228 if (ch0 == '-') { 1229 commit(token, 1); 1230 1231 final Token child1 = classAtom(); 1232 final Token child2 = classRanges(); 1233 if (child1 != null && child2 != null) { 1234 token.add(child1); 1235 token.add(child2); 1236 1237 return token; 1238 } 1239 } 1240 1241 child = nonemptyClassRangesNoDash(); 1242 if (child != null) { 1243 token.add(child); 1244 return token; 1245 } 1246 1247 return token; 1248 } 1249 1250 restart(startIn, startOut); 1251 return null; 1252 } 1253 1254 /* 1255 * NonemptyClassRangesNoDash :: 1256 * ClassAtom 1257 * ClassAtomNoDash NonemptyClassRangesNoDash 1258 * ClassAtomNoDash - ClassAtom ClassRanges 1259 */ 1260 private Token nonemptyClassRangesNoDash() { 1261 final int startIn = position; 1262 final int startOut = sb.length(); 1263 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH); 1264 Token child; 1265 1266 child = classAtomNoDash(); 1267 if (child != null) { 1268 token.add(child); 1269 1270 // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom 1271 if (ch0 == '-') { 1272 commit(token, 1); 1273 1274 final Token child1 = classAtom(); 1275 final Token child2 = classRanges(); 1276 if (child1 != null && child2 != null) { 1277 token.add(child1); 1278 return token.add(child2); 1279 } 1280 //fallthru 1281 } 1282 1283 child = nonemptyClassRangesNoDash(); 1284 if (child != null) { 1285 token.add(child); 1286 } 1287 return token; // still a class atom 1288 } 1289 1290 child = classAtom(); 1291 if (child != null) { 1292 return token.add(child); 1293 } 1294 1295 restart(startIn, startOut); 1296 return null; 1297 } 1298 1299 /* 1300 * ClassAtom : - ClassAtomNoDash 1301 */ 1302 private Token classAtom() { 1303 final Token token = new Token(Token.Type.CLASSATOM); 1304 1305 if (ch0 == '-') { 1306 return commit(token, 1); 1307 } 1308 1309 final Token child = classAtomNoDash(); 1310 if (child != null) { 1311 return token.add(child); 1312 } 1313 1314 return null; 1315 } 1316 1317 /* 1318 * ClassAtomNoDash :: 1319 * SourceCharacter but not one of \ or ] or - 1320 * \ ClassEscape 1321 */ 1322 private Token classAtomNoDash() { 1323 final int startIn = position; 1324 final int startOut = sb.length(); 1325 final Token token = new Token(Token.Type.CLASSATOM_NODASH); 1326 1327 switch (ch0) { 1328 case ']': 1329 case '-': 1330 case '\0': 1331 return null; 1332 1333 case '[': 1334 // unescaped left square bracket - add escape 1335 return commit(token.add("\\"), 1); 1336 1337 case '\\': 1338 commit(token, 1); 1339 final Token child = classEscape(); 1340 if (child != null) { 1341 return token.add(child); 1342 } 1343 1344 restart(startIn, startOut); 1345 return null; 1346 1347 default: 1348 return commit(token, 1); 1349 } 1350 } 1351 1352 /* 1353 * ClassEscape :: 1354 * DecimalEscape 1355 * b 1356 * CharacterEscape 1357 * CharacterClassEscape 1358 */ 1359 private Token classEscape() { 1360 final Token token = new Token(Token.Type.CLASS_ESCAPE); 1361 Token child; 1362 1363 child = decimalEscape(); 1364 if (child != null) { 1365 return token.add(child); 1366 } 1367 1368 if (ch0 == 'b') { 1369 return commit(token, 1); 1370 } 1371 1372 child = characterEscape(); 1373 if (child != null) { 1374 return token.add(child); 1375 } 1376 1377 child = characterClassEscape(); 1378 if (child != null) { 1379 return token.add(child); 1380 } 1381 1382 return null; 1383 } 1384 1385 /* 1386 * DecimalDigits 1387 */ 1388 private Token decimalDigits() { 1389 if (!isDecimalDigit(ch0)) { 1390 return null; 1391 } 1392 1393 final Token token = new Token(Token.Type.DECIMALDIGITS); 1394 while (isDecimalDigit(ch0)) { 1395 commit(token, 1); 1396 } 1397 1398 return token; 1399 } 1400 1401 private static boolean isDecimalDigit(final char ch) { 1402 return ch >= '0' && ch <= '9'; 1403 } 1404 }