1 /* 2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package jdk.nashorn.internal.parser; 27 28 import java.util.ArrayList; 29 import java.util.HashMap; 30 import java.util.Iterator; 31 import java.util.LinkedHashMap; 32 import java.util.LinkedList; 33 import java.util.List; 34 import java.util.Map; 35 import java.util.regex.PatternSyntaxException; 36 import jdk.nashorn.internal.runtime.BitVector; 37 38 /** 39 * Scan a JavaScript regexp, converting to Java regex if necessary. 40 * 41 */ 42 public class RegExpScanner extends Scanner { 43 44 /** 45 * String builder to accumulate the result - this contains verbatim parsed JavaScript. 46 * to get the java equivalent we need to create a Pattern token and return its toString() 47 */ 48 private final StringBuilder sb; 49 50 /** An optional error message if one occurred during parse. */ 51 private String errorMessage; 52 53 /** Is this the special case of a regexp that never matches anything */ 54 private boolean neverMatches; 55 56 /** The resulting java.util.regex pattern string. */ 57 private String javaPattern; 58 59 /** Expected token table */ 60 private final Map<Character, Integer> expected = new HashMap<>(); 61 62 /** Capturing parenthesis that have been found so far. */ 63 private final List<Capture> caps = new LinkedList<>(); 64 65 /** Forward references to capturing parenthesis to be resolved later.*/ 66 private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>(); 67 68 /** Current level of zero-width negative lookahead assertions. */ 69 private int negativeLookaheadLevel; 70 71 private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?"; 72 73 private static class Capture { 74 /** 75 * Zero-width negative lookaheads enclosing the capture. 76 */ 77 private final int negativeLookaheadLevel; 78 /** 79 * Captures that live inside a negative lookahead are dead after the 80 * lookahead and will be undefined if referenced from outside. 81 */ 82 private boolean isDead; 83 84 Capture(final int negativeLookaheadLevel) { 85 this.negativeLookaheadLevel = negativeLookaheadLevel; 86 } 87 88 public int getNegativeLookaheadLevel() { 89 return negativeLookaheadLevel; 90 } 91 92 public boolean isDead() { 93 return isDead; 94 } 95 96 public void setDead() { 97 this.isDead = true; 98 } 99 } 100 101 /** 102 * This is a token - the JavaScript regexp is scanned into a token tree 103 * A token has other tokens as children as well as "atoms", i.e. Strings. 104 * 105 */ 106 private static class Token { 107 108 private enum Type { 109 PATTERN, 110 DISJUNCTION, 111 ALTERNATIVE, 112 TERM, 113 ASSERTION, 114 QUANTIFIER, 115 QUANTIFIER_PREFIX, 116 ATOM, 117 PATTERN_CHARACTER, 118 ATOM_ESCAPE, 119 CHARACTER_ESCAPE, 120 CONTROL_ESCAPE, 121 CONTROL_LETTER, 122 IDENTITY_ESCAPE, 123 DECIMAL_ESCAPE, 124 CHARACTERCLASS_ESCAPE, 125 CHARACTERCLASS, 126 CLASSRANGES, 127 NON_EMPTY_CLASSRANGES, 128 NON_EMPTY_CLASSRANGES_NODASH, 129 CLASSATOM, 130 CLASSATOM_NODASH, 131 CLASS_ESCAPE, 132 DECIMALDIGITS, 133 HEX_ESCAPESEQUENCE, 134 UNICODE_ESCAPESEQUENCE, 135 } 136 137 /** 138 * Token tyoe 139 */ 140 private final Token.Type type; 141 142 /** 143 * Child nodes 144 */ 145 private final List<Object> children; 146 147 /** 148 * Parent node 149 */ 150 private Token parent; 151 152 /** 153 * Dead code flag 154 */ 155 private boolean isDead; 156 157 private static final Map<Type, ToString> toStringMap = new HashMap<>(); 158 private static final ToString DEFAULT_TOSTRING = new ToString(); 159 160 private static String unicode(final int value) { 161 final StringBuilder sb = new StringBuilder(); 162 final String hex = Integer.toHexString(value); 163 sb.append('u'); 164 for (int i = 0; i < 4 - hex.length(); i++) { 165 sb.append('0'); 166 } 167 sb.append(hex); 168 169 return sb.toString(); 170 } 171 172 static { 173 toStringMap.put(Type.CHARACTERCLASS, new ToString() { 174 @Override 175 public String toString(final Token token) { 176 return super.toString(token).replace("\\b", "\b"); 177 } 178 }); 179 180 // for some reason java regexps don't like control characters on the 181 // form "\\ca".match([string with ascii 1 at char0]). Translating 182 // them to unicode does it though. 183 toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() { 184 @Override 185 public String toString(final Token token) { 186 final String str = super.toString(token); 187 if (str.length() == 2) { 188 return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1); 189 } 190 return str; 191 } 192 }); 193 194 toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() { 195 @Override 196 public String toString(final Token token) { 197 final String str = super.toString(token); 198 199 if ("\0".equals(str)) { 200 return str; 201 } 202 203 int value; 204 205 if (!token.hasParentOfType(Type.CLASSRANGES)) { 206 return str; 207 } 208 209 value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal 210 if (value > 0xff) { 211 throw new NumberFormatException(str); 212 } 213 214 return Token.unicode(value); 215 } 216 }); 217 218 } 219 220 /** 221 * JavaScript Token to Java regex substring framework. 222 * 223 */ 224 private static class ToString { 225 String toString(final Token token) { 226 final StringBuilder sb = new StringBuilder(); 227 for (final Object child : token.getChildren()) { 228 sb.append(child); 229 } 230 231 //perform global substitutions that hold true for any evaluated form 232 String str = sb.toString(); 233 switch (str) { 234 case "\\s": 235 str = "[" + Lexer.getWhitespaceRegExp() + "]"; 236 break; 237 case "\\S": 238 str = "[^" + Lexer.getWhitespaceRegExp() + "]"; 239 break; 240 case "[^]": 241 str = "[\\s\\S]"; 242 break; 243 default: 244 break; 245 } 246 return str; 247 } 248 } 249 250 /** 251 * Token iterator. Doesn't return "atom" children. i.e. string representations, 252 * just tokens 253 * 254 */ 255 private static class TokenIterator implements Iterator<Token> { 256 private final List<Token> preorder; 257 258 private void init(final Token root) { 259 preorder.add(root); 260 for (final Object child : root.getChildren()) { 261 if (child instanceof Token) { 262 init((Token)child); 263 } 264 } 265 } 266 267 TokenIterator(final Token root) { 268 preorder = new ArrayList<>(); 269 init(root); 270 } 271 272 @Override 273 public boolean hasNext() { 274 return !preorder.isEmpty(); 275 } 276 277 @Override 278 public Token next() { 279 return preorder.remove(0); 280 } 281 282 @Override 283 public void remove() { 284 next(); 285 } 286 } 287 288 /** 289 * Constructor 290 * @param type the token type 291 */ 292 Token(final Token.Type type) { 293 this.type = type; 294 children = new ArrayList<>(); 295 } 296 297 /** 298 * Add a an "atom" child to a token 299 * @param child the child to add 300 * @return the token (for chaining) 301 */ 302 public Token add(final String child) { 303 children.add(child); 304 return this; 305 } 306 307 /** 308 * Add a child to a token 309 * @param child the child 310 * @return the token (for chaining) 311 */ 312 public Token add(final Token child) { 313 if (child != null) { 314 children.add(child); 315 child.setParent(this); 316 } 317 return this; 318 } 319 320 /** 321 * Remove a child from a token 322 * @param child the child to remove 323 * @return true if successful 324 */ 325 public boolean remove(final Token child) { 326 return children.remove(child); 327 } 328 329 /** 330 * Remove the last child from a token 331 * @return the removed child 332 */ 333 public Object removeLast() { 334 return children.remove(children.size() - 1); 335 } 336 337 /** 338 * Flag this token as dead code 339 * @param isDead is it dead or not 340 */ 341 private void setIsDead(final boolean isDead) { 342 this.isDead = isDead; 343 } 344 345 /** 346 * Is this token dead code 347 * @return boolean 348 */ 349 private boolean getIsDead() { 350 return isDead; 351 } 352 353 /** 354 * Get the parent of this token 355 * @return parent token 356 */ 357 public Token getParent() { 358 return parent; 359 } 360 361 public boolean hasParentOfType(final Token.Type parentType) { 362 for (Token p = getParent(); p != null; p = p.getParent()) { 363 if (p.getType() == parentType) { 364 return true; 365 } 366 } 367 return false; 368 } 369 370 public boolean hasChildOfType(final Token.Type childType) { 371 for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) { 372 if (iter.next().getType() == childType) { 373 return true; 374 } 375 } 376 return false; 377 } 378 379 /** 380 * Set the parent of this token 381 * @param parent 382 */ 383 private void setParent(final Token parent) { 384 this.parent = parent; 385 } 386 387 /** 388 * Get the children of this token 389 * @return an array of children, never null 390 */ 391 public Object[] getChildren() { 392 return children.toArray(); 393 } 394 395 /** 396 * Reset this token, remove all children 397 */ 398 public void reset() { 399 children.clear(); 400 } 401 402 /** 403 * Get a preorder token iterator with this token as root 404 * @return an iterator 405 */ 406 public Iterator<Token> iterator() { 407 return new TokenIterator(this); 408 } 409 410 /** 411 * Get the type of this token 412 * @return type 413 */ 414 public Type getType() { 415 return type; 416 } 417 418 /** 419 * Turn this token into Java regexp compatible text 420 * @return part of a java regexp 421 */ 422 @Override 423 public String toString() { 424 ToString t = toStringMap.get(getType()); 425 if (t == null) { 426 t = DEFAULT_TOSTRING; 427 } 428 return t.toString(this); 429 } 430 } 431 432 /** 433 * Constructor 434 * @param string the JavaScript regexp to parse 435 */ 436 private RegExpScanner(final String string) { 437 super(string); 438 sb = new StringBuilder(limit); 439 reset(0); 440 expected.put(']', 0); 441 expected.put('}', 0); 442 } 443 444 private void processForwardReferences() { 445 if (neverMatches()) { 446 return; 447 } 448 449 for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) { 450 if (fwdRef.getKey().intValue() > caps.size()) { 451 neverMatches = true; 452 break; 453 } 454 455 fwdRef.getValue().setIsDead(true); 456 } 457 458 forwardReferences.clear(); 459 } 460 461 /** 462 * Scan a JavaScript regexp string returning a Java safe regex string. 463 * 464 * @param string 465 * JavaScript regexp string. 466 * @return Java safe regex string. 467 */ 468 public static RegExpScanner scan(final String string) { 469 final RegExpScanner scanner = new RegExpScanner(string); 470 471 Token pattern; 472 473 try { 474 pattern = scanner.pattern(); 475 } catch (final Exception e) { 476 throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length()); 477 } 478 479 scanner.processForwardReferences(); 480 if (scanner.neverMatches()) { 481 return null; // never matches 482 } 483 484 // go over the code and remove dead code 485 final Iterator<Token> iter = pattern.iterator(); 486 while (iter.hasNext()) { 487 final Token next = iter.next(); 488 if (next.getIsDead()) { 489 next.getParent().remove(next); 490 } 491 } 492 493 // turn the pattern into a string, p, the java equivalent string for our js regexp 494 final String p = pattern.toString(); 495 // if builder contains all tokens that were sent in, we know 496 // we correctly parsed the entire JavaScript regexp without syntax errors 497 if (!string.equals(scanner.getStringBuilder().toString())) { 498 throw new PatternSyntaxException(string, p, p.length() + 1); 499 } 500 501 scanner.javaPattern = p; 502 return scanner; 503 } 504 505 /** 506 * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript, 507 * is an example where we never match 508 * 509 * @return boolean 510 */ 511 private boolean neverMatches() { 512 return neverMatches; 513 } 514 515 /** 516 * This is used to set better error messages that can be reused 517 * in NativeRegExp for augmenting e.g. SyntaxErrors. 518 * 519 * @return an error message or null if no extra info 520 */ 521 public String getErrorMessage() { 522 return errorMessage; 523 } 524 525 final StringBuilder getStringBuilder() { 526 return sb; 527 } 528 529 String getJavaPattern() { 530 return javaPattern; 531 } 532 533 BitVector getGroupsInNegativeLookahead() { 534 BitVector vec = null; 535 for (int i = 0; i < caps.size(); i++) { 536 final Capture cap = caps.get(i); 537 if (cap.getNegativeLookaheadLevel() > 0) { 538 if (vec == null) { 539 vec = new BitVector(caps.size() + 1); 540 } 541 vec.set(i + 1); 542 } 543 } 544 return vec; 545 } 546 547 /** 548 * Commit n characters to the builder and to a given token 549 * @param token Uncommitted token. 550 * @param n Number of characters. 551 * @return Committed token 552 */ 553 private Token commit(final Token token, final int n) { 554 final int startIn = position; 555 556 switch (n) { 557 case 1: 558 sb.append(ch0); 559 skip(1); 560 break; 561 case 2: 562 sb.append(ch0); 563 sb.append(ch1); 564 skip(2); 565 break; 566 case 3: 567 sb.append(ch0); 568 sb.append(ch1); 569 sb.append(ch2); 570 skip(3); 571 break; 572 default: 573 assert false : "Should not reach here"; 574 } 575 576 if (token == null) { 577 return null; 578 } 579 580 return token.add(sb.substring(startIn, sb.length())); 581 } 582 583 /** 584 * Restart the buffers back at an earlier position. 585 * 586 * @param startIn 587 * Position in the input stream. 588 * @param startOut 589 * Position in the output stream. 590 */ 591 private void restart(final int startIn, final int startOut) { 592 reset(startIn); 593 sb.setLength(startOut); 594 } 595 596 private void push(final char ch) { 597 expected.put(ch, expected.get(ch) + 1); 598 } 599 600 private void pop(final char ch) { 601 expected.put(ch, Math.min(0, expected.get(ch) - 1)); 602 } 603 604 /* 605 * Recursive descent tokenizer starts below. 606 */ 607 608 /* 609 * Pattern :: 610 * Disjunction 611 */ 612 private Token pattern() { 613 final Token token = new Token(Token.Type.PATTERN); 614 615 final Token child = disjunction(); 616 if (child != null) { 617 return token.add(child); 618 } 619 620 return null; 621 } 622 623 /* 624 * Disjunction :: 625 * Alternative 626 * Alternative | Disjunction 627 */ 628 private Token disjunction() { 629 final Token token = new Token(Token.Type.DISJUNCTION); 630 631 while (true) { 632 token.add(alternative()); 633 634 if (ch0 == '|') { 635 commit(token, 1); 636 } else { 637 break; 638 } 639 } 640 641 return token; 642 } 643 644 /* 645 * Alternative :: 646 * [empty] 647 * Alternative Term 648 */ 649 private Token alternative() { 650 final Token token = new Token(Token.Type.ALTERNATIVE); 651 652 Token child; 653 while ((child = term()) != null) { 654 token.add(child); 655 } 656 657 return token; 658 } 659 660 /* 661 * Term :: 662 * Assertion 663 * Atom 664 * Atom Quantifier 665 */ 666 private Token term() { 667 final int startIn = position; 668 final int startOut = sb.length(); 669 final Token token = new Token(Token.Type.TERM); 670 Token child; 671 672 child = assertion(); 673 if (child != null) { 674 return token.add(child); 675 } 676 677 child = atom(); 678 if (child != null) { 679 boolean emptyCharacterClass = false; 680 if ("[]".equals(child.toString())) { 681 emptyCharacterClass = true; 682 } 683 684 token.add(child); 685 686 final Token quantifier = quantifier(); 687 if (quantifier != null) { 688 token.add(quantifier); 689 } 690 691 if (emptyCharacterClass) { 692 if (quantifier == null) { 693 neverMatches = true; //never matches ever. 694 } else { 695 //if we can get away with max zero, remove this entire token 696 final String qs = quantifier.toString(); 697 if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) { 698 token.setIsDead(true); 699 } 700 } 701 } 702 703 return token; 704 } 705 706 restart(startIn, startOut); 707 return null; 708 } 709 710 /* 711 * Assertion :: 712 * ^ 713 * $ 714 * \b 715 * \B 716 * ( ? = Disjunction ) 717 * ( ? ! Disjunction ) 718 */ 719 private Token assertion() { 720 final int startIn = position; 721 final int startOut = sb.length(); 722 final Token token = new Token(Token.Type.ASSERTION); 723 724 switch (ch0) { 725 case '^': 726 case '$': 727 return commit(token, 1); 728 729 case '\\': 730 if (ch1 == 'b' || ch1 == 'B') { 731 return commit(token, 2); 732 } 733 break; 734 735 case '(': 736 if (ch1 != '?') { 737 break; 738 } 739 if (ch2 != '=' && ch2 != '!') { 740 break; 741 } 742 final boolean isNegativeLookahead = (ch2 == '!'); 743 commit(token, 3); 744 745 if (isNegativeLookahead) { 746 negativeLookaheadLevel++; 747 } 748 final Token disjunction = disjunction(); 749 if (isNegativeLookahead) { 750 for (final Capture cap : caps) { 751 if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) { 752 cap.setDead(); 753 } 754 } 755 negativeLookaheadLevel--; 756 } 757 758 if (disjunction != null && ch0 == ')') { 759 token.add(disjunction); 760 return commit(token, 1); 761 } 762 break; 763 764 default: 765 break; 766 } 767 768 restart(startIn, startOut); 769 770 return null; 771 } 772 773 /* 774 * Quantifier :: 775 * QuantifierPrefix 776 * QuantifierPrefix ? 777 */ 778 private Token quantifier() { 779 final Token token = new Token(Token.Type.QUANTIFIER); 780 final Token child = quantifierPrefix(); 781 if (child != null) { 782 token.add(child); 783 if (ch0 == '?') { 784 commit(token, 1); 785 } 786 return token; 787 } 788 return null; 789 } 790 791 /* 792 * QuantifierPrefix :: 793 * * 794 * + 795 * ? 796 * { DecimalDigits } 797 * { DecimalDigits , } 798 * { DecimalDigits , DecimalDigits } 799 */ 800 private Token quantifierPrefix() { 801 final int startIn = position; 802 final int startOut = sb.length(); 803 final Token token = new Token(Token.Type.QUANTIFIER_PREFIX); 804 805 switch (ch0) { 806 case '*': 807 case '+': 808 case '?': 809 return commit(token, 1); 810 811 case '{': 812 commit(token, 1); 813 814 final Token child = decimalDigits(); 815 if (child == null) { 816 break; // not a quantifier - back out 817 } 818 push('}'); 819 token.add(child); 820 821 if (ch0 == ',') { 822 commit(token, 1); 823 token.add(decimalDigits()); 824 } 825 826 if (ch0 == '}') { 827 pop('}'); 828 commit(token, 1); 829 } 830 831 return token; 832 833 default: 834 break; 835 } 836 837 restart(startIn, startOut); 838 return null; 839 } 840 841 /* 842 * Atom :: 843 * PatternCharacter 844 * . 845 * \ AtomEscape 846 * CharacterClass 847 * ( Disjunction ) 848 * ( ? : Disjunction ) 849 * 850 */ 851 private Token atom() { 852 final int startIn = position; 853 final int startOut = sb.length(); 854 final Token token = new Token(Token.Type.ATOM); 855 Token child; 856 857 child = patternCharacter(); 858 if (child != null) { 859 return token.add(child); 860 } 861 862 if (ch0 == '.') { 863 return commit(token, 1); 864 } 865 866 if (ch0 == '\\') { 867 commit(token, 1); 868 child = atomEscape(); 869 870 if (child != null) { 871 if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) { 872 final char idEscape = child.toString().charAt(0); 873 if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) { 874 token.reset(); 875 } 876 } 877 878 token.add(child); 879 880 // forward backreferences always match empty. JavaScript != Java 881 if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) { 882 final int refNum = Integer.parseInt(child.toString()); 883 884 if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) { 885 // reference to dead in-negative-lookahead capture 886 token.setIsDead(true); 887 } else if (caps.size() < refNum) { 888 // forward reference: always matches against empty string (dead token). 889 // invalid reference (non-existant capture): pattern never matches. 890 forwardReferences.put(refNum, token); 891 } 892 } 893 894 return token; 895 } 896 } 897 898 child = characterClass(); 899 if (child != null) { 900 return token.add(child); 901 } 902 903 if (ch0 == '(') { 904 boolean capturingParens = true; 905 commit(token, 1); 906 if (ch0 == '?' && ch1 == ':') { 907 capturingParens = false; 908 commit(token, 2); 909 } 910 911 child = disjunction(); 912 if (child != null) { 913 token.add(child); 914 if (ch0 == ')') { 915 final Token atom = commit(token, 1); 916 if (capturingParens) { 917 caps.add(new Capture(negativeLookaheadLevel)); 918 } 919 return atom; 920 } 921 } 922 } 923 924 restart(startIn, startOut); 925 return null; 926 } 927 928 /* 929 * PatternCharacter :: 930 * SourceCharacter but not any of: ^$\.*+?()[]{}| 931 */ 932 @SuppressWarnings("fallthrough") 933 private Token patternCharacter() { 934 if (atEOF()) { 935 return null; 936 } 937 938 switch (ch0) { 939 case '^': 940 case '$': 941 case '\\': 942 case '.': 943 case '*': 944 case '+': 945 case '?': 946 case '(': 947 case ')': 948 case '[': 949 case '|': 950 return null; 951 952 case '}': 953 case ']': 954 final int n = expected.get(ch0); 955 if (n != 0) { 956 return null; 957 } 958 959 case '{': 960 // if not a valid quantifier escape curly brace to match itself 961 // this ensures compatibility with other JS implementations 962 final Token quant = quantifierPrefix(); 963 return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null; 964 965 default: 966 return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER 967 } 968 } 969 970 /* 971 * AtomEscape :: 972 * DecimalEscape 973 * CharacterEscape 974 * CharacterClassEscape 975 */ 976 private Token atomEscape() { 977 final Token token = new Token(Token.Type.ATOM_ESCAPE); 978 Token child; 979 980 child = decimalEscape(); 981 if (child != null) { 982 return token.add(child); 983 } 984 985 child = characterClassEscape(); 986 if (child != null) { 987 return token.add(child); 988 } 989 990 child = characterEscape(); 991 if (child != null) { 992 return token.add(child); 993 } 994 995 996 return null; 997 } 998 999 /* 1000 * CharacterEscape :: 1001 * ControlEscape 1002 * c ControlLetter 1003 * HexEscapeSequence 1004 * UnicodeEscapeSequence 1005 * IdentityEscape 1006 */ 1007 private Token characterEscape() { 1008 final int startIn = position; 1009 final int startOut = sb.length(); 1010 1011 final Token token = new Token(Token.Type.CHARACTER_ESCAPE); 1012 Token child; 1013 1014 child = controlEscape(); 1015 if (child != null) { 1016 return token.add(child); 1017 } 1018 1019 if (ch0 == 'c') { 1020 commit(token, 1); 1021 child = controlLetter(); 1022 if (child != null) { 1023 return token.add(child); 1024 } 1025 restart(startIn, startOut); 1026 } 1027 1028 child = hexEscapeSequence(); 1029 if (child != null) { 1030 return token.add(child); 1031 } 1032 1033 child = unicodeEscapeSequence(); 1034 if (child != null) { 1035 return token.add(child); 1036 } 1037 1038 child = identityEscape(); 1039 if (child != null) { 1040 return token.add(child); 1041 } 1042 1043 restart(startIn, startOut); 1044 1045 return null; 1046 } 1047 1048 private boolean scanEscapeSequence(final char leader, final int length, final Token token) { 1049 final int startIn = position; 1050 final int startOut = sb.length(); 1051 1052 if (ch0 != leader) { 1053 return false; 1054 } 1055 1056 commit(token, 1); 1057 for (int i = 0; i < length; i++) { 1058 final char ch0l = Character.toLowerCase(ch0); 1059 if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { 1060 commit(token, 1); 1061 } else { 1062 restart(startIn, startOut); 1063 return false; 1064 } 1065 } 1066 1067 return true; 1068 } 1069 1070 private Token hexEscapeSequence() { 1071 final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE); 1072 if (scanEscapeSequence('x', 2, token)) { 1073 return token; 1074 } 1075 return null; 1076 } 1077 1078 private Token unicodeEscapeSequence() { 1079 final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE); 1080 if (scanEscapeSequence('u', 4, token)) { 1081 return token; 1082 } 1083 return null; 1084 } 1085 1086 /* 1087 * ControlEscape :: 1088 * one of fnrtv 1089 */ 1090 private Token controlEscape() { 1091 switch (ch0) { 1092 case 'f': 1093 case 'n': 1094 case 'r': 1095 case 't': 1096 case 'v': 1097 return commit(new Token(Token.Type.CONTROL_ESCAPE), 1); 1098 1099 default: 1100 return null; 1101 } 1102 } 1103 1104 /* 1105 * ControlLetter :: 1106 * one of abcdefghijklmnopqrstuvwxyz 1107 * ABCDEFGHIJKLMNOPQRSTUVWXYZ 1108 */ 1109 private Token controlLetter() { 1110 final char c = Character.toUpperCase(ch0); 1111 if (c >= 'A' && c <= 'Z') { 1112 final Token token = new Token(Token.Type.CONTROL_LETTER); 1113 commit(token, 1); 1114 return token; 1115 } 1116 return null; 1117 /* 1118 Token token = new Token(Token.Type.CONTROL_LETTER); 1119 commit(null, 1);//add original char to builder not to token 1120 this.neverMatches = c < 'A' || c > 'Z'; 1121 return token.add(""+c);*/ 1122 } 1123 1124 /* 1125 * IdentityEscape :: 1126 * SourceCharacter but not IdentifierPart 1127 * <ZWJ> (200c) 1128 * <ZWNJ> (200d) 1129 */ 1130 private Token identityEscape() { 1131 final Token token = new Token(Token.Type.IDENTITY_ESCAPE); 1132 commit(token, 1); 1133 return token; 1134 } 1135 1136 /* 1137 * DecimalEscape :: 1138 * DecimalIntegerLiteral [lookahead DecimalDigit] 1139 */ 1140 private Token decimalEscape() { 1141 final Token token = new Token(Token.Type.DECIMAL_ESCAPE); 1142 final int startIn = position; 1143 final int startOut = sb.length(); 1144 1145 if (ch0 == '0' && !isDecimalDigit(ch1)) { 1146 commit(token, 1); 1147 token.removeLast(); 1148 // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); 1149 return token.add("\u0000"); 1150 } 1151 1152 if (isDecimalDigit(ch0)) { 1153 while (isDecimalDigit(ch0)) { 1154 commit(token, 1); 1155 } 1156 return token; 1157 } 1158 1159 restart(startIn, startOut); 1160 1161 return null; 1162 } 1163 1164 /* 1165 * CharacterClassEscape :: 1166 * one of dDsSwW 1167 */ 1168 private Token characterClassEscape() { 1169 switch (ch0) { 1170 case 's': 1171 case 'S': 1172 case 'd': 1173 case 'D': 1174 case 'w': 1175 case 'W': 1176 return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1); 1177 1178 default: 1179 return null; 1180 } 1181 } 1182 1183 /* 1184 * CharacterClass :: 1185 * [ [lookahead {^}] ClassRanges ] 1186 * [ ^ ClassRanges ] 1187 */ 1188 private Token characterClass() { 1189 final int startIn = position; 1190 final int startOut = sb.length(); 1191 final Token token = new Token(Token.Type.CHARACTERCLASS); 1192 1193 if (ch0 == '[') { 1194 push(']'); 1195 commit(token, 1); 1196 1197 if (ch0 == '^') { 1198 commit(token, 1); 1199 } 1200 1201 final Token child = classRanges(); 1202 if (child != null && ch0 == ']') { 1203 pop(']'); 1204 token.add(child); 1205 return commit(token, 1); 1206 } 1207 } 1208 1209 restart(startIn, startOut); 1210 return null; 1211 } 1212 1213 /* 1214 * ClassRanges :: 1215 * [empty] 1216 * NonemptyClassRanges 1217 */ 1218 private Token classRanges() { 1219 return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges()); 1220 } 1221 1222 /* 1223 * NonemptyClassRanges :: 1224 * ClassAtom 1225 * ClassAtom NonemptyClassRangesNoDash 1226 * ClassAtom - ClassAtom ClassRanges 1227 */ 1228 private Token nonemptyClassRanges() { 1229 final int startIn = position; 1230 final int startOut = sb.length(); 1231 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES); 1232 Token child; 1233 1234 child = classAtom(); 1235 if (child != null) { 1236 token.add(child); 1237 1238 if (ch0 == '-') { 1239 commit(token, 1); 1240 1241 final Token child1 = classAtom(); 1242 final Token child2 = classRanges(); 1243 if (child1 != null && child2 != null) { 1244 token.add(child1); 1245 token.add(child2); 1246 1247 return token; 1248 } 1249 } 1250 1251 child = nonemptyClassRangesNoDash(); 1252 if (child != null) { 1253 token.add(child); 1254 return token; 1255 } 1256 1257 return token; 1258 } 1259 1260 restart(startIn, startOut); 1261 return null; 1262 } 1263 1264 /* 1265 * NonemptyClassRangesNoDash :: 1266 * ClassAtom 1267 * ClassAtomNoDash NonemptyClassRangesNoDash 1268 * ClassAtomNoDash - ClassAtom ClassRanges 1269 */ 1270 private Token nonemptyClassRangesNoDash() { 1271 final int startIn = position; 1272 final int startOut = sb.length(); 1273 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH); 1274 Token child; 1275 1276 child = classAtomNoDash(); 1277 if (child != null) { 1278 token.add(child); 1279 1280 // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom 1281 if (ch0 == '-') { 1282 commit(token, 1); 1283 1284 final Token child1 = classAtom(); 1285 final Token child2 = classRanges(); 1286 if (child1 != null && child2 != null) { 1287 token.add(child1); 1288 return token.add(child2); 1289 } 1290 //fallthru 1291 } 1292 1293 child = nonemptyClassRangesNoDash(); 1294 if (child != null) { 1295 token.add(child); 1296 } 1297 return token; // still a class atom 1298 } 1299 1300 child = classAtom(); 1301 if (child != null) { 1302 return token.add(child); 1303 } 1304 1305 restart(startIn, startOut); 1306 return null; 1307 } 1308 1309 /* 1310 * ClassAtom : - ClassAtomNoDash 1311 */ 1312 private Token classAtom() { 1313 final Token token = new Token(Token.Type.CLASSATOM); 1314 1315 if (ch0 == '-') { 1316 return commit(token, 1); 1317 } 1318 1319 final Token child = classAtomNoDash(); 1320 if (child != null) { 1321 return token.add(child); 1322 } 1323 1324 return null; 1325 } 1326 1327 /* 1328 * ClassAtomNoDash :: 1329 * SourceCharacter but not one of \ or ] or - 1330 * \ ClassEscape 1331 */ 1332 private Token classAtomNoDash() { 1333 final int startIn = position; 1334 final int startOut = sb.length(); 1335 final Token token = new Token(Token.Type.CLASSATOM_NODASH); 1336 1337 switch (ch0) { 1338 case ']': 1339 case '-': 1340 case '\0': 1341 return null; 1342 1343 case '[': 1344 // unescaped left square bracket - add escape 1345 return commit(token.add("\\"), 1); 1346 1347 case '\\': 1348 commit(token, 1); 1349 final Token child = classEscape(); 1350 if (child != null) { 1351 return token.add(child); 1352 } 1353 1354 restart(startIn, startOut); 1355 return null; 1356 1357 default: 1358 return commit(token, 1); 1359 } 1360 } 1361 1362 /* 1363 * ClassEscape :: 1364 * DecimalEscape 1365 * b 1366 * CharacterEscape 1367 * CharacterClassEscape 1368 */ 1369 private Token classEscape() { 1370 final Token token = new Token(Token.Type.CLASS_ESCAPE); 1371 Token child; 1372 1373 child = decimalEscape(); 1374 if (child != null) { 1375 return token.add(child); 1376 } 1377 1378 if (ch0 == 'b') { 1379 return commit(token, 1); 1380 } 1381 1382 child = characterEscape(); 1383 if (child != null) { 1384 return token.add(child); 1385 } 1386 1387 child = characterClassEscape(); 1388 if (child != null) { 1389 return token.add(child); 1390 } 1391 1392 return null; 1393 } 1394 1395 /* 1396 * DecimalDigits 1397 */ 1398 private Token decimalDigits() { 1399 if (!isDecimalDigit(ch0)) { 1400 return null; 1401 } 1402 1403 final Token token = new Token(Token.Type.DECIMALDIGITS); 1404 while (isDecimalDigit(ch0)) { 1405 commit(token, 1); 1406 } 1407 1408 return token; 1409 } 1410 1411 private static boolean isDecimalDigit(final char ch) { 1412 return ch >= '0' && ch <= '9'; 1413 } 1414 }