1 /*
2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 package jdk.nashorn.internal.parser;
27
28 import java.util.ArrayList;
29 import java.util.HashMap;
30 import java.util.Iterator;
31 import java.util.LinkedHashMap;
32 import java.util.LinkedList;
33 import java.util.List;
34 import java.util.Map;
35 import java.util.regex.PatternSyntaxException;
36 import jdk.nashorn.internal.runtime.BitVector;
37
38 /**
39 * Scan a JavaScript regexp, converting to Java regex if necessary.
40 *
41 */
42 public class RegExpScanner extends Scanner {
43
44 /**
45 * String builder to accumulate the result - this contains verbatim parsed JavaScript.
46 * to get the java equivalent we need to create a Pattern token and return its toString()
47 */
48 private final StringBuilder sb;
49
50 /** An optional error message if one occurred during parse. */
51 private String errorMessage;
52
53 /** Is this the special case of a regexp that never matches anything */
54 private boolean neverMatches;
55
56 /** The resulting java.util.regex pattern string. */
57 private String javaPattern;
58
59 /** Expected token table */
60 private final Map<Character, Integer> expected = new HashMap<>();
61
62 /** Capturing parenthesis that have been found so far. */
63 private final List<Capture> caps = new LinkedList<>();
64
65 /** Forward references to capturing parenthesis to be resolved later.*/
66 private final Map<Integer, Token> forwardReferences = new LinkedHashMap<>();
67
68 /** Current level of zero-width negative lookahead assertions. */
69 private int negativeLookaheadLevel;
70
71 private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?";
72
73 private static class Capture {
74 /**
75 * Zero-width negative lookaheads enclosing the capture.
76 */
77 private final int negativeLookaheadLevel;
78 /**
79 * Captures that live inside a negative lookahead are dead after the
80 * lookahead and will be undefined if referenced from outside.
81 */
82 private boolean isDead;
83
84 Capture(final int negativeLookaheadLevel) {
85 this.negativeLookaheadLevel = negativeLookaheadLevel;
86 }
87
88 public int getNegativeLookaheadLevel() {
89 return negativeLookaheadLevel;
90 }
91
92 public boolean isDead() {
93 return isDead;
94 }
95
96 public void setDead() {
97 this.isDead = true;
98 }
99 }
100
101 /**
102 * This is a token - the JavaScript regexp is scanned into a token tree
103 * A token has other tokens as children as well as "atoms", i.e. Strings.
104 *
105 */
106 private static class Token {
107
108 private enum Type {
109 PATTERN,
110 DISJUNCTION,
111 ALTERNATIVE,
112 TERM,
113 ASSERTION,
114 QUANTIFIER,
115 QUANTIFIER_PREFIX,
116 ATOM,
117 PATTERN_CHARACTER,
118 ATOM_ESCAPE,
119 CHARACTER_ESCAPE,
120 CONTROL_ESCAPE,
121 CONTROL_LETTER,
122 IDENTITY_ESCAPE,
123 DECIMAL_ESCAPE,
124 CHARACTERCLASS_ESCAPE,
125 CHARACTERCLASS,
126 CLASSRANGES,
127 NON_EMPTY_CLASSRANGES,
128 NON_EMPTY_CLASSRANGES_NODASH,
129 CLASSATOM,
130 CLASSATOM_NODASH,
131 CLASS_ESCAPE,
132 DECIMALDIGITS,
133 HEX_ESCAPESEQUENCE,
134 UNICODE_ESCAPESEQUENCE,
135 }
136
137 /**
138 * Token tyoe
139 */
140 private final Token.Type type;
141
142 /**
143 * Child nodes
144 */
145 private final List<Object> children;
146
147 /**
148 * Parent node
149 */
150 private Token parent;
151
152 /**
153 * Dead code flag
154 */
155 private boolean isDead;
156
157 private static final Map<Type, ToString> toStringMap = new HashMap<>();
158 private static final ToString DEFAULT_TOSTRING = new ToString();
159
160 private static String unicode(final int value) {
161 final StringBuilder sb = new StringBuilder();
162 final String hex = Integer.toHexString(value);
163 sb.append('u');
164 for (int i = 0; i < 4 - hex.length(); i++) {
165 sb.append('0');
166 }
167 sb.append(hex);
168
169 return sb.toString();
170 }
171
172 static {
173 toStringMap.put(Type.CHARACTERCLASS, new ToString() {
174 @Override
175 public String toString(final Token token) {
176 return super.toString(token).replace("\\b", "\b");
177 }
178 });
179
180 // for some reason java regexps don't like control characters on the
181 // form "\\ca".match([string with ascii 1 at char0]). Translating
182 // them to unicode does it though.
183 toStringMap.put(Type.CHARACTER_ESCAPE, new ToString() {
184 @Override
185 public String toString(final Token token) {
186 final String str = super.toString(token);
187 if (str.length() == 2) {
188 return Token.unicode(Character.toLowerCase(str.charAt(1)) - 'a' + 1);
189 }
190 return str;
191 }
192 });
193
194 toStringMap.put(Type.DECIMAL_ESCAPE, new ToString() {
195 @Override
196 public String toString(final Token token) {
197 final String str = super.toString(token);
198
199 if ("\0".equals(str)) {
200 return str;
201 }
202
203 int value;
204
205 if (!token.hasParentOfType(Type.CLASSRANGES)) {
206 return str;
207 }
208
209 value = Integer.parseInt(str, 8); //throws exception that leads to SyntaxError if not octal
210 if (value > 0xff) {
211 throw new NumberFormatException(str);
212 }
213
214 return Token.unicode(value);
215 }
216 });
217
218 }
219
220 /**
221 * JavaScript Token to Java regex substring framework.
222 *
223 */
224 private static class ToString {
225 String toString(final Token token) {
226 final StringBuilder sb = new StringBuilder();
227 for (final Object child : token.getChildren()) {
228 sb.append(child);
229 }
230
231 //perform global substitutions that hold true for any evaluated form
232 String str = sb.toString();
233 if (str.equals("[^]")) {
234 str = "[\\s\\S]";
235 }
236 return str;
237 }
238 }
239
240 /**
241 * Token iterator. Doesn't return "atom" children. i.e. string representations,
242 * just tokens
243 *
244 */
245 private static class TokenIterator implements Iterator<Token> {
246 private final List<Token> preorder;
247
248 private void init(final Token root) {
249 preorder.add(root);
250 for (final Object child : root.getChildren()) {
251 if (child instanceof Token) {
252 init((Token)child);
253 }
254 }
255 }
256
257 TokenIterator(final Token root) {
258 preorder = new ArrayList<>();
259 init(root);
260 }
261
262 @Override
263 public boolean hasNext() {
264 return !preorder.isEmpty();
265 }
266
267 @Override
268 public Token next() {
269 return preorder.remove(0);
270 }
271
272 @Override
273 public void remove() {
274 next();
275 }
276 }
277
278 /**
279 * Constructor
280 * @param type the token type
281 */
282 Token(final Token.Type type) {
283 this.type = type;
284 children = new ArrayList<>();
285 }
286
287 /**
288 * Add a an "atom" child to a token
289 * @param child the child to add
290 * @return the token (for chaining)
291 */
292 public Token add(final String child) {
293 children.add(child);
294 return this;
295 }
296
297 /**
298 * Add a child to a token
299 * @param child the child
300 * @return the token (for chaining)
301 */
302 public Token add(final Token child) {
303 if (child != null) {
304 children.add(child);
305 child.setParent(this);
306 }
307 return this;
308 }
309
310 /**
311 * Remove a child from a token
312 * @param child the child to remove
313 * @return true if successful
314 */
315 public boolean remove(final Token child) {
316 return children.remove(child);
317 }
318
319 /**
320 * Remove the last child from a token
321 * @return the removed child
322 */
323 public Object removeLast() {
324 return children.remove(children.size() - 1);
325 }
326
327 /**
328 * Flag this token as dead code
329 * @param isDead is it dead or not
330 */
331 private void setIsDead(final boolean isDead) {
332 this.isDead = isDead;
333 }
334
335 /**
336 * Is this token dead code
337 * @return boolean
338 */
339 private boolean getIsDead() {
340 return isDead;
341 }
342
343 /**
344 * Get the parent of this token
345 * @return parent token
346 */
347 public Token getParent() {
348 return parent;
349 }
350
351 public boolean hasParentOfType(final Token.Type parentType) {
352 for (Token p = getParent(); p != null; p = p.getParent()) {
353 if (p.getType() == parentType) {
354 return true;
355 }
356 }
357 return false;
358 }
359
360 public boolean hasChildOfType(final Token.Type childType) {
361 for (final Iterator<Token> iter = iterator() ; iter.hasNext() ; ) {
362 if (iter.next().getType() == childType) {
363 return true;
364 }
365 }
366 return false;
367 }
368
369 /**
370 * Set the parent of this token
371 * @param parent
372 */
373 private void setParent(final Token parent) {
374 this.parent = parent;
375 }
376
377 /**
378 * Get the children of this token
379 * @return an array of children, never null
380 */
381 public Object[] getChildren() {
382 return children.toArray();
383 }
384
385 /**
386 * Reset this token, remove all children
387 */
388 public void reset() {
389 children.clear();
390 }
391
392 /**
393 * Get a preorder token iterator with this token as root
394 * @return an iterator
395 */
396 public Iterator<Token> iterator() {
397 return new TokenIterator(this);
398 }
399
400 /**
401 * Get the type of this token
402 * @return type
403 */
404 public Type getType() {
405 return type;
406 }
407
408 /**
409 * Turn this token into Java regexp compatible text
410 * @return part of a java regexp
411 */
412 @Override
413 public String toString() {
414 ToString t = toStringMap.get(getType());
415 if (t == null) {
416 t = DEFAULT_TOSTRING;
417 }
418 return t.toString(this);
419 }
420 }
421
422 /**
423 * Constructor
424 * @param string the JavaScript regexp to parse
425 */
426 private RegExpScanner(final String string) {
427 super(string);
428 sb = new StringBuilder(limit);
429 reset(0);
430 expected.put(']', 0);
431 expected.put('}', 0);
432 }
433
434 private void processForwardReferences() {
435 if (neverMatches()) {
436 return;
437 }
438
439 for (final Map.Entry<Integer, Token> fwdRef : forwardReferences.entrySet()) {
440 if (fwdRef.getKey().intValue() > caps.size()) {
441 neverMatches = true;
442 break;
443 }
444
445 fwdRef.getValue().setIsDead(true);
446 }
447
448 forwardReferences.clear();
449 }
450
451 /**
452 * Scan a JavaScript regexp string returning a Java safe regex string.
453 *
454 * @param string
455 * JavaScript regexp string.
456 * @return Java safe regex string.
457 */
458 public static RegExpScanner scan(final String string) {
459 final RegExpScanner scanner = new RegExpScanner(string);
460
461 Token pattern;
462
463 try {
464 pattern = scanner.pattern();
465 } catch (final Exception e) {
466 throw new PatternSyntaxException(e.getMessage(), string, scanner.sb.length());
467 }
468
469 scanner.processForwardReferences();
470 if (scanner.neverMatches()) {
471 return null; // never matches
472 }
473
474 // go over the code and remove dead code
475 final Iterator<Token> iter = pattern.iterator();
476 while (iter.hasNext()) {
477 final Token next = iter.next();
478 if (next.getIsDead()) {
479 next.getParent().remove(next);
480 }
481 }
482
483 // turn the pattern into a string, p, the java equivalent string for our js regexp
484 final String p = pattern.toString();
485 // if builder contains all tokens that were sent in, we know
486 // we correctly parsed the entire JavaScript regexp without syntax errors
487 if (!string.equals(scanner.getStringBuilder().toString())) {
488 throw new PatternSyntaxException(string, p, p.length() + 1);
489 }
490
491 scanner.javaPattern = p;
492 return scanner;
493 }
494
495 /**
496 * Does this regexp ever match anything? Use of e.g. [], which is legal in JavaScript,
497 * is an example where we never match
498 *
499 * @return boolean
500 */
501 private boolean neverMatches() {
502 return neverMatches;
503 }
504
505 /**
506 * This is used to set better error messages that can be reused
507 * in NativeRegExp for augmenting e.g. SyntaxErrors.
508 *
509 * @return an error message or null if no extra info
510 */
511 public String getErrorMessage() {
512 return errorMessage;
513 }
514
515 final StringBuilder getStringBuilder() {
516 return sb;
517 }
518
519 String getJavaPattern() {
520 return javaPattern;
521 }
522
523 BitVector getGroupsInNegativeLookahead() {
524 BitVector vec = null;
525 for (int i = 0; i < caps.size(); i++) {
526 final Capture cap = caps.get(i);
527 if (cap.getNegativeLookaheadLevel() > 0) {
528 if (vec == null) {
529 vec = new BitVector(caps.size() + 1);
530 }
531 vec.set(i + 1);
532 }
533 }
534 return vec;
535 }
536
537 /**
538 * Commit n characters to the builder and to a given token
539 * @param token Uncommitted token.
540 * @param n Number of characters.
541 * @return Committed token
542 */
543 private Token commit(final Token token, final int n) {
544 final int startIn = position;
545
546 switch (n) {
547 case 1:
548 sb.append(ch0);
549 skip(1);
550 break;
551 case 2:
552 sb.append(ch0);
553 sb.append(ch1);
554 skip(2);
555 break;
556 case 3:
557 sb.append(ch0);
558 sb.append(ch1);
559 sb.append(ch2);
560 skip(3);
561 break;
562 default:
563 assert false : "Should not reach here";
564 }
565
566 if (token == null) {
567 return null;
568 }
569
570 return token.add(sb.substring(startIn, sb.length()));
571 }
572
573 /**
574 * Restart the buffers back at an earlier position.
575 *
576 * @param startIn
577 * Position in the input stream.
578 * @param startOut
579 * Position in the output stream.
580 */
581 private void restart(final int startIn, final int startOut) {
582 reset(startIn);
583 sb.setLength(startOut);
584 }
585
586 private void push(final char ch) {
587 expected.put(ch, expected.get(ch) + 1);
588 }
589
590 private void pop(final char ch) {
591 expected.put(ch, Math.min(0, expected.get(ch) - 1));
592 }
593
594 /*
595 * Recursive descent tokenizer starts below.
596 */
597
598 /*
599 * Pattern ::
600 * Disjunction
601 */
602 private Token pattern() {
603 final Token token = new Token(Token.Type.PATTERN);
604
605 final Token child = disjunction();
606 if (child != null) {
607 return token.add(child);
608 }
609
610 return null;
611 }
612
613 /*
614 * Disjunction ::
615 * Alternative
616 * Alternative | Disjunction
617 */
618 private Token disjunction() {
619 final Token token = new Token(Token.Type.DISJUNCTION);
620
621 while (true) {
622 token.add(alternative());
623
624 if (ch0 == '|') {
625 commit(token, 1);
626 } else {
627 break;
628 }
629 }
630
631 return token;
632 }
633
634 /*
635 * Alternative ::
636 * [empty]
637 * Alternative Term
638 */
639 private Token alternative() {
640 final Token token = new Token(Token.Type.ALTERNATIVE);
641
642 Token child;
643 while ((child = term()) != null) {
644 token.add(child);
645 }
646
647 return token;
648 }
649
650 /*
651 * Term ::
652 * Assertion
653 * Atom
654 * Atom Quantifier
655 */
656 private Token term() {
657 final int startIn = position;
658 final int startOut = sb.length();
659 final Token token = new Token(Token.Type.TERM);
660 Token child;
661
662 child = assertion();
663 if (child != null) {
664 return token.add(child);
665 }
666
667 child = atom();
668 if (child != null) {
669 boolean emptyCharacterClass = false;
670 if ("[]".equals(child.toString())) {
671 emptyCharacterClass = true;
672 }
673
674 token.add(child);
675
676 final Token quantifier = quantifier();
677 if (quantifier != null) {
678 token.add(quantifier);
679 }
680
681 if (emptyCharacterClass) {
682 if (quantifier == null) {
683 neverMatches = true; //never matches ever.
684 } else {
685 //if we can get away with max zero, remove this entire token
686 final String qs = quantifier.toString();
687 if ("+".equals(qs) || "*".equals(qs) || qs.startsWith("{0,")) {
688 token.setIsDead(true);
689 }
690 }
691 }
692
693 return token;
694 }
695
696 restart(startIn, startOut);
697 return null;
698 }
699
700 /*
701 * Assertion ::
702 * ^
703 * $
704 * \b
705 * \B
706 * ( ? = Disjunction )
707 * ( ? ! Disjunction )
708 */
709 private Token assertion() {
710 final int startIn = position;
711 final int startOut = sb.length();
712 final Token token = new Token(Token.Type.ASSERTION);
713
714 switch (ch0) {
715 case '^':
716 case '$':
717 return commit(token, 1);
718
719 case '\\':
720 if (ch1 == 'b' || ch1 == 'B') {
721 return commit(token, 2);
722 }
723 break;
724
725 case '(':
726 if (ch1 != '?') {
727 break;
728 }
729 if (ch2 != '=' && ch2 != '!') {
730 break;
731 }
732 final boolean isNegativeLookahead = (ch2 == '!');
733 commit(token, 3);
734
735 if (isNegativeLookahead) {
736 negativeLookaheadLevel++;
737 }
738 final Token disjunction = disjunction();
739 if (isNegativeLookahead) {
740 for (final Capture cap : caps) {
741 if (cap.getNegativeLookaheadLevel() >= negativeLookaheadLevel) {
742 cap.setDead();
743 }
744 }
745 negativeLookaheadLevel--;
746 }
747
748 if (disjunction != null && ch0 == ')') {
749 token.add(disjunction);
750 return commit(token, 1);
751 }
752 break;
753
754 default:
755 break;
756 }
757
758 restart(startIn, startOut);
759
760 return null;
761 }
762
763 /*
764 * Quantifier ::
765 * QuantifierPrefix
766 * QuantifierPrefix ?
767 */
768 private Token quantifier() {
769 final Token token = new Token(Token.Type.QUANTIFIER);
770 final Token child = quantifierPrefix();
771 if (child != null) {
772 token.add(child);
773 if (ch0 == '?') {
774 commit(token, 1);
775 }
776 return token;
777 }
778 return null;
779 }
780
781 /*
782 * QuantifierPrefix ::
783 * *
784 * +
785 * ?
786 * { DecimalDigits }
787 * { DecimalDigits , }
788 * { DecimalDigits , DecimalDigits }
789 */
790 private Token quantifierPrefix() {
791 final int startIn = position;
792 final int startOut = sb.length();
793 final Token token = new Token(Token.Type.QUANTIFIER_PREFIX);
794
795 switch (ch0) {
796 case '*':
797 case '+':
798 case '?':
799 return commit(token, 1);
800
801 case '{':
802 commit(token, 1);
803
804 final Token child = decimalDigits();
805 if (child == null) {
806 break; // not a quantifier - back out
807 }
808 push('}');
809 token.add(child);
810
811 if (ch0 == ',') {
812 commit(token, 1);
813 token.add(decimalDigits());
814 }
815
816 if (ch0 == '}') {
817 pop('}');
818 commit(token, 1);
819 }
820
821 return token;
822
823 default:
824 break;
825 }
826
827 restart(startIn, startOut);
828 return null;
829 }
830
831 /*
832 * Atom ::
833 * PatternCharacter
834 * .
835 * \ AtomEscape
836 * CharacterClass
837 * ( Disjunction )
838 * ( ? : Disjunction )
839 *
840 */
841 private Token atom() {
842 final int startIn = position;
843 final int startOut = sb.length();
844 final Token token = new Token(Token.Type.ATOM);
845 Token child;
846
847 child = patternCharacter();
848 if (child != null) {
849 return token.add(child);
850 }
851
852 if (ch0 == '.') {
853 return commit(token, 1);
854 }
855
856 if (ch0 == '\\') {
857 commit(token, 1);
858 child = atomEscape();
859
860 if (child != null) {
861 if (child.hasChildOfType(Token.Type.IDENTITY_ESCAPE)) {
862 final char idEscape = child.toString().charAt(0);
863 if (NON_IDENT_ESCAPES.indexOf(idEscape) == -1) {
864 token.reset();
865 }
866 }
867
868 token.add(child);
869
870 // forward backreferences always match empty. JavaScript != Java
871 if (child.hasChildOfType(Token.Type.DECIMAL_ESCAPE) && !"\u0000".equals(child.toString())) {
872 final int refNum = Integer.parseInt(child.toString());
873
874 if (refNum - 1 < caps.size() && caps.get(refNum - 1).isDead()) {
875 // reference to dead in-negative-lookahead capture
876 token.setIsDead(true);
877 } else if (caps.size() < refNum) {
878 // forward reference: always matches against empty string (dead token).
879 // invalid reference (non-existant capture): pattern never matches.
880 forwardReferences.put(refNum, token);
881 }
882 }
883
884 return token;
885 }
886 }
887
888 child = characterClass();
889 if (child != null) {
890 return token.add(child);
891 }
892
893 if (ch0 == '(') {
894 boolean capturingParens = true;
895 commit(token, 1);
896 if (ch0 == '?' && ch1 == ':') {
897 capturingParens = false;
898 commit(token, 2);
899 }
900
901 child = disjunction();
902 if (child != null) {
903 token.add(child);
904 if (ch0 == ')') {
905 final Token atom = commit(token, 1);
906 if (capturingParens) {
907 caps.add(new Capture(negativeLookaheadLevel));
908 }
909 return atom;
910 }
911 }
912 }
913
914 restart(startIn, startOut);
915 return null;
916 }
917
918 /*
919 * PatternCharacter ::
920 * SourceCharacter but not any of: ^$\.*+?()[]{}|
921 */
922 @SuppressWarnings("fallthrough")
923 private Token patternCharacter() {
924 if (atEOF()) {
925 return null;
926 }
927
928 switch (ch0) {
929 case '^':
930 case '$':
931 case '\\':
932 case '.':
933 case '*':
934 case '+':
935 case '?':
936 case '(':
937 case ')':
938 case '[':
939 case '|':
940 return null;
941
942 case '}':
943 case ']':
944 final int n = expected.get(ch0);
945 if (n != 0) {
946 return null;
947 }
948
949 case '{':
950 // if not a valid quantifier escape curly brace to match itself
951 // this ensures compatibility with other JS implementations
952 final Token quant = quantifierPrefix();
953 return (quant == null) ? commit(new Token(Token.Type.PATTERN_CHARACTER).add("\\"), 1) : null;
954
955 default:
956 return commit(new Token(Token.Type.PATTERN_CHARACTER), 1); // SOURCECHARACTER
957 }
958 }
959
960 /*
961 * AtomEscape ::
962 * DecimalEscape
963 * CharacterEscape
964 * CharacterClassEscape
965 */
966 private Token atomEscape() {
967 final Token token = new Token(Token.Type.ATOM_ESCAPE);
968 Token child;
969
970 child = decimalEscape();
971 if (child != null) {
972 return token.add(child);
973 }
974
975 child = characterClassEscape();
976 if (child != null) {
977 return token.add(child);
978 }
979
980 child = characterEscape();
981 if (child != null) {
982 return token.add(child);
983 }
984
985
986 return null;
987 }
988
989 /*
990 * CharacterEscape ::
991 * ControlEscape
992 * c ControlLetter
993 * HexEscapeSequence
994 * UnicodeEscapeSequence
995 * IdentityEscape
996 */
997 private Token characterEscape() {
998 final int startIn = position;
999 final int startOut = sb.length();
1000
1001 final Token token = new Token(Token.Type.CHARACTER_ESCAPE);
1002 Token child;
1003
1004 child = controlEscape();
1005 if (child != null) {
1006 return token.add(child);
1007 }
1008
1009 if (ch0 == 'c') {
1010 commit(token, 1);
1011 child = controlLetter();
1012 if (child != null) {
1013 return token.add(child);
1014 }
1015 restart(startIn, startOut);
1016 }
1017
1018 child = hexEscapeSequence();
1019 if (child != null) {
1020 return token.add(child);
1021 }
1022
1023 child = unicodeEscapeSequence();
1024 if (child != null) {
1025 return token.add(child);
1026 }
1027
1028 child = identityEscape();
1029 if (child != null) {
1030 return token.add(child);
1031 }
1032
1033 restart(startIn, startOut);
1034
1035 return null;
1036 }
1037
1038 private boolean scanEscapeSequence(final char leader, final int length, final Token token) {
1039 final int startIn = position;
1040 final int startOut = sb.length();
1041
1042 if (ch0 != leader) {
1043 return false;
1044 }
1045
1046 commit(token, 1);
1047 for (int i = 0; i < length; i++) {
1048 final char ch0l = Character.toLowerCase(ch0);
1049 if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) {
1050 commit(token, 1);
1051 } else {
1052 restart(startIn, startOut);
1053 return false;
1054 }
1055 }
1056
1057 return true;
1058 }
1059
1060 private Token hexEscapeSequence() {
1061 final Token token = new Token(Token.Type.HEX_ESCAPESEQUENCE);
1062 if (scanEscapeSequence('x', 2, token)) {
1063 return token;
1064 }
1065 return null;
1066 }
1067
1068 private Token unicodeEscapeSequence() {
1069 final Token token = new Token(Token.Type.UNICODE_ESCAPESEQUENCE);
1070 if (scanEscapeSequence('u', 4, token)) {
1071 return token;
1072 }
1073 return null;
1074 }
1075
1076 /*
1077 * ControlEscape ::
1078 * one of fnrtv
1079 */
1080 private Token controlEscape() {
1081 switch (ch0) {
1082 case 'f':
1083 case 'n':
1084 case 'r':
1085 case 't':
1086 case 'v':
1087 return commit(new Token(Token.Type.CONTROL_ESCAPE), 1);
1088
1089 default:
1090 return null;
1091 }
1092 }
1093
1094 /*
1095 * ControlLetter ::
1096 * one of abcdefghijklmnopqrstuvwxyz
1097 * ABCDEFGHIJKLMNOPQRSTUVWXYZ
1098 */
1099 private Token controlLetter() {
1100 final char c = Character.toUpperCase(ch0);
1101 if (c >= 'A' && c <= 'Z') {
1102 final Token token = new Token(Token.Type.CONTROL_LETTER);
1103 commit(token, 1);
1104 return token;
1105 }
1106 return null;
1107 /*
1108 Token token = new Token(Token.Type.CONTROL_LETTER);
1109 commit(null, 1);//add original char to builder not to token
1110 this.neverMatches = c < 'A' || c > 'Z';
1111 return token.add(""+c);*/
1112 }
1113
1114 /*
1115 * IdentityEscape ::
1116 * SourceCharacter but not IdentifierPart
1117 * <ZWJ> (200c)
1118 * <ZWNJ> (200d)
1119 */
1120 private Token identityEscape() {
1121 final Token token = new Token(Token.Type.IDENTITY_ESCAPE);
1122 commit(token, 1);
1123 return token;
1124 }
1125
1126 /*
1127 * DecimalEscape ::
1128 * DecimalIntegerLiteral [lookahead DecimalDigit]
1129 */
1130 private Token decimalEscape() {
1131 final Token token = new Token(Token.Type.DECIMAL_ESCAPE);
1132 final int startIn = position;
1133 final int startOut = sb.length();
1134
1135 if (ch0 == '0' && !isDecimalDigit(ch1)) {
1136 commit(token, 1);
1137 token.removeLast();
1138 // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000);
1139 return token.add("\u0000");
1140 }
1141
1142 if (isDecimalDigit(ch0)) {
1143 while (isDecimalDigit(ch0)) {
1144 commit(token, 1);
1145 }
1146 return token;
1147 }
1148
1149 restart(startIn, startOut);
1150
1151 return null;
1152 }
1153
1154 /*
1155 * CharacterClassEscape ::
1156 * one of dDsSwW
1157 */
1158 private Token characterClassEscape() {
1159 switch (ch0) {
1160 case 's':
1161 case 'S':
1162 case 'd':
1163 case 'D':
1164 case 'w':
1165 case 'W':
1166 return commit(new Token(Token.Type.CHARACTERCLASS_ESCAPE), 1);
1167
1168 default:
1169 return null;
1170 }
1171 }
1172
1173 /*
1174 * CharacterClass ::
1175 * [ [lookahead {^}] ClassRanges ]
1176 * [ ^ ClassRanges ]
1177 */
1178 private Token characterClass() {
1179 final int startIn = position;
1180 final int startOut = sb.length();
1181 final Token token = new Token(Token.Type.CHARACTERCLASS);
1182
1183 if (ch0 == '[') {
1184 push(']');
1185 commit(token, 1);
1186
1187 if (ch0 == '^') {
1188 commit(token, 1);
1189 }
1190
1191 final Token child = classRanges();
1192 if (child != null && ch0 == ']') {
1193 pop(']');
1194 token.add(child);
1195 return commit(token, 1);
1196 }
1197 }
1198
1199 restart(startIn, startOut);
1200 return null;
1201 }
1202
1203 /*
1204 * ClassRanges ::
1205 * [empty]
1206 * NonemptyClassRanges
1207 */
1208 private Token classRanges() {
1209 return new Token(Token.Type.CLASSRANGES).add(nonemptyClassRanges());
1210 }
1211
1212 /*
1213 * NonemptyClassRanges ::
1214 * ClassAtom
1215 * ClassAtom NonemptyClassRangesNoDash
1216 * ClassAtom - ClassAtom ClassRanges
1217 */
1218 private Token nonemptyClassRanges() {
1219 final int startIn = position;
1220 final int startOut = sb.length();
1221 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES);
1222 Token child;
1223
1224 child = classAtom();
1225 if (child != null) {
1226 token.add(child);
1227
1228 if (ch0 == '-') {
1229 commit(token, 1);
1230
1231 final Token child1 = classAtom();
1232 final Token child2 = classRanges();
1233 if (child1 != null && child2 != null) {
1234 token.add(child1);
1235 token.add(child2);
1236
1237 return token;
1238 }
1239 }
1240
1241 child = nonemptyClassRangesNoDash();
1242 if (child != null) {
1243 token.add(child);
1244 return token;
1245 }
1246
1247 return token;
1248 }
1249
1250 restart(startIn, startOut);
1251 return null;
1252 }
1253
1254 /*
1255 * NonemptyClassRangesNoDash ::
1256 * ClassAtom
1257 * ClassAtomNoDash NonemptyClassRangesNoDash
1258 * ClassAtomNoDash - ClassAtom ClassRanges
1259 */
1260 private Token nonemptyClassRangesNoDash() {
1261 final int startIn = position;
1262 final int startOut = sb.length();
1263 final Token token = new Token(Token.Type.NON_EMPTY_CLASSRANGES_NODASH);
1264 Token child;
1265
1266 child = classAtomNoDash();
1267 if (child != null) {
1268 token.add(child);
1269
1270 // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom
1271 if (ch0 == '-') {
1272 commit(token, 1);
1273
1274 final Token child1 = classAtom();
1275 final Token child2 = classRanges();
1276 if (child1 != null && child2 != null) {
1277 token.add(child1);
1278 return token.add(child2);
1279 }
1280 //fallthru
1281 }
1282
1283 child = nonemptyClassRangesNoDash();
1284 if (child != null) {
1285 token.add(child);
1286 }
1287 return token; // still a class atom
1288 }
1289
1290 child = classAtom();
1291 if (child != null) {
1292 return token.add(child);
1293 }
1294
1295 restart(startIn, startOut);
1296 return null;
1297 }
1298
1299 /*
1300 * ClassAtom : - ClassAtomNoDash
1301 */
1302 private Token classAtom() {
1303 final Token token = new Token(Token.Type.CLASSATOM);
1304
1305 if (ch0 == '-') {
1306 return commit(token, 1);
1307 }
1308
1309 final Token child = classAtomNoDash();
1310 if (child != null) {
1311 return token.add(child);
1312 }
1313
1314 return null;
1315 }
1316
1317 /*
1318 * ClassAtomNoDash ::
1319 * SourceCharacter but not one of \ or ] or -
1320 * \ ClassEscape
1321 */
1322 private Token classAtomNoDash() {
1323 final int startIn = position;
1324 final int startOut = sb.length();
1325 final Token token = new Token(Token.Type.CLASSATOM_NODASH);
1326
1327 switch (ch0) {
1328 case ']':
1329 case '-':
1330 case '\0':
1331 return null;
1332
1333 case '[':
1334 // unescaped left square bracket - add escape
1335 return commit(token.add("\\"), 1);
1336
1337 case '\\':
1338 commit(token, 1);
1339 final Token child = classEscape();
1340 if (child != null) {
1341 return token.add(child);
1342 }
1343
1344 restart(startIn, startOut);
1345 return null;
1346
1347 default:
1348 return commit(token, 1);
1349 }
1350 }
1351
1352 /*
1353 * ClassEscape ::
1354 * DecimalEscape
1355 * b
1356 * CharacterEscape
1357 * CharacterClassEscape
1358 */
1359 private Token classEscape() {
1360 final Token token = new Token(Token.Type.CLASS_ESCAPE);
1361 Token child;
1362
1363 child = decimalEscape();
1364 if (child != null) {
1365 return token.add(child);
1366 }
1367
1368 if (ch0 == 'b') {
1369 return commit(token, 1);
1370 }
1371
1372 child = characterEscape();
1373 if (child != null) {
1374 return token.add(child);
1375 }
1376
1377 child = characterClassEscape();
1378 if (child != null) {
1379 return token.add(child);
1380 }
1381
1382 return null;
1383 }
1384
1385 /*
1386 * DecimalDigits
1387 */
1388 private Token decimalDigits() {
1389 if (!isDecimalDigit(ch0)) {
1390 return null;
1391 }
1392
1393 final Token token = new Token(Token.Type.DECIMALDIGITS);
1394 while (isDecimalDigit(ch0)) {
1395 commit(token, 1);
1396 }
1397
1398 return token;
1399 }
1400
1401 private static boolean isDecimalDigit(final char ch) {
1402 return ch >= '0' && ch <= '9';
1403 }
1404 }
--- EOF ---