1 /* 2 * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package jdk.nashorn.internal.runtime.regexp; 27 28 import java.util.HashMap; 29 import java.util.Iterator; 30 import java.util.LinkedList; 31 import java.util.List; 32 import java.util.Map; 33 import java.util.regex.PatternSyntaxException; 34 import jdk.nashorn.internal.parser.Lexer; 35 import jdk.nashorn.internal.parser.Scanner; 36 import jdk.nashorn.internal.runtime.BitVector; 37 38 /** 39 * Scan a JavaScript regexp, converting to Java regex if necessary. 40 * 41 */ 42 final class RegExpScanner extends Scanner { 43 44 /** 45 * String builder used to rewrite the pattern for the currently used regexp factory. 46 */ 47 private final StringBuilder sb; 48 49 /** Expected token table */ 50 private final Map<Character, Integer> expected = new HashMap<>(); 51 52 /** Capturing parenthesis that have been found so far. */ 53 private final List<Capture> caps = new LinkedList<>(); 54 55 /** Forward references to capturing parenthesis to be resolved later.*/ 56 private final LinkedList<Integer> forwardReferences = new LinkedList<>(); 57 58 /** Current level of zero-width negative lookahead assertions. */ 59 private int negLookaheadLevel; 60 61 /** Sequential id of current top-level zero-width negative lookahead assertion. */ 62 private int negLookaheadGroup; 63 64 /** Are we currently inside a character class? */ 65 private boolean inCharClass = false; 66 67 /** Are we currently inside a negated character class? */ 68 private boolean inNegativeClass = false; 69 70 private static final String NON_IDENT_ESCAPES = "$^*+(){}[]|\\.?-"; 71 72 private static class Capture { 73 /** Zero-width negative lookaheads enclosing the capture. */ 74 private final int negLookaheadLevel; 75 /** Sequential id of top-level negative lookaheads containing the capture. */ 76 private final int negLookaheadGroup; 77 78 Capture(final int negLookaheadGroup, final int negLookaheadLevel) { 79 this.negLookaheadGroup = negLookaheadGroup; 80 this.negLookaheadLevel = negLookaheadLevel; 81 } 82 83 /** 84 * Returns true if this Capture can be referenced from the position specified by the 85 * group and level parameters. This is the case if either the group is not within 86 * a negative lookahead, or the position of the referrer is in the same negative lookahead. 87 * 88 * @param group current negative lookahead group 89 * @param level current negative lokahead level 90 * @return true if this capture group can be referenced from the given position 91 */ 92 boolean canBeReferencedFrom(final int group, final int level) { 93 return this.negLookaheadLevel == 0 || (group == this.negLookaheadGroup && level >= this.negLookaheadLevel); 94 } 95 96 } 97 98 /** 99 * Constructor 100 * @param string the JavaScript regexp to parse 101 */ 102 private RegExpScanner(final String string) { 103 super(string); 104 sb = new StringBuilder(limit); 105 reset(0); 106 expected.put(']', 0); 107 expected.put('}', 0); 108 } 109 110 private void processForwardReferences() { 111 112 final Iterator<Integer> iterator = forwardReferences.descendingIterator(); 113 while (iterator.hasNext()) { 114 final int pos = iterator.next(); 115 final int num = iterator.next(); 116 if (num > caps.size()) { 117 // Non-existing backreference. If the number begins with a valid octal convert it to 118 // Unicode escape and append the rest to a literal character sequence. 119 final StringBuilder buffer = new StringBuilder(); 120 octalOrLiteral(Integer.toString(num), buffer); 121 sb.insert(pos, buffer); 122 } 123 } 124 125 forwardReferences.clear(); 126 } 127 128 /** 129 * Scan a JavaScript regexp string returning a Java safe regex string. 130 * 131 * @param string 132 * JavaScript regexp string. 133 * @return Java safe regex string. 134 */ 135 public static RegExpScanner scan(final String string) { 136 final RegExpScanner scanner = new RegExpScanner(string); 137 138 try { 139 scanner.disjunction(); 140 } catch (final Exception e) { 141 throw new PatternSyntaxException(e.getMessage(), string, scanner.position); 142 } 143 144 // Throw syntax error unless we parsed the entire JavaScript regexp without syntax errors 145 if (scanner.position != string.length()) { 146 final String p = scanner.getStringBuilder().toString(); 147 throw new PatternSyntaxException(string, p, p.length() + 1); 148 } 149 150 scanner.processForwardReferences(); 151 152 return scanner; 153 } 154 155 final StringBuilder getStringBuilder() { 156 return sb; 157 } 158 159 String getJavaPattern() { 160 return sb.toString(); 161 } 162 163 BitVector getGroupsInNegativeLookahead() { 164 BitVector vec = null; 165 for (int i = 0; i < caps.size(); i++) { 166 final Capture cap = caps.get(i); 167 if (cap.negLookaheadLevel > 0) { 168 if (vec == null) { 169 vec = new BitVector(caps.size() + 1); 170 } 171 vec.set(i + 1); 172 } 173 } 174 return vec; 175 } 176 177 /** 178 * Commit n characters to the builder and to a given token 179 * @param n Number of characters. 180 * @return Committed token 181 */ 182 private boolean commit(final int n) { 183 switch (n) { 184 case 1: 185 sb.append(ch0); 186 skip(1); 187 break; 188 case 2: 189 sb.append(ch0); 190 sb.append(ch1); 191 skip(2); 192 break; 193 case 3: 194 sb.append(ch0); 195 sb.append(ch1); 196 sb.append(ch2); 197 skip(3); 198 break; 199 default: 200 assert false : "Should not reach here"; 201 } 202 203 return true; 204 } 205 206 /** 207 * Restart the buffers back at an earlier position. 208 * 209 * @param startIn 210 * Position in the input stream. 211 * @param startOut 212 * Position in the output stream. 213 */ 214 private void restart(final int startIn, final int startOut) { 215 reset(startIn); 216 sb.setLength(startOut); 217 } 218 219 private void push(final char ch) { 220 expected.put(ch, expected.get(ch) + 1); 221 } 222 223 private void pop(final char ch) { 224 expected.put(ch, Math.min(0, expected.get(ch) - 1)); 225 } 226 227 /* 228 * Recursive descent tokenizer starts below. 229 */ 230 231 /* 232 * Disjunction :: 233 * Alternative 234 * Alternative | Disjunction 235 */ 236 private void disjunction() { 237 while (true) { 238 alternative(); 239 240 if (ch0 == '|') { 241 commit(1); 242 } else { 243 break; 244 } 245 } 246 } 247 248 /* 249 * Alternative :: 250 * [empty] 251 * Alternative Term 252 */ 253 private void alternative() { 254 while (term()) { 255 // do nothing 256 } 257 } 258 259 /* 260 * Term :: 261 * Assertion 262 * Atom 263 * Atom Quantifier 264 */ 265 private boolean term() { 266 final int startIn = position; 267 final int startOut = sb.length(); 268 269 if (assertion()) { 270 return true; 271 } 272 273 if (atom()) { 274 quantifier(); 275 return true; 276 } 277 278 restart(startIn, startOut); 279 return false; 280 } 281 282 /* 283 * Assertion :: 284 * ^ 285 * $ 286 * \b 287 * \B 288 * ( ? = Disjunction ) 289 * ( ? ! Disjunction ) 290 */ 291 private boolean assertion() { 292 final int startIn = position; 293 final int startOut = sb.length(); 294 295 switch (ch0) { 296 case '^': 297 case '$': 298 return commit(1); 299 300 case '\\': 301 if (ch1 == 'b' || ch1 == 'B') { 302 return commit(2); 303 } 304 break; 305 306 case '(': 307 if (ch1 != '?') { 308 break; 309 } 310 if (ch2 != '=' && ch2 != '!') { 311 break; 312 } 313 final boolean isNegativeLookahead = (ch2 == '!'); 314 commit(3); 315 316 if (isNegativeLookahead) { 317 if (negLookaheadLevel == 0) { 318 negLookaheadGroup++; 319 } 320 negLookaheadLevel++; 321 } 322 disjunction(); 323 if (isNegativeLookahead) { 324 negLookaheadLevel--; 325 } 326 327 if (ch0 == ')') { 328 return commit(1); 329 } 330 break; 331 332 default: 333 break; 334 } 335 336 restart(startIn, startOut); 337 return false; 338 } 339 340 /* 341 * Quantifier :: 342 * QuantifierPrefix 343 * QuantifierPrefix ? 344 */ 345 private boolean quantifier() { 346 if (quantifierPrefix()) { 347 if (ch0 == '?') { 348 commit(1); 349 } 350 return true; 351 } 352 return false; 353 } 354 355 /* 356 * QuantifierPrefix :: 357 * * 358 * + 359 * ? 360 * { DecimalDigits } 361 * { DecimalDigits , } 362 * { DecimalDigits , DecimalDigits } 363 */ 364 private boolean quantifierPrefix() { 365 final int startIn = position; 366 final int startOut = sb.length(); 367 368 switch (ch0) { 369 case '*': 370 case '+': 371 case '?': 372 return commit(1); 373 374 case '{': 375 commit(1); 376 377 if (!decimalDigits()) { 378 break; // not a quantifier - back out 379 } 380 push('}'); 381 382 if (ch0 == ',') { 383 commit(1); 384 decimalDigits(); 385 } 386 387 if (ch0 == '}') { 388 pop('}'); 389 commit(1); 390 } else { 391 // Bad quantifier should be rejected but is accepted by all major engines 392 restart(startIn, startOut); 393 return false; 394 } 395 396 return true; 397 398 default: 399 break; 400 } 401 402 restart(startIn, startOut); 403 return false; 404 } 405 406 /* 407 * Atom :: 408 * PatternCharacter 409 * . 410 * \ AtomEscape 411 * CharacterClass 412 * ( Disjunction ) 413 * ( ? : Disjunction ) 414 * 415 */ 416 private boolean atom() { 417 final int startIn = position; 418 final int startOut = sb.length(); 419 420 if (patternCharacter()) { 421 return true; 422 } 423 424 if (ch0 == '.') { 425 return commit(1); 426 } 427 428 if (ch0 == '\\') { 429 commit(1); 430 431 if (atomEscape()) { 432 return true; 433 } 434 } 435 436 if (characterClass()) { 437 return true; 438 } 439 440 if (ch0 == '(') { 441 commit(1); 442 if (ch0 == '?' && ch1 == ':') { 443 commit(2); 444 } else { 445 caps.add(new Capture(negLookaheadGroup, negLookaheadLevel)); 446 } 447 448 disjunction(); 449 450 if (ch0 == ')') { 451 commit(1); 452 return true; 453 } 454 } 455 456 restart(startIn, startOut); 457 return false; 458 } 459 460 /* 461 * PatternCharacter :: 462 * SourceCharacter but not any of: ^$\.*+?()[]{}| 463 */ 464 @SuppressWarnings("fallthrough") 465 private boolean patternCharacter() { 466 if (atEOF()) { 467 return false; 468 } 469 470 switch (ch0) { 471 case '^': 472 case '$': 473 case '\\': 474 case '.': 475 case '*': 476 case '+': 477 case '?': 478 case '(': 479 case ')': 480 case '[': 481 case '|': 482 return false; 483 484 case '}': 485 case ']': 486 final int n = expected.get(ch0); 487 if (n != 0) { 488 return false; 489 } 490 491 case '{': 492 // if not a valid quantifier escape curly brace to match itself 493 // this ensures compatibility with other JS implementations 494 if (!quantifierPrefix()) { 495 sb.append('\\'); 496 return commit(1); 497 } 498 return false; 499 500 default: 501 return commit(1); // SOURCECHARACTER 502 } 503 } 504 505 /* 506 * AtomEscape :: 507 * DecimalEscape 508 * CharacterEscape 509 * CharacterClassEscape 510 */ 511 private boolean atomEscape() { 512 // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all 513 return decimalEscape() || characterClassEscape() || characterEscape() || identityEscape(); 514 } 515 516 /* 517 * CharacterEscape :: 518 * ControlEscape 519 * c ControlLetter 520 * HexEscapeSequence 521 * UnicodeEscapeSequence 522 * IdentityEscape 523 */ 524 private boolean characterEscape() { 525 final int startIn = position; 526 final int startOut = sb.length(); 527 528 if (controlEscape()) { 529 return true; 530 } 531 532 if (ch0 == 'c') { 533 commit(1); 534 if (controlLetter()) { 535 return true; 536 } 537 restart(startIn, startOut); 538 } 539 540 if (hexEscapeSequence() || unicodeEscapeSequence()) { 541 return true; 542 } 543 544 restart(startIn, startOut); 545 return false; 546 } 547 548 private boolean scanEscapeSequence(final char leader, final int length) { 549 final int startIn = position; 550 final int startOut = sb.length(); 551 552 if (ch0 != leader) { 553 return false; 554 } 555 556 commit(1); 557 for (int i = 0; i < length; i++) { 558 final char ch0l = Character.toLowerCase(ch0); 559 if ((ch0l >= 'a' && ch0l <= 'f') || isDecimalDigit(ch0)) { 560 commit(1); 561 } else { 562 restart(startIn, startOut); 563 return false; 564 } 565 } 566 567 return true; 568 } 569 570 private boolean hexEscapeSequence() { 571 return scanEscapeSequence('x', 2); 572 } 573 574 private boolean unicodeEscapeSequence() { 575 return scanEscapeSequence('u', 4); 576 } 577 578 /* 579 * ControlEscape :: 580 * one of fnrtv 581 */ 582 private boolean controlEscape() { 583 switch (ch0) { 584 case 'f': 585 case 'n': 586 case 'r': 587 case 't': 588 case 'v': 589 return commit(1); 590 591 default: 592 return false; 593 } 594 } 595 596 /* 597 * ControlLetter :: 598 * one of abcdefghijklmnopqrstuvwxyz 599 * ABCDEFGHIJKLMNOPQRSTUVWXYZ 600 */ 601 private boolean controlLetter() { 602 // To match other engines we also accept '0'..'9' and '_' as control letters inside a character class. 603 if ((ch0 >= 'A' && ch0 <= 'Z') || (ch0 >= 'a' && ch0 <= 'z') 604 || (inCharClass && (isDecimalDigit(ch0) || ch0 == '_'))) { 605 // for some reason java regexps don't like control characters on the 606 // form "\\ca".match([string with ascii 1 at char0]). Translating 607 // them to unicode does it though. 608 sb.setLength(sb.length() - 1); 609 unicode(ch0 % 32, sb); 610 skip(1); 611 return true; 612 } 613 return false; 614 } 615 616 /* 617 * IdentityEscape :: 618 * SourceCharacter but not IdentifierPart 619 * <ZWJ> (200c) 620 * <ZWNJ> (200d) 621 */ 622 private boolean identityEscape() { 623 if (atEOF()) { 624 throw new RuntimeException("\\ at end of pattern"); // will be converted to PatternSyntaxException 625 } 626 // ES 5.1 A.7 requires "not IdentifierPart" here but all major engines accept any character here. 627 if (ch0 == 'c') { 628 sb.append('\\'); // Treat invalid \c control sequence as \\c 629 } else if (NON_IDENT_ESCAPES.indexOf(ch0) == -1) { 630 sb.setLength(sb.length() - 1); 631 } 632 return commit(1); 633 } 634 635 /* 636 * DecimalEscape :: 637 * DecimalIntegerLiteral [lookahead DecimalDigit] 638 */ 639 private boolean decimalEscape() { 640 final int startIn = position; 641 final int startOut = sb.length(); 642 643 if (ch0 == '0' && !isOctalDigit(ch1)) { 644 skip(1); 645 // DecimalEscape :: 0. If i is zero, return the EscapeValue consisting of a <NUL> character (Unicodevalue0000); 646 sb.append("\u0000"); 647 return true; 648 } 649 650 if (isDecimalDigit(ch0)) { 651 652 if (ch0 == '0') { 653 // We know this is an octal escape. 654 if (inCharClass) { 655 // Convert octal escape to unicode escape if inside character class. 656 int octalValue = 0; 657 while (isOctalDigit(ch0)) { 658 octalValue = octalValue * 8 + ch0 - '0'; 659 skip(1); 660 } 661 662 unicode(octalValue, sb); 663 664 } else { 665 // Copy decimal escape as-is 666 decimalDigits(); 667 } 668 } else { 669 // This should be a backreference, but could also be an octal escape or even a literal string. 670 int decimalValue = 0; 671 while (isDecimalDigit(ch0)) { 672 decimalValue = decimalValue * 10 + ch0 - '0'; 673 skip(1); 674 } 675 676 if (inCharClass) { 677 // No backreferences in character classes. Encode as unicode escape or literal char sequence 678 sb.setLength(sb.length() - 1); 679 octalOrLiteral(Integer.toString(decimalValue), sb); 680 681 } else if (decimalValue <= caps.size()) { 682 // Captures inside a negative lookahead are undefined when referenced from the outside. 683 final Capture capture = caps.get(decimalValue - 1); 684 if (!capture.canBeReferencedFrom(negLookaheadGroup, negLookaheadLevel)) { 685 // Outside reference to capture in negative lookahead, omit from output buffer. 686 sb.setLength(sb.length() - 1); 687 } else { 688 // Append backreference to output buffer. 689 sb.append(decimalValue); 690 } 691 } else { 692 // Forward references to a capture group are always undefined so we can omit it from the output buffer. 693 // However, if the target capture does not exist, we need to rewrite the reference as hex escape 694 // or literal string, so register the reference for later processing. 695 sb.setLength(sb.length() - 1); 696 forwardReferences.add(decimalValue); 697 forwardReferences.add(sb.length()); 698 } 699 700 } 701 return true; 702 } 703 704 restart(startIn, startOut); 705 return false; 706 } 707 708 /* 709 * CharacterClassEscape :: 710 * one of dDsSwW 711 */ 712 private boolean characterClassEscape() { 713 switch (ch0) { 714 // java.util.regex requires translation of \s and \S to explicit character list 715 case 's': 716 if (RegExpFactory.usesJavaUtilRegex()) { 717 sb.setLength(sb.length() - 1); 718 // No nested class required if we already are inside a character class 719 if (inCharClass) { 720 sb.append(Lexer.getWhitespaceRegExp()); 721 } else { 722 sb.append('[').append(Lexer.getWhitespaceRegExp()).append(']'); 723 } 724 skip(1); 725 return true; 726 } 727 return commit(1); 728 case 'S': 729 if (RegExpFactory.usesJavaUtilRegex()) { 730 sb.setLength(sb.length() - 1); 731 // In negative class we must use intersection to get double negation ("not anything else than space") 732 sb.append(inNegativeClass ? "&&[" : "[^").append(Lexer.getWhitespaceRegExp()).append(']'); 733 skip(1); 734 return true; 735 } 736 return commit(1); 737 case 'd': 738 case 'D': 739 case 'w': 740 case 'W': 741 return commit(1); 742 743 default: 744 return false; 745 } 746 } 747 748 /* 749 * CharacterClass :: 750 * [ [lookahead {^}] ClassRanges ] 751 * [ ^ ClassRanges ] 752 */ 753 private boolean characterClass() { 754 final int startIn = position; 755 final int startOut = sb.length(); 756 757 if (ch0 == '[') { 758 try { 759 inCharClass = true; 760 push(']'); 761 commit(1); 762 763 if (ch0 == '^') { 764 inNegativeClass = true; 765 commit(1); 766 } 767 768 if (classRanges() && ch0 == ']') { 769 pop(']'); 770 commit(1); 771 772 // Substitute empty character classes [] and [^] that never or always match 773 if (position == startIn + 2) { 774 sb.setLength(sb.length() - 1); 775 sb.append("^\\s\\S]"); 776 } else if (position == startIn + 3 && inNegativeClass) { 777 sb.setLength(sb.length() - 2); 778 sb.append("\\s\\S]"); 779 } 780 781 return true; 782 } 783 } finally { 784 inCharClass = false; // no nested character classes in JavaScript 785 inNegativeClass = false; 786 } 787 } 788 789 restart(startIn, startOut); 790 return false; 791 } 792 793 /* 794 * ClassRanges :: 795 * [empty] 796 * NonemptyClassRanges 797 */ 798 private boolean classRanges() { 799 nonemptyClassRanges(); 800 return true; 801 } 802 803 /* 804 * NonemptyClassRanges :: 805 * ClassAtom 806 * ClassAtom NonemptyClassRangesNoDash 807 * ClassAtom - ClassAtom ClassRanges 808 */ 809 private boolean nonemptyClassRanges() { 810 final int startIn = position; 811 final int startOut = sb.length(); 812 813 if (classAtom()) { 814 815 if (ch0 == '-') { 816 commit(1); 817 818 if (classAtom() && classRanges()) { 819 return true; 820 } 821 } 822 823 nonemptyClassRangesNoDash(); 824 825 return true; 826 } 827 828 restart(startIn, startOut); 829 return false; 830 } 831 832 /* 833 * NonemptyClassRangesNoDash :: 834 * ClassAtom 835 * ClassAtomNoDash NonemptyClassRangesNoDash 836 * ClassAtomNoDash - ClassAtom ClassRanges 837 */ 838 private boolean nonemptyClassRangesNoDash() { 839 final int startIn = position; 840 final int startOut = sb.length(); 841 842 if (classAtomNoDash()) { 843 844 // need to check dash first, as for e.g. [a-b|c-d] will otherwise parse - as an atom 845 if (ch0 == '-') { 846 commit(1); 847 848 if (classAtom() && classRanges()) { 849 return true; 850 } 851 //fallthru 852 } 853 854 nonemptyClassRangesNoDash(); 855 return true; // still a class atom 856 } 857 858 if (classAtom()) { 859 return true; 860 } 861 862 restart(startIn, startOut); 863 return false; 864 } 865 866 /* 867 * ClassAtom : - ClassAtomNoDash 868 */ 869 private boolean classAtom() { 870 871 if (ch0 == '-') { 872 return commit(1); 873 } 874 875 return classAtomNoDash(); 876 } 877 878 /* 879 * ClassAtomNoDash :: 880 * SourceCharacter but not one of \ or ] or - 881 * \ ClassEscape 882 */ 883 private boolean classAtomNoDash() { 884 if (atEOF()) { 885 return false; 886 } 887 final int startIn = position; 888 final int startOut = sb.length(); 889 890 switch (ch0) { 891 case ']': 892 case '-': 893 return false; 894 895 case '[': 896 // unescaped left square bracket - add escape 897 sb.append('\\'); 898 return commit(1); 899 900 case '\\': 901 commit(1); 902 if (classEscape()) { 903 return true; 904 } 905 906 restart(startIn, startOut); 907 return false; 908 909 default: 910 return commit(1); 911 } 912 } 913 914 /* 915 * ClassEscape :: 916 * DecimalEscape 917 * b 918 * CharacterEscape 919 * CharacterClassEscape 920 */ 921 private boolean classEscape() { 922 923 if (decimalEscape()) { 924 return true; 925 } 926 927 if (ch0 == 'b') { 928 sb.setLength(sb.length() - 1); 929 sb.append('\b'); 930 skip(1); 931 return true; 932 } 933 934 // Note that contrary to ES 5.1 spec we put identityEscape() last because it acts as a catch-all 935 return characterEscape() || characterClassEscape() || identityEscape(); 936 } 937 938 /* 939 * DecimalDigits 940 */ 941 private boolean decimalDigits() { 942 if (!isDecimalDigit(ch0)) { 943 return false; 944 } 945 946 while (isDecimalDigit(ch0)) { 947 commit(1); 948 } 949 950 return true; 951 } 952 953 private static void unicode(final int value, final StringBuilder buffer) { 954 final String hex = Integer.toHexString(value); 955 buffer.append('u'); 956 for (int i = 0; i < 4 - hex.length(); i++) { 957 buffer.append('0'); 958 } 959 buffer.append(hex); 960 } 961 962 // Convert what would have been a backreference into a unicode escape, or a number literal, or both. 963 private static void octalOrLiteral(final String numberLiteral, final StringBuilder buffer) { 964 final int length = numberLiteral.length(); 965 int octalValue = 0; 966 int pos = 0; 967 // Maximum value for octal escape is 0377 (255) so we stop the loop at 32 968 while (pos < length && octalValue < 0x20) { 969 final char ch = numberLiteral.charAt(pos); 970 if (isOctalDigit(ch)) { 971 octalValue = octalValue * 8 + ch - '0'; 972 } else { 973 break; 974 } 975 pos++; 976 } 977 if (octalValue > 0) { 978 buffer.append('\\'); 979 unicode(octalValue, buffer); 980 buffer.append(numberLiteral.substring(pos)); 981 } else { 982 buffer.append(numberLiteral); 983 } 984 } 985 986 private static boolean isOctalDigit(final char ch) { 987 return ch >= '0' && ch <= '7'; 988 } 989 990 private static boolean isDecimalDigit(final char ch) { 991 return ch >= '0' && ch <= '9'; 992 } 993 }