1 /* 2 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package jdk.nashorn.internal.parser; 27 28 import static jdk.nashorn.internal.parser.TokenType.ADD; 29 import static jdk.nashorn.internal.parser.TokenType.DECIMAL; 30 import static jdk.nashorn.internal.parser.TokenType.EOF; 31 import static jdk.nashorn.internal.parser.TokenType.EOL; 32 import static jdk.nashorn.internal.parser.TokenType.ERROR; 33 import static jdk.nashorn.internal.parser.TokenType.ESCSTRING; 34 import static jdk.nashorn.internal.parser.TokenType.EXECSTRING; 35 import static jdk.nashorn.internal.parser.TokenType.FLOATING; 36 import static jdk.nashorn.internal.parser.TokenType.HEXADECIMAL; 37 import static jdk.nashorn.internal.parser.TokenType.LBRACE; 38 import static jdk.nashorn.internal.parser.TokenType.LPAREN; 39 import static jdk.nashorn.internal.parser.TokenType.OCTAL; 40 import static jdk.nashorn.internal.parser.TokenType.RBRACE; 41 import static jdk.nashorn.internal.parser.TokenType.REGEX; 42 import static jdk.nashorn.internal.parser.TokenType.RPAREN; 43 import static jdk.nashorn.internal.parser.TokenType.STRING; 44 import static jdk.nashorn.internal.parser.TokenType.XML; 45 46 import jdk.nashorn.internal.runtime.ECMAErrors; 47 import jdk.nashorn.internal.runtime.ErrorManager; 48 import jdk.nashorn.internal.runtime.JSErrorType; 49 import jdk.nashorn.internal.runtime.JSType; 50 import jdk.nashorn.internal.runtime.ParserException; 51 import jdk.nashorn.internal.runtime.Source; 52 import jdk.nashorn.internal.runtime.options.Options; 53 54 /** 55 * Responsible for converting source content into a stream of tokens. 56 * 57 */ 58 @SuppressWarnings("fallthrough") 59 public class Lexer extends Scanner { 60 private static final long MIN_INT_L = Integer.MIN_VALUE; 61 private static final long MAX_INT_L = Integer.MAX_VALUE; 62 63 private static final boolean XML_LITERALS = Options.getBooleanProperty("nashorn.lexer.xmlliterals"); 64 65 /** Content source. */ 66 private final Source source; 67 68 /** Buffered stream for tokens. 
*/ 69 private final TokenStream stream; 70 71 /** True if here and edit strings are supported. */ 72 private final boolean scripting; 73 74 /** True if a nested scan. (scan to completion, no EOF.) */ 75 private final boolean nested; 76 77 /** Pending new line number and position. */ 78 private int pendingLine; 79 80 /** Position of last EOL + 1. */ 81 private int linePosition; 82 83 /** Type of last token added. */ 84 private TokenType last; 85 86 private static final String JAVASCRIPT_WHITESPACE; 87 private static final String JAVASCRIPT_WHITESPACE_EOL; 88 private static final String JAVASCRIPT_WHITESPACE_IN_REGEXP; 89 90 private static final String JSON_WHITESPACE; 91 private static final String JSON_WHITESPACE_EOL; 92 93 static String unicodeEscape(final char ch) { 94 final StringBuilder sb = new StringBuilder(); 95 96 sb.append("\\u"); 97 98 final String hex = Integer.toHexString(ch); 99 for (int i = hex.length(); i < 4; i++) { 100 sb.append('0'); 101 } 102 sb.append(hex); 103 104 return sb.toString(); 105 } 106 107 static { 108 final StringBuilder ws = new StringBuilder(); 109 final StringBuilder wsEOL = new StringBuilder(); 110 final StringBuilder wsRegExp = new StringBuilder(); 111 final StringBuilder jsonWs = new StringBuilder(); 112 113 jsonWs.append((char)0x000a); 114 jsonWs.append((char)0x000d); 115 JSON_WHITESPACE_EOL = jsonWs.toString(); 116 117 jsonWs.append((char)0x0009); 118 jsonWs.append((char)0x0020); 119 JSON_WHITESPACE = jsonWs.toString(); 120 121 for (int i = 0; i <= 0xffff; i++) { 122 switch (i) { 123 case 0x000a: // line feed 124 case 0x000d: // carriage return (ctrl-m) 125 case 0x2028: // line separator 126 case 0x2029: // paragraph separator 127 wsEOL.append((char)i); 128 case 0x0009: // tab 129 case 0x0020: // ASCII space 130 case 0x000b: // tabulation line 131 case 0x000c: // ff (ctrl-l) 132 case 0x00a0: // Latin-1 space 133 case 0x1680: // Ogham space mark 134 case 0x180e: // separator, Mongolian vowel 135 case 0x2000: // en quad 136 
case 0x2001: // em quad 137 case 0x2002: // en space 138 case 0x2003: // em space 139 case 0x2004: // three-per-em space 140 case 0x2005: // four-per-em space 141 case 0x2006: // six-per-em space 142 case 0x2007: // figure space 143 case 0x2008: // punctuation space 144 case 0x2009: // thin space 145 case 0x200a: // hair space 146 case 0x202f: // narrow no-break space 147 case 0x205f: // medium mathematical space 148 case 0x3000: // ideographic space 149 case 0xfeff: // byte order mark 150 ws.append((char)i); 151 152 wsRegExp.append(Lexer.unicodeEscape((char)i)); 153 break; 154 155 default: 156 break; 157 } 158 } 159 160 JAVASCRIPT_WHITESPACE = ws.toString(); 161 JAVASCRIPT_WHITESPACE_EOL = wsEOL.toString(); 162 JAVASCRIPT_WHITESPACE_IN_REGEXP = wsRegExp.toString(); 163 164 } 165 166 /** 167 * Constructor 168 * 169 * @param source the source 170 * @param stream the token stream to lex 171 */ 172 public Lexer(final Source source, final TokenStream stream) { 173 this(source, stream, false); 174 } 175 176 /** 177 * Constructor 178 * 179 * @param source the source 180 * @param stream the token stream to lex 181 * @param scripting are we in scripting mode 182 */ 183 public Lexer(final Source source, final TokenStream stream, final boolean scripting) { 184 super(source.getContent(), 1, 0, source.getLength()); 185 186 this.source = source; 187 this.stream = stream; 188 this.scripting = scripting; 189 this.nested = false; 190 this.pendingLine = 1; 191 this.last = EOL; 192 } 193 194 private Lexer(final Lexer lexer, final State state) { 195 super(lexer, state); 196 197 source = lexer.source; 198 stream = lexer.stream; 199 scripting = lexer.scripting; 200 nested = true; 201 202 pendingLine = state.pendingLine; 203 linePosition = state.linePosition; 204 last = EOL; 205 } 206 207 static class State extends Scanner.State { 208 /** Pending new line number and position. */ 209 public final int pendingLine; 210 211 /** Position of last EOL + 1. 
*/ 212 public final int linePosition; 213 214 /** Type of last token added. */ 215 public final TokenType last; 216 217 /* 218 * Constructor. 219 */ 220 221 State(final int position, final int limit, final int line, final int pendingLine, final int linePosition, final TokenType last) { 222 super(position, limit, line); 223 224 this.pendingLine = pendingLine; 225 this.linePosition = linePosition; 226 this.last = last; 227 } 228 } 229 230 /** 231 * Save the state of the scan. 232 * 233 * @return Captured state. 234 */ 235 @Override 236 State saveState() { 237 return new State(position, limit, line, pendingLine, linePosition, last); 238 } 239 240 /** 241 * Restore the state of the scan. 242 * 243 * @param state 244 * Captured state. 245 */ 246 void restoreState(final State state) { 247 super.restoreState(state); 248 249 pendingLine = state.pendingLine; 250 linePosition = state.linePosition; 251 last = state.last; 252 } 253 254 /** 255 * Add a new token to the stream. 256 * 257 * @param type 258 * Token type. 259 * @param start 260 * Start position. 261 * @param end 262 * End position. 263 */ 264 protected void add(final TokenType type, final int start, final int end) { 265 // Record last token. 266 last = type; 267 268 // Only emit the last EOL in a cluster. 269 if (type == EOL) { 270 pendingLine = end; 271 linePosition = start; 272 } else { 273 // Write any pending EOL to stream. 274 if (pendingLine != -1) { 275 stream.put(Token.toDesc(EOL, linePosition, pendingLine)); 276 pendingLine = -1; 277 } 278 279 // Write token to stream. 280 stream.put(Token.toDesc(type, start, end - start)); 281 } 282 } 283 284 /** 285 * Add a new token to the stream. 286 * 287 * @param type 288 * Token type. 289 * @param start 290 * Start position. 
291 */ 292 protected void add(final TokenType type, final int start) { 293 add(type, start, position); 294 } 295 296 /** 297 * Return the String of valid whitespace characters for regular 298 * expressions in JavaScript 299 * @return regexp whitespace string 300 */ 301 public static String getWhitespaceRegExp() { 302 return JAVASCRIPT_WHITESPACE_IN_REGEXP; 303 } 304 305 /** 306 * Skip end of line. 307 * 308 * @param addEOL true if EOL token should be recorded. 309 */ 310 private void skipEOL(final boolean addEOL) { 311 312 if (ch0 == '\r') { // detect \r\n pattern 313 skip(1); 314 if (ch0 == '\n') { 315 skip(1); 316 } 317 } else { // all other space, ch0 is guaranteed to be EOL or \0 318 skip(1); 319 } 320 321 // bump up line count 322 line++; 323 324 if (addEOL) { 325 // Add an EOL token. 326 add(EOL, position, line); 327 } 328 } 329 330 /** 331 * Skip over rest of line including end of line. 332 * 333 * @param addEOL true if EOL token should be recorded. 334 */ 335 private void skipLine(final boolean addEOL) { 336 // Ignore characters. 337 while (!isEOL(ch0) && !atEOF()) { 338 skip(1); 339 } 340 // Skip over end of line. 
341 skipEOL(addEOL); 342 } 343 344 /** 345 * Test whether a char is valid JavaScript whitespace 346 * @param ch a char 347 * @return true if valid JavaScript whitespace 348 */ 349 public static boolean isJSWhitespace(final char ch) { 350 return JAVASCRIPT_WHITESPACE.indexOf(ch) != -1; 351 } 352 353 /** 354 * Test whether a char is valid JavaScript end of line 355 * @param ch a char 356 * @return true if valid JavaScript end of line 357 */ 358 public static boolean isJSEOL(final char ch) { 359 return JAVASCRIPT_WHITESPACE_EOL.indexOf(ch) != -1; 360 } 361 362 /** 363 * Test whether a char is valid JSON whitespace 364 * @param ch a char 365 * @return true if valid JSON whitespace 366 */ 367 public static boolean isJsonWhitespace(final char ch) { 368 return JSON_WHITESPACE.indexOf(ch) != -1; 369 } 370 371 /** 372 * Test whether a char is valid JSON end of line 373 * @param ch a char 374 * @return true if valid JSON end of line 375 */ 376 public static boolean isJsonEOL(final char ch) { 377 return JSON_WHITESPACE_EOL.indexOf(ch) != -1; 378 } 379 380 /** 381 * Test if char is a string delimiter, e.g. '\' or '"'. Also scans exec 382 * strings ('`') in scripting mode. 383 * @param ch a char 384 * @return true if string delimiter 385 */ 386 protected boolean isStringDelimiter(final char ch) { 387 return ch == '\'' || ch == '"' || (scripting && ch == '`'); 388 } 389 390 /** 391 * Test whether a char is valid JavaScript whitespace 392 * @param ch a char 393 * @return true if valid JavaScript whitespace 394 */ 395 protected boolean isWhitespace(final char ch) { 396 return Lexer.isJSWhitespace(ch); 397 } 398 399 /** 400 * Test whether a char is valid JavaScript end of line 401 * @param ch a char 402 * @return true if valid JavaScript end of line 403 */ 404 protected boolean isEOL(final char ch) { 405 return Lexer.isJSEOL(ch); 406 } 407 408 /** 409 * Skip over whitespace and detect end of line, adding EOL tokens if 410 * encountered. 
411 * 412 * @param addEOL true if EOL tokens should be recorded. 413 */ 414 private void skipWhitespace(final boolean addEOL) { 415 while (isWhitespace(ch0)) { 416 if (isEOL(ch0)) { 417 skipEOL(addEOL); 418 } else { 419 skip(1); 420 } 421 } 422 } 423 424 /** 425 * Skip over comments. 426 * 427 * @return True if a comment. 428 */ 429 protected boolean skipComments() { 430 if (ch0 == '/') { 431 // Is it a // comment. 432 if (ch1 == '/') { 433 // Skip over //. 434 skip(2); 435 // Scan for EOL. 436 while (!atEOF() && !isEOL(ch0)) { 437 skip(1); 438 } 439 // Did detect a comment. 440 return true; 441 } else if (ch1 == '*') { 442 // Record beginning of comment. 443 final int start = position; 444 // Skip over /*. 445 skip(2); 446 // Scan for */. 447 while (!atEOF() && !(ch0 == '*' && ch1 == '/')) { 448 // If end of line handle else skip character. 449 if (isEOL(ch0)) { 450 skipEOL(true); 451 } else { 452 skip(1); 453 } 454 } 455 456 if (atEOF()) { 457 // TODO - Report closing */ missing in parser. 458 add(ERROR, start); 459 } else { 460 // Skip */. 461 skip(2); 462 } 463 464 // Did detect a comment. 465 return true; 466 } 467 } 468 469 if (scripting && ch0 == '#') { 470 // shell style comment 471 // Skip over #. 472 skip(1); 473 // Scan for EOL. 474 while (!atEOF() && !isEOL(ch0)) { 475 skip(1); 476 } 477 // Did detect a comment. 478 return true; 479 } 480 481 // Not a comment. 482 return false; 483 } 484 485 /** 486 * Convert a regex token to a token object. 487 * 488 * @param start Position in source content. 489 * @param length Length of regex token. 490 * @return Regex token object. 491 */ 492 public RegexToken valueOfPattern(final int start, final int length) { 493 // Save the current position. 494 final int savePosition = position; 495 // Reset to beginning of content. 496 reset(start); 497 // Buffer for recording characters. 498 final StringBuilder sb = new StringBuilder(length); 499 500 // Skip /. 
501 skip(1); 502 boolean inBrackets = false; 503 // Scan for closing /, stopping at end of line. 504 while (!atEOF() && ch0 != '/' && !isEOL(ch0) || inBrackets) { 505 // Skip over escaped character. 506 if (ch0 == '\\') { 507 sb.append(ch0); 508 sb.append(ch1); 509 skip(2); 510 } else { 511 if (ch0 == '[') { 512 inBrackets = true; 513 } else if (ch0 == ']') { 514 inBrackets = false; 515 } 516 517 // Skip literal character. 518 sb.append(ch0); 519 skip(1); 520 } 521 } 522 523 // Get pattern as string. 524 final String regex = sb.toString(); 525 526 // Skip /. 527 skip(1); 528 529 // Options as string. 530 final String options = source.getString(position, scanIdentifier()); 531 532 reset(savePosition); 533 534 // Compile the pattern. 535 return new RegexToken(regex, options); 536 } 537 538 /** 539 * Return true if the given token can be the beginning of a literal. 540 * 541 * @param token a token 542 * @return true if token can start a literal. 543 */ 544 public boolean canStartLiteral(final TokenType token) { 545 return token.startsWith('/') || ((scripting || XML_LITERALS) && token.startsWith('<')); 546 } 547 548 /** 549 * Check whether the given token represents the beginning of a literal. If so scan 550 * the literal and return <tt>true</tt>, otherwise return false. 551 * 552 * @param token the token. 553 * @param startTokenType the token type. 554 * @return True if a literal beginning with startToken was found and scanned. 555 */ 556 protected boolean scanLiteral(final long token, final TokenType startTokenType) { 557 // Check if it can be a literal. 558 if (!canStartLiteral(startTokenType)) { 559 return false; 560 } 561 // We break on ambiguous tokens so if we already moved on it can't be a literal. 
562 if (stream.get(stream.last()) != token) { 563 return false; 564 } 565 // Rewind to token start position 566 reset(Token.descPosition(token)); 567 568 if (ch0 == '/') { 569 return scanRegEx(); 570 } else if (ch0 == '<') { 571 if (ch1 == '<') { 572 return scanHereString(); 573 } else if (Character.isJavaIdentifierStart(ch1)) { 574 return scanXMLLiteral(); 575 } 576 } 577 578 return false; 579 } 580 581 /** 582 * Scan over regex literal. 583 * 584 * @return True if a regex literal. 585 */ 586 private boolean scanRegEx() { 587 assert ch0 == '/'; 588 // Make sure it's not a comment. 589 if (ch1 != '/' && ch1 != '*') { 590 // Record beginning of literal. 591 final int start = position; 592 // Skip /. 593 skip(1); 594 boolean inBrackets = false; 595 596 // Scan for closing /, stopping at end of line. 597 while (!atEOF() && (ch0 != '/' || inBrackets) && !isEOL(ch0)) { 598 // Skip over escaped character. 599 if (ch0 == '\\') { 600 skip(1); 601 if (isEOL(ch0)) { 602 reset(start); 603 return false; 604 } 605 skip(1); 606 } else { 607 if (ch0 == '[') { 608 inBrackets = true; 609 } else if (ch0 == ']') { 610 inBrackets = false; 611 } 612 613 // Skip literal character. 614 skip(1); 615 } 616 } 617 618 // If regex literal. 619 if (ch0 == '/') { 620 // Skip /. 621 skip(1); 622 623 // Skip over options. 624 while (!atEOF() && Character.isJavaIdentifierPart(ch0) || ch0 == '\\' && ch1 == 'u') { 625 skip(1); 626 } 627 628 // Add regex token. 629 add(REGEX, start); 630 // Regex literal detected. 631 return true; 632 } 633 634 // False start try again. 635 reset(start); 636 } 637 638 // Regex literal not detected. 639 return false; 640 } 641 642 /** 643 * Convert a digit to a integer. Can't use Character.digit since we are 644 * restricted to ASCII by the spec. 645 * 646 * @param ch Character to convert. 647 * @param base Numeric base. 648 * 649 * @return The converted digit or -1 if invalid. 
650 */ 651 protected static int convertDigit(final char ch, final int base) { 652 int digit; 653 654 if ('0' <= ch && ch <= '9') { 655 digit = ch - '0'; 656 } else if ('A' <= ch && ch <= 'Z') { 657 digit = ch - 'A' + 10; 658 } else if ('a' <= ch && ch <= 'z') { 659 digit = ch - 'a' + 10; 660 } else { 661 return -1; 662 } 663 664 return digit < base ? digit : -1; 665 } 666 667 668 /** 669 * Get the value of a hexadecimal numeric sequence. 670 * 671 * @param length Number of digits. 672 * @param type Type of token to report against. 673 * @return Value of sequence or < 0 if no digits. 674 */ 675 private int hexSequence(final int length, final TokenType type) { 676 int value = 0; 677 678 for (int i = 0; i < length; i++) { 679 final int digit = convertDigit(ch0, 16); 680 681 if (digit == -1) { 682 error(Lexer.message("invalid.hex"), type, position, limit); 683 return i == 0 ? -1 : value; 684 } 685 686 value = digit | value << 4; 687 skip(1); 688 } 689 690 return value; 691 } 692 693 /** 694 * Get the value of an octal numeric sequence. This parses up to 3 digits with a maximum value of 255. 695 * 696 * @return Value of sequence. 697 */ 698 private int octalSequence() { 699 int value = 0; 700 701 for (int i = 0; i < 3; i++) { 702 final int digit = convertDigit(ch0, 8); 703 704 if (digit == -1) { 705 break; 706 } 707 value = digit | value << 3; 708 skip(1); 709 710 if (i == 1 && value >= 32) { 711 break; 712 } 713 } 714 return value; 715 } 716 717 /** 718 * Convert a string to a JavaScript identifier. 719 * 720 * @param start Position in source content. 721 * @param length Length of token. 722 * @return Ident string or null if an error. 723 */ 724 private String valueOfIdent(final int start, final int length) throws RuntimeException { 725 // Save the current position. 726 final int savePosition = position; 727 // End of scan. 728 final int end = start + length; 729 // Reset to beginning of content. 730 reset(start); 731 // Buffer for recording characters. 
732 final StringBuilder sb = new StringBuilder(length); 733 734 // Scan until end of line or end of file. 735 while (!atEOF() && position < end && !isEOL(ch0)) { 736 // If escape character. 737 if (ch0 == '\\' && ch1 == 'u') { 738 skip(2); 739 final int ch = hexSequence(4, TokenType.IDENT); 740 if (isWhitespace((char)ch)) { 741 return null; 742 } 743 if (ch < 0) { 744 sb.append('\\'); 745 sb.append('u'); 746 } else { 747 sb.append((char)ch); 748 } 749 } else { 750 // Add regular character. 751 sb.append(ch0); 752 skip(1); 753 } 754 } 755 756 // Restore position. 757 reset(savePosition); 758 759 return sb.toString(); 760 } 761 762 /** 763 * Scan over and identifier or keyword. Handles identifiers containing 764 * encoded Unicode chars. 765 * 766 * Example: 767 * 768 * var \u0042 = 44; 769 */ 770 private void scanIdentifierOrKeyword() { 771 // Record beginning of identifier. 772 final int start = position; 773 // Scan identifier. 774 final int length = scanIdentifier(); 775 // Check to see if it is a keyword. 776 final TokenType type = TokenLookup.lookupKeyword(content, start, length); 777 // Add keyword or identifier token. 778 add(type, start); 779 } 780 781 /** 782 * Convert a string to a JavaScript string object. 783 * 784 * @param start Position in source content. 785 * @param length Length of token. 786 * @return JavaScript string object. 787 */ 788 private String valueOfString(final int start, final int length, final boolean strict) throws RuntimeException { 789 // Save the current position. 790 final int savePosition = position; 791 // Calculate the end position. 792 final int end = start + length; 793 // Reset to beginning of string. 794 reset(start); 795 796 // Buffer for recording characters. 797 final StringBuilder sb = new StringBuilder(length); 798 799 // Scan until end of string. 800 while (position < end) { 801 // If escape character. 
802 if (ch0 == '\\') { 803 skip(1); 804 805 final char next = ch0; 806 final int afterSlash = position; 807 808 skip(1); 809 810 // Special characters. 811 switch (next) { 812 case '0': 813 case '1': 814 case '2': 815 case '3': 816 case '4': 817 case '5': 818 case '6': 819 case '7': { 820 if (strict) { 821 // "\0" itself is allowed in strict mode. Only other 'real' 822 // octal escape sequences are not allowed (eg. "\02", "\31"). 823 // See section 7.8.4 String literals production EscapeSequence 824 if (next != '0' || (ch0 >= '0' && ch0 <= '9')) { 825 error(Lexer.message("strict.no.octal"), STRING, position, limit); 826 } 827 } 828 reset(afterSlash); 829 // Octal sequence. 830 final int ch = octalSequence(); 831 832 if (ch < 0) { 833 sb.append('\\'); 834 sb.append('x'); 835 } else { 836 sb.append((char)ch); 837 } 838 break; 839 } 840 case 'n': 841 sb.append('\n'); 842 break; 843 case 't': 844 sb.append('\t'); 845 break; 846 case 'b': 847 sb.append('\b'); 848 break; 849 case 'f': 850 sb.append('\f'); 851 break; 852 case 'r': 853 sb.append('\r'); 854 break; 855 case '\'': 856 sb.append('\''); 857 break; 858 case '\"': 859 sb.append('\"'); 860 break; 861 case '\\': 862 sb.append('\\'); 863 break; 864 case '\r': // CR | CRLF 865 if (ch0 == '\n') { 866 skip(1); 867 } 868 // fall through 869 case '\n': // LF 870 case '\u2028': // LS 871 case '\u2029': // PS 872 // continue on the next line, slash-return continues string 873 // literal 874 break; 875 case 'x': { 876 // Hex sequence. 877 final int ch = hexSequence(2, STRING); 878 879 if (ch < 0) { 880 sb.append('\\'); 881 sb.append('x'); 882 } else { 883 sb.append((char)ch); 884 } 885 } 886 break; 887 case 'u': { 888 // Unicode sequence. 889 final int ch = hexSequence(4, STRING); 890 891 if (ch < 0) { 892 sb.append('\\'); 893 sb.append('u'); 894 } else { 895 sb.append((char)ch); 896 } 897 } 898 break; 899 case 'v': 900 sb.append('\u000B'); 901 break; 902 // All other characters. 
903 default: 904 sb.append(next); 905 break; 906 } 907 } else { 908 // Add regular character. 909 sb.append(ch0); 910 skip(1); 911 } 912 } 913 914 // Restore position. 915 reset(savePosition); 916 917 return sb.toString(); 918 } 919 920 /** 921 * Scan over a string literal. 922 */ 923 protected void scanString(final boolean add) { 924 // Type of string. 925 TokenType type = STRING; 926 // Record starting quote. 927 final char quote = ch0; 928 // Skip over quote. 929 skip(1); 930 931 // Record beginning of string content. 932 final State stringState = saveState(); 933 934 // Scan until close quote or end of line. 935 while (!atEOF() && ch0 != quote && !isEOL(ch0)) { 936 // Skip over escaped character. 937 if (ch0 == '\\') { 938 type = ESCSTRING; 939 skip(1); 940 if (! isEscapeCharacter(ch0)) { 941 error(Lexer.message("invalid.escape.char"), STRING, position, limit); 942 } 943 if (isEOL(ch0)) { 944 // Multiline string literal 945 skipEOL(false); 946 continue; 947 } 948 } 949 // Skip literal character. 950 skip(1); 951 } 952 953 // If close quote. 954 if (ch0 == quote) { 955 // Skip close quote. 956 skip(1); 957 } else { 958 error(Lexer.message("missing.close.quote"), STRING, position, limit); 959 } 960 961 // If not just scanning. 962 if (add) { 963 // Record end of string. 964 stringState.setLimit(position - 1); 965 966 if (scripting && !stringState.isEmpty()) { 967 switch (quote) { 968 case '`': 969 // Mark the beginning of an exec string. 970 add(EXECSTRING, stringState.position, stringState.limit); 971 // Frame edit string with left brace. 972 add(LBRACE, stringState.position, stringState.position); 973 // Process edit string. 974 editString(type, stringState); 975 // Frame edit string with right brace. 976 add(RBRACE, stringState.limit, stringState.limit); 977 break; 978 case '"': 979 // Only edit double quoted strings. 980 editString(type, stringState); 981 break; 982 case '\'': 983 // Add string token without editing. 
984 add(type, stringState.position, stringState.limit); 985 break; 986 default: 987 break; 988 } 989 } else { 990 /// Add string token without editing. 991 add(type, stringState.position, stringState.limit); 992 } 993 } 994 } 995 996 /** 997 * Is the given character a valid escape char after "\" ? 998 * 999 * @param ch character to be checked 1000 * @return if the given character is valid after "\" 1001 */ 1002 protected boolean isEscapeCharacter(final char ch) { 1003 return true; 1004 } 1005 1006 /** 1007 * Convert string to number. 1008 * 1009 * @param valueString String to convert. 1010 * @param radix Numeric base. 1011 * @return Converted number. 1012 */ 1013 private static Number valueOf(final String valueString, final int radix) throws NumberFormatException { 1014 try { 1015 final long value = Long.parseLong(valueString, radix); 1016 if(value >= MIN_INT_L && value <= MAX_INT_L) { 1017 return Integer.valueOf((int)value); 1018 } 1019 return Long.valueOf(value); 1020 } catch (final NumberFormatException e) { 1021 if (radix == 10) { 1022 return Double.valueOf(valueString); 1023 } 1024 1025 double value = 0.0; 1026 1027 for (int i = 0; i < valueString.length(); i++) { 1028 final char ch = valueString.charAt(i); 1029 // Preverified, should always be a valid digit. 1030 final int digit = convertDigit(ch, radix); 1031 value *= radix; 1032 value += digit; 1033 } 1034 1035 return value; 1036 } 1037 } 1038 1039 /** 1040 * Convert string to number. 1041 * 1042 * @param valueString String to convert. 1043 * @return Converted number. 1044 */ 1045 private static Number valueOf(final String valueString) throws NumberFormatException { 1046 return JSType.narrowestIntegerRepresentation(Double.valueOf(valueString)); 1047 } 1048 1049 /** 1050 * Scan a number. 1051 */ 1052 protected void scanNumber() { 1053 // Record beginning of number. 1054 final int start = position; 1055 // Assume value is a decimal. 1056 TokenType type = DECIMAL; 1057 1058 // First digit of number. 
1059 int digit = convertDigit(ch0, 10); 1060 1061 // If number begins with 0x. 1062 if (digit == 0 && (ch1 == 'x' || ch1 == 'X') && convertDigit(ch2, 16) != -1) { 1063 // Skip over 0xN. 1064 skip(3); 1065 // Skip over remaining digits. 1066 while (convertDigit(ch0, 16) != -1) { 1067 skip(1); 1068 } 1069 1070 type = HEXADECIMAL; 1071 } else { 1072 // Check for possible octal constant. 1073 boolean octal = digit == 0; 1074 // Skip first digit if not leading '.'. 1075 if (digit != -1) { 1076 skip(1); 1077 } 1078 1079 // Skip remaining digits. 1080 while ((digit = convertDigit(ch0, 10)) != -1) { 1081 // Check octal only digits. 1082 octal = octal && digit < 8; 1083 // Skip digit. 1084 skip(1); 1085 } 1086 1087 if (octal && position - start > 1) { 1088 type = OCTAL; 1089 } else if (ch0 == '.' || ch0 == 'E' || ch0 == 'e') { 1090 // Must be a double. 1091 if (ch0 == '.') { 1092 // Skip period. 1093 skip(1); 1094 // Skip mantissa. 1095 while (convertDigit(ch0, 10) != -1) { 1096 skip(1); 1097 } 1098 } 1099 1100 // Detect exponent. 1101 if (ch0 == 'E' || ch0 == 'e') { 1102 // Skip E. 1103 skip(1); 1104 // Detect and skip exponent sign. 1105 if (ch0 == '+' || ch0 == '-') { 1106 skip(1); 1107 } 1108 // Skip exponent. 1109 while (convertDigit(ch0, 10) != -1) { 1110 skip(1); 1111 } 1112 } 1113 1114 type = FLOATING; 1115 } 1116 } 1117 1118 if (Character.isJavaIdentifierStart(ch0)) { 1119 error(Lexer.message("missing.space.after.number"), type, position, 1); 1120 } 1121 1122 // Add number token. 1123 add(type, start); 1124 } 1125 1126 /** 1127 * Convert a regex token to a token object. 1128 * 1129 * @param start Position in source content. 1130 * @param length Length of regex token. 1131 * @return Regex token object. 1132 */ 1133 XMLToken valueOfXML(final int start, final int length) { 1134 return new XMLToken(source.getString(start, length)); 1135 } 1136 1137 /** 1138 * Scan over a XML token. 1139 * 1140 * @return TRUE if is an XML literal. 
*/
    private boolean scanXMLLiteral() {
        assert ch0 == '<' && Character.isJavaIdentifierStart(ch1);
        if (XML_LITERALS) {
            // Record beginning of xml expression.
            final int start = position;

            // Number of currently open (unclosed) elements.
            int openCount = 0;

            do {
                if (ch0 == '<') {
                    if (ch1 == '/' && Character.isJavaIdentifierStart(ch2)) {
                        // Closing tag: </name
                        skip(3);
                        openCount--;
                    } else if (Character.isJavaIdentifierStart(ch1)) {
                        // Opening tag: <name
                        skip(2);
                        openCount++;
                    } else if (ch1 == '?') {
                        // Processing instruction: <?
                        skip(2);
                    } else if (ch1 == '!' && ch2 == '-' && ch3 == '-') {
                        // Comment: <!--
                        skip(4);
                    } else {
                        // Not something we recognize; rewind and give up.
                        reset(start);
                        return false;
                    }

                    // Scan to the end of the tag.
                    while (!atEOF() && ch0 != '>') {
                        if (ch0 == '/' && ch1 == '>') {
                            // Self-closing tag: balances the open counted above.
                            openCount--;
                            skip(1);
                            break;
                        } else if (ch0 == '\"' || ch0 == '\'') {
                            // Quoted attribute value; scanned without emitting a token.
                            scanString(false);
                        } else {
                            skip(1);
                        }
                    }

                    // Tag was never terminated.
                    if (ch0 != '>') {
                        reset(start);
                        return false;
                    }

                    skip(1);
                } else if (atEOF()) {
                    // Ran out of input with elements still open.
                    reset(start);
                    return false;
                } else {
                    // Element content.
                    skip(1);
                }
            } while (openCount > 0);

            add(XML, start);
            return true;
        }

        return false;
    }

    /**
     * Scan over identifier characters.
     *
     * @return Length of identifier or zero if none found.
     */
    private int scanIdentifier() {
        final int start = position;

        // Make sure first character is valid start character.
        if (ch0 == '\\' && ch1 == 'u') {
            skip(2);
            final int ch = hexSequence(4, TokenType.IDENT);

            if (!Character.isJavaIdentifierStart(ch)) {
                // error() takes a length, not an end position.
                error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position - start);
            }
        } else if (!Character.isJavaIdentifierStart(ch0)) {
            // Not an identifier.
            return 0;
        }

        // Make sure remaining characters are valid part characters.
        while (!atEOF()) {
            if (ch0 == '\\' && ch1 == 'u') {
                skip(2);
                final int ch = hexSequence(4, TokenType.IDENT);

                if (!Character.isJavaIdentifierPart(ch)) {
                    // error() takes a length, not an end position.
                    error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position - start);
                }
            } else if (Character.isJavaIdentifierPart(ch0)) {
                skip(1);
            } else {
                break;
            }
        }

        // Length of identifier sequence.
        return position - start;
    }

    /**
     * Compare two identifiers (in content) for equality.
     *
     * @param aStart  Start of first identifier.
     * @param aLength Length of first identifier.
     * @param bStart  Start of second identifier.
     * @param bLength Length of second identifier.
     * @return True if equal.
     */
    private boolean identifierEqual(final int aStart, final int aLength, final int bStart, final int bLength) {
        if (aLength == bLength) {
            for (int i = 0; i < aLength; i++) {
                if (content[aStart + i] != content[bStart + i]) {
                    return false;
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Detect if a line starts with a marker identifier. Advances the lexer
     * past any leading whitespace and past the identifier it scans, whether
     * or not that identifier matches.
     *
     * @param identStart  Start of identifier.
     * @param identLength Length of identifier.
     * @return True if detected.
     */
    private boolean hasHereMarker(final int identStart, final int identLength) {
        // Skip any whitespace.
        skipWhitespace(false);

        return identifierEqual(identStart, identLength, position, scanIdentifier());
    }

    /**
     * Lexer to service edit strings.
     */
    private static class EditStringLexer extends Lexer {
        /** Type of string literals to emit. */
        final TokenType stringType;

        /**
         * Constructor.
         *
         * @param lexer       Lexer handed to the superclass constructor.
         * @param stringType  Type of string literals to emit.
         * @param stringState State handed to the superclass constructor.
         */
        EditStringLexer(final Lexer lexer, final TokenType stringType, final State stringState) {
            super(lexer, stringState);

            this.stringType = stringType;
        }

        /**
         * Lexify the contents of the string. Literal portions are emitted as
         * {@code stringType} tokens and each <code>${...}</code> portion is
         * emitted as {@code ADD LPAREN <expression tokens> RPAREN}.
         */
        @Override
        public void lexify() {
            // Record start of string position.
            int stringStart = position;
            // Indicate that the priming first string has not been emitted.
            boolean primed = false;

            // Scan until end of content.
            while (!atEOF()) {
                // Honour escapes (should be well formed.)
                if (ch0 == '\\' && stringType == ESCSTRING) {
                    skip(2);

                    continue;
                }

                // If start of expression.
                if (ch0 == '$' && ch1 == '{') {
                    // Always emit a leading string (possibly empty); afterwards only
                    // emit non-empty string portions.
                    if (!primed || stringStart != position) {
                        if (primed) {
                            add(ADD, stringStart, stringStart + 1);
                        }

                        add(stringType, stringStart, position);
                        primed = true;
                    }

                    // Skip ${
                    skip(2);

                    // Save expression state.
                    final State expressionState = saveState();

                    // Start with one open brace.
                    int braceCount = 1;

                    // Scan for the rest of the string.
                    while (!atEOF()) {
                        // If closing brace.
                        if (ch0 == '}') {
                            // Break only if matching brace.
                            if (--braceCount == 0) {
                                break;
                            }
                        } else if (ch0 == '{') {
                            // Bump up the brace count.
                            braceCount++;
                        }

                        // Skip to next character.
                        skip(1);
                    }

                    // If braces don't match then report an error.
                    if (braceCount != 0) {
                        error(Lexer.message("edit.string.missing.brace"), LBRACE, expressionState.position - 1, 1);
                    }

                    // Mark end of expression.
                    expressionState.setLimit(position);
                    // Skip closing brace.
                    skip(1);

                    // Start next string.
                    stringStart = position;

                    // Concatenate expression.
                    add(ADD, expressionState.position, expressionState.position + 1);
                    add(LPAREN, expressionState.position, expressionState.position + 1);

                    // Scan expression.
                    final Lexer lexer = new Lexer(this, expressionState);
                    lexer.lexify();

                    // Close out expression parenthesis.
                    add(RPAREN, position - 1, position);

                    continue;
                }

                // Next character in string.
                skip(1);
            }

            // If there is any unemitted string portion.
            if (stringStart != limit) {
                // Concatenate remaining string.
                if (primed) {
                    // Third argument is an end position, as in the other ADD tokens above.
                    add(ADD, stringStart, stringStart + 1);
                }

                add(stringType, stringStart, limit);
            }
        }

    }

    /**
     * Edit string for nested expressions.
     *
     * @param stringType  Type of string literals to emit.
     * @param stringState State of lexer at start of string.
     */
    private void editString(final TokenType stringType, final State stringState) {
        // Use special lexer to scan string.
        final EditStringLexer lexer = new EditStringLexer(this, stringType, stringState);
        lexer.lexify();

        // Need to keep lexer informed.
        last = stringType;
    }

    /**
     * Scan over a here string.
     *
     * @return TRUE if is a here string.
     */
    private boolean scanHereString() {
        assert ch0 == '<' && ch1 == '<';
        if (scripting) {
            // Record beginning of here string.
            final State saved = saveState();

            // << or <<<
            final boolean excludeLastEOL = ch2 != '<';

            if (excludeLastEOL) {
                skip(2);
            } else {
                skip(3);
            }

            // Scan identifier.
            final int identStart = position;
            final int identLength = scanIdentifier();

            // Check for identifier.
            if (identLength == 0) {
                // Treat as shift.
                restoreState(saved);

                return false;
            }

            // Record rest of line.
            final State restState = saveState();
            skipLine(false);
            restState.setLimit(position);

            // Record beginning of string.
            final State stringState = saveState();
            int stringEnd = position;

            // Tracked explicitly: a marker that is the very last thing in the
            // input leaves the lexer at EOF, so atEOF() alone cannot tell
            // "found at end of input" from "never found".
            boolean markerFound = false;

            // Hunt down marker.
            while (!atEOF()) {
                // Skip any whitespace.
                skipWhitespace(false);

                if (hasHereMarker(identStart, identLength)) {
                    markerFound = true;
                    break;
                }

                skipLine(false);
                stringEnd = position;
            }

            // Record end of string.
            stringState.setLimit(stringEnd);

            // If marker is missing.
            if (stringState.isEmpty() || !markerFound) {
                // NOTE(review): error() documents its last argument as a length; passing
                // position here looks unintended - confirm the span that should be reported.
                error(Lexer.message("here.missing.end.marker", source.getString(identStart, identLength)), last, position, position);
                restoreState(saved);

                return false;
            }

            // Remove last end of line if specified.
            if (excludeLastEOL) {
                // Handles \n. Never strip past the start of the string.
                if (stringEnd > stringState.position && content[stringEnd - 1] == '\n') {
                    stringEnd--;
                }

                // Handles \r and \r\n. Never strip past the start of the string.
                if (stringEnd > stringState.position && content[stringEnd - 1] == '\r') {
                    stringEnd--;
                }

                // Update end of string.
                stringState.setLimit(stringEnd);
            }

            // Edit string if appropriate.
            if (scripting && !stringState.isEmpty()) {
                editString(STRING, stringState);
            } else {
                // Add here string.
                add(STRING, stringState.position, stringState.limit);
            }

            // Scan rest of original line.
            final Lexer restLexer = new Lexer(this, restState);

            restLexer.lexify();

            return true;
        }

        return false;
    }

    /**
     * Breaks source content down into lex units, adding tokens to the token
     * stream. The routine scans until the stream buffer is full. Can be called
     * repeatedly until EOF is detected.
     */
    public void lexify() {
        while (!stream.isFull() || nested) {
            // Skip over whitespace.
            skipWhitespace(true);

            // Detect end of file.
            if (atEOF()) {
                if (!nested) {
                    // Add an EOF token at the end.
                    add(EOF, position);
                }

                break;
            }

            // Check for comments. Note that we don't scan for regexp and other literals here as
            // we may not have enough context to distinguish them from similar looking operators.
            // Instead we break on ambiguous operators below and let the parser decide.
            if (ch0 == '/' && skipComments()) {
                continue;
            }

            if (scripting && ch0 == '#' && skipComments()) {
                continue;
            }

            // TokenType for lookup of delimiter or operator.
            TokenType type;

            if (ch0 == '.' && convertDigit(ch1, 10) != -1) {
                // '.' followed by digit.
                // Scan and add a number.
                scanNumber();
            } else if ((type = TokenLookup.lookupOperator(ch0, ch1, ch2, ch3)) != null) {
                // Get the number of characters in the token.
                final int typeLength = type.getLength();
                // Skip that many characters.
                skip(typeLength);
                // Add operator token.
                add(type, position - typeLength);
                // Some operator tokens also mark the beginning of regexp, XML, or here string literals.
                // We break to let the parser decide what it is.
                if (canStartLiteral(type)) {
                    break;
                }
            } else if (Character.isJavaIdentifierStart(ch0) || ch0 == '\\' && ch1 == 'u') {
                // Scan and add identifier or keyword.
                scanIdentifierOrKeyword();
            } else if (isStringDelimiter(ch0)) {
                // Scan and add a string.
                scanString(true);
            } else if (Character.isDigit(ch0)) {
                // Scan and add a number.
                scanNumber();
            } else {
                // Don't recognize this character.
                skip(1);
                add(ERROR, position - 1);
            }
        }
    }

    /**
     * Return value of token given its token descriptor.
     *
     * @param token  Token descriptor.
     * @param strict Passed through to the string decoder for ESCSTRING tokens;
     *               not used for any other token type.
     * @return JavaScript value, or null if the token type carries no value.
     */
    Object getValueOf(final long token, final boolean strict) {
        final int start = Token.descPosition(token);
        final int len   = Token.descLength(token);

        switch (Token.descType(token)) {
        case DECIMAL:
            return Lexer.valueOf(source.getString(start, len), 10); // number
        case OCTAL:
            return Lexer.valueOf(source.getString(start, len), 8); // number
        case HEXADECIMAL:
            // Skip the two-character prefix.
            return Lexer.valueOf(source.getString(start + 2, len - 2), 16); // number
        case FLOATING:
            return Lexer.valueOf(source.getString(start, len)); // number
        case STRING:
            return source.getString(start, len); // String
        case ESCSTRING:
            return valueOfString(start, len, strict); // String
        case IDENT:
            return valueOfIdent(start, len); // String
        case REGEX:
            return valueOfPattern(start, len); // RegexToken::LexerToken
        case XML:
            return valueOfXML(start, len); // XMLToken::LexerToken
        default:
            break;
        }

        return null;
    }

    /**
     * Look up a lexer error message. The message id is prefixed with
     * "lexer.error." before being handed to {@link ECMAErrors#getMessage}.
     *
     * @param msgId message id, without the "lexer.error." prefix
     * @param args  message arguments
     * @return the message returned by {@link ECMAErrors#getMessage}
     */
    protected static String message(final String msgId, final String... args) {
        return ECMAErrors.getMessage("lexer.error." + msgId, args);
    }

    /**
     * Generate a runtime exception
     *
     * @param message       error message
     * @param type          token type
     * @param start         start position of lexed error
     * @param length        length of lexed error
     * @throws ParserException  unconditionally
     */
    protected void error(final String message, final TokenType type, final int start, final int length) throws ParserException {
        final long token     = Token.toDesc(type, start, length);
        final int  pos       = Token.descPosition(token);
        final int  lineNum   = source.getLine(pos);
        final int  columnNum = source.getColumn(pos);
        final String formatted = ErrorManager.format(message, source, lineNum, columnNum, token);
        throw new ParserException(JSErrorType.SYNTAX_ERROR, formatted, source, lineNum, columnNum, token);
    }

    /**
     * Helper class for Lexer tokens, e.g XML or RegExp tokens.
     * This is the abstract superclass
     */
    public static abstract class LexerToken {
        private final String expression;

        /**
         * Constructor
         * @param expression token expression
         */
        protected LexerToken(final String expression) {
            this.expression = expression;
        }

        /**
         * Get the expression
         * @return expression
         */
        public String getExpression() {
            return expression;
        }
    }

    /**
     * Temporary container for regular expressions.
     */
    public static class RegexToken extends LexerToken {
        /** Options. */
        private final String options;

        /**
         * Constructor.
         *
         * @param expression  regexp expression
         * @param options     regexp options
         */
        public RegexToken(final String expression, final String options) {
            super(expression);
            this.options = options;
        }

        /**
         * Get regexp options
         * @return options
         */
        public String getOptions() {
            return options;
        }

        @Override
        public String toString() {
            return '/' + getExpression() + '/' + options;
        }
    }

    /**
     * Temporary container for XML expression.
     */
    public static class XMLToken extends LexerToken {

        /**
         * Constructor.
         *
         * @param expression  XML expression
         */
        public XMLToken(final String expression) {
            super(expression);
        }
    }
}