/*
 * Copyright (c) 2010, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
24 */ 25 26 package jdk.nashorn.internal.parser; 27 28 import static jdk.nashorn.internal.parser.TokenType.ADD; 29 import static jdk.nashorn.internal.parser.TokenType.COMMENT; 30 import static jdk.nashorn.internal.parser.TokenType.DECIMAL; 31 import static jdk.nashorn.internal.parser.TokenType.EOF; 32 import static jdk.nashorn.internal.parser.TokenType.EOL; 33 import static jdk.nashorn.internal.parser.TokenType.ERROR; 34 import static jdk.nashorn.internal.parser.TokenType.ESCSTRING; 35 import static jdk.nashorn.internal.parser.TokenType.EXECSTRING; 36 import static jdk.nashorn.internal.parser.TokenType.FLOATING; 37 import static jdk.nashorn.internal.parser.TokenType.HEXADECIMAL; 38 import static jdk.nashorn.internal.parser.TokenType.LBRACE; 39 import static jdk.nashorn.internal.parser.TokenType.LPAREN; 40 import static jdk.nashorn.internal.parser.TokenType.OCTAL; 41 import static jdk.nashorn.internal.parser.TokenType.RBRACE; 42 import static jdk.nashorn.internal.parser.TokenType.REGEX; 43 import static jdk.nashorn.internal.parser.TokenType.RPAREN; 44 import static jdk.nashorn.internal.parser.TokenType.STRING; 45 import static jdk.nashorn.internal.parser.TokenType.XML; 46 47 import jdk.nashorn.internal.runtime.ECMAErrors; 48 import jdk.nashorn.internal.runtime.ErrorManager; 49 import jdk.nashorn.internal.runtime.JSErrorType; 50 import jdk.nashorn.internal.runtime.ParserException; 51 import jdk.nashorn.internal.runtime.Source; 52 import jdk.nashorn.internal.runtime.options.Options; 53 54 /** 55 * Responsible for converting source content into a stream of tokens. 56 * 57 */ 58 @SuppressWarnings("fallthrough") 59 public class Lexer extends Scanner { 60 private static final long MIN_INT_L = Integer.MIN_VALUE; 61 private static final long MAX_INT_L = Integer.MAX_VALUE; 62 63 private static final boolean XML_LITERALS = Options.getBooleanProperty("nashorn.lexer.xmlliterals"); 64 65 /** Content source. */ 66 private final Source source; 67 68 /** Buffered stream for tokens. 
*/ 69 private final TokenStream stream; 70 71 /** True if here and edit strings are supported. */ 72 private final boolean scripting; 73 74 /** True if a nested scan. (scan to completion, no EOF.) */ 75 private final boolean nested; 76 77 /** Pending new line number and position. */ 78 private int pendingLine; 79 80 /** Position of last EOL + 1. */ 81 private int linePosition; 82 83 /** Type of last token added. */ 84 private TokenType last; 85 86 private static final String SPACETAB = " \t"; // ASCII space and tab 87 private static final String LFCR = "\n\r"; // line feed and carriage return (ctrl-m) 88 89 private static final String JSON_WHITESPACE_EOL = LFCR; 90 private static final String JSON_WHITESPACE = SPACETAB + LFCR; 91 92 private static final String JAVASCRIPT_WHITESPACE_EOL = 93 LFCR + 94 "\u2028" + // line separator 95 "\u2029" // paragraph separator 96 ; 97 private static final String JAVASCRIPT_WHITESPACE = 98 SPACETAB + 99 JAVASCRIPT_WHITESPACE_EOL + 100 "\u000b" + // tabulation line 101 "\u000c" + // ff (ctrl-l) 102 "\u00a0" + // Latin-1 space 103 "\u1680" + // Ogham space mark 104 "\u180e" + // separator, Mongolian vowel 105 "\u2000" + // en quad 106 "\u2001" + // em quad 107 "\u2002" + // en space 108 "\u2003" + // em space 109 "\u2004" + // three-per-em space 110 "\u2005" + // four-per-em space 111 "\u2006" + // six-per-em space 112 "\u2007" + // figure space 113 "\u2008" + // punctuation space 114 "\u2009" + // thin space 115 "\u200a" + // hair space 116 "\u202f" + // narrow no-break space 117 "\u205f" + // medium mathematical space 118 "\u3000" + // ideographic space 119 "\ufeff" // byte order mark 120 ; 121 122 private static final String JAVASCRIPT_WHITESPACE_IN_REGEXP = 123 "\\u000a" + // line feed 124 "\\u000d" + // carriage return (ctrl-m) 125 "\\u2028" + // line separator 126 "\\u2029" + // paragraph separator 127 "\\u0009" + // tab 128 "\\u0020" + // ASCII space 129 "\\u000b" + // tabulation line 130 "\\u000c" + // ff (ctrl-l) 131 
"\\u00a0" + // Latin-1 space 132 "\\u1680" + // Ogham space mark 133 "\\u180e" + // separator, Mongolian vowel 134 "\\u2000" + // en quad 135 "\\u2001" + // em quad 136 "\\u2002" + // en space 137 "\\u2003" + // em space 138 "\\u2004" + // three-per-em space 139 "\\u2005" + // four-per-em space 140 "\\u2006" + // six-per-em space 141 "\\u2007" + // figure space 142 "\\u2008" + // punctuation space 143 "\\u2009" + // thin space 144 "\\u200a" + // hair space 145 "\\u202f" + // narrow no-break space 146 "\\u205f" + // medium mathematical space 147 "\\u3000" + // ideographic space 148 "\\ufeff" // byte order mark 149 ; 150 151 static String unicodeEscape(final char ch) { 152 final StringBuilder sb = new StringBuilder(); 153 154 sb.append("\\u"); 155 156 final String hex = Integer.toHexString(ch); 157 for (int i = hex.length(); i < 4; i++) { 158 sb.append('0'); 159 } 160 sb.append(hex); 161 162 return sb.toString(); 163 } 164 165 /** 166 * Constructor 167 * 168 * @param source the source 169 * @param stream the token stream to lex 170 */ 171 public Lexer(final Source source, final TokenStream stream) { 172 this(source, stream, false); 173 } 174 175 /** 176 * Constructor 177 * 178 * @param source the source 179 * @param stream the token stream to lex 180 * @param scripting are we in scripting mode 181 */ 182 public Lexer(final Source source, final TokenStream stream, final boolean scripting) { 183 super(source.getContent(), 1, 0, source.getLength()); 184 185 this.source = source; 186 this.stream = stream; 187 this.scripting = scripting; 188 this.nested = false; 189 this.pendingLine = 1; 190 this.last = EOL; 191 } 192 193 private Lexer(final Lexer lexer, final State state) { 194 super(lexer, state); 195 196 source = lexer.source; 197 stream = lexer.stream; 198 scripting = lexer.scripting; 199 nested = true; 200 201 pendingLine = state.pendingLine; 202 linePosition = state.linePosition; 203 last = EOL; 204 } 205 206 static class State extends Scanner.State { 207 /** 
Pending new line number and position. */ 208 public final int pendingLine; 209 210 /** Position of last EOL + 1. */ 211 public final int linePosition; 212 213 /** Type of last token added. */ 214 public final TokenType last; 215 216 /* 217 * Constructor. 218 */ 219 220 State(final int position, final int limit, final int line, final int pendingLine, final int linePosition, final TokenType last) { 221 super(position, limit, line); 222 223 this.pendingLine = pendingLine; 224 this.linePosition = linePosition; 225 this.last = last; 226 } 227 } 228 229 /** 230 * Save the state of the scan. 231 * 232 * @return Captured state. 233 */ 234 @Override 235 State saveState() { 236 return new State(position, limit, line, pendingLine, linePosition, last); 237 } 238 239 /** 240 * Restore the state of the scan. 241 * 242 * @param state 243 * Captured state. 244 */ 245 void restoreState(final State state) { 246 super.restoreState(state); 247 248 pendingLine = state.pendingLine; 249 linePosition = state.linePosition; 250 last = state.last; 251 } 252 253 /** 254 * Add a new token to the stream. 255 * 256 * @param type 257 * Token type. 258 * @param start 259 * Start position. 260 * @param end 261 * End position. 262 */ 263 protected void add(final TokenType type, final int start, final int end) { 264 // Record last token. 265 last = type; 266 267 // Only emit the last EOL in a cluster. 268 if (type == EOL) { 269 pendingLine = end; 270 linePosition = start; 271 } else { 272 // Write any pending EOL to stream. 273 if (pendingLine != -1) { 274 stream.put(Token.toDesc(EOL, linePosition, pendingLine)); 275 pendingLine = -1; 276 } 277 278 // Write token to stream. 279 stream.put(Token.toDesc(type, start, end - start)); 280 } 281 } 282 283 /** 284 * Add a new token to the stream. 285 * 286 * @param type 287 * Token type. 288 * @param start 289 * Start position. 
290 */ 291 protected void add(final TokenType type, final int start) { 292 add(type, start, position); 293 } 294 295 /** 296 * Return the String of valid whitespace characters for regular 297 * expressions in JavaScript 298 * @return regexp whitespace string 299 */ 300 public static String getWhitespaceRegExp() { 301 return JAVASCRIPT_WHITESPACE_IN_REGEXP; 302 } 303 304 /** 305 * Skip end of line. 306 * 307 * @param addEOL true if EOL token should be recorded. 308 */ 309 private void skipEOL(final boolean addEOL) { 310 311 if (ch0 == '\r') { // detect \r\n pattern 312 skip(1); 313 if (ch0 == '\n') { 314 skip(1); 315 } 316 } else { // all other space, ch0 is guaranteed to be EOL or \0 317 skip(1); 318 } 319 320 // bump up line count 321 line++; 322 323 if (addEOL) { 324 // Add an EOL token. 325 add(EOL, position, line); 326 } 327 } 328 329 /** 330 * Skip over rest of line including end of line. 331 * 332 * @param addEOL true if EOL token should be recorded. 333 */ 334 private void skipLine(final boolean addEOL) { 335 // Ignore characters. 336 while (!isEOL(ch0) && !atEOF()) { 337 skip(1); 338 } 339 // Skip over end of line. 
340 skipEOL(addEOL); 341 } 342 343 /** 344 * Test whether a char is valid JavaScript whitespace 345 * @param ch a char 346 * @return true if valid JavaScript whitespace 347 */ 348 public static boolean isJSWhitespace(final char ch) { 349 return JAVASCRIPT_WHITESPACE.indexOf(ch) != -1; 350 } 351 352 /** 353 * Test whether a char is valid JavaScript end of line 354 * @param ch a char 355 * @return true if valid JavaScript end of line 356 */ 357 public static boolean isJSEOL(final char ch) { 358 return JAVASCRIPT_WHITESPACE_EOL.indexOf(ch) != -1; 359 } 360 361 /** 362 * Test whether a char is valid JSON whitespace 363 * @param ch a char 364 * @return true if valid JSON whitespace 365 */ 366 public static boolean isJsonWhitespace(final char ch) { 367 return JSON_WHITESPACE.indexOf(ch) != -1; 368 } 369 370 /** 371 * Test whether a char is valid JSON end of line 372 * @param ch a char 373 * @return true if valid JSON end of line 374 */ 375 public static boolean isJsonEOL(final char ch) { 376 return JSON_WHITESPACE_EOL.indexOf(ch) != -1; 377 } 378 379 /** 380 * Test if char is a string delimiter, e.g. '\' or '"'. Also scans exec 381 * strings ('`') in scripting mode. 382 * @param ch a char 383 * @return true if string delimiter 384 */ 385 protected boolean isStringDelimiter(final char ch) { 386 return ch == '\'' || ch == '"' || (scripting && ch == '`'); 387 } 388 389 /** 390 * Test whether a char is valid JavaScript whitespace 391 * @param ch a char 392 * @return true if valid JavaScript whitespace 393 */ 394 protected boolean isWhitespace(final char ch) { 395 return Lexer.isJSWhitespace(ch); 396 } 397 398 /** 399 * Test whether a char is valid JavaScript end of line 400 * @param ch a char 401 * @return true if valid JavaScript end of line 402 */ 403 protected boolean isEOL(final char ch) { 404 return Lexer.isJSEOL(ch); 405 } 406 407 /** 408 * Skip over whitespace and detect end of line, adding EOL tokens if 409 * encountered. 
410 * 411 * @param addEOL true if EOL tokens should be recorded. 412 */ 413 private void skipWhitespace(final boolean addEOL) { 414 while (isWhitespace(ch0)) { 415 if (isEOL(ch0)) { 416 skipEOL(addEOL); 417 } else { 418 skip(1); 419 } 420 } 421 } 422 423 /** 424 * Skip over comments. 425 * 426 * @return True if a comment. 427 */ 428 protected boolean skipComments() { 429 // Save the current position. 430 final int start = position; 431 432 if (ch0 == '/') { 433 // Is it a // comment. 434 if (ch1 == '/') { 435 // Skip over //. 436 skip(2); 437 // Scan for EOL. 438 while (!atEOF() && !isEOL(ch0)) { 439 skip(1); 440 } 441 // Did detect a comment. 442 add(COMMENT, start); 443 return true; 444 } else if (ch1 == '*') { 445 // Skip over /*. 446 skip(2); 447 // Scan for */. 448 while (!atEOF() && !(ch0 == '*' && ch1 == '/')) { 449 // If end of line handle else skip character. 450 if (isEOL(ch0)) { 451 skipEOL(true); 452 } else { 453 skip(1); 454 } 455 } 456 457 if (atEOF()) { 458 // TODO - Report closing */ missing in parser. 459 add(ERROR, start); 460 } else { 461 // Skip */. 462 skip(2); 463 } 464 465 // Did detect a comment. 466 add(COMMENT, start); 467 return true; 468 } 469 } else if (ch0 == '#') { 470 assert scripting; 471 // shell style comment 472 // Skip over #. 473 skip(1); 474 // Scan for EOL. 475 while (!atEOF() && !isEOL(ch0)) { 476 skip(1); 477 } 478 // Did detect a comment. 479 add(COMMENT, start); 480 return true; 481 } 482 483 // Not a comment. 484 return false; 485 } 486 487 /** 488 * Convert a regex token to a token object. 489 * 490 * @param start Position in source content. 491 * @param length Length of regex token. 492 * @return Regex token object. 493 */ 494 public RegexToken valueOfPattern(final int start, final int length) { 495 // Save the current position. 496 final int savePosition = position; 497 // Reset to beginning of content. 498 reset(start); 499 // Buffer for recording characters. 
500 final StringBuilder sb = new StringBuilder(length); 501 502 // Skip /. 503 skip(1); 504 boolean inBrackets = false; 505 // Scan for closing /, stopping at end of line. 506 while (!atEOF() && ch0 != '/' && !isEOL(ch0) || inBrackets) { 507 // Skip over escaped character. 508 if (ch0 == '\\') { 509 sb.append(ch0); 510 sb.append(ch1); 511 skip(2); 512 } else { 513 if (ch0 == '[') { 514 inBrackets = true; 515 } else if (ch0 == ']') { 516 inBrackets = false; 517 } 518 519 // Skip literal character. 520 sb.append(ch0); 521 skip(1); 522 } 523 } 524 525 // Get pattern as string. 526 final String regex = sb.toString(); 527 528 // Skip /. 529 skip(1); 530 531 // Options as string. 532 final String options = source.getString(position, scanIdentifier()); 533 534 reset(savePosition); 535 536 // Compile the pattern. 537 return new RegexToken(regex, options); 538 } 539 540 /** 541 * Return true if the given token can be the beginning of a literal. 542 * 543 * @param token a token 544 * @return true if token can start a literal. 545 */ 546 public boolean canStartLiteral(final TokenType token) { 547 return token.startsWith('/') || ((scripting || XML_LITERALS) && token.startsWith('<')); 548 } 549 550 /** 551 * interface to receive line information for multi-line literals. 552 */ 553 protected interface LineInfoReceiver { 554 /** 555 * Receives line information 556 * @param line last line number 557 * @param linePosition position of last line 558 */ 559 public void lineInfo(int line, int linePosition); 560 } 561 562 /** 563 * Check whether the given token represents the beginning of a literal. If so scan 564 * the literal and return <tt>true</tt>, otherwise return false. 565 * 566 * @param token the token. 567 * @param startTokenType the token type. 568 * @param lir LineInfoReceiver that receives line info for multi-line string literals. 569 * @return True if a literal beginning with startToken was found and scanned. 
570 */ 571 protected boolean scanLiteral(final long token, final TokenType startTokenType, final LineInfoReceiver lir) { 572 // Check if it can be a literal. 573 if (!canStartLiteral(startTokenType)) { 574 return false; 575 } 576 // We break on ambiguous tokens so if we already moved on it can't be a literal. 577 if (stream.get(stream.last()) != token) { 578 return false; 579 } 580 // Rewind to token start position 581 reset(Token.descPosition(token)); 582 583 if (ch0 == '/') { 584 return scanRegEx(); 585 } else if (ch0 == '<') { 586 if (ch1 == '<') { 587 return scanHereString(lir); 588 } else if (Character.isJavaIdentifierStart(ch1)) { 589 return scanXMLLiteral(); 590 } 591 } 592 593 return false; 594 } 595 596 /** 597 * Scan over regex literal. 598 * 599 * @return True if a regex literal. 600 */ 601 private boolean scanRegEx() { 602 assert ch0 == '/'; 603 // Make sure it's not a comment. 604 if (ch1 != '/' && ch1 != '*') { 605 // Record beginning of literal. 606 final int start = position; 607 // Skip /. 608 skip(1); 609 boolean inBrackets = false; 610 611 // Scan for closing /, stopping at end of line. 612 while (!atEOF() && (ch0 != '/' || inBrackets) && !isEOL(ch0)) { 613 // Skip over escaped character. 614 if (ch0 == '\\') { 615 skip(1); 616 if (isEOL(ch0)) { 617 reset(start); 618 return false; 619 } 620 skip(1); 621 } else { 622 if (ch0 == '[') { 623 inBrackets = true; 624 } else if (ch0 == ']') { 625 inBrackets = false; 626 } 627 628 // Skip literal character. 629 skip(1); 630 } 631 } 632 633 // If regex literal. 634 if (ch0 == '/') { 635 // Skip /. 636 skip(1); 637 638 // Skip over options. 639 while (!atEOF() && Character.isJavaIdentifierPart(ch0) || ch0 == '\\' && ch1 == 'u') { 640 skip(1); 641 } 642 643 // Add regex token. 644 add(REGEX, start); 645 // Regex literal detected. 646 return true; 647 } 648 649 // False start try again. 650 reset(start); 651 } 652 653 // Regex literal not detected. 
654 return false; 655 } 656 657 /** 658 * Convert a digit to a integer. Can't use Character.digit since we are 659 * restricted to ASCII by the spec. 660 * 661 * @param ch Character to convert. 662 * @param base Numeric base. 663 * 664 * @return The converted digit or -1 if invalid. 665 */ 666 protected static int convertDigit(final char ch, final int base) { 667 int digit; 668 669 if ('0' <= ch && ch <= '9') { 670 digit = ch - '0'; 671 } else if ('A' <= ch && ch <= 'Z') { 672 digit = ch - 'A' + 10; 673 } else if ('a' <= ch && ch <= 'z') { 674 digit = ch - 'a' + 10; 675 } else { 676 return -1; 677 } 678 679 return digit < base ? digit : -1; 680 } 681 682 683 /** 684 * Get the value of a hexadecimal numeric sequence. 685 * 686 * @param length Number of digits. 687 * @param type Type of token to report against. 688 * @return Value of sequence or < 0 if no digits. 689 */ 690 private int hexSequence(final int length, final TokenType type) { 691 int value = 0; 692 693 for (int i = 0; i < length; i++) { 694 final int digit = convertDigit(ch0, 16); 695 696 if (digit == -1) { 697 error(Lexer.message("invalid.hex"), type, position, limit); 698 return i == 0 ? -1 : value; 699 } 700 701 value = digit | value << 4; 702 skip(1); 703 } 704 705 return value; 706 } 707 708 /** 709 * Get the value of an octal numeric sequence. This parses up to 3 digits with a maximum value of 255. 710 * 711 * @return Value of sequence. 712 */ 713 private int octalSequence() { 714 int value = 0; 715 716 for (int i = 0; i < 3; i++) { 717 final int digit = convertDigit(ch0, 8); 718 719 if (digit == -1) { 720 break; 721 } 722 value = digit | value << 3; 723 skip(1); 724 725 if (i == 1 && value >= 32) { 726 break; 727 } 728 } 729 return value; 730 } 731 732 /** 733 * Convert a string to a JavaScript identifier. 734 * 735 * @param start Position in source content. 736 * @param length Length of token. 737 * @return Ident string or null if an error. 
738 */ 739 private String valueOfIdent(final int start, final int length) throws RuntimeException { 740 // Save the current position. 741 final int savePosition = position; 742 // End of scan. 743 final int end = start + length; 744 // Reset to beginning of content. 745 reset(start); 746 // Buffer for recording characters. 747 final StringBuilder sb = new StringBuilder(length); 748 749 // Scan until end of line or end of file. 750 while (!atEOF() && position < end && !isEOL(ch0)) { 751 // If escape character. 752 if (ch0 == '\\' && ch1 == 'u') { 753 skip(2); 754 final int ch = hexSequence(4, TokenType.IDENT); 755 if (isWhitespace((char)ch)) { 756 return null; 757 } 758 if (ch < 0) { 759 sb.append('\\'); 760 sb.append('u'); 761 } else { 762 sb.append((char)ch); 763 } 764 } else { 765 // Add regular character. 766 sb.append(ch0); 767 skip(1); 768 } 769 } 770 771 // Restore position. 772 reset(savePosition); 773 774 return sb.toString(); 775 } 776 777 /** 778 * Scan over and identifier or keyword. Handles identifiers containing 779 * encoded Unicode chars. 780 * 781 * Example: 782 * 783 * var \u0042 = 44; 784 */ 785 private void scanIdentifierOrKeyword() { 786 // Record beginning of identifier. 787 final int start = position; 788 // Scan identifier. 789 final int length = scanIdentifier(); 790 // Check to see if it is a keyword. 791 final TokenType type = TokenLookup.lookupKeyword(content, start, length); 792 // Add keyword or identifier token. 793 add(type, start); 794 } 795 796 /** 797 * Convert a string to a JavaScript string object. 798 * 799 * @param start Position in source content. 800 * @param length Length of token. 801 * @return JavaScript string object. 802 */ 803 private String valueOfString(final int start, final int length, final boolean strict) throws RuntimeException { 804 // Save the current position. 805 final int savePosition = position; 806 // Calculate the end position. 807 final int end = start + length; 808 // Reset to beginning of string. 
809 reset(start); 810 811 // Buffer for recording characters. 812 final StringBuilder sb = new StringBuilder(length); 813 814 // Scan until end of string. 815 while (position < end) { 816 // If escape character. 817 if (ch0 == '\\') { 818 skip(1); 819 820 final char next = ch0; 821 final int afterSlash = position; 822 823 skip(1); 824 825 // Special characters. 826 switch (next) { 827 case '0': 828 case '1': 829 case '2': 830 case '3': 831 case '4': 832 case '5': 833 case '6': 834 case '7': { 835 if (strict) { 836 // "\0" itself is allowed in strict mode. Only other 'real' 837 // octal escape sequences are not allowed (eg. "\02", "\31"). 838 // See section 7.8.4 String literals production EscapeSequence 839 if (next != '0' || (ch0 >= '0' && ch0 <= '9')) { 840 error(Lexer.message("strict.no.octal"), STRING, position, limit); 841 } 842 } 843 reset(afterSlash); 844 // Octal sequence. 845 final int ch = octalSequence(); 846 847 if (ch < 0) { 848 sb.append('\\'); 849 sb.append('x'); 850 } else { 851 sb.append((char)ch); 852 } 853 break; 854 } 855 case 'n': 856 sb.append('\n'); 857 break; 858 case 't': 859 sb.append('\t'); 860 break; 861 case 'b': 862 sb.append('\b'); 863 break; 864 case 'f': 865 sb.append('\f'); 866 break; 867 case 'r': 868 sb.append('\r'); 869 break; 870 case '\'': 871 sb.append('\''); 872 break; 873 case '\"': 874 sb.append('\"'); 875 break; 876 case '\\': 877 sb.append('\\'); 878 break; 879 case '\r': // CR | CRLF 880 if (ch0 == '\n') { 881 skip(1); 882 } 883 // fall through 884 case '\n': // LF 885 case '\u2028': // LS 886 case '\u2029': // PS 887 // continue on the next line, slash-return continues string 888 // literal 889 break; 890 case 'x': { 891 // Hex sequence. 892 final int ch = hexSequence(2, STRING); 893 894 if (ch < 0) { 895 sb.append('\\'); 896 sb.append('x'); 897 } else { 898 sb.append((char)ch); 899 } 900 } 901 break; 902 case 'u': { 903 // Unicode sequence. 
904 final int ch = hexSequence(4, STRING); 905 906 if (ch < 0) { 907 sb.append('\\'); 908 sb.append('u'); 909 } else { 910 sb.append((char)ch); 911 } 912 } 913 break; 914 case 'v': 915 sb.append('\u000B'); 916 break; 917 // All other characters. 918 default: 919 sb.append(next); 920 break; 921 } 922 } else { 923 // Add regular character. 924 sb.append(ch0); 925 skip(1); 926 } 927 } 928 929 // Restore position. 930 reset(savePosition); 931 932 return sb.toString(); 933 } 934 935 /** 936 * Scan over a string literal. 937 * @param add true if we nare not just scanning but should actually modify the token stream 938 */ 939 protected void scanString(final boolean add) { 940 // Type of string. 941 TokenType type = STRING; 942 // Record starting quote. 943 final char quote = ch0; 944 // Skip over quote. 945 skip(1); 946 947 // Record beginning of string content. 948 final State stringState = saveState(); 949 950 // Scan until close quote or end of line. 951 while (!atEOF() && ch0 != quote && !isEOL(ch0)) { 952 // Skip over escaped character. 953 if (ch0 == '\\') { 954 type = ESCSTRING; 955 skip(1); 956 if (! isEscapeCharacter(ch0)) { 957 error(Lexer.message("invalid.escape.char"), STRING, position, limit); 958 } 959 if (isEOL(ch0)) { 960 // Multiline string literal 961 skipEOL(false); 962 continue; 963 } 964 } 965 // Skip literal character. 966 skip(1); 967 } 968 969 // If close quote. 970 if (ch0 == quote) { 971 // Skip close quote. 972 skip(1); 973 } else { 974 error(Lexer.message("missing.close.quote"), STRING, position, limit); 975 } 976 977 // If not just scanning. 978 if (add) { 979 // Record end of string. 980 stringState.setLimit(position - 1); 981 982 if (scripting && !stringState.isEmpty()) { 983 switch (quote) { 984 case '`': 985 // Mark the beginning of an exec string. 986 add(EXECSTRING, stringState.position, stringState.limit); 987 // Frame edit string with left brace. 988 add(LBRACE, stringState.position, stringState.position); 989 // Process edit string. 
990 editString(type, stringState); 991 // Frame edit string with right brace. 992 add(RBRACE, stringState.limit, stringState.limit); 993 break; 994 case '"': 995 // Only edit double quoted strings. 996 editString(type, stringState); 997 break; 998 case '\'': 999 // Add string token without editing. 1000 add(type, stringState.position, stringState.limit); 1001 break; 1002 default: 1003 break; 1004 } 1005 } else { 1006 /// Add string token without editing. 1007 add(type, stringState.position, stringState.limit); 1008 } 1009 } 1010 } 1011 1012 /** 1013 * Is the given character a valid escape char after "\" ? 1014 * 1015 * @param ch character to be checked 1016 * @return if the given character is valid after "\" 1017 */ 1018 protected boolean isEscapeCharacter(final char ch) { 1019 return true; 1020 } 1021 1022 /** 1023 * Convert string to number. 1024 * 1025 * @param valueString String to convert. 1026 * @param radix Numeric base. 1027 * @return Converted number. 1028 */ 1029 private static Number valueOf(final String valueString, final int radix) throws NumberFormatException { 1030 try { 1031 final long value = Long.parseLong(valueString, radix); 1032 if(value >= MIN_INT_L && value <= MAX_INT_L) { 1033 return Integer.valueOf((int)value); 1034 } 1035 return Long.valueOf(value); 1036 } catch (final NumberFormatException e) { 1037 if (radix == 10) { 1038 return Double.valueOf(valueString); 1039 } 1040 1041 double value = 0.0; 1042 1043 for (int i = 0; i < valueString.length(); i++) { 1044 final char ch = valueString.charAt(i); 1045 // Preverified, should always be a valid digit. 1046 final int digit = convertDigit(ch, radix); 1047 value *= radix; 1048 value += digit; 1049 } 1050 1051 return value; 1052 } 1053 } 1054 1055 /** 1056 * Scan a number. 1057 */ 1058 protected void scanNumber() { 1059 // Record beginning of number. 1060 final int start = position; 1061 // Assume value is a decimal. 1062 TokenType type = DECIMAL; 1063 1064 // First digit of number. 
1065 int digit = convertDigit(ch0, 10); 1066 1067 // If number begins with 0x. 1068 if (digit == 0 && (ch1 == 'x' || ch1 == 'X') && convertDigit(ch2, 16) != -1) { 1069 // Skip over 0xN. 1070 skip(3); 1071 // Skip over remaining digits. 1072 while (convertDigit(ch0, 16) != -1) { 1073 skip(1); 1074 } 1075 1076 type = HEXADECIMAL; 1077 } else { 1078 // Check for possible octal constant. 1079 boolean octal = digit == 0; 1080 // Skip first digit if not leading '.'. 1081 if (digit != -1) { 1082 skip(1); 1083 } 1084 1085 // Skip remaining digits. 1086 while ((digit = convertDigit(ch0, 10)) != -1) { 1087 // Check octal only digits. 1088 octal = octal && digit < 8; 1089 // Skip digit. 1090 skip(1); 1091 } 1092 1093 if (octal && position - start > 1) { 1094 type = OCTAL; 1095 } else if (ch0 == '.' || ch0 == 'E' || ch0 == 'e') { 1096 // Must be a double. 1097 if (ch0 == '.') { 1098 // Skip period. 1099 skip(1); 1100 // Skip mantissa. 1101 while (convertDigit(ch0, 10) != -1) { 1102 skip(1); 1103 } 1104 } 1105 1106 // Detect exponent. 1107 if (ch0 == 'E' || ch0 == 'e') { 1108 // Skip E. 1109 skip(1); 1110 // Detect and skip exponent sign. 1111 if (ch0 == '+' || ch0 == '-') { 1112 skip(1); 1113 } 1114 // Skip exponent. 1115 while (convertDigit(ch0, 10) != -1) { 1116 skip(1); 1117 } 1118 } 1119 1120 type = FLOATING; 1121 } 1122 } 1123 1124 if (Character.isJavaIdentifierStart(ch0)) { 1125 error(Lexer.message("missing.space.after.number"), type, position, 1); 1126 } 1127 1128 // Add number token. 1129 add(type, start); 1130 } 1131 1132 /** 1133 * Convert a regex token to a token object. 1134 * 1135 * @param start Position in source content. 1136 * @param length Length of regex token. 1137 * @return Regex token object. 1138 */ 1139 XMLToken valueOfXML(final int start, final int length) { 1140 return new XMLToken(source.getString(start, length)); 1141 } 1142 1143 /** 1144 * Scan over a XML token. 1145 * 1146 * @return TRUE if is an XML literal. 
*/
    private boolean scanXMLLiteral() {
        assert ch0 == '<' && Character.isJavaIdentifierStart(ch1);
        if (XML_LITERALS) {
            // Record beginning of xml expression.
            final int start = position;

            // Number of currently open (unclosed) elements.
            int openCount = 0;

            do {
                if (ch0 == '<') {
                    if (ch1 == '/' && Character.isJavaIdentifierStart(ch2)) {
                        // Closing tag: "</name".
                        skip(3);
                        openCount--;
                    } else if (Character.isJavaIdentifierStart(ch1)) {
                        // Opening tag: "<name".
                        skip(2);
                        openCount++;
                    } else if (ch1 == '?') {
                        // "<?" introducer; does not affect nesting.
                        skip(2);
                    } else if (ch1 == '!' && ch2 == '-' && ch3 == '-') {
                        // "<!--" introducer; does not affect nesting.
                        skip(4);
                    } else {
                        // Not something we recognize; rewind and give up.
                        reset(start);
                        return false;
                    }

                    // Scan the remainder of the tag up to the closing '>'.
                    while (!atEOF() && ch0 != '>') {
                        if (ch0 == '/' && ch1 == '>') {
                            // Self closing tag "/>" closes the element immediately.
                            openCount--;
                            skip(1);
                            break;
                        } else if (ch0 == '\"' || ch0 == '\'') {
                            // Quoted attribute value; scan it without adding a token.
                            scanString(false);
                        } else {
                            skip(1);
                        }
                    }

                    // Tag was not terminated; rewind and give up.
                    if (ch0 != '>') {
                        reset(start);
                        return false;
                    }

                    // Skip '>'.
                    skip(1);
                } else if (atEOF()) {
                    // Ran out of content with elements still open.
                    reset(start);
                    return false;
                } else {
                    // Element content.
                    skip(1);
                }
            } while (openCount > 0);

            add(XML, start);
            return true;
        }

        return false;
    }

    /**
     * Scan over identifier characters.
     *
     * @return Length of identifier or zero if none found.
     */
    private int scanIdentifier() {
        final int start = position;

        // Make sure first character is valid start character.
        if (ch0 == '\\' && ch1 == 'u') {
            // Unicode escape sequence; skip the "\\u" prefix and decode four hex digits.
            skip(2);
            final int ch = hexSequence(4, TokenType.IDENT);

            if (!Character.isJavaIdentifierStart(ch)) {
                error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position);
            }
        } else if (!Character.isJavaIdentifierStart(ch0)) {
            // Not an identifier.
            return 0;
        }

        // Make sure remaining characters are valid part characters.
        while (!atEOF()) {
            if (ch0 == '\\' && ch1 == 'u') {
                skip(2);
                final int ch = hexSequence(4, TokenType.IDENT);

                if (!Character.isJavaIdentifierPart(ch)) {
                    error(Lexer.message("illegal.identifier.character"), TokenType.IDENT, start, position);
                }
            } else if (Character.isJavaIdentifierPart(ch0)) {
                skip(1);
            } else {
                break;
            }
        }

        // Length of identifier sequence.
        return position - start;
    }

    /**
     * Compare two identifiers (in content) for equality.
     * The comparison is a character by character comparison of the raw content,
     * so the lengths must match exactly.
     *
     * @param aStart  Start of first identifier.
     * @param aLength Length of first identifier.
     * @param bStart  Start of second identifier.
     * @param bLength Length of second identifier.
     * @return True if equal.
     */
    private boolean identifierEqual(final int aStart, final int aLength, final int bStart, final int bLength) {
        if (aLength == bLength) {
            for (int i = 0; i < aLength; i++) {
                if (content[aStart + i] != content[bStart + i]) {
                    return false;
                }
            }

            return true;
        }

        return false;
    }

    /**
     * Detect if a line starts with a marker identifier.
     * Leading whitespace is skipped and the identifier found at the current
     * position (if any) is scanned over as a side effect.
     *
     * @param identStart  Start of identifier.
     * @param identLength Length of identifier.
     * @return True if detected.
     */
    private boolean hasHereMarker(final int identStart, final int identLength) {
        // Skip any whitespace.
        skipWhitespace(false);

        return identifierEqual(identStart, identLength, position, scanIdentifier());
    }

    /**
     * Lexer to service edit strings.
     * Splits a string containing <code>${...}</code> expressions into string tokens
     * and parenthesized expression tokens joined by ADD tokens.
     */
    private static class EditStringLexer extends Lexer {
        /** Type of string literals to emit. */
        final TokenType stringType;

        /**
         * Constructor.
         *
         * @param lexer       Lexer passed to the superclass constructor.
         * @param stringType  Type of string literals to emit.
         * @param stringState State passed to the superclass constructor (the string to edit).
         */
        EditStringLexer(final Lexer lexer, final TokenType stringType, final State stringState) {
            super(lexer, stringState);

            this.stringType = stringType;
        }

        /**
         * Lexify the contents of the string.
         */
        @Override
        public void lexify() {
            // Record start of string position.
            int stringStart = position;
            // Indicate that the priming first string has not been emitted.
            boolean primed = false;

            while (true) {
                // Detect end of content.
                if (atEOF()) {
                    break;
                }

                // Honour escapes (should be well formed.)
                if (ch0 == '\\' && stringType == ESCSTRING) {
                    skip(2);

                    continue;
                }

                // If start of expression.
                if (ch0 == '$' && ch1 == '{') {
                    // Emit the string portion preceding the expression. The very first
                    // portion is always emitted (even when empty) to prime the concatenation.
                    if (!primed || stringStart != position) {
                        if (primed) {
                            add(ADD, stringStart, stringStart + 1);
                        }

                        add(stringType, stringStart, position);
                        primed = true;
                    }

                    // Skip ${
                    skip(2);

                    // Save expression state.
                    final State expressionState = saveState();

                    // Start with one open brace.
                    int braceCount = 1;

                    // Scan for the rest of the string.
                    while (!atEOF()) {
                        // If closing brace.
                        if (ch0 == '}') {
                            // Break only if matching brace.
                            if (--braceCount == 0) {
                                break;
                            }
                        } else if (ch0 == '{') {
                            // Bump up the brace count.
                            braceCount++;
                        }

                        // Skip to next character.
                        skip(1);
                    }

                    // If braces don't match then report an error.
                    if (braceCount != 0) {
                        error(Lexer.message("edit.string.missing.brace"), LBRACE, expressionState.position - 1, 1);
                    }

                    // Mark end of expression.
                    expressionState.setLimit(position);
                    // Skip closing brace.
                    skip(1);

                    // Start next string.
                    stringStart = position;

                    // Concatenate expression.
                    add(ADD, expressionState.position, expressionState.position + 1);
                    add(LPAREN, expressionState.position, expressionState.position + 1);

                    // Scan expression.
                    final Lexer lexer = new Lexer(this, expressionState);
                    lexer.lexify();

                    // Close out expression parenthesis.
                    add(RPAREN, position - 1, position);

                    continue;
                }

                // Next character in string.
                skip(1);
            }

            // If there is any unemitted string portion.
            if (stringStart != limit) {
                // Concatenate remaining string.
                if (primed) {
                    // The third argument is an end position (as in every other add call
                    // here), so the one character ADD token ends at stringStart + 1.
                    add(ADD, stringStart, stringStart + 1);
                }

                add(stringType, stringStart, limit);
            }
        }

    }

    /**
     * Edit string for nested expressions.
     *
     * @param stringType  Type of string literals to emit.
     * @param stringState State of lexer at start of string.
     */
    private void editString(final TokenType stringType, final State stringState) {
        // Use special lexer to scan string.
        final EditStringLexer lexer = new EditStringLexer(this, stringType, stringState);
        lexer.lexify();

        // Need to keep lexer informed.
        last = stringType;
    }

    /**
     * Scan over a here string.
     *
     * @param lir Receiver notified (via {@code lineInfo}) of the line number and
     *            line start position reached while hunting for the end marker.
     * @return TRUE if is a here string.
     */
    private boolean scanHereString(final LineInfoReceiver lir) {
        assert ch0 == '<' && ch1 == '<';
        if (scripting) {
            // Record beginning of here string.
            final State saved = saveState();

            // << or <<<
            final boolean excludeLastEOL = ch2 != '<';

            if (excludeLastEOL) {
                skip(2);
            } else {
                skip(3);
            }

            // Scan identifier.
            final int identStart = position;
            final int identLength = scanIdentifier();

            // Check for identifier.
            if (identLength == 0) {
                // Treat as shift.
                restoreState(saved);

                return false;
            }

            // Record rest of line.
            final State restState = saveState();
            // keep line number updated
            int lastLine = line;

            skipLine(false);
            lastLine++;
            int lastLinePosition = position;
            restState.setLimit(position);

            // Record beginning of string.
            final State stringState = saveState();
            int stringEnd = position;

            // Hunt down marker.
            while (!atEOF()) {
                // Skip any whitespace.
                skipWhitespace(false);

                if (hasHereMarker(identStart, identLength)) {
                    break;
                }

                skipLine(false);
                lastLine++;
                lastLinePosition = position;
                stringEnd = position;
            }

            // notify last line information
            lir.lineInfo(lastLine, lastLinePosition);

            // Record end of string.
            stringState.setLimit(stringEnd);

            // If marker is missing.
            if (stringState.isEmpty() || atEOF()) {
                // error() takes a length as its last argument; report a zero length
                // token at the current position.
                error(Lexer.message("here.missing.end.marker", source.getString(identStart, identLength)), last, position, 0);
                restoreState(saved);

                return false;
            }

            // Remove last end of line if specified.
            if (excludeLastEOL) {
                // Handles \n.
                if (content[stringEnd - 1] == '\n') {
                    stringEnd--;
                }

                // Handles \r and \r\n.
                if (content[stringEnd - 1] == '\r') {
                    stringEnd--;
                }

                // Update end of string.
                stringState.setLimit(stringEnd);
            }

            // Edit string if appropriate.
            if (scripting && !stringState.isEmpty()) {
                editString(STRING, stringState);
            } else {
                // Add here string.
                add(STRING, stringState.position, stringState.limit);
            }

            // Scan rest of original line.
            final Lexer restLexer = new Lexer(this, restState);

            restLexer.lexify();

            return true;
        }

        return false;
    }

    /**
     * Breaks source content down into lex units, adding tokens to the token
     * stream. The routine scans until the stream buffer is full.  Can be called
     * repeatedly until EOF is detected.
     */
    public void lexify() {
        // A nested lexer keeps going regardless of how full the stream is.
        while (!stream.isFull() || nested) {
            // Skip over whitespace.
            skipWhitespace(true);

            // Detect end of file.
            if (atEOF()) {
                if (!nested) {
                    // Add an EOF token at the end.
                    add(EOF, position);
                }

                break;
            }

            // Check for comments. Note that we don't scan for regexp and other literals here as
            // we may not have enough context to distinguish them from similar looking operators.
            // Instead we break on ambiguous operators below and let the parser decide.
            if (ch0 == '/' && skipComments()) {
                continue;
            }

            if (scripting && ch0 == '#' && skipComments()) {
                continue;
            }

            // TokenType for lookup of delimiter or operator.
            TokenType type;

            if (ch0 == '.' && convertDigit(ch1, 10) != -1) {
                // '.' followed by digit.
                // Scan and add a number.
                scanNumber();
            } else if ((type = TokenLookup.lookupOperator(ch0, ch1, ch2, ch3)) != null) {
                // Get the number of characters in the token.
                final int typeLength = type.getLength();
                // Skip that many characters.
                skip(typeLength);
                // Add operator token.
                add(type, position - typeLength);
                // Some operator tokens also mark the beginning of regexp, XML, or here string literals.
                // We break to let the parser decide what it is.
                if (canStartLiteral(type)) {
                    break;
                }
            } else if (Character.isJavaIdentifierStart(ch0) || ch0 == '\\' && ch1 == 'u') {
                // Scan and add identifier or keyword.
                scanIdentifierOrKeyword();
            } else if (isStringDelimiter(ch0)) {
                // Scan and add a string.
                scanString(true);
            } else if (Character.isDigit(ch0)) {
                // Scan and add a number.
                scanNumber();
            } else {
                // Don't recognize this character.
                skip(1);
                add(ERROR, position - 1);
            }
        }
    }

    /**
     * Return value of token given its token descriptor.
     *
     * @param token  Token descriptor.
     * @param strict Passed through to the escaped string conversion (ESCSTRING tokens).
     * @return JavaScript value, or null if the token type carries no value.
     */
    Object getValueOf(final long token, final boolean strict) {
        final int start = Token.descPosition(token);
        final int len   = Token.descLength(token);

        switch (Token.descType(token)) {
        case DECIMAL:
            return Lexer.valueOf(source.getString(start, len), 10); // number
        case OCTAL:
            return Lexer.valueOf(source.getString(start, len), 8); // number
        case HEXADECIMAL:
            // Skip the two character radix prefix.
            return Lexer.valueOf(source.getString(start + 2, len - 2), 16); // number
        case FLOATING:
            return Double.valueOf(source.getString(start, len)); // number
        case STRING:
            return source.getString(start, len); // String
        case ESCSTRING:
            return valueOfString(start, len, strict); // String
        case IDENT:
            return valueOfIdent(start, len); // String
        case REGEX:
            return valueOfPattern(start, len); // RegexToken::LexerToken
        case XML:
            return valueOfXML(start, len); // XMLToken::LexerToken
        default:
            break;
        }

        return null;
    }

    /**
     * Get the correctly localized error message for a given message id and format arguments.
     * The message id is looked up with the "lexer.error." prefix.
     *
     * @param msgId message id
     * @param args  format arguments
     * @return message
     */
    protected static String message(final String msgId, final String... args) {
        return ECMAErrors.getMessage("lexer.error." + msgId, args);
    }

    /**
     * Generate a runtime exception
     *
     * @param message       error message
     * @param type          token type
     * @param start         start position of lexed error
     * @param length        length of lexed error
     * @throws ParserException  unconditionally
     */
    protected void error(final String message, final TokenType type, final int start, final int length) throws ParserException {
        final long token      = Token.toDesc(type, start, length);
        final int  pos        = Token.descPosition(token);
        final int  lineNum    = source.getLine(pos);
        final int  columnNum  = source.getColumn(pos);
        final String formatted = ErrorManager.format(message, source, lineNum, columnNum, token);
        throw new ParserException(JSErrorType.SYNTAX_ERROR, formatted, source, lineNum, columnNum, token);
    }

    /**
     * Helper class for Lexer tokens, e.g XML or RegExp tokens.
     * This is the abstract superclass
     */
    public static abstract class LexerToken {
        /** Expression text of the token. */
        private final String expression;

        /**
         * Constructor
         * @param expression token expression
         */
        protected LexerToken(final String expression) {
            this.expression = expression;
        }

        /**
         * Get the expression
         * @return expression
         */
        public String getExpression() {
            return expression;
        }
    }

    /**
     * Temporary container for regular expressions.
     */
    public static class RegexToken extends LexerToken {
        /** Options. */
        private final String options;

        /**
         * Constructor.
         *
         * @param expression  regexp expression
         * @param options     regexp options
         */
        public RegexToken(final String expression, final String options) {
            super(expression);
            this.options = options;
        }

        /**
         * Get regexp options
         * @return options
         */
        public String getOptions() {
            return options;
        }

        /**
         * Render the token as <code>/expression/options</code>.
         */
        @Override
        public String toString() {
            return '/' + getExpression() + '/' + options;
        }
    }

    /**
     * Temporary container for XML expression.
     */
    public static class XMLToken extends LexerToken {

        /**
         * Constructor.
         *
         * @param expression  XML expression
         */
        public XMLToken(final String expression) {
            super(expression);
        }
    }
}