1 /* 2 * Permission is hereby granted, free of charge, to any person obtaining a copy of 3 * this software and associated documentation files (the "Software"), to deal in 4 * the Software without restriction, including without limitation the rights to 5 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 6 * of the Software, and to permit persons to whom the Software is furnished to do 7 * so, subject to the following conditions: 8 * 9 * The above copyright notice and this permission notice shall be included in all 10 * copies or substantial portions of the Software. 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 * SOFTWARE. 19 */ 20 package jdk.nashorn.internal.joni; 21 22 import static jdk.nashorn.internal.joni.Option.isSingleline; 23 import static jdk.nashorn.internal.joni.ast.QuantifierNode.isRepeatInfinite; 24 25 import jdk.nashorn.internal.joni.ast.QuantifierNode; 26 import jdk.nashorn.internal.joni.constants.AnchorType; 27 import jdk.nashorn.internal.joni.constants.MetaChar; 28 import jdk.nashorn.internal.joni.constants.TokenType; 29 import jdk.nashorn.internal.joni.encoding.CharacterType; 30 import jdk.nashorn.internal.joni.encoding.PosixBracket; 31 import jdk.nashorn.internal.joni.encoding.Ptr; 32 import jdk.nashorn.internal.joni.exception.ErrorMessages; 33 import jdk.nashorn.internal.joni.exception.JOniException; 34 35 class Lexer extends ScannerSupport { 36 protected final ScanEnvironment env; 37 protected final Syntax syntax; // fast access to syntax 38 protected final Token token = new Token(); // current token 39 40 protected Lexer(ScanEnvironment env, char[] chars, int p, int end) { 41 super(chars, p, end); 42 this.env = env; 43 this.syntax = env.syntax; 44 } 45 46 /** 47 * @return 0: normal {n,m}, 2: fixed {n} 48 * !introduce returnCode here 49 */ 50 private int fetchRangeQuantifier() { 51 mark(); 52 boolean synAllow = syntax.allowInvalidInterval(); 53 54 if (!left()) { 55 if (synAllow) { 56 return 1; /* "....{" : OK! */ 57 } else { 58 newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE); 59 } 60 } 61 62 if (!synAllow) { 63 c = peek(); 64 if (c == ')' || c == '(' || c == '|') { 65 newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE); 66 } 67 } 68 69 int low = scanUnsignedNumber(); 70 if (low < 0) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); 71 if (low > Config.MAX_REPEAT_NUM) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); 72 73 boolean nonLow = false; 74 if (p == _p) { /* can't read low */ 75 if (syntax.allowIntervalLowAbbrev()) { 76 low = 0; 77 nonLow = true; 78 } else { 79 return invalidRangeQuantifier(synAllow); 80 } 81 } 82 83 if (!left()) return invalidRangeQuantifier(synAllow); 84 85 fetch(); 86 int up; 87 int ret = 0; 88 if (c == ',') { 89 int prev = p; // ??? last 90 up = scanUnsignedNumber(); 91 if (up < 0) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); 92 if (up > Config.MAX_REPEAT_NUM) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); 93 94 if (p == prev) { 95 if (nonLow) return invalidRangeQuantifier(synAllow); 96 up = QuantifierNode.REPEAT_INFINITE; /* {n,} : {n,infinite} */ 97 } 98 } else { 99 if (nonLow) return invalidRangeQuantifier(synAllow); 100 unfetch(); 101 up = low; /* {n} : exact n times */ 102 ret = 2; /* fixed */ 103 } 104 105 if (!left()) return invalidRangeQuantifier(synAllow); 106 fetch(); 107 108 if (syntax.opEscBraceInterval()) { 109 if (c != syntax.metaCharTable.esc) return invalidRangeQuantifier(synAllow); 110 fetch(); 111 } 112 113 if (c != '}') return invalidRangeQuantifier(synAllow); 114 115 if (!isRepeatInfinite(up) && low > up) { 116 newValueException(ERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE); 117 } 118 119 token.type = TokenType.INTERVAL; 120 token.setRepeatLower(low); 121 token.setRepeatUpper(up); 122 123 return ret; /* 0: normal {n,m}, 2: fixed {n} */ 124 } 125 126 private int invalidRangeQuantifier(boolean synAllow) { 127 if (synAllow) { 128 restore(); 129 return 1; 130 } else { 131 newSyntaxException(ERR_INVALID_REPEAT_RANGE_PATTERN); 132 return 0; // not reached 133 } 134 } 135 136 /* \M-, \C-, \c, or \... */ 137 private int fetchEscapedValue() { 138 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE); 139 fetch(); 140 141 switch(c) { 142 143 case 'M': 144 if (syntax.op2EscCapitalMBarMeta()) { 145 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_META); 146 fetch(); 147 if (c != '-') newSyntaxException(ERR_META_CODE_SYNTAX); 148 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_META); 149 fetch(); 150 if (c == syntax.metaCharTable.esc) { 151 c = fetchEscapedValue(); 152 } 153 c = ((c & 0xff) | 0x80); 154 } else { 155 fetchEscapedValueBackSlash(); 156 } 157 break; 158 159 case 'C': 160 if (syntax.op2EscCapitalCBarControl()) { 161 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_CONTROL); 162 fetch(); 163 if (c != '-') newSyntaxException(ERR_CONTROL_CODE_SYNTAX); 164 fetchEscapedValueControl(); 165 } else { 166 fetchEscapedValueBackSlash(); 167 } 168 break; 169 170 case 'c': 171 if (syntax.opEscCControl()) { 172 fetchEscapedValueControl(); 173 } 174 /* fall through */ 175 176 default: 177 fetchEscapedValueBackSlash(); 178 } // switch 179 180 return c; // ??? 181 } 182 183 private void fetchEscapedValueBackSlash() { 184 c = env.convertBackslashValue(c); 185 } 186 187 private void fetchEscapedValueControl() { 188 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_CONTROL); 189 fetch(); 190 if (c == '?') { 191 c = 0177; 192 } else { 193 if (c == syntax.metaCharTable.esc) { 194 c = fetchEscapedValue(); 195 } 196 c &= 0x9f; 197 } 198 } 199 200 private int nameEndCodePoint(int start) { 201 switch(start) { 202 case '<': 203 return '>'; 204 case '\'': 205 return '\''; 206 default: 207 return 0; 208 } 209 } 210 211 // USE_NAMED_GROUP && USE_BACKREF_AT_LEVEL 212 /* 213 \k<name+n>, \k<name-n> 214 \k<num+n>, \k<num-n> 215 \k<-num+n>, \k<-num-n> 216 */ 217 218 // value implicit (rnameEnd) 219 private boolean fetchNameWithLevel(int startCode, Ptr rbackNum, Ptr rlevel) { 220 int src = p; 221 boolean existLevel = false; 222 int isNum = 0; 223 int sign = 1; 224 225 int endCode = nameEndCodePoint(startCode); 226 int pnumHead = p; 227 int nameEnd = stop; 228 229 String err = null; 230 if (!left()) { 231 newValueException(ERR_EMPTY_GROUP_NAME); 232 } else { 233 fetch(); 234 if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME); 235 if (Character.isDigit(c)) { 236 isNum = 1; 237 } else if (c == '-') { 238 isNum = 2; 239 sign = -1; 240 pnumHead = p; 241 } else if (!EncodingHelper.isWord(c)) { 242 err = ERR_INVALID_GROUP_NAME; 243 } 244 } 245 246 while (left()) { 247 nameEnd = p; 248 fetch(); 249 if (c == endCode || c == ')' || c == '+' || c == '-') { 250 if (isNum == 2) err = ERR_INVALID_GROUP_NAME; 251 break; 252 } 253 254 if (isNum != 0) { 255 if (EncodingHelper.isDigit(c)) { 256 isNum = 1; 257 } else { 258 err = ERR_INVALID_GROUP_NAME; 259 // isNum = 0; 260 } 261 } else if (!EncodingHelper.isWord(c)) { 262 err = ERR_INVALID_CHAR_IN_GROUP_NAME; 263 } 264 } 265 266 boolean isEndCode = false; 267 if (err == null && c != endCode) { 268 if (c == '+' || c == '-') { 269 int flag = c == '-' ? -1 : 1; 270 271 fetch(); 272 if (!EncodingHelper.isDigit(c)) newValueException(ERR_INVALID_GROUP_NAME, src, stop); 273 unfetch(); 274 int level = scanUnsignedNumber(); 275 if (level < 0) newValueException(ERR_TOO_BIG_NUMBER); 276 rlevel.p = level * flag; 277 existLevel = true; 278 279 fetch(); 280 isEndCode = c == endCode; 281 } 282 283 if (!isEndCode) { 284 err = ERR_INVALID_GROUP_NAME; 285 nameEnd = stop; 286 } 287 } 288 289 if (err == null) { 290 if (isNum != 0) { 291 mark(); 292 p = pnumHead; 293 int backNum = scanUnsignedNumber(); 294 restore(); 295 if (backNum < 0) { 296 newValueException(ERR_TOO_BIG_NUMBER); 297 } else if (backNum == 0) { 298 newValueException(ERR_INVALID_GROUP_NAME, src, stop); 299 } 300 rbackNum.p = backNum * sign; 301 } 302 value = nameEnd; 303 return existLevel; 304 } else { 305 newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd); 306 return false; // not reached 307 } 308 } 309 310 // USE_NAMED_GROUP 311 // ref: 0 -> define name (don't allow number name) 312 // 1 -> reference name (allow number name) 313 private int fetchNameForNamedGroup(int startCode, boolean ref) { 314 int src = p; 315 value = 0; 316 317 int isNum = 0; 318 int sign = 1; 319 320 int endCode = nameEndCodePoint(startCode); 321 int pnumHead = p; 322 int nameEnd = stop; 323 324 String err = null; 325 if (!left()) { 326 newValueException(ERR_EMPTY_GROUP_NAME); 327 } else { 328 fetch(); 329 if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME); 330 if (EncodingHelper.isDigit(c)) { 331 if (ref) { 332 isNum = 1; 333 } else { 334 err = ERR_INVALID_GROUP_NAME; 335 // isNum = 0; 336 } 337 } else if (c == '-') { 338 if (ref) { 339 isNum = 2; 340 sign = -1; 341 pnumHead = p; 342 } else { 343 err = ERR_INVALID_GROUP_NAME; 344 // isNum = 0; 345 } 346 } else if (!EncodingHelper.isWord(c)) { 347 err = ERR_INVALID_CHAR_IN_GROUP_NAME; 348 } 349 } 350 351 if (err == null) { 352 while (left()) { 353 nameEnd = p; 354 fetch(); 355 if (c == endCode || c == ')') { 356 if (isNum == 2) err = ERR_INVALID_GROUP_NAME; 357 break; 358 } 359 360 if (isNum != 0) { 361 if (EncodingHelper.isDigit(c)) { 362 isNum = 1; 363 } else { 364 if (!EncodingHelper.isWord(c)) { 365 err = ERR_INVALID_CHAR_IN_GROUP_NAME; 366 } else { 367 err = ERR_INVALID_GROUP_NAME; 368 } 369 // isNum = 0; 370 } 371 } else { 372 if (!EncodingHelper.isWord(c)) { 373 err = ERR_INVALID_CHAR_IN_GROUP_NAME; 374 } 375 } 376 } 377 378 if (c != endCode) { 379 err = ERR_INVALID_GROUP_NAME; 380 nameEnd = stop; 381 } 382 383 int backNum = 0; 384 if (isNum != 0) { 385 mark(); 386 p = pnumHead; 387 backNum = scanUnsignedNumber(); 388 restore(); 389 if (backNum < 0) { 390 newValueException(ERR_TOO_BIG_NUMBER); 391 } else if (backNum == 0) { 392 newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd); 393 } 394 backNum *= sign; 395 } 396 value = nameEnd; 397 return backNum; 398 } else { 399 while (left()) { 400 nameEnd = p; 401 fetch(); 402 if (c == endCode || c == ')') break; 403 } 404 if (!left()) nameEnd = stop; 405 newValueException(err, src, nameEnd); 406 return 0; // not reached 407 } 408 } 409 410 // #else USE_NAMED_GROUP 411 // make it return nameEnd! 412 private final int fetchNameForNoNamedGroup(int startCode, boolean ref) { 413 int src = p; 414 value = 0; 415 416 int isNum = 0; 417 int sign = 1; 418 419 int endCode = nameEndCodePoint(startCode); 420 int pnumHead = p; 421 int nameEnd = stop; 422 423 String err = null; 424 if (!left()) { 425 newValueException(ERR_EMPTY_GROUP_NAME); 426 } else { 427 fetch(); 428 if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME); 429 430 if (EncodingHelper.isDigit(c)) { 431 isNum = 1; 432 } else if (c == '-') { 433 isNum = 2; 434 sign = -1; 435 pnumHead = p; 436 } else { 437 err = ERR_INVALID_CHAR_IN_GROUP_NAME; 438 } 439 } 440 441 while(left()) { 442 nameEnd = p; 443 444 fetch(); 445 if (c == endCode || c == ')') break; 446 if (!EncodingHelper.isDigit(c)) err = ERR_INVALID_CHAR_IN_GROUP_NAME; 447 } 448 449 if (err == null && c != endCode) { 450 err = ERR_INVALID_GROUP_NAME; 451 nameEnd = stop; 452 } 453 454 if (err == null) { 455 mark(); 456 p = pnumHead; 457 int backNum = scanUnsignedNumber(); 458 restore(); 459 if (backNum < 0) { 460 newValueException(ERR_TOO_BIG_NUMBER); 461 } else if (backNum == 0){ 462 newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd); 463 } 464 backNum *= sign; 465 466 value = nameEnd; 467 return backNum; 468 } else { 469 newValueException(err, src, nameEnd); 470 return 0; // not reached 471 } 472 } 473 474 protected final int fetchName(int startCode, boolean ref) { 475 if (Config.USE_NAMED_GROUP) { 476 return fetchNameForNamedGroup(startCode, ref); 477 } else { 478 return fetchNameForNoNamedGroup(startCode, ref); 479 } 480 } 481 482 private boolean strExistCheckWithEsc(int[]s, int n, int bad) { 483 int p = this.p; 484 int to = this.stop; 485 486 boolean inEsc = false; 487 int i=0; 488 while(p < to) { 489 if (inEsc) { 490 inEsc = false; 491 p ++; 492 } else { 493 int x = chars[p]; 494 int q = p + 1; 495 if (x == s[0]) { 496 for (i=1; i<n && q < to; i++) { 497 x = chars[q]; 498 if (x != s[i]) break; 499 q++; 500 } 501 if (i >= n) return true; 502 p++; 503 } else { 504 x = chars[p]; 505 if (x == bad) return false; 506 else if (x == syntax.metaCharTable.esc) inEsc = true; 507 p = q; 508 } 509 } 510 } 511 return false; 512 } 513 514 private static final int send[] = new int[]{':', ']'}; 515 516 private void fetchTokenInCCFor_charType(boolean flag, int type) { 517 token.type = TokenType.CHAR_TYPE; 518 token.setPropCType(type); 519 token.setPropNot(flag); 520 } 521 522 private void fetchTokenInCCFor_p() { 523 int c2 = peek(); // !!! migrate to peekIs 524 if (c2 == '{' && syntax.op2EscPBraceCharProperty()) { 525 inc(); 526 token.type = TokenType.CHAR_PROPERTY; 527 token.setPropNot(c == 'P'); 528 529 if (syntax.op2EscPBraceCircumflexNot()) { 530 c2 = fetchTo(); 531 if (c2 == '^') { 532 token.setPropNot(!token.getPropNot()); 533 } else { 534 unfetch(); 535 } 536 } 537 } else { 538 syntaxWarn(Warnings.INVALID_UNICODE_PROPERTY, (char)c); 539 } 540 } 541 542 private void fetchTokenInCCFor_x() { 543 if (!left()) return; 544 int last = p; 545 546 if (peekIs('{') && syntax.opEscXBraceHex8()) { 547 inc(); 548 int num = scanUnsignedHexadecimalNumber(8); 549 if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE); 550 if (left()) { 551 int c2 = peek(); 552 if (EncodingHelper.isXDigit(c2)) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE); 553 } 554 555 if (p > last + 1 && left() && peekIs('}')) { 556 inc(); 557 token.type = TokenType.CODE_POINT; 558 token.base = 16; 559 token.setCode(num); 560 } else { 561 /* can't read nothing or invalid format */ 562 p = last; 563 } 564 } else if (syntax.opEscXHex2()) { 565 int num = scanUnsignedHexadecimalNumber(2); 566 if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); 567 if (p == last) { /* can't read nothing. */ 568 num = 0; /* but, it's not error */ 569 } 570 token.type = TokenType.RAW_BYTE; 571 token.base = 16; 572 token.setC(num); 573 } 574 } 575 576 private void fetchTokenInCCFor_u() { 577 if (!left()) return; 578 int last = p; 579 580 if (syntax.op2EscUHex4()) { 581 int num = scanUnsignedHexadecimalNumber(4); 582 if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); 583 if (p == last) { /* can't read nothing. */ 584 num = 0; /* but, it's not error */ 585 } 586 token.type = TokenType.CODE_POINT; 587 token.base = 16; 588 token.setCode(num); 589 } 590 } 591 592 private void fetchTokenInCCFor_digit() { 593 if (syntax.opEscOctal3()) { 594 unfetch(); 595 int last = p; 596 int num = scanUnsignedOctalNumber(3); 597 if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); 598 if (p == last) { /* can't read nothing. */ 599 num = 0; /* but, it's not error */ 600 } 601 token.type = TokenType.RAW_BYTE; 602 token.base = 8; 603 token.setC(num); 604 } 605 } 606 607 private void fetchTokenInCCFor_posixBracket() { 608 if (syntax.opPosixBracket() && peekIs(':')) { 609 token.backP = p; /* point at '[' is readed */ 610 inc(); 611 if (strExistCheckWithEsc(send, send.length, ']')) { 612 token.type = TokenType.POSIX_BRACKET_OPEN; 613 } else { 614 unfetch(); 615 // remove duplication, goto cc_in_cc; 616 if (syntax.op2CClassSetOp()) { 617 token.type = TokenType.CC_CC_OPEN; 618 } else { 619 env.ccEscWarn("["); 620 } 621 } 622 } else { // cc_in_cc: 623 if (syntax.op2CClassSetOp()) { 624 token.type = TokenType.CC_CC_OPEN; 625 } else { 626 env.ccEscWarn("["); 627 } 628 } 629 } 630 631 private void fetchTokenInCCFor_and() { 632 if (syntax.op2CClassSetOp() && left() && peekIs('&')) { 633 inc(); 634 token.type = TokenType.CC_AND; 635 } 636 } 637 638 protected final TokenType fetchTokenInCC() { 639 if (!left()) { 640 token.type = TokenType.EOT; 641 return token.type; 642 } 643 644 fetch(); 645 token.type = TokenType.CHAR; 646 token.base = 0; 647 token.setC(c); 648 token.escaped = false; 649 650 if (c == ']') { 651 token.type = TokenType.CC_CLOSE; 652 } else if (c == '-') { 653 token.type = TokenType.CC_RANGE; 654 } else if (c == syntax.metaCharTable.esc) { 655 if (!syntax.backSlashEscapeInCC()) return token.type; 656 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE); 657 fetch(); 658 token.escaped = true; 659 token.setC(c); 660 661 switch (c) { 662 case 'w': 663 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); 664 break; 665 case 'W': 666 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); 667 break; 668 case 'd': 669 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); 670 break; 671 case 'D': 672 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); 673 break; 674 case 's': 675 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); 676 break; 677 case 'S': 678 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); 679 break; 680 case 'h': 681 if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT); 682 break; 683 case 'H': 684 if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(true, CharacterType.XDIGIT); 685 break; 686 case 'p': 687 case 'P': 688 fetchTokenInCCFor_p(); 689 break; 690 case 'x': 691 fetchTokenInCCFor_x(); 692 break; 693 case 'u': 694 fetchTokenInCCFor_u(); 695 break; 696 case '0': 697 case '1': 698 case '2': 699 case '3': 700 case '4': 701 case '5': 702 case '6': 703 case '7': 704 fetchTokenInCCFor_digit(); 705 break; 706 707 default: 708 unfetch(); 709 int num = fetchEscapedValue(); 710 if (token.getC() != num) { 711 token.setCode(num); 712 token.type = TokenType.CODE_POINT; 713 } 714 break; 715 } // switch 716 717 } else if (c == '[') { 718 fetchTokenInCCFor_posixBracket(); 719 } else if (c == '&') { 720 fetchTokenInCCFor_and(); 721 } 722 return token.type; 723 } 724 725 protected final int backrefRelToAbs(int relNo) { 726 return env.numMem + 1 + relNo; 727 } 728 729 private void fetchTokenFor_repeat(int lower, int upper) { 730 token.type = TokenType.OP_REPEAT; 731 token.setRepeatLower(lower); 732 token.setRepeatUpper(upper); 733 greedyCheck(); 734 } 735 736 private void fetchTokenFor_openBrace() { 737 switch (fetchRangeQuantifier()) { 738 case 0: 739 greedyCheck(); 740 break; 741 case 2: 742 if (syntax.fixedIntervalIsGreedyOnly()) { 743 possessiveCheck(); 744 } else { 745 greedyCheck(); 746 } 747 break; 748 default: /* 1 : normal char */ 749 } // inner switch 750 } 751 752 private void fetchTokenFor_anchor(int subType) { 753 token.type = TokenType.ANCHOR; 754 token.setAnchor(subType); 755 } 756 757 private void fetchTokenFor_xBrace() { 758 if (!left()) return; 759 760 int last = p; 761 if (peekIs('{') && syntax.opEscXBraceHex8()) { 762 inc(); 763 int num = scanUnsignedHexadecimalNumber(8); 764 if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE); 765 if (left()) { 766 if (EncodingHelper.isXDigit(peek())) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE); 767 } 768 769 if (p > last + 1 && left() && peekIs('}')) { 770 inc(); 771 token.type = TokenType.CODE_POINT; 772 token.setCode(num); 773 } else { 774 /* can't read nothing or invalid format */ 775 p = last; 776 } 777 } else if (syntax.opEscXHex2()) { 778 int num = scanUnsignedHexadecimalNumber(2); 779 if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); 780 if (p == last) { /* can't read nothing. */ 781 num = 0; /* but, it's not error */ 782 } 783 token.type = TokenType.RAW_BYTE; 784 token.base = 16; 785 token.setC(num); 786 } 787 } 788 789 private void fetchTokenFor_uHex() { 790 if (!left()) return; 791 int last = p; 792 793 if (syntax.op2EscUHex4()) { 794 int num = scanUnsignedHexadecimalNumber(4); 795 if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); 796 if (p == last) { /* can't read nothing. */ 797 num = 0; /* but, it's not error */ 798 } 799 token.type = TokenType.CODE_POINT; 800 token.base = 16; 801 token.setCode(num); 802 } 803 } 804 805 private void fetchTokenFor_digit() { 806 unfetch(); 807 int last = p; 808 int num = scanUnsignedNumber(); 809 if (num < 0 || num > Config.MAX_BACKREF_NUM) { // goto skip_backref 810 } else if (syntax.opDecimalBackref() && (num <= env.numMem || num <= 9)) { /* This spec. from GNU regex */ 811 if (syntax.strictCheckBackref()) { 812 if (num > env.numMem || env.memNodes == null || env.memNodes[num] == null) newValueException(ERR_INVALID_BACKREF); 813 } 814 token.type = TokenType.BACKREF; 815 token.setBackrefNum(1); 816 token.setBackrefRef1(num); 817 token.setBackrefByName(false); 818 if (Config.USE_BACKREF_WITH_LEVEL) token.setBackrefExistLevel(false); 819 return; 820 } 821 822 if (c == '8' || c == '9') { /* normal char */ // skip_backref: 823 p = last; 824 inc(); 825 return; 826 } 827 p = last; 828 829 fetchTokenFor_zero(); /* fall through */ 830 } 831 832 private void fetchTokenFor_zero() { 833 if (syntax.opEscOctal3()) { 834 int last = p; 835 int num = scanUnsignedOctalNumber(c == '0' ? 2 : 3); 836 if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); 837 if (p == last) { /* can't read nothing. */ 838 num = 0; /* but, it's not error */ 839 } 840 token.type = TokenType.RAW_BYTE; 841 token.base = 8; 842 token.setC(num); 843 } else if (c != '0') { 844 inc(); 845 } 846 } 847 848 private void fetchTokenFor_namedBackref() { 849 if (syntax.op2EscKNamedBackref()) { 850 if (left()) { 851 fetch(); 852 if (c =='<' || c == '\'') { 853 int last = p; 854 int backNum; 855 if (Config.USE_BACKREF_WITH_LEVEL) { 856 Ptr rbackNum = new Ptr(); 857 Ptr rlevel = new Ptr(); 858 token.setBackrefExistLevel(fetchNameWithLevel(c, rbackNum, rlevel)); 859 token.setBackrefLevel(rlevel.p); 860 backNum = rbackNum.p; 861 } else { 862 backNum = fetchName(c, true); 863 } // USE_BACKREF_AT_LEVEL 864 int nameEnd = value; // set by fetchNameWithLevel/fetchName 865 866 if (backNum != 0) { 867 if (backNum < 0) { 868 backNum = backrefRelToAbs(backNum); 869 if (backNum <= 0) newValueException(ERR_INVALID_BACKREF); 870 } 871 872 if (syntax.strictCheckBackref() && (backNum > env.numMem || env.memNodes == null)) { 873 newValueException(ERR_INVALID_BACKREF); 874 } 875 token.type = TokenType.BACKREF; 876 token.setBackrefByName(false); 877 token.setBackrefNum(1); 878 token.setBackrefRef1(backNum); 879 } else { 880 NameEntry e = env.reg.nameToGroupNumbers(chars, last, nameEnd); 881 if (e == null) newValueException(ERR_UNDEFINED_NAME_REFERENCE, last, nameEnd); 882 883 if (syntax.strictCheckBackref()) { 884 if (e.backNum == 1) { 885 if (e.backRef1 > env.numMem || 886 env.memNodes == null || 887 env.memNodes[e.backRef1] == null) newValueException(ERR_INVALID_BACKREF); 888 } else { 889 for (int i=0; i<e.backNum; i++) { 890 if (e.backRefs[i] > env.numMem || 891 env.memNodes == null || 892 env.memNodes[e.backRefs[i]] == null) newValueException(ERR_INVALID_BACKREF); 893 } 894 } 895 } 896 897 token.type = TokenType.BACKREF; 898 token.setBackrefByName(true); 899 900 if (e.backNum == 1) { 901 token.setBackrefNum(1); 902 token.setBackrefRef1(e.backRef1); 903 } else { 904 token.setBackrefNum(e.backNum); 905 token.setBackrefRefs(e.backRefs); 906 } 907 } 908 } else { 909 unfetch(); 910 syntaxWarn(Warnings.INVALID_BACKREFERENCE); 911 } 912 } else { 913 syntaxWarn(Warnings.INVALID_BACKREFERENCE); 914 } 915 } 916 } 917 918 private void fetchTokenFor_subexpCall() { 919 if (syntax.op2EscGSubexpCall()) { 920 if (left()) { 921 fetch(); 922 if (c == '<' || c == '\'') { 923 int last = p; 924 int gNum = fetchName(c, true); 925 int nameEnd = value; 926 token.type = TokenType.CALL; 927 token.setCallNameP(last); 928 token.setCallNameEnd(nameEnd); 929 token.setCallGNum(gNum); 930 } else { 931 unfetch(); 932 syntaxWarn(Warnings.INVALID_SUBEXP_CALL); 933 } 934 } else { 935 syntaxWarn(Warnings.INVALID_SUBEXP_CALL); 936 } 937 } 938 } 939 940 private void fetchTokenFor_charProperty() { 941 if (peekIs('{') && syntax.op2EscPBraceCharProperty()) { 942 inc(); 943 token.type = TokenType.CHAR_PROPERTY; 944 token.setPropNot(c == 'P'); 945 946 if (syntax.op2EscPBraceCircumflexNot()) { 947 fetch(); 948 if (c == '^') { 949 token.setPropNot(!token.getPropNot()); 950 } else { 951 unfetch(); 952 } 953 } 954 } else { 955 syntaxWarn(Warnings.INVALID_UNICODE_PROPERTY, (char)c); 956 } 957 } 958 959 private void fetchTokenFor_metaChars() { 960 if (c == syntax.metaCharTable.anyChar) { 961 token.type = TokenType.ANYCHAR; 962 } else if (c == syntax.metaCharTable.anyTime) { 963 fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE); 964 } else if (c == syntax.metaCharTable.zeroOrOneTime) { 965 fetchTokenFor_repeat(0, 1); 966 } else if (c == syntax.metaCharTable.oneOrMoreTime) { 967 fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE); 968 } else if (c == syntax.metaCharTable.anyCharAnyTime) { 969 token.type = TokenType.ANYCHAR_ANYTIME; 970 // goto out 971 } 972 } 973 974 protected final TokenType fetchToken() { 975 // mark(); // out 976 start: 977 while(true) { 978 if (!left()) { 979 token.type = TokenType.EOT; 980 return token.type; 981 } 982 983 token.type = TokenType.STRING; 984 token.base = 0; 985 token.backP = p; 986 987 fetch(); 988 989 if (c == syntax.metaCharTable.esc && !syntax.op2IneffectiveEscape()) { // IS_MC_ESC_CODE(code, syn) 990 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE); 991 992 token.backP = p; 993 fetch(); 994 995 token.setC(c); 996 token.escaped = true; 997 switch(c) { 998 999 case '*': 1000 if (syntax.opEscAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE); 1001 break; 1002 case '+': 1003 if (syntax.opEscPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE); 1004 break; 1005 case '?': 1006 if (syntax.opEscQMarkZeroOne()) fetchTokenFor_repeat(0, 1); 1007 break; 1008 case '{': 1009 if (syntax.opEscBraceInterval()) fetchTokenFor_openBrace(); 1010 break; 1011 case '|': 1012 if (syntax.opEscVBarAlt()) token.type = TokenType.ALT; 1013 break; 1014 case '(': 1015 if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_OPEN; 1016 break; 1017 case ')': 1018 if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE; 1019 break; 1020 case 'w': 1021 if (syntax.opEscWWord()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); 1022 break; 1023 case 'W': 1024 if (syntax.opEscWWord()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); 1025 break; 1026 case 'b': 1027 if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.WORD_BOUND); 1028 break; 1029 case 'B': 1030 if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.NOT_WORD_BOUND); 1031 break; 1032 case '<': 1033 if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_BEGIN); 1034 break; 1035 case '>': 1036 if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_END); 1037 break; 1038 case 's': 1039 if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); 1040 break; 1041 case 'S': 1042 if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); 1043 break; 1044 case 'd': 1045 if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); 1046 break; 1047 case 'D': 1048 if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); 1049 break; 1050 case 'h': 1051 if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT); 1052 break; 1053 case 'H': 1054 if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(true, CharacterType.XDIGIT); 1055 break; 1056 case 'A': 1057 if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF); 1058 break; 1059 case 'Z': 1060 if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.SEMI_END_BUF); 1061 break; 1062 case 'z': 1063 if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF); 1064 break; 1065 case 'G': 1066 if (syntax.opEscCapitalGBeginAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_POSITION); 1067 break; 1068 case '`': 1069 if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF); 1070 break; 1071 case '\'': 1072 if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF); 1073 break; 1074 case 'x': 1075 fetchTokenFor_xBrace(); 1076 break; 1077 case 'u': 1078 fetchTokenFor_uHex(); 1079 break; 1080 case '1': 1081 case '2': 1082 case '3': 1083 case '4': 1084 case '5': 1085 case '6': 1086 case '7': 1087 case '8': 1088 case '9': 1089 fetchTokenFor_digit(); 1090 break; 1091 case '0': 1092 fetchTokenFor_zero(); 1093 break; 1094 case 'k': 1095 if (Config.USE_NAMED_GROUP) fetchTokenFor_namedBackref(); 1096 break; 1097 case 'g': 1098 if (Config.USE_SUBEXP_CALL) fetchTokenFor_subexpCall(); 1099 break; 1100 case 'Q': 1101 if (syntax.op2EscCapitalQQuote()) token.type = TokenType.QUOTE_OPEN; 1102 break; 1103 case 'p': 1104 case 'P': 1105 fetchTokenFor_charProperty(); 1106 break; 1107 1108 default: 1109 unfetch(); 1110 int num = fetchEscapedValue(); 1111 1112 /* set_raw: */ 1113 if (token.getC() != num) { 1114 token.type = TokenType.CODE_POINT; 1115 token.setCode(num); 1116 } else { /* string */ 1117 p = token.backP + 1; 1118 } 1119 break; 1120 1121 } // switch (c) 1122 1123 } else { 1124 token.setC(c); 1125 token.escaped = false; 1126 1127 if (Config.USE_VARIABLE_META_CHARS && (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters())) { 1128 fetchTokenFor_metaChars(); 1129 break; 1130 } 1131 1132 { 1133 switch(c) { 1134 case '.': 1135 if (syntax.opDotAnyChar()) token.type = TokenType.ANYCHAR; 1136 break; 1137 case '*': 1138 if (syntax.opAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE); 1139 break; 1140 case '+': 1141 if (syntax.opPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE); 1142 break; 1143 case '?': 1144 if (syntax.opQMarkZeroOne()) fetchTokenFor_repeat(0, 1); 1145 break; 1146 case '{': 1147 if (syntax.opBraceInterval()) fetchTokenFor_openBrace(); 1148 break; 1149 case '|': 1150 if (syntax.opVBarAlt()) token.type = TokenType.ALT; 1151 break; 1152 1153 case '(': 1154 if (peekIs('?') && syntax.op2QMarkGroupEffect()) { 1155 inc(); 1156 if (peekIs('#')) { 1157 fetch(); 1158 while (true) { 1159 if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP); 1160 fetch(); 1161 if (c == syntax.metaCharTable.esc) { 1162 if (left()) fetch(); 1163 } else { 1164 if (c == ')') break; 1165 } 1166 } 1167 continue start; // goto start 1168 } 1169 unfetch(); 1170 } 1171 1172 if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_OPEN; 1173 break; 1174 case ')': 1175 if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE; 1176 break; 1177 case '^': 1178 if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE); 1179 break; 1180 case '$': 1181 if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.SEMI_END_BUF : AnchorType.END_LINE); 1182 break; 1183 case '[': 1184 if (syntax.opBracketCC()) token.type = TokenType.CC_CC_OPEN; 1185 break; 1186 case ']': 1187 //if (*src > env->pattern) /* /].../ is allowed. */ 1188 //CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); 1189 break; 1190 case '#': 1191 if (Option.isExtend(env.option)) { 1192 while (left()) { 1193 fetch(); 1194 if (EncodingHelper.isNewLine(c)) break; 1195 } 1196 continue start; // goto start 1197 } 1198 break; 1199 1200 case ' ': 1201 case '\t': 1202 case '\n': 1203 case '\r': 1204 case '\f': 1205 if (Option.isExtend(env.option)) continue start; // goto start 1206 break; 1207 1208 default: // string 1209 break; 1210 1211 } // switch 1212 } 1213 } 1214 1215 break; 1216 } // while 1217 return token.type; 1218 } 1219 1220 private void greedyCheck() { 1221 if (left() && peekIs('?') && syntax.opQMarkNonGreedy()) { 1222 1223 fetch(); 1224 1225 token.setRepeatGreedy(false); 1226 token.setRepeatPossessive(false); 1227 } else { 1228 possessiveCheck(); 1229 } 1230 } 1231 1232 private void possessiveCheck() { 1233 if (left() && peekIs('+') && 1234 (syntax.op2PlusPossessiveRepeat() && token.type != TokenType.INTERVAL || 1235 syntax.op2PlusPossessiveInterval() && token.type == TokenType.INTERVAL)) { 1236 1237 fetch(); 1238 1239 token.setRepeatGreedy(true); 1240 token.setRepeatPossessive(true); 1241 } else { 1242 token.setRepeatGreedy(true); 1243 token.setRepeatPossessive(false); 1244 } 1245 } 1246 1247 protected final int fetchCharPropertyToCType() { 1248 mark(); 1249 1250 while (left()) { 1251 int last = p; 1252 fetch(); 1253 if (c == '}') { 1254 String name = new String(chars, _p, last - _p); 1255 return PosixBracket.propertyNameToCType(name); 1256 } else if (c == '(' || c == ')' || c == '{' || c == '|') { 1257 String name = new String(chars, _p, last - _p); 1258 throw new JOniException(ERR_INVALID_CHAR_PROPERTY_NAME.replaceAll("%n", name)); 1259 } 1260 } 1261 newInternalException(ERR_PARSER_BUG); 1262 return 0; // not reached 1263 } 1264 1265 protected final void syntaxWarn(String message, char c) { 1266 syntaxWarn(message.replace("<%n>", Character.toString(c))); 1267 } 1268 1269 protected final void syntaxWarn(String message) { 1270 if (Config.USE_WARN) { 1271 env.reg.warnings.warn(message + ": /" + new String(chars, getBegin(), getEnd()) + "/"); 1272 } 1273 } 1274 }