1 /* 2 * Permission is hereby granted, free of charge, to any person obtaining a copy of 3 * this software and associated documentation files (the "Software"), to deal in 4 * the Software without restriction, including without limitation the rights to 5 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 6 * of the Software, and to permit persons to whom the Software is furnished to do 7 * so, subject to the following conditions: 8 * 9 * The above copyright notice and this permission notice shall be included in all 10 * copies or substantial portions of the Software. 11 * 12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 * SOFTWARE. 19 */ 20 package jdk.nashorn.internal.joni; 21 22 import static jdk.nashorn.internal.joni.BitStatus.bsOnAtSimple; 23 import static jdk.nashorn.internal.joni.BitStatus.bsOnOff; 24 import static jdk.nashorn.internal.joni.Option.isDontCaptureGroup; 25 import static jdk.nashorn.internal.joni.Option.isIgnoreCase; 26 27 import jdk.nashorn.internal.joni.encoding.CharacterType; 28 import jdk.nashorn.internal.joni.encoding.PosixBracket; 29 import jdk.nashorn.internal.joni.encoding.Ptr; 30 import jdk.nashorn.internal.joni.ast.AnchorNode; 31 import jdk.nashorn.internal.joni.ast.AnyCharNode; 32 import jdk.nashorn.internal.joni.ast.BackRefNode; 33 import jdk.nashorn.internal.joni.ast.CClassNode; 34 import jdk.nashorn.internal.joni.ast.CTypeNode; 35 import jdk.nashorn.internal.joni.ast.CallNode; 36 import jdk.nashorn.internal.joni.ast.ConsAltNode; 37 import jdk.nashorn.internal.joni.ast.EncloseNode; 38 import jdk.nashorn.internal.joni.ast.Node; 39 import jdk.nashorn.internal.joni.ast.QuantifierNode; 40 import jdk.nashorn.internal.joni.ast.StringNode; 41 import jdk.nashorn.internal.joni.ast.CClassNode.CCStateArg; 42 import jdk.nashorn.internal.joni.constants.AnchorType; 43 import jdk.nashorn.internal.joni.constants.CCSTATE; 44 import jdk.nashorn.internal.joni.constants.CCVALTYPE; 45 import jdk.nashorn.internal.joni.constants.EncloseType; 46 import jdk.nashorn.internal.joni.constants.NodeType; 47 import jdk.nashorn.internal.joni.constants.TokenType; 48 49 class Parser extends Lexer { 50 51 protected final Regex regex; 52 protected Node root; 53 54 protected int returnCode; // return code used by parser methods (they itself return parsed nodes) 55 // this approach will not affect recursive calls 56 57 protected Parser(ScanEnvironment env, char[] chars, int p, int end) { 58 super(env, chars, p, end); 59 regex = env.reg; 60 } 61 62 // onig_parse_make_tree 63 protected final Node parse() { 64 root = parseRegexp(); 65 regex.numMem = env.numMem; 66 return root; 67 } 68 69 private static final int POSIX_BRACKET_NAME_MIN_LEN = 4; 70 private static final int POSIX_BRACKET_CHECK_LIMIT_LENGTH = 20; 71 private static final char BRACKET_END[] = ":]".toCharArray(); 72 private boolean parsePosixBracket(CClassNode cc) { 73 mark(); 74 75 boolean not; 76 if (peekIs('^')) { 77 inc(); 78 not = true; 79 } else { 80 not = false; 81 } 82 if (stop - p >= POSIX_BRACKET_NAME_MIN_LEN + 3) { // else goto not_posix_bracket 83 char[][] pbs = PosixBracket.PBSNamesLower; 84 for (int i=0; i<pbs.length; i++) { 85 char[] name = pbs[i]; 86 // hash lookup here ? 87 if (EncodingHelper.strNCmp(chars, p, stop, name, 0, name.length) == 0) { 88 p += name.length; 89 if (EncodingHelper.strNCmp(chars, p, stop, BRACKET_END, 0, BRACKET_END.length) != 0) { 90 newSyntaxException(ERR_INVALID_POSIX_BRACKET_TYPE); 91 } 92 cc.addCType(PosixBracket.PBSValues[i], not, env, this); 93 inc(); 94 inc(); 95 return false; 96 } 97 } 98 99 } 100 101 // not_posix_bracket: 102 c = 0; 103 int i= 0; 104 while (left() && ((c=peek()) != ':') && c != ']') { 105 inc(); 106 if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; 107 } 108 109 if (c == ':' && left()) { 110 inc(); 111 if (left()) { 112 fetch(); 113 if (c == ']') newSyntaxException(ERR_INVALID_POSIX_BRACKET_TYPE); 114 } 115 } 116 restore(); 117 return true; /* 1: is not POSIX bracket, but no error. */ 118 } 119 120 private CClassNode parseCharProperty() { 121 int ctype = fetchCharPropertyToCType(); 122 CClassNode n = new CClassNode(); 123 n.addCType(ctype, false, env, this); 124 if (token.getPropNot()) n.setNot(); 125 return n; 126 } 127 128 private boolean codeExistCheck(int code, boolean ignoreEscaped) { 129 mark(); 130 131 boolean inEsc = false; 132 while (left()) { 133 if (ignoreEscaped && inEsc) { 134 inEsc = false; 135 } else { 136 fetch(); 137 if (c == code) { 138 restore(); 139 return true; 140 } 141 if (c == syntax.metaCharTable.esc) inEsc = true; 142 } 143 } 144 145 restore(); 146 return false; 147 } 148 149 private CClassNode parseCharClass() { 150 fetchTokenInCC(); 151 152 final boolean neg; 153 if (token.type == TokenType.CHAR && token.getC() == '^' && !token.escaped) { 154 neg = true; 155 fetchTokenInCC(); 156 } else { 157 neg = false; 158 } 159 160 if (token.type == TokenType.CC_CLOSE) { 161 if (!codeExistCheck(']', true)) newSyntaxException(ERR_EMPTY_CHAR_CLASS); 162 env.ccEscWarn("]"); 163 token.type = TokenType.CHAR; /* allow []...] */ 164 } 165 166 CClassNode cc = new CClassNode(); 167 CClassNode prevCC = null; 168 CClassNode workCC = null; 169 170 CCStateArg arg = new CCStateArg(); 171 172 boolean andStart = false; 173 arg.state = CCSTATE.START; 174 175 while (token.type != TokenType.CC_CLOSE) { 176 boolean fetched = false; 177 178 switch (token.type) { 179 180 case CHAR: 181 if (token.getC() > 0xff) { 182 arg.inType = CCVALTYPE.CODE_POINT; 183 } else { 184 arg.inType = CCVALTYPE.SB; // sb_char: 185 } 186 arg.v = token.getC(); 187 arg.vIsRaw = false; 188 parseCharClassValEntry2(cc, arg); // goto val_entry2 189 break; 190 191 case RAW_BYTE: 192 if (token.base != 0) { /* tok->base != 0 : octal or hexadec. */ 193 byte[] buf = new byte[4]; 194 int psave = p; 195 int base = token.base; 196 buf[0] = (byte)token.getC(); 197 int i; 198 for (i=1; i<4; i++) { 199 fetchTokenInCC(); 200 if (token.type != TokenType.RAW_BYTE || token.base != base) { 201 fetched = true; 202 break; 203 } 204 buf[i] = (byte)token.getC(); 205 } 206 207 if (i == 1) { 208 arg.v = buf[0] & 0xff; 209 arg.inType = CCVALTYPE.SB; // goto raw_single 210 } else { 211 arg.v = EncodingHelper.mbcToCode(buf, 0, buf.length); 212 arg.inType = CCVALTYPE.CODE_POINT; 213 } 214 } else { 215 arg.v = token.getC(); 216 arg.inType = CCVALTYPE.SB; // raw_single: 217 } 218 arg.vIsRaw = true; 219 parseCharClassValEntry2(cc, arg); // goto val_entry2 220 break; 221 222 case CODE_POINT: 223 arg.v = token.getCode(); 224 arg.vIsRaw = true; 225 parseCharClassValEntry(cc, arg); // val_entry:, val_entry2 226 break; 227 228 case POSIX_BRACKET_OPEN: 229 if (parsePosixBracket(cc)) { /* true: is not POSIX bracket */ 230 env.ccEscWarn("["); 231 p = token.backP; 232 arg.v = token.getC(); 233 arg.vIsRaw = false; 234 parseCharClassValEntry(cc, arg); // goto val_entry 235 break; 236 } 237 cc.nextStateClass(arg, env); // goto next_class 238 break; 239 240 case CHAR_TYPE: 241 cc.addCType(token.getPropCType(), token.getPropNot(), env, this); 242 cc.nextStateClass(arg, env); // next_class: 243 break; 244 245 case CHAR_PROPERTY: 246 int ctype = fetchCharPropertyToCType(); 247 cc.addCType(ctype, token.getPropNot(), env, this); 248 cc.nextStateClass(arg, env); // goto next_class 249 break; 250 251 case CC_RANGE: 252 if (arg.state == CCSTATE.VALUE) { 253 fetchTokenInCC(); 254 fetched = true; 255 if (token.type == TokenType.CC_CLOSE) { /* allow [x-] */ 256 parseCharClassRangeEndVal(cc, arg); // range_end_val:, goto val_entry; 257 break; 258 } else if (token.type == TokenType.CC_AND) { 259 env.ccEscWarn("-"); 260 parseCharClassRangeEndVal(cc, arg); // goto range_end_val 261 break; 262 } 263 arg.state = CCSTATE.RANGE; 264 } else if (arg.state == CCSTATE.START) { 265 arg.v = token.getC(); /* [-xa] is allowed */ 266 arg.vIsRaw = false; 267 fetchTokenInCC(); 268 fetched = true; 269 if (token.type == TokenType.CC_RANGE || andStart) env.ccEscWarn("-"); /* [--x] or [a&&-x] is warned. */ 270 parseCharClassValEntry(cc, arg); // goto val_entry 271 break; 272 } else if (arg.state == CCSTATE.RANGE) { 273 env.ccEscWarn("-"); 274 parseCharClassSbChar(cc, arg); // goto sb_char /* [!--x] is allowed */ 275 break; 276 } else { /* CCS_COMPLETE */ 277 fetchTokenInCC(); 278 fetched = true; 279 if (token.type == TokenType.CC_CLOSE) { /* allow [a-b-] */ 280 parseCharClassRangeEndVal(cc, arg); // goto range_end_val 281 break; 282 } else if (token.type == TokenType.CC_AND) { 283 env.ccEscWarn("-"); 284 parseCharClassRangeEndVal(cc, arg); // goto range_end_val 285 break; 286 } 287 288 if (syntax.allowDoubleRangeOpInCC()) { 289 env.ccEscWarn("-"); 290 parseCharClassSbChar(cc, arg); // goto sb_char /* [0-9-a] is allowed as [0-9\-a] */ 291 break; 292 } 293 newSyntaxException(ERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS); 294 } 295 break; 296 297 case CC_CC_OPEN: /* [ */ 298 CClassNode acc = parseCharClass(); 299 cc.or(acc); 300 break; 301 302 case CC_AND: /* && */ 303 if (arg.state == CCSTATE.VALUE) { 304 arg.v = 0; // ??? safe v ? 305 arg.vIsRaw = false; 306 cc.nextStateValue(arg, env); 307 } 308 /* initialize local variables */ 309 andStart = true; 310 arg.state = CCSTATE.START; 311 if (prevCC != null) { 312 prevCC.and(cc); 313 } else { 314 prevCC = cc; 315 if (workCC == null) workCC = new CClassNode(); 316 cc = workCC; 317 } 318 cc.clear(); 319 break; 320 321 case EOT: 322 newSyntaxException(ERR_PREMATURE_END_OF_CHAR_CLASS); 323 324 default: 325 newInternalException(ERR_PARSER_BUG); 326 } // switch 327 328 if (!fetched) fetchTokenInCC(); 329 330 } // while 331 332 if (arg.state == CCSTATE.VALUE) { 333 arg.v = 0; // ??? safe v ? 334 arg.vIsRaw = false; 335 cc.nextStateValue(arg, env); 336 } 337 338 if (prevCC != null) { 339 prevCC.and(cc); 340 cc = prevCC; 341 } 342 343 if (neg) { 344 cc.setNot(); 345 } else { 346 cc.clearNot(); 347 } 348 349 if (cc.isNot() && syntax.notNewlineInNegativeCC()) { 350 if (!cc.isEmpty()) { 351 final int NEW_LINE = 0x0a; 352 if (EncodingHelper.isNewLine(NEW_LINE)) { 353 cc.bs.set(NEW_LINE); 354 } 355 } 356 } 357 358 return cc; 359 } 360 361 private void parseCharClassSbChar(CClassNode cc, CCStateArg arg) { 362 arg.inType = CCVALTYPE.SB; 363 arg.v = token.getC(); 364 arg.vIsRaw = false; 365 parseCharClassValEntry2(cc, arg); // goto val_entry2 366 } 367 368 private void parseCharClassRangeEndVal(CClassNode cc, CCStateArg arg) { 369 arg.v = '-'; 370 arg.vIsRaw = false; 371 parseCharClassValEntry(cc, arg); // goto val_entry 372 } 373 374 private void parseCharClassValEntry(CClassNode cc, CCStateArg arg) { 375 arg.inType = arg.v <= 0xff ? CCVALTYPE.SB : CCVALTYPE.CODE_POINT; 376 parseCharClassValEntry2(cc, arg); // val_entry2: 377 } 378 379 private void parseCharClassValEntry2(CClassNode cc, CCStateArg arg) { 380 cc.nextStateValue(arg, env); 381 } 382 383 private Node parseEnclose(TokenType term) { 384 Node node = null; 385 386 if (!left()) newSyntaxException(ERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS); 387 388 int option = env.option; 389 390 if (peekIs('?') && syntax.op2QMarkGroupEffect()) { 391 inc(); 392 if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP); 393 394 boolean listCapture = false; 395 396 fetch(); 397 switch(c) { 398 case ':': /* (?:...) grouping only */ 399 fetchToken(); // group: 400 node = parseSubExp(term); 401 returnCode = 1; /* group */ 402 return node; 403 case '=': 404 node = new AnchorNode(AnchorType.PREC_READ); 405 break; 406 case '!': /* preceding read */ 407 node = new AnchorNode(AnchorType.PREC_READ_NOT); 408 break; 409 case '>': /* (?>...) stop backtrack */ 410 node = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose 411 break; 412 case '\'': 413 if (Config.USE_NAMED_GROUP) { 414 if (syntax.op2QMarkLtNamedGroup()) { 415 listCapture = false; // goto named_group1 416 node = parseEncloseNamedGroup2(listCapture); 417 break; 418 } else { 419 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 420 } 421 } // USE_NAMED_GROUP 422 break; 423 case '<': /* look behind (?<=...), (?<!...) */ 424 fetch(); 425 if (c == '=') { 426 node = new AnchorNode(AnchorType.LOOK_BEHIND); 427 } else if (c == '!') { 428 node = new AnchorNode(AnchorType.LOOK_BEHIND_NOT); 429 } else { 430 if (Config.USE_NAMED_GROUP) { 431 if (syntax.op2QMarkLtNamedGroup()) { 432 unfetch(); 433 c = '<'; 434 435 listCapture = false; // named_group1: 436 node = parseEncloseNamedGroup2(listCapture); // named_group2: 437 break; 438 } else { 439 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 440 } 441 442 } else { // USE_NAMED_GROUP 443 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 444 } // USE_NAMED_GROUP 445 } 446 break; 447 case '@': 448 if (syntax.op2AtMarkCaptureHistory()) { 449 if (Config.USE_NAMED_GROUP) { 450 if (syntax.op2QMarkLtNamedGroup()) { 451 fetch(); 452 if (c == '<' || c == '\'') { 453 listCapture = true; 454 node = parseEncloseNamedGroup2(listCapture); // goto named_group2 /* (?@<name>...) */ 455 } 456 unfetch(); 457 } 458 } // USE_NAMED_GROUP 459 EncloseNode en = new EncloseNode(env.option, false); // node_new_enclose_memory 460 int num = env.addMemEntry(); 461 if (num >= BitStatus.BIT_STATUS_BITS_NUM) newValueException(ERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY); 462 en.regNum = num; 463 node = en; 464 } else { 465 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 466 } 467 break; 468 469 // case 'p': #ifdef USE_POSIXLINE_OPTION 470 case '-': 471 case 'i': 472 case 'm': 473 case 's': 474 case 'x': 475 boolean neg = false; 476 while (true) { 477 switch(c) { 478 case ':': 479 case ')': 480 break; 481 case '-': 482 neg = true; 483 break; 484 case 'x': 485 option = bsOnOff(option, Option.EXTEND, neg); 486 break; 487 case 'i': 488 option = bsOnOff(option, Option.IGNORECASE, neg); 489 break; 490 case 's': 491 if (syntax.op2OptionPerl()) { 492 option = bsOnOff(option, Option.MULTILINE, neg); 493 } else { 494 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 495 } 496 break; 497 case 'm': 498 if (syntax.op2OptionPerl()) { 499 option = bsOnOff(option, Option.SINGLELINE, !neg); 500 } else if (syntax.op2OptionRuby()) { 501 option = bsOnOff(option, Option.MULTILINE, neg); 502 } else { 503 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 504 } 505 break; 506 // case 'p': #ifdef USE_POSIXLINE_OPTION // not defined 507 // option = bsOnOff(option, Option.MULTILINE|Option.SINGLELINE, neg); 508 // break; 509 510 default: 511 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 512 } // switch 513 514 if (c == ')') { 515 EncloseNode en = new EncloseNode(option, 0); // node_new_option 516 node = en; 517 returnCode = 2; /* option only */ 518 return node; 519 } else if (c == ':') { 520 int prev = env.option; 521 env.option = option; 522 fetchToken(); 523 Node target = parseSubExp(term); 524 env.option = prev; 525 EncloseNode en = new EncloseNode(option, 0); // node_new_option 526 en.setTarget(target); 527 node = en; 528 returnCode = 0; 529 return node; 530 } 531 if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP); 532 fetch(); 533 } // while 534 535 default: 536 newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); 537 } // switch 538 539 } else { 540 if (isDontCaptureGroup(env.option)) { 541 fetchToken(); // goto group 542 node = parseSubExp(term); 543 returnCode = 1; /* group */ 544 return node; 545 } 546 EncloseNode en = new EncloseNode(env.option, false); // node_new_enclose_memory 547 int num = env.addMemEntry(); 548 en.regNum = num; 549 node = en; 550 } 551 552 fetchToken(); 553 Node target = parseSubExp(term); 554 555 if (node.getType() == NodeType.ANCHOR) { 556 AnchorNode an = (AnchorNode) node; 557 an.setTarget(target); 558 } else { 559 EncloseNode en = (EncloseNode)node; 560 en.setTarget(target); 561 if (en.type == EncloseType.MEMORY) { 562 /* Don't move this to previous of parse_subexp() */ 563 env.setMemNode(en.regNum, node); 564 } 565 } 566 returnCode = 0; 567 return node; // ?? 568 } 569 570 private Node parseEncloseNamedGroup2(boolean listCapture) { 571 int nm = p; 572 int num = fetchName(c, false); 573 int nameEnd = value; 574 num = env.addMemEntry(); 575 if (listCapture && num >= BitStatus.BIT_STATUS_BITS_NUM) newValueException(ERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY); 576 577 regex.nameAdd(chars, nm, nameEnd, num, syntax); 578 EncloseNode en = new EncloseNode(env.option, true); // node_new_enclose_memory 579 en.regNum = num; 580 581 Node node = en; 582 583 if (listCapture) env.captureHistory = bsOnAtSimple(env.captureHistory, num); 584 env.numNamed++; 585 return node; 586 } 587 588 private int findStrPosition(int[]s, int n, int from, int to, Ptr nextChar) { 589 int x; 590 int q; 591 int p = from; 592 int i = 0; 593 while (p < to) { 594 x = chars[p]; 595 q = p + 1; 596 if (x == s[0]) { 597 for (i=1; i<n && q<to; i++) { 598 x = chars[q]; 599 if (x != s[i]) break; 600 q++; 601 } 602 if (i >= n) { 603 if (chars[nextChar.p] != 0) nextChar.p = q; // we may need zero term semantics... 604 return p; 605 } 606 } 607 p = q; 608 } 609 return -1; 610 } 611 612 private Node parseExp(TokenType term) { 613 if (token.type == term) return StringNode.EMPTY; // goto end_of_token 614 615 Node node = null; 616 boolean group = false; 617 618 switch(token.type) { 619 case ALT: 620 case EOT: 621 return StringNode.EMPTY; // end_of_token:, node_new_empty 622 623 case SUBEXP_OPEN: 624 node = parseEnclose(TokenType.SUBEXP_CLOSE); 625 if (returnCode == 1) { 626 group = true; 627 } else if (returnCode == 2) { /* option only */ 628 int prev = env.option; 629 EncloseNode en = (EncloseNode)node; 630 env.option = en.option; 631 fetchToken(); 632 Node target = parseSubExp(term); 633 env.option = prev; 634 en.setTarget(target); 635 return node; 636 } 637 break; 638 case SUBEXP_CLOSE: 639 if (!syntax.allowUnmatchedCloseSubexp()) newSyntaxException(ERR_UNMATCHED_CLOSE_PARENTHESIS); 640 if (token.escaped) { 641 return parseExpTkRawByte(group); // goto tk_raw_byte 642 } else { 643 return parseExpTkByte(group); // goto tk_byte 644 } 645 case STRING: 646 return parseExpTkByte(group); // tk_byte: 647 648 case RAW_BYTE: 649 return parseExpTkRawByte(group); // tk_raw_byte: 650 case CODE_POINT: 651 char[] buf = new char[] {(char)token.getCode()}; 652 // #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG ... // setRaw() #else 653 node = new StringNode(buf, 0, 1); 654 break; 655 656 case QUOTE_OPEN: 657 int[] endOp = new int[] {syntax.metaCharTable.esc, 'E'}; 658 int qstart = p; 659 Ptr nextChar = new Ptr(); 660 int qend = findStrPosition(endOp, endOp.length, qstart, stop, nextChar); 661 if (qend == -1) nextChar.p = qend = stop; 662 node = new StringNode(chars, qstart, qend); 663 p = nextChar.p; 664 break; 665 666 case CHAR_TYPE: 667 switch(token.getPropCType()) { 668 case CharacterType.D: 669 case CharacterType.S: 670 case CharacterType.W: 671 if (Config.NON_UNICODE_SDW) { 672 CClassNode cc = new CClassNode(); 673 cc.addCType(token.getPropCType(), false, env, this); 674 if (token.getPropNot()) cc.setNot(); 675 node = cc; 676 } 677 break; 678 679 case CharacterType.WORD: 680 node = new CTypeNode(token.getPropCType(), token.getPropNot()); 681 break; 682 683 case CharacterType.SPACE: 684 case CharacterType.DIGIT: 685 case CharacterType.XDIGIT: 686 // #ifdef USE_SHARED_CCLASS_TABLE ... #endif 687 CClassNode ccn = new CClassNode(); 688 ccn.addCType(token.getPropCType(), false, env, this); 689 if (token.getPropNot()) ccn.setNot(); 690 node = ccn; 691 break; 692 693 default: 694 newInternalException(ERR_PARSER_BUG); 695 696 } // inner switch 697 break; 698 699 case CHAR_PROPERTY: 700 node = parseCharProperty(); 701 break; 702 703 case CC_CC_OPEN: 704 CClassNode cc = parseCharClass(); 705 node = cc; 706 if (isIgnoreCase(env.option)) { 707 ApplyCaseFoldArg arg = new ApplyCaseFoldArg(env, cc); 708 EncodingHelper.applyAllCaseFold(env.caseFoldFlag, ApplyCaseFold.INSTANCE, arg); 709 710 if (arg.altRoot != null) { 711 node = ConsAltNode.newAltNode(node, arg.altRoot); 712 } 713 } 714 break; 715 716 case ANYCHAR: 717 node = new AnyCharNode(); 718 break; 719 720 case ANYCHAR_ANYTIME: 721 node = new AnyCharNode(); 722 QuantifierNode qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false); 723 qn.setTarget(node); 724 node = qn; 725 break; 726 727 case BACKREF: 728 int[]backRefs = token.getBackrefNum() > 1 ? token.getBackrefRefs() : new int[]{token.getBackrefRef1()}; 729 node = new BackRefNode(token.getBackrefNum(), 730 backRefs, 731 token.getBackrefByName(), 732 token.getBackrefExistLevel(), // #ifdef USE_BACKREF_AT_LEVEL 733 token.getBackrefLevel(), // ... 734 env); 735 736 break; 737 738 case CALL: 739 if (Config.USE_SUBEXP_CALL) { 740 int gNum = token.getCallGNum(); 741 742 if (gNum < 0) { 743 gNum = backrefRelToAbs(gNum); 744 if (gNum <= 0) newValueException(ERR_INVALID_BACKREF); 745 } 746 node = new CallNode(chars, token.getCallNameP(), token.getCallNameEnd(), gNum); 747 env.numCall++; 748 } // USE_SUBEXP_CALL 749 break; 750 751 case ANCHOR: 752 node = new AnchorNode(token.getAnchor()); // possible bug in oniguruma 753 break; 754 755 case OP_REPEAT: 756 case INTERVAL: 757 if (syntax.contextIndepRepeatOps()) { 758 if (syntax.contextInvalidRepeatOps()) { 759 newSyntaxException(ERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED); 760 } else { 761 node = StringNode.EMPTY; // node_new_empty 762 } 763 } else { 764 return parseExpTkByte(group); // goto tk_byte 765 } 766 break; 767 768 default: 769 newInternalException(ERR_PARSER_BUG); 770 } //switch 771 772 //targetp = node; 773 774 fetchToken(); // re_entry: 775 776 return parseExpRepeat(node, group); // repeat: 777 } 778 779 private Node parseExpTkByte(boolean group) { 780 StringNode node = new StringNode(chars, token.backP, p); // tk_byte: 781 while (true) { 782 fetchToken(); 783 if (token.type != TokenType.STRING) break; 784 785 if (token.backP == node.end) { 786 node.end = p; // non escaped character, remain shared, just increase shared range 787 } else { 788 node.cat(chars, token.backP, p); // non continuous string stream, need to COW 789 } 790 } 791 // targetp = node; 792 return parseExpRepeat(node, group); // string_end:, goto repeat 793 } 794 795 private Node parseExpTkRawByte(boolean group) { 796 // tk_raw_byte: 797 798 // important: we don't use 0xff mask here neither in the compiler 799 // (in the template string) so we won't have to mask target 800 // strings when comparing against them in the matcher 801 StringNode node = new StringNode((char)token.getC()); 802 node.setRaw(); 803 804 int len = 1; 805 while (true) { 806 if (len >= 1) { 807 if (len == 1) { 808 fetchToken(); 809 node.clearRaw(); 810 // !goto string_end;! 811 return parseExpRepeat(node, group); 812 } 813 } 814 815 fetchToken(); 816 if (token.type != TokenType.RAW_BYTE) { 817 /* Don't use this, it is wrong for little endian encodings. */ 818 // USE_PAD_TO_SHORT_BYTE_CHAR ... 819 820 newValueException(ERR_TOO_SHORT_MULTI_BYTE_STRING); 821 } 822 823 // important: we don't use 0xff mask here neither in the compiler 824 // (in the template string) so we won't have to mask target 825 // strings when comparing against them in the matcher 826 node.cat((char)token.getC()); 827 len++; 828 } // while 829 } 830 831 private Node parseExpRepeat(Node target, boolean group) { 832 while (token.type == TokenType.OP_REPEAT || token.type == TokenType.INTERVAL) { // repeat: 833 if (target.isInvalidQuantifier()) newSyntaxException(ERR_TARGET_OF_REPEAT_OPERATOR_INVALID); 834 835 QuantifierNode qtfr = new QuantifierNode(token.getRepeatLower(), 836 token.getRepeatUpper(), 837 token.type == TokenType.INTERVAL); 838 839 qtfr.greedy = token.getRepeatGreedy(); 840 int ret = qtfr.setQuantifier(target, group, env, chars, getBegin(), getEnd()); 841 Node qn = qtfr; 842 843 if (token.getRepeatPossessive()) { 844 EncloseNode en = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose 845 en.setTarget(qn); 846 qn = en; 847 } 848 849 if (ret == 0) { 850 target = qn; 851 } else if (ret == 2) { /* split case: /abc+/ */ 852 target = ConsAltNode.newListNode(target, null); 853 ConsAltNode tmp = ((ConsAltNode)target).setCdr(ConsAltNode.newListNode(qn, null)); 854 855 fetchToken(); 856 return parseExpRepeatForCar(target, tmp, group); 857 } 858 fetchToken(); // goto re_entry 859 } 860 return target; 861 } 862 863 private Node parseExpRepeatForCar(Node top, ConsAltNode target, boolean group) { 864 while (token.type == TokenType.OP_REPEAT || token.type == TokenType.INTERVAL) { // repeat: 865 if (target.car.isInvalidQuantifier()) newSyntaxException(ERR_TARGET_OF_REPEAT_OPERATOR_INVALID); 866 867 QuantifierNode qtfr = new QuantifierNode(token.getRepeatLower(), 868 token.getRepeatUpper(), 869 token.type == TokenType.INTERVAL); 870 871 qtfr.greedy = token.getRepeatGreedy(); 872 int ret = qtfr.setQuantifier(target.car, group, env, chars, getBegin(), getEnd()); 873 Node qn = qtfr; 874 875 if (token.getRepeatPossessive()) { 876 EncloseNode en = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose 877 en.setTarget(qn); 878 qn = en; 879 } 880 881 if (ret == 0) { 882 target.setCar(qn); 883 } else if (ret == 2) { /* split case: /abc+/ */ 884 assert false; 885 } 886 fetchToken(); // goto re_entry 887 } 888 return top; 889 } 890 891 private Node parseBranch(TokenType term) { 892 Node node = parseExp(term); 893 894 if (token.type == TokenType.EOT || token.type == term || token.type == TokenType.ALT) { 895 return node; 896 } else { 897 ConsAltNode top = ConsAltNode.newListNode(node, null); 898 ConsAltNode t = top; 899 900 while (token.type != TokenType.EOT && token.type != term && token.type != TokenType.ALT) { 901 node = parseExp(term); 902 if (node.getType() == NodeType.LIST) { 903 t.setCdr((ConsAltNode)node); 904 while (((ConsAltNode)node).cdr != null ) node = ((ConsAltNode)node).cdr; 905 906 t = ((ConsAltNode)node); 907 } else { 908 t.setCdr(ConsAltNode.newListNode(node, null)); 909 t = t.cdr; 910 } 911 } 912 return top; 913 } 914 } 915 916 /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ 917 private Node parseSubExp(TokenType term) { 918 Node node = parseBranch(term); 919 920 if (token.type == term) { 921 return node; 922 } else if (token.type == TokenType.ALT) { 923 ConsAltNode top = ConsAltNode.newAltNode(node, null); 924 ConsAltNode t = top; 925 while (token.type == TokenType.ALT) { 926 fetchToken(); 927 node = parseBranch(term); 928 929 t.setCdr(ConsAltNode.newAltNode(node, null)); 930 t = t.cdr; 931 } 932 933 if (token.type != term) parseSubExpError(term); 934 return top; 935 } else { 936 parseSubExpError(term); 937 return null; //not reached 938 } 939 } 940 941 private void parseSubExpError(TokenType term) { 942 if (term == TokenType.SUBEXP_CLOSE) { 943 newSyntaxException(ERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS); 944 } else { 945 newInternalException(ERR_PARSER_BUG); 946 } 947 } 948 949 private Node parseRegexp() { 950 fetchToken(); 951 return parseSubExp(TokenType.EOT); 952 } 953 }