1 /*
   2  * Permission is hereby granted, free of charge, to any person obtaining a copy of
   3  * this software and associated documentation files (the "Software"), to deal in
   4  * the Software without restriction, including without limitation the rights to
   5  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   6  * of the Software, and to permit persons to whom the Software is furnished to do
   7  * so, subject to the following conditions:
   8  *
   9  * The above copyright notice and this permission notice shall be included in all
  10  * copies or substantial portions of the Software.
  11  *
  12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  17  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  18  * SOFTWARE.
  19  */
  20 package jdk.nashorn.internal.runtime.regexp.joni;
  21 
  22 import static jdk.nashorn.internal.runtime.regexp.joni.Option.isSingleline;
  23 import static jdk.nashorn.internal.runtime.regexp.joni.ast.QuantifierNode.isRepeatInfinite;
  24 
  25 import jdk.nashorn.internal.runtime.regexp.joni.ast.QuantifierNode;
  26 import jdk.nashorn.internal.runtime.regexp.joni.constants.AnchorType;
  27 import jdk.nashorn.internal.runtime.regexp.joni.constants.MetaChar;
  28 import jdk.nashorn.internal.runtime.regexp.joni.constants.TokenType;
  29 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
  30 import jdk.nashorn.internal.runtime.regexp.joni.encoding.PosixBracket;
  31 import jdk.nashorn.internal.runtime.regexp.joni.encoding.Ptr;
  32 import jdk.nashorn.internal.runtime.regexp.joni.exception.ErrorMessages;
  33 import jdk.nashorn.internal.runtime.regexp.joni.exception.JOniException;
  34 
  35 class Lexer extends ScannerSupport {
    /** Scan environment handed to the constructor; consulted for back-reference and warning state. */
    protected final ScanEnvironment env;
    /** Copy of {@code env.syntax}, taken once in the constructor. */
    protected final Syntax syntax;              // fast access to syntax
    /** The one token instance that the fetch methods of this class fill in place. */
    protected final Token token = new Token();  // current token
  39 
  40     protected Lexer(ScanEnvironment env, char[] chars, int p, int end) {
  41         super(chars, p, end);
  42         this.env = env;
  43         this.syntax = env.syntax;
  44     }
  45 
  46     /**
  47      * @return 0: normal {n,m}, 2: fixed {n}
  48      * !introduce returnCode here
  49      */
  50     private int fetchRangeQuantifier() {
  51         mark();
  52         boolean synAllow = syntax.allowInvalidInterval();
  53 
  54         if (!left()) {
  55             if (synAllow) {
  56                 return 1; /* "....{" : OK! */
  57             } else {
  58                 newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE);
  59             }
  60         }
  61 
  62         if (!synAllow) {
  63             c = peek();
  64             if (c == ')' || c == '(' || c == '|') {
  65                 newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE);
  66             }
  67         }
  68 
  69         int low = scanUnsignedNumber();
  70         if (low < 0) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
  71         if (low > Config.MAX_REPEAT_NUM) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
  72 
  73         boolean nonLow = false;
  74         if (p == _p) { /* can't read low */
  75             if (syntax.allowIntervalLowAbbrev()) {
  76                 low = 0;
  77                 nonLow = true;
  78             } else {
  79                 return invalidRangeQuantifier(synAllow);
  80             }
  81         }
  82 
  83         if (!left()) return invalidRangeQuantifier(synAllow);
  84 
  85         fetch();
  86         int up;
  87         int ret = 0;
  88         if (c == ',') {
  89             int prev = p; // ??? last
  90             up = scanUnsignedNumber();
  91             if (up < 0) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
  92             if (up > Config.MAX_REPEAT_NUM) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE);
  93 
  94             if (p == prev) {
  95                 if (nonLow) return invalidRangeQuantifier(synAllow);
  96                 up = QuantifierNode.REPEAT_INFINITE; /* {n,} : {n,infinite} */
  97             }
  98         } else {
  99             if (nonLow) return invalidRangeQuantifier(synAllow);
 100             unfetch();
 101             up = low; /* {n} : exact n times */
 102             ret = 2; /* fixed */
 103         }
 104 
 105         if (!left()) return invalidRangeQuantifier(synAllow);
 106         fetch();
 107 
 108         if (syntax.opEscBraceInterval()) {
 109             if (c != syntax.metaCharTable.esc) return invalidRangeQuantifier(synAllow);
 110             fetch();
 111         }
 112 
 113         if (c != '}') return invalidRangeQuantifier(synAllow);
 114 
 115         if (!isRepeatInfinite(up) && low > up) {
 116             newValueException(ERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE);
 117         }
 118 
 119         token.type = TokenType.INTERVAL;
 120         token.setRepeatLower(low);
 121         token.setRepeatUpper(up);
 122 
 123         return ret; /* 0: normal {n,m}, 2: fixed {n} */
 124     }
 125 
 126     private int invalidRangeQuantifier(boolean synAllow) {
 127         if (synAllow) {
 128             restore();
 129             return 1;
 130         } else {
 131             newSyntaxException(ERR_INVALID_REPEAT_RANGE_PATTERN);
 132             return 0; // not reached
 133         }
 134     }
 135 
 136     /* \M-, \C-, \c, or \... */
 137     private int fetchEscapedValue() {
 138         if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE);
 139         fetch();
 140 
 141         switch(c) {
 142 
 143         case 'M':
 144             if (syntax.op2EscCapitalMBarMeta()) {
 145                 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_META);
 146                 fetch();
 147                 if (c != '-') newSyntaxException(ERR_META_CODE_SYNTAX);
 148                 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_META);
 149                 fetch();
 150                 if (c == syntax.metaCharTable.esc) {
 151                     c = fetchEscapedValue();
 152                 }
 153                 c = ((c & 0xff) | 0x80);
 154             } else {
 155                 fetchEscapedValueBackSlash();
 156             }
 157             break;
 158 
 159         case 'C':
 160             if (syntax.op2EscCapitalCBarControl()) {
 161                 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_CONTROL);
 162                 fetch();
 163                 if (c != '-') newSyntaxException(ERR_CONTROL_CODE_SYNTAX);
 164                 fetchEscapedValueControl();
 165             } else {
 166                 fetchEscapedValueBackSlash();
 167             }
 168             break;
 169 
 170         case 'c':
 171             if (syntax.opEscCControl()) {
 172                 fetchEscapedValueControl();
 173             }
 174             /* fall through */
 175 
 176         default:
 177             fetchEscapedValueBackSlash();
 178         } // switch
 179 
 180         return c; // ???
 181     }
 182 
 183     private void fetchEscapedValueBackSlash() {
 184         c = env.convertBackslashValue(c);
 185     }
 186 
 187     private void fetchEscapedValueControl() {
 188         if (!left()) newSyntaxException(ERR_END_PATTERN_AT_CONTROL);
 189         fetch();
 190         if (c == '?') {
 191             c = 0177;
 192         } else {
 193             if (c == syntax.metaCharTable.esc) {
 194                 c = fetchEscapedValue();
 195             }
 196             c &= 0x9f;
 197         }
 198     }
 199 
 200     private int nameEndCodePoint(int start) {
 201         switch(start) {
 202         case '<':
 203             return '>';
 204         case '\'':
 205             return '\'';
 206         default:
 207             return 0;
 208         }
 209     }
 210 
 211     // USE_NAMED_GROUP && USE_BACKREF_AT_LEVEL
 212     /*
 213         \k<name+n>, \k<name-n>
 214         \k<num+n>,  \k<num-n>
 215         \k<-num+n>, \k<-num-n>
 216      */
 217 
 218     // value implicit (rnameEnd)
 219     private boolean fetchNameWithLevel(int startCode, Ptr rbackNum, Ptr rlevel) {
 220         int src = p;
 221         boolean existLevel = false;
 222         int isNum = 0;
 223         int sign = 1;
 224 
 225         int endCode = nameEndCodePoint(startCode);
 226         int pnumHead = p;
 227         int nameEnd = stop;
 228 
 229         String err = null;
 230         if (!left()) {
 231             newValueException(ERR_EMPTY_GROUP_NAME);
 232         } else {
 233             fetch();
 234             if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME);
 235             if (Character.isDigit(c)) {
 236                 isNum = 1;
 237             } else if (c == '-') {
 238                 isNum = 2;
 239                 sign = -1;
 240                 pnumHead = p;
 241             } else if (!EncodingHelper.isWord(c)) {
 242                 err = ERR_INVALID_GROUP_NAME;
 243             }
 244         }
 245 
 246         while (left()) {
 247             nameEnd = p;
 248             fetch();
 249             if (c == endCode || c == ')' || c == '+' || c == '-') {
 250                 if (isNum == 2) err = ERR_INVALID_GROUP_NAME;
 251                 break;
 252             }
 253 
 254             if (isNum != 0) {
 255                 if (EncodingHelper.isDigit(c)) {
 256                     isNum = 1;
 257                 } else {
 258                     err = ERR_INVALID_GROUP_NAME;
 259                     // isNum = 0;
 260                 }
 261             } else if (!EncodingHelper.isWord(c)) {
 262                 err = ERR_INVALID_CHAR_IN_GROUP_NAME;
 263             }
 264         }
 265 
 266         boolean isEndCode = false;
 267         if (err == null && c != endCode) {
 268             if (c == '+' || c == '-') {
 269                 int flag = c == '-' ? -1 : 1;
 270 
 271                 fetch();
 272                 if (!EncodingHelper.isDigit(c)) newValueException(ERR_INVALID_GROUP_NAME, src, stop);
 273                 unfetch();
 274                 int level = scanUnsignedNumber();
 275                 if (level < 0) newValueException(ERR_TOO_BIG_NUMBER);
 276                 rlevel.p = level * flag;
 277                 existLevel = true;
 278 
 279                 fetch();
 280                 isEndCode = c == endCode;
 281             }
 282 
 283             if (!isEndCode) {
 284                 err = ERR_INVALID_GROUP_NAME;
 285                 nameEnd = stop;
 286             }
 287         }
 288 
 289         if (err == null) {
 290             if (isNum != 0) {
 291                 mark();
 292                 p = pnumHead;
 293                 int backNum = scanUnsignedNumber();
 294                 restore();
 295                 if (backNum < 0) {
 296                     newValueException(ERR_TOO_BIG_NUMBER);
 297                 } else if (backNum == 0) {
 298                     newValueException(ERR_INVALID_GROUP_NAME, src, stop);
 299                 }
 300                 rbackNum.p = backNum * sign;
 301             }
 302             value = nameEnd;
 303             return existLevel;
 304         } else {
 305             newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd);
 306             return false; // not reached
 307         }
 308     }
 309 
 310     // USE_NAMED_GROUP
 311     // ref: 0 -> define name    (don't allow number name)
 312     //      1 -> reference name (allow number name)
 313     private int fetchNameForNamedGroup(int startCode, boolean ref) {
 314         int src = p;
 315         value = 0;
 316 
 317         int isNum = 0;
 318         int sign = 1;
 319 
 320         int endCode = nameEndCodePoint(startCode);
 321         int pnumHead = p;
 322         int nameEnd = stop;
 323 
 324         String err = null;
 325         if (!left()) {
 326             newValueException(ERR_EMPTY_GROUP_NAME);
 327         } else {
 328             fetch();
 329             if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME);
 330             if (EncodingHelper.isDigit(c)) {
 331                 if (ref) {
 332                     isNum = 1;
 333                 } else {
 334                     err = ERR_INVALID_GROUP_NAME;
 335                     // isNum = 0;
 336                 }
 337             } else if (c == '-') {
 338                 if (ref) {
 339                     isNum = 2;
 340                     sign = -1;
 341                     pnumHead = p;
 342                 } else {
 343                     err = ERR_INVALID_GROUP_NAME;
 344                     // isNum = 0;
 345                 }
 346             } else if (!EncodingHelper.isWord(c)) {
 347                 err = ERR_INVALID_CHAR_IN_GROUP_NAME;
 348             }
 349         }
 350 
 351         if (err == null) {
 352             while (left()) {
 353                 nameEnd = p;
 354                 fetch();
 355                 if (c == endCode || c == ')') {
 356                     if (isNum == 2) err = ERR_INVALID_GROUP_NAME;
 357                     break;
 358                 }
 359 
 360                 if (isNum != 0) {
 361                     if (EncodingHelper.isDigit(c)) {
 362                         isNum = 1;
 363                     } else {
 364                         if (!EncodingHelper.isWord(c)) {
 365                             err = ERR_INVALID_CHAR_IN_GROUP_NAME;
 366                         } else {
 367                             err = ERR_INVALID_GROUP_NAME;
 368                         }
 369                         // isNum = 0;
 370                     }
 371                 } else {
 372                     if (!EncodingHelper.isWord(c)) {
 373                         err = ERR_INVALID_CHAR_IN_GROUP_NAME;
 374                     }
 375                 }
 376             }
 377 
 378             if (c != endCode) {
 379                 err = ERR_INVALID_GROUP_NAME;
 380                 nameEnd = stop;
 381             }
 382 
 383             int backNum = 0;
 384             if (isNum != 0) {
 385                 mark();
 386                 p = pnumHead;
 387                 backNum = scanUnsignedNumber();
 388                 restore();
 389                 if (backNum < 0) {
 390                     newValueException(ERR_TOO_BIG_NUMBER);
 391                 } else if (backNum == 0) {
 392                     newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd);
 393                 }
 394                 backNum *= sign;
 395             }
 396             value = nameEnd;
 397             return backNum;
 398         } else {
 399             while (left()) {
 400                 nameEnd = p;
 401                 fetch();
 402                 if (c == endCode || c == ')') break;
 403             }
 404             if (!left()) nameEnd = stop;
 405             newValueException(err, src, nameEnd);
 406             return 0; // not reached
 407         }
 408     }
 409 
 410     // #else USE_NAMED_GROUP
 411     // make it return nameEnd!
 412     private final int fetchNameForNoNamedGroup(int startCode, boolean ref) {
 413         int src = p;
 414         value = 0;
 415 
 416         int isNum = 0;
 417         int sign = 1;
 418 
 419         int endCode = nameEndCodePoint(startCode);
 420         int pnumHead = p;
 421         int nameEnd = stop;
 422 
 423         String err = null;
 424         if (!left()) {
 425             newValueException(ERR_EMPTY_GROUP_NAME);
 426         } else {
 427             fetch();
 428             if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME);
 429 
 430             if (EncodingHelper.isDigit(c)) {
 431                 isNum = 1;
 432             } else if (c == '-') {
 433                 isNum = 2;
 434                 sign = -1;
 435                 pnumHead = p;
 436             } else {
 437                 err = ERR_INVALID_CHAR_IN_GROUP_NAME;
 438             }
 439         }
 440 
 441         while(left()) {
 442             nameEnd = p;
 443 
 444             fetch();
 445             if (c == endCode || c == ')') break;
 446             if (!EncodingHelper.isDigit(c)) err = ERR_INVALID_CHAR_IN_GROUP_NAME;
 447         }
 448 
 449         if (err == null && c != endCode) {
 450             err = ERR_INVALID_GROUP_NAME;
 451             nameEnd = stop;
 452         }
 453 
 454         if (err == null) {
 455             mark();
 456             p = pnumHead;
 457             int backNum = scanUnsignedNumber();
 458             restore();
 459             if (backNum < 0) {
 460                 newValueException(ERR_TOO_BIG_NUMBER);
 461             } else if (backNum == 0){
 462                 newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd);
 463             }
 464             backNum *= sign;
 465 
 466             value = nameEnd;
 467             return backNum;
 468         } else {
 469             newValueException(err, src, nameEnd);
 470             return 0; // not reached
 471         }
 472     }
 473 
 474     protected final int fetchName(int startCode, boolean ref) {
 475         if (Config.USE_NAMED_GROUP) {
 476             return fetchNameForNamedGroup(startCode, ref);
 477         } else {
 478             return fetchNameForNoNamedGroup(startCode, ref);
 479         }
 480     }
 481 
 482     private boolean strExistCheckWithEsc(int[]s, int n, int bad) {
 483         int p = this.p;
 484         int to = this.stop;
 485 
 486         boolean inEsc = false;
 487         int i=0;
 488         while(p < to) {
 489             if (inEsc) {
 490                 inEsc = false;
 491                 p ++;
 492             } else {
 493                 int x = chars[p];
 494                 int q = p + 1;
 495                 if (x == s[0]) {
 496                     for (i=1; i<n && q < to; i++) {
 497                         x = chars[q];
 498                         if (x != s[i]) break;
 499                         q++;
 500                     }
 501                     if (i >= n) return true;
 502                     p++;
 503                 } else {
 504                     x = chars[p];
 505                     if (x == bad) return false;
 506                     else if (x == syntax.metaCharTable.esc) inEsc = true;
 507                     p = q;
 508                 }
 509             }
 510         }
 511         return false;
 512     }
 513 
 514     private static final int send[] = new int[]{':', ']'};
 515 
 516     private void fetchTokenInCCFor_charType(boolean flag, int type) {
 517         token.type = TokenType.CHAR_TYPE;
 518         token.setPropCType(type);
 519         token.setPropNot(flag);
 520     }
 521 
 522     private void fetchTokenInCCFor_p() {
 523         int c2 = peek(); // !!! migrate to peekIs
 524         if (c2 == '{' && syntax.op2EscPBraceCharProperty()) {
 525             inc();
 526             token.type = TokenType.CHAR_PROPERTY;
 527             token.setPropNot(c == 'P');
 528 
 529             if (syntax.op2EscPBraceCircumflexNot()) {
 530                 c2 = fetchTo();
 531                 if (c2 == '^') {
 532                     token.setPropNot(!token.getPropNot());
 533                 } else {
 534                     unfetch();
 535                 }
 536             }
 537         } else {
 538             syntaxWarn(Warnings.INVALID_UNICODE_PROPERTY, (char)c);
 539         }
 540     }
 541 
 542     private void fetchTokenInCCFor_x() {
 543         if (!left()) return;
 544         int last = p;
 545 
 546         if (peekIs('{') && syntax.opEscXBraceHex8()) {
 547             inc();
 548             int num = scanUnsignedHexadecimalNumber(8);
 549             if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE);
 550             if (left()) {
 551                 int c2 = peek();
 552                 if (EncodingHelper.isXDigit(c2)) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
 553             }
 554 
 555             if (p > last + 1 && left() && peekIs('}')) {
 556                 inc();
 557                 token.type = TokenType.CODE_POINT;
 558                 token.base = 16;
 559                 token.setCode(num);
 560             } else {
 561                 /* can't read nothing or invalid format */
 562                 p = last;
 563             }
 564         } else if (syntax.opEscXHex2()) {
 565             int num = scanUnsignedHexadecimalNumber(2);
 566             if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
 567             if (p == last) { /* can't read nothing. */
 568                 num = 0; /* but, it's not error */
 569             }
 570             token.type = TokenType.RAW_BYTE;
 571             token.base = 16;
 572             token.setC(num);
 573         }
 574     }
 575 
 576     private void fetchTokenInCCFor_u() {
 577         if (!left()) return;
 578         int last = p;
 579 
 580         if (syntax.op2EscUHex4()) {
 581             int num = scanUnsignedHexadecimalNumber(4);
 582             if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
 583             if (p == last) {  /* can't read nothing. */
 584                 num = 0; /* but, it's not error */
 585             }
 586             token.type = TokenType.CODE_POINT;
 587             token.base = 16;
 588             token.setCode(num);
 589         }
 590     }
 591 
 592     private void fetchTokenInCCFor_digit() {
 593         if (syntax.opEscOctal3()) {
 594             unfetch();
 595             int last = p;
 596             int num = scanUnsignedOctalNumber(3);
 597             if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
 598             if (p == last) {  /* can't read nothing. */
 599                 num = 0; /* but, it's not error */
 600             }
 601             token.type = TokenType.RAW_BYTE;
 602             token.base = 8;
 603             token.setC(num);
 604         }
 605     }
 606 
 607     private void fetchTokenInCCFor_posixBracket() {
 608         if (syntax.opPosixBracket() && peekIs(':')) {
 609             token.backP = p; /* point at '[' is readed */
 610             inc();
 611             if (strExistCheckWithEsc(send, send.length, ']')) {
 612                 token.type = TokenType.POSIX_BRACKET_OPEN;
 613             } else {
 614                 unfetch();
 615                 // remove duplication, goto cc_in_cc;
 616                 if (syntax.op2CClassSetOp()) {
 617                     token.type = TokenType.CC_CC_OPEN;
 618                 } else {
 619                     env.ccEscWarn("[");
 620                 }
 621             }
 622         } else { // cc_in_cc:
 623             if (syntax.op2CClassSetOp()) {
 624                 token.type = TokenType.CC_CC_OPEN;
 625             } else {
 626                 env.ccEscWarn("[");
 627             }
 628         }
 629     }
 630 
 631     private void fetchTokenInCCFor_and() {
 632         if (syntax.op2CClassSetOp() && left() && peekIs('&')) {
 633             inc();
 634             token.type = TokenType.CC_AND;
 635         }
 636     }
 637 
 638     protected final TokenType fetchTokenInCC() {
 639         if (!left()) {
 640             token.type = TokenType.EOT;
 641             return token.type;
 642         }
 643 
 644         fetch();
 645         token.type = TokenType.CHAR;
 646         token.base = 0;
 647         token.setC(c);
 648         token.escaped = false;
 649 
 650         if (c == ']') {
 651             token.type = TokenType.CC_CLOSE;
 652         } else if (c == '-') {
 653             token.type = TokenType.CC_RANGE;
 654         } else if (c == syntax.metaCharTable.esc) {
 655             if (!syntax.backSlashEscapeInCC()) return token.type;
 656             if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE);
 657             fetch();
 658             token.escaped = true;
 659             token.setC(c);
 660 
 661             switch (c) {
 662             case 'w':
 663                 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
 664                 break;
 665             case 'W':
 666                 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
 667                 break;
 668             case 'd':
 669                 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
 670                 break;
 671             case 'D':
 672                 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
 673                 break;
 674             case 's':
 675                 fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
 676                 break;
 677             case 'S':
 678                 fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
 679                 break;
 680             case 'h':
 681                 if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
 682                 break;
 683             case 'H':
 684                 if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(true, CharacterType.XDIGIT);
 685                 break;
 686             case 'p':
 687             case 'P':
 688                 fetchTokenInCCFor_p();
 689                 break;
 690             case 'x':
 691                 fetchTokenInCCFor_x();
 692                 break;
 693             case 'u':
 694                 fetchTokenInCCFor_u();
 695                 break;
 696             case '0':
 697             case '1':
 698             case '2':
 699             case '3':
 700             case '4':
 701             case '5':
 702             case '6':
 703             case '7':
 704                 fetchTokenInCCFor_digit();
 705                 break;
 706 
 707             default:
 708                 unfetch();
 709                 int num = fetchEscapedValue();
 710                 if (token.getC() != num) {
 711                     token.setCode(num);
 712                     token.type = TokenType.CODE_POINT;
 713                 }
 714                 break;
 715             } // switch
 716 
 717         } else if (c == '[') {
 718             fetchTokenInCCFor_posixBracket();
 719         } else if (c == '&') {
 720             fetchTokenInCCFor_and();
 721         }
 722         return token.type;
 723     }
 724 
 725     protected final int backrefRelToAbs(int relNo) {
 726         return env.numMem + 1 + relNo;
 727     }
 728 
 729     private void fetchTokenFor_repeat(int lower, int upper) {
 730         token.type = TokenType.OP_REPEAT;
 731         token.setRepeatLower(lower);
 732         token.setRepeatUpper(upper);
 733         greedyCheck();
 734     }
 735 
 736     private void fetchTokenFor_openBrace() {
 737         switch (fetchRangeQuantifier()) {
 738         case 0:
 739             greedyCheck();
 740             break;
 741         case 2:
 742             if (syntax.fixedIntervalIsGreedyOnly()) {
 743                 possessiveCheck();
 744             } else {
 745                 greedyCheck();
 746             }
 747             break;
 748         default: /* 1 : normal char */
 749         } // inner switch
 750     }
 751 
 752     private void fetchTokenFor_anchor(int subType) {
 753         token.type = TokenType.ANCHOR;
 754         token.setAnchor(subType);
 755     }
 756 
 757     private void fetchTokenFor_xBrace() {
 758         if (!left()) return;
 759 
 760         int last = p;
 761         if (peekIs('{') && syntax.opEscXBraceHex8()) {
 762             inc();
 763             int num = scanUnsignedHexadecimalNumber(8);
 764             if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE);
 765             if (left()) {
 766                 if (EncodingHelper.isXDigit(peek())) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE);
 767             }
 768 
 769             if (p > last + 1 && left() && peekIs('}')) {
 770                 inc();
 771                 token.type = TokenType.CODE_POINT;
 772                 token.setCode(num);
 773             } else {
 774                 /* can't read nothing or invalid format */
 775                 p = last;
 776             }
 777         } else if (syntax.opEscXHex2()) {
 778             int num = scanUnsignedHexadecimalNumber(2);
 779             if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
 780             if (p == last) { /* can't read nothing. */
 781                 num = 0; /* but, it's not error */
 782             }
 783             token.type = TokenType.RAW_BYTE;
 784             token.base = 16;
 785             token.setC(num);
 786         }
 787     }
 788 
 789     private void fetchTokenFor_uHex() {
 790         if (!left()) return;
 791         int last = p;
 792 
 793         if (syntax.op2EscUHex4()) {
 794             int num = scanUnsignedHexadecimalNumber(4);
 795             if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
 796             if (p == last) { /* can't read nothing. */
 797                 num = 0; /* but, it's not error */
 798             }
 799             token.type = TokenType.CODE_POINT;
 800             token.base = 16;
 801             token.setCode(num);
 802         }
 803     }
 804 
 805     private void fetchTokenFor_digit() {
 806         unfetch();
 807         int last = p;
 808         int num = scanUnsignedNumber();
 809         if (num < 0 || num > Config.MAX_BACKREF_NUM) { // goto skip_backref
 810         } else if (syntax.opDecimalBackref() && (num <= env.numMem || num <= 9)) { /* This spec. from GNU regex */
 811             if (syntax.strictCheckBackref()) {
 812                 if (num > env.numMem || env.memNodes == null || env.memNodes[num] == null) newValueException(ERR_INVALID_BACKREF);
 813             }
 814             token.type = TokenType.BACKREF;
 815             token.setBackrefNum(1);
 816             token.setBackrefRef1(num);
 817             token.setBackrefByName(false);
 818             if (Config.USE_BACKREF_WITH_LEVEL) token.setBackrefExistLevel(false);
 819             return;
 820         }
 821 
 822         if (c == '8' || c == '9') { /* normal char */ // skip_backref:
 823             p = last;
 824             inc();
 825             return;
 826         }
 827         p = last;
 828 
 829         fetchTokenFor_zero(); /* fall through */
 830     }
 831 
 832     private void fetchTokenFor_zero() {
 833         if (syntax.opEscOctal3()) {
 834             int last = p;
 835             int num = scanUnsignedOctalNumber(c == '0' ? 2 : 3);
 836             if (num < 0) newValueException(ERR_TOO_BIG_NUMBER);
 837             if (p == last) { /* can't read nothing. */
 838                 num = 0; /* but, it's not error */
 839             }
 840             token.type = TokenType.RAW_BYTE;
 841             token.base = 8;
 842             token.setC(num);
 843         } else if (c != '0') {
 844             inc();
 845         }
 846     }
 847 
 848     private void fetchTokenFor_namedBackref() {
 849         if (syntax.op2EscKNamedBackref()) {
 850             if (left()) {
 851                 fetch();
 852                 if (c =='<' || c == '\'') {
 853                     int last = p;
 854                     int backNum;
 855                     if (Config.USE_BACKREF_WITH_LEVEL) {
 856                         Ptr rbackNum = new Ptr();
 857                         Ptr rlevel = new Ptr();
 858                         token.setBackrefExistLevel(fetchNameWithLevel(c, rbackNum, rlevel));
 859                         token.setBackrefLevel(rlevel.p);
 860                         backNum = rbackNum.p;
 861                     } else {
 862                         backNum = fetchName(c, true);
 863                     } // USE_BACKREF_AT_LEVEL
 864                     int nameEnd = value; // set by fetchNameWithLevel/fetchName
 865 
 866                     if (backNum != 0) {
 867                         if (backNum < 0) {
 868                             backNum = backrefRelToAbs(backNum);
 869                             if (backNum <= 0) newValueException(ERR_INVALID_BACKREF);
 870                         }
 871 
 872                         if (syntax.strictCheckBackref() && (backNum > env.numMem || env.memNodes == null)) {
 873                             newValueException(ERR_INVALID_BACKREF);
 874                         }
 875                         token.type = TokenType.BACKREF;
 876                         token.setBackrefByName(false);
 877                         token.setBackrefNum(1);
 878                         token.setBackrefRef1(backNum);
 879                     } else {
 880                         NameEntry e = env.reg.nameToGroupNumbers(chars, last, nameEnd);
 881                         if (e == null) newValueException(ERR_UNDEFINED_NAME_REFERENCE, last, nameEnd);
 882 
 883                         if (syntax.strictCheckBackref()) {
 884                             if (e.backNum == 1) {
 885                                 if (e.backRef1 > env.numMem ||
 886                                     env.memNodes == null ||
 887                                     env.memNodes[e.backRef1] == null) newValueException(ERR_INVALID_BACKREF);
 888                             } else {
 889                                 for (int i=0; i<e.backNum; i++) {
 890                                     if (e.backRefs[i] > env.numMem ||
 891                                         env.memNodes == null ||
 892                                         env.memNodes[e.backRefs[i]] == null) newValueException(ERR_INVALID_BACKREF);
 893                                 }
 894                             }
 895                         }
 896 
 897                         token.type = TokenType.BACKREF;
 898                         token.setBackrefByName(true);
 899 
 900                         if (e.backNum == 1) {
 901                             token.setBackrefNum(1);
 902                             token.setBackrefRef1(e.backRef1);
 903                         } else {
 904                             token.setBackrefNum(e.backNum);
 905                             token.setBackrefRefs(e.backRefs);
 906                         }
 907                     }
 908                 } else {
 909                     unfetch();
 910                     syntaxWarn(Warnings.INVALID_BACKREFERENCE);
 911                 }
 912             } else {
 913                 syntaxWarn(Warnings.INVALID_BACKREFERENCE);
 914             }
 915         }
 916     }
 917 
 918     private void fetchTokenFor_subexpCall() {
 919         if (syntax.op2EscGSubexpCall()) {
 920             if (left()) {
 921                 fetch();
 922                 if (c == '<' || c == '\'') {
 923                     int last = p;
 924                     int gNum = fetchName(c, true);
 925                     int nameEnd = value;
 926                     token.type = TokenType.CALL;
 927                     token.setCallNameP(last);
 928                     token.setCallNameEnd(nameEnd);
 929                     token.setCallGNum(gNum);
 930                 } else {
 931                     unfetch();
 932                     syntaxWarn(Warnings.INVALID_SUBEXP_CALL);
 933                 }
 934             } else {
 935                 syntaxWarn(Warnings.INVALID_SUBEXP_CALL);
 936             }
 937         }
 938     }
 939 
 940     private void fetchTokenFor_charProperty() {
 941         if (peekIs('{') && syntax.op2EscPBraceCharProperty()) {
 942             inc();
 943             token.type = TokenType.CHAR_PROPERTY;
 944             token.setPropNot(c == 'P');
 945 
 946             if (syntax.op2EscPBraceCircumflexNot()) {
 947                 fetch();
 948                 if (c == '^') {
 949                     token.setPropNot(!token.getPropNot());
 950                 } else {
 951                     unfetch();
 952                 }
 953             }
 954         } else {
 955             syntaxWarn(Warnings.INVALID_UNICODE_PROPERTY, (char)c);
 956         }
 957     }
 958 
 959     private void fetchTokenFor_metaChars() {
 960         if (c == syntax.metaCharTable.anyChar) {
 961             token.type = TokenType.ANYCHAR;
 962         } else if (c == syntax.metaCharTable.anyTime) {
 963             fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
 964         }  else if (c == syntax.metaCharTable.zeroOrOneTime) {
 965             fetchTokenFor_repeat(0, 1);
 966         } else if (c == syntax.metaCharTable.oneOrMoreTime) {
 967             fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
 968         } else if (c == syntax.metaCharTable.anyCharAnyTime) {
 969             token.type = TokenType.ANYCHAR_ANYTIME;
 970             // goto out
 971         }
 972     }
 973 
 974     protected final TokenType fetchToken() {
 975         // mark(); // out
 976         start:
 977         while(true) {
 978             if (!left()) {
 979                 token.type = TokenType.EOT;
 980                 return token.type;
 981             }
 982 
 983             token.type = TokenType.STRING;
 984             token.base = 0;
 985             token.backP = p;
 986 
 987             fetch();
 988 
 989             if (c == syntax.metaCharTable.esc && !syntax.op2IneffectiveEscape()) { // IS_MC_ESC_CODE(code, syn)
 990                 if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE);
 991 
 992                 token.backP = p;
 993                 fetch();
 994 
 995                 token.setC(c);
 996                 token.escaped = true;
 997                 switch(c) {
 998 
 999                 case '*':
1000                     if (syntax.opEscAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
1001                     break;
1002                 case '+':
1003                     if (syntax.opEscPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
1004                     break;
1005                 case '?':
1006                     if (syntax.opEscQMarkZeroOne()) fetchTokenFor_repeat(0, 1);
1007                     break;
1008                 case '{':
1009                     if (syntax.opEscBraceInterval()) fetchTokenFor_openBrace();
1010                     break;
1011                 case '|':
1012                     if (syntax.opEscVBarAlt()) token.type = TokenType.ALT;
1013                     break;
1014                 case '(':
1015                     if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_OPEN;
1016                     break;
1017                 case ')':
1018                     if (syntax.opEscLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE;
1019                     break;
1020                 case 'w':
1021                     if (syntax.opEscWWord()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
1022                     break;
1023                 case 'W':
1024                     if (syntax.opEscWWord()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD);
1025                     break;
1026                 case 'b':
1027                     if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.WORD_BOUND);
1028                     break;
1029                 case 'B':
1030                     if (syntax.opEscBWordBound()) fetchTokenFor_anchor(AnchorType.NOT_WORD_BOUND);
1031                     break;
1032                 case '<':
1033                     if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_BEGIN);
1034                     break;
1035                 case '>':
1036                     if (Config.USE_WORD_BEGIN_END && syntax.opEscLtGtWordBeginEnd()) fetchTokenFor_anchor(AnchorType.WORD_END);
1037                     break;
1038                 case 's':
1039                     if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
1040                     break;
1041                 case 'S':
1042                     if (syntax.opEscSWhiteSpace()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE);
1043                     break;
1044                 case 'd':
1045                     if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(false, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
1046                     break;
1047                 case 'D':
1048                     if (syntax.opEscDDigit()) fetchTokenInCCFor_charType(true, Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT);
1049                     break;
1050                 case 'h':
1051                     if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(false, CharacterType.XDIGIT);
1052                     break;
1053                 case 'H':
1054                     if (syntax.op2EscHXDigit()) fetchTokenInCCFor_charType(true, CharacterType.XDIGIT);
1055                     break;
1056                 case 'A':
1057                     if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
1058                     break;
1059                 case 'Z':
1060                     if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.SEMI_END_BUF);
1061                     break;
1062                 case 'z':
1063                     if (syntax.opEscAZBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF);
1064                     break;
1065                 case 'G':
1066                     if (syntax.opEscCapitalGBeginAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_POSITION);
1067                     break;
1068                 case '`':
1069                     if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.BEGIN_BUF);
1070                     break;
1071                 case '\'':
1072                     if (syntax.op2EscGnuBufAnchor()) fetchTokenFor_anchor(AnchorType.END_BUF);
1073                     break;
1074                 case 'x':
1075                     fetchTokenFor_xBrace();
1076                     break;
1077                 case 'u':
1078                     fetchTokenFor_uHex();
1079                     break;
1080                 case '1':
1081                 case '2':
1082                 case '3':
1083                 case '4':
1084                 case '5':
1085                 case '6':
1086                 case '7':
1087                 case '8':
1088                 case '9':
1089                     fetchTokenFor_digit();
1090                     break;
1091                 case '0':
1092                     fetchTokenFor_zero();
1093                     break;
1094                 case 'k':
1095                     if (Config.USE_NAMED_GROUP) fetchTokenFor_namedBackref();
1096                     break;
1097                 case 'g':
1098                     if (Config.USE_SUBEXP_CALL) fetchTokenFor_subexpCall();
1099                     break;
1100                 case 'Q':
1101                     if (syntax.op2EscCapitalQQuote()) token.type = TokenType.QUOTE_OPEN;
1102                     break;
1103                 case 'p':
1104                 case 'P':
1105                     fetchTokenFor_charProperty();
1106                     break;
1107 
1108                 default:
1109                     unfetch();
1110                     int num = fetchEscapedValue();
1111 
1112                     /* set_raw: */
1113                     if (token.getC() != num) {
1114                         token.type = TokenType.CODE_POINT;
1115                         token.setCode(num);
1116                     } else { /* string */
1117                         p = token.backP + 1;
1118                     }
1119                     break;
1120 
1121                 } // switch (c)
1122 
1123             } else {
1124                 token.setC(c);
1125                 token.escaped = false;
1126 
1127                 if (Config.USE_VARIABLE_META_CHARS && (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters())) {
1128                     fetchTokenFor_metaChars();
1129                     break;
1130                 }
1131 
1132                 {
1133                     switch(c) {
1134                     case '.':
1135                         if (syntax.opDotAnyChar()) token.type = TokenType.ANYCHAR;
1136                         break;
1137                     case '*':
1138                         if (syntax.opAsteriskZeroInf()) fetchTokenFor_repeat(0, QuantifierNode.REPEAT_INFINITE);
1139                         break;
1140                     case '+':
1141                         if (syntax.opPlusOneInf()) fetchTokenFor_repeat(1, QuantifierNode.REPEAT_INFINITE);
1142                         break;
1143                     case '?':
1144                         if (syntax.opQMarkZeroOne()) fetchTokenFor_repeat(0, 1);
1145                         break;
1146                     case '{':
1147                         if (syntax.opBraceInterval()) fetchTokenFor_openBrace();
1148                         break;
1149                     case '|':
1150                         if (syntax.opVBarAlt()) token.type = TokenType.ALT;
1151                         break;
1152 
1153                     case '(':
1154                         if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
1155                             inc();
1156                             if (peekIs('#')) {
1157                                 fetch();
1158                                 while (true) {
1159                                     if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP);
1160                                     fetch();
1161                                     if (c == syntax.metaCharTable.esc) {
1162                                         if (left()) fetch();
1163                                     } else {
1164                                         if (c == ')') break;
1165                                     }
1166                                 }
1167                                 continue start; // goto start
1168                             }
1169                             unfetch();
1170                         }
1171 
1172                         if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_OPEN;
1173                         break;
1174                     case ')':
1175                         if (syntax.opLParenSubexp()) token.type = TokenType.SUBEXP_CLOSE;
1176                         break;
1177                     case '^':
1178                         if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE);
1179                         break;
1180                     case '$':
1181                         if (syntax.opLineAnchor()) fetchTokenFor_anchor(isSingleline(env.option) ? AnchorType.SEMI_END_BUF : AnchorType.END_LINE);
1182                         break;
1183                     case '[':
1184                         if (syntax.opBracketCC()) token.type = TokenType.CC_CC_OPEN;
1185                         break;
1186                     case ']':
1187                         //if (*src > env->pattern)   /* /].../ is allowed. */
1188                         //CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
1189                         break;
1190                     case '#':
1191                         if (Option.isExtend(env.option)) {
1192                             while (left()) {
1193                                 fetch();
1194                                 if (EncodingHelper.isNewLine(c)) break;
1195                             }
1196                             continue start; // goto start
1197                         }
1198                         break;
1199 
1200                     case ' ':
1201                     case '\t':
1202                     case '\n':
1203                     case '\r':
1204                     case '\f':
1205                         if (Option.isExtend(env.option)) continue start; // goto start
1206                         break;
1207 
1208                     default: // string
1209                         break;
1210 
1211                     } // switch
1212                 }
1213             }
1214 
1215             break;
1216         } // while
1217         return token.type;
1218     }
1219 
1220     private void greedyCheck() {
1221         if (left() && peekIs('?') && syntax.opQMarkNonGreedy()) {
1222 
1223             fetch();
1224 
1225             token.setRepeatGreedy(false);
1226             token.setRepeatPossessive(false);
1227         } else {
1228             possessiveCheck();
1229         }
1230     }
1231 
1232     private void possessiveCheck() {
1233         if (left() && peekIs('+') &&
1234             (syntax.op2PlusPossessiveRepeat() && token.type != TokenType.INTERVAL ||
1235              syntax.op2PlusPossessiveInterval() && token.type == TokenType.INTERVAL)) {
1236 
1237             fetch();
1238 
1239             token.setRepeatGreedy(true);
1240             token.setRepeatPossessive(true);
1241         } else {
1242             token.setRepeatGreedy(true);
1243             token.setRepeatPossessive(false);
1244         }
1245     }
1246 
1247     protected final int fetchCharPropertyToCType() {
1248         mark();
1249 
1250         while (left()) {
1251             int last = p;
1252             fetch();
1253             if (c == '}') {
1254                 String name = new String(chars, _p, last - _p);
1255                 return PosixBracket.propertyNameToCType(name);
1256             } else if (c == '(' || c == ')' || c == '{' || c == '|') {
1257                 String name = new String(chars, _p, last - _p);
1258                 throw new JOniException(ERR_INVALID_CHAR_PROPERTY_NAME.replaceAll("%n", name));
1259             }
1260         }
1261         newInternalException(ERR_PARSER_BUG);
1262         return 0; // not reached
1263     }
1264 
1265     protected final void syntaxWarn(String message, char c) {
1266         syntaxWarn(message.replace("<%n>", Character.toString(c)));
1267     }
1268 
1269     protected final void syntaxWarn(String message) {
1270         if (Config.USE_WARN) {
1271             env.reg.warnings.warn(message + ": /" + new String(chars, getBegin(), getEnd()) + "/");
1272         }
1273     }
1274 }