1 /*
   2  * Copyright (c) 2015, 2017, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
  22 
  23 import java.util.ArrayList;
  24 import java.util.Locale;
  25 import java.util.MissingResourceException;
  26 import java.util.ResourceBundle;
  27 import jdk.xml.internal.SecuritySupport;
  28 
  29 /**
  30  * A Regular Expression Parser.
  31  *
  32  * @xerces.internal
  33  *
  34  * @LastModified: Sep 2017
  35  */
  36 class RegexParser {
  37     static final int T_CHAR = 0;
  38     static final int T_EOF = 1;
  39     static final int T_OR = 2;                  // '|'
  40     static final int T_STAR = 3;                // '*'
  41     static final int T_PLUS = 4;                // '+'
  42     static final int T_QUESTION = 5;            // '?'
  43     static final int T_LPAREN = 6;              // '('
  44     static final int T_RPAREN = 7;              // ')'
  45     static final int T_DOT = 8;                 // '.'
  46     static final int T_LBRACKET = 9;            // '['
  47     static final int T_BACKSOLIDUS = 10;        // '\'
  48     static final int T_CARET = 11;              // '^'
  49     static final int T_DOLLAR = 12;             // '$'
  50     static final int T_LPAREN2 = 13;            // '(?:'
  51     static final int T_LOOKAHEAD = 14;          // '(?='
  52     static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
  53     static final int T_LOOKBEHIND = 16;         // '(?<='
  54     static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
  55     static final int T_INDEPENDENT = 18;        // '(?>'
  56     static final int T_SET_OPERATIONS = 19;     // '(?['
  57     static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
  58     static final int T_COMMENT = 21;            // '(?#'
  59     static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
  60     static final int T_CONDITION = 23;          // '(?('
  61     static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
  62 
  63     static class ReferencePosition {
  64         int refNumber;
  65         int position;
  66         ReferencePosition(int n, int pos) {
  67             this.refNumber = n;
  68             this.position = pos;
  69         }
  70     }
  71 
  72     int offset;
  73     String regex;
  74     int regexlen;
  75     int options;
  76     ResourceBundle resources;
  77     int chardata;
  78     int nexttoken;
  79     static protected final int S_NORMAL = 0;
  80     static protected final int S_INBRACKETS = 1;
  81     static protected final int S_INXBRACKETS = 2;
  82     int context = S_NORMAL;
  83     int parenOpened = 1;
  84     int parennumber = 1;
  85     boolean hasBackReferences;
  86     ArrayList<ReferencePosition> references = null;
  87 
  88     public RegexParser() {
  89         this.setLocale(Locale.getDefault());
  90     }
  91     public RegexParser(Locale locale) {
  92         this.setLocale(locale);
  93     }
  94 
  95     public void setLocale(Locale locale) {
  96         try {
  97             if (locale != null) {
  98                 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale);
  99             }
 100             else {
 101                 this.resources = SecuritySupport.getResourceBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message");
 102             }
 103         }
 104         catch (MissingResourceException mre) {
 105             throw new RuntimeException("Installation Problem???  Couldn't load messages: "
 106                                        + mre.getMessage());
 107         }
 108     }
 109 
 110     final ParseException ex(String key, int loc) {
 111         return new ParseException(this.resources.getString(key), loc);
 112     }
 113 
 114     protected final boolean isSet(int flag) {
 115         return (this.options & flag) == flag;
 116     }
 117 
 118     Token parse(String regex, int options) throws ParseException {
 119         this.options = options;
 120         this.offset = 0;
 121         this.setContext(S_NORMAL);
 122         this.parennumber = 1;
 123         this.parenOpened = 1;
 124         this.hasBackReferences = false;
 125         this.regex = regex;
 126         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
 127             this.regex = REUtil.stripExtendedComment(this.regex);
 128         this.regexlen = this.regex.length();
 129 
 130 
 131         this.next();
 132         Token ret = this.parseRegex();
 133         if (this.offset != this.regexlen)
 134             throw ex("parser.parse.1", this.offset);
 135         if (this.read() != T_EOF) {
 136             throw ex("parser.parse.1", this.offset-1);
 137         }
 138         if (this.references != null) {
 139             for (int i = 0;  i < this.references.size();  i ++) {
 140                 ReferencePosition position = this.references.get(i);
 141                 if (this.parennumber <= position.refNumber)
 142                     throw ex("parser.parse.2", position.position);
 143             }
 144             this.references.clear();
 145         }
 146         return ret;
 147     }
 148 
 149     /*
 150     public RegularExpression createRegex(String regex, int options) throws ParseException {
 151         Token tok = this.parse(regex, options);
 152         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
 153     }
 154     */
 155 
 156     protected final void setContext(int con) {
 157         this.context = con;
 158     }
 159 
 160     final int read() {
 161         return this.nexttoken;
 162     }
 163 
 164     @SuppressWarnings("fallthrough")
 165     final void next() {
 166         if (this.offset >= this.regexlen) {
 167             this.chardata = -1;
 168             this.nexttoken = T_EOF;
 169             return;
 170         }
 171 
 172         int ret;
 173         int ch = this.regex.charAt(this.offset++);
 174         this.chardata = ch;
 175 
 176         if (this.context == S_INBRACKETS) {
 177             // In a character class, this.chardata has one character, that is to say,
 178             // a pair of surrogates is composed and stored to this.chardata.
 179             switch (ch) {
 180               case '\\':
 181                 ret = T_BACKSOLIDUS;
 182                 if (this.offset >= this.regexlen)
 183                     throw ex("parser.next.1", this.offset-1);
 184                 this.chardata = this.regex.charAt(this.offset++);
 185                 break;
 186 
 187               case '-':
 188                 // Allow character class subtraction (regardless of whether we are in
 189                 // XML Schema mode or not)
 190                 if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
 191                     this.offset++;
 192                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
 193                 } else
 194                     ret = T_CHAR;
 195                 break;
 196 
 197               case '[':
 198                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
 199                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
 200                     this.offset++;
 201                     ret = T_POSIX_CHARCLASS_START;
 202                     break;
 203                 } // Through down
 204               default:
 205                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
 206                     int low = this.regex.charAt(this.offset);
 207                     if (REUtil.isLowSurrogate(low)) {
 208                         this.chardata = REUtil.composeFromSurrogates(ch, low);
 209                         this.offset ++;
 210                     }
 211                 }
 212                 ret = T_CHAR;
 213             }
 214             this.nexttoken = ret;
 215             return;
 216         }
 217 
 218         switch (ch) {
 219           case '|': ret = T_OR;             break;
 220           case '*': ret = T_STAR;           break;
 221           case '+': ret = T_PLUS;           break;
 222           case '?': ret = T_QUESTION;       break;
 223           case ')': ret = T_RPAREN;         break;
 224           case '.': ret = T_DOT;            break;
 225           case '[': ret = T_LBRACKET;       break;
 226           case '^':
 227               if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
 228                   ret = T_CHAR;
 229               }
 230               else {
 231                   ret = T_CARET;
 232               }
 233               break;
 234           case '$':
 235               if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) {
 236                   ret = T_CHAR;
 237               }
 238               else {
 239                   ret = T_DOLLAR;
 240               }
 241               break;
 242           case '(':
 243             ret = T_LPAREN;
 244             if (this.offset >= this.regexlen)
 245                 break;
 246             if (this.regex.charAt(this.offset) != '?')
 247                 break;
 248             if (++this.offset >= this.regexlen)
 249                 throw ex("parser.next.2", this.offset-1);
 250             ch = this.regex.charAt(this.offset++);
 251             switch (ch) {
 252               case ':':  ret = T_LPAREN2;            break;
 253               case '=':  ret = T_LOOKAHEAD;          break;
 254               case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
 255               case '[':  ret = T_SET_OPERATIONS;     break;
 256               case '>':  ret = T_INDEPENDENT;        break;
 257               case '<':
 258                 if (this.offset >= this.regexlen)
 259                     throw ex("parser.next.2", this.offset-3);
 260                 ch = this.regex.charAt(this.offset++);
 261                 if (ch == '=') {
 262                     ret = T_LOOKBEHIND;
 263                 } else if (ch == '!') {
 264                     ret = T_NEGATIVELOOKBEHIND;
 265                 } else
 266                     throw ex("parser.next.3", this.offset-3);
 267                 break;
 268               case '#':
 269                 while (this.offset < this.regexlen) {
 270                     ch = this.regex.charAt(this.offset++);
 271                     if (ch == ')')  break;
 272                 }
 273                 if (ch != ')')
 274                     throw ex("parser.next.4", this.offset-1);
 275                 ret = T_COMMENT;
 276                 break;
 277               default:
 278                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
 279                     this.offset --;
 280                     ret = T_MODIFIERS;
 281                     break;
 282                 } else if (ch == '(') {         // conditional
 283                     ret = T_CONDITION;          // this.offsets points the next of '('.
 284                     break;
 285                 }
 286                 throw ex("parser.next.2", this.offset-2);
 287             }
 288             break;
 289 
 290           case '\\':
 291             ret = T_BACKSOLIDUS;
 292             if (this.offset >= this.regexlen)
 293                 throw ex("parser.next.1", this.offset-1);
 294             this.chardata = this.regex.charAt(this.offset++);
 295             break;
 296 
 297           default:
 298             ret = T_CHAR;
 299         }
 300         this.nexttoken = ret;
 301     }
 302 
 303     /**
 304      * regex ::= term (`|` term)*
 305      * term ::= factor+
 306      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
 307      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
 308      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
 309      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
 310      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
 311      */
 312     Token parseRegex() throws ParseException {
 313         Token tok = this.parseTerm();
 314         Token parent = null;
 315         while (this.read() == T_OR) {
 316             this.next();                    // '|'
 317             if (parent == null) {
 318                 parent = Token.createUnion();
 319                 parent.addChild(tok);
 320                 tok = parent;
 321             }
 322             tok.addChild(this.parseTerm());
 323         }
 324         return tok;
 325     }
 326 
 327     /**
 328      * term ::= factor+
 329      */
 330     Token parseTerm() throws ParseException {
 331         int ch = this.read();
 332         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
 333             return Token.createEmpty();
 334         } else {
 335             Token tok = this.parseFactor();
 336             Token concat = null;
 337             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
 338                 if (concat == null) {
 339                     concat = Token.createConcat();
 340                     concat.addChild(tok);
 341                     tok = concat;
 342                 }
 343                 concat.addChild(this.parseFactor());
 344                 //tok = Token.createConcat(tok, this.parseFactor());
 345             }
 346             return tok;
 347         }
 348     }
 349 
 350     // ----------------------------------------------------------------
 351 
 352     Token processCaret() throws ParseException {
 353         this.next();
 354         return Token.token_linebeginning;
 355     }
 356     Token processDollar() throws ParseException {
 357         this.next();
 358         return Token.token_lineend;
 359     }
 360     Token processLookahead() throws ParseException {
 361         this.next();
 362         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
 363         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 364         this.next();                            // ')'
 365         return tok;
 366     }
 367     Token processNegativelookahead() throws ParseException {
 368         this.next();
 369         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
 370         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 371         this.next();                            // ')'
 372         return tok;
 373     }
 374     Token processLookbehind() throws ParseException {
 375         this.next();
 376         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
 377         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 378         this.next();                            // ')'
 379         return tok;
 380     }
 381     Token processNegativelookbehind() throws ParseException {
 382         this.next();
 383         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
 384         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 385         this.next();                    // ')'
 386         return tok;
 387     }
 388     Token processBacksolidus_A() throws ParseException {
 389         this.next();
 390         return Token.token_stringbeginning;
 391     }
 392     Token processBacksolidus_Z() throws ParseException {
 393         this.next();
 394         return Token.token_stringend2;
 395     }
 396     Token processBacksolidus_z() throws ParseException {
 397         this.next();
 398         return Token.token_stringend;
 399     }
 400     Token processBacksolidus_b() throws ParseException {
 401         this.next();
 402         return Token.token_wordedge;
 403     }
 404     Token processBacksolidus_B() throws ParseException {
 405         this.next();
 406         return Token.token_not_wordedge;
 407     }
 408     Token processBacksolidus_lt() throws ParseException {
 409         this.next();
 410         return Token.token_wordbeginning;
 411     }
 412     Token processBacksolidus_gt() throws ParseException {
 413         this.next();
 414         return Token.token_wordend;
 415     }
 416     Token processStar(Token tok) throws ParseException {
 417         this.next();
 418         if (this.read() == T_QUESTION) {
 419             this.next();
 420             return Token.createNGClosure(tok);
 421         } else
 422             return Token.createClosure(tok);
 423     }
 424     Token processPlus(Token tok) throws ParseException {
 425         // X+ -> XX*
 426         this.next();
 427         if (this.read() == T_QUESTION) {
 428             this.next();
 429             return Token.createConcat(tok, Token.createNGClosure(tok));
 430         } else
 431             return Token.createConcat(tok, Token.createClosure(tok));
 432     }
 433     Token processQuestion(Token tok) throws ParseException {
 434         // X? -> X|
 435         this.next();
 436         Token par = Token.createUnion();
 437         if (this.read() == T_QUESTION) {
 438             this.next();
 439             par.addChild(Token.createEmpty());
 440             par.addChild(tok);
 441         } else {
 442             par.addChild(tok);
 443             par.addChild(Token.createEmpty());
 444         }
 445         return par;
 446     }
 447     boolean checkQuestion(int off) {
 448         return off < this.regexlen && this.regex.charAt(off) == '?';
 449     }
 450     Token processParen() throws ParseException {
 451         this.next();
 452         int p = this.parenOpened++;
 453         Token tok = Token.createParen(this.parseRegex(), p);
 454         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 455         this.parennumber++;
 456         this.next();                            // Skips ')'
 457         return tok;
 458     }
 459     Token processParen2() throws ParseException {
 460         this.next();
 461         Token tok = Token.createParen(this.parseRegex(), 0);
 462         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 463         this.next();                            // Skips ')'
 464         return tok;
 465     }
 466     Token processCondition() throws ParseException {
 467                                                 // this.offset points the next of '('
 468         if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
 469                                                 // Parses a condition.
 470         int refno = -1;
 471         Token condition = null;
 472         int ch = this.regex.charAt(this.offset);
 473         if ('1' <= ch && ch <= '9') {
 474             refno = ch-'0';
 475             int finalRefno = refno;
 476 
 477             if (this.parennumber <= refno)
 478                 throw ex("parser.parse.2", this.offset);
 479 
 480             while (this.offset + 1 < this.regexlen) {
 481                 ch = this.regex.charAt(this.offset + 1);
 482                 if ('0' <= ch && ch <= '9') {
 483                     refno = (refno * 10) + (ch - '0');
 484                     if (refno < this.parennumber) {
 485                         finalRefno= refno;
 486                         ++this.offset;
 487                     }
 488                     else {
 489                         break;
 490                     }
 491                 }
 492                 else {
 493                     break;
 494                 }
 495             }
 496 
 497             this.hasBackReferences = true;
 498             if (this.references == null)  this.references = new ArrayList<>();
 499             this.references.add(new ReferencePosition(finalRefno, this.offset));
 500             this.offset ++;
 501             if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
 502             this.offset ++;
 503         } else {
 504             if (ch == '?')  this.offset --; // Points '('.
 505             this.next();
 506             condition = this.parseFactor();
 507             switch (condition.type) {
 508               case Token.LOOKAHEAD:
 509               case Token.NEGATIVELOOKAHEAD:
 510               case Token.LOOKBEHIND:
 511               case Token.NEGATIVELOOKBEHIND:
 512                 break;
 513               case Token.ANCHOR:
 514                 if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 515                 break;
 516               default:
 517                 throw ex("parser.factor.5", this.offset);
 518             }
 519         }
 520                                                 // Parses yes/no-patterns.
 521         this.next();
 522         Token yesPattern = this.parseRegex();
 523         Token noPattern = null;
 524         if (yesPattern.type == Token.UNION) {
 525             if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
 526             noPattern = yesPattern.getChild(1);
 527             yesPattern = yesPattern.getChild(0);
 528         }
 529         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 530         this.next();
 531         return Token.createCondition(refno, condition, yesPattern, noPattern);
 532     }
 533     Token processModifiers() throws ParseException {
 534                                                 // this.offset points the next of '?'.
 535                                                 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
 536         int add = 0, mask = 0, ch = -1;
 537         while (this.offset < this.regexlen) {
 538             ch = this.regex.charAt(this.offset);
 539             int v = REUtil.getOptionValue(ch);
 540             if (v == 0)  break;                 // '-' or ':'?
 541             add |= v;
 542             this.offset ++;
 543         }
 544         if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
 545         if (ch == '-') {
 546             this.offset ++;
 547             while (this.offset < this.regexlen) {
 548                 ch = this.regex.charAt(this.offset);
 549                 int v = REUtil.getOptionValue(ch);
 550                 if (v == 0)  break;             // ':'?
 551                 mask |= v;
 552                 this.offset ++;
 553             }
 554             if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
 555         }
 556         Token tok;
 557         if (ch == ':') {
 558             this.offset ++;
 559             this.next();
 560             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
 561             if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 562             this.next();
 563         } else if (ch == ')') {                 // such as (?-i)
 564             this.offset ++;
 565             this.next();
 566             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
 567         } else
 568             throw ex("parser.factor.3", this.offset);
 569 
 570         return tok;
 571     }
 572     Token processIndependent() throws ParseException {
 573         this.next();
 574         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
 575         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
 576         this.next();                            // Skips ')'
 577         return tok;
 578     }
 579     Token processBacksolidus_c() throws ParseException {
 580         int ch2;                                // Must be in 0x0040-0x005f
 581         if (this.offset >= this.regexlen
 582             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
 583             throw ex("parser.atom.1", this.offset-1);
 584         this.next();
 585         return Token.createChar(ch2-0x40);
 586     }
 587     Token processBacksolidus_C() throws ParseException {
 588         throw ex("parser.process.1", this.offset);
 589     }
 590     Token processBacksolidus_i() throws ParseException {
 591         Token tok = Token.createChar('i');
 592         this.next();
 593         return tok;
 594     }
 595     Token processBacksolidus_I() throws ParseException {
 596         throw ex("parser.process.1", this.offset);
 597     }
 598     Token processBacksolidus_g() throws ParseException {
 599         this.next();
 600         return Token.getGraphemePattern();
 601     }
 602     Token processBacksolidus_X() throws ParseException {
 603         this.next();
 604         return Token.getCombiningCharacterSequence();
 605     }
 606     Token processBackreference() throws ParseException {
 607         int refnum = this.chardata-'0';
 608         int finalRefnum = refnum;
 609 
 610         if (this.parennumber <= refnum)
 611             throw ex("parser.parse.2", this.offset-2);
 612 
 613         while  (this.offset < this.regexlen) {
 614             final int ch = this.regex.charAt(this.offset);
 615             if ('0' <= ch && ch <= '9') {
 616                 refnum = (refnum * 10) + (ch - '0');
 617                 if (refnum < this.parennumber) {
 618                     ++this.offset;
 619                     finalRefnum = refnum;
 620                     this.chardata = ch;
 621                 }
 622                 else {
 623                     break;
 624                 }
 625             }
 626             else {
 627                 break;
 628             }
 629         }
 630 
 631         Token tok = Token.createBackReference(finalRefnum);
 632         this.hasBackReferences = true;
 633         if (this.references == null)  this.references = new ArrayList<>();
 634         this.references.add(new ReferencePosition(finalRefnum, this.offset-2));
 635         this.next();
 636         return tok;
 637     }
 638 
 639     // ----------------------------------------------------------------
 640 
 641     /**
 642      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
 643      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
 644      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
 645      *            | '(?#' [^)]* ')'
 646      * minmax ::= '{' min (',' max?)? '}'
 647      * min ::= [0-9]+
 648      * max ::= [0-9]+
 649      */
 650     Token parseFactor() throws ParseException {
 651         int ch = this.read();
 652         Token tok;
 653         switch (ch) {
 654           case T_CARET:         return this.processCaret();
 655           case T_DOLLAR:        return this.processDollar();
 656           case T_LOOKAHEAD:     return this.processLookahead();
 657           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
 658           case T_LOOKBEHIND:    return this.processLookbehind();
 659           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
 660 
 661           case T_COMMENT:
 662             this.next();
 663             return Token.createEmpty();
 664 
 665           case T_BACKSOLIDUS:
 666             switch (this.chardata) {
 667               case 'A': return this.processBacksolidus_A();
 668               case 'Z': return this.processBacksolidus_Z();
 669               case 'z': return this.processBacksolidus_z();
 670               case 'b': return this.processBacksolidus_b();
 671               case 'B': return this.processBacksolidus_B();
 672               case '<': return this.processBacksolidus_lt();
 673               case '>': return this.processBacksolidus_gt();
 674             }
 675                                                 // through down
 676         }
 677         tok = this.parseAtom();
 678         ch = this.read();
 679         switch (ch) {
 680           case T_STAR:  return this.processStar(tok);
 681           case T_PLUS:  return this.processPlus(tok);
 682           case T_QUESTION: return this.processQuestion(tok);
 683           case T_CHAR:
 684             if (this.chardata == '{' && this.offset < this.regexlen) {
 685 
 686                 int off = this.offset;          // this.offset -> next of '{'
 687                 int min = 0, max = -1;
 688 
 689                 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
 690 
 691                     min = ch -'0';
 692                     while (off < this.regexlen
 693                            && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
 694                         min = min*10 +ch-'0';
 695                         if (min < 0)
 696                             throw ex("parser.quantifier.5", this.offset);
 697                     }
 698                 }
 699                 else {
 700                     throw ex("parser.quantifier.1", this.offset);
 701                 }
 702 
 703                 max = min;
 704                 if (ch == ',') {
 705 
 706                    if (off >= this.regexlen) {
 707                        throw ex("parser.quantifier.3", this.offset);
 708                    }
 709                    else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
 710 
 711                         max = ch -'0';       // {min,max}
 712                         while (off < this.regexlen
 713                                && (ch = this.regex.charAt(off++)) >= '0'
 714                                && ch <= '9') {
 715                             max = max*10 +ch-'0';
 716                             if (max < 0)
 717                                 throw ex("parser.quantifier.5", this.offset);
 718                         }
 719 
 720                         if (min > max)
 721                             throw ex("parser.quantifier.4", this.offset);
 722                    }
 723                    else { // assume {min,}
 724                         max = -1;
 725                     }
 726                 }
 727 
 728                if (ch != '}')
 729                    throw ex("parser.quantifier.2", this.offset);
 730 
 731                if (this.checkQuestion(off)) {  // off -> next of '}'
 732                     tok = Token.createNGClosure(tok);
 733                     this.offset = off+1;
 734                 } else {
 735                     tok = Token.createClosure(tok);
 736                     this.offset = off;
 737                 }
 738 
 739                 tok.setMin(min);
 740                 tok.setMax(max);
 741                 //System.err.println("CLOSURE: "+min+", "+max);
 742                 this.next();
 743             }
 744         }
 745         return tok;
 746     }
 747 
 748     /**
 749      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
 750      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
 751      *          | '(?>' regex ')'
 752      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
 753      */
 754     Token parseAtom() throws ParseException {
 755         int ch = this.read();
 756         Token tok = null;
 757         switch (ch) {
 758           case T_LPAREN:        return this.processParen();
 759           case T_LPAREN2:       return this.processParen2(); // '(?:'
 760           case T_CONDITION:     return this.processCondition(); // '(?('
 761           case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
 762           case T_INDEPENDENT:   return this.processIndependent();
 763           case T_DOT:
 764             this.next();                    // Skips '.'
 765             tok = Token.token_dot;
 766             break;
 767 
 768             /**
 769              * char-class ::= '[' ( '^'? range ','?)+ ']'
 770              * range ::= '\d' | '\w' | '\s' | category-block | range-char
 771              *           | range-char '-' range-char
 772              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
 773              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
 774              */
 775           case T_LBRACKET:      return this.parseCharacterClass(true);
 776           case T_SET_OPERATIONS: return this.parseSetOperations();
 777 
 778           case T_BACKSOLIDUS:
 779             switch (this.chardata) {
 780               case 'd':  case 'D':
 781               case 'w':  case 'W':
 782               case 's':  case 'S':
 783                 tok = this.getTokenForShorthand(this.chardata);
 784                 this.next();
 785                 return tok;
 786 
 787               case 'e':  case 'f':  case 'n':  case 'r':
 788               case 't':  case 'u':  case 'v':  case 'x':
 789                 {
 790                     int ch2 = this.decodeEscaped();
 791                     if (ch2 < 0x10000) {
 792                         tok = Token.createChar(ch2);
 793                     } else {
 794                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
 795                     }
 796                 }
 797                 break;
 798 
 799               case 'c': return this.processBacksolidus_c();
 800               case 'C': return this.processBacksolidus_C();
 801               case 'i': return this.processBacksolidus_i();
 802               case 'I': return this.processBacksolidus_I();
 803               case 'g': return this.processBacksolidus_g();
 804               case 'X': return this.processBacksolidus_X();
 805               case '1':  case '2':  case '3':  case '4':
 806               case '5':  case '6':  case '7':  case '8':  case '9':
 807                 return this.processBackreference();
 808 
 809               case 'P':
 810               case 'p':
 811                 int pstart = this.offset;
 812                 tok = processBacksolidus_pP(this.chardata);
 813                 if (tok == null)  throw this.ex("parser.atom.5", pstart);
 814                 break;
 815 
 816               default:
 817                 tok = Token.createChar(this.chardata);
 818             }
 819             this.next();
 820             break;
 821 
 822           case T_CHAR:
 823             if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
 824                 throw this.ex("parser.atom.4", this.offset-1);
 825             tok = Token.createChar(this.chardata);
 826             int high = this.chardata;
 827             this.next();
 828             if (REUtil.isHighSurrogate(high)
 829                 && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
 830                 char[] sur = new char[2];
 831                 sur[0] = (char)high;
 832                 sur[1] = (char)this.chardata;
 833                 tok = Token.createParen(Token.createString(new String(sur)), 0);
 834                 this.next();
 835             }
 836             break;
 837 
 838           default:
 839             throw this.ex("parser.atom.4", this.offset-1);
 840         }
 841         return tok;
 842     }
 843 
 844     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
 845 
 846         this.next();
 847         if (this.read() != T_CHAR || this.chardata != '{')
 848             throw this.ex("parser.atom.2", this.offset-1);
 849 
 850         // handle category escape
 851         boolean positive = c == 'p';
 852         int namestart = this.offset;
 853         int nameend = this.regex.indexOf('}', namestart);
 854 
 855         if (nameend < 0)
 856             throw this.ex("parser.atom.3", this.offset);
 857 
 858         String pname = this.regex.substring(namestart, nameend);
 859         this.offset = nameend+1;
 860 
 861         return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
 862     }
 863 
 864     int processCIinCharacterClass(RangeToken tok, int c) {
 865         return this.decodeEscaped();
 866     }
 867 
 868     /**
 869      * char-class ::= '[' ( '^'? range ','?)+ ']'
 870      * range ::= '\d' | '\w' | '\s' | category-block | range-char
 871      *           | range-char '-' range-char
 872      * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
 873      * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
 874      */
 875     protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
 876         this.setContext(S_INBRACKETS);
 877         this.next();                            // '['
 878         boolean nrange = false;
 879         RangeToken base = null;
 880         RangeToken tok;
 881         if (this.read() == T_CHAR && this.chardata == '^') {
 882             nrange = true;
 883             this.next();                        // '^'
 884             if (useNrange) {
 885                 tok = Token.createNRange();
 886             } else {
 887                 base = Token.createRange();
 888                 base.addRange(0, Token.UTF16_MAX);
 889                 tok = Token.createRange();
 890             }
 891         } else {
 892             tok = Token.createRange();
 893         }
 894         int type;
 895         boolean firstloop = true;
 896         while ((type = this.read()) != T_EOF) {
 897             if (type == T_CHAR && this.chardata == ']' && !firstloop)
 898                 break;
 899             int c = this.chardata;
 900             boolean end = false;
 901             if (type == T_BACKSOLIDUS) {
 902                 switch (c) {
 903                   case 'd':  case 'D':
 904                   case 'w':  case 'W':
 905                   case 's':  case 'S':
 906                     tok.mergeRanges(this.getTokenForShorthand(c));
 907                     end = true;
 908                     break;
 909 
 910                   case 'i':  case 'I':
 911                   case 'c':  case 'C':
 912                     c = this.processCIinCharacterClass(tok, c);
 913                     if (c < 0)  end = true;
 914                     break;
 915 
 916                   case 'p':
 917                   case 'P':
 918                     int pstart = this.offset;
 919                     RangeToken tok2 = this.processBacksolidus_pP(c);
 920                     if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
 921                     tok.mergeRanges(tok2);
 922                     end = true;
 923                     break;
 924 
 925                   default:
 926                     c = this.decodeEscaped();
 927                 } // \ + c
 928             } // backsolidus
 929                                                 // POSIX Character class such as [:alnum:]
 930             else if (type == T_POSIX_CHARCLASS_START) {
 931                 int nameend = this.regex.indexOf(':', this.offset);
 932                 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
 933                 boolean positive = true;
 934                 if (this.regex.charAt(this.offset) == '^') {
 935                     this.offset ++;
 936                     positive = false;
 937                 }
 938                 String name = this.regex.substring(this.offset, nameend);
 939                 RangeToken range = Token.getRange(name, positive,
 940                                                   this.isSet(RegularExpression.XMLSCHEMA_MODE));
 941                 if (range == null)  throw this.ex("parser.cc.3", this.offset);
 942                 tok.mergeRanges(range);
 943                 end = true;
 944                 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
 945                     throw this.ex("parser.cc.1", nameend);
 946                 this.offset = nameend+2;
 947             }
 948             else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
 949                 if (nrange) {
 950                     nrange = false;
 951                     if (useNrange) {
 952                         tok = (RangeToken) Token.complementRanges(tok);
 953                     }
 954                     else {
 955                         base.subtractRanges(tok);
 956                         tok = base;
 957                     }
 958                 }
 959                 RangeToken range2 = this.parseCharacterClass(false);
 960                 tok.subtractRanges(range2);
 961                 if (this.read() != T_CHAR || this.chardata != ']') {
 962                     throw this.ex("parser.cc.5", this.offset);
 963                 }
 964                 break;                          // Exit this loop
 965             }
 966             this.next();
 967             if (!end) {                         // if not shorthands...
 968                 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
 969                     if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
 970                         tok.addRange(c, c);
 971                     }
 972                     else {
 973                         addCaseInsensitiveChar(tok, c);
 974                     }
 975                 }
 976                 else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
 977                     throw this.ex("parser.cc.8", this.offset-1);
 978                 }
 979                 else {
 980                     this.next(); // Skips '-'
 981                     if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
 982                     if (type == T_CHAR && this.chardata == ']') {
 983                         if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
 984                             tok.addRange(c, c);
 985                         }
 986                         else {
 987                             addCaseInsensitiveChar(tok, c);
 988                         }
 989                         tok.addRange('-', '-');
 990                     } else {
 991                         int rangeend = this.chardata;
 992                         if (type == T_BACKSOLIDUS) {
 993                             rangeend = this.decodeEscaped();
 994                         }
 995                         this.next();
 996                         if (c > rangeend) {
 997                             throw this.ex("parser.ope.3", this.offset-1);
 998                         }
 999                         if (!this.isSet(RegularExpression.IGNORE_CASE) ||
1000                                 (c > 0xffff && rangeend > 0xffff)) {
1001                             tok.addRange(c, rangeend);
1002                         }
1003                         else {
1004                             addCaseInsensitiveCharRange(tok, c, rangeend);
1005                         }
1006                     }
1007                 }
1008             }
1009             if (this.isSet(RegularExpression.SPECIAL_COMMA)
1010                 && this.read() == T_CHAR && this.chardata == ',') {
1011                 this.next();
1012             }
1013             firstloop = false;
1014         }
1015         if (this.read() == T_EOF) {
1016             throw this.ex("parser.cc.2", this.offset);
1017         }
1018 
1019         if (!useNrange && nrange) {
1020             base.subtractRanges(tok);
1021             tok = base;
1022         }
1023         tok.sortRanges();
1024         tok.compactRanges();
1025         this.setContext(S_NORMAL);
1026         this.next();                    // Skips ']'
1027 
1028         return tok;
1029     }
1030 
1031     /**
1032      * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
1033      */
1034     protected RangeToken parseSetOperations() throws ParseException {
1035         RangeToken tok = this.parseCharacterClass(false);
1036         int type;
1037         while ((type = this.read()) != T_RPAREN) {
1038             int ch = this.chardata;
1039             if (type == T_CHAR && (ch == '-' || ch == '&')
1040                 || type == T_PLUS) {
1041                 this.next();
1042                 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
1043                 RangeToken t2 = this.parseCharacterClass(false);
1044                 if (type == T_PLUS)
1045                     tok.mergeRanges(t2);
1046                 else if (ch == '-')
1047                     tok.subtractRanges(t2);
1048                 else if (ch == '&')
1049                     tok.intersectRanges(t2);
1050                 else
1051                     throw new RuntimeException("ASSERT");
1052             } else {
1053                 throw ex("parser.ope.2", this.offset-1);
1054             }
1055         }
1056         this.next();
1057         return tok;
1058     }
1059 
1060     Token getTokenForShorthand(int ch) {
1061         Token tok;
1062         switch (ch) {
1063           case 'd':
1064             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1065                 ? Token.getRange("Nd", true) : Token.token_0to9;
1066             break;
1067           case 'D':
1068             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1069                 ? Token.getRange("Nd", false) : Token.token_not_0to9;
1070             break;
1071           case 'w':
1072             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1073                 ? Token.getRange("IsWord", true) : Token.token_wordchars;
1074             break;
1075           case 'W':
1076             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1077                 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
1078             break;
1079           case 's':
1080             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1081                 ? Token.getRange("IsSpace", true) : Token.token_spaces;
1082             break;
1083           case 'S':
1084             tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
1085                 ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
1086             break;
1087 
1088           default:
1089             throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
1090         }
1091         return tok;
1092     }
1093 
1094     /**
1095      */
1096     int decodeEscaped() throws ParseException {
1097         if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
1098         int c = this.chardata;
1099         switch (c) {
1100           case 'e':  c = 0x1b;  break; // ESCAPE U+001B
1101           case 'f':  c = '\f';  break; // FORM FEED U+000C
1102           case 'n':  c = '\n';  break; // LINE FEED U+000A
1103           case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
1104           case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
1105           //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
1106           case 'x':
1107             this.next();
1108             if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1109             if (this.chardata == '{') {
1110                 int v1 = 0;
1111                 int uv = 0;
1112                 do {
1113                     this.next();
1114                     if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
1115                     if ((v1 = hexChar(this.chardata)) < 0)
1116                         break;
1117                     if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
1118                     uv = uv*16+v1;
1119                 } while (true);
1120                 if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
1121                 if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
1122                 c = uv;
1123             } else {
1124                 int v1 = 0;
1125                 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1126                     throw ex("parser.descape.1", this.offset-1);
1127                 int uv = v1;
1128                 this.next();
1129                 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1130                     throw ex("parser.descape.1", this.offset-1);
1131                 uv = uv*16+v1;
1132                 c = uv;
1133             }
1134             break;
1135 
1136           case 'u':
1137             int v1 = 0;
1138             this.next();
1139             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1140                 throw ex("parser.descape.1", this.offset-1);
1141             int uv = v1;
1142             this.next();
1143             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1144                 throw ex("parser.descape.1", this.offset-1);
1145             uv = uv*16+v1;
1146             this.next();
1147             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1148                 throw ex("parser.descape.1", this.offset-1);
1149             uv = uv*16+v1;
1150             this.next();
1151             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1152                 throw ex("parser.descape.1", this.offset-1);
1153             uv = uv*16+v1;
1154             c = uv;
1155             break;
1156 
1157           case 'v':
1158             this.next();
1159             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1160                 throw ex("parser.descape.1", this.offset-1);
1161             uv = v1;
1162             this.next();
1163             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1164                 throw ex("parser.descape.1", this.offset-1);
1165             uv = uv*16+v1;
1166             this.next();
1167             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1168                 throw ex("parser.descape.1", this.offset-1);
1169             uv = uv*16+v1;
1170             this.next();
1171             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1172                 throw ex("parser.descape.1", this.offset-1);
1173             uv = uv*16+v1;
1174             this.next();
1175             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1176                 throw ex("parser.descape.1", this.offset-1);
1177             uv = uv*16+v1;
1178             this.next();
1179             if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1180                 throw ex("parser.descape.1", this.offset-1);
1181             uv = uv*16+v1;
1182             if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
1183             c = uv;
1184             break;
1185           case 'A':
1186           case 'Z':
1187           case 'z':
1188             throw ex("parser.descape.5", this.offset-2);
1189           default:
1190         }
1191         return c;
1192     }
1193 
1194     static private final int hexChar(int ch) {
1195         if (ch < '0')  return -1;
1196         if (ch > 'f')  return -1;
1197         if (ch <= '9')  return ch-'0';
1198         if (ch < 'A')  return -1;
1199         if (ch <= 'F')  return ch-'A'+10;
1200         if (ch < 'a')  return -1;
1201         return ch-'a'+10;
1202     }
1203 
1204     static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
1205         final int[] caseMap = CaseInsensitiveMap.get(c);
1206         tok.addRange(c, c);
1207 
1208         if (caseMap != null) {
1209             for (int i=0; i<caseMap.length; i+=2) {
1210                 tok.addRange(caseMap[i], caseMap[i]);
1211             }
1212         }
1213 
1214     }
1215 
1216     static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
1217         int[] caseMap;
1218         int r1, r2;
1219         if (start <= end) {
1220             r1 = start;
1221             r2 = end;
1222         } else {
1223             r1 = end;
1224             r2 = start;
1225         }
1226 
1227         tok.addRange(r1, r2);
1228         for (int ch = r1;  ch <= r2;  ch++) {
1229             caseMap = CaseInsensitiveMap.get(ch);
1230             if (caseMap != null) {
1231                 for (int i=0; i<caseMap.length; i+=2) {
1232                     tok.addRange(caseMap[i], caseMap[i]);
1233                 }
1234             }
1235         }
1236     }
1237 }