1 /*
   2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
  22 
  23 import java.io.IOException;
  24 import java.io.ObjectInputStream;
  25 import java.io.ObjectOutputStream;
  26 import java.io.ObjectStreamField;
  27 import java.util.ArrayList;
  28 import java.util.Collections;
  29 import java.util.HashMap;
  30 import java.util.HashSet;
  31 import java.util.List;
  32 import java.util.Map;
  33 import java.util.Set;
  34 import java.util.Vector;
  35 
  36 /**
  37  * This class represents a node in parse tree.
  38  *
  39  * @xerces.internal
  40  *
  41  * @version $Id: Token.java,v 1.7 2010/07/27 05:02:34 joehw Exp $
  42  */
  43 class Token implements java.io.Serializable {
  44 
  45     private static final long serialVersionUID = 8484976002585487481L;
  46 
  47     static final boolean COUNTTOKENS = true;
  48     static int tokens = 0;
  49 
  50     static final int CHAR = 0;                  // Literal char
  51     static final int DOT = 11;                  // .
  52     static final int CONCAT = 1;                // XY
  53     static final int UNION = 2;                 // X|Y|Z
  54     static final int CLOSURE = 3;               // X*
  55     static final int RANGE = 4;                 // [a-zA-Z] etc.
  56     static final int NRANGE = 5;                // [^a-zA-Z] etc.
  57     static final int PAREN = 6;                 // (X) or (?:X)
  58     static final int EMPTY = 7;                 //
  59     static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
  60     static final int NONGREEDYCLOSURE = 9;      // *? +?
  61     static final int STRING = 10;               // strings
  62     static final int BACKREFERENCE = 12;        // back references
  63     static final int LOOKAHEAD = 20;            // (?=...)
  64     static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
  65     static final int LOOKBEHIND = 22;           // (?<=...)
  66     static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
  67     static final int INDEPENDENT = 24;          // (?>...)
  68     static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
  69     static final int CONDITION = 26;            // (?(...)yes|no)
  70 
  71     static final int UTF16_MAX = 0x10ffff;
  72 
  73     final int type;
  74 
  75     static Token token_dot;
  76     static Token token_0to9;
  77     static Token token_wordchars;
  78     static Token token_not_0to9;
  79     static Token token_not_wordchars;
  80     static Token token_spaces;
  81     static Token token_not_spaces;
  82     static Token token_empty;
  83     static Token token_linebeginning;
  84     static Token token_linebeginning2;
  85     static Token token_lineend;
  86     static Token token_stringbeginning;
  87     static Token token_stringend;
  88     static Token token_stringend2;
  89     static Token token_wordedge;
  90     static Token token_not_wordedge;
  91     static Token token_wordbeginning;
  92     static Token token_wordend;
  93     static {
  94         Token.token_empty = new Token(Token.EMPTY);
  95 
  96         Token.token_linebeginning = Token.createAnchor('^');
  97         Token.token_linebeginning2 = Token.createAnchor('@');
  98         Token.token_lineend = Token.createAnchor('$');
  99         Token.token_stringbeginning = Token.createAnchor('A');
 100         Token.token_stringend = Token.createAnchor('z');
 101         Token.token_stringend2 = Token.createAnchor('Z');
 102         Token.token_wordedge = Token.createAnchor('b');
 103         Token.token_not_wordedge = Token.createAnchor('B');
 104         Token.token_wordbeginning = Token.createAnchor('<');
 105         Token.token_wordend = Token.createAnchor('>');
 106 
 107         Token.token_dot = new Token(Token.DOT);
 108 
 109         Token.token_0to9 = Token.createRange();
 110         Token.token_0to9.addRange('0', '9');
 111         Token.token_wordchars = Token.createRange();
 112         Token.token_wordchars.addRange('0', '9');
 113         Token.token_wordchars.addRange('A', 'Z');
 114         Token.token_wordchars.addRange('_', '_');
 115         Token.token_wordchars.addRange('a', 'z');
 116         Token.token_spaces = Token.createRange();
 117         Token.token_spaces.addRange('\t', '\t');
 118         Token.token_spaces.addRange('\n', '\n');
 119         Token.token_spaces.addRange('\f', '\f');
 120         Token.token_spaces.addRange('\r', '\r');
 121         Token.token_spaces.addRange(' ', ' ');
 122 
 123         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
 124         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
 125         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
 126     }
 127 
 128     static Token.ParenToken createLook(int type, Token child) {
 129         if (COUNTTOKENS)  Token.tokens ++;
 130         return new Token.ParenToken(type, child, 0);
 131     }
 132     static Token.ParenToken createParen(Token child, int pnumber) {
 133         if (COUNTTOKENS)  Token.tokens ++;
 134         return new Token.ParenToken(Token.PAREN, child, pnumber);
 135     }
 136     static Token.ClosureToken createClosure(Token tok) {
 137         if (COUNTTOKENS)  Token.tokens ++;
 138         return new Token.ClosureToken(Token.CLOSURE, tok);
 139     }
 140     static Token.ClosureToken createNGClosure(Token tok) {
 141         if (COUNTTOKENS)  Token.tokens ++;
 142         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
 143     }
 144     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
 145         if (COUNTTOKENS)  Token.tokens ++;
 146         return new Token.ConcatToken(tok1, tok2);
 147     }
 148     static Token.UnionToken createConcat() {
 149         if (COUNTTOKENS)  Token.tokens ++;
 150         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
 151     }
 152     static Token.UnionToken createUnion() {
 153         if (COUNTTOKENS)  Token.tokens ++;
 154         return new Token.UnionToken(Token.UNION);
 155     }
 156     static Token createEmpty() {
 157         return Token.token_empty;
 158     }
 159     static RangeToken createRange() {
 160         if (COUNTTOKENS)  Token.tokens ++;
 161         return new RangeToken(Token.RANGE);
 162     }
 163     static RangeToken createNRange() {
 164         if (COUNTTOKENS)  Token.tokens ++;
 165         return new RangeToken(Token.NRANGE);
 166     }
 167     static Token.CharToken createChar(int ch) {
 168         if (COUNTTOKENS)  Token.tokens ++;
 169         return new Token.CharToken(Token.CHAR, ch);
 170     }
 171     static private Token.CharToken createAnchor(int ch) {
 172         if (COUNTTOKENS)  Token.tokens ++;
 173         return new Token.CharToken(Token.ANCHOR, ch);
 174     }
 175     static Token.StringToken createBackReference(int refno) {
 176         if (COUNTTOKENS)  Token.tokens ++;
 177         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
 178     }
 179     static Token.StringToken createString(String str) {
 180         if (COUNTTOKENS)  Token.tokens ++;
 181         return new Token.StringToken(Token.STRING, str, 0);
 182     }
 183     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
 184         if (COUNTTOKENS)  Token.tokens ++;
 185         return new Token.ModifierToken(child, add, mask);
 186     }
 187     static Token.ConditionToken createCondition(int refno, Token condition,
 188                                                 Token yespat, Token nopat) {
 189         if (COUNTTOKENS)  Token.tokens ++;
 190         return new Token.ConditionToken(refno, condition, yespat, nopat);
 191     }
 192 
 193     protected Token(int type) {
 194         this.type = type;
 195     }
 196 
 197     /**
 198      * A number of children.
 199      */
 200     int size() {
 201         return 0;
 202     }
 203     Token getChild(int index) {
 204         return null;
 205     }
 206     void addChild(Token tok) {
 207         throw new RuntimeException("Not supported.");
 208     }
 209 
 210                                                 // for RANGE or NRANGE
 211     protected void addRange(int start, int end) {
 212         throw new RuntimeException("Not supported.");
 213     }
 214     protected void sortRanges() {
 215         throw new RuntimeException("Not supported.");
 216     }
 217     protected void compactRanges() {
 218         throw new RuntimeException("Not supported.");
 219     }
 220     protected void mergeRanges(Token tok) {
 221         throw new RuntimeException("Not supported.");
 222     }
 223     protected void subtractRanges(Token tok) {
 224         throw new RuntimeException("Not supported.");
 225     }
 226     protected void intersectRanges(Token tok) {
 227         throw new RuntimeException("Not supported.");
 228     }
 229     static Token complementRanges(Token tok) {
 230         return RangeToken.complementRanges(tok);
 231     }
 232 
 233 
 234     void setMin(int min) {                      // for CLOSURE
 235     }
 236     void setMax(int max) {                      // for CLOSURE
 237     }
 238     int getMin() {                              // for CLOSURE
 239         return -1;
 240     }
 241     int getMax() {                              // for CLOSURE
 242         return -1;
 243     }
 244     int getReferenceNumber() {                  // for STRING
 245         return 0;
 246     }
 247     String getString() {                        // for STRING
 248         return null;
 249     }
 250 
 251     int getParenNumber() {
 252         return 0;
 253     }
 254     int getChar() {
 255         return -1;
 256     }
 257 
 258     public String toString() {
 259         return this.toString(0);
 260     }
 261     public String toString(int options) {
 262         return this.type == Token.DOT ? "." : "";
 263     }
 264 
 265     /**
 266      * How many characters are needed?
 267      */
 268     final int getMinLength() {
 269         switch (this.type) {
 270           case CONCAT:
 271             int sum = 0;
 272             for (int i = 0;  i < this.size();  i ++)
 273                 sum += this.getChild(i).getMinLength();
 274             return sum;
 275 
 276           case CONDITION:
 277           case UNION:
 278             if (this.size() == 0)
 279                 return 0;
 280             int ret = this.getChild(0).getMinLength();
 281             for (int i = 1;  i < this.size();  i ++) {
 282                 int min = this.getChild(i).getMinLength();
 283                 if (min < ret)  ret = min;
 284             }
 285             return ret;
 286 
 287           case CLOSURE:
 288           case NONGREEDYCLOSURE:
 289             if (this.getMin() >= 0)
 290                 return this.getMin() * this.getChild(0).getMinLength();
 291             return 0;
 292 
 293           case EMPTY:
 294           case ANCHOR:
 295             return 0;
 296 
 297           case DOT:
 298           case CHAR:
 299           case RANGE:
 300           case NRANGE:
 301             return 1;
 302 
 303           case INDEPENDENT:
 304           case PAREN:
 305           case MODIFIERGROUP:
 306             return this.getChild(0).getMinLength();
 307 
 308           case BACKREFERENCE:
 309             return 0;                           // *******
 310 
 311           case STRING:
 312             return this.getString().length();
 313 
 314           case LOOKAHEAD:
 315           case NEGATIVELOOKAHEAD:
 316           case LOOKBEHIND:
 317           case NEGATIVELOOKBEHIND:
 318             return 0;                           // ***** Really?
 319 
 320           default:
 321             throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
 322         }
 323     }
 324 
 325     final int getMaxLength() {
 326         switch (this.type) {
 327           case CONCAT:
 328             int sum = 0;
 329             for (int i = 0;  i < this.size();  i ++) {
 330                 int d = this.getChild(i).getMaxLength();
 331                 if (d < 0)  return -1;
 332                 sum += d;
 333             }
 334             return sum;
 335 
 336           case CONDITION:
 337           case UNION:
 338             if (this.size() == 0)
 339                 return 0;
 340             int ret = this.getChild(0).getMaxLength();
 341             for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
 342                 int max = this.getChild(i).getMaxLength();
 343                 if (max < 0) {                  // infinity
 344                     ret = -1;
 345                     break;
 346                 }
 347                 if (max > ret)  ret = max;
 348             }
 349             return ret;
 350 
 351           case CLOSURE:
 352           case NONGREEDYCLOSURE:
 353             if (this.getMax() >= 0)
 354                                                 // When this.child.getMaxLength() < 0,
 355                                                 // this returns minus value
 356                 return this.getMax() * this.getChild(0).getMaxLength();
 357             return -1;
 358 
 359           case EMPTY:
 360           case ANCHOR:
 361             return 0;
 362 
 363           case CHAR:
 364             return 1;
 365           case DOT:
 366           case RANGE:
 367           case NRANGE:
 368             return 2;
 369 
 370           case INDEPENDENT:
 371           case PAREN:
 372           case MODIFIERGROUP:
 373             return this.getChild(0).getMaxLength();
 374 
 375           case BACKREFERENCE:
 376             return -1;                          // ******
 377 
 378           case STRING:
 379             return this.getString().length();
 380 
 381           case LOOKAHEAD:
 382           case NEGATIVELOOKAHEAD:
 383           case LOOKBEHIND:
 384           case NEGATIVELOOKBEHIND:
 385             return 0;                           // ***** Really?
 386 
 387           default:
 388             throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
 389         }
 390     }
 391 
 392     static final int FC_CONTINUE = 0;
 393     static final int FC_TERMINAL = 1;
 394     static final int FC_ANY = 2;
 395     private static final boolean isSet(int options, int flag) {
 396         return (options & flag) == flag;
 397     }
 398     final int analyzeFirstCharacter(RangeToken result, int options) {
 399         switch (this.type) {
 400           case CONCAT:
 401             int ret = FC_CONTINUE;
 402             for (int i = 0;  i < this.size();  i ++)
 403                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
 404                     break;
 405             return ret;
 406 
 407           case UNION:
 408             if (this.size() == 0)
 409                 return FC_CONTINUE;
 410             /*
 411              *  a|b|c -> FC_TERMINAL
 412              *  a|.|c -> FC_ANY
 413              *  a|b|  -> FC_CONTINUE
 414              */
 415             int ret2 = FC_CONTINUE;
 416             boolean hasEmpty = false;
 417             for (int i = 0;  i < this.size();  i ++) {
 418                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
 419                 if (ret2 == FC_ANY)
 420                     break;
 421                 else if (ret2 == FC_CONTINUE)
 422                     hasEmpty = true;
 423             }
 424             return hasEmpty ? FC_CONTINUE : ret2;
 425 
 426           case CONDITION:
 427             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
 428             if (this.size() == 1)  return FC_CONTINUE;
 429             if (ret3 == FC_ANY)  return ret3;
 430             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
 431             if (ret4 == FC_ANY)  return ret4;
 432             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
 433 
 434           case CLOSURE:
 435           case NONGREEDYCLOSURE:
 436             this.getChild(0).analyzeFirstCharacter(result, options);
 437             return FC_CONTINUE;
 438 
 439           case EMPTY:
 440           case ANCHOR:
 441             return FC_CONTINUE;
 442 
 443           case CHAR:
 444             int ch = this.getChar();
 445             result.addRange(ch, ch);
 446             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
 447                 ch = Character.toUpperCase((char)ch);
 448                 result.addRange(ch, ch);
 449                 ch = Character.toLowerCase((char)ch);
 450                 result.addRange(ch, ch);
 451             }
 452             return FC_TERMINAL;
 453 
 454           case DOT:
 455               return FC_ANY;
 456 
 457           case RANGE:
 458             result.mergeRanges(this);
 459             return FC_TERMINAL;
 460 
 461           case NRANGE:                          // ****
 462             result.mergeRanges(Token.complementRanges(this));
 463             return FC_TERMINAL;
 464 
 465           case INDEPENDENT:
 466           case PAREN:
 467             return this.getChild(0).analyzeFirstCharacter(result, options);
 468 
 469           case MODIFIERGROUP:
 470             options |= ((ModifierToken)this).getOptions();
 471             options &= ~((ModifierToken)this).getOptionsMask();
 472             return this.getChild(0).analyzeFirstCharacter(result, options);
 473 
 474           case BACKREFERENCE:
 475             result.addRange(0, UTF16_MAX);  // **** We can not optimize.
 476             return FC_ANY;
 477 
 478           case STRING:
 479             int cha = this.getString().charAt(0);
 480             int ch2;
 481             if (REUtil.isHighSurrogate(cha)
 482                 && this.getString().length() >= 2
 483                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
 484                 cha = REUtil.composeFromSurrogates(cha, ch2);
 485             result.addRange(cha, cha);
 486             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
 487                 cha = Character.toUpperCase((char)cha);
 488                 result.addRange(cha, cha);
 489                 cha = Character.toLowerCase((char)cha);
 490                 result.addRange(cha, cha);
 491             }
 492             return FC_TERMINAL;
 493 
 494           case LOOKAHEAD:
 495           case NEGATIVELOOKAHEAD:
 496           case LOOKBEHIND:
 497           case NEGATIVELOOKBEHIND:
 498             return FC_CONTINUE;
 499 
 500           default:
 501             throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
 502         }
 503     }
 504 
 505     private final boolean isShorterThan(Token tok) {
 506         if (tok == null)  return false;
 507         /*
 508         int mylength;
 509         if (this.type == STRING)  mylength = this.getString().length();
 510         else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
 511         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
 512         int otherlength;
 513         if (tok.type == STRING)  otherlength = tok.getString().length();
 514         else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
 515         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
 516         */
 517         int mylength;
 518         if (this.type == STRING)  mylength = this.getString().length();
 519         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
 520         int otherlength;
 521         if (tok.type == STRING)  otherlength = tok.getString().length();
 522         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
 523         return mylength < otherlength;
 524     }
 525 
 526     static class FixedStringContainer {
 527         Token token = null;
 528         int options = 0;
 529         FixedStringContainer() {
 530         }
 531     }
 532 
 533     final void findFixedString(FixedStringContainer container, int options) {
 534         switch (this.type) {
 535           case CONCAT:
 536             Token prevToken = null;
 537             int prevOptions = 0;
 538             for (int i = 0;  i < this.size();  i ++) {
 539                 this.getChild(i).findFixedString(container, options);
 540                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
 541                     prevToken = container.token;
 542                     prevOptions = container.options;
 543                 }
 544             }
 545             container.token = prevToken;
 546             container.options = prevOptions;
 547             return;
 548 
 549           case UNION:
 550           case CLOSURE:
 551           case NONGREEDYCLOSURE:
 552           case EMPTY:
 553           case ANCHOR:
 554           case RANGE:
 555           case DOT:
 556           case NRANGE:
 557           case BACKREFERENCE:
 558           case LOOKAHEAD:
 559           case NEGATIVELOOKAHEAD:
 560           case LOOKBEHIND:
 561           case NEGATIVELOOKBEHIND:
 562           case CONDITION:
 563             container.token = null;
 564             return;
 565 
 566           case CHAR:                            // Ignore CHAR tokens.
 567             container.token = null;             // **
 568             return;                             // **
 569 
 570           case STRING:
 571             container.token = this;
 572             container.options = options;
 573             return;
 574 
 575           case INDEPENDENT:
 576           case PAREN:
 577             this.getChild(0).findFixedString(container, options);
 578             return;
 579 
 580           case MODIFIERGROUP:
 581             options |= ((ModifierToken)this).getOptions();
 582             options &= ~((ModifierToken)this).getOptionsMask();
 583             this.getChild(0).findFixedString(container, options);
 584             return;
 585 
 586           default:
 587             throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
 588         }
 589     }
 590 
 591     boolean match(int ch) {
 592         throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
 593     }
 594 
 595     // ------------------------------------------------------
 596     private final static Map<String, Token> categories = new HashMap<>();
 597     private final static Map<String, Token> categories2 = new HashMap<>();
 598     private static final String[] categoryNames = {
 599         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
 600         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
 601         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
 602         "Pi", "Pf",  // 29, 30
 603         "L", "M", "N", "Z", "C", "P", "S",      // 31-37
 604     };
 605 
 606     // Schema Rec. {Datatypes} - Punctuation
 607     static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
 608     static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
 609     static final int CHAR_LETTER = 31;
 610     static final int CHAR_MARK = 32;
 611     static final int CHAR_NUMBER = 33;
 612     static final int CHAR_SEPARATOR = 34;
 613     static final int CHAR_OTHER = 35;
 614     static final int CHAR_PUNCTUATION = 36;
 615     static final int CHAR_SYMBOL = 37;
 616 
 617     //blockNames in UNICODE 3.1 that supported by XML Schema REC
 618     private static final String[] blockNames = {
 619         /*0000..007F;*/ "Basic Latin",
 620         /*0080..00FF;*/ "Latin-1 Supplement",
 621         /*0100..017F;*/ "Latin Extended-A",
 622         /*0180..024F;*/ "Latin Extended-B",
 623         /*0250..02AF;*/ "IPA Extensions",
 624         /*02B0..02FF;*/ "Spacing Modifier Letters",
 625         /*0300..036F;*/ "Combining Diacritical Marks",
 626         /*0370..03FF;*/ "Greek",
 627         /*0400..04FF;*/ "Cyrillic",
 628         /*0530..058F;*/ "Armenian",
 629         /*0590..05FF;*/ "Hebrew",
 630         /*0600..06FF;*/ "Arabic",
 631         /*0700..074F;*/ "Syriac",
 632         /*0780..07BF;*/ "Thaana",
 633         /*0900..097F;*/ "Devanagari",
 634         /*0980..09FF;*/ "Bengali",
 635         /*0A00..0A7F;*/ "Gurmukhi",
 636         /*0A80..0AFF;*/ "Gujarati",
 637         /*0B00..0B7F;*/ "Oriya",
 638         /*0B80..0BFF;*/ "Tamil",
 639         /*0C00..0C7F;*/ "Telugu",
 640         /*0C80..0CFF;*/ "Kannada",
 641         /*0D00..0D7F;*/ "Malayalam",
 642         /*0D80..0DFF;*/ "Sinhala",
 643         /*0E00..0E7F;*/ "Thai",
 644         /*0E80..0EFF;*/ "Lao",
 645         /*0F00..0FFF;*/ "Tibetan",
 646         /*1000..109F;*/ "Myanmar",
 647         /*10A0..10FF;*/ "Georgian",
 648         /*1100..11FF;*/ "Hangul Jamo",
 649         /*1200..137F;*/ "Ethiopic",
 650         /*13A0..13FF;*/ "Cherokee",
 651         /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
 652         /*1680..169F;*/ "Ogham",
 653         /*16A0..16FF;*/ "Runic",
 654         /*1780..17FF;*/ "Khmer",
 655         /*1800..18AF;*/ "Mongolian",
 656         /*1E00..1EFF;*/ "Latin Extended Additional",
 657         /*1F00..1FFF;*/ "Greek Extended",
 658         /*2000..206F;*/ "General Punctuation",
 659         /*2070..209F;*/ "Superscripts and Subscripts",
 660         /*20A0..20CF;*/ "Currency Symbols",
 661         /*20D0..20FF;*/ "Combining Marks for Symbols",
 662         /*2100..214F;*/ "Letterlike Symbols",
 663         /*2150..218F;*/ "Number Forms",
 664         /*2190..21FF;*/ "Arrows",
 665         /*2200..22FF;*/ "Mathematical Operators",
 666         /*2300..23FF;*/ "Miscellaneous Technical",
 667         /*2400..243F;*/ "Control Pictures",
 668         /*2440..245F;*/ "Optical Character Recognition",
 669         /*2460..24FF;*/ "Enclosed Alphanumerics",
 670         /*2500..257F;*/ "Box Drawing",
 671         /*2580..259F;*/ "Block Elements",
 672         /*25A0..25FF;*/ "Geometric Shapes",
 673         /*2600..26FF;*/ "Miscellaneous Symbols",
 674         /*2700..27BF;*/ "Dingbats",
 675         /*2800..28FF;*/ "Braille Patterns",
 676         /*2E80..2EFF;*/ "CJK Radicals Supplement",
 677         /*2F00..2FDF;*/ "Kangxi Radicals",
 678         /*2FF0..2FFF;*/ "Ideographic Description Characters",
 679         /*3000..303F;*/ "CJK Symbols and Punctuation",
 680         /*3040..309F;*/ "Hiragana",
 681         /*30A0..30FF;*/ "Katakana",
 682         /*3100..312F;*/ "Bopomofo",
 683         /*3130..318F;*/ "Hangul Compatibility Jamo",
 684         /*3190..319F;*/ "Kanbun",
 685         /*31A0..31BF;*/ "Bopomofo Extended",
 686         /*3200..32FF;*/ "Enclosed CJK Letters and Months",
 687         /*3300..33FF;*/ "CJK Compatibility",
 688         /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
 689         /*4E00..9FFF;*/ "CJK Unified Ideographs",
 690         /*A000..A48F;*/ "Yi Syllables",
 691         /*A490..A4CF;*/ "Yi Radicals",
 692         /*AC00..D7A3;*/ "Hangul Syllables",
 693         /*E000..F8FF;*/ "Private Use",
 694         /*F900..FAFF;*/ "CJK Compatibility Ideographs",
 695         /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
 696         /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
 697         /*FE20..FE2F;*/ "Combining Half Marks",
 698         /*FE30..FE4F;*/ "CJK Compatibility Forms",
 699         /*FE50..FE6F;*/ "Small Form Variants",
 700         /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
 701         /*FEFF..FEFF;*/ "Specials",
 702         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
 703          //missing Specials add manually
 704         /*10300..1032F;*/ "Old Italic",         // 84
 705         /*10330..1034F;*/ "Gothic",
 706         /*10400..1044F;*/ "Deseret",
 707         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
 708         /*1D100..1D1FF;*/ "Musical Symbols",
 709         /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
 710         /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
 711         /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
 712         /*E0000..E007F;*/ "Tags",
 713         //missing 2 private use add manually
 714 
 715     };
 716     //ADD THOSE MANUALLY
 717     //F0000..FFFFD; "Private Use",
 718     //100000..10FFFD; "Private Use"
 719     //FFF0..FFFD; "Specials",
 720     static final String blockRanges =
 721        "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
 722         +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
 723         +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
 724         +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
 725         +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
 726         +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
 727         +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
 728         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
 729         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
 730         +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
 731         +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
 732     static final int[] nonBMPBlockRanges = {
 733         0x10300, 0x1032F,       // 84
 734         0x10330, 0x1034F,
 735         0x10400, 0x1044F,
 736         0x1D000, 0x1D0FF,
 737         0x1D100, 0x1D1FF,
 738         0x1D400, 0x1D7FF,
 739         0x20000, 0x2A6D6,
 740         0x2F800, 0x2FA1F,
 741         0xE0000, 0xE007F
 742     };
 743     private static final int NONBMP_BLOCK_START = 84;
 744 
 745     static protected RangeToken getRange(String name, boolean positive) {
 746         if (Token.categories.size() == 0) {
 747             synchronized (Token.categories) {
 748                 Token[] ranges = new Token[Token.categoryNames.length];
 749                 for (int i = 0;  i < ranges.length;  i ++) {
 750                     ranges[i] = Token.createRange();
 751                 }
 752                 int type;
 753                 for (int i = 0;  i < 0x10000;  i ++) {
 754                     type = Character.getType((char)i);
 755                     if (type == Character.START_PUNCTUATION ||
 756                         type == Character.END_PUNCTUATION) {
 757                         //build table of Pi values
 758                         if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
 759                             i == 0x201F || i == 0x2039) {
 760                             type = CHAR_INIT_QUOTE;
 761                         }
 762                         //build table of Pf values
 763                         if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
 764                             type = CHAR_FINAL_QUOTE;
 765                         }
 766                     }
 767                     ranges[type].addRange(i, i);
 768                     switch (type) {
 769                       case Character.UPPERCASE_LETTER:
 770                       case Character.LOWERCASE_LETTER:
 771                       case Character.TITLECASE_LETTER:
 772                       case Character.MODIFIER_LETTER:
 773                       case Character.OTHER_LETTER:
 774                         type = CHAR_LETTER;
 775                         break;
 776                       case Character.NON_SPACING_MARK:
 777                       case Character.COMBINING_SPACING_MARK:
 778                       case Character.ENCLOSING_MARK:
 779                         type = CHAR_MARK;
 780                         break;
 781                       case Character.DECIMAL_DIGIT_NUMBER:
 782                       case Character.LETTER_NUMBER:
 783                       case Character.OTHER_NUMBER:
 784                         type = CHAR_NUMBER;
 785                         break;
 786                       case Character.SPACE_SEPARATOR:
 787                       case Character.LINE_SEPARATOR:
 788                       case Character.PARAGRAPH_SEPARATOR:
 789                         type = CHAR_SEPARATOR;
 790                         break;
 791                       case Character.CONTROL:
 792                       case Character.FORMAT:
 793                       case Character.SURROGATE:
 794                       case Character.PRIVATE_USE:
 795                       case Character.UNASSIGNED:
 796                         type = CHAR_OTHER;
 797                         break;
 798                       case Character.CONNECTOR_PUNCTUATION:
 799                       case Character.DASH_PUNCTUATION:
 800                       case Character.START_PUNCTUATION:
 801                       case Character.END_PUNCTUATION:
 802                       case CHAR_INIT_QUOTE:
 803                       case CHAR_FINAL_QUOTE:
 804                       case Character.OTHER_PUNCTUATION:
 805                         type = CHAR_PUNCTUATION;
 806                         break;
 807                       case Character.MATH_SYMBOL:
 808                       case Character.CURRENCY_SYMBOL:
 809                       case Character.MODIFIER_SYMBOL:
 810                       case Character.OTHER_SYMBOL:
 811                         type = CHAR_SYMBOL;
 812                         break;
 813                       default:
 814                         throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
 815                     }
 816                     ranges[type].addRange(i, i);
 817                 } // for all characters
 818                 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
 819 
 820                 for (int i = 0;  i < ranges.length;  i ++) {
 821                     if (Token.categoryNames[i] != null) {
 822                         if (i == Character.UNASSIGNED) { // Unassigned
 823                             ranges[i].addRange(0x10000, Token.UTF16_MAX);
 824                         }
 825                         Token.categories.put(Token.categoryNames[i], ranges[i]);
 826                         Token.categories2.put(Token.categoryNames[i],
 827                                               Token.complementRanges(ranges[i]));
 828                     }
 829                 }
 830                 //REVISIT: do we really need to support block names as in Unicode 3.1
 831                 //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
 832                 //
 833                 StringBuilder buffer = new StringBuilder(50);
 834                 for (int i = 0;  i < Token.blockNames.length;  i ++) {
 835                     Token r1 = Token.createRange();
 836                     int location;
 837                     if (i < NONBMP_BLOCK_START) {
 838                         location = i*2;
 839                         int rstart = Token.blockRanges.charAt(location);
 840                         int rend = Token.blockRanges.charAt(location+1);
 841                         //DEBUGING
 842                         //System.out.println(n+" " +Integer.toHexString(rstart)
 843                         //                     +"-"+ Integer.toHexString(rend));
 844                         r1.addRange(rstart, rend);
 845                     } else {
 846                         location = (i - NONBMP_BLOCK_START) * 2;
 847                         r1.addRange(Token.nonBMPBlockRanges[location],
 848                                     Token.nonBMPBlockRanges[location + 1]);
 849                     }
 850                     String n = Token.blockNames[i];
 851                     if (n.equals("Specials"))
 852                         r1.addRange(0xfff0, 0xfffd);
 853                     if (n.equals("Private Use")) {
 854                         r1.addRange(0xF0000,0xFFFFD);
 855                         r1.addRange(0x100000,0x10FFFD);
 856                     }
 857                     Token.categories.put(n, r1);
 858                     Token.categories2.put(n, Token.complementRanges(r1));
 859                     buffer.setLength(0);
 860                     buffer.append("Is");
 861                     if (n.indexOf(' ') >= 0) {
 862                         for (int ci = 0;  ci < n.length();  ci ++)
 863                             if (n.charAt(ci) != ' ')  buffer.append((char)n.charAt(ci));
 864                     }
 865                     else {
 866                         buffer.append(n);
 867                     }
 868                     Token.setAlias(buffer.toString(), n, true);
 869                 }
 870 
 871                 // TR#18 1.2
 872                 Token.setAlias("ASSIGNED", "Cn", false);
 873                 Token.setAlias("UNASSIGNED", "Cn", true);
 874                 Token all = Token.createRange();
 875                 all.addRange(0, Token.UTF16_MAX);
 876                 Token.categories.put("ALL", all);
 877                 Token.categories2.put("ALL", Token.complementRanges(all));
 878                 Token.registerNonXS("ASSIGNED");
 879                 Token.registerNonXS("UNASSIGNED");
 880                 Token.registerNonXS("ALL");
 881 
 882                 Token isalpha = Token.createRange();
 883                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
 884                 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
 885                 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
 886                 Token.categories.put("IsAlpha", isalpha);
 887                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
 888                 Token.registerNonXS("IsAlpha");
 889 
 890                 Token isalnum = Token.createRange();
 891                 isalnum.mergeRanges(isalpha);   // Lu Ll Lo
 892                 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
 893                 Token.categories.put("IsAlnum", isalnum);
 894                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
 895                 Token.registerNonXS("IsAlnum");
 896 
 897                 Token isspace = Token.createRange();
 898                 isspace.mergeRanges(Token.token_spaces);
 899                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
 900                 Token.categories.put("IsSpace", isspace);
 901                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
 902                 Token.registerNonXS("IsSpace");
 903 
 904                 Token isword = Token.createRange();
 905                 isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
 906                 isword.addRange('_', '_');
 907                 Token.categories.put("IsWord", isword);
 908                 Token.categories2.put("IsWord", Token.complementRanges(isword));
 909                 Token.registerNonXS("IsWord");
 910 
 911                 Token isascii = Token.createRange();
 912                 isascii.addRange(0, 127);
 913                 Token.categories.put("IsASCII", isascii);
 914                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
 915                 Token.registerNonXS("IsASCII");
 916 
 917                 Token isnotgraph = Token.createRange();
 918                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
 919                 isnotgraph.addRange(' ', ' ');
 920                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
 921                 Token.categories2.put("IsGraph", isnotgraph);
 922                 Token.registerNonXS("IsGraph");
 923 
 924                 Token isxdigit = Token.createRange();
 925                 isxdigit.addRange('0', '9');
 926                 isxdigit.addRange('A', 'F');
 927                 isxdigit.addRange('a', 'f');
 928                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
 929                 Token.categories2.put("IsXDigit", isxdigit);
 930                 Token.registerNonXS("IsXDigit");
 931 
 932                 Token.setAlias("IsDigit", "Nd", true);
 933                 Token.setAlias("IsUpper", "Lu", true);
 934                 Token.setAlias("IsLower", "Ll", true);
 935                 Token.setAlias("IsCntrl", "C", true);
 936                 Token.setAlias("IsPrint", "C", false);
 937                 Token.setAlias("IsPunct", "P", true);
 938                 Token.registerNonXS("IsDigit");
 939                 Token.registerNonXS("IsUpper");
 940                 Token.registerNonXS("IsLower");
 941                 Token.registerNonXS("IsCntrl");
 942                 Token.registerNonXS("IsPrint");
 943                 Token.registerNonXS("IsPunct");
 944 
 945                 Token.setAlias("alpha", "IsAlpha", true);
 946                 Token.setAlias("alnum", "IsAlnum", true);
 947                 Token.setAlias("ascii", "IsASCII", true);
 948                 Token.setAlias("cntrl", "IsCntrl", true);
 949                 Token.setAlias("digit", "IsDigit", true);
 950                 Token.setAlias("graph", "IsGraph", true);
 951                 Token.setAlias("lower", "IsLower", true);
 952                 Token.setAlias("print", "IsPrint", true);
 953                 Token.setAlias("punct", "IsPunct", true);
 954                 Token.setAlias("space", "IsSpace", true);
 955                 Token.setAlias("upper", "IsUpper", true);
 956                 Token.setAlias("word", "IsWord", true); // Perl extension
 957                 Token.setAlias("xdigit", "IsXDigit", true);
 958                 Token.registerNonXS("alpha");
 959                 Token.registerNonXS("alnum");
 960                 Token.registerNonXS("ascii");
 961                 Token.registerNonXS("cntrl");
 962                 Token.registerNonXS("digit");
 963                 Token.registerNonXS("graph");
 964                 Token.registerNonXS("lower");
 965                 Token.registerNonXS("print");
 966                 Token.registerNonXS("punct");
 967                 Token.registerNonXS("space");
 968                 Token.registerNonXS("upper");
 969                 Token.registerNonXS("word");
 970                 Token.registerNonXS("xdigit");
 971             } // synchronized
 972         } // if null
 973         RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
 974             : (RangeToken)Token.categories2.get(name);
 975         //if (tok == null) System.out.println(name);
 976         return tok;
 977     }
 978     static protected RangeToken getRange(String name, boolean positive, boolean xs) {
 979         RangeToken range = Token.getRange(name, positive);
 980         if (xs && range != null && Token.isRegisterNonXS(name))
 981             range = null;
 982         return range;
 983     }
 984 
 985     static final Set<String> nonxs = Collections.synchronizedSet(new HashSet<>());
 986     /**
 987      * This method is called by only getRange().
 988      * So this method need not MT-safe.
 989      */
 990     static protected void registerNonXS(String name) {
 991         Token.nonxs.add(name);
 992     }
 993 
 994     static protected boolean isRegisterNonXS(String name) {
 995         return Token.nonxs.contains(name);
 996     }
 997 
 998     private static void setAlias(String newName, String name, boolean positive) {
 999         Token t1 = (Token)Token.categories.get(name);
1000         Token t2 = (Token)Token.categories2.get(name);
1001         if (positive) {
1002             Token.categories.put(newName, t1);
1003             Token.categories2.put(newName, t2);
1004         } else {
1005             Token.categories2.put(newName, t1);
1006             Token.categories.put(newName, t2);
1007         }
1008     }
1009 
1010     // ------------------------------------------------------
1011 
1012     static final String viramaString =
1013     "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1014     +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1015     +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1016     +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1017     +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1018     +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1019     +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1020     +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1021     +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1022     +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1023     +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1024 
1025     static private Token token_grapheme = null;
1026     static synchronized Token getGraphemePattern() {
1027         if (Token.token_grapheme != null)
1028             return Token.token_grapheme;
1029 
1030         Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
1031         base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1032         base_char.subtractRanges(Token.getRange("M", true));
1033         base_char.subtractRanges(Token.getRange("C", true));
1034 
1035         Token virama = Token.createRange();
1036         for (int i = 0;  i < Token.viramaString.length(); i++) {
1037             virama.addRange(i, i);
1038         }
1039 
1040         Token combiner_wo_virama = Token.createRange();
1041         combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1042         combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1043         combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1044 
1045         Token left = Token.createUnion();       // base_char?
1046         left.addChild(base_char);
1047         left.addChild(Token.token_empty);
1048 
1049         Token foo = Token.createUnion();
1050         foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1051         foo.addChild(combiner_wo_virama);
1052 
1053         foo = Token.createClosure(foo);
1054 
1055         foo = Token.createConcat(left, foo);
1056 
1057         Token.token_grapheme = foo;
1058         return Token.token_grapheme;
1059     }
1060 
1061     /**
1062      * Combing Character Sequence in Perl 5.6.
1063      */
1064     static private Token token_ccs = null;
1065     static synchronized Token getCombiningCharacterSequence() {
1066         if (Token.token_ccs != null)
1067             return Token.token_ccs;
1068 
1069         Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1070         foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1071         Token.token_ccs = foo;
1072         return Token.token_ccs;
1073     }
1074 
1075     // ------------------------------------------------------
1076 
1077     // ------------------------------------------------------
1078     /**
1079      * This class represents a node in parse tree.
1080      */
1081     static class StringToken extends Token implements java.io.Serializable {
1082 
1083         private static final long serialVersionUID = -4614366944218504172L;
1084 
1085         String string;
1086         final int refNumber;
1087 
1088         StringToken(int type, String str, int n) {
1089             super(type);
1090             this.string = str;
1091             this.refNumber = n;
1092         }
1093 
1094         int getReferenceNumber() {              // for STRING
1095             return this.refNumber;
1096         }
1097         String getString() {                    // for STRING
1098             return this.string;
1099         }
1100 
1101         public String toString(int options) {
1102             if (this.type == BACKREFERENCE)
1103                 return "\\"+this.refNumber;
1104             else
1105                 return REUtil.quoteMeta(this.string);
1106         }
1107     }
1108 
1109     /**
1110      * This class represents a node in parse tree.
1111      */
1112     static class ConcatToken extends Token implements java.io.Serializable {
1113 
1114         private static final long serialVersionUID = 8717321425541346381L;
1115 
1116         final Token child;
1117         final Token child2;
1118 
1119         ConcatToken(Token t1, Token t2) {
1120             super(Token.CONCAT);
1121             this.child = t1;
1122             this.child2 = t2;
1123         }
1124 
1125         int size() {
1126             return 2;
1127         }
1128         Token getChild(int index) {
1129             return index == 0 ? this.child : this.child2;
1130         }
1131 
1132         public String toString(int options) {
1133             String ret;
1134             if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1135                 ret = this.child.toString(options)+"+";
1136             } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1137                 ret = this.child.toString(options)+"+?";
1138             } else
1139                 ret = this.child.toString(options)+this.child2.toString(options);
1140             return ret;
1141         }
1142     }
1143 
1144     /**
1145      * This class represents a node in parse tree.
1146      */
1147     static class CharToken extends Token implements java.io.Serializable {
1148 
1149         private static final long serialVersionUID = -4394272816279496989L;
1150 
1151         final int chardata;
1152 
1153         CharToken(int type, int ch) {
1154             super(type);
1155             this.chardata = ch;
1156         }
1157 
1158         int getChar() {
1159             return this.chardata;
1160         }
1161 
1162         public String toString(int options) {
1163             String ret;
1164             switch (this.type) {
1165               case CHAR:
1166                 switch (this.chardata) {
1167                   case '|':  case '*':  case '+':  case '?':
1168                   case '(':  case ')':  case '.':  case '[':
1169                   case '{':  case '\\':
1170                     ret = "\\"+(char)this.chardata;
1171                     break;
1172                   case '\f':  ret = "\\f";  break;
1173                   case '\n':  ret = "\\n";  break;
1174                   case '\r':  ret = "\\r";  break;
1175                   case '\t':  ret = "\\t";  break;
1176                   case 0x1b:  ret = "\\e";  break;
1177                     //case 0x0b:  ret = "\\v";  break;
1178                   default:
1179                     if (this.chardata >= 0x10000) {
1180                         String pre = "0"+Integer.toHexString(this.chardata);
1181                         ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1182                     } else
1183                         ret = ""+(char)this.chardata;
1184                 }
1185                 break;
1186 
1187               case ANCHOR:
1188                 if (this == Token.token_linebeginning || this == Token.token_lineend)
1189                     ret = ""+(char)this.chardata;
1190                 else
1191                     ret = "\\"+(char)this.chardata;
1192                 break;
1193 
1194               default:
1195                 ret = null;
1196             }
1197             return ret;
1198         }
1199 
1200         boolean match(int ch) {
1201             if (this.type == CHAR) {
1202                 return ch == this.chardata;
1203             } else
1204                 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1205         }
1206     }
1207 
1208     /**
1209      * This class represents a node in parse tree.
1210      */
1211     static class ClosureToken extends Token implements java.io.Serializable {
1212 
1213         private static final long serialVersionUID = 1308971930673997452L;
1214 
1215         int min;
1216         int max;
1217         final Token child;
1218 
1219         ClosureToken(int type, Token tok) {
1220             super(type);
1221             this.child = tok;
1222             this.setMin(-1);
1223             this.setMax(-1);
1224         }
1225 
1226         int size() {
1227             return 1;
1228         }
1229         Token getChild(int index) {
1230             return this.child;
1231         }
1232 
1233         final void setMin(int min) {
1234             this.min = min;
1235         }
1236         final void setMax(int max) {
1237             this.max = max;
1238         }
1239         final int getMin() {
1240             return this.min;
1241         }
1242         final int getMax() {
1243             return this.max;
1244         }
1245 
1246         public String toString(int options) {
1247             String ret;
1248             if (this.type == CLOSURE) {
1249                 if (this.getMin() < 0 && this.getMax() < 0) {
1250                     ret = this.child.toString(options)+"*";
1251                 } else if (this.getMin() == this.getMax()) {
1252                     ret = this.child.toString(options)+"{"+this.getMin()+"}";
1253                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1254                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1255                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1256                     ret = this.child.toString(options)+"{"+this.getMin()+",}";
1257                 } else
1258                     throw new RuntimeException("Token#toString(): CLOSURE "
1259                                                +this.getMin()+", "+this.getMax());
1260             } else {
1261                 if (this.getMin() < 0 && this.getMax() < 0) {
1262                     ret = this.child.toString(options)+"*?";
1263                 } else if (this.getMin() == this.getMax()) {
1264                     ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1265                 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1266                     ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1267                 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1268                     ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1269                 } else
1270                     throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1271                                                +this.getMin()+", "+this.getMax());
1272             }
1273             return ret;
1274         }
1275     }
1276 
1277     /**
1278      * This class represents a node in parse tree.
1279      */
1280     static class ParenToken extends Token implements java.io.Serializable {
1281 
1282         private static final long serialVersionUID = -5938014719827987704L;
1283 
1284         final Token child;
1285         final int parennumber;
1286 
1287         ParenToken(int type, Token tok, int paren) {
1288             super(type);
1289             this.child = tok;
1290             this.parennumber = paren;
1291         }
1292 
1293         int size() {
1294             return 1;
1295         }
1296         Token getChild(int index) {
1297             return this.child;
1298         }
1299 
1300         int getParenNumber() {
1301             return this.parennumber;
1302         }
1303 
1304         public String toString(int options) {
1305             String ret = null;
1306             switch (this.type) {
1307               case PAREN:
1308                 if (this.parennumber == 0) {
1309                     ret = "(?:"+this.child.toString(options)+")";
1310                 } else {
1311                     ret = "("+this.child.toString(options)+")";
1312                 }
1313                 break;
1314 
1315               case LOOKAHEAD:
1316                 ret = "(?="+this.child.toString(options)+")";
1317                 break;
1318               case NEGATIVELOOKAHEAD:
1319                 ret = "(?!"+this.child.toString(options)+")";
1320                 break;
1321               case LOOKBEHIND:
1322                 ret = "(?<="+this.child.toString(options)+")";
1323                 break;
1324               case NEGATIVELOOKBEHIND:
1325                 ret = "(?<!"+this.child.toString(options)+")";
1326                 break;
1327               case INDEPENDENT:
1328                 ret = "(?>"+this.child.toString(options)+")";
1329                 break;
1330             }
1331             return ret;
1332         }
1333     }
1334 
1335     /**
1336      * (?(condition)yes-pattern|no-pattern)
1337      */
1338     static class ConditionToken extends Token implements java.io.Serializable {
1339 
1340         private static final long serialVersionUID = 4353765277910594411L;
1341 
1342         final int refNumber;
1343         final Token condition;
1344         final Token yes;
1345         final Token no;
1346         ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1347             super(Token.CONDITION);
1348             this.refNumber = refno;
1349             this.condition = cond;
1350             this.yes = yespat;
1351             this.no = nopat;
1352         }
1353         int size() {
1354             return this.no == null ? 1 : 2;
1355         }
1356         Token getChild(int index) {
1357             if (index == 0)  return this.yes;
1358             if (index == 1)  return this.no;
1359             throw new RuntimeException("Internal Error: "+index);
1360         }
1361 
1362         public String toString(int options) {
1363             String ret;
1364             if (refNumber > 0) {
1365                 ret = "(?("+refNumber+")";
1366             } else if (this.condition.type == Token.ANCHOR) {
1367                 ret = "(?("+this.condition+")";
1368             } else {
1369                 ret = "(?"+this.condition;
1370             }
1371 
1372             if (this.no == null) {
1373                 ret += this.yes+")";
1374             } else {
1375                 ret += this.yes+"|"+this.no+")";
1376             }
1377             return ret;
1378         }
1379     }
1380 
1381     /**
1382      * (ims-ims: .... )
1383      */
1384     static class ModifierToken extends Token implements java.io.Serializable {
1385 
1386         private static final long serialVersionUID = -9114536559696480356L;
1387 
1388         final Token child;
1389         final int add;
1390         final int mask;
1391 
1392         ModifierToken(Token tok, int add, int mask) {
1393             super(Token.MODIFIERGROUP);
1394             this.child = tok;
1395             this.add = add;
1396             this.mask = mask;
1397         }
1398 
1399         int size() {
1400             return 1;
1401         }
1402         Token getChild(int index) {
1403             return this.child;
1404         }
1405 
1406         int getOptions() {
1407             return this.add;
1408         }
1409         int getOptionsMask() {
1410             return this.mask;
1411         }
1412 
1413         public String toString(int options) {
1414             return "(?"
1415                 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1416                 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1417                 +":"
1418                 +this.child.toString(options)
1419                 +")";
1420         }
1421     }
1422 
1423     /**
1424      * This class represents a node in parse tree.
1425      * for UNION or CONCAT.
1426      */
1427     static class UnionToken extends Token implements java.io.Serializable {
1428 
1429         private static final long serialVersionUID = -2568843945989489861L;
1430 
1431         List<Token> children;
1432 
1433         /**
1434          * @serialField children Vector children
1435          */
1436         private static final ObjectStreamField[] serialPersistentFields =
1437             new ObjectStreamField[] {
1438                 new ObjectStreamField("children", Vector.class),
1439             };
1440 
1441         UnionToken(int type) {
1442             super(type);
1443         }
1444 
1445         @Override
1446         void addChild(Token tok) {
1447             if (tok == null)  return;
1448             if (this.children == null)  this.children = new ArrayList<>();
1449             if (this.type == UNION) {
1450                 this.children.add(tok);
1451                 return;
1452             }
1453                                                 // This is CONCAT, and new child is CONCAT.
1454             if (tok.type == CONCAT) {
1455                 for (int i = 0;  i < tok.size();  i ++)
1456                     this.addChild(tok.getChild(i)); // Recursion
1457                 return;
1458             }
1459             int size = this.children.size();
1460             if (size == 0) {
1461                 this.children.add(tok);
1462                 return;
1463             }
1464             Token previous = this.children.get(size - 1);
1465             if (!((previous.type == CHAR || previous.type == STRING)
1466                   && (tok.type == CHAR || tok.type == STRING))) {
1467                 this.children.add(tok);
1468                 return;
1469             }
1470 
1471             //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1472 
1473             StringBuilder buffer;
1474             int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1475             if (previous.type == CHAR) {        // Replace previous token by STRING
1476                 buffer = new StringBuilder(2 + nextMaxLength);
1477                 int ch = previous.getChar();
1478                 if (ch >= 0x10000)
1479                     buffer.append(REUtil.decomposeToSurrogates(ch));
1480                 else
1481                     buffer.append((char)ch);
1482                 previous = Token.createString(null);
1483                 this.children.set(size - 1, previous);
1484             } else {                            // STRING
1485                 buffer = new StringBuilder(previous.getString().length() + nextMaxLength);
1486                 buffer.append(previous.getString());
1487             }
1488 
1489             if (tok.type == CHAR) {
1490                 int ch = tok.getChar();
1491                 if (ch >= 0x10000)
1492                     buffer.append(REUtil.decomposeToSurrogates(ch));
1493                 else
1494                     buffer.append((char)ch);
1495             } else {
1496                 buffer.append(tok.getString());
1497             }
1498 
1499             ((StringToken)previous).string = new String(buffer);
1500         }
1501 
1502         @Override
1503         int size() {
1504             return this.children == null ? 0 : this.children.size();
1505         }
1506         @Override
1507         Token getChild(int index) {
1508             return this.children.get(index);
1509         }
1510 
1511         @Override
1512         public String toString(int options) {
1513             String ret;
1514             if (this.type == CONCAT) {
1515                 if (this.children.size() == 2) {
1516                     Token ch = this.getChild(0);
1517                     Token ch2 = this.getChild(1);
1518                     if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1519                         ret = ch.toString(options)+"+";
1520                     } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1521                         ret = ch.toString(options)+"+?";
1522                     } else
1523                         ret = ch.toString(options)+ch2.toString(options);
1524                 } else {
1525                     StringBuilder sb = new StringBuilder();
1526                     for (int i = 0;  i < this.children.size();  i ++) {
1527                         sb.append(((Token)this.children.get(i)).toString(options));
1528                     }
1529                     ret = new String(sb);
1530                 }
1531                 return ret;
1532             }
1533             if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1534                 ret = this.getChild(0).toString(options)+"?";
1535             } else if (this.children.size() == 2
1536                        && this.getChild(0).type == EMPTY) {
1537                 ret = this.getChild(1).toString(options)+"??";
1538             } else {
1539                 StringBuilder sb = new StringBuilder();
1540                 sb.append((this.children.get(0)).toString(options));
1541                 for (int i = 1;  i < this.children.size();  i ++) {
1542                     sb.append((char)'|');
1543                     sb.append((this.children.get(i)).toString(options));
1544                 }
1545                 ret = new String(sb);
1546             }
1547             return ret;
1548         }
1549 
1550         /**
1551          * @serialData Serialized fields. Convert the List to Vector for backward compatibility.
1552          */
1553         private void writeObject(ObjectOutputStream out) throws IOException {
1554             // Convert List to Vector
1555             Vector<Token> vChildren = (children == null)? null : new Vector<>(children);
1556 
1557             // Write serialized fields
1558             ObjectOutputStream.PutField pf = out.putFields();
1559             pf.put("children", vChildren);
1560             out.writeFields();
1561         }
1562 
1563         @SuppressWarnings("unchecked")
1564         private void readObject(ObjectInputStream in)
1565                             throws IOException, ClassNotFoundException {
1566             // We have to read serialized fields first.
1567             ObjectInputStream.GetField gf = in.readFields();
1568             Vector<Token> vChildren = (Vector<Token>)gf.get("children", null);
1569 
1570             //convert Vector back to List
1571             if (vChildren != null) children = new ArrayList<>(vChildren);
1572         }
1573     }
1574 }