1 /* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23 import java.io.IOException; 24 import java.io.ObjectInputStream; 25 import java.io.ObjectOutputStream; 26 import java.io.ObjectStreamField; 27 import java.util.ArrayList; 28 import java.util.Collections; 29 import java.util.HashMap; 30 import java.util.HashSet; 31 import java.util.List; 32 import java.util.Map; 33 import java.util.Set; 34 import java.util.Vector; 35 36 /** 37 * This class represents a node in parse tree. 38 * 39 * @xerces.internal 40 * 41 * @version $Id: Token.java,v 1.7 2010/07/27 05:02:34 joehw Exp $ 42 */ 43 class Token implements java.io.Serializable { 44 45 private static final long serialVersionUID = 8484976002585487481L; 46 47 static final boolean COUNTTOKENS = true; 48 static int tokens = 0; 49 50 static final int CHAR = 0; // Literal char 51 static final int DOT = 11; // . 52 static final int CONCAT = 1; // XY 53 static final int UNION = 2; // X|Y|Z 54 static final int CLOSURE = 3; // X* 55 static final int RANGE = 4; // [a-zA-Z] etc. 56 static final int NRANGE = 5; // [^a-zA-Z] etc. 57 static final int PAREN = 6; // (X) or (?:X) 58 static final int EMPTY = 7; // 59 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z 60 static final int NONGREEDYCLOSURE = 9; // *? +? 61 static final int STRING = 10; // strings 62 static final int BACKREFERENCE = 12; // back references 63 static final int LOOKAHEAD = 20; // (?=...) 64 static final int NEGATIVELOOKAHEAD = 21; // (?!...) 65 static final int LOOKBEHIND = 22; // (?<=...) 66 static final int NEGATIVELOOKBEHIND = 23; // (?<!...) 67 static final int INDEPENDENT = 24; // (?>...) 68 static final int MODIFIERGROUP = 25; // (?ims-ims:...) 69 static final int CONDITION = 26; // (?(...)yes|no) 70 71 static final int UTF16_MAX = 0x10ffff; 72 73 final int type; 74 75 static Token token_dot; 76 static Token token_0to9; 77 static Token token_wordchars; 78 static Token token_not_0to9; 79 static Token token_not_wordchars; 80 static Token token_spaces; 81 static Token token_not_spaces; 82 static Token token_empty; 83 static Token token_linebeginning; 84 static Token token_linebeginning2; 85 static Token token_lineend; 86 static Token token_stringbeginning; 87 static Token token_stringend; 88 static Token token_stringend2; 89 static Token token_wordedge; 90 static Token token_not_wordedge; 91 static Token token_wordbeginning; 92 static Token token_wordend; 93 static { 94 Token.token_empty = new Token(Token.EMPTY); 95 96 Token.token_linebeginning = Token.createAnchor('^'); 97 Token.token_linebeginning2 = Token.createAnchor('@'); 98 Token.token_lineend = Token.createAnchor('$'); 99 Token.token_stringbeginning = Token.createAnchor('A'); 100 Token.token_stringend = Token.createAnchor('z'); 101 Token.token_stringend2 = Token.createAnchor('Z'); 102 Token.token_wordedge = Token.createAnchor('b'); 103 Token.token_not_wordedge = Token.createAnchor('B'); 104 Token.token_wordbeginning = Token.createAnchor('<'); 105 Token.token_wordend = Token.createAnchor('>'); 106 107 Token.token_dot = new Token(Token.DOT); 108 109 Token.token_0to9 = Token.createRange(); 110 Token.token_0to9.addRange('0', '9'); 111 Token.token_wordchars = Token.createRange(); 112 Token.token_wordchars.addRange('0', '9'); 113 Token.token_wordchars.addRange('A', 'Z'); 114 Token.token_wordchars.addRange('_', '_'); 115 Token.token_wordchars.addRange('a', 'z'); 116 Token.token_spaces = Token.createRange(); 117 Token.token_spaces.addRange('\t', '\t'); 118 Token.token_spaces.addRange('\n', '\n'); 119 Token.token_spaces.addRange('\f', '\f'); 120 Token.token_spaces.addRange('\r', '\r'); 121 Token.token_spaces.addRange(' ', ' '); 122 123 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9); 124 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars); 125 Token.token_not_spaces = Token.complementRanges(Token.token_spaces); 126 } 127 128 static Token.ParenToken createLook(int type, Token child) { 129 if (COUNTTOKENS) Token.tokens ++; 130 return new Token.ParenToken(type, child, 0); 131 } 132 static Token.ParenToken createParen(Token child, int pnumber) { 133 if (COUNTTOKENS) Token.tokens ++; 134 return new Token.ParenToken(Token.PAREN, child, pnumber); 135 } 136 static Token.ClosureToken createClosure(Token tok) { 137 if (COUNTTOKENS) Token.tokens ++; 138 return new Token.ClosureToken(Token.CLOSURE, tok); 139 } 140 static Token.ClosureToken createNGClosure(Token tok) { 141 if (COUNTTOKENS) Token.tokens ++; 142 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok); 143 } 144 static Token.ConcatToken createConcat(Token tok1, Token tok2) { 145 if (COUNTTOKENS) Token.tokens ++; 146 return new Token.ConcatToken(tok1, tok2); 147 } 148 static Token.UnionToken createConcat() { 149 if (COUNTTOKENS) Token.tokens ++; 150 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug. 151 } 152 static Token.UnionToken createUnion() { 153 if (COUNTTOKENS) Token.tokens ++; 154 return new Token.UnionToken(Token.UNION); 155 } 156 static Token createEmpty() { 157 return Token.token_empty; 158 } 159 static RangeToken createRange() { 160 if (COUNTTOKENS) Token.tokens ++; 161 return new RangeToken(Token.RANGE); 162 } 163 static RangeToken createNRange() { 164 if (COUNTTOKENS) Token.tokens ++; 165 return new RangeToken(Token.NRANGE); 166 } 167 static Token.CharToken createChar(int ch) { 168 if (COUNTTOKENS) Token.tokens ++; 169 return new Token.CharToken(Token.CHAR, ch); 170 } 171 static private Token.CharToken createAnchor(int ch) { 172 if (COUNTTOKENS) Token.tokens ++; 173 return new Token.CharToken(Token.ANCHOR, ch); 174 } 175 static Token.StringToken createBackReference(int refno) { 176 if (COUNTTOKENS) Token.tokens ++; 177 return new Token.StringToken(Token.BACKREFERENCE, null, refno); 178 } 179 static Token.StringToken createString(String str) { 180 if (COUNTTOKENS) Token.tokens ++; 181 return new Token.StringToken(Token.STRING, str, 0); 182 } 183 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) { 184 if (COUNTTOKENS) Token.tokens ++; 185 return new Token.ModifierToken(child, add, mask); 186 } 187 static Token.ConditionToken createCondition(int refno, Token condition, 188 Token yespat, Token nopat) { 189 if (COUNTTOKENS) Token.tokens ++; 190 return new Token.ConditionToken(refno, condition, yespat, nopat); 191 } 192 193 protected Token(int type) { 194 this.type = type; 195 } 196 197 /** 198 * A number of children. 199 */ 200 int size() { 201 return 0; 202 } 203 Token getChild(int index) { 204 return null; 205 } 206 void addChild(Token tok) { 207 throw new RuntimeException("Not supported."); 208 } 209 210 // for RANGE or NRANGE 211 protected void addRange(int start, int end) { 212 throw new RuntimeException("Not supported."); 213 } 214 protected void sortRanges() { 215 throw new RuntimeException("Not supported."); 216 } 217 protected void compactRanges() { 218 throw new RuntimeException("Not supported."); 219 } 220 protected void mergeRanges(Token tok) { 221 throw new RuntimeException("Not supported."); 222 } 223 protected void subtractRanges(Token tok) { 224 throw new RuntimeException("Not supported."); 225 } 226 protected void intersectRanges(Token tok) { 227 throw new RuntimeException("Not supported."); 228 } 229 static Token complementRanges(Token tok) { 230 return RangeToken.complementRanges(tok); 231 } 232 233 234 void setMin(int min) { // for CLOSURE 235 } 236 void setMax(int max) { // for CLOSURE 237 } 238 int getMin() { // for CLOSURE 239 return -1; 240 } 241 int getMax() { // for CLOSURE 242 return -1; 243 } 244 int getReferenceNumber() { // for STRING 245 return 0; 246 } 247 String getString() { // for STRING 248 return null; 249 } 250 251 int getParenNumber() { 252 return 0; 253 } 254 int getChar() { 255 return -1; 256 } 257 258 public String toString() { 259 return this.toString(0); 260 } 261 public String toString(int options) { 262 return this.type == Token.DOT ? "." : ""; 263 } 264 265 /** 266 * How many characters are needed? 267 */ 268 final int getMinLength() { 269 switch (this.type) { 270 case CONCAT: 271 int sum = 0; 272 for (int i = 0; i < this.size(); i ++) 273 sum += this.getChild(i).getMinLength(); 274 return sum; 275 276 case CONDITION: 277 case UNION: 278 if (this.size() == 0) 279 return 0; 280 int ret = this.getChild(0).getMinLength(); 281 for (int i = 1; i < this.size(); i ++) { 282 int min = this.getChild(i).getMinLength(); 283 if (min < ret) ret = min; 284 } 285 return ret; 286 287 case CLOSURE: 288 case NONGREEDYCLOSURE: 289 if (this.getMin() >= 0) 290 return this.getMin() * this.getChild(0).getMinLength(); 291 return 0; 292 293 case EMPTY: 294 case ANCHOR: 295 return 0; 296 297 case DOT: 298 case CHAR: 299 case RANGE: 300 case NRANGE: 301 return 1; 302 303 case INDEPENDENT: 304 case PAREN: 305 case MODIFIERGROUP: 306 return this.getChild(0).getMinLength(); 307 308 case BACKREFERENCE: 309 return 0; // ******* 310 311 case STRING: 312 return this.getString().length(); 313 314 case LOOKAHEAD: 315 case NEGATIVELOOKAHEAD: 316 case LOOKBEHIND: 317 case NEGATIVELOOKBEHIND: 318 return 0; // ***** Really? 319 320 default: 321 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type); 322 } 323 } 324 325 final int getMaxLength() { 326 switch (this.type) { 327 case CONCAT: 328 int sum = 0; 329 for (int i = 0; i < this.size(); i ++) { 330 int d = this.getChild(i).getMaxLength(); 331 if (d < 0) return -1; 332 sum += d; 333 } 334 return sum; 335 336 case CONDITION: 337 case UNION: 338 if (this.size() == 0) 339 return 0; 340 int ret = this.getChild(0).getMaxLength(); 341 for (int i = 1; ret >= 0 && i < this.size(); i ++) { 342 int max = this.getChild(i).getMaxLength(); 343 if (max < 0) { // infinity 344 ret = -1; 345 break; 346 } 347 if (max > ret) ret = max; 348 } 349 return ret; 350 351 case CLOSURE: 352 case NONGREEDYCLOSURE: 353 if (this.getMax() >= 0) 354 // When this.child.getMaxLength() < 0, 355 // this returns minus value 356 return this.getMax() * this.getChild(0).getMaxLength(); 357 return -1; 358 359 case EMPTY: 360 case ANCHOR: 361 return 0; 362 363 case CHAR: 364 return 1; 365 case DOT: 366 case RANGE: 367 case NRANGE: 368 return 2; 369 370 case INDEPENDENT: 371 case PAREN: 372 case MODIFIERGROUP: 373 return this.getChild(0).getMaxLength(); 374 375 case BACKREFERENCE: 376 return -1; // ****** 377 378 case STRING: 379 return this.getString().length(); 380 381 case LOOKAHEAD: 382 case NEGATIVELOOKAHEAD: 383 case LOOKBEHIND: 384 case NEGATIVELOOKBEHIND: 385 return 0; // ***** Really? 386 387 default: 388 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type); 389 } 390 } 391 392 static final int FC_CONTINUE = 0; 393 static final int FC_TERMINAL = 1; 394 static final int FC_ANY = 2; 395 private static final boolean isSet(int options, int flag) { 396 return (options & flag) == flag; 397 } 398 final int analyzeFirstCharacter(RangeToken result, int options) { 399 switch (this.type) { 400 case CONCAT: 401 int ret = FC_CONTINUE; 402 for (int i = 0; i < this.size(); i ++) 403 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE) 404 break; 405 return ret; 406 407 case UNION: 408 if (this.size() == 0) 409 return FC_CONTINUE; 410 /* 411 * a|b|c -> FC_TERMINAL 412 * a|.|c -> FC_ANY 413 * a|b| -> FC_CONTINUE 414 */ 415 int ret2 = FC_CONTINUE; 416 boolean hasEmpty = false; 417 for (int i = 0; i < this.size(); i ++) { 418 ret2 = this.getChild(i).analyzeFirstCharacter(result, options); 419 if (ret2 == FC_ANY) 420 break; 421 else if (ret2 == FC_CONTINUE) 422 hasEmpty = true; 423 } 424 return hasEmpty ? FC_CONTINUE : ret2; 425 426 case CONDITION: 427 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options); 428 if (this.size() == 1) return FC_CONTINUE; 429 if (ret3 == FC_ANY) return ret3; 430 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options); 431 if (ret4 == FC_ANY) return ret4; 432 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL; 433 434 case CLOSURE: 435 case NONGREEDYCLOSURE: 436 this.getChild(0).analyzeFirstCharacter(result, options); 437 return FC_CONTINUE; 438 439 case EMPTY: 440 case ANCHOR: 441 return FC_CONTINUE; 442 443 case CHAR: 444 int ch = this.getChar(); 445 result.addRange(ch, ch); 446 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 447 ch = Character.toUpperCase((char)ch); 448 result.addRange(ch, ch); 449 ch = Character.toLowerCase((char)ch); 450 result.addRange(ch, ch); 451 } 452 return FC_TERMINAL; 453 454 case DOT: 455 return FC_ANY; 456 457 case RANGE: 458 result.mergeRanges(this); 459 return FC_TERMINAL; 460 461 case NRANGE: // **** 462 result.mergeRanges(Token.complementRanges(this)); 463 return FC_TERMINAL; 464 465 case INDEPENDENT: 466 case PAREN: 467 return this.getChild(0).analyzeFirstCharacter(result, options); 468 469 case MODIFIERGROUP: 470 options |= ((ModifierToken)this).getOptions(); 471 options &= ~((ModifierToken)this).getOptionsMask(); 472 return this.getChild(0).analyzeFirstCharacter(result, options); 473 474 case BACKREFERENCE: 475 result.addRange(0, UTF16_MAX); // **** We can not optimize. 476 return FC_ANY; 477 478 case STRING: 479 int cha = this.getString().charAt(0); 480 int ch2; 481 if (REUtil.isHighSurrogate(cha) 482 && this.getString().length() >= 2 483 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1)))) 484 cha = REUtil.composeFromSurrogates(cha, ch2); 485 result.addRange(cha, cha); 486 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) { 487 cha = Character.toUpperCase((char)cha); 488 result.addRange(cha, cha); 489 cha = Character.toLowerCase((char)cha); 490 result.addRange(cha, cha); 491 } 492 return FC_TERMINAL; 493 494 case LOOKAHEAD: 495 case NEGATIVELOOKAHEAD: 496 case LOOKBEHIND: 497 case NEGATIVELOOKBEHIND: 498 return FC_CONTINUE; 499 500 default: 501 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type); 502 } 503 } 504 505 private final boolean isShorterThan(Token tok) { 506 if (tok == null) return false; 507 /* 508 int mylength; 509 if (this.type == STRING) mylength = this.getString().length(); 510 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1; 511 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 512 int otherlength; 513 if (tok.type == STRING) otherlength = tok.getString().length(); 514 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1; 515 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 516 */ 517 int mylength; 518 if (this.type == STRING) mylength = this.getString().length(); 519 else throw new RuntimeException("Internal Error: Illegal type: "+this.type); 520 int otherlength; 521 if (tok.type == STRING) otherlength = tok.getString().length(); 522 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type); 523 return mylength < otherlength; 524 } 525 526 static class FixedStringContainer { 527 Token token = null; 528 int options = 0; 529 FixedStringContainer() { 530 } 531 } 532 533 final void findFixedString(FixedStringContainer container, int options) { 534 switch (this.type) { 535 case CONCAT: 536 Token prevToken = null; 537 int prevOptions = 0; 538 for (int i = 0; i < this.size(); i ++) { 539 this.getChild(i).findFixedString(container, options); 540 if (prevToken == null || prevToken.isShorterThan(container.token)) { 541 prevToken = container.token; 542 prevOptions = container.options; 543 } 544 } 545 container.token = prevToken; 546 container.options = prevOptions; 547 return; 548 549 case UNION: 550 case CLOSURE: 551 case NONGREEDYCLOSURE: 552 case EMPTY: 553 case ANCHOR: 554 case RANGE: 555 case DOT: 556 case NRANGE: 557 case BACKREFERENCE: 558 case LOOKAHEAD: 559 case NEGATIVELOOKAHEAD: 560 case LOOKBEHIND: 561 case NEGATIVELOOKBEHIND: 562 case CONDITION: 563 container.token = null; 564 return; 565 566 case CHAR: // Ignore CHAR tokens. 567 container.token = null; // ** 568 return; // ** 569 570 case STRING: 571 container.token = this; 572 container.options = options; 573 return; 574 575 case INDEPENDENT: 576 case PAREN: 577 this.getChild(0).findFixedString(container, options); 578 return; 579 580 case MODIFIERGROUP: 581 options |= ((ModifierToken)this).getOptions(); 582 options &= ~((ModifierToken)this).getOptionsMask(); 583 this.getChild(0).findFixedString(container, options); 584 return; 585 586 default: 587 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type); 588 } 589 } 590 591 boolean match(int ch) { 592 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 593 } 594 595 // ------------------------------------------------------ 596 private final static Map<String, Token> categories = new HashMap<>(); 597 private final static Map<String, Token> categories2 = new HashMap<>(); 598 private static final String[] categoryNames = { 599 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd", 600 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs", 601 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28 602 "Pi", "Pf", // 29, 30 603 "L", "M", "N", "Z", "C", "P", "S", // 31-37 604 }; 605 606 // Schema Rec. {Datatypes} - Punctuation 607 static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote 608 static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote 609 static final int CHAR_LETTER = 31; 610 static final int CHAR_MARK = 32; 611 static final int CHAR_NUMBER = 33; 612 static final int CHAR_SEPARATOR = 34; 613 static final int CHAR_OTHER = 35; 614 static final int CHAR_PUNCTUATION = 36; 615 static final int CHAR_SYMBOL = 37; 616 617 //blockNames in UNICODE 3.1 that supported by XML Schema REC 618 private static final String[] blockNames = { 619 /*0000..007F;*/ "Basic Latin", 620 /*0080..00FF;*/ "Latin-1 Supplement", 621 /*0100..017F;*/ "Latin Extended-A", 622 /*0180..024F;*/ "Latin Extended-B", 623 /*0250..02AF;*/ "IPA Extensions", 624 /*02B0..02FF;*/ "Spacing Modifier Letters", 625 /*0300..036F;*/ "Combining Diacritical Marks", 626 /*0370..03FF;*/ "Greek", 627 /*0400..04FF;*/ "Cyrillic", 628 /*0530..058F;*/ "Armenian", 629 /*0590..05FF;*/ "Hebrew", 630 /*0600..06FF;*/ "Arabic", 631 /*0700..074F;*/ "Syriac", 632 /*0780..07BF;*/ "Thaana", 633 /*0900..097F;*/ "Devanagari", 634 /*0980..09FF;*/ "Bengali", 635 /*0A00..0A7F;*/ "Gurmukhi", 636 /*0A80..0AFF;*/ "Gujarati", 637 /*0B00..0B7F;*/ "Oriya", 638 /*0B80..0BFF;*/ "Tamil", 639 /*0C00..0C7F;*/ "Telugu", 640 /*0C80..0CFF;*/ "Kannada", 641 /*0D00..0D7F;*/ "Malayalam", 642 /*0D80..0DFF;*/ "Sinhala", 643 /*0E00..0E7F;*/ "Thai", 644 /*0E80..0EFF;*/ "Lao", 645 /*0F00..0FFF;*/ "Tibetan", 646 /*1000..109F;*/ "Myanmar", 647 /*10A0..10FF;*/ "Georgian", 648 /*1100..11FF;*/ "Hangul Jamo", 649 /*1200..137F;*/ "Ethiopic", 650 /*13A0..13FF;*/ "Cherokee", 651 /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics", 652 /*1680..169F;*/ "Ogham", 653 /*16A0..16FF;*/ "Runic", 654 /*1780..17FF;*/ "Khmer", 655 /*1800..18AF;*/ "Mongolian", 656 /*1E00..1EFF;*/ "Latin Extended Additional", 657 /*1F00..1FFF;*/ "Greek Extended", 658 /*2000..206F;*/ "General Punctuation", 659 /*2070..209F;*/ "Superscripts and Subscripts", 660 /*20A0..20CF;*/ "Currency Symbols", 661 /*20D0..20FF;*/ "Combining Marks for Symbols", 662 /*2100..214F;*/ "Letterlike Symbols", 663 /*2150..218F;*/ "Number Forms", 664 /*2190..21FF;*/ "Arrows", 665 /*2200..22FF;*/ "Mathematical Operators", 666 /*2300..23FF;*/ "Miscellaneous Technical", 667 /*2400..243F;*/ "Control Pictures", 668 /*2440..245F;*/ "Optical Character Recognition", 669 /*2460..24FF;*/ "Enclosed Alphanumerics", 670 /*2500..257F;*/ "Box Drawing", 671 /*2580..259F;*/ "Block Elements", 672 /*25A0..25FF;*/ "Geometric Shapes", 673 /*2600..26FF;*/ "Miscellaneous Symbols", 674 /*2700..27BF;*/ "Dingbats", 675 /*2800..28FF;*/ "Braille Patterns", 676 /*2E80..2EFF;*/ "CJK Radicals Supplement", 677 /*2F00..2FDF;*/ "Kangxi Radicals", 678 /*2FF0..2FFF;*/ "Ideographic Description Characters", 679 /*3000..303F;*/ "CJK Symbols and Punctuation", 680 /*3040..309F;*/ "Hiragana", 681 /*30A0..30FF;*/ "Katakana", 682 /*3100..312F;*/ "Bopomofo", 683 /*3130..318F;*/ "Hangul Compatibility Jamo", 684 /*3190..319F;*/ "Kanbun", 685 /*31A0..31BF;*/ "Bopomofo Extended", 686 /*3200..32FF;*/ "Enclosed CJK Letters and Months", 687 /*3300..33FF;*/ "CJK Compatibility", 688 /*3400..4DB5;*/ "CJK Unified Ideographs Extension A", 689 /*4E00..9FFF;*/ "CJK Unified Ideographs", 690 /*A000..A48F;*/ "Yi Syllables", 691 /*A490..A4CF;*/ "Yi Radicals", 692 /*AC00..D7A3;*/ "Hangul Syllables", 693 /*E000..F8FF;*/ "Private Use", 694 /*F900..FAFF;*/ "CJK Compatibility Ideographs", 695 /*FB00..FB4F;*/ "Alphabetic Presentation Forms", 696 /*FB50..FDFF;*/ "Arabic Presentation Forms-A", 697 /*FE20..FE2F;*/ "Combining Half Marks", 698 /*FE30..FE4F;*/ "CJK Compatibility Forms", 699 /*FE50..FE6F;*/ "Small Form Variants", 700 /*FE70..FEFE;*/ "Arabic Presentation Forms-B", 701 /*FEFF..FEFF;*/ "Specials", 702 /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms", 703 //missing Specials add manually 704 /*10300..1032F;*/ "Old Italic", // 84 705 /*10330..1034F;*/ "Gothic", 706 /*10400..1044F;*/ "Deseret", 707 /*1D000..1D0FF;*/ "Byzantine Musical Symbols", 708 /*1D100..1D1FF;*/ "Musical Symbols", 709 /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols", 710 /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B", 711 /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement", 712 /*E0000..E007F;*/ "Tags", 713 //missing 2 private use add manually 714 715 }; 716 //ADD THOSE MANUALLY 717 //F0000..FFFFD; "Private Use", 718 //100000..10FFFD; "Private Use" 719 //FFF0..FFFD; "Specials", 720 static final String blockRanges = 721 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F" 722 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF" 723 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF" 724 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF" 725 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF" 726 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF" 727 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF" 728 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F" 729 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF" 730 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF" 731 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF"; 732 static final int[] nonBMPBlockRanges = { 733 0x10300, 0x1032F, // 84 734 0x10330, 0x1034F, 735 0x10400, 0x1044F, 736 0x1D000, 0x1D0FF, 737 0x1D100, 0x1D1FF, 738 0x1D400, 0x1D7FF, 739 0x20000, 0x2A6D6, 740 0x2F800, 0x2FA1F, 741 0xE0000, 0xE007F 742 }; 743 private static final int NONBMP_BLOCK_START = 84; 744 745 static protected RangeToken getRange(String name, boolean positive) { 746 if (Token.categories.size() == 0) { 747 synchronized (Token.categories) { 748 Token[] ranges = new Token[Token.categoryNames.length]; 749 for (int i = 0; i < ranges.length; i ++) { 750 ranges[i] = Token.createRange(); 751 } 752 int type; 753 for (int i = 0; i < 0x10000; i ++) { 754 type = Character.getType((char)i); 755 if (type == Character.START_PUNCTUATION || 756 type == Character.END_PUNCTUATION) { 757 //build table of Pi values 758 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C || 759 i == 0x201F || i == 0x2039) { 760 type = CHAR_INIT_QUOTE; 761 } 762 //build table of Pf values 763 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) { 764 type = CHAR_FINAL_QUOTE; 765 } 766 } 767 ranges[type].addRange(i, i); 768 switch (type) { 769 case Character.UPPERCASE_LETTER: 770 case Character.LOWERCASE_LETTER: 771 case Character.TITLECASE_LETTER: 772 case Character.MODIFIER_LETTER: 773 case Character.OTHER_LETTER: 774 type = CHAR_LETTER; 775 break; 776 case Character.NON_SPACING_MARK: 777 case Character.COMBINING_SPACING_MARK: 778 case Character.ENCLOSING_MARK: 779 type = CHAR_MARK; 780 break; 781 case Character.DECIMAL_DIGIT_NUMBER: 782 case Character.LETTER_NUMBER: 783 case Character.OTHER_NUMBER: 784 type = CHAR_NUMBER; 785 break; 786 case Character.SPACE_SEPARATOR: 787 case Character.LINE_SEPARATOR: 788 case Character.PARAGRAPH_SEPARATOR: 789 type = CHAR_SEPARATOR; 790 break; 791 case Character.CONTROL: 792 case Character.FORMAT: 793 case Character.SURROGATE: 794 case Character.PRIVATE_USE: 795 case Character.UNASSIGNED: 796 type = CHAR_OTHER; 797 break; 798 case Character.CONNECTOR_PUNCTUATION: 799 case Character.DASH_PUNCTUATION: 800 case Character.START_PUNCTUATION: 801 case Character.END_PUNCTUATION: 802 case CHAR_INIT_QUOTE: 803 case CHAR_FINAL_QUOTE: 804 case Character.OTHER_PUNCTUATION: 805 type = CHAR_PUNCTUATION; 806 break; 807 case Character.MATH_SYMBOL: 808 case Character.CURRENCY_SYMBOL: 809 case Character.MODIFIER_SYMBOL: 810 case Character.OTHER_SYMBOL: 811 type = CHAR_SYMBOL; 812 break; 813 default: 814 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type); 815 } 816 ranges[type].addRange(i, i); 817 } // for all characters 818 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX); 819 820 for (int i = 0; i < ranges.length; i ++) { 821 if (Token.categoryNames[i] != null) { 822 if (i == Character.UNASSIGNED) { // Unassigned 823 ranges[i].addRange(0x10000, Token.UTF16_MAX); 824 } 825 Token.categories.put(Token.categoryNames[i], ranges[i]); 826 Token.categories2.put(Token.categoryNames[i], 827 Token.complementRanges(ranges[i])); 828 } 829 } 830 //REVISIT: do we really need to support block names as in Unicode 3.1 831 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)? 832 // 833 StringBuilder buffer = new StringBuilder(50); 834 for (int i = 0; i < Token.blockNames.length; i ++) { 835 Token r1 = Token.createRange(); 836 int location; 837 if (i < NONBMP_BLOCK_START) { 838 location = i*2; 839 int rstart = Token.blockRanges.charAt(location); 840 int rend = Token.blockRanges.charAt(location+1); 841 //DEBUGING 842 //System.out.println(n+" " +Integer.toHexString(rstart) 843 // +"-"+ Integer.toHexString(rend)); 844 r1.addRange(rstart, rend); 845 } else { 846 location = (i - NONBMP_BLOCK_START) * 2; 847 r1.addRange(Token.nonBMPBlockRanges[location], 848 Token.nonBMPBlockRanges[location + 1]); 849 } 850 String n = Token.blockNames[i]; 851 if (n.equals("Specials")) 852 r1.addRange(0xfff0, 0xfffd); 853 if (n.equals("Private Use")) { 854 r1.addRange(0xF0000,0xFFFFD); 855 r1.addRange(0x100000,0x10FFFD); 856 } 857 Token.categories.put(n, r1); 858 Token.categories2.put(n, Token.complementRanges(r1)); 859 buffer.setLength(0); 860 buffer.append("Is"); 861 if (n.indexOf(' ') >= 0) { 862 for (int ci = 0; ci < n.length(); ci ++) 863 if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci)); 864 } 865 else { 866 buffer.append(n); 867 } 868 Token.setAlias(buffer.toString(), n, true); 869 } 870 871 // TR#18 1.2 872 Token.setAlias("ASSIGNED", "Cn", false); 873 Token.setAlias("UNASSIGNED", "Cn", true); 874 Token all = Token.createRange(); 875 all.addRange(0, Token.UTF16_MAX); 876 Token.categories.put("ALL", all); 877 Token.categories2.put("ALL", Token.complementRanges(all)); 878 Token.registerNonXS("ASSIGNED"); 879 Token.registerNonXS("UNASSIGNED"); 880 Token.registerNonXS("ALL"); 881 882 Token isalpha = Token.createRange(); 883 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu 884 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll 885 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo 886 Token.categories.put("IsAlpha", isalpha); 887 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha)); 888 Token.registerNonXS("IsAlpha"); 889 890 Token isalnum = Token.createRange(); 891 isalnum.mergeRanges(isalpha); // Lu Ll Lo 892 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd 893 Token.categories.put("IsAlnum", isalnum); 894 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum)); 895 Token.registerNonXS("IsAlnum"); 896 897 Token isspace = Token.createRange(); 898 isspace.mergeRanges(Token.token_spaces); 899 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z 900 Token.categories.put("IsSpace", isspace); 901 Token.categories2.put("IsSpace", Token.complementRanges(isspace)); 902 Token.registerNonXS("IsSpace"); 903 904 Token isword = Token.createRange(); 905 isword.mergeRanges(isalnum); // Lu Ll Lo Nd 906 isword.addRange('_', '_'); 907 Token.categories.put("IsWord", isword); 908 Token.categories2.put("IsWord", Token.complementRanges(isword)); 909 Token.registerNonXS("IsWord"); 910 911 Token isascii = Token.createRange(); 912 isascii.addRange(0, 127); 913 Token.categories.put("IsASCII", isascii); 914 Token.categories2.put("IsASCII", Token.complementRanges(isascii)); 915 Token.registerNonXS("IsASCII"); 916 917 Token isnotgraph = Token.createRange(); 918 isnotgraph.mergeRanges(ranges[CHAR_OTHER]); 919 isnotgraph.addRange(' ', ' '); 920 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph)); 921 Token.categories2.put("IsGraph", isnotgraph); 922 Token.registerNonXS("IsGraph"); 923 924 Token isxdigit = Token.createRange(); 925 isxdigit.addRange('0', '9'); 926 isxdigit.addRange('A', 'F'); 927 isxdigit.addRange('a', 'f'); 928 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit)); 929 Token.categories2.put("IsXDigit", isxdigit); 930 Token.registerNonXS("IsXDigit"); 931 932 Token.setAlias("IsDigit", "Nd", true); 933 Token.setAlias("IsUpper", "Lu", true); 934 Token.setAlias("IsLower", "Ll", true); 935 Token.setAlias("IsCntrl", "C", true); 936 Token.setAlias("IsPrint", "C", false); 937 Token.setAlias("IsPunct", "P", true); 938 Token.registerNonXS("IsDigit"); 939 Token.registerNonXS("IsUpper"); 940 Token.registerNonXS("IsLower"); 941 Token.registerNonXS("IsCntrl"); 942 Token.registerNonXS("IsPrint"); 943 Token.registerNonXS("IsPunct"); 944 945 Token.setAlias("alpha", "IsAlpha", true); 946 Token.setAlias("alnum", "IsAlnum", true); 947 Token.setAlias("ascii", "IsASCII", true); 948 Token.setAlias("cntrl", "IsCntrl", true); 949 Token.setAlias("digit", "IsDigit", true); 950 Token.setAlias("graph", "IsGraph", true); 951 Token.setAlias("lower", "IsLower", true); 952 Token.setAlias("print", "IsPrint", true); 953 Token.setAlias("punct", "IsPunct", true); 954 Token.setAlias("space", "IsSpace", true); 955 Token.setAlias("upper", "IsUpper", true); 956 Token.setAlias("word", "IsWord", true); // Perl extension 957 Token.setAlias("xdigit", "IsXDigit", true); 958 Token.registerNonXS("alpha"); 959 Token.registerNonXS("alnum"); 960 Token.registerNonXS("ascii"); 961 Token.registerNonXS("cntrl"); 962 Token.registerNonXS("digit"); 963 Token.registerNonXS("graph"); 964 Token.registerNonXS("lower"); 965 Token.registerNonXS("print"); 966 Token.registerNonXS("punct"); 967 Token.registerNonXS("space"); 968 Token.registerNonXS("upper"); 969 Token.registerNonXS("word"); 970 Token.registerNonXS("xdigit"); 971 } // synchronized 972 } // if null 973 RangeToken tok = positive ? (RangeToken)Token.categories.get(name) 974 : (RangeToken)Token.categories2.get(name); 975 //if (tok == null) System.out.println(name); 976 return tok; 977 } 978 static protected RangeToken getRange(String name, boolean positive, boolean xs) { 979 RangeToken range = Token.getRange(name, positive); 980 if (xs && range != null && Token.isRegisterNonXS(name)) 981 range = null; 982 return range; 983 } 984 985 static final Set<String> nonxs = Collections.synchronizedSet(new HashSet<>()); 986 /** 987 * This method is called by only getRange(). 988 * So this method need not MT-safe. 989 */ 990 static protected void registerNonXS(String name) { 991 Token.nonxs.add(name); 992 } 993 994 static protected boolean isRegisterNonXS(String name) { 995 return Token.nonxs.contains(name); 996 } 997 998 private static void setAlias(String newName, String name, boolean positive) { 999 Token t1 = (Token)Token.categories.get(name); 1000 Token t2 = (Token)Token.categories2.get(name); 1001 if (positive) { 1002 Token.categories.put(newName, t1); 1003 Token.categories2.put(newName, t2); 1004 } else { 1005 Token.categories2.put(newName, t1); 1006 Token.categories.put(newName, t2); 1007 } 1008 } 1009 1010 // ------------------------------------------------------ 1011 1012 static final String viramaString = 1013 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1014 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1015 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1016 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1017 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1018 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1019 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1020 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1021 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;; 1022 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;; 1023 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;; 1024 1025 static private Token token_grapheme = null; 1026 static synchronized Token getGraphemePattern() { 1027 if (Token.token_grapheme != null) 1028 return Token.token_grapheme; 1029 1030 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}] 1031 base_char.mergeRanges(Token.getRange("ASSIGNED", true)); 1032 base_char.subtractRanges(Token.getRange("M", true)); 1033 base_char.subtractRanges(Token.getRange("C", true)); 1034 1035 Token virama = Token.createRange(); 1036 for (int i = 0; i < Token.viramaString.length(); i++) { 1037 virama.addRange(i, i); 1038 } 1039 1040 Token combiner_wo_virama = Token.createRange(); 1041 combiner_wo_virama.mergeRanges(Token.getRange("M", true)); 1042 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final 1043 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras 1044 1045 Token left = Token.createUnion(); // base_char? 1046 left.addChild(base_char); 1047 left.addChild(Token.token_empty); 1048 1049 Token foo = Token.createUnion(); 1050 foo.addChild(Token.createConcat(virama, Token.getRange("L", true))); 1051 foo.addChild(combiner_wo_virama); 1052 1053 foo = Token.createClosure(foo); 1054 1055 foo = Token.createConcat(left, foo); 1056 1057 Token.token_grapheme = foo; 1058 return Token.token_grapheme; 1059 } 1060 1061 /** 1062 * Combing Character Sequence in Perl 5.6. 1063 */ 1064 static private Token token_ccs = null; 1065 static synchronized Token getCombiningCharacterSequence() { 1066 if (Token.token_ccs != null) 1067 return Token.token_ccs; 1068 1069 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM* 1070 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM* 1071 Token.token_ccs = foo; 1072 return Token.token_ccs; 1073 } 1074 1075 // ------------------------------------------------------ 1076 1077 // ------------------------------------------------------ 1078 /** 1079 * This class represents a node in parse tree. 1080 */ 1081 static class StringToken extends Token implements java.io.Serializable { 1082 1083 private static final long serialVersionUID = -4614366944218504172L; 1084 1085 String string; 1086 final int refNumber; 1087 1088 StringToken(int type, String str, int n) { 1089 super(type); 1090 this.string = str; 1091 this.refNumber = n; 1092 } 1093 1094 int getReferenceNumber() { // for STRING 1095 return this.refNumber; 1096 } 1097 String getString() { // for STRING 1098 return this.string; 1099 } 1100 1101 public String toString(int options) { 1102 if (this.type == BACKREFERENCE) 1103 return "\\"+this.refNumber; 1104 else 1105 return REUtil.quoteMeta(this.string); 1106 } 1107 } 1108 1109 /** 1110 * This class represents a node in parse tree. 1111 */ 1112 static class ConcatToken extends Token implements java.io.Serializable { 1113 1114 private static final long serialVersionUID = 8717321425541346381L; 1115 1116 final Token child; 1117 final Token child2; 1118 1119 ConcatToken(Token t1, Token t2) { 1120 super(Token.CONCAT); 1121 this.child = t1; 1122 this.child2 = t2; 1123 } 1124 1125 int size() { 1126 return 2; 1127 } 1128 Token getChild(int index) { 1129 return index == 0 ? this.child : this.child2; 1130 } 1131 1132 public String toString(int options) { 1133 String ret; 1134 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) { 1135 ret = this.child.toString(options)+"+"; 1136 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) { 1137 ret = this.child.toString(options)+"+?"; 1138 } else 1139 ret = this.child.toString(options)+this.child2.toString(options); 1140 return ret; 1141 } 1142 } 1143 1144 /** 1145 * This class represents a node in parse tree. 1146 */ 1147 static class CharToken extends Token implements java.io.Serializable { 1148 1149 private static final long serialVersionUID = -4394272816279496989L; 1150 1151 final int chardata; 1152 1153 CharToken(int type, int ch) { 1154 super(type); 1155 this.chardata = ch; 1156 } 1157 1158 int getChar() { 1159 return this.chardata; 1160 } 1161 1162 public String toString(int options) { 1163 String ret; 1164 switch (this.type) { 1165 case CHAR: 1166 switch (this.chardata) { 1167 case '|': case '*': case '+': case '?': 1168 case '(': case ')': case '.': case '[': 1169 case '{': case '\\': 1170 ret = "\\"+(char)this.chardata; 1171 break; 1172 case '\f': ret = "\\f"; break; 1173 case '\n': ret = "\\n"; break; 1174 case '\r': ret = "\\r"; break; 1175 case '\t': ret = "\\t"; break; 1176 case 0x1b: ret = "\\e"; break; 1177 //case 0x0b: ret = "\\v"; break; 1178 default: 1179 if (this.chardata >= 0x10000) { 1180 String pre = "0"+Integer.toHexString(this.chardata); 1181 ret = "\\v"+pre.substring(pre.length()-6, pre.length()); 1182 } else 1183 ret = ""+(char)this.chardata; 1184 } 1185 break; 1186 1187 case ANCHOR: 1188 if (this == Token.token_linebeginning || this == Token.token_lineend) 1189 ret = ""+(char)this.chardata; 1190 else 1191 ret = "\\"+(char)this.chardata; 1192 break; 1193 1194 default: 1195 ret = null; 1196 } 1197 return ret; 1198 } 1199 1200 boolean match(int ch) { 1201 if (this.type == CHAR) { 1202 return ch == this.chardata; 1203 } else 1204 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type); 1205 } 1206 } 1207 1208 /** 1209 * This class represents a node in parse tree. 1210 */ 1211 static class ClosureToken extends Token implements java.io.Serializable { 1212 1213 private static final long serialVersionUID = 1308971930673997452L; 1214 1215 int min; 1216 int max; 1217 final Token child; 1218 1219 ClosureToken(int type, Token tok) { 1220 super(type); 1221 this.child = tok; 1222 this.setMin(-1); 1223 this.setMax(-1); 1224 } 1225 1226 int size() { 1227 return 1; 1228 } 1229 Token getChild(int index) { 1230 return this.child; 1231 } 1232 1233 final void setMin(int min) { 1234 this.min = min; 1235 } 1236 final void setMax(int max) { 1237 this.max = max; 1238 } 1239 final int getMin() { 1240 return this.min; 1241 } 1242 final int getMax() { 1243 return this.max; 1244 } 1245 1246 public String toString(int options) { 1247 String ret; 1248 if (this.type == CLOSURE) { 1249 if (this.getMin() < 0 && this.getMax() < 0) { 1250 ret = this.child.toString(options)+"*"; 1251 } else if (this.getMin() == this.getMax()) { 1252 ret = this.child.toString(options)+"{"+this.getMin()+"}"; 1253 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1254 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}"; 1255 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1256 ret = this.child.toString(options)+"{"+this.getMin()+",}"; 1257 } else 1258 throw new RuntimeException("Token#toString(): CLOSURE " 1259 +this.getMin()+", "+this.getMax()); 1260 } else { 1261 if (this.getMin() < 0 && this.getMax() < 0) { 1262 ret = this.child.toString(options)+"*?"; 1263 } else if (this.getMin() == this.getMax()) { 1264 ret = this.child.toString(options)+"{"+this.getMin()+"}?"; 1265 } else if (this.getMin() >= 0 && this.getMax() >= 0) { 1266 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?"; 1267 } else if (this.getMin() >= 0 && this.getMax() < 0) { 1268 ret = this.child.toString(options)+"{"+this.getMin()+",}?"; 1269 } else 1270 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE " 1271 +this.getMin()+", "+this.getMax()); 1272 } 1273 return ret; 1274 } 1275 } 1276 1277 /** 1278 * This class represents a node in parse tree. 1279 */ 1280 static class ParenToken extends Token implements java.io.Serializable { 1281 1282 private static final long serialVersionUID = -5938014719827987704L; 1283 1284 final Token child; 1285 final int parennumber; 1286 1287 ParenToken(int type, Token tok, int paren) { 1288 super(type); 1289 this.child = tok; 1290 this.parennumber = paren; 1291 } 1292 1293 int size() { 1294 return 1; 1295 } 1296 Token getChild(int index) { 1297 return this.child; 1298 } 1299 1300 int getParenNumber() { 1301 return this.parennumber; 1302 } 1303 1304 public String toString(int options) { 1305 String ret = null; 1306 switch (this.type) { 1307 case PAREN: 1308 if (this.parennumber == 0) { 1309 ret = "(?:"+this.child.toString(options)+")"; 1310 } else { 1311 ret = "("+this.child.toString(options)+")"; 1312 } 1313 break; 1314 1315 case LOOKAHEAD: 1316 ret = "(?="+this.child.toString(options)+")"; 1317 break; 1318 case NEGATIVELOOKAHEAD: 1319 ret = "(?!"+this.child.toString(options)+")"; 1320 break; 1321 case LOOKBEHIND: 1322 ret = "(?<="+this.child.toString(options)+")"; 1323 break; 1324 case NEGATIVELOOKBEHIND: 1325 ret = "(?<!"+this.child.toString(options)+")"; 1326 break; 1327 case INDEPENDENT: 1328 ret = "(?>"+this.child.toString(options)+")"; 1329 break; 1330 } 1331 return ret; 1332 } 1333 } 1334 1335 /** 1336 * (?(condition)yes-pattern|no-pattern) 1337 */ 1338 static class ConditionToken extends Token implements java.io.Serializable { 1339 1340 private static final long serialVersionUID = 4353765277910594411L; 1341 1342 final int refNumber; 1343 final Token condition; 1344 final Token yes; 1345 final Token no; 1346 ConditionToken(int refno, Token cond, Token yespat, Token nopat) { 1347 super(Token.CONDITION); 1348 this.refNumber = refno; 1349 this.condition = cond; 1350 this.yes = yespat; 1351 this.no = nopat; 1352 } 1353 int size() { 1354 return this.no == null ? 1 : 2; 1355 } 1356 Token getChild(int index) { 1357 if (index == 0) return this.yes; 1358 if (index == 1) return this.no; 1359 throw new RuntimeException("Internal Error: "+index); 1360 } 1361 1362 public String toString(int options) { 1363 String ret; 1364 if (refNumber > 0) { 1365 ret = "(?("+refNumber+")"; 1366 } else if (this.condition.type == Token.ANCHOR) { 1367 ret = "(?("+this.condition+")"; 1368 } else { 1369 ret = "(?"+this.condition; 1370 } 1371 1372 if (this.no == null) { 1373 ret += this.yes+")"; 1374 } else { 1375 ret += this.yes+"|"+this.no+")"; 1376 } 1377 return ret; 1378 } 1379 } 1380 1381 /** 1382 * (ims-ims: .... ) 1383 */ 1384 static class ModifierToken extends Token implements java.io.Serializable { 1385 1386 private static final long serialVersionUID = -9114536559696480356L; 1387 1388 final Token child; 1389 final int add; 1390 final int mask; 1391 1392 ModifierToken(Token tok, int add, int mask) { 1393 super(Token.MODIFIERGROUP); 1394 this.child = tok; 1395 this.add = add; 1396 this.mask = mask; 1397 } 1398 1399 int size() { 1400 return 1; 1401 } 1402 Token getChild(int index) { 1403 return this.child; 1404 } 1405 1406 int getOptions() { 1407 return this.add; 1408 } 1409 int getOptionsMask() { 1410 return this.mask; 1411 } 1412 1413 public String toString(int options) { 1414 return "(?" 1415 +(this.add == 0 ? "" : REUtil.createOptionString(this.add)) 1416 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask)) 1417 +":" 1418 +this.child.toString(options) 1419 +")"; 1420 } 1421 } 1422 1423 /** 1424 * This class represents a node in parse tree. 1425 * for UNION or CONCAT. 1426 */ 1427 static class UnionToken extends Token implements java.io.Serializable { 1428 1429 private static final long serialVersionUID = -2568843945989489861L; 1430 1431 List<Token> children; 1432 1433 /** 1434 * @serialField children Vector children 1435 */ 1436 private static final ObjectStreamField[] serialPersistentFields = 1437 new ObjectStreamField[] { 1438 new ObjectStreamField("children", Vector.class), 1439 }; 1440 1441 UnionToken(int type) { 1442 super(type); 1443 } 1444 1445 @Override 1446 void addChild(Token tok) { 1447 if (tok == null) return; 1448 if (this.children == null) this.children = new ArrayList<>(); 1449 if (this.type == UNION) { 1450 this.children.add(tok); 1451 return; 1452 } 1453 // This is CONCAT, and new child is CONCAT. 1454 if (tok.type == CONCAT) { 1455 for (int i = 0; i < tok.size(); i ++) 1456 this.addChild(tok.getChild(i)); // Recursion 1457 return; 1458 } 1459 int size = this.children.size(); 1460 if (size == 0) { 1461 this.children.add(tok); 1462 return; 1463 } 1464 Token previous = this.children.get(size - 1); 1465 if (!((previous.type == CHAR || previous.type == STRING) 1466 && (tok.type == CHAR || tok.type == STRING))) { 1467 this.children.add(tok); 1468 return; 1469 } 1470 1471 //System.err.println("Merge '"+previous+"' and '"+tok+"'."); 1472 1473 StringBuilder buffer; 1474 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length()); 1475 if (previous.type == CHAR) { // Replace previous token by STRING 1476 buffer = new StringBuilder(2 + nextMaxLength); 1477 int ch = previous.getChar(); 1478 if (ch >= 0x10000) 1479 buffer.append(REUtil.decomposeToSurrogates(ch)); 1480 else 1481 buffer.append((char)ch); 1482 previous = Token.createString(null); 1483 this.children.set(size - 1, previous); 1484 } else { // STRING 1485 buffer = new StringBuilder(previous.getString().length() + nextMaxLength); 1486 buffer.append(previous.getString()); 1487 } 1488 1489 if (tok.type == CHAR) { 1490 int ch = tok.getChar(); 1491 if (ch >= 0x10000) 1492 buffer.append(REUtil.decomposeToSurrogates(ch)); 1493 else 1494 buffer.append((char)ch); 1495 } else { 1496 buffer.append(tok.getString()); 1497 } 1498 1499 ((StringToken)previous).string = new String(buffer); 1500 } 1501 1502 @Override 1503 int size() { 1504 return this.children == null ? 0 : this.children.size(); 1505 } 1506 @Override 1507 Token getChild(int index) { 1508 return this.children.get(index); 1509 } 1510 1511 @Override 1512 public String toString(int options) { 1513 String ret; 1514 if (this.type == CONCAT) { 1515 if (this.children.size() == 2) { 1516 Token ch = this.getChild(0); 1517 Token ch2 = this.getChild(1); 1518 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) { 1519 ret = ch.toString(options)+"+"; 1520 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) { 1521 ret = ch.toString(options)+"+?"; 1522 } else 1523 ret = ch.toString(options)+ch2.toString(options); 1524 } else { 1525 StringBuilder sb = new StringBuilder(); 1526 for (int i = 0; i < this.children.size(); i ++) { 1527 sb.append(((Token)this.children.get(i)).toString(options)); 1528 } 1529 ret = new String(sb); 1530 } 1531 return ret; 1532 } 1533 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) { 1534 ret = this.getChild(0).toString(options)+"?"; 1535 } else if (this.children.size() == 2 1536 && this.getChild(0).type == EMPTY) { 1537 ret = this.getChild(1).toString(options)+"??"; 1538 } else { 1539 StringBuilder sb = new StringBuilder(); 1540 sb.append((this.children.get(0)).toString(options)); 1541 for (int i = 1; i < this.children.size(); i ++) { 1542 sb.append((char)'|'); 1543 sb.append((this.children.get(i)).toString(options)); 1544 } 1545 ret = new String(sb); 1546 } 1547 return ret; 1548 } 1549 1550 /** 1551 * @serialData Serialized fields. Convert the List to Vector for backward compatibility. 1552 */ 1553 private void writeObject(ObjectOutputStream out) throws IOException { 1554 // Convert List to Vector 1555 Vector<Token> vChildren = (children == null)? null : new Vector<>(children); 1556 1557 // Write serialized fields 1558 ObjectOutputStream.PutField pf = out.putFields(); 1559 pf.put("children", vChildren); 1560 out.writeFields(); 1561 } 1562 1563 @SuppressWarnings("unchecked") 1564 private void readObject(ObjectInputStream in) 1565 throws IOException, ClassNotFoundException { 1566 // We have to read serialized fields first. 1567 ObjectInputStream.GetField gf = in.readFields(); 1568 Vector<Token> vChildren = (Vector<Token>)gf.get("children", null); 1569 1570 //convert Vector back to List 1571 if (vChildren != null) children = new ArrayList<>(vChildren); 1572 } 1573 } 1574 }