1 /* 2 * reserved comment block 3 * DO NOT REMOVE OR ALTER! 4 */ 5 /* 6 * Copyright 1999-2004 The Apache Software Foundation. 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.regexp.internal; 22 23 import java.io.Serializable; 24 import java.util.Vector; 25 26 /** 27 * RE is an efficient, lightweight regular expression evaluator/matcher 28 * class. Regular expressions are pattern descriptions which enable 29 * sophisticated matching of strings. In addition to being able to 30 * match a string against a pattern, you can also extract parts of the 31 * match. This is especially useful in text parsing! Details on the 32 * syntax of regular expression patterns are given below. 33 * 34 * <p> 35 * To compile a regular expression (RE), you can simply construct an RE 36 * matcher object from the string specification of the pattern, like this: 37 * 38 * <pre> 39 * RE r = new RE("a*b"); 40 * </pre> 41 * 42 * <p> 43 * Once you have done this, you can call either of the RE.match methods to 44 * perform matching on a String. For example: 45 * 46 * <pre> 47 * boolean matched = r.match("aaaab"); 48 * </pre> 49 * 50 * will cause the boolean matched to be set to true because the 51 * pattern "a*b" matches the string "aaaab". 52 * 53 * <p> 54 * If you were interested in the <i>number</i> of a's which matched the 55 * first part of our example expression, you could change the expression to 56 * "(a*)b". 
Then when you compiled the expression and matched it against 57 * something like "xaaaab", you would get results like this: 58 * 59 * <pre> 60 * RE r = new RE("(a*)b"); // Compile expression 61 * boolean matched = r.match("xaaaab"); // Match against "xaaaab" 62 * 63 * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab' 64 * String insideParens = r.getParen(1); // insideParens will be 'aaaa' 65 * 66 * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1 67 * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6 68 * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5 69 * 70 * int startInside = r.getParenStart(1); // startInside will be index 1 71 * int endInside = r.getParenEnd(1); // endInside will be index 5 72 * int lenInside = r.getParenLength(1); // lenInside will be 4 73 * </pre> 74 * 75 * You can also refer to the contents of a parenthesized expression 76 * within a regular expression itself. This is called a 77 * 'backreference'. The first backreference in a regular expression is 78 * denoted by \1, the second by \2 and so on. So the expression: 79 * 80 * <pre> 81 * ([0-9]+)=\1 82 * </pre> 83 * 84 * will match any string of the form n=n (like 0=0 or 2=2). 
85 * 86 * <p> 87 * The full regular expression syntax accepted by RE is described here: 88 * 89 * <pre> 90 * 91 * <b><font face=times roman>Characters</font></b> 92 * 93 * <i>unicodeChar</i> Matches any identical unicode character 94 * \ Used to quote a meta-character (like '*') 95 * \\ Matches a single '\' character 96 * \0nnn Matches a given octal character 97 * \xhh Matches a given 8-bit hexadecimal character 98 * \\uhhhh Matches a given 16-bit hexadecimal character 99 * \t Matches an ASCII tab character 100 * \n Matches an ASCII newline character 101 * \r Matches an ASCII return character 102 * \f Matches an ASCII form feed character 103 * 104 * 105 * <b><font face=times roman>Character Classes</font></b> 106 * 107 * [abc] Simple character class 108 * [a-zA-Z] Character class with ranges 109 * [^abc] Negated character class 110 * </pre> 111 * 112 * <b>NOTE:</b> Incomplete ranges will be interpreted as "starts 113 * from zero" or "ends with last character". 114 * <br> 115 * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF], 116 * [-] means "all characters". 117 * 118 * <pre> 119 * 120 * <b><font face=times roman>Standard POSIX Character Classes</font></b> 121 * 122 * [:alnum:] Alphanumeric characters. 123 * [:alpha:] Alphabetic characters. 124 * [:blank:] Space and tab characters. 125 * [:cntrl:] Control characters. 126 * [:digit:] Numeric characters. 127 * [:graph:] Characters that are printable and are also visible. 128 * (A space is printable, but not visible, while an 129 * `a' is both.) 130 * [:lower:] Lower-case alphabetic characters. 131 * [:print:] Printable characters (characters that are not 132 * control characters.) 133 * [:punct:] Punctuation characters (characters that are not letter, 134 * digits, control characters, or space characters). 135 * [:space:] Space characters (such as space, tab, and formfeed, 136 * to name a few). 137 * [:upper:] Upper-case alphabetic characters. 
138 * [:xdigit:] Characters that are hexadecimal digits. 139 * 140 * 141 * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b> 142 * 143 * [:javastart:] Start of a Java identifier 144 * [:javapart:] Part of a Java identifier 145 * 146 * 147 * <b><font face=times roman>Predefined Classes</font></b> 148 * 149 * . Matches any character other than newline 150 * \w Matches a "word" character (alphanumeric plus "_") 151 * \W Matches a non-word character 152 * \s Matches a whitespace character 153 * \S Matches a non-whitespace character 154 * \d Matches a digit character 155 * \D Matches a non-digit character 156 * 157 * 158 * <b><font face=times roman>Boundary Matchers</font></b> 159 * 160 * ^ Matches only at the beginning of a line 161 * $ Matches only at the end of a line 162 * \b Matches only at a word boundary 163 * \B Matches only at a non-word boundary 164 * 165 * 166 * <b><font face=times roman>Greedy Closures</font></b> 167 * 168 * A* Matches A 0 or more times (greedy) 169 * A+ Matches A 1 or more times (greedy) 170 * A? Matches A 1 or 0 times (greedy) 171 * A{n} Matches A exactly n times (greedy) 172 * A{n,} Matches A at least n times (greedy) 173 * A{n,m} Matches A at least n but not more than m times (greedy) 174 * 175 * 176 * <b><font face=times roman>Reluctant Closures</font></b> 177 * 178 * A*? Matches A 0 or more times (reluctant) 179 * A+? Matches A 1 or more times (reluctant) 180 * A?? 
Matches A 0 or 1 times (reluctant) 181 * 182 * 183 * <b><font face=times roman>Logical Operators</font></b> 184 * 185 * AB Matches A followed by B 186 * A|B Matches either A or B 187 * (A) Used for subexpression grouping 188 * (?:A) Used for subexpression clustering (just like grouping but 189 * no backrefs) 190 * 191 * 192 * <b><font face=times roman>Backreferences</font></b> 193 * 194 * \1 Backreference to 1st parenthesized subexpression 195 * \2 Backreference to 2nd parenthesized subexpression 196 * \3 Backreference to 3rd parenthesized subexpression 197 * \4 Backreference to 4th parenthesized subexpression 198 * \5 Backreference to 5th parenthesized subexpression 199 * \6 Backreference to 6th parenthesized subexpression 200 * \7 Backreference to 7th parenthesized subexpression 201 * \8 Backreference to 8th parenthesized subexpression 202 * \9 Backreference to 9th parenthesized subexpression 203 * </pre> 204 * 205 * <p> 206 * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning 207 * that they match as many elements of the string as possible without 208 * causing the overall match to fail. If you want a closure to be 209 * reluctant (non-greedy), you can simply follow it with a '?'. A 210 * reluctant closure will match as few elements of the string as 211 * possible when finding matches. {m,n} closures don't currently 212 * support reluctancy. 213 * 214 * <p> 215 * <b><font face="times roman">Line terminators</font></b> 216 * <br> 217 * A line terminator is a one- or two-character sequence that marks 218 * the end of a line of the input character sequence. 
The following
 * are recognized as line terminators:
 * <ul>
 * <li>A newline (line feed) character ('\n'),</li>
 * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
 * <li>A standalone carriage-return character ('\r'),</li>
 * <li>A next-line character ('\u0085'),</li>
 * <li>A line-separator character ('\u2028'), or</li>
 * <li>A paragraph-separator character ('\u2029').</li>
 * </ul>
 *
 * <p>
 * RE runs programs compiled by the RECompiler class. But the RE
 * matcher class does not include the actual regular expression compiler
 * for reasons of efficiency. In fact, if you want to pre-compile one
 * or more regular expressions, the 'recompile' class can be invoked
 * from the command line to produce compiled output like this:
 *
 * <pre>
 *    // Pre-compiled regular expression "a*b"
 *    char[] re1Instructions =
 *    {
 *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
 *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
 *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
 *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
 *        0x0000,
 *    };
 *
 *
 *    REProgram re1 = new REProgram(re1Instructions);
 * </pre>
 *
 * You can then construct a regular expression matcher (RE) object from
 * the pre-compiled expression re1 and thus avoid the overhead of
 * compiling the expression at runtime. If you require more dynamic
 * regular expressions, you can construct a single RECompiler object and
 * re-use it to compile each expression. Similarly, you can change the
 * program run by a given matcher object at any time.
However, RE and
 * RECompiler are not threadsafe (for efficiency reasons, and because
 * requiring thread safety in this class is deemed to be a rare
 * requirement), so you will need to construct a separate compiler or
 * matcher object for each thread (unless you do thread synchronization
 * yourself). Once an expression has been compiled into an REProgram object,
 * that REProgram can be safely shared across multiple threads and RE objects.
 *
 * <br><p><br>
 *
 * <font color="red">
 * <i>ISSUES:</i>
 *
 * <ul>
 * <li>com.weusours.util.re is not currently compatible with all
 * standard POSIX regcomp flags</li>
 * <li>com.weusours.util.re does not support POSIX equivalence classes
 * ([=foo=] syntax) (I18N/locale issue)</li>
 * <li>com.weusours.util.re does not support nested POSIX character
 * classes (definitely should, but not completely trivial)</li>
 * <li>com.weusours.util.re does not support POSIX character collation
 * concepts ([.foo.] syntax) (I18N/locale issue)</li>
 * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
 * <li>Should RE support character iterators (for backwards RE matching!)?</li>
 * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
 * <li>Not *all* possibilities are considered for greediness when backreferences
 * are involved (as POSIX suggests should be the case). The POSIX RE
 * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
 * of acdacaa where \1 is "a". This is not the case in this RE package,
 * and actually Perl doesn't go to this extent either! Until someone
 * actually complains about this, I'm not sure it's worth "fixing".
* If it ever is fixed, test #137 in RETest.txt should be updated.</li>
 * </ul>
 *
 * </font>
 *
 * @see recompile
 * @see RECompiler
 *
 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
 * @author <a href="mailto:ts@sch-fer.de">Tobias Schäfer</a>
 */
public class RE implements Serializable
{
    /**
     * Specifies normal, case-sensitive matching behaviour (no flag bits set).
     */
    public static final int MATCH_NORMAL          = 0x0000;

    /**
     * Flag to indicate that matching should be case-independent (folded).
     * The matcher consults it when comparing atoms and backreferences.
     */
    public static final int MATCH_CASEINDEPENDENT = 0x0001;

    /**
     * Newlines should match as BOL/EOL (^ and $), i.e. '^' also matches
     * right after a line terminator and '$' right before one.
     */
    public static final int MATCH_MULTILINE       = 0x0002;

    /**
     * Consider all input a single body of text - newlines are matched by '.'
     */
    public static final int MATCH_SINGLELINE      = 0x0004;

    /************************************************
     *                                              *
     * The format of a node in a program is:        *
     *                                              *
     * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
     *                                              *
     * char OPCODE - instruction                    *
     * char OPDATA - modifying data                 *
     * char OPNEXT - next node (relative offset)    *
     *                                              *
     ************************************************/

                 //   Opcode              Char       Opdata/Operand  Meaning
                 //   ----------          ---------- --------------- --------------------------------------------------
    static final char OP_END              = 'E';  //                 end of program
    static final char OP_BOL              = '^';  //                 match only if at beginning of line
    static final char OP_EOL              = '$';  //                 match only if at end of line
    static final char OP_ANY              = '.';  //                 match any single character except newline
    static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
    static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
    static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
    static final char OP_STAR             = '*';  // node            kleene closure
    static final char OP_PLUS             = '+';  // node            positive closure
    static final char OP_MAYBE            = '?';  // node            optional closure
    static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
    static final char OP_OPEN             = '(';  // number          nth opening paren
    static final char OP_OPEN_CLUSTER     = '<';  //                 opening cluster
    static final char OP_CLOSE            = ')';  // number          nth closing paren
    static final char OP_CLOSE_CLUSTER    = '>';  //                 closing cluster
    static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
    static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
    static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
    static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
    static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
    static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
    static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes

    // Escape codes (stored in the opdata slot of an OP_ESCAPE node)
    static final char E_ALNUM             = 'w';  // Word character: alphanumeric or '_'
    static final char E_NALNUM            = 'W';  // Non-word character
    static final char E_BOUND             = 'b';  // Word boundary
    static final char E_NBOUND            = 'B';  // Non-word boundary
    static final char E_SPACE             = 's';  // Whitespace
    static final char E_NSPACE            = 'S';  // Non-whitespace
    static final char E_DIGIT             = 'd';  // Digit
    static final char E_NDIGIT            = 'D';  // Non-digit

    // Posix character classes (stored in the opdata slot of an OP_POSIXCLASS node)
    static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
    static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics
    static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
    static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
    static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
    static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
    static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
    static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
    static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
    static final char POSIX_CLASS_SPACE   = 's';  // Spaces
    static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
    static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
    static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
    static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part

    // Limits
    static final int maxNode  = 65536;            // Maximum number of nodes in a program; also passed as the
                                                  // "last node" bound when matching through to the program end
    static final int MAX_PAREN = 16;              // Number of paren pairs (only 9 can be backrefs)

    // Node layout constants (indices are relative to the first char of a node)
    static final int offsetOpcode = 0;            // Opcode offset (first character)
    static final int offsetOpdata = 1;            // Opdata offset (second char)
    static final int offsetNext   = 2;            // Next index offset (third char); read as a signed (short) relative offset
    static final int nodeSize     = 3;            // Node size (in chars)

    // State of current program
    REProgram program;                            // Compiled regular expression 'program'
    transient CharacterIterator search;           // The string being matched against
    int matchFlags;                               // Match behaviour flags (bitwise OR of MATCH_* values)
    int maxParen = MAX_PAREN;                     // Length used for startn/endn; setProgram() replaces it with
                                                  // program.maxParens when that value is not -1

    // Parenthesized subexpressions
    transient int parenCount;                     // Number of subexpressions matched (num open parens + 1)
    transient int start0;                         // Cache of start[0]
    transient int end0;                           // Cache of end[0]
    transient int start1;                         // Cache of start[1]
    transient int end1;                           // Cache of end[1]
    transient int start2;                         // Cache of start[2]
    transient int end2;                           // Cache of end[2]
    transient int[] startn;                       // Lazy-alloced array of sub-expression starts
    transient int[] endn;                         // Lazy-alloced array of sub-expression ends

    // Backreferences (only written while matching when the program has the OPT_HASBACKREFS flag)
    transient int[] startBackref;                 // Lazy-alloced array of backref starts
    transient int[] endBackref;                   // Lazy-alloced array of backref ends

    /**
     * Constructs a regular expression matcher from a String by compiling it
     * using a new instance of RECompiler, with {@link #MATCH_NORMAL} matching.
     * If you will be compiling many expressions, you may prefer to use a
     * single RECompiler object instead.
     *
     * @param pattern The regular expression pattern to compile.
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     * @see RECompiler
     * @see recompile
     */
    public RE(String pattern) throws RESyntaxException
    {
        this(pattern, MATCH_NORMAL);
    }

    /**
     * Constructs a regular expression matcher from a String by compiling it
     * using a new instance of RECompiler. If you will be compiling many
     * expressions, you may prefer to use a single RECompiler object instead.
     *
     * @param pattern The regular expression pattern to compile.
     * @param matchFlags The matching style
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
437 * @see RECompiler 438 * @see recompile 439 */ 440 public RE(String pattern, int matchFlags) throws RESyntaxException 441 { 442 this(new RECompiler().compile(pattern)); 443 setMatchFlags(matchFlags); 444 } 445 446 /** 447 * Construct a matcher for a pre-compiled regular expression from program 448 * (bytecode) data. Permits special flags to be passed in to modify matching 449 * behaviour. 450 * 451 * @param program Compiled regular expression program (see RECompiler and/or recompile) 452 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): 453 * 454 * <pre> 455 * MATCH_NORMAL // Normal (case-sensitive) matching 456 * MATCH_CASEINDEPENDENT // Case folded comparisons 457 * MATCH_MULTILINE // Newline matches as BOL/EOL 458 * </pre> 459 * 460 * @see RECompiler 461 * @see REProgram 462 * @see recompile 463 */ 464 public RE(REProgram program, int matchFlags) 465 { 466 setProgram(program); 467 setMatchFlags(matchFlags); 468 } 469 470 /** 471 * Construct a matcher for a pre-compiled regular expression from program 472 * (bytecode) data. 473 * 474 * @param program Compiled regular expression program 475 * @see RECompiler 476 * @see recompile 477 */ 478 public RE(REProgram program) 479 { 480 this(program, MATCH_NORMAL); 481 } 482 483 /** 484 * Constructs a regular expression matcher with no initial program. 485 * This is likely to be an uncommon practice, but is still supported. 
486 */ 487 public RE() 488 { 489 this((REProgram)null, MATCH_NORMAL); 490 } 491 492 /** 493 * Converts a 'simplified' regular expression to a full regular expression 494 * 495 * @param pattern The pattern to convert 496 * @return The full regular expression 497 */ 498 public static String simplePatternToFullRegularExpression(String pattern) 499 { 500 StringBuffer buf = new StringBuffer(); 501 for (int i = 0; i < pattern.length(); i++) 502 { 503 char c = pattern.charAt(i); 504 switch (c) 505 { 506 case '*': 507 buf.append(".*"); 508 break; 509 510 case '.': 511 case '[': 512 case ']': 513 case '\\': 514 case '+': 515 case '?': 516 case '{': 517 case '}': 518 case '$': 519 case '^': 520 case '|': 521 case '(': 522 case ')': 523 buf.append('\\'); 524 default: 525 buf.append(c); 526 break; 527 } 528 } 529 return buf.toString(); 530 } 531 532 /** 533 * Sets match behaviour flags which alter the way RE does matching. 534 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*): 535 * 536 * <pre> 537 * MATCH_NORMAL // Normal (case-sensitive) matching 538 * MATCH_CASEINDEPENDENT // Case folded comparisons 539 * MATCH_MULTILINE // Newline matches as BOL/EOL 540 * </pre> 541 */ 542 public void setMatchFlags(int matchFlags) 543 { 544 this.matchFlags = matchFlags; 545 } 546 547 /** 548 * Returns the current match behaviour flags. 549 * @return Current match behaviour flags (RE.MATCH_*). 550 * 551 * <pre> 552 * MATCH_NORMAL // Normal (case-sensitive) matching 553 * MATCH_CASEINDEPENDENT // Case folded comparisons 554 * MATCH_MULTILINE // Newline matches as BOL/EOL 555 * </pre> 556 * 557 * @see #setMatchFlags 558 */ 559 public int getMatchFlags() 560 { 561 return matchFlags; 562 } 563 564 /** 565 * Sets the current regular expression program used by this matcher object. 566 * 567 * @param program Regular expression program compiled by RECompiler. 
568 * @see RECompiler 569 * @see REProgram 570 * @see recompile 571 */ 572 public void setProgram(REProgram program) 573 { 574 this.program = program; 575 if (program != null && program.maxParens != -1) { 576 this.maxParen = program.maxParens; 577 } else { 578 this.maxParen = MAX_PAREN; 579 } 580 } 581 582 /** 583 * Returns the current regular expression program in use by this matcher object. 584 * 585 * @return Regular expression program 586 * @see #setProgram 587 */ 588 public REProgram getProgram() 589 { 590 return program; 591 } 592 593 /** 594 * Returns the number of parenthesized subexpressions available after a successful match. 595 * 596 * @return Number of available parenthesized subexpressions 597 */ 598 public int getParenCount() 599 { 600 return parenCount; 601 } 602 603 /** 604 * Gets the contents of a parenthesized subexpression after a successful match. 605 * 606 * @param which Nesting level of subexpression 607 * @return String 608 */ 609 public String getParen(int which) 610 { 611 int start; 612 if (which < parenCount && (start = getParenStart(which)) >= 0) 613 { 614 return search.substring(start, getParenEnd(which)); 615 } 616 return null; 617 } 618 619 /** 620 * Returns the start index of a given paren level. 621 * 622 * @param which Nesting level of subexpression 623 * @return String index 624 */ 625 public final int getParenStart(int which) 626 { 627 if (which < parenCount) 628 { 629 switch (which) 630 { 631 case 0: 632 return start0; 633 634 case 1: 635 return start1; 636 637 case 2: 638 return start2; 639 640 default: 641 if (startn == null) 642 { 643 allocParens(); 644 } 645 return startn[which]; 646 } 647 } 648 return -1; 649 } 650 651 /** 652 * Returns the end index of a given paren level. 
653 * 654 * @param which Nesting level of subexpression 655 * @return String index 656 */ 657 public final int getParenEnd(int which) 658 { 659 if (which < parenCount) 660 { 661 switch (which) 662 { 663 case 0: 664 return end0; 665 666 case 1: 667 return end1; 668 669 case 2: 670 return end2; 671 672 default: 673 if (endn == null) 674 { 675 allocParens(); 676 } 677 return endn[which]; 678 } 679 } 680 return -1; 681 } 682 683 /** 684 * Returns the length of a given paren level. 685 * 686 * @param which Nesting level of subexpression 687 * @return Number of characters in the parenthesized subexpression 688 */ 689 public final int getParenLength(int which) 690 { 691 if (which < parenCount) 692 { 693 return getParenEnd(which) - getParenStart(which); 694 } 695 return -1; 696 } 697 698 /** 699 * Sets the start of a paren level 700 * 701 * @param which Which paren level 702 * @param i Index in input array 703 */ 704 protected final void setParenStart(int which, int i) 705 { 706 if (which < parenCount) 707 { 708 switch (which) 709 { 710 case 0: 711 start0 = i; 712 break; 713 714 case 1: 715 start1 = i; 716 break; 717 718 case 2: 719 start2 = i; 720 break; 721 722 default: 723 if (startn == null) 724 { 725 allocParens(); 726 } 727 startn[which] = i; 728 break; 729 } 730 } 731 } 732 733 /** 734 * Sets the end of a paren level 735 * 736 * @param which Which paren level 737 * @param i Index in input array 738 */ 739 protected final void setParenEnd(int which, int i) 740 { 741 if (which < parenCount) 742 { 743 switch (which) 744 { 745 case 0: 746 end0 = i; 747 break; 748 749 case 1: 750 end1 = i; 751 break; 752 753 case 2: 754 end2 = i; 755 break; 756 757 default: 758 if (endn == null) 759 { 760 allocParens(); 761 } 762 endn[which] = i; 763 break; 764 } 765 } 766 } 767 768 /** 769 * Throws an Error representing an internal error condition probably resulting 770 * from a bug in the regular expression compiler (or possibly data corruption). 
771 * In practice, this should be very rare. 772 * 773 * @param s Error description 774 */ 775 protected void internalError(String s) throws Error 776 { 777 throw new Error("RE internal error: " + s); 778 } 779 780 /** 781 * Performs lazy allocation of subexpression arrays 782 */ 783 private final void allocParens() 784 { 785 // Allocate arrays for subexpressions 786 startn = new int[maxParen]; 787 endn = new int[maxParen]; 788 789 // Set sub-expression pointers to invalid values 790 for (int i = 0; i < maxParen; i++) 791 { 792 startn[i] = -1; 793 endn[i] = -1; 794 } 795 } 796 797 /** 798 * Try to match a string against a subset of nodes in the program 799 * 800 * @param firstNode Node to start at in program 801 * @param lastNode Last valid node (used for matching a subexpression without 802 * matching the rest of the program as well). 803 * @param idxStart Starting position in character array 804 * @return Final input array index if match succeeded. -1 if not. 805 */ 806 protected int matchNodes(int firstNode, int lastNode, int idxStart) 807 { 808 // Our current place in the string 809 int idx = idxStart; 810 811 // Loop while node is valid 812 int next, opcode, opdata; 813 int idxNew; 814 char[] instruction = program.instruction; 815 for (int node = firstNode; node < lastNode; ) 816 { 817 opcode = instruction[node + offsetOpcode]; 818 next = node + (short)instruction[node + offsetNext]; 819 opdata = instruction[node + offsetOpdata]; 820 821 switch (opcode) 822 { 823 case OP_RELUCTANTMAYBE: 824 { 825 int once = 0; 826 do 827 { 828 // Try to match the rest without using the reluctant subexpr 829 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 830 { 831 return idxNew; 832 } 833 } 834 while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1); 835 return -1; 836 } 837 838 case OP_RELUCTANTPLUS: 839 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) 840 { 841 // Try to match the rest without using the reluctant subexpr 842 if ((idxNew 
= matchNodes(next, maxNode, idx)) != -1) 843 { 844 return idxNew; 845 } 846 } 847 return -1; 848 849 case OP_RELUCTANTSTAR: 850 do 851 { 852 // Try to match the rest without using the reluctant subexpr 853 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 854 { 855 return idxNew; 856 } 857 } 858 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1); 859 return -1; 860 861 case OP_OPEN: 862 863 // Match subexpression 864 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) 865 { 866 startBackref[opdata] = idx; 867 } 868 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 869 { 870 // Increase valid paren count 871 if ((opdata + 1) > parenCount) 872 { 873 parenCount = opdata + 1; 874 } 875 876 // Don't set paren if already set later on 877 if (getParenStart(opdata) == -1) 878 { 879 setParenStart(opdata, idx); 880 } 881 } 882 return idxNew; 883 884 case OP_CLOSE: 885 886 // Done matching subexpression 887 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) 888 { 889 endBackref[opdata] = idx; 890 } 891 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) 892 { 893 // Increase valid paren count 894 if ((opdata + 1) > parenCount) 895 { 896 parenCount = opdata + 1; 897 } 898 899 // Don't set paren if already set later on 900 if (getParenEnd(opdata) == -1) 901 { 902 setParenEnd(opdata, idx); 903 } 904 } 905 return idxNew; 906 907 case OP_OPEN_CLUSTER: 908 case OP_CLOSE_CLUSTER: 909 // starting or ending the matching of a subexpression which has no backref. 910 return matchNodes( next, maxNode, idx ); 911 912 case OP_BACKREF: 913 { 914 // Get the start and end of the backref 915 int s = startBackref[opdata]; 916 int e = endBackref[opdata]; 917 918 // We don't know the backref yet 919 if (s == -1 || e == -1) 920 { 921 return -1; 922 } 923 924 // The backref is empty size 925 if (s == e) 926 { 927 break; 928 } 929 930 // Get the length of the backref 931 int l = e - s; 932 933 // If there's not enough input left, give up. 
934 if (search.isEnd(idx + l - 1)) 935 { 936 return -1; 937 } 938 939 // Case fold the backref? 940 final boolean caseFold = 941 ((matchFlags & MATCH_CASEINDEPENDENT) != 0); 942 // Compare backref to input 943 for (int i = 0; i < l; i++) 944 { 945 if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0) 946 { 947 return -1; 948 } 949 } 950 } 951 break; 952 953 case OP_BOL: 954 955 // Fail if we're not at the start of the string 956 if (idx != 0) 957 { 958 // If we're multiline matching, we could still be at the start of a line 959 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) 960 { 961 // If not at start of line, give up 962 if (idx <= 0 || !isNewline(idx - 1)) { 963 return -1; 964 } else { 965 break; 966 } 967 } 968 return -1; 969 } 970 break; 971 972 case OP_EOL: 973 974 // If we're not at the end of string 975 if (!search.isEnd(0) && !search.isEnd(idx)) 976 { 977 // If we're multi-line matching 978 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) 979 { 980 // Give up if we're not at the end of a line 981 if (!isNewline(idx)) { 982 return -1; 983 } else { 984 break; 985 } 986 } 987 return -1; 988 } 989 break; 990 991 case OP_ESCAPE: 992 993 // Which escape? 994 switch (opdata) 995 { 996 // Word boundary match 997 case E_NBOUND: 998 case E_BOUND: 999 { 1000 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1)); 1001 char cNext = ((search.isEnd(idx)) ? 
'\n' : search.charAt(idx)); 1002 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) 1003 { 1004 return -1; 1005 } 1006 } 1007 break; 1008 1009 // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit 1010 case E_ALNUM: 1011 case E_NALNUM: 1012 case E_DIGIT: 1013 case E_NDIGIT: 1014 case E_SPACE: 1015 case E_NSPACE: 1016 1017 // Give up if out of input 1018 if (search.isEnd(idx)) 1019 { 1020 return -1; 1021 } 1022 1023 char c = search.charAt(idx); 1024 1025 // Switch on escape 1026 switch (opdata) 1027 { 1028 case E_ALNUM: 1029 case E_NALNUM: 1030 if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM))) 1031 { 1032 return -1; 1033 } 1034 break; 1035 1036 case E_DIGIT: 1037 case E_NDIGIT: 1038 if (!(Character.isDigit(c) == (opdata == E_DIGIT))) 1039 { 1040 return -1; 1041 } 1042 break; 1043 1044 case E_SPACE: 1045 case E_NSPACE: 1046 if (!(Character.isWhitespace(c) == (opdata == E_SPACE))) 1047 { 1048 return -1; 1049 } 1050 break; 1051 } 1052 idx++; 1053 break; 1054 1055 default: 1056 internalError("Unrecognized escape '" + opdata + "'"); 1057 } 1058 break; 1059 1060 case OP_ANY: 1061 1062 if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) { 1063 // Match anything 1064 if (search.isEnd(idx)) 1065 { 1066 return -1; 1067 } 1068 } 1069 else 1070 { 1071 // Match anything but a newline 1072 if (search.isEnd(idx) || isNewline(idx)) 1073 { 1074 return -1; 1075 } 1076 } 1077 idx++; 1078 break; 1079 1080 case OP_ATOM: 1081 { 1082 // Match an atom value 1083 if (search.isEnd(idx)) 1084 { 1085 return -1; 1086 } 1087 1088 // Get length of atom and starting index 1089 int lenAtom = opdata; 1090 int startAtom = node + nodeSize; 1091 1092 // Give up if not enough input remains to have a match 1093 if (search.isEnd(lenAtom + idx - 1)) 1094 { 1095 return -1; 1096 } 1097 1098 // Match atom differently depending on casefolding flag 1099 final boolean caseFold = 1100 ((matchFlags & MATCH_CASEINDEPENDENT) != 0); 
1101 1102 for (int i = 0; i < lenAtom; i++) 1103 { 1104 if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0) 1105 { 1106 return -1; 1107 } 1108 } 1109 } 1110 break; 1111 1112 case OP_POSIXCLASS: 1113 { 1114 // Out of input? 1115 if (search.isEnd(idx)) 1116 { 1117 return -1; 1118 } 1119 1120 switch (opdata) 1121 { 1122 case POSIX_CLASS_ALNUM: 1123 if (!Character.isLetterOrDigit(search.charAt(idx))) 1124 { 1125 return -1; 1126 } 1127 break; 1128 1129 case POSIX_CLASS_ALPHA: 1130 if (!Character.isLetter(search.charAt(idx))) 1131 { 1132 return -1; 1133 } 1134 break; 1135 1136 case POSIX_CLASS_DIGIT: 1137 if (!Character.isDigit(search.charAt(idx))) 1138 { 1139 return -1; 1140 } 1141 break; 1142 1143 case POSIX_CLASS_BLANK: // JWL - bugbug: is this right?? 1144 if (!Character.isSpaceChar(search.charAt(idx))) 1145 { 1146 return -1; 1147 } 1148 break; 1149 1150 case POSIX_CLASS_SPACE: 1151 if (!Character.isWhitespace(search.charAt(idx))) 1152 { 1153 return -1; 1154 } 1155 break; 1156 1157 case POSIX_CLASS_CNTRL: 1158 if (Character.getType(search.charAt(idx)) != Character.CONTROL) 1159 { 1160 return -1; 1161 } 1162 break; 1163 1164 case POSIX_CLASS_GRAPH: // JWL - bugbug??? 
1165 switch (Character.getType(search.charAt(idx))) 1166 { 1167 case Character.MATH_SYMBOL: 1168 case Character.CURRENCY_SYMBOL: 1169 case Character.MODIFIER_SYMBOL: 1170 case Character.OTHER_SYMBOL: 1171 break; 1172 1173 default: 1174 return -1; 1175 } 1176 break; 1177 1178 case POSIX_CLASS_LOWER: 1179 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) 1180 { 1181 return -1; 1182 } 1183 break; 1184 1185 case POSIX_CLASS_UPPER: 1186 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) 1187 { 1188 return -1; 1189 } 1190 break; 1191 1192 case POSIX_CLASS_PRINT: 1193 if (Character.getType(search.charAt(idx)) == Character.CONTROL) 1194 { 1195 return -1; 1196 } 1197 break; 1198 1199 case POSIX_CLASS_PUNCT: 1200 { 1201 int type = Character.getType(search.charAt(idx)); 1202 switch(type) 1203 { 1204 case Character.DASH_PUNCTUATION: 1205 case Character.START_PUNCTUATION: 1206 case Character.END_PUNCTUATION: 1207 case Character.CONNECTOR_PUNCTUATION: 1208 case Character.OTHER_PUNCTUATION: 1209 break; 1210 1211 default: 1212 return -1; 1213 } 1214 } 1215 break; 1216 1217 case POSIX_CLASS_XDIGIT: // JWL - bugbug?? 1218 { 1219 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') || 1220 (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') || 1221 (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F')); 1222 if (!isXDigit) 1223 { 1224 return -1; 1225 } 1226 } 1227 break; 1228 1229 case POSIX_CLASS_JSTART: 1230 if (!Character.isJavaIdentifierStart(search.charAt(idx))) 1231 { 1232 return -1; 1233 } 1234 break; 1235 1236 case POSIX_CLASS_JPART: 1237 if (!Character.isJavaIdentifierPart(search.charAt(idx))) 1238 { 1239 return -1; 1240 } 1241 break; 1242 1243 default: 1244 internalError("Bad posix class"); 1245 break; 1246 } 1247 1248 // Matched. 1249 idx++; 1250 } 1251 break; 1252 1253 case OP_ANYOF: 1254 { 1255 // Out of input? 
1256 if (search.isEnd(idx)) 1257 { 1258 return -1; 1259 } 1260 1261 // Get character to match against character class and maybe casefold 1262 char c = search.charAt(idx); 1263 boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0; 1264 // Loop through character class checking our match character 1265 int idxRange = node + nodeSize; 1266 int idxEnd = idxRange + (opdata * 2); 1267 boolean match = false; 1268 for (int i = idxRange; !match && i < idxEnd; ) 1269 { 1270 // Get start, end and match characters 1271 char s = instruction[i++]; 1272 char e = instruction[i++]; 1273 1274 match = ((compareChars(c, s, caseFold) >= 0) 1275 && (compareChars(c, e, caseFold) <= 0)); 1276 } 1277 1278 // Fail if we didn't match the character class 1279 if (!match) 1280 { 1281 return -1; 1282 } 1283 idx++; 1284 } 1285 break; 1286 1287 case OP_BRANCH: 1288 { 1289 // Check for choices 1290 if (instruction[next + offsetOpcode] != OP_BRANCH) 1291 { 1292 // If there aren't any other choices, just evaluate this branch. 1293 node += nodeSize; 1294 continue; 1295 } 1296 1297 // Try all available branches 1298 short nextBranch; 1299 do 1300 { 1301 // Try matching the branch against the string 1302 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) 1303 { 1304 return idxNew; 1305 } 1306 1307 // Go to next branch (if any) 1308 nextBranch = (short)instruction[node + offsetNext]; 1309 node += nextBranch; 1310 } 1311 while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH)); 1312 1313 // Failed to match any branch! 1314 return -1; 1315 } 1316 1317 case OP_NOTHING: 1318 case OP_GOTO: 1319 1320 // Just advance to the next node without doing anything 1321 break; 1322 1323 case OP_END: 1324 1325 // Match has succeeded! 
1326 setParenEnd(0, idx); 1327 return idx; 1328 1329 default: 1330 1331 // Corrupt program 1332 internalError("Invalid opcode '" + opcode + "'"); 1333 } 1334 1335 // Advance to the next node in the program 1336 node = next; 1337 } 1338 1339 // We "should" never end up here 1340 internalError("Corrupt program"); 1341 return -1; 1342 } 1343 1344 /** 1345 * Match the current regular expression program against the current 1346 * input string, starting at index i of the input string. This method 1347 * is only meant for internal use. 1348 * 1349 * @param i The input string index to start matching at 1350 * @return True if the input matched the expression 1351 */ 1352 protected boolean matchAt(int i) 1353 { 1354 // Initialize start pointer, paren cache and paren count 1355 start0 = -1; 1356 end0 = -1; 1357 start1 = -1; 1358 end1 = -1; 1359 start2 = -1; 1360 end2 = -1; 1361 startn = null; 1362 endn = null; 1363 parenCount = 1; 1364 setParenStart(0, i); 1365 1366 // Allocate backref arrays (unless optimizations indicate otherwise) 1367 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) 1368 { 1369 startBackref = new int[maxParen]; 1370 endBackref = new int[maxParen]; 1371 } 1372 1373 // Match against string 1374 int idx; 1375 if ((idx = matchNodes(0, maxNode, i)) != -1) 1376 { 1377 setParenEnd(0, idx); 1378 return true; 1379 } 1380 1381 // Didn't match 1382 parenCount = 0; 1383 return false; 1384 } 1385 1386 /** 1387 * Matches the current regular expression program against a character array, 1388 * starting at a given index. 1389 * 1390 * @param search String to match against 1391 * @param i Index to start searching at 1392 * @return True if string matched 1393 */ 1394 public boolean match(String search, int i) 1395 { 1396 return match(new StringCharacterIterator(search), i); 1397 } 1398 1399 /** 1400 * Matches the current regular expression program against a character array, 1401 * starting at a given index. 
1402 * 1403 * @param search String to match against 1404 * @param i Index to start searching at 1405 * @return True if string matched 1406 */ 1407 public boolean match(CharacterIterator search, int i) 1408 { 1409 // There is no compiled program to search with! 1410 if (program == null) 1411 { 1412 // This should be uncommon enough to be an error case rather 1413 // than an exception (which would have to be handled everywhere) 1414 internalError("No RE program to run!"); 1415 } 1416 1417 // Save string to search 1418 this.search = search; 1419 1420 // Can we optimize the search by looking for a prefix string? 1421 if (program.prefix == null) 1422 { 1423 // Unprefixed matching must try for a match at each character 1424 for ( ;! search.isEnd(i - 1); i++) 1425 { 1426 // Try a match at index i 1427 if (matchAt(i)) 1428 { 1429 return true; 1430 } 1431 } 1432 return false; 1433 } 1434 else 1435 { 1436 // Prefix-anchored matching is possible 1437 boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0; 1438 char[] prefix = program.prefix; 1439 for ( ; !search.isEnd(i + prefix.length - 1); i++) 1440 { 1441 int j = i; 1442 int k = 0; 1443 1444 boolean match; 1445 do { 1446 // If there's a mismatch of any character in the prefix, give up 1447 match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0); 1448 } while (match && k < prefix.length); 1449 1450 // See if the whole prefix string matched 1451 if (k == prefix.length) 1452 { 1453 // We matched the full prefix at firstChar, so try it 1454 if (matchAt(i)) 1455 { 1456 return true; 1457 } 1458 } 1459 } 1460 return false; 1461 } 1462 } 1463 1464 /** 1465 * Matches the current regular expression program against a String. 1466 * 1467 * @param search String to match against 1468 * @return True if string matched 1469 */ 1470 public boolean match(String search) 1471 { 1472 return match(search, 0); 1473 } 1474 1475 /** 1476 * Splits a string into an array of strings on regular expression boundaries. 
1477 * This function works the same way as the Perl function of the same name. 1478 * Given a regular expression of "[ab]+" and a string to split of 1479 * "xyzzyababbayyzabbbab123", the result would be the array of Strings 1480 * "[xyzzy, yyz, 123]". 1481 * 1482 * <p>Please note that the first string in the resulting array may be an empty 1483 * string. This happens when the very first character of input string is 1484 * matched by the pattern. 1485 * 1486 * @param s String to split on this regular exression 1487 * @return Array of strings 1488 */ 1489 public String[] split(String s) 1490 { 1491 // Create new vector 1492 Vector v = new Vector(); 1493 1494 // Start at position 0 and search the whole string 1495 int pos = 0; 1496 int len = s.length(); 1497 1498 // Try a match at each position 1499 while (pos < len && match(s, pos)) 1500 { 1501 // Get start of match 1502 int start = getParenStart(0); 1503 1504 // Get end of match 1505 int newpos = getParenEnd(0); 1506 1507 // Check if no progress was made 1508 if (newpos == pos) 1509 { 1510 v.addElement(s.substring(pos, start + 1)); 1511 newpos++; 1512 } 1513 else 1514 { 1515 v.addElement(s.substring(pos, start)); 1516 } 1517 1518 // Move to new position 1519 pos = newpos; 1520 } 1521 1522 // Push remainder if it's not empty 1523 String remainder = s.substring(pos); 1524 if (remainder.length() != 0) 1525 { 1526 v.addElement(remainder); 1527 } 1528 1529 // Return vector as an array of strings 1530 String[] ret = new String[v.size()]; 1531 v.copyInto(ret); 1532 return ret; 1533 } 1534 1535 /** 1536 * Flag bit that indicates that subst should replace all occurrences of this 1537 * regular expression. 1538 */ 1539 public static final int REPLACE_ALL = 0x0000; 1540 1541 /** 1542 * Flag bit that indicates that subst should only replace the first occurrence 1543 * of this regular expression. 
1544 */ 1545 public static final int REPLACE_FIRSTONLY = 0x0001; 1546 1547 /** 1548 * Flag bit that indicates that subst should replace backreferences 1549 */ 1550 public static final int REPLACE_BACKREFERENCES = 0x0002; 1551 1552 /** 1553 * Substitutes a string for this regular expression in another string. 1554 * This method works like the Perl function of the same name. 1555 * Given a regular expression of "a*b", a String to substituteIn of 1556 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the 1557 * resulting String returned by subst would be "-foo-garply-wacky-". 1558 * 1559 * @param substituteIn String to substitute within 1560 * @param substitution String to substitute for all matches of this regular expression. 1561 * @return The string substituteIn with zero or more occurrences of the current 1562 * regular expression replaced with the substitution String (if this regular 1563 * expression object doesn't match at any position, the original String is returned 1564 * unchanged). 1565 */ 1566 public String subst(String substituteIn, String substitution) 1567 { 1568 return subst(substituteIn, substitution, REPLACE_ALL); 1569 } 1570 1571 /** 1572 * Substitutes a string for this regular expression in another string. 1573 * This method works like the Perl function of the same name. 1574 * Given a regular expression of "a*b", a String to substituteIn of 1575 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the 1576 * resulting String returned by subst would be "-foo-garply-wacky-". 1577 * <p> 1578 * It is also possible to reference the contents of a parenthesized expression 1579 * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+", 1580 * a String to substituteIn of "visit us: http://www.apache.org!" and the 1581 * substitution String "<a href=\"$0\">$0</a>", the resulting String 1582 * returned by subst would be 1583 * "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!". 
1584 * <p> 1585 * <i>Note:</i> $0 represents the whole match. 1586 * 1587 * @param substituteIn String to substitute within 1588 * @param substitution String to substitute for matches of this regular expression 1589 * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY 1590 * flag bit is set, only the first occurrence of this regular expression is replaced. 1591 * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be 1592 * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will 1593 * be processed. 1594 * @return The string substituteIn with zero or more occurrences of the current 1595 * regular expression replaced with the substitution String (if this regular 1596 * expression object doesn't match at any position, the original String is returned 1597 * unchanged). 1598 */ 1599 public String subst(String substituteIn, String substitution, int flags) 1600 { 1601 // String to return 1602 StringBuffer ret = new StringBuffer(); 1603 1604 // Start at position 0 and search the whole string 1605 int pos = 0; 1606 int len = substituteIn.length(); 1607 1608 // Try a match at each position 1609 while (pos < len && match(substituteIn, pos)) 1610 { 1611 // Append string before match 1612 ret.append(substituteIn.substring(pos, getParenStart(0))); 1613 1614 if ((flags & REPLACE_BACKREFERENCES) != 0) 1615 { 1616 // Process backreferences 1617 int lCurrentPosition = 0; 1618 int lLastPosition = -2; 1619 int lLength = substitution.length(); 1620 boolean bAddedPrefix = false; 1621 1622 while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0) 1623 { 1624 if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\') 1625 && lCurrentPosition+1 < lLength) 1626 { 1627 char c = substitution.charAt(lCurrentPosition + 1); 1628 if (c >= '0' && c <= '9') 1629 { 1630 if (bAddedPrefix == false) 1631 { 1632 // Append everything between the beginning of the 1633 // substitution string 
and the current $ sign 1634 ret.append(substitution.substring(0, lCurrentPosition)); 1635 bAddedPrefix = true; 1636 } 1637 else 1638 { 1639 // Append everything between the last and the current $ sign 1640 ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition)); 1641 } 1642 1643 // Append the parenthesized expression 1644 // Note: if a parenthesized expression of the requested 1645 // index is not available "null" is added to the string 1646 ret.append(getParen(c - '0')); 1647 lLastPosition = lCurrentPosition; 1648 } 1649 } 1650 1651 // Move forward, skipping past match 1652 lCurrentPosition++; 1653 } 1654 1655 // Append everything after the last $ sign 1656 ret.append(substitution.substring(lLastPosition + 2, lLength)); 1657 } 1658 else 1659 { 1660 // Append substitution without processing backreferences 1661 ret.append(substitution); 1662 } 1663 1664 // Move forward, skipping past match 1665 int newpos = getParenEnd(0); 1666 1667 // We always want to make progress! 1668 if (newpos == pos) 1669 { 1670 newpos++; 1671 } 1672 1673 // Try new position 1674 pos = newpos; 1675 1676 // Break out if we're only supposed to replace one occurrence 1677 if ((flags & REPLACE_FIRSTONLY) != 0) 1678 { 1679 break; 1680 } 1681 } 1682 1683 // If there's remaining input, append it 1684 if (pos < len) 1685 { 1686 ret.append(substituteIn.substring(pos)); 1687 } 1688 1689 // Return string buffer as string 1690 return ret.toString(); 1691 } 1692 1693 /** 1694 * Returns an array of Strings, whose toString representation matches a regular 1695 * expression. This method works like the Perl function of the same name. Given 1696 * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz, 1697 * aaaab], the array of Strings returned by grep would be [aab, aaaab]. 1698 * 1699 * @param search Array of Objects to search 1700 * @return Array of Strings whose toString() value matches this regular expression. 
*/
    public String[] grep(Object[] search)
    {
        final Vector hits = new Vector();

        // Test the string form of each element in array order
        int n = 0;
        while (n < search.length)
        {
            final String text = search[n].toString();
            if (match(text))
            {
                hits.addElement(text);
            }
            n++;
        }

        // Copy the matching strings into a String array
        final String[] result = new String[hits.size()];
        hits.copyInto(result);
        return result;
    }

    /**
     * Tells whether the character at a given index of the <code>search</code>
     * input is one of the line terminators LF, CR, NEL (U+0085), LINE SEPARATOR
     * (U+2028) or PARAGRAPH SEPARATOR (U+2029).
     *
     * @param i index into the <code>search</code> input
     * @return true if character at i-th position in the <code>search</code> string is a newline
     */
    private boolean isNewline(int i)
    {
        switch (search.charAt(i))
        {
            case '\n':
            case '\r':
            case '\u0085':
            case '\u2028':
            case '\u2029':
                return true;

            default:
                return false;
        }
    }

    /**
     * Compares two characters by code unit value, optionally after converting
     * both to lower case.
     *
     * @param c1 first character to compare.
     * @param c2 second character to compare.
     * @param caseIndependent whether comparison is case insensitive or not.
     * @return negative, 0, or positive integer as the first character
     *         less than, equal to, or greater than the second.
     */
    private int compareChars(char c1, char c2, boolean caseIndependent)
    {
        final char left = caseIndependent ? Character.toLowerCase(c1) : c1;
        final char right = caseIndependent ? Character.toLowerCase(c2) : c2;

        // Both operands are promoted to int, so the difference cannot overflow
        return left - right;
    }
}