Old src/com/sun/org/apache/regexp/internal/RE.java

   1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 1999-2004 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.regexp.internal;
  22 
  23 import java.io.Serializable;
  24 import java.util.Vector;
  25 
  26 /**
  27  * RE is an efficient, lightweight regular expression evaluator/matcher
  28  * class. Regular expressions are pattern descriptions which enable
  29  * sophisticated matching of strings.  In addition to being able to
  30  * match a string against a pattern, you can also extract parts of the
  31  * match.  This is especially useful in text parsing! Details on the
  32  * syntax of regular expression patterns are given below.
  33  *
  34  * <p>
  35  * To compile a regular expression (RE), you can simply construct an RE
  36  * matcher object from the string specification of the pattern, like this:
  37  *
  38  * <pre>
  39  *  RE r = new RE("a*b");
  40  * </pre>
  41  *
  42  * <p>
  43  * Once you have done this, you can call either of the RE.match methods to
  44  * perform matching on a String.  For example:
  45  *
  46  * <pre>
  47  *  boolean matched = r.match("aaaab");
  48  * </pre>
  49  *
  50  * will cause the boolean matched to be set to true because the
  51  * pattern "a*b" matches the string "aaaab".
  52  *
  53  * <p>
  54  * If you were interested in the <i>number</i> of a's which matched the
  55  * first part of our example expression, you could change the expression to
  56  * "(a*)b".  Then when you compiled the expression and matched it against
  57  * something like "xaaaab", you would get results like this:
  58  *
  59  * <pre>
  60  *  RE r = new RE("(a*)b");                  // Compile expression
  61  *  boolean matched = r.match("xaaaab");     // Match against "xaaaab"
  62  *
  63  *  String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
  64  *  String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
  65  *
  66  *  int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
  67  *  int endWholeExpr = r.getParenEnd(0);     // endWholeExpr will be index 6
  68  *  int lenWholeExpr = r.getParenLength(0);  // lenWholeExpr will be 5
  69  *
  70  *  int startInside = r.getParenStart(1);    // startInside will be index 1
  71  *  int endInside = r.getParenEnd(1);        // endInside will be index 5
  72  *  int lenInside = r.getParenLength(1);     // lenInside will be 4
  73  * </pre>
  74  *
  75  * You can also refer to the contents of a parenthesized expression
  76  * within a regular expression itself.  This is called a
  77  * 'backreference'.  The first backreference in a regular expression is
  78  * denoted by \1, the second by \2 and so on.  So the expression:
  79  *
  80  * <pre>
  81  *  ([0-9]+)=\1
  82  * </pre>
  83  *
  84  * will match any string of the form n=n (like 0=0 or 2=2).
  85  *
  86  * <p>
  87  * The full regular expression syntax accepted by RE is described here:
  88  *
  89  * <pre>
  90  *
  91  *  <b><font face=times roman>Characters</font></b>
  92  *
  93  *    <i>unicodeChar</i>   Matches any identical unicode character
  94  *    \                    Used to quote a meta-character (like '*')
  95  *    \\                   Matches a single '\' character
  96  *    \0nnn                Matches a given octal character
  97  *    \xhh                 Matches a given 8-bit hexadecimal character
  98  *    \\uhhhh              Matches a given 16-bit hexadecimal character
  99  *    \t                   Matches an ASCII tab character
 100  *    \n                   Matches an ASCII newline character
 101  *    \r                   Matches an ASCII return character
 102  *    \f                   Matches an ASCII form feed character
 103  *
 104  *
 105  *  <b><font face=times roman>Character Classes</font></b>
 106  *
 107  *    [abc]                Simple character class
 108  *    [a-zA-Z]             Character class with ranges
 109  *    [^abc]               Negated character class
 110  * </pre>
 111  *
 112  * <b>NOTE:</b> Incomplete ranges will be interpreted as &quot;starts
 113  * from zero&quot; or &quot;ends with last character&quot;.
 114  * <br>
 115  * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
 116  * [-] means &quot;all characters&quot;.
 117  *
 118  * <pre>
 119  *
 120  *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
 121  *
 122  *    [:alnum:]            Alphanumeric characters.
 123  *    [:alpha:]            Alphabetic characters.
 124  *    [:blank:]            Space and tab characters.
 125  *    [:cntrl:]            Control characters.
 126  *    [:digit:]            Numeric characters.
 127  *    [:graph:]            Characters that are printable and are also visible.
 128  *                         (A space is printable, but not visible, while an
 129  *                         `a' is both.)
 130  *    [:lower:]            Lower-case alphabetic characters.
 131  *    [:print:]            Printable characters (characters that are not
 132  *                         control characters.)
 133  *    [:punct:]            Punctuation characters (characters that are not letter,
 134  *                         digits, control characters, or space characters).
 135  *    [:space:]            Space characters (such as space, tab, and formfeed,
 136  *                         to name a few).
 137  *    [:upper:]            Upper-case alphabetic characters.
 138  *    [:xdigit:]           Characters that are hexadecimal digits.
 139  *
 140  *
 141  *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
 142  *
 143  *    [:javastart:]        Start of a Java identifier
 144  *    [:javapart:]         Part of a Java identifier
 145  *
 146  *
 147  *  <b><font face=times roman>Predefined Classes</font></b>
 148  *
 149  *    .         Matches any character other than newline
 150  *    \w        Matches a "word" character (alphanumeric plus "_")
 151  *    \W        Matches a non-word character
 152  *    \s        Matches a whitespace character
 153  *    \S        Matches a non-whitespace character
 154  *    \d        Matches a digit character
 155  *    \D        Matches a non-digit character
 156  *
 157  *
 158  *  <b><font face=times roman>Boundary Matchers</font></b>
 159  *
 160  *    ^         Matches only at the beginning of a line
 161  *    $         Matches only at the end of a line
 162  *    \b        Matches only at a word boundary
 163  *    \B        Matches only at a non-word boundary
 164  *
 165  *
 166  *  <b><font face=times roman>Greedy Closures</font></b>
 167  *
 168  *    A*        Matches A 0 or more times (greedy)
 169  *    A+        Matches A 1 or more times (greedy)
 170  *    A?        Matches A 1 or 0 times (greedy)
 171  *    A{n}      Matches A exactly n times (greedy)
 172  *    A{n,}     Matches A at least n times (greedy)
 173  *    A{n,m}    Matches A at least n but not more than m times (greedy)
 174  *
 175  *
 176  *  <b><font face=times roman>Reluctant Closures</font></b>
 177  *
 178  *    A*?       Matches A 0 or more times (reluctant)
 179  *    A+?       Matches A 1 or more times (reluctant)
 180  *    A??       Matches A 0 or 1 times (reluctant)
 181  *
 182  *
 183  *  <b><font face=times roman>Logical Operators</font></b>
 184  *
 185  *    AB        Matches A followed by B
 186  *    A|B       Matches either A or B
 187  *    (A)       Used for subexpression grouping
 188  *   (?:A)      Used for subexpression clustering (just like grouping but
 189  *              no backrefs)
 190  *
 191  *
 192  *  <b><font face=times roman>Backreferences</font></b>
 193  *
 194  *    \1    Backreference to 1st parenthesized subexpression
 195  *    \2    Backreference to 2nd parenthesized subexpression
 196  *    \3    Backreference to 3rd parenthesized subexpression
 197  *    \4    Backreference to 4th parenthesized subexpression
 198  *    \5    Backreference to 5th parenthesized subexpression
 199  *    \6    Backreference to 6th parenthesized subexpression
 200  *    \7    Backreference to 7th parenthesized subexpression
 201  *    \8    Backreference to 8th parenthesized subexpression
 202  *    \9    Backreference to 9th parenthesized subexpression
 203  * </pre>
 204  *
 205  * <p>
 206  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
 207  * that they match as many elements of the string as possible without
 208  * causing the overall match to fail.  If you want a closure to be
 209  * reluctant (non-greedy), you can simply follow it with a '?'.  A
 210  * reluctant closure will match as few elements of the string as
 211  * possible when finding matches.  {m,n} closures don't currently
 212  * support reluctancy.
 213  *
 214  * <p>
 215  * <b><font face="times roman">Line terminators</font></b>
 216  * <br>
 217  * A line terminator is a one- or two-character sequence that marks
 218  * the end of a line of the input character sequence. The following
 219  * are recognized as line terminators:
 220  * <ul>
 221  * <li>A newline (line feed) character ('\n'),</li>
 222  * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
 223  * <li>A standalone carriage-return character ('\r'),</li>
 224  * <li>A next-line character ('\u0085'),</li>
 225  * <li>A line-separator character ('\u2028'), or</li>
 226  * <li>A paragraph-separator character ('\u2029).</li>
 227  * </ul>
 228  *
 229  * <p>
 230  * RE runs programs compiled by the RECompiler class.  But the RE
 231  * matcher class does not include the actual regular expression compiler
 232  * for reasons of efficiency.  In fact, if you want to pre-compile one
 233  * or more regular expressions, the 'recompile' class can be invoked
 234  * from the command line to produce compiled output like this:
 235  *
 236  * <pre>
 237  *    // Pre-compiled regular expression "a*b"
 238  *    char[] re1Instructions =
 239  *    {
 240  *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
 241  *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
 242  *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
 243  *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
 244  *        0x0000,
 245  *    };
 246  *
 247  *
 248  *    REProgram re1 = new REProgram(re1Instructions);
 249  * </pre>
 250  *
 251  * You can then construct a regular expression matcher (RE) object from
 252  * the pre-compiled expression re1 and thus avoid the overhead of
 253  * compiling the expression at runtime. If you require more dynamic
 254  * regular expressions, you can construct a single RECompiler object and
 255  * re-use it to compile each expression. Similarly, you can change the
 256  * program run by a given matcher object at any time. However, RE and
 257  * RECompiler are not threadsafe (for efficiency reasons, and because
 258  * requiring thread safety in this class is deemed to be a rare
 259  * requirement), so you will need to construct a separate compiler or
 260  * matcher object for each thread (unless you do thread synchronization
 261  * yourself). Once expression compiled into the REProgram object, REProgram
 262  * can be safely shared across multiple threads and RE objects.
 263  *
 264  * <br><p><br>
 265  *
 266  * <font color="red">
 267  * <i>ISSUES:</i>
 268  *
 269  * <ul>
 270  *  <li>com.weusours.util.re is not currently compatible with all
 271  *      standard POSIX regcomp flags</li>
 272  *  <li>com.weusours.util.re does not support POSIX equivalence classes
 273  *      ([=foo=] syntax) (I18N/locale issue)</li>
 274  *  <li>com.weusours.util.re does not support nested POSIX character
 275  *      classes (definitely should, but not completely trivial)</li>
 276  *  <li>com.weusours.util.re Does not support POSIX character collation
 277  *      concepts ([.foo.] syntax) (I18N/locale issue)</li>
 278  *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
 279  *  <li>Should RE support character iterators (for backwards RE matching!)?</li>
 280  *  <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
 281  *  <li>Not *all* possibilities are considered for greediness when backreferences
 282  *      are involved (as POSIX suggests should be the case).  The POSIX RE
 283  *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
 284  *      of acdacaa where \1 is "a".  This is not the case in this RE package,
 285  *      and actually Perl doesn't go to this extent either!  Until someone
 286  *      actually complains about this, I'm not sure it's worth "fixing".
 287  *      If it ever is fixed, test #137 in RETest.txt should be updated.</li>
 288  * </ul>
 289  *
 290  * </font>
 291  *
 292  * @see recompile
 293  * @see RECompiler
 294  *
 295  * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
 296  * @author <a href="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
 297  */
 298 public class RE implements Serializable
 299 {
 300     /**
 301      * Specifies normal, case-sensitive matching behaviour.
 302      */
 303     public static final int MATCH_NORMAL          = 0x0000;
 304 
 305     /**
 306      * Flag to indicate that matching should be case-independent (folded)
 307      */
 308     public static final int MATCH_CASEINDEPENDENT = 0x0001;
 309 
 310     /**
 311      * Newlines should match as BOL/EOL (^ and $)
 312      */
 313     public static final int MATCH_MULTILINE       = 0x0002;
 314 
 315     /**
 316      * Consider all input a single body of text - newlines are matched by .
 317      */
 318     public static final int MATCH_SINGLELINE      = 0x0004;
 319 
 320     /************************************************
 321      *                                              *
 322      * The format of a node in a program is:        *
 323      *                                              *
 324      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
 325      *                                              *
 326      * char OPCODE - instruction                    *
 327      * char OPDATA - modifying data                 *
 328      * char OPNEXT - next node (relative offset)    *
 329      *                                              *
 330      ************************************************/
 331 
 332                  //   Opcode              Char       Opdata/Operand  Meaning
 333                  //   ----------          ---------- --------------- --------------------------------------------------
 334     static final char OP_END              = 'E';  //                 end of program
 335     static final char OP_BOL              = '^';  //                 match only if at beginning of line
 336     static final char OP_EOL              = '$';  //                 match only if at end of line
 337     static final char OP_ANY              = '.';  //                 match any single character except newline
 338     static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
 339     static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
 340     static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
 341     static final char OP_STAR             = '*';  // node            kleene closure
 342     static final char OP_PLUS             = '+';  // node            positive closure
 343     static final char OP_MAYBE            = '?';  // node            optional closure
 344     static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
 345     static final char OP_OPEN             = '(';  // number          nth opening paren
 346     static final char OP_OPEN_CLUSTER     = '<';  //                 opening cluster
 347     static final char OP_CLOSE            = ')';  // number          nth closing paren
 348     static final char OP_CLOSE_CLUSTER    = '>';  //                 closing cluster
 349     static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
 350     static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
 351     static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
 352     static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
 353     static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
 354     static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
 355     static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes
 356 
 357     // Escape codes
 358     static final char E_ALNUM             = 'w';  // Alphanumeric
 359     static final char E_NALNUM            = 'W';  // Non-alphanumeric
 360     static final char E_BOUND             = 'b';  // Word boundary
 361     static final char E_NBOUND            = 'B';  // Non-word boundary
 362     static final char E_SPACE             = 's';  // Whitespace
 363     static final char E_NSPACE            = 'S';  // Non-whitespace
 364     static final char E_DIGIT             = 'd';  // Digit
 365     static final char E_NDIGIT            = 'D';  // Non-digit
 366 
 367     // Posix character classes
 368     static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
 369     static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics
 370     static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
 371     static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
 372     static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
 373     static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
 374     static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
 375     static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
 376     static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
 377     static final char POSIX_CLASS_SPACE   = 's';  // Spaces
 378     static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
 379     static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
 380     static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
 381     static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part
 382 
 383     // Limits
 384     static final int maxNode  = 65536;            // Maximum number of nodes in a program
 385     static final int MAX_PAREN = 16;              // Number of paren pairs (only 9 can be backrefs)
 386 
 387     // Node layout constants
 388     static final int offsetOpcode = 0;            // Opcode offset (first character)
 389     static final int offsetOpdata = 1;            // Opdata offset (second char)
 390     static final int offsetNext   = 2;            // Next index offset (third char)
 391     static final int nodeSize     = 3;            // Node size (in chars)
 392 
 393     // State of current program
 394     REProgram program;                            // Compiled regular expression 'program'
 395     transient CharacterIterator search;           // The string being matched against
 396     int matchFlags;                               // Match behaviour flags
 397     int maxParen = MAX_PAREN;
 398 
 399     // Parenthesized subexpressions
 400     transient int parenCount;                     // Number of subexpressions matched (num open parens + 1)
 401     transient int start0;                         // Cache of start[0]
 402     transient int end0;                           // Cache of start[0]
 403     transient int start1;                         // Cache of start[1]
 404     transient int end1;                           // Cache of start[1]
 405     transient int start2;                         // Cache of start[2]
 406     transient int end2;                           // Cache of start[2]
 407     transient int[] startn;                       // Lazy-alloced array of sub-expression starts
 408     transient int[] endn;                         // Lazy-alloced array of sub-expression ends
 409 
 410     // Backreferences
 411     transient int[] startBackref;                 // Lazy-alloced array of backref starts
 412     transient int[] endBackref;                   // Lazy-alloced array of backref ends
 413 
 414     /**
 415      * Constructs a regular expression matcher from a String by compiling it
 416      * using a new instance of RECompiler.  If you will be compiling many
 417      * expressions, you may prefer to use a single RECompiler object instead.
 418      *
 419      * @param pattern The regular expression pattern to compile.
 420      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
 421      * @see RECompiler
 422      * @see recompile
 423      */
 424     public RE(String pattern) throws RESyntaxException
 425     {
 426         this(pattern, MATCH_NORMAL);
 427     }
 428 
 429     /**
 430      * Constructs a regular expression matcher from a String by compiling it
 431      * using a new instance of RECompiler.  If you will be compiling many
 432      * expressions, you may prefer to use a single RECompiler object instead.
 433      *
 434      * @param pattern The regular expression pattern to compile.
 435      * @param matchFlags The matching style
 436      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
 437      * @see RECompiler
 438      * @see recompile
 439      */
 440     public RE(String pattern, int matchFlags) throws RESyntaxException
 441     {
 442         this(new RECompiler().compile(pattern));
 443         setMatchFlags(matchFlags);
 444     }
 445 
 446     /**
 447      * Construct a matcher for a pre-compiled regular expression from program
 448      * (bytecode) data.  Permits special flags to be passed in to modify matching
 449      * behaviour.
 450      *
 451      * @param program Compiled regular expression program (see RECompiler and/or recompile)
 452      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
 453      *
 454      * <pre>
 455      *   MATCH_NORMAL              // Normal (case-sensitive) matching
 456      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
 457      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
 458      * </pre>
 459      *
 460      * @see RECompiler
 461      * @see REProgram
 462      * @see recompile
 463      */
 464     public RE(REProgram program, int matchFlags)
 465     {
 466         setProgram(program);
 467         setMatchFlags(matchFlags);
 468     }
 469 
 470     /**
 471      * Construct a matcher for a pre-compiled regular expression from program
 472      * (bytecode) data.
 473      *
 474      * @param program Compiled regular expression program
 475      * @see RECompiler
 476      * @see recompile
 477      */
 478     public RE(REProgram program)
 479     {
 480         this(program, MATCH_NORMAL);
 481     }
 482 
 483     /**
 484      * Constructs a regular expression matcher with no initial program.
 485      * This is likely to be an uncommon practice, but is still supported.
 486      */
 487     public RE()
 488     {
 489         this((REProgram)null, MATCH_NORMAL);
 490     }
 491 
 492     /**
 493      * Converts a 'simplified' regular expression to a full regular expression
 494      *
 495      * @param pattern The pattern to convert
 496      * @return The full regular expression
 497      */
 498     public static String simplePatternToFullRegularExpression(String pattern)
 499     {
 500         StringBuffer buf = new StringBuffer();
 501         for (int i = 0; i < pattern.length(); i++)
 502         {
 503             char c = pattern.charAt(i);
 504             switch (c)
 505             {
 506                 case '*':
 507                     buf.append(".*");
 508                     break;
 509 
 510                 case '.':
 511                 case '[':
 512                 case ']':
 513                 case '\\':
 514                 case '+':
 515                 case '?':
 516                 case '{':
 517                 case '}':
 518                 case '$':
 519                 case '^':
 520                 case '|':
 521                 case '(':
 522                 case ')':
 523                     buf.append('\\');
 524                 default:
 525                     buf.append(c);
 526                     break;
 527             }
 528         }
 529         return buf.toString();
 530     }
 531 
 532     /**
 533      * Sets match behaviour flags which alter the way RE does matching.
 534      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
 535      *
 536      * <pre>
 537      *   MATCH_NORMAL              // Normal (case-sensitive) matching
 538      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
 539      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
 540      * </pre>
 541      */
 542     public void setMatchFlags(int matchFlags)
 543     {
 544         this.matchFlags = matchFlags;
 545     }
 546 
 547     /**
 548      * Returns the current match behaviour flags.
 549      * @return Current match behaviour flags (RE.MATCH_*).
 550      *
 551      * <pre>
 552      *   MATCH_NORMAL              // Normal (case-sensitive) matching
 553      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
 554      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
 555      * </pre>
 556      *
 557      * @see #setMatchFlags
 558      */
 559     public int getMatchFlags()
 560     {
 561         return matchFlags;
 562     }
 563 
 564     /**
 565      * Sets the current regular expression program used by this matcher object.
 566      *
 567      * @param program Regular expression program compiled by RECompiler.
 568      * @see RECompiler
 569      * @see REProgram
 570      * @see recompile
 571      */
 572     public void setProgram(REProgram program)
 573     {
 574         this.program = program;
 575         if (program != null && program.maxParens != -1) {
 576             this.maxParen = program.maxParens;
 577         } else {
 578             this.maxParen = MAX_PAREN;
 579         }
 580     }
 581 
 582     /**
 583      * Returns the current regular expression program in use by this matcher object.
 584      *
 585      * @return Regular expression program
 586      * @see #setProgram
 587      */
 588     public REProgram getProgram()
 589     {
 590         return program;
 591     }
 592 
 593     /**
 594      * Returns the number of parenthesized subexpressions available after a successful match.
 595      *
 596      * @return Number of available parenthesized subexpressions
 597      */
 598     public int getParenCount()
 599     {
 600         return parenCount;
 601     }
 602 
 603     /**
 604      * Gets the contents of a parenthesized subexpression after a successful match.
 605      *
 606      * @param which Nesting level of subexpression
 607      * @return String
 608      */
 609     public String getParen(int which)
 610     {
 611         int start;
 612         if (which < parenCount && (start = getParenStart(which)) >= 0)
 613         {
 614             return search.substring(start, getParenEnd(which));
 615         }
 616         return null;
 617     }
 618 
 619     /**
 620      * Returns the start index of a given paren level.
 621      *
 622      * @param which Nesting level of subexpression
 623      * @return String index
 624      */
 625     public final int getParenStart(int which)
 626     {
 627         if (which < parenCount)
 628         {
 629             switch (which)
 630             {
 631                 case 0:
 632                     return start0;
 633 
 634                 case 1:
 635                     return start1;
 636 
 637                 case 2:
 638                     return start2;
 639 
 640                 default:
 641                     if (startn == null)
 642                     {
 643                         allocParens();
 644                     }
 645                     return startn[which];
 646             }
 647         }
 648         return -1;
 649     }
 650 
 651     /**
 652      * Returns the end index of a given paren level.
 653      *
 654      * @param which Nesting level of subexpression
 655      * @return String index
 656      */
 657     public final int getParenEnd(int which)
 658     {
 659         if (which < parenCount)
 660         {
 661             switch (which)
 662             {
 663                 case 0:
 664                     return end0;
 665 
 666                 case 1:
 667                     return end1;
 668 
 669                 case 2:
 670                     return end2;
 671 
 672                 default:
 673                     if (endn == null)
 674                     {
 675                         allocParens();
 676                     }
 677                     return endn[which];
 678             }
 679         }
 680         return -1;
 681     }
 682 
 683     /**
 684      * Returns the length of a given paren level.
 685      *
 686      * @param which Nesting level of subexpression
 687      * @return Number of characters in the parenthesized subexpression
 688      */
 689     public final int getParenLength(int which)
 690     {
 691         if (which < parenCount)
 692         {
 693             return getParenEnd(which) - getParenStart(which);
 694         }
 695         return -1;
 696     }
 697 
 698     /**
 699      * Sets the start of a paren level
 700      *
 701      * @param which Which paren level
 702      * @param i Index in input array
 703      */
 704     protected final void setParenStart(int which, int i)
 705     {
 706         if (which < parenCount)
 707         {
 708             switch (which)
 709             {
 710                 case 0:
 711                     start0 = i;
 712                     break;
 713 
 714                 case 1:
 715                     start1 = i;
 716                     break;
 717 
 718                 case 2:
 719                     start2 = i;
 720                     break;
 721 
 722                 default:
 723                     if (startn == null)
 724                     {
 725                         allocParens();
 726                     }
 727                     startn[which] = i;
 728                     break;
 729             }
 730         }
 731     }
 732 
 733     /**
 734      * Sets the end of a paren level
 735      *
 736      * @param which Which paren level
 737      * @param i Index in input array
 738      */
 739     protected final void setParenEnd(int which, int i)
 740     {
 741         if (which < parenCount)
 742         {
 743             switch (which)
 744             {
 745                 case 0:
 746                     end0 = i;
 747                     break;
 748 
 749                 case 1:
 750                     end1 = i;
 751                     break;
 752 
 753                 case 2:
 754                     end2 = i;
 755                     break;
 756 
 757                 default:
 758                     if (endn == null)
 759                     {
 760                         allocParens();
 761                     }
 762                     endn[which] = i;
 763                     break;
 764             }
 765         }
 766     }
 767 
 768     /**
 769      * Throws an Error representing an internal error condition probably resulting
 770      * from a bug in the regular expression compiler (or possibly data corruption).
 771      * In practice, this should be very rare.
 772      *
 773      * @param s Error description
 774      */
 775     protected void internalError(String s) throws Error
 776     {
 777         throw new Error("RE internal error: " + s);
 778     }
 779 
 780     /**
 781      * Performs lazy allocation of subexpression arrays
 782      */
 783     private final void allocParens()
 784     {
 785         // Allocate arrays for subexpressions
 786         startn = new int[maxParen];
 787         endn = new int[maxParen];
 788 
 789         // Set sub-expression pointers to invalid values
 790         for (int i = 0; i < maxParen; i++)
 791         {
 792             startn[i] = -1;
 793             endn[i] = -1;
 794         }
 795     }
 796 
 797     /**
 798      * Try to match a string against a subset of nodes in the program
 799      *
 800      * @param firstNode Node to start at in program
 801      * @param lastNode  Last valid node (used for matching a subexpression without
 802      *                  matching the rest of the program as well).
 803      * @param idxStart  Starting position in character array
 804      * @return Final input array index if match succeeded.  -1 if not.
 805      */
 806     protected int matchNodes(int firstNode, int lastNode, int idxStart)
 807     {
 808         // Our current place in the string
 809         int idx = idxStart;
 810 
 811         // Loop while node is valid
 812         int next, opcode, opdata;
 813         int idxNew;
 814         char[] instruction = program.instruction;
 815         for (int node = firstNode; node < lastNode; )
 816         {
 817             opcode = instruction[node + offsetOpcode];
 818             next   = node + (short)instruction[node + offsetNext];
 819             opdata = instruction[node + offsetOpdata];
 820 
 821             switch (opcode)
 822             {
 823                 case OP_RELUCTANTMAYBE:
 824                     {
 825                         int once = 0;
 826                         do
 827                         {
 828                             // Try to match the rest without using the reluctant subexpr
 829                             if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
 830                             {
 831                                 return idxNew;
 832                             }
 833                         }
 834                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
 835                         return -1;
 836                     }
 837 
 838                 case OP_RELUCTANTPLUS:
 839                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
 840                     {
 841                         // Try to match the rest without using the reluctant subexpr
 842                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
 843                         {
 844                             return idxNew;
 845                         }
 846                     }
 847                     return -1;
 848 
 849                 case OP_RELUCTANTSTAR:
 850                     do
 851                     {
 852                         // Try to match the rest without using the reluctant subexpr
 853                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
 854                         {
 855                             return idxNew;
 856                         }
 857                     }
 858                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
 859                     return -1;
 860 
 861                 case OP_OPEN:
 862 
 863                     // Match subexpression
 864                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
 865                     {
 866                         startBackref[opdata] = idx;
 867                     }
 868                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
 869                     {
 870                         // Increase valid paren count
 871                         if ((opdata + 1) > parenCount)
 872                         {
 873                             parenCount = opdata + 1;
 874                         }
 875 
 876                         // Don't set paren if already set later on
 877                         if (getParenStart(opdata) == -1)
 878                         {
 879                             setParenStart(opdata, idx);
 880                         }
 881                     }
 882                     return idxNew;
 883 
 884                 case OP_CLOSE:
 885 
 886                     // Done matching subexpression
 887                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
 888                     {
 889                         endBackref[opdata] = idx;
 890                     }
 891                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
 892                     {
 893                         // Increase valid paren count
 894                         if ((opdata + 1) > parenCount)
 895                         {
 896                             parenCount = opdata + 1;
 897                         }
 898 
 899                         // Don't set paren if already set later on
 900                         if (getParenEnd(opdata) == -1)
 901                         {
 902                             setParenEnd(opdata, idx);
 903                         }
 904                     }
 905                     return idxNew;
 906 
 907                 case OP_OPEN_CLUSTER:
 908                 case OP_CLOSE_CLUSTER:
 909                     // starting or ending the matching of a subexpression which has no backref.
 910                     return matchNodes( next, maxNode, idx );
 911 
 912                 case OP_BACKREF:
 913                     {
 914                         // Get the start and end of the backref
 915                         int s = startBackref[opdata];
 916                         int e = endBackref[opdata];
 917 
 918                         // We don't know the backref yet
 919                         if (s == -1 || e == -1)
 920                         {
 921                             return -1;
 922                         }
 923 
 924                         // The backref is empty size
 925                         if (s == e)
 926                         {
 927                             break;
 928                         }
 929 
 930                         // Get the length of the backref
 931                         int l = e - s;
 932 
 933                         // If there's not enough input left, give up.
 934                         if (search.isEnd(idx + l - 1))
 935                         {
 936                             return -1;
 937                         }
 938 
 939                         // Case fold the backref?
 940                         final boolean caseFold =
 941                             ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
 942                         // Compare backref to input
 943                         for (int i = 0; i < l; i++)
 944                         {
 945                             if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
 946                             {
 947                                 return -1;
 948                             }
 949                         }
 950                     }
 951                     break;
 952 
 953                 case OP_BOL:
 954 
 955                     // Fail if we're not at the start of the string
 956                     if (idx != 0)
 957                     {
 958                         // If we're multiline matching, we could still be at the start of a line
 959                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
 960                         {
 961                             // If not at start of line, give up
 962                             if (idx <= 0 || !isNewline(idx - 1)) {
 963                                 return -1;
 964                             } else {
 965                                 break;
 966                             }
 967                         }
 968                         return -1;
 969                     }
 970                     break;
 971 
 972                 case OP_EOL:
 973 
 974                     // If we're not at the end of string
 975                     if (!search.isEnd(0) && !search.isEnd(idx))
 976                     {
 977                         // If we're multi-line matching
 978                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
 979                         {
 980                             // Give up if we're not at the end of a line
 981                             if (!isNewline(idx)) {
 982                                 return -1;
 983                             } else {
 984                                 break;
 985                             }
 986                         }
 987                         return -1;
 988                     }
 989                     break;
 990 
 991                 case OP_ESCAPE:
 992 
 993                     // Which escape?
 994                     switch (opdata)
 995                     {
 996                         // Word boundary match
 997                         case E_NBOUND:
 998                         case E_BOUND:
 999                             {
1000                                 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
1001                                 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
1002                                 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1003                                 {
1004                                     return -1;
1005                                 }
1006                             }
1007                             break;
1008 
1009                         // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1010                         case E_ALNUM:
1011                         case E_NALNUM:
1012                         case E_DIGIT:
1013                         case E_NDIGIT:
1014                         case E_SPACE:
1015                         case E_NSPACE:
1016 
1017                             // Give up if out of input
1018                             if (search.isEnd(idx))
1019                             {
1020                                 return -1;
1021                             }
1022 
1023                             char c = search.charAt(idx);
1024 
1025                             // Switch on escape
1026                             switch (opdata)
1027                             {
1028                                 case E_ALNUM:
1029                                 case E_NALNUM:
1030                                     if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1031                                     {
1032                                         return -1;
1033                                     }
1034                                     break;
1035 
1036                                 case E_DIGIT:
1037                                 case E_NDIGIT:
1038                                     if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1039                                     {
1040                                         return -1;
1041                                     }
1042                                     break;
1043 
1044                                 case E_SPACE:
1045                                 case E_NSPACE:
1046                                     if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1047                                     {
1048                                         return -1;
1049                                     }
1050                                     break;
1051                             }
1052                             idx++;
1053                             break;
1054 
1055                         default:
1056                             internalError("Unrecognized escape '" + opdata + "'");
1057                     }
1058                     break;
1059 
1060                 case OP_ANY:
1061 
1062                     if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1063                         // Match anything
1064                         if (search.isEnd(idx))
1065                         {
1066                             return -1;
1067                         }
1068                     }
1069                     else
1070                     {
1071                         // Match anything but a newline
1072                         if (search.isEnd(idx) || isNewline(idx))
1073                         {
1074                             return -1;
1075                         }
1076                     }
1077                     idx++;
1078                     break;
1079 
1080                 case OP_ATOM:
1081                     {
1082                         // Match an atom value
1083                         if (search.isEnd(idx))
1084                         {
1085                             return -1;
1086                         }
1087 
1088                         // Get length of atom and starting index
1089                         int lenAtom = opdata;
1090                         int startAtom = node + nodeSize;
1091 
1092                         // Give up if not enough input remains to have a match
1093                         if (search.isEnd(lenAtom + idx - 1))
1094                         {
1095                             return -1;
1096                         }
1097 
1098                         // Match atom differently depending on casefolding flag
1099                         final boolean caseFold =
1100                             ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
1101 
1102                         for (int i = 0; i < lenAtom; i++)
1103                         {
1104                             if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
1105                             {
1106                                 return -1;
1107                             }
1108                         }
1109                     }
1110                     break;
1111 
1112                 case OP_POSIXCLASS:
1113                     {
1114                         // Out of input?
1115                         if (search.isEnd(idx))
1116                         {
1117                             return -1;
1118                         }
1119 
1120                         switch (opdata)
1121                         {
1122                             case POSIX_CLASS_ALNUM:
1123                                 if (!Character.isLetterOrDigit(search.charAt(idx)))
1124                                 {
1125                                     return -1;
1126                                 }
1127                                 break;
1128 
1129                             case POSIX_CLASS_ALPHA:
1130                                 if (!Character.isLetter(search.charAt(idx)))
1131                                 {
1132                                     return -1;
1133                                 }
1134                                 break;
1135 
1136                             case POSIX_CLASS_DIGIT:
1137                                 if (!Character.isDigit(search.charAt(idx)))
1138                                 {
1139                                     return -1;
1140                                 }
1141                                 break;
1142 
1143                             case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1144                                 if (!Character.isSpaceChar(search.charAt(idx)))
1145                                 {
1146                                     return -1;
1147                                 }
1148                                 break;
1149 
1150                             case POSIX_CLASS_SPACE:
1151                                 if (!Character.isWhitespace(search.charAt(idx)))
1152                                 {
1153                                     return -1;
1154                                 }
1155                                 break;
1156 
1157                             case POSIX_CLASS_CNTRL:
1158                                 if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1159                                 {
1160                                     return -1;
1161                                 }
1162                                 break;
1163 
1164                             case POSIX_CLASS_GRAPH: // JWL - bugbug???
1165                                 switch (Character.getType(search.charAt(idx)))
1166                                 {
1167                                     case Character.MATH_SYMBOL:
1168                                     case Character.CURRENCY_SYMBOL:
1169                                     case Character.MODIFIER_SYMBOL:
1170                                     case Character.OTHER_SYMBOL:
1171                                         break;
1172 
1173                                     default:
1174                                         return -1;
1175                                 }
1176                                 break;
1177 
1178                             case POSIX_CLASS_LOWER:
1179                                 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1180                                 {
1181                                     return -1;
1182                                 }
1183                                 break;
1184 
1185                             case POSIX_CLASS_UPPER:
1186                                 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1187                                 {
1188                                     return -1;
1189                                 }
1190                                 break;
1191 
1192                             case POSIX_CLASS_PRINT:
1193                                 if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1194                                 {
1195                                     return -1;
1196                                 }
1197                                 break;
1198 
1199                             case POSIX_CLASS_PUNCT:
1200                             {
1201                                 int type = Character.getType(search.charAt(idx));
1202                                 switch(type)
1203                                 {
1204                                     case Character.DASH_PUNCTUATION:
1205                                     case Character.START_PUNCTUATION:
1206                                     case Character.END_PUNCTUATION:
1207                                     case Character.CONNECTOR_PUNCTUATION:
1208                                     case Character.OTHER_PUNCTUATION:
1209                                         break;
1210 
1211                                     default:
1212                                         return -1;
1213                                 }
1214                             }
1215                             break;
1216 
1217                             case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1218                             {
1219                                 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1220                                                     (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1221                                                     (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1222                                 if (!isXDigit)
1223                                 {
1224                                     return -1;
1225                                 }
1226                             }
1227                             break;
1228 
1229                             case POSIX_CLASS_JSTART:
1230                                 if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1231                                 {
1232                                     return -1;
1233                                 }
1234                                 break;
1235 
1236                             case POSIX_CLASS_JPART:
1237                                 if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1238                                 {
1239                                     return -1;
1240                                 }
1241                                 break;
1242 
1243                             default:
1244                                 internalError("Bad posix class");
1245                                 break;
1246                         }
1247 
1248                         // Matched.
1249                         idx++;
1250                     }
1251                     break;
1252 
1253                 case OP_ANYOF:
1254                     {
1255                         // Out of input?
1256                         if (search.isEnd(idx))
1257                         {
1258                             return -1;
1259                         }
1260 
1261                         // Get character to match against character class and maybe casefold
1262                         char c = search.charAt(idx);
1263                         boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1264                         // Loop through character class checking our match character
1265                         int idxRange = node + nodeSize;
1266                         int idxEnd = idxRange + (opdata * 2);
1267                         boolean match = false;
1268                         for (int i = idxRange; !match && i < idxEnd; )
1269                         {
1270                             // Get start, end and match characters
1271                             char s = instruction[i++];
1272                             char e = instruction[i++];
1273 
1274                             match = ((compareChars(c, s, caseFold) >= 0)
1275                                      && (compareChars(c, e, caseFold) <= 0));
1276                         }
1277 
1278                         // Fail if we didn't match the character class
1279                         if (!match)
1280                         {
1281                             return -1;
1282                         }
1283                         idx++;
1284                     }
1285                     break;
1286 
1287                 case OP_BRANCH:
1288                 {
1289                     // Check for choices
1290                     if (instruction[next + offsetOpcode] != OP_BRANCH)
1291                     {
1292                         // If there aren't any other choices, just evaluate this branch.
1293                         node += nodeSize;
1294                         continue;
1295                     }
1296 
1297                     // Try all available branches
1298                     short nextBranch;
1299                     do
1300                     {
1301                         // Try matching the branch against the string
1302                         if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1303                         {
1304                             return idxNew;
1305                         }
1306 
1307                         // Go to next branch (if any)
1308                         nextBranch = (short)instruction[node + offsetNext];
1309                         node += nextBranch;
1310                     }
1311                     while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1312 
1313                     // Failed to match any branch!
1314                     return -1;
1315                 }
1316 
1317                 case OP_NOTHING:
1318                 case OP_GOTO:
1319 
1320                     // Just advance to the next node without doing anything
1321                     break;
1322 
1323                 case OP_END:
1324 
1325                     // Match has succeeded!
1326                     setParenEnd(0, idx);
1327                     return idx;
1328 
1329                 default:
1330 
1331                     // Corrupt program
1332                     internalError("Invalid opcode '" + opcode + "'");
1333             }
1334 
1335             // Advance to the next node in the program
1336             node = next;
1337         }
1338 
1339         // We "should" never end up here
1340         internalError("Corrupt program");
1341         return -1;
1342     }
1343 
1344     /**
1345      * Match the current regular expression program against the current
1346      * input string, starting at index i of the input string.  This method
1347      * is only meant for internal use.
1348      *
1349      * @param i The input string index to start matching at
1350      * @return True if the input matched the expression
1351      */
1352     protected boolean matchAt(int i)
1353     {
1354         // Initialize start pointer, paren cache and paren count
1355         start0 = -1;
1356         end0   = -1;
1357         start1 = -1;
1358         end1   = -1;
1359         start2 = -1;
1360         end2   = -1;
1361         startn = null;
1362         endn   = null;
1363         parenCount = 1;
1364         setParenStart(0, i);
1365 
1366         // Allocate backref arrays (unless optimizations indicate otherwise)
1367         if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1368         {
1369             startBackref = new int[maxParen];
1370             endBackref = new int[maxParen];
1371         }
1372 
1373         // Match against string
1374         int idx;
1375         if ((idx = matchNodes(0, maxNode, i)) != -1)
1376         {
1377             setParenEnd(0, idx);
1378             return true;
1379         }
1380 
1381         // Didn't match
1382         parenCount = 0;
1383         return false;
1384     }
1385 
1386     /**
1387      * Matches the current regular expression program against a character array,
1388      * starting at a given index.
1389      *
1390      * @param search String to match against
1391      * @param i Index to start searching at
1392      * @return True if string matched
1393      */
1394     public boolean match(String search, int i)
1395     {
1396         return match(new StringCharacterIterator(search), i);
1397     }
1398 
1399     /**
1400      * Matches the current regular expression program against a character array,
1401      * starting at a given index.
1402      *
1403      * @param search String to match against
1404      * @param i Index to start searching at
1405      * @return True if string matched
1406      */
1407     public boolean match(CharacterIterator search, int i)
1408     {
1409         // There is no compiled program to search with!
1410         if (program == null)
1411         {
1412             // This should be uncommon enough to be an error case rather
1413             // than an exception (which would have to be handled everywhere)
1414             internalError("No RE program to run!");
1415         }
1416 
1417         // Save string to search
1418         this.search = search;
1419 
1420         // Can we optimize the search by looking for a prefix string?
1421         if (program.prefix == null)
1422         {
1423             // Unprefixed matching must try for a match at each character
1424             for ( ;! search.isEnd(i - 1); i++)
1425             {
1426                 // Try a match at index i
1427                 if (matchAt(i))
1428                 {
1429                     return true;
1430                 }
1431             }
1432             return false;
1433         }
1434         else
1435         {
1436             // Prefix-anchored matching is possible
1437             boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1438             char[] prefix = program.prefix;
1439             for ( ; !search.isEnd(i + prefix.length - 1); i++)
1440             {
1441                 int j = i;
1442                 int k = 0;
1443 
1444                 boolean match;
1445                 do {
1446                     // If there's a mismatch of any character in the prefix, give up
1447                     match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
1448                 } while (match && k < prefix.length);
1449 
1450                 // See if the whole prefix string matched
1451                 if (k == prefix.length)
1452                 {
1453                     // We matched the full prefix at firstChar, so try it
1454                     if (matchAt(i))
1455                     {
1456                         return true;
1457                     }
1458                 }
1459             }
1460             return false;
1461         }
1462     }
1463 
1464     /**
1465      * Matches the current regular expression program against a String.
1466      *
1467      * @param search String to match against
1468      * @return True if string matched
1469      */
1470     public boolean match(String search)
1471     {
1472         return match(search, 0);
1473     }
1474 
1475     /**
1476      * Splits a string into an array of strings on regular expression boundaries.
1477      * This function works the same way as the Perl function of the same name.
1478      * Given a regular expression of "[ab]+" and a string to split of
1479      * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1480      * "[xyzzy, yyz, 123]".
1481      *
1482      * <p>Please note that the first string in the resulting array may be an empty
1483      * string. This happens when the very first character of input string is
1484      * matched by the pattern.
1485      *
1486      * @param s String to split on this regular exression
1487      * @return Array of strings
1488      */
1489     public String[] split(String s)
1490     {
1491         // Create new vector
1492         Vector v = new Vector();
1493 
1494         // Start at position 0 and search the whole string
1495         int pos = 0;
1496         int len = s.length();
1497 
1498         // Try a match at each position
1499         while (pos < len && match(s, pos))
1500         {
1501             // Get start of match
1502             int start = getParenStart(0);
1503 
1504             // Get end of match
1505             int newpos = getParenEnd(0);
1506 
1507             // Check if no progress was made
1508             if (newpos == pos)
1509             {
1510                 v.addElement(s.substring(pos, start + 1));
1511                 newpos++;
1512             }
1513             else
1514             {
1515                 v.addElement(s.substring(pos, start));
1516             }
1517 
1518             // Move to new position
1519             pos = newpos;
1520         }
1521 
1522         // Push remainder if it's not empty
1523         String remainder = s.substring(pos);
1524         if (remainder.length() != 0)
1525         {
1526             v.addElement(remainder);
1527         }
1528 
1529         // Return vector as an array of strings
1530         String[] ret = new String[v.size()];
1531         v.copyInto(ret);
1532         return ret;
1533     }
1534 
1535     /**
1536      * Flag bit that indicates that subst should replace all occurrences of this
1537      * regular expression.
1538      */
1539     public static final int REPLACE_ALL            = 0x0000;
1540 
1541     /**
1542      * Flag bit that indicates that subst should only replace the first occurrence
1543      * of this regular expression.
1544      */
1545     public static final int REPLACE_FIRSTONLY      = 0x0001;
1546 
1547     /**
1548      * Flag bit that indicates that subst should replace backreferences
1549      */
1550     public static final int REPLACE_BACKREFERENCES = 0x0002;
1551 
1552     /**
1553      * Substitutes a string for this regular expression in another string.
1554      * This method works like the Perl function of the same name.
1555      * Given a regular expression of "a*b", a String to substituteIn of
1556      * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1557      * resulting String returned by subst would be "-foo-garply-wacky-".
1558      *
1559      * @param substituteIn String to substitute within
1560      * @param substitution String to substitute for all matches of this regular expression.
1561      * @return The string substituteIn with zero or more occurrences of the current
1562      * regular expression replaced with the substitution String (if this regular
1563      * expression object doesn't match at any position, the original String is returned
1564      * unchanged).
1565      */
1566     public String subst(String substituteIn, String substitution)
1567     {
1568         return subst(substituteIn, substitution, REPLACE_ALL);
1569     }
1570 
1571     /**
1572      * Substitutes a string for this regular expression in another string.
1573      * This method works like the Perl function of the same name.
1574      * Given a regular expression of "a*b", a String to substituteIn of
1575      * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1576      * resulting String returned by subst would be "-foo-garply-wacky-".
1577      * <p>
1578      * It is also possible to reference the contents of a parenthesized expression
1579      * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
1580      * a String to substituteIn of "visit us: http://www.apache.org!" and the
1581      * substitution String "&lt;a href=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
1582      * returned by subst would be
1583      * "visit us: &lt;a href=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
1584      * <p>
1585      * <i>Note:</i> $0 represents the whole match.
1586      *
1587      * @param substituteIn String to substitute within
1588      * @param substitution String to substitute for matches of this regular expression
1589      * @param flags One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
1590      * flag bit is set, only the first occurrence of this regular expression is replaced.
1591      * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1592      * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
1593      * be processed.
1594      * @return The string substituteIn with zero or more occurrences of the current
1595      * regular expression replaced with the substitution String (if this regular
1596      * expression object doesn't match at any position, the original String is returned
1597      * unchanged).
1598      */
1599     public String subst(String substituteIn, String substitution, int flags)
1600     {
1601         // String to return
1602         StringBuffer ret = new StringBuffer();
1603 
1604         // Start at position 0 and search the whole string
1605         int pos = 0;
1606         int len = substituteIn.length();
1607 
1608         // Try a match at each position
1609         while (pos < len && match(substituteIn, pos))
1610         {
1611             // Append string before match
1612             ret.append(substituteIn.substring(pos, getParenStart(0)));
1613 
1614             if ((flags & REPLACE_BACKREFERENCES) != 0)
1615             {
1616                 // Process backreferences
1617                 int lCurrentPosition = 0;
1618                 int lLastPosition = -2;
1619                 int lLength = substitution.length();
1620                 boolean bAddedPrefix = false;
1621 
1622                 while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
1623                 {
1624                     if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
1625                         && lCurrentPosition+1 < lLength)
1626                     {
1627                         char c = substitution.charAt(lCurrentPosition + 1);
1628                         if (c >= '0' && c <= '9')
1629                         {
1630                             if (bAddedPrefix == false)
1631                             {
1632                                 // Append everything between the beginning of the
1633                                 // substitution string and the current $ sign
1634                                 ret.append(substitution.substring(0, lCurrentPosition));
1635                                 bAddedPrefix = true;
1636                             }
1637                             else
1638                             {
1639                                 // Append everything between the last and the current $ sign
1640                                 ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));
1641                             }
1642 
1643                             // Append the parenthesized expression
1644                             // Note: if a parenthesized expression of the requested
1645                             // index is not available "null" is added to the string
1646                             ret.append(getParen(c - '0'));
1647                             lLastPosition = lCurrentPosition;
1648                         }
1649                     }
1650 
1651                     // Move forward, skipping past match
1652                     lCurrentPosition++;
1653                 }
1654 
1655                 // Append everything after the last $ sign
1656                 ret.append(substitution.substring(lLastPosition + 2, lLength));
1657             }
1658             else
1659             {
1660                 // Append substitution without processing backreferences
1661                 ret.append(substitution);
1662             }
1663 
1664             // Move forward, skipping past match
1665             int newpos = getParenEnd(0);
1666 
1667             // We always want to make progress!
1668             if (newpos == pos)
1669             {
1670                 newpos++;
1671             }
1672 
1673             // Try new position
1674             pos = newpos;
1675 
1676             // Break out if we're only supposed to replace one occurrence
1677             if ((flags & REPLACE_FIRSTONLY) != 0)
1678             {
1679                 break;
1680             }
1681         }
1682 
1683         // If there's remaining input, append it
1684         if (pos < len)
1685         {
1686             ret.append(substituteIn.substring(pos));
1687         }
1688 
1689         // Return string buffer as string
1690         return ret.toString();
1691     }
1692 
1693     /**
1694      * Returns an array of Strings, whose toString representation matches a regular
1695      * expression. This method works like the Perl function of the same name.  Given
1696      * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1697      * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1698      *
1699      * @param search Array of Objects to search
1700      * @return Array of Strings whose toString() value matches this regular expression.
1701      */
1702     public String[] grep(Object[] search)
1703     {
1704         // Create new vector to hold return items
1705         Vector v = new Vector();
1706 
1707         // Traverse array of objects
1708         for (int i = 0; i < search.length; i++)
1709         {
1710             // Get next object as a string
1711             String s = search[i].toString();
1712 
1713             // If it matches this regexp, add it to the list
1714             if (match(s))
1715             {
1716                 v.addElement(s);
1717             }
1718         }
1719 
1720         // Return vector as an array of strings
1721         String[] ret = new String[v.size()];
1722         v.copyInto(ret);
1723         return ret;
1724     }
1725 
1726     /**
1727      * @return true if character at i-th position in the <code>search</code> string is a newline
1728      */
1729     private boolean isNewline(int i)
1730     {
1731         char nextChar = search.charAt(i);
1732 
1733         if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085'
1734             || nextChar == '\u2028' || nextChar == '\u2029')
1735         {
1736             return true;
1737         }
1738 
1739         return false;
1740     }
1741 
1742     /**
1743      * Compares two characters.
1744      *
1745      * @param c1 first character to compare.
1746      * @param c2 second character to compare.
1747      * @param caseIndependent whether comparision is case insensitive or not.
1748      * @return negative, 0, or positive integer as the first character
1749      *         less than, equal to, or greater then the second.
1750      */
1751     private int compareChars(char c1, char c2, boolean caseIndependent)
1752     {
1753         if (caseIndependent)
1754         {
1755             c1 = Character.toLowerCase(c1);
1756             c2 = Character.toLowerCase(c2);
1757         }
1758         return ((int)c1 - (int)c2);
1759     }
1760 }