New src/com/sun/org/apache/xerces/internal/impl/xpath/regex/RegularExpression.java

   1 /*
   2  * reserved comment block
   3  * DO NOT REMOVE OR ALTER!
   4  */
   5 /*
   6  * Copyright 1999-2002,2004,2005 The Apache Software Foundation.
   7  *
   8  * Licensed under the Apache License, Version 2.0 (the "License");
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
  22 
  23 import java.text.CharacterIterator;
  24 import java.util.Locale;
  25 import java.util.Stack;
  26 
  27 import com.sun.org.apache.xerces.internal.util.IntStack;
  28 
  29 /**
  30  * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
  31  * This engine does not conform to the POSIX regular expression.
  32  *
  33  * <hr width="50%">
  34  * <h3>How to use</h3>
  35  *
  36  * <dl>
  37  *   <dt>A. Standard way
  38  *   <dd>
  39  * <pre>
  40  * RegularExpression re = new RegularExpression(<var>regex</var>);
  41  * if (re.matches(text)) { ... }
  42  * </pre>
  43  *
  44  *   <dt>B. Capturing groups
  45  *   <dd>
  46  * <pre>
  47  * RegularExpression re = new RegularExpression(<var>regex</var>);
  48  * Match match = new Match();
  49  * if (re.matches(text, match)) {
  50  *     ... // You can refer captured texts with methods of the <code>Match</code> class.
  51  * }
  52  * </pre>
  53  *
  54  * </dl>
  55  *
  56  * <h4>Case-insensitive matching</h4>
  57  * <pre>
  58  * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
  59  * if (re.matches(text) >= 0) { ...}
  60  * </pre>
  61  *
  62  * <h4>Options</h4>
  63  * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
  64  *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
  65  *    This <var>options</var> parameter consists of the following characters.
  66  * </p>
  67  * <dl>
  68  *   <dt><a name="I_OPTION"><code>"i"</code></a>
  69  *   <dd>This option indicates case-insensitive matching.
  70  *   <dt><a name="M_OPTION"><code>"m"</code></a>
  71  *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
  72  *   <dt><a name="S_OPTION"><code>"s"</code></a>
  73  *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
  74  *   <dt><a name="U_OPTION"><code>"u"</code></a>
  75  *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
  76  *   <dt><a name="W_OPTION"><code>"w"</code></a>
  77  *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
  78  *      'Unicode Regular Expression Guidelines' Revision 4.
  79  *      When "w" and "u" are specified at the same time,
  80  *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
  81  *   <dt><a name="COMMA_OPTION"><code>","</code></a>
  82  *   <dd>The parser treats a comma in a character class as a range separator.
  83  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
  84  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
  85  *
  86  *   <dt><a name="X_OPTION"><code>"X"</code></a>
  87  *   <dd class="REGEX">
  88  *       By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
  89  *       The <code>match()</code> method does not do subsring matching
  90  *       but entire string matching.
  91  *
  92  * </dl>
  93  *
  94  * <hr width="50%">
  95  * <h3>Syntax</h3>
  96  * <table border="1" bgcolor="#ddeeff">
  97  *   <tr>
  98  *    <td>
  99  *     <h4>Differences from the Perl 5 regular expression</h4>
 100  *     <ul>
 101  *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
 102  *      <li>Supports subtraction, union, and intersection operations for character classes.
 103  *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
 104  *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
 105  *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
 106  *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
 107  *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
 108  *     </ul>
 109  *    </td>
 110  *   </tr>
 111  * </table>
 112  *
 113  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
 114  * <ul>
 115  *   <li>Character
 116  *     <dl>
 117  *       <dt class="REGEX"><kbd>.</kbd> (A period)
 118  *       <dd>Matches any one character except the following characters.
 119  *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
 120  *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
 121  *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
 122  *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
 123  *           it matches any character including the above four characters.
 124  *
 125  *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
 126  *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
 127  *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
 128  *
 129  *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
 130  *       <dd>Matches a control character.
 131  *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
 132  *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
 133  *           It matches a control character of which the character code is less than
 134  *           the character code of the <var>C</var> by 0x0040.
 135  *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
 136  *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
 137  *
 138  *       <dt class="REGEX">a non-meta character
 139  *       <dd>Matches the character.
 140  *
 141  *       <dt class="REGEX"><KBD>\</KBD> + a meta character
 142  *       <dd>Matches the meta character.
 143  *
 144  *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
 145  *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
 146  *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
 147  *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
 148  *
 149  *       <!--
 150  *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
 151  *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
 152  *       -->
 153  *
 154  *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
 155  *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
 156  *
 157  *       <dt class="REGEX"><kbd>\g</kbd>
 158  *       <dd>Matches a grapheme.
 159  *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
 160  *
 161  *       <dt class="REGEX"><kbd>\X</kbd>
 162  *       <dd class="REGEX">Matches a combining character sequence.
 163  *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
 164  *     </dl>
 165  *   </li>
 166  *
 167  *   <li>Character class
 168  *     <dl>
 169 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
 170 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
 171  *       <dd>Positive character class.  It matches a character in ranges.
 172  *       <dd><var>R<sub>n</sub></var>:
 173  *       <ul>
 174  *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
 175  *             <p>This range matches the character.
 176  *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
 177  *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
 178 + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
 179 + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
 180  *             <p>...
 181  *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
 182  *             <p>These expressions specifies the same ranges as the following expressions.
 183  *       </ul>
 184  *       <p class="REGEX">Enumerated ranges are merged (union operation).
 185  *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
 186  *
 187  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
 188  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
 189  *       <dd>Negative character class.  It matches a character not in ranges.
 190  *
 191  *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
 192  *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
 193  *       <dd>Subtraction or union or intersection for character classes.
 194  *       <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
 195  *       <dd>The result of this operations is a <u>positive character class</u>
 196  *           even if an expression includes any negative character classes.
 197  *           You have to take care on this in case-insensitive matching.
 198  *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
 199  *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
 200  *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
 201  *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
 202  *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
 203  *
 204  *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
 205  *       <dd>Character class subtraction for the XML Schema.
 206  *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
 207  *
 208  *       <dt class="REGEX"><kbd>\d</kbd>
 209  *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
 210  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 211  *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
 212  *
 213  *       <dt class="REGEX"><kbd>\D</kbd>
 214  *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
 215  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 216  *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
 217  *
 218  *       <dt class="REGEX"><kbd>\s</kbd>
 219  *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
 220  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 221  *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
 222  *
 223  *       <dt class="REGEX"><kbd>\S</kbd>
 224  *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
 225  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 226  *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
 227  *
 228  *       <dt class="REGEX"><kbd>\w</kbd>
 229  *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
 230  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 231  *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
 232  *
 233  *       <dt class="REGEX"><kbd>\W</kbd>
 234  *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
 235  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
 236  *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
 237  *
 238  *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
 239  *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
 240  *       The following names are available:
 241  *       <dl>
 242  *         <dt>Unicode General Categories:
 243  *         <dd><kbd>
 244  *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
 245  *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
 246  *         </kbd>
 247  *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
 248  *         <dt>Unicode Blocks:
 249  *         <dd><kbd>
 250  *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
 251  *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
 252  *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
 253  *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
 254  *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
 255  *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
 256  *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
 257  *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
 258  *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
 259  *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
 260  *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
 261  *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
 262  *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
 263  *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
 264  *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
 265  *       Small Form Variants, Arabic Presentation Forms-B, Specials,
 266  *       Halfwidth and Fullwidth Forms
 267  *         </kbd>
 268  *         <dt>Others:
 269  *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
 270  *         <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
 271  *         <dd><kbd>UNASSGINED</kbd>
 272  *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
 273  *       </dl>
 274  *
 275  *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
 276  *       <dd>Matches one character not in the specified General Category or the specified Block.
 277  *     </dl>
 278  *   </li>
 279  *
 280  *   <li>Selection and Quantifier
 281  *     <dl>
 282  *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
 283  *       <dd>...
 284  *
 285  *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
 286  *       <dd>Matches 0 or more <var>X</var>.
 287  *
 288  *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
 289  *       <dd>Matches 1 or more <var>X</var>.
 290  *
 291  *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
 292  *       <dd>Matches 0 or 1 <var>X</var>.
 293  *
 294  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
 295  *       <dd>Matches <var>number</var> times.
 296  *
 297  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
 298  *       <dd>...
 299  *
 300  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
 301  *       <dd>...
 302  *
 303  *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
 304  *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
 305  *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
 306  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
 307  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
 308  *       <dd>Non-greedy matching.
 309  *     </dl>
 310  *   </li>
 311  *
 312  *   <li>Grouping, Capturing, and Back-reference
 313  *     <dl>
 314  *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
 315  *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
 316  *       If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
 317  *       you have to write "<KBD>(?:foo)+</KBD>".
 318  *
 319  *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
 320  *       <dd>Grouping with capturing.
 321  * It make a group and applications can know
 322  * where in target text a group matched with methods of a <code>Match</code> instance
 323  * after <code><a href="#matches(java.lang.String, com.sun.org.apache.xerces.internal.utils.regex.Match)">matches(String,Match)</a></code>.
 324  * The 0th group means whole of this regular expression.
 325  * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
 326  *
 327  *   <p>For instance, a regular expression is
 328  *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
 329  *   and target text is
 330  *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
 331  *   <ul>
 332  *     <li><code>Match.getCapturedText(0)</code>:
 333  *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
 334  *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
 335  *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
 336  *   </ul>
 337  *
 338  *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
 339  *       <dd>
 340  *
 341  *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
 342  *       <dd>Independent expression group. ................
 343  *
 344  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
 345  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
 346  *       <dd>............................
 347  *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
 348  *           Note that it can not contain 'u'.
 349  *
 350  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
 351  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
 352  *       <dd>......
 353  *       <dd>These expressions must be at the beginning of a group.
 354  *     </dl>
 355  *   </li>
 356  *
 357  *   <li>Anchor
 358  *     <dl>
 359  *       <dt class="REGEX"><kbd>\A</kbd>
 360  *       <dd>Matches the beginnig of the text.
 361  *
 362  *       <dt class="REGEX"><kbd>\Z</kbd>
 363  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
 364  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
 365  *
 366  *       <dt class="REGEX"><kbd>\z</kbd>
 367  *       <dd>Matches the end of the text.
 368  *
 369  *       <dt class="REGEX"><kbd>^</kbd>
 370  *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
 371  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
 372  *           it matches the beginning of the text, or after one of EOL characters (
 373  *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
 374  *           PARAGRAPH SEPARATOR (U+2029).)
 375  *
 376  *       <dt class="REGEX"><kbd>$</kbd>
 377  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
 378  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
 379  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
 380  *           it matches the end of the text, or before an EOL character.
 381  *
 382  *       <dt class="REGEX"><kbd>\b</kbd>
 383  *       <dd>Matches word boundary.
 384  *           (See <a href="#W_OPTION">a "w" option</a>)
 385  *
 386  *       <dt class="REGEX"><kbd>\B</kbd>
 387  *       <dd>Matches non word boundary.
 388  *           (See <a href="#W_OPTION">a "w" option</a>)
 389  *
 390  *       <dt class="REGEX"><kbd>\&lt;</kbd>
 391  *       <dd>Matches the beginning of a word.
 392  *           (See <a href="#W_OPTION">a "w" option</a>)
 393  *
 394  *       <dt class="REGEX"><kbd>\&gt;</kbd>
 395  *       <dd>Matches the end of a word.
 396  *           (See <a href="#W_OPTION">a "w" option</a>)
 397  *     </dl>
 398  *   </li>
 399  *   <li>Lookahead and lookbehind
 400  *     <dl>
 401  *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
 402  *       <dd>Lookahead.
 403  *
 404  *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
 405  *       <dd>Negative lookahead.
 406  *
 407  *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
 408  *       <dd>Lookbehind.
 409  *       <dd>(Note for text capturing......)
 410  *
 411  *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
 412  *       <dd>Negative lookbehind.
 413  *     </dl>
 414  *   </li>
 415  *
 416  *   <li>Misc.
 417  *     <dl>
 418  *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
 419  *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
 420  *       <dd>......
 421  *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
 422  *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
 423  *           You can not write comments in character classes and before quantifiers.
 424  *     </dl>
 425  *   </li>
 426  * </ul>
 427  *
 428  *
 429  * <hr width="50%">
 430  * <h3>BNF for the regular expression</h3>
 431  * <pre>
 432  * regex ::= ('(?' options ')')? term ('|' term)*
 433  * term ::= factor+
 434  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
 435  *            | '(?#' [^)]* ')'
 436  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
 437  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
 438  *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
 439  *          | '(?>' regex ')' | '(?' options ':' regex ')'
 440  *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
 441  * options ::= [imsw]* ('-' [imsw]+)?
 442  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
 443  * looks ::= '(?=' regex ')'  | '(?!' regex ')'
 444  *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
 445  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
 446  * category-block ::= '\' [pP] category-symbol-1
 447  *                    | ('\p{' | '\P{') (category-symbol | block-name
 448  *                                       | other-properties) '}'
 449  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
 450  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
 451  *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
 452  *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
 453  *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
 454  *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
 455  * block-name ::= (See above)
 456  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
 457  * character-1 ::= (any character except meta-characters)
 458  *
 459  * char-class ::= '[' ranges ']'
 460  *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
 461  * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
 462  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
 463  *           | range-char | range-char '-' range-char
 464  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
 465  * code-point ::= '\x' hex-char hex-char
 466  *                | '\x{' hex-char+ '}'
 467  * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
 468  * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
 469  * hex-char ::= [0-9a-fA-F]
 470  * character-2 ::= (any character except \[]-,)
 471  * </pre>
 472  *
 473  * <hr width="50%">
 474  * <h3>TODO</h3>
 475  * <ul>
 476  *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
 477  *     <ul>
 478  *       <li>2.4 Canonical Equivalents
 479  *       <li>Level 3
 480  *     </ul>
 481  *   <li>Parsing performance
 482  * </ul>
 483  *
 484  * <hr width="50%">
 485  *
 486  * @xerces.internal
 487  *
 488  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
 489  */
 490 public class RegularExpression implements java.io.Serializable {
 491 
 492     private static final long serialVersionUID = 6242499334195006401L;
 493 
 494     static final boolean DEBUG = false;
 495 
 496     /**
 497      * Compiles a token tree into an operation flow.
 498      */
 499     private synchronized void compile(Token tok) {
 500         if (this.operations != null)
 501             return;
 502         this.numberOfClosures = 0;
 503         this.operations = this.compile(tok, null, false);
 504     }
 505 
 506     /**
 507      * Converts a token to an operation.
 508      */
 509     private Op compile(Token tok, Op next, boolean reverse) {
 510         Op ret;
 511         switch (tok.type) {
 512         case Token.DOT:
 513             ret = Op.createDot();
 514             ret.next = next;
 515             break;
 516 
 517         case Token.CHAR:
 518             ret = Op.createChar(tok.getChar());
 519             ret.next = next;
 520             break;
 521 
 522         case Token.ANCHOR:
 523             ret = Op.createAnchor(tok.getChar());
 524             ret.next = next;
 525             break;
 526 
 527         case Token.RANGE:
 528         case Token.NRANGE:
 529             ret = Op.createRange(tok);
 530             ret.next = next;
 531             break;
 532 
 533         case Token.CONCAT:
 534             ret = next;
 535             if (!reverse) {
 536                 for (int i = tok.size()-1;  i >= 0;  i --) {
 537                     ret = compile(tok.getChild(i), ret, false);
 538                 }
 539             } else {
 540                 for (int i = 0;  i < tok.size();  i ++) {
 541                     ret = compile(tok.getChild(i), ret, true);
 542                 }
 543             }
 544             break;
 545 
 546         case Token.UNION:
 547             Op.UnionOp uni = Op.createUnion(tok.size());
 548             for (int i = 0;  i < tok.size();  i ++) {
 549                 uni.addElement(compile(tok.getChild(i), next, reverse));
 550             }
 551             ret = uni;                          // ret.next is null.
 552             break;
 553 
 554         case Token.CLOSURE:
 555         case Token.NONGREEDYCLOSURE:
 556             Token child = tok.getChild(0);
 557             int min = tok.getMin();
 558             int max = tok.getMax();
 559             if (min >= 0 && min == max) { // {n}
 560                 ret = next;
 561                 for (int i = 0; i < min;  i ++) {
 562                     ret = compile(child, ret, reverse);
 563                 }
 564                 break;
 565             }
 566             if (min > 0 && max > 0)
 567                 max -= min;
 568             if (max > 0) {
 569                 // X{2,6} -> XX(X(X(XX?)?)?)?
 570                 ret = next;
 571                 for (int i = 0;  i < max;  i ++) {
 572                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
 573                     q.next = next;
 574                     q.setChild(compile(child, ret, reverse));
 575                     ret = q;
 576                 }
 577             } else {
 578                 Op.ChildOp op;
 579                 if (tok.type == Token.NONGREEDYCLOSURE) {
 580                     op = Op.createNonGreedyClosure();
 581                 } else {                        // Token.CLOSURE
 582                     op = Op.createClosure(this.numberOfClosures++);
 583                 }
 584                 op.next = next;
 585                 op.setChild(compile(child, op, reverse));
 586                 ret = op;
 587             }
 588             if (min > 0) {
 589                 for (int i = 0;  i < min;  i ++) {
 590                     ret = compile(child, ret, reverse);
 591                 }
 592             }
 593             break;
 594 
 595         case Token.EMPTY:
 596             ret = next;
 597             break;
 598 
 599         case Token.STRING:
 600             ret = Op.createString(tok.getString());
 601             ret.next = next;
 602             break;
 603 
 604         case Token.BACKREFERENCE:
 605             ret = Op.createBackReference(tok.getReferenceNumber());
 606             ret.next = next;
 607             break;
 608 
 609         case Token.PAREN:
 610             if (tok.getParenNumber() == 0) {
 611                 ret = compile(tok.getChild(0), next, reverse);
 612             } else if (reverse) {
 613                 next = Op.createCapture(tok.getParenNumber(), next);
 614                 next = compile(tok.getChild(0), next, reverse);
 615                 ret = Op.createCapture(-tok.getParenNumber(), next);
 616             } else {
 617                 next = Op.createCapture(-tok.getParenNumber(), next);
 618                 next = compile(tok.getChild(0), next, reverse);
 619                 ret = Op.createCapture(tok.getParenNumber(), next);
 620             }
 621             break;
 622 
 623         case Token.LOOKAHEAD:
 624             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
 625             break;
 626         case Token.NEGATIVELOOKAHEAD:
 627             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
 628             break;
 629         case Token.LOOKBEHIND:
 630             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
 631             break;
 632         case Token.NEGATIVELOOKBEHIND:
 633             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
 634             break;
 635 
 636         case Token.INDEPENDENT:
 637             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
 638             break;
 639 
 640         case Token.MODIFIERGROUP:
 641             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
 642                                     ((Token.ModifierToken)tok).getOptions(),
 643                                     ((Token.ModifierToken)tok).getOptionsMask());
 644             break;
 645 
 646         case Token.CONDITION:
 647             Token.ConditionToken ctok = (Token.ConditionToken)tok;
 648             int ref = ctok.refNumber;
 649             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
 650             Op yes = compile(ctok.yes, next, reverse);
 651             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
 652             ret = Op.createCondition(next, ref, condition, yes, no);
 653             break;
 654 
 655         default:
 656             throw new RuntimeException("Unknown token type: "+tok.type);
 657         } // switch (tok.type)
 658         return ret;
 659     }
 660 
 661 
 662 //Public
 663 
 664     /**
 665      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 666      *
 667      * @return true if the target is matched to this regular expression.
 668      */
 669     public boolean matches(char[]  target) {
 670         return this.matches(target, 0,  target .length , (Match)null);
 671     }
 672 
 673     /**
 674      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 675      * in specified range or not.
 676      *
 677      * @param start Start offset of the range.
 678      * @param end  End offset +1 of the range.
 679      * @return true if the target is matched to this regular expression.
 680      */
 681     public boolean matches(char[]  target, int start, int end) {
 682         return this.matches(target, start, end, (Match)null);
 683     }
 684 
 685     /**
 686      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 687      *
 688      * @param match A Match instance for storing matching result.
 689      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 690      */
 691     public boolean matches(char[]  target, Match match) {
 692         return this.matches(target, 0,  target .length , match);
 693     }
 694 
 695 
 696     /**
 697      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 698      * in specified range or not.
 699      *
 700      * @param start Start offset of the range.
 701      * @param end  End offset +1 of the range.
 702      * @param match A Match instance for storing matching result.
 703      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 704      */
 705     public boolean matches(char[] target, int start, int end, Match match) {
 706 
 707         synchronized (this) {
 708             if (this.operations == null)
 709                 this.prepare();
 710             if (this.context == null)
 711                 this.context = new Context();
 712         }
 713         Context con = null;
 714         synchronized (this.context) {
 715             con = this.context.inuse ? new Context() : this.context;
 716             con.reset(target, start, end, this.numberOfClosures);
 717         }
 718         if (match != null) {
 719             match.setNumberOfGroups(this.nofparen);
 720             match.setSource(target);
 721         } else if (this.hasBackReferences) {
 722             match = new Match();
 723             match.setNumberOfGroups(this.nofparen);
 724             // Need not to call setSource() because
 725             // a caller can not access this match instance.
 726         }
 727         con.match = match;
 728 
 729         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 730             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 731             //System.err.println("DEBUG: matchEnd="+matchEnd);
 732             if (matchEnd == con.limit) {
 733                 if (con.match != null) {
 734                     con.match.setBeginning(0, con.start);
 735                     con.match.setEnd(0, matchEnd);
 736                 }
 737                 con.setInUse(false);
 738                 return true;
 739             }
 740             return false;
 741         }
 742 
 743         /*
 744          * The pattern has only fixed string.
 745          * The engine uses Boyer-Moore.
 746          */
 747         if (this.fixedStringOnly) {
 748             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 749             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 750             if (o >= 0) {
 751                 if (con.match != null) {
 752                     con.match.setBeginning(0, o);
 753                     con.match.setEnd(0, o+this.fixedString.length());
 754                 }
 755                 con.setInUse(false);
 756                 return true;
 757             }
 758             con.setInUse(false);
 759             return false;
 760         }
 761 
 762         /*
 763          * The pattern contains a fixed string.
 764          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 765          * If not, it return with false.
 766          */
 767         if (this.fixedString != null) {
 768             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 769             if (o < 0) {
 770                 //System.err.println("Non-match in fixed-string search.");
 771                 con.setInUse(false);
 772                 return false;
 773             }
 774         }
 775 
 776         int limit = con.limit-this.minlength;
 777         int matchStart;
 778         int matchEnd = -1;
 779 
 780         /*
 781          * Checks whether the expression starts with ".*".
 782          */
 783         if (this.operations != null
 784             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 785             if (isSet(this.options, SINGLE_LINE)) {
 786                 matchStart = con.start;
 787                 matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 788             } else {
 789                 boolean previousIsEOL = true;
 790                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 791                     int ch =  target [  matchStart ] ;
 792                     if (isEOLChar(ch)) {
 793                         previousIsEOL = true;
 794                     } else {
 795                         if (previousIsEOL) {
 796                             if (0 <= (matchEnd = this. match(con, this.operations,
 797                                                              matchStart, 1, this.options)))
 798                                 break;
 799                         }
 800                         previousIsEOL = false;
 801                     }
 802                 }
 803             }
 804         }
 805 
 806         /*
 807          * Optimization against the first character.
 808          */
 809         else if (this.firstChar != null) {
 810             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
 811             RangeToken range = this.firstChar;
 812             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 813                 int ch =  target [matchStart] ;
 814                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
 815                     ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
 816                 }
 817                 if (!range.match(ch))  {
 818                     continue;
 819                 }
 820                 if (0 <= (matchEnd = this. match(con, this.operations,
 821                                                  matchStart, 1, this.options))) {
 822                         break;
 823                 }
 824             }
 825         }
 826 
 827         /*
 828          * Straightforward matching.
 829          */
 830         else {
 831             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 832                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
 833                     break;
 834             }
 835         }
 836 
 837         if (matchEnd >= 0) {
 838             if (con.match != null) {
 839                 con.match.setBeginning(0, matchStart);
 840                 con.match.setEnd(0, matchEnd);
 841             }
 842             con.setInUse(false);
 843             return true;
 844         } else {
 845             con.setInUse(false);
 846             return false;
 847         }
 848     }
 849 
 850     /**
 851      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 852      *
 853      * @return true if the target is matched to this regular expression.
 854      */
 855     public boolean matches(String  target) {
 856         return this.matches(target, 0,  target .length() , (Match)null);
 857     }
 858 
 859     /**
 860      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 861      * in specified range or not.
 862      *
 863      * @param start Start offset of the range.
 864      * @param end  End offset +1 of the range.
 865      * @return true if the target is matched to this regular expression.
 866      */
 867     public boolean matches(String  target, int start, int end) {
 868         return this.matches(target, start, end, (Match)null);
 869     }
 870 
 871     /**
 872      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 873      *
 874      * @param match A Match instance for storing matching result.
 875      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 876      */
 877     public boolean matches(String  target, Match match) {
 878         return this.matches(target, 0,  target .length() , match);
 879     }
 880 
 881     /**
 882      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 883      * in specified range or not.
 884      *
 885      * @param start Start offset of the range.
 886      * @param end  End offset +1 of the range.
 887      * @param match A Match instance for storing matching result.
 888      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 889      */
 890     public boolean matches(String  target, int start, int end, Match match) {
 891 
 892         synchronized (this) {
 893             if (this.operations == null)
 894                 this.prepare();
 895             if (this.context == null)
 896                 this.context = new Context();
 897         }
 898         Context con = null;
 899         synchronized (this.context) {
 900             con = this.context.inuse ? new Context() : this.context;
 901             con.reset(target, start, end, this.numberOfClosures);
 902         }
 903         if (match != null) {
 904             match.setNumberOfGroups(this.nofparen);
 905             match.setSource(target);
 906         } else if (this.hasBackReferences) {
 907             match = new Match();
 908             match.setNumberOfGroups(this.nofparen);
 909             // Need not to call setSource() because
 910             // a caller can not access this match instance.
 911         }
 912         con.match = match;
 913 
 914         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 915             if (DEBUG) {
 916                 System.err.println("target string="+target);
 917             }
 918             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
 919             if (DEBUG) {
 920                 System.err.println("matchEnd="+matchEnd);
 921                 System.err.println("con.limit="+con.limit);
 922             }
 923             if (matchEnd == con.limit) {
 924                 if (con.match != null) {
 925                     con.match.setBeginning(0, con.start);
 926                     con.match.setEnd(0, matchEnd);
 927                 }
 928                 con.setInUse(false);
 929                 return true;
 930             }
 931             return false;
 932         }
 933 
 934         /*
 935          * The pattern has only fixed string.
 936          * The engine uses Boyer-Moore.
 937          */
 938         if (this.fixedStringOnly) {
 939             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 940             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 941             if (o >= 0) {
 942                 if (con.match != null) {
 943                     con.match.setBeginning(0, o);
 944                     con.match.setEnd(0, o+this.fixedString.length());
 945                 }
 946                 con.setInUse(false);
 947                 return true;
 948             }
 949             con.setInUse(false);
 950             return false;
 951         }
 952 
 953         /*
 954          * The pattern contains a fixed string.
 955          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 956          * If not, it return with false.
 957          */
 958         if (this.fixedString != null) {
 959             int o = this.fixedStringTable.matches(target, con.start, con.limit);
 960             if (o < 0) {
 961                 //System.err.println("Non-match in fixed-string search.");
 962                 con.setInUse(false);
 963                 return false;
 964             }
 965         }
 966 
 967         int limit = con.limit-this.minlength;
 968         int matchStart;
 969         int matchEnd = -1;
 970 
 971         /*
 972          * Checks whether the expression starts with ".*".
 973          */
 974         if (this.operations != null
 975             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 976             if (isSet(this.options, SINGLE_LINE)) {
 977                 matchStart = con.start;
 978                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
 979             } else {
 980                 boolean previousIsEOL = true;
 981                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 982                     int ch =  target .charAt(  matchStart ) ;
 983                     if (isEOLChar(ch)) {
 984                         previousIsEOL = true;
 985                     } else {
 986                         if (previousIsEOL) {
 987                             if (0 <= (matchEnd = this.match(con, this.operations,
 988                                                             matchStart, 1, this.options)))
 989                                 break;
 990                         }
 991                         previousIsEOL = false;
 992                     }
 993                 }
 994             }
 995         }
 996 
 997         /*
 998          * Optimization against the first character.
 999          */
1000         else if (this.firstChar != null) {
1001             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1002             RangeToken range = this.firstChar;
1003             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1004                 int ch =  target .charAt(  matchStart ) ;
1005                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1006                     ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
1007                 }
1008                 if (!range.match(ch)) {
1009                     continue;
1010                 }
1011                 if (0 <= (matchEnd = this.match(con, this.operations,
1012                                                 matchStart, 1, this.options))) {
1013                         break;
1014                 }
1015             }
1016         }
1017 
1018         /*
1019          * Straightforward matching.
1020          */
1021         else {
1022             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1023                 if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options)))
1024                     break;
1025             }
1026         }
1027 
1028         if (matchEnd >= 0) {
1029             if (con.match != null) {
1030                 con.match.setBeginning(0, matchStart);
1031                 con.match.setEnd(0, matchEnd);
1032             }
1033             con.setInUse(false);
1034             return true;
1035         } else {
1036             con.setInUse(false);
1037             return false;
1038         }
1039     }
1040 
1041     /**
1042      * @return -1 when not match; offset of the end of matched string when match.
1043      */
1044     private int match(Context con, Op op, int offset, int dx, int opts) {
1045         final ExpressionTarget target = con.target;
1046         final Stack opStack = new Stack();
1047         final IntStack dataStack = new IntStack();
1048         final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
1049         int retValue = -1;
1050         boolean returned = false;
1051 
1052         for (;;) {
1053             if (op == null || offset > con.limit || offset < con.start) {
1054                 if (op == null) {
1055                     retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1056                 }
1057                 else {
1058                    retValue = -1;
1059                 }
1060                 returned = true;
1061             }
1062             else  {
1063                 retValue = -1;
1064                 // dx value is either 1 or -1
1065                 switch (op.type) {
1066                 case Op.CHAR:
1067                     {
1068                         final int o1 = (dx > 0) ? offset : offset -1;
1069                         if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
1070                             returned = true;
1071                             break;
1072                         }
1073                         offset += dx;
1074                         op = op.next;
1075                     }
1076                     break;
1077 
1078                 case Op.DOT:
1079                     {
1080                         int o1 = (dx > 0) ? offset : offset - 1;
1081                         if (o1 >= con.limit || o1 < 0) {
1082                             returned = true;
1083                             break;
1084                         }
1085                         if (isSet(opts, SINGLE_LINE)) {
1086                             if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
1087                                 o1 += dx;
1088                             }
1089                         }
1090                         else {
1091                             int ch = target.charAt(o1);
1092                             if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
1093                                 o1 += dx;
1094                                 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1095                             }
1096                             if (isEOLChar(ch)) {
1097                                 returned = true;
1098                                 break;
1099                             }
1100                         }
1101                         offset = (dx > 0) ? o1 + 1 : o1;
1102                         op = op.next;
1103                     }
1104                     break;
1105 
1106                 case Op.RANGE:
1107                 case Op.NRANGE:
1108                     {
1109                         int o1 = (dx > 0) ? offset : offset -1;
1110                         if (o1 >= con.limit || o1 < 0) {
1111                             returned = true;
1112                             break;
1113                         }
1114                         int ch = target.charAt(offset);
1115                         if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
1116                             o1 += dx;
1117                             ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1118                         }
1119                         final RangeToken tok = op.getToken();
1120                         if (!tok.match(ch)) {
1121                             returned = true;
1122                             break;
1123                         }
1124                         offset = (dx > 0) ? o1+1 : o1;
1125                         op = op.next;
1126                     }
1127                     break;
1128 
1129                 case Op.ANCHOR:
1130                     {
1131                         if (!matchAnchor(target, op, con, offset, opts)) {
1132                             returned = true;
1133                             break;
1134                         }
1135                         op = op.next;
1136                     }
1137                     break;
1138 
1139                 case Op.BACKREFERENCE:
1140                     {
1141                         int refno = op.getData();
1142                         if (refno <= 0 || refno >= this.nofparen) {
1143                             throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1144                         }
1145                         if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
1146                             returned = true;
1147                             break;
1148                         }
1149                         int o2 = con.match.getBeginning(refno);
1150                         int literallen = con.match.getEnd(refno)-o2;
1151                         if (dx > 0) {
1152                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
1153                                 returned = true;
1154                                 break;
1155                             }
1156                             offset += literallen;
1157                         }
1158                         else {
1159                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
1160                                 returned = true;
1161                                 break;
1162                             }
1163                             offset -= literallen;
1164                         }
1165                         op = op.next;
1166                     }
1167                     break;
1168 
1169                 case Op.STRING:
1170                     {
1171                         String literal = op.getString();
1172                         int literallen = literal.length();
1173                         if (dx > 0) {
1174                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
1175                                 returned = true;
1176                                 break;
1177                             }
1178                             offset += literallen;
1179                         }
1180                         else {
1181                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
1182                                 returned = true;
1183                                 break;
1184                             }
1185                             offset -= literallen;
1186                         }
1187                         op = op.next;
1188                     }
1189                     break;
1190 
1191                 case Op.CLOSURE:
1192                     {
1193                         // Saves current position to avoid zero-width repeats.
1194                         final int id = op.getData();
1195                         if (con.closureContexts[id].contains(offset)) {
1196                             returned = true;
1197                             break;
1198                         }
1199 
1200                         con.closureContexts[id].addOffset(offset);
1201                     }
1202                     // fall through
1203 
1204                 case Op.QUESTION:
1205                     {
1206                         opStack.push(op);
1207                         dataStack.push(offset);
1208                         op = op.getChild();
1209                     }
1210                     break;
1211 
1212                 case Op.NONGREEDYCLOSURE:
1213                 case Op.NONGREEDYQUESTION:
1214                     {
1215                         opStack.push(op);
1216                         dataStack.push(offset);
1217                         op = op.next;
1218                     }
1219                     break;
1220 
1221                 case Op.UNION:
1222                     if (op.size() == 0) {
1223                         returned = true;
1224                     }
1225                     else {
1226                         opStack.push(op);
1227                         dataStack.push(0);
1228                         dataStack.push(offset);
1229                         op = op.elementAt(0);
1230                     }
1231                     break;
1232 
1233                 case Op.CAPTURE:
1234                     {
1235                         final int refno = op.getData();
1236                         if (con.match != null) {
1237                             if (refno > 0) {
1238                                 dataStack.push(con.match.getBeginning(refno));
1239                                 con.match.setBeginning(refno, offset);
1240                             }
1241                             else {
1242                                 final int index = -refno;
1243                                 dataStack.push(con.match.getEnd(index));
1244                                 con.match.setEnd(index, offset);
1245                             }
1246                             opStack.push(op);
1247                             dataStack.push(offset);
1248                         }
1249                         op = op.next;
1250                     }
1251                     break;
1252 
1253                 case Op.LOOKAHEAD:
1254                 case Op.NEGATIVELOOKAHEAD:
1255                 case Op.LOOKBEHIND:
1256                 case Op.NEGATIVELOOKBEHIND:
1257                     {
1258                         opStack.push(op);
1259                         dataStack.push(dx);
1260                         dataStack.push(offset);
1261                         dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
1262                         op = op.getChild();
1263                     }
1264                     break;
1265 
1266                 case Op.INDEPENDENT:
1267                     {
1268                         opStack.push(op);
1269                         dataStack.push(offset);
1270                         op = op.getChild();
1271                     }
1272                     break;
1273 
1274                 case Op.MODIFIER:
1275                     {
1276                         int localopts = opts;
1277                         localopts |= op.getData();
1278                         localopts &= ~op.getData2();
1279                         opStack.push(op);
1280                         dataStack.push(opts);
1281                         dataStack.push(offset);
1282                         opts = localopts;
1283                         op = op.getChild();
1284                     }
1285                     break;
1286 
1287                 case Op.CONDITION:
1288                     {
1289                         Op.ConditionOp cop = (Op.ConditionOp)op;
1290                         if (cop.refNumber > 0) {
1291                             if (cop.refNumber >= this.nofparen) {
1292                                 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1293                             }
1294                             if (con.match.getBeginning(cop.refNumber) >= 0
1295                                     && con.match.getEnd(cop.refNumber) >= 0) {
1296                                 op = cop.yes;
1297                             }
1298                             else if (cop.no != null) {
1299                                 op = cop.no;
1300                             }
1301                             else {
1302                                 op = cop.next;
1303                             }
1304                         }
1305                         else {
1306                             opStack.push(op);
1307                             dataStack.push(offset);
1308                             op = cop.condition;
1309                         }
1310                     }
1311                     break;
1312 
1313                 default:
1314                     throw new RuntimeException("Unknown operation type: " + op.type);
1315                 }
1316             }
1317 
1318             // handle recursive operations
1319             while (returned) {
1320                 // exhausted all the operations
1321                 if (opStack.isEmpty()) {
1322                     return retValue;
1323                 }
1324 
1325                 op = (Op) opStack.pop();
1326                 offset = dataStack.pop();
1327 
1328                 switch (op.type) {
1329                 case Op.CLOSURE:
1330                 case Op.QUESTION:
1331                     if (retValue < 0) {
1332                         op = op.next;
1333                         returned = false;
1334                     }
1335                     break;
1336 
1337                 case Op.NONGREEDYCLOSURE:
1338                 case Op.NONGREEDYQUESTION:
1339                     if (retValue < 0) {
1340                         op = op.getChild();
1341                         returned = false;
1342                     }
1343                     break;
1344 
1345                 case Op.UNION:
1346                     {
1347                         int unionIndex = dataStack.pop();
1348                         if (DEBUG) {
1349                             System.err.println("UNION: "+unionIndex+", ret="+retValue);
1350                         }
1351 
1352                         if (retValue < 0) {
1353                             if (++unionIndex < op.size()) {
1354                                 opStack.push(op);
1355                                 dataStack.push(unionIndex);
1356                                 dataStack.push(offset);
1357                                 op = op.elementAt(unionIndex);
1358                                 returned = false;
1359                             }
1360                             else {
1361                                 retValue = -1;
1362                             }
1363                         }
1364                     }
1365                     break;
1366 
1367                 case Op.CAPTURE:
1368                     final int refno = op.getData();
1369                     final int saved = dataStack.pop();
1370                     if (retValue < 0) {
1371                         if (refno > 0) {
1372                             con.match.setBeginning(refno, saved);
1373                         }
1374                         else {
1375                             con.match.setEnd(-refno, saved);
1376                         }
1377                     }
1378                     break;
1379 
1380                 case Op.LOOKAHEAD:
1381                 case Op.LOOKBEHIND:
1382                     {
1383                         dx = dataStack.pop();
1384                         if (0 <= retValue) {
1385                             op = op.next;
1386                             returned = false;
1387                         }
1388                         retValue = -1;
1389                     }
1390                     break;
1391 
1392                 case Op.NEGATIVELOOKAHEAD:
1393                 case Op.NEGATIVELOOKBEHIND:
1394                     {
1395                         dx = dataStack.pop();
1396                         if (0 > retValue)  {
1397                             op = op.next;
1398                             returned = false;
1399                         }
1400                         retValue = -1;
1401                     }
1402                     break;
1403 
1404                 case Op.MODIFIER:
1405                     opts = dataStack.pop();
1406                     // fall through
1407 
1408                 case Op.INDEPENDENT:
1409                     if (retValue >= 0)  {
1410                         offset = retValue;
1411                         op = op.next;
1412                         returned = false;
1413                     }
1414                     break;
1415 
1416                 case Op.CONDITION:
1417                     {
1418                         final Op.ConditionOp cop = (Op.ConditionOp)op;
1419                         if (0 <= retValue) {
1420                             op = cop.yes;
1421                         }
1422                         else if (cop.no != null) {
1423                             op = cop.no;
1424                         }
1425                         else {
1426                             op = cop.next;
1427                         }
1428                     }
1429                     returned = false;
1430                     break;
1431 
1432                 default:
1433                     break;
1434                 }
1435             }
1436         }
1437     }
1438 
1439     private boolean matchChar(int ch, int other, boolean ignoreCase) {
1440         return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
1441     }
1442 
1443     boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
1444         boolean go = false;
1445         switch (op.getData()) {
1446         case '^':
1447             if (isSet(opts, MULTIPLE_LINES)) {
1448                 if (!(offset == con.start
1449                       || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1))))
1450                     return false;
1451             } else {
1452                 if (offset != con.start)
1453                     return false;
1454             }
1455             break;
1456 
1457         case '@':                         // Internal use only.
1458             // The @ always matches line beginnings.
1459             if (!(offset == con.start
1460                   || offset > con.start && isEOLChar(target.charAt(offset-1))))
1461                 return false;
1462             break;
1463 
1464         case '$':
1465             if (isSet(opts, MULTIPLE_LINES)) {
1466                 if (!(offset == con.limit
1467                       || offset < con.limit && isEOLChar(target.charAt(offset))))
1468                     return false;
1469             } else {
1470                 if (!(offset == con.limit
1471                       || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1472                       || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1473                       &&  target.charAt(offset+1) == LINE_FEED))
1474                     return false;
1475             }
1476             break;
1477 
1478         case 'A':
1479             if (offset != con.start)  return false;
1480             break;
1481 
1482         case 'Z':
1483             if (!(offset == con.limit
1484                   || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1485                   || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1486                   &&  target.charAt(offset+1) == LINE_FEED))
1487                 return false;
1488             break;
1489 
1490         case 'z':
1491             if (offset != con.limit)  return false;
1492             break;
1493 
1494         case 'b':
1495             if (con.length == 0)
1496                 return false;
1497             {
1498                 int after = getWordType(target, con.start, con.limit, offset, opts);
1499                 if (after == WT_IGNORE)  return false;
1500                 int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1501                 if (after == before)  return false;
1502             }
1503             break;
1504 
1505         case 'B':
1506             if (con.length == 0)
1507                 go = true;
1508             else {
1509                 int after = getWordType(target, con.start, con.limit, offset, opts);
1510                 go = after == WT_IGNORE
1511                      || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1512             }
1513             if (!go)  return false;
1514             break;
1515 
1516         case '<':
1517             if (con.length == 0 || offset == con.limit)  return false;
1518             if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1519                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1520                 return false;
1521             break;
1522 
1523         case '>':
1524             if (con.length == 0 || offset == con.start)  return false;
1525             if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1526                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1527                 return false;
1528             break;
1529         } // switch anchor type
1530 
1531         return true;
1532     }
1533 
1534     private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
1535                                                  int offset, int opts) {
1536         int ret = getWordType(target, begin, end, --offset, opts);
1537         while (ret == WT_IGNORE)
1538             ret = getWordType(target, begin, end, --offset, opts);
1539         return ret;
1540     }
1541 
1542     private static final int getWordType(ExpressionTarget target, int begin, int end,
1543                                          int offset, int opts) {
1544         if (offset < begin || offset >= end)  return WT_OTHER;
1545         return getWordType0(target.charAt(offset) , opts);
1546     }
1547 
1548 
1549     /**
1550      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1551      *
1552      * @return true if the target is matched to this regular expression.
1553      */
1554     public boolean matches(CharacterIterator target) {
1555         return this.matches(target, (Match)null);
1556     }
1557 
1558 
1559     /**
1560      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1561      *
1562      * @param match A Match instance for storing matching result.
1563      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1564      */
1565     public boolean matches(CharacterIterator  target, Match match) {
1566         int start = target.getBeginIndex();
1567         int end = target.getEndIndex();
1568 
1569 
1570 
1571         synchronized (this) {
1572             if (this.operations == null)
1573                 this.prepare();
1574             if (this.context == null)
1575                 this.context = new Context();
1576         }
1577         Context con = null;
1578         synchronized (this.context) {
1579             con = this.context.inuse ? new Context() : this.context;
1580             con.reset(target, start, end, this.numberOfClosures);
1581         }
1582         if (match != null) {
1583             match.setNumberOfGroups(this.nofparen);
1584             match.setSource(target);
1585         } else if (this.hasBackReferences) {
1586             match = new Match();
1587             match.setNumberOfGroups(this.nofparen);
1588             // Need not to call setSource() because
1589             // a caller can not access this match instance.
1590         }
1591         con.match = match;
1592 
1593         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1594             int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1595             //System.err.println("DEBUG: matchEnd="+matchEnd);
1596             if (matchEnd == con.limit) {
1597                 if (con.match != null) {
1598                     con.match.setBeginning(0, con.start);
1599                     con.match.setEnd(0, matchEnd);
1600                 }
1601                 con.setInUse(false);
1602                 return true;
1603             }
1604             return false;
1605         }
1606 
1607         /*
1608          * The pattern has only fixed string.
1609          * The engine uses Boyer-Moore.
1610          */
1611         if (this.fixedStringOnly) {
1612             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1613             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1614             if (o >= 0) {
1615                 if (con.match != null) {
1616                     con.match.setBeginning(0, o);
1617                     con.match.setEnd(0, o+this.fixedString.length());
1618                 }
1619                 con.setInUse(false);
1620                 return true;
1621             }
1622             con.setInUse(false);
1623             return false;
1624         }
1625 
1626         /*
1627          * The pattern contains a fixed string.
1628          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1629          * If not, it return with false.
1630          */
1631         if (this.fixedString != null) {
1632             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1633             if (o < 0) {
1634                 //System.err.println("Non-match in fixed-string search.");
1635                 con.setInUse(false);
1636                 return false;
1637             }
1638         }
1639 
1640         int limit = con.limit-this.minlength;
1641         int matchStart;
1642         int matchEnd = -1;
1643 
1644         /*
1645          * Checks whether the expression starts with ".*".
1646          */
1647         if (this.operations != null
1648             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1649             if (isSet(this.options, SINGLE_LINE)) {
1650                 matchStart = con.start;
1651                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1652             } else {
1653                 boolean previousIsEOL = true;
1654                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1655                     int ch =  target .setIndex(  matchStart ) ;
1656                     if (isEOLChar(ch)) {
1657                         previousIsEOL = true;
1658                     } else {
1659                         if (previousIsEOL) {
1660                             if (0 <= (matchEnd = this.match(con, this.operations,
1661                                                             matchStart, 1, this.options)))
1662                                 break;
1663                         }
1664                         previousIsEOL = false;
1665                     }
1666                 }
1667             }
1668         }
1669 
1670         /*
1671          * Optimization against the first character.
1672          */
1673         else if (this.firstChar != null) {
1674             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1675             RangeToken range = this.firstChar;
1676             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1677                 int ch =  target .setIndex(  matchStart ) ;
1678                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1679                     ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
1680                 }
1681                 if (!range.match(ch)) {
1682                     continue;
1683                 }
1684                 if (0 <= (matchEnd = this.match(con, this.operations,
1685                                                 matchStart, 1, this.options))) {
1686                     break;
1687                 }
1688             }
1689         }
1690 
1691         /*
1692          * Straightforward matching.
1693          */
1694         else {
1695             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1696                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
1697                     break;
1698             }
1699         }
1700 
1701         if (matchEnd >= 0) {
1702             if (con.match != null) {
1703                 con.match.setBeginning(0, matchStart);
1704                 con.match.setEnd(0, matchEnd);
1705             }
1706             con.setInUse(false);
1707             return true;
1708         } else {
1709             con.setInUse(false);
1710             return false;
1711         }
1712     }
1713 
1714     // ================================================================
1715 
1716     /**
1717      * A regular expression.
1718      * @serial
1719      */
1720     String regex;
1721     /**
1722      * @serial
1723      */
1724     int options;
1725 
1726     /**
1727      * The number of parenthesis in the regular expression.
1728      * @serial
1729      */
1730     int nofparen;
1731     /**
1732      * Internal representation of the regular expression.
1733      * @serial
1734      */
1735     Token tokentree;
1736 
1737     boolean hasBackReferences = false;
1738 
1739     transient int minlength;
1740     transient Op operations = null;
1741     transient int numberOfClosures;
1742     transient Context context = null;
1743     transient RangeToken firstChar = null;
1744 
1745     transient String fixedString = null;
1746     transient int fixedStringOptions;
1747     transient BMPattern fixedStringTable = null;
1748     transient boolean fixedStringOnly = false;
1749 
1750     static abstract class ExpressionTarget {
1751         abstract char charAt(int index);
1752         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
1753         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
1754     }
1755 
1756     static final class StringTarget extends ExpressionTarget {
1757 
1758         private String target;
1759 
1760         StringTarget(String target) {
1761             this.target = target;
1762         }
1763 
1764         final void resetTarget(String target) {
1765             this.target = target;
1766         }
1767 
1768         final char charAt(int index) {
1769             return target.charAt(index);
1770         }
1771 
1772         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1773                               String part, int partlen) {
1774             if (limit-offset < partlen) {
1775                 return false;
1776             }
1777             return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
1778         }
1779 
1780         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1781                                     int offset2, int partlen) {
1782             if (limit-offset < partlen) {
1783                 return false;
1784             }
1785             return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
1786                                 : target.regionMatches(offset, target, offset2, partlen);
1787         }
1788     }
1789 
1790     static final class CharArrayTarget extends ExpressionTarget {
1791 
1792         char[] target;
1793 
1794         CharArrayTarget(char[] target) {
1795             this.target = target;
1796         }
1797 
1798         final void resetTarget(char[] target) {
1799             this.target = target;
1800         }
1801 
1802         char charAt(int index) {
1803             return target[index];
1804         }
1805 
1806         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1807                 String part, int partlen) {
1808             if (offset < 0 || limit-offset < partlen)  {
1809                 return false;
1810             }
1811             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1812                                 : regionMatches(offset, limit, part, partlen);
1813         }
1814 
1815         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1816             int i = 0;
1817             while (partlen-- > 0) {
1818                 if (target[offset++] != part.charAt(i++)) {
1819                     return false;
1820                 }
1821             }
1822             return true;
1823         }
1824 
1825         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1826             int i = 0;
1827             while (partlen-- > 0) {
1828                 final char ch1 = target[offset++] ;
1829                 final char ch2 = part.charAt(i++);
1830                 if (ch1 == ch2) {
1831                     continue;
1832                 }
1833                 final char uch1 = Character.toUpperCase(ch1);
1834                 final char uch2 = Character.toUpperCase(ch2);
1835                 if (uch1 == uch2) {
1836                     continue;
1837                 }
1838                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1839                     return false;
1840                 }
1841             }
1842             return true;
1843         }
1844 
1845         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1846             if (offset < 0 || limit-offset < partlen) {
1847                 return false;
1848             }
1849             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1850                                 : regionMatches(offset, limit, offset2, partlen);
1851         }
1852 
1853         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1854             int i = offset2;
1855             while (partlen-- > 0) {
1856                 if ( target [  offset++ ]  !=  target [  i++ ] )
1857                     return false;
1858             }
1859             return true;
1860         }
1861 
1862         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1863             int i = offset2;
1864             while (partlen-- > 0) {
1865                 final char ch1 =  target[offset++] ;
1866                 final char ch2 =  target[i++] ;
1867                 if (ch1 == ch2) {
1868                     continue;
1869                 }
1870                 final char uch1 = Character.toUpperCase(ch1);
1871                 final char uch2 = Character.toUpperCase(ch2);
1872                 if (uch1 == uch2) {
1873                     continue;
1874                 }
1875                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1876                     return false;
1877                 }
1878             }
1879             return true;
1880         }
1881     }
1882 
1883     static final class CharacterIteratorTarget extends ExpressionTarget {
1884         CharacterIterator target;
1885 
1886         CharacterIteratorTarget(CharacterIterator target) {
1887             this.target = target;
1888         }
1889 
1890         final void resetTarget(CharacterIterator target) {
1891             this.target = target;
1892         }
1893 
1894         final char charAt(int index) {
1895             return target.setIndex(index);
1896         }
1897 
1898         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1899                 String part, int partlen) {
1900             if (offset < 0 || limit-offset < partlen)  {
1901                 return false;
1902             }
1903             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1904                                 : regionMatches(offset, limit, part, partlen);
1905         }
1906 
1907         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1908             int i = 0;
1909             while (partlen-- > 0) {
1910                 if (target.setIndex(offset++) != part.charAt(i++)) {
1911                     return false;
1912                 }
1913             }
1914             return true;
1915         }
1916 
1917         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1918             int i = 0;
1919             while (partlen-- > 0) {
1920                 final char ch1 = target.setIndex(offset++) ;
1921                 final char ch2 = part.charAt(i++);
1922                 if (ch1 == ch2) {
1923                     continue;
1924                 }
1925                 final char uch1 = Character.toUpperCase(ch1);
1926                 final char uch2 = Character.toUpperCase(ch2);
1927                 if (uch1 == uch2) {
1928                     continue;
1929                 }
1930                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1931                     return false;
1932                 }
1933             }
1934             return true;
1935         }
1936 
1937         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1938             if (offset < 0 || limit-offset < partlen) {
1939                 return false;
1940             }
1941             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1942                                 : regionMatches(offset, limit, offset2, partlen);
1943         }
1944 
1945         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1946             int i = offset2;
1947             while (partlen-- > 0) {
1948                 if (target.setIndex(offset++) != target.setIndex(i++)) {
1949                     return false;
1950                 }
1951             }
1952             return true;
1953         }
1954 
1955         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1956             int i = offset2;
1957             while (partlen-- > 0) {
1958                 final char ch1 = target.setIndex(offset++) ;
1959                 final char ch2 = target.setIndex(i++) ;
1960                 if (ch1 == ch2) {
1961                     continue;
1962                 }
1963                 final char uch1 = Character.toUpperCase(ch1);
1964                 final char uch2 = Character.toUpperCase(ch2);
1965                 if (uch1 == uch2) {
1966                     continue;
1967                 }
1968                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1969                     return false;
1970                 }
1971             }
1972             return true;
1973         }
1974     }
1975 
1976     static final class ClosureContext {
1977 
1978         int[] offsets = new int[4];
1979         int currentIndex = 0;
1980 
1981         boolean contains(int offset) {
1982             for (int i=0; i<currentIndex;++i) {
1983                 if (offsets[i] == offset) {
1984                     return true;
1985                 }
1986             }
1987             return false;
1988         }
1989 
1990         void reset() {
1991             currentIndex = 0;
1992         }
1993 
1994         void addOffset(int offset) {
1995             // We do not check for duplicates, caller is responsible for that
1996             if (currentIndex == offsets.length) {
1997                 offsets = expandOffsets();
1998             }
1999             offsets[currentIndex++] = offset;
2000         }
2001 
2002         private int[] expandOffsets() {
2003             final int len = offsets.length;
2004             final int newLen = len << 1;
2005             int[] newOffsets = new int[newLen];
2006 
2007             System.arraycopy(offsets, 0, newOffsets, 0, currentIndex);
2008             return newOffsets;
2009         }
2010     }
2011 
2012     static final class Context {
2013         int start;
2014         int limit;
2015         int length;
2016         Match match;
2017         boolean inuse = false;
2018         ClosureContext[] closureContexts;
2019 
2020         private StringTarget stringTarget;
2021         private CharArrayTarget charArrayTarget;
2022         private CharacterIteratorTarget characterIteratorTarget;
2023 
2024         ExpressionTarget target;
2025 
2026         Context() {
2027         }
2028 
2029         private void resetCommon(int nofclosures) {
2030             this.length = this.limit-this.start;
2031             setInUse(true);
2032             this.match = null;
2033             if (this.closureContexts == null || this.closureContexts.length != nofclosures) {
2034                 this.closureContexts = new ClosureContext[nofclosures];
2035             }
2036             for (int i = 0;  i < nofclosures;  i ++)  {
2037                 if (this.closureContexts[i] == null) {
2038                     this.closureContexts[i] = new ClosureContext();
2039                 }
2040                 else {
2041                     this.closureContexts[i].reset();
2042                 }
2043             }
2044         }
2045 
2046         void reset(CharacterIterator target, int start, int limit, int nofclosures) {
2047             if (characterIteratorTarget == null) {
2048                 characterIteratorTarget = new CharacterIteratorTarget(target);
2049             }
2050             else {
2051                 characterIteratorTarget.resetTarget(target);
2052             }
2053             this.target = characterIteratorTarget;
2054             this.start = start;
2055             this.limit = limit;
2056             this.resetCommon(nofclosures);
2057         }
2058 
2059         void reset(String target, int start, int limit, int nofclosures) {
2060             if (stringTarget == null) {
2061                 stringTarget = new StringTarget(target);
2062             }
2063             else {
2064                 stringTarget.resetTarget(target);
2065             }
2066             this.target = stringTarget;
2067             this.start = start;
2068             this.limit = limit;
2069             this.resetCommon(nofclosures);
2070         }
2071 
2072         void reset(char[] target, int start, int limit, int nofclosures) {
2073             if (charArrayTarget == null) {
2074                 charArrayTarget = new CharArrayTarget(target);
2075             }
2076             else {
2077                 charArrayTarget.resetTarget(target);
2078             }
2079             this.target = charArrayTarget;
2080             this.start = start;
2081             this.limit = limit;
2082             this.resetCommon(nofclosures);
2083         }
2084         synchronized void setInUse(boolean inUse) {
2085             this.inuse = inUse;
2086         }
2087     }
2088 
2089     /**
2090      * Prepares for matching.  This method is called just before starting matching.
2091      */
2092     void prepare() {
2093         if (Op.COUNT)  Op.nofinstances = 0;
2094         this.compile(this.tokentree);
2095         /*
2096         if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2097             Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2098             anchor.next = this.operations;
2099             this.operations = anchor;
2100         }
2101         */
2102         if (Op.COUNT)  System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2103 
2104         this.minlength = this.tokentree.getMinLength();
2105 
2106         this.firstChar = null;
2107         if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2108             && !isSet(this.options, XMLSCHEMA_MODE)) {
2109             RangeToken firstChar = Token.createRange();
2110             int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2111             if (fresult == Token.FC_TERMINAL) {
2112                 firstChar.compactRanges();
2113                 this.firstChar = firstChar;
2114                 if (DEBUG)
2115                     System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2116             }
2117         }
2118 
2119         if (this.operations != null
2120             && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2121             && this.operations.next == null) {
2122             if (DEBUG)
2123                 System.err.print(" *** Only fixed string! *** ");
2124             this.fixedStringOnly = true;
2125             if (this.operations.type == Op.STRING)
2126                 this.fixedString = this.operations.getString();
2127             else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2128                 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2129             } else {
2130                 char[] ac = new char[1];
2131                 ac[0] = (char)this.operations.getData();
2132                 this.fixedString = new String(ac);
2133             }
2134             this.fixedStringOptions = this.options;
2135             this.fixedStringTable = new BMPattern(this.fixedString, 256,
2136                                                   isSet(this.fixedStringOptions, IGNORE_CASE));
2137         } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2138                    && !isSet(this.options, XMLSCHEMA_MODE)) {
2139             Token.FixedStringContainer container = new Token.FixedStringContainer();
2140             this.tokentree.findFixedString(container, this.options);
2141             this.fixedString = container.token == null ? null : container.token.getString();
2142             this.fixedStringOptions = container.options;
2143             if (this.fixedString != null && this.fixedString.length() < 2)
2144                 this.fixedString = null;
2145             // This pattern has a fixed string of which length is more than one.
2146             if (this.fixedString != null) {
2147                 this.fixedStringTable = new BMPattern(this.fixedString, 256,
2148                                                       isSet(this.fixedStringOptions, IGNORE_CASE));
2149                 if (DEBUG) {
2150                     System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2151                                        +"/" //+this.fixedString
2152                                        +"/"+REUtil.createOptionString(this.fixedStringOptions));
2153                     System.err.print("String: ");
2154                     REUtil.dumpString(this.fixedString);
2155                 }
2156             }
2157         }
2158     }
2159 
2160     /**
2161      * An option.
2162      * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
2163      * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span>
2164      * does not capture.
2165      *
2166      * @see #RegularExpression(java.lang.String,int)
2167      * @see #setPattern(java.lang.String,int)
2168     static final int MARK_PARENS = 1<<0;
2169      */
2170 
2171     /**
2172      * "i"
2173      */
2174     static final int IGNORE_CASE = 1<<1;
2175 
2176     /**
2177      * "s"
2178      */
2179     static final int SINGLE_LINE = 1<<2;
2180 
2181     /**
2182      * "m"
2183      */
2184     static final int MULTIPLE_LINES = 1<<3;
2185 
2186     /**
2187      * "x"
2188      */
2189     static final int EXTENDED_COMMENT = 1<<4;
2190 
2191     /**
2192      * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
2193      *
2194      * @see #RegularExpression(java.lang.String,int)
2195      * @see #setPattern(java.lang.String,int)
2196      * @see #UNICODE_WORD_BOUNDARY
2197      */
2198     static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
2199 
2200     /**
2201      * An option.
2202      * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
2203      * <p>By default, the engine considers a position between a word character
2204      * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
2205      * is a word boundary.
2206      * <p>By this option, the engine checks word boundaries with the method of
2207      * 'Unicode Regular Expression Guidelines' Revision 4.
2208      *
2209      * @see #RegularExpression(java.lang.String,int)
2210      * @see #setPattern(java.lang.String,int)
2211      */
2212     static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
2213 
2214     /**
2215      * "H"
2216      */
2217     static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
2218     /**
2219      * "F"
2220      */
2221     static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
2222     /**
2223      * "X". XML Schema mode.
2224      */
2225     static final int XMLSCHEMA_MODE = 1<<9;
2226     /**
2227      * ",".
2228      */
2229     static final int SPECIAL_COMMA = 1<<10;
2230 
2231 
2232     private static final boolean isSet(int options, int flag) {
2233         return (options & flag) == flag;
2234     }
2235 
2236     /**
2237      * Creates a new RegularExpression instance.
2238      *
2239      * @param regex A regular expression
2240      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2241      */
2242     public RegularExpression(String regex) throws ParseException {
2243         this(regex, null);
2244     }
2245 
2246     /**
2247      * Creates a new RegularExpression instance with options.
2248      *
2249      * @param regex A regular expression
2250      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2251      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2252      */
2253     public RegularExpression(String regex, String options) throws ParseException {
2254         this.setPattern(regex, options);
2255     }
2256 
2257     /**
2258      * Creates a new RegularExpression instance with options.
2259      *
2260      * @param regex A regular expression
2261      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2262      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2263      */
2264     public RegularExpression(String regex, String options, Locale locale) throws ParseException {
2265         this.setPattern(regex, options, locale);
2266     }
2267 
2268     RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
2269         this.regex = regex;
2270         this.tokentree = tok;
2271         this.nofparen = parens;
2272         this.options = options;
2273         this.hasBackReferences = hasBackReferences;
2274     }
2275 
2276     /**
2277      *
2278      */
2279     public void setPattern(String newPattern) throws ParseException {
2280         this.setPattern(newPattern, Locale.getDefault());
2281     }
2282 
2283     public void setPattern(String newPattern, Locale locale) throws ParseException {
2284         this.setPattern(newPattern, this.options, locale);
2285     }
2286 
2287     private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
2288         this.regex = newPattern;
2289         this.options = options;
2290         RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
2291                          ? new ParserForXMLSchema(locale) : new RegexParser(locale);
2292         this.tokentree = rp.parse(this.regex, this.options);
2293         this.nofparen = rp.parennumber;
2294         this.hasBackReferences = rp.hasBackReferences;
2295 
2296         this.operations = null;
2297         this.context = null;
2298     }
2299     /**
2300      *
2301      */
2302     public void setPattern(String newPattern, String options) throws ParseException {
2303         this.setPattern(newPattern, options, Locale.getDefault());
2304     }
2305 
2306     public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
2307         this.setPattern(newPattern, REUtil.parseOptions(options), locale);
2308     }
2309 
2310     /**
2311      *
2312      */
2313     public String getPattern() {
2314         return this.regex;
2315     }
2316 
2317     /**
2318      * Represents this instence in String.
2319      */
2320     public String toString() {
2321         return this.tokentree.toString(this.options);
2322     }
2323 
2324     /**
2325      * Returns a option string.
2326      * The order of letters in it may be different from a string specified
2327      * in a constructor or <code>setPattern()</code>.
2328      *
2329      * @see #RegularExpression(java.lang.String,java.lang.String)
2330      * @see #setPattern(java.lang.String,java.lang.String)
2331      */
2332     public String getOptions() {
2333         return REUtil.createOptionString(this.options);
2334     }
2335 
2336     /**
2337      *  Return true if patterns are the same and the options are equivalent.
2338      */
2339     public boolean equals(Object obj) {
2340         if (obj == null)  return false;
2341         if (!(obj instanceof RegularExpression))
2342             return false;
2343         RegularExpression r = (RegularExpression)obj;
2344         return this.regex.equals(r.regex) && this.options == r.options;
2345     }
2346 
2347     boolean equals(String pattern, int options) {
2348         return this.regex.equals(pattern) && this.options == options;
2349     }
2350 
2351     /**
2352      *
2353      */
2354     public int hashCode() {
2355         return (this.regex+"/"+this.getOptions()).hashCode();
2356     }
2357 
2358     /**
2359      * Return the number of regular expression groups.
2360      * This method returns 1 when the regular expression has no capturing-parenthesis.
2361      *
2362      */
2363     public int getNumberOfGroups() {
2364         return this.nofparen;
2365     }
2366 
2367     // ================================================================
2368 
2369     private static final int WT_IGNORE = 0;
2370     private static final int WT_LETTER = 1;
2371     private static final int WT_OTHER = 2;
2372     private static final int getWordType0(char ch, int opts) {
2373         if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
2374             if (isSet(opts, USE_UNICODE_CATEGORY)) {
2375                 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
2376             }
2377             return isWordChar(ch) ? WT_LETTER : WT_OTHER;
2378         }
2379 
2380         switch (Character.getType(ch)) {
2381         case Character.UPPERCASE_LETTER:      // L
2382         case Character.LOWERCASE_LETTER:      // L
2383         case Character.TITLECASE_LETTER:      // L
2384         case Character.MODIFIER_LETTER:       // L
2385         case Character.OTHER_LETTER:          // L
2386         case Character.LETTER_NUMBER:         // N
2387         case Character.DECIMAL_DIGIT_NUMBER:  // N
2388         case Character.OTHER_NUMBER:          // N
2389         case Character.COMBINING_SPACING_MARK: // Mc
2390             return WT_LETTER;
2391 
2392         case Character.FORMAT:                // Cf
2393         case Character.NON_SPACING_MARK:      // Mn
2394         case Character.ENCLOSING_MARK:        // Mc
2395             return WT_IGNORE;
2396 
2397         case Character.CONTROL:               // Cc
2398             switch (ch) {
2399             case '\t':
2400             case '\n':
2401             case '\u000B':
2402             case '\f':
2403             case '\r':
2404                 return WT_OTHER;
2405             default:
2406                 return WT_IGNORE;
2407             }
2408 
2409         default:
2410             return WT_OTHER;
2411         }
2412     }
2413 
2414     // ================================================================
2415 
2416     static final int LINE_FEED = 0x000A;
2417     static final int CARRIAGE_RETURN = 0x000D;
2418     static final int LINE_SEPARATOR = 0x2028;
2419     static final int PARAGRAPH_SEPARATOR = 0x2029;
2420 
2421     private static final boolean isEOLChar(int ch) {
2422         return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
2423         || ch == PARAGRAPH_SEPARATOR;
2424     }
2425 
2426     private static final boolean isWordChar(int ch) { // Legacy word characters
2427         if (ch == '_')  return true;
2428         if (ch < '0')  return false;
2429         if (ch > 'z')  return false;
2430         if (ch <= '9')  return true;
2431         if (ch < 'A')  return false;
2432         if (ch <= 'Z')  return true;
2433         if (ch < 'a')  return false;
2434         return true;
2435     }
2436 
2437     private static final boolean matchIgnoreCase(int chardata, int ch) {
2438         if (chardata == ch)  return true;
2439         if (chardata > 0xffff || ch > 0xffff)  return false;
2440         char uch1 = Character.toUpperCase((char)chardata);
2441         char uch2 = Character.toUpperCase((char)ch);
2442         if (uch1 == uch2)  return true;
2443         return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
2444     }
2445 }