1 /* 2 * reserved comment block 3 * DO NOT REMOVE OR ALTER! 4 */ 5 /* 6 * Copyright 1999-2002,2004,2005 The Apache Software Foundation. 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23 import java.text.CharacterIterator; 24 import java.util.Locale; 25 import java.util.Stack; 26 27 import com.sun.org.apache.xerces.internal.util.IntStack; 28 29 /** 30 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA). 31 * This engine does not conform to the POSIX regular expression. 32 * 33 * <hr width="50%"> 34 * <h3>How to use</h3> 35 * 36 * <dl> 37 * <dt>A. Standard way 38 * <dd> 39 * <pre> 40 * RegularExpression re = new RegularExpression(<var>regex</var>); 41 * if (re.matches(text)) { ... } 42 * </pre> 43 * 44 * <dt>B. Capturing groups 45 * <dd> 46 * <pre> 47 * RegularExpression re = new RegularExpression(<var>regex</var>); 48 * Match match = new Match(); 49 * if (re.matches(text, match)) { 50 * ... // You can refer captured texts with methods of the <code>Match</code> class. 51 * } 52 * </pre> 53 * 54 * </dl> 55 * 56 * <h4>Case-insensitive matching</h4> 57 * <pre> 58 * RegularExpression re = new RegularExpression(<var>regex</var>, "i"); 59 * if (re.matches(text) >= 0) { ...} 60 * </pre> 61 * 62 * <h4>Options</h4> 63 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a> 64 * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>. 65 * This <var>options</var> parameter consists of the following characters. 66 * </p> 67 * <dl> 68 * <dt><a name="I_OPTION"><code>"i"</code></a> 69 * <dd>This option indicates case-insensitive matching. 70 * <dt><a name="M_OPTION"><code>"m"</code></a> 71 * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text. 72 * <dt><a name="S_OPTION"><code>"s"</code></a> 73 * <dd class="REGEX"><kbd>.</kbd> matches any one character. 74 * <dt><a name="U_OPTION"><code>"u"</code></a> 75 * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode. 76 * <dt><a name="W_OPTION"><code>"w"</code></a> 77 * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of 78 * 'Unicode Regular Expression Guidelines' Revision 4. 79 * When "w" and "u" are specified at the same time, 80 * <kbd>\b \B \< \></kbd> are processed for the "w" option. 81 * <dt><a name="COMMA_OPTION"><code>","</code></a> 82 * <dd>The parser treats a comma in a character class as a range separator. 83 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option. 84 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option. 85 * 86 * <dt><a name="X_OPTION"><code>"X"</code></a> 87 * <dd class="REGEX"> 88 * By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>. 89 * The <code>match()</code> method does not do subsring matching 90 * but entire string matching. 91 * 92 * </dl> 93 * 94 * <hr width="50%"> 95 * <h3>Syntax</h3> 96 * <table border="1" bgcolor="#ddeeff"> 97 * <tr> 98 * <td> 99 * <h4>Differences from the Perl 5 regular expression</h4> 100 * <ul> 101 * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.) 102 * <li>Supports subtraction, union, and intersection operations for character classes. 103 * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations), 104 * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>, 105 * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>, 106 * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>, 107 * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd> 108 * </ul> 109 * </td> 110 * </tr> 111 * </table> 112 * 113 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P> 114 * <ul> 115 * <li>Character 116 * <dl> 117 * <dt class="REGEX"><kbd>.</kbd> (A period) 118 * <dd>Matches any one character except the following characters. 119 * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D), 120 * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028) 121 * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates. 122 * <dd>When <a href="#S_OPTION">the "s" option</a> is specified, 123 * it matches any character including the above four characters. 124 * 125 * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd> 126 * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A), 127 * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009) 128 * 129 * <dt class="REGEX"><kbd>\c</kbd><var>C</var> 130 * <dd>Matches a control character. 131 * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>', 132 * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'. 133 * It matches a control character of which the character code is less than 134 * the character code of the <var>C</var> by 0x0040. 135 * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A), 136 * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B). 137 * 138 * <dt class="REGEX">a non-meta character 139 * <dd>Matches the character. 140 * 141 * <dt class="REGEX"><KBD>\</KBD> + a meta character 142 * <dd>Matches the meta character. 143 * 144 * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> 145 * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode. 146 * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and 147 * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>. 148 * 149 * <!-- 150 * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var> 151 * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode. 152 * --> 153 * 154 * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var> 155 * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode. 156 * 157 * <dt class="REGEX"><kbd>\g</kbd> 158 * <dd>Matches a grapheme. 159 * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd> 160 * 161 * <dt class="REGEX"><kbd>\X</kbd> 162 * <dd class="REGEX">Matches a combining character sequence. 163 * It is equivalent to <kbd>(?:\PM\pM*)</kbd> 164 * </dl> 165 * </li> 166 * 167 * <li>Character class 168 * <dl> 169 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>) 170 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>) 171 * <dd>Positive character class. It matches a character in ranges. 172 * <dd><var>R<sub>n</sub></var>: 173 * <ul> 174 * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>) 175 * <p>This range matches the character. 176 * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var> 177 * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point. 178 + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>, 179 + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd> 180 * <p>... 181 * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd> 182 * <p>These expressions specifies the same ranges as the following expressions. 183 * </ul> 184 * <p class="REGEX">Enumerated ranges are merged (union operation). 185 * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd> 186 * 187 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>) 188 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>) 189 * <dd>Negative character class. It matches a character not in ranges. 190 * 191 * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd> 192 * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.) 193 * <dd>Subtraction or union or intersection for character classes. 194 * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>. 195 * <dd>The result of this operations is a <u>positive character class</u> 196 * even if an expression includes any negative character classes. 197 * You have to take care on this in case-insensitive matching. 198 * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>, 199 * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching. 200 * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because 201 * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>' 202 * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>. 203 * 204 * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt> 205 * <dd>Character class subtraction for the XML Schema. 206 * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>. 207 * 208 * <dt class="REGEX"><kbd>\d</kbd> 209 * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>. 210 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 211 * <span class="REGEX"><kbd>\p{Nd}</kbd></span>. 212 * 213 * <dt class="REGEX"><kbd>\D</kbd> 214 * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd> 215 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 216 * <span class="REGEX"><kbd>\P{Nd}</kbd></span>. 217 * 218 * <dt class="REGEX"><kbd>\s</kbd> 219 * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd> 220 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 221 * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>. 222 * 223 * <dt class="REGEX"><kbd>\S</kbd> 224 * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd> 225 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 226 * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>. 227 * 228 * <dt class="REGEX"><kbd>\w</kbd> 229 * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd> 230 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 231 * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 232 * 233 * <dt class="REGEX"><kbd>\W</kbd> 234 * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd> 235 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 236 * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 237 * 238 * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd> 239 * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>. 240 * The following names are available: 241 * <dl> 242 * <dt>Unicode General Categories: 243 * <dd><kbd> 244 * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp, 245 * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So, 246 * </kbd> 247 * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters) 248 * <dt>Unicode Blocks: 249 * <dd><kbd> 250 * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B, 251 * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek, 252 * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati, 253 * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian, 254 * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation, 255 * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols, 256 * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators, 257 * Miscellaneous Technical, Control Pictures, Optical Character Recognition, 258 * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes, 259 * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana, 260 * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, 261 * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs, 262 * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates, 263 * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms, 264 * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms, 265 * Small Form Variants, Arabic Presentation Forms-B, Specials, 266 * Halfwidth and Fullwidth Forms 267 * </kbd> 268 * <dt>Others: 269 * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>) 270 * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>) 271 * <dd><kbd>UNASSGINED</kbd> 272 * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>) 273 * </dl> 274 * 275 * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd> 276 * <dd>Matches one character not in the specified General Category or the specified Block. 277 * </dl> 278 * </li> 279 * 280 * <li>Selection and Quantifier 281 * <dl> 282 * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR> 283 * <dd>... 284 * 285 * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD> 286 * <dd>Matches 0 or more <var>X</var>. 287 * 288 * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD> 289 * <dd>Matches 1 or more <var>X</var>. 290 * 291 * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD> 292 * <dd>Matches 0 or 1 <var>X</var>. 293 * 294 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd> 295 * <dd>Matches <var>number</var> times. 296 * 297 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd> 298 * <dd>... 299 * 300 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd> 301 * <dd>... 302 * 303 * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd> 304 * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd> 305 * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd> 306 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd> 307 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd> 308 * <dd>Non-greedy matching. 309 * </dl> 310 * </li> 311 * 312 * <li>Grouping, Capturing, and Back-reference 313 * <dl> 314 * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD> 315 * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>". 316 * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>", 317 * you have to write "<KBD>(?:foo)+</KBD>". 318 * 319 * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD> 320 * <dd>Grouping with capturing. 321 * It make a group and applications can know 322 * where in target text a group matched with methods of a <code>Match</code> instance 323 * after <code><a href="#matches(java.lang.String, com.sun.org.apache.xerces.internal.utils.regex.Match)">matches(String,Match)</a></code>. 324 * The 0th group means whole of this regular expression. 325 * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis. 326 * 327 * <p>For instance, a regular expression is 328 * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>" 329 * and target text is 330 * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>": 331 * <ul> 332 * <li><code>Match.getCapturedText(0)</code>: 333 * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>" 334 * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>" 335 * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>" 336 * </ul> 337 * 338 * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd> 339 * <dd> 340 * 341 * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd> 342 * <dd>Independent expression group. ................ 343 * 344 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 345 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 346 * <dd>............................ 347 * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'. 348 * Note that it can not contain 'u'. 349 * 350 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd> 351 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd> 352 * <dd>...... 353 * <dd>These expressions must be at the beginning of a group. 354 * </dl> 355 * </li> 356 * 357 * <li>Anchor 358 * <dl> 359 * <dt class="REGEX"><kbd>\A</kbd> 360 * <dd>Matches the beginnig of the text. 361 * 362 * <dt class="REGEX"><kbd>\Z</kbd> 363 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 364 * or CARRIAGE RETURN + LINE FEED at the end of the text. 365 * 366 * <dt class="REGEX"><kbd>\z</kbd> 367 * <dd>Matches the end of the text. 368 * 369 * <dt class="REGEX"><kbd>^</kbd> 370 * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>. 371 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 372 * it matches the beginning of the text, or after one of EOL characters ( 373 * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028), 374 * PARAGRAPH SEPARATOR (U+2029).) 375 * 376 * <dt class="REGEX"><kbd>$</kbd> 377 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 378 * or CARRIAGE RETURN + LINE FEED at the end of the text. 379 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 380 * it matches the end of the text, or before an EOL character. 381 * 382 * <dt class="REGEX"><kbd>\b</kbd> 383 * <dd>Matches word boundary. 384 * (See <a href="#W_OPTION">a "w" option</a>) 385 * 386 * <dt class="REGEX"><kbd>\B</kbd> 387 * <dd>Matches non word boundary. 388 * (See <a href="#W_OPTION">a "w" option</a>) 389 * 390 * <dt class="REGEX"><kbd>\<</kbd> 391 * <dd>Matches the beginning of a word. 392 * (See <a href="#W_OPTION">a "w" option</a>) 393 * 394 * <dt class="REGEX"><kbd>\></kbd> 395 * <dd>Matches the end of a word. 396 * (See <a href="#W_OPTION">a "w" option</a>) 397 * </dl> 398 * </li> 399 * <li>Lookahead and lookbehind 400 * <dl> 401 * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd> 402 * <dd>Lookahead. 403 * 404 * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd> 405 * <dd>Negative lookahead. 406 * 407 * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd> 408 * <dd>Lookbehind. 409 * <dd>(Note for text capturing......) 410 * 411 * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd> 412 * <dd>Negative lookbehind. 413 * </dl> 414 * </li> 415 * 416 * <li>Misc. 417 * <dl> 418 * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>, 419 * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd> 420 * <dd>...... 421 * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd> 422 * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'. 423 * You can not write comments in character classes and before quantifiers. 424 * </dl> 425 * </li> 426 * </ul> 427 * 428 * 429 * <hr width="50%"> 430 * <h3>BNF for the regular expression</h3> 431 * <pre> 432 * regex ::= ('(?' options ')')? term ('|' term)* 433 * term ::= factor+ 434 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? 435 * | '(?#' [^)]* ')' 436 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}' 437 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 438 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X' 439 * | '(?>' regex ')' | '(?' options ':' regex ')' 440 * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')' 441 * options ::= [imsw]* ('-' [imsw]+)? 442 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 443 * looks ::= '(?=' regex ')' | '(?!' regex ')' 444 * | '(?<=' regex ')' | '(?<!' regex ')' 445 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1 446 * category-block ::= '\' [pP] category-symbol-1 447 * | ('\p{' | '\P{') (category-symbol | block-name 448 * | other-properties) '}' 449 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S' 450 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo' 451 * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No' 452 * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs' 453 * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po' 454 * | 'Sm' | 'Sc' | 'Sk' | 'So' 455 * block-name ::= (See above) 456 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED' 457 * character-1 ::= (any character except meta-characters) 458 * 459 * char-class ::= '[' ranges ']' 460 * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')' 461 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+ 462 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block 463 * | range-char | range-char '-' range-char 464 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2 465 * code-point ::= '\x' hex-char hex-char 466 * | '\x{' hex-char+ '}' 467 * <!-- | '\u005c u' hex-char hex-char hex-char hex-char 468 * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char 469 * hex-char ::= [0-9a-fA-F] 470 * character-2 ::= (any character except \[]-,) 471 * </pre> 472 * 473 * <hr width="50%"> 474 * <h3>TODO</h3> 475 * <ul> 476 * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a> 477 * <ul> 478 * <li>2.4 Canonical Equivalents 479 * <li>Level 3 480 * </ul> 481 * <li>Parsing performance 482 * </ul> 483 * 484 * <hr width="50%"> 485 * 486 * @xerces.internal 487 * 488 * @author TAMURA Kent <kent@trl.ibm.co.jp> 489 */ 490 public class RegularExpression implements java.io.Serializable { 491 492 private static final long serialVersionUID = 6242499334195006401L; 493 494 static final boolean DEBUG = false; 495 496 /** 497 * Compiles a token tree into an operation flow. 498 */ 499 private synchronized void compile(Token tok) { 500 if (this.operations != null) 501 return; 502 this.numberOfClosures = 0; 503 this.operations = this.compile(tok, null, false); 504 } 505 506 /** 507 * Converts a token to an operation. 508 */ 509 private Op compile(Token tok, Op next, boolean reverse) { 510 Op ret; 511 switch (tok.type) { 512 case Token.DOT: 513 ret = Op.createDot(); 514 ret.next = next; 515 break; 516 517 case Token.CHAR: 518 ret = Op.createChar(tok.getChar()); 519 ret.next = next; 520 break; 521 522 case Token.ANCHOR: 523 ret = Op.createAnchor(tok.getChar()); 524 ret.next = next; 525 break; 526 527 case Token.RANGE: 528 case Token.NRANGE: 529 ret = Op.createRange(tok); 530 ret.next = next; 531 break; 532 533 case Token.CONCAT: 534 ret = next; 535 if (!reverse) { 536 for (int i = tok.size()-1; i >= 0; i --) { 537 ret = compile(tok.getChild(i), ret, false); 538 } 539 } else { 540 for (int i = 0; i < tok.size(); i ++) { 541 ret = compile(tok.getChild(i), ret, true); 542 } 543 } 544 break; 545 546 case Token.UNION: 547 Op.UnionOp uni = Op.createUnion(tok.size()); 548 for (int i = 0; i < tok.size(); i ++) { 549 uni.addElement(compile(tok.getChild(i), next, reverse)); 550 } 551 ret = uni; // ret.next is null. 552 break; 553 554 case Token.CLOSURE: 555 case Token.NONGREEDYCLOSURE: 556 Token child = tok.getChild(0); 557 int min = tok.getMin(); 558 int max = tok.getMax(); 559 if (min >= 0 && min == max) { // {n} 560 ret = next; 561 for (int i = 0; i < min; i ++) { 562 ret = compile(child, ret, reverse); 563 } 564 break; 565 } 566 if (min > 0 && max > 0) 567 max -= min; 568 if (max > 0) { 569 // X{2,6} -> XX(X(X(XX?)?)?)? 570 ret = next; 571 for (int i = 0; i < max; i ++) { 572 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE); 573 q.next = next; 574 q.setChild(compile(child, ret, reverse)); 575 ret = q; 576 } 577 } else { 578 Op.ChildOp op; 579 if (tok.type == Token.NONGREEDYCLOSURE) { 580 op = Op.createNonGreedyClosure(); 581 } else { // Token.CLOSURE 582 op = Op.createClosure(this.numberOfClosures++); 583 } 584 op.next = next; 585 op.setChild(compile(child, op, reverse)); 586 ret = op; 587 } 588 if (min > 0) { 589 for (int i = 0; i < min; i ++) { 590 ret = compile(child, ret, reverse); 591 } 592 } 593 break; 594 595 case Token.EMPTY: 596 ret = next; 597 break; 598 599 case Token.STRING: 600 ret = Op.createString(tok.getString()); 601 ret.next = next; 602 break; 603 604 case Token.BACKREFERENCE: 605 ret = Op.createBackReference(tok.getReferenceNumber()); 606 ret.next = next; 607 break; 608 609 case Token.PAREN: 610 if (tok.getParenNumber() == 0) { 611 ret = compile(tok.getChild(0), next, reverse); 612 } else if (reverse) { 613 next = Op.createCapture(tok.getParenNumber(), next); 614 next = compile(tok.getChild(0), next, reverse); 615 ret = Op.createCapture(-tok.getParenNumber(), next); 616 } else { 617 next = Op.createCapture(-tok.getParenNumber(), next); 618 next = compile(tok.getChild(0), next, reverse); 619 ret = Op.createCapture(tok.getParenNumber(), next); 620 } 621 break; 622 623 case Token.LOOKAHEAD: 624 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false)); 625 break; 626 case Token.NEGATIVELOOKAHEAD: 627 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false)); 628 break; 629 case Token.LOOKBEHIND: 630 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true)); 631 break; 632 case Token.NEGATIVELOOKBEHIND: 633 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true)); 634 break; 635 636 case Token.INDEPENDENT: 637 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse)); 638 break; 639 640 case Token.MODIFIERGROUP: 641 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse), 642 ((Token.ModifierToken)tok).getOptions(), 643 ((Token.ModifierToken)tok).getOptionsMask()); 644 break; 645 646 case Token.CONDITION: 647 Token.ConditionToken ctok = (Token.ConditionToken)tok; 648 int ref = ctok.refNumber; 649 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); 650 Op yes = compile(ctok.yes, next, reverse); 651 Op no = ctok.no == null ? null : compile(ctok.no, next, reverse); 652 ret = Op.createCondition(next, ref, condition, yes, no); 653 break; 654 655 default: 656 throw new RuntimeException("Unknown token type: "+tok.type); 657 } // switch (tok.type) 658 return ret; 659 } 660 661 662 //Public 663 664 /** 665 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 666 * 667 * @return true if the target is matched to this regular expression. 668 */ 669 public boolean matches(char[] target) { 670 return this.matches(target, 0, target .length , (Match)null); 671 } 672 673 /** 674 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 675 * in specified range or not. 676 * 677 * @param start Start offset of the range. 678 * @param end End offset +1 of the range. 679 * @return true if the target is matched to this regular expression. 680 */ 681 public boolean matches(char[] target, int start, int end) { 682 return this.matches(target, start, end, (Match)null); 683 } 684 685 /** 686 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 687 * 688 * @param match A Match instance for storing matching result. 689 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 690 */ 691 public boolean matches(char[] target, Match match) { 692 return this.matches(target, 0, target .length , match); 693 } 694 695 696 /** 697 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 698 * in specified range or not. 699 * 700 * @param start Start offset of the range. 701 * @param end End offset +1 of the range. 702 * @param match A Match instance for storing matching result. 703 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 704 */ 705 public boolean matches(char[] target, int start, int end, Match match) { 706 707 synchronized (this) { 708 if (this.operations == null) 709 this.prepare(); 710 if (this.context == null) 711 this.context = new Context(); 712 } 713 Context con = null; 714 synchronized (this.context) { 715 con = this.context.inuse ? new Context() : this.context; 716 con.reset(target, start, end, this.numberOfClosures); 717 } 718 if (match != null) { 719 match.setNumberOfGroups(this.nofparen); 720 match.setSource(target); 721 } else if (this.hasBackReferences) { 722 match = new Match(); 723 match.setNumberOfGroups(this.nofparen); 724 // Need not to call setSource() because 725 // a caller can not access this match instance. 726 } 727 con.match = match; 728 729 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 730 int matchEnd = this. match(con, this.operations, con.start, 1, this.options); 731 //System.err.println("DEBUG: matchEnd="+matchEnd); 732 if (matchEnd == con.limit) { 733 if (con.match != null) { 734 con.match.setBeginning(0, con.start); 735 con.match.setEnd(0, matchEnd); 736 } 737 con.setInUse(false); 738 return true; 739 } 740 return false; 741 } 742 743 /* 744 * The pattern has only fixed string. 745 * The engine uses Boyer-Moore. 746 */ 747 if (this.fixedStringOnly) { 748 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 749 int o = this.fixedStringTable.matches(target, con.start, con.limit); 750 if (o >= 0) { 751 if (con.match != null) { 752 con.match.setBeginning(0, o); 753 con.match.setEnd(0, o+this.fixedString.length()); 754 } 755 con.setInUse(false); 756 return true; 757 } 758 con.setInUse(false); 759 return false; 760 } 761 762 /* 763 * The pattern contains a fixed string. 764 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 765 * If not, it return with false. 766 */ 767 if (this.fixedString != null) { 768 int o = this.fixedStringTable.matches(target, con.start, con.limit); 769 if (o < 0) { 770 //System.err.println("Non-match in fixed-string search."); 771 con.setInUse(false); 772 return false; 773 } 774 } 775 776 int limit = con.limit-this.minlength; 777 int matchStart; 778 int matchEnd = -1; 779 780 /* 781 * Checks whether the expression starts with ".*". 782 */ 783 if (this.operations != null 784 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 785 if (isSet(this.options, SINGLE_LINE)) { 786 matchStart = con.start; 787 matchEnd = this. match(con, this.operations, con.start, 1, this.options); 788 } else { 789 boolean previousIsEOL = true; 790 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 791 int ch = target [ matchStart ] ; 792 if (isEOLChar(ch)) { 793 previousIsEOL = true; 794 } else { 795 if (previousIsEOL) { 796 if (0 <= (matchEnd = this. match(con, this.operations, 797 matchStart, 1, this.options))) 798 break; 799 } 800 previousIsEOL = false; 801 } 802 } 803 } 804 } 805 806 /* 807 * Optimization against the first character. 808 */ 809 else if (this.firstChar != null) { 810 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 811 RangeToken range = this.firstChar; 812 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 813 int ch = target [matchStart] ; 814 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 815 ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]); 816 } 817 if (!range.match(ch)) { 818 continue; 819 } 820 if (0 <= (matchEnd = this. match(con, this.operations, 821 matchStart, 1, this.options))) { 822 break; 823 } 824 } 825 } 826 827 /* 828 * Straightforward matching. 829 */ 830 else { 831 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 832 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) 833 break; 834 } 835 } 836 837 if (matchEnd >= 0) { 838 if (con.match != null) { 839 con.match.setBeginning(0, matchStart); 840 con.match.setEnd(0, matchEnd); 841 } 842 con.setInUse(false); 843 return true; 844 } else { 845 con.setInUse(false); 846 return false; 847 } 848 } 849 850 /** 851 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 852 * 853 * @return true if the target is matched to this regular expression. 854 */ 855 public boolean matches(String target) { 856 return this.matches(target, 0, target .length() , (Match)null); 857 } 858 859 /** 860 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 861 * in specified range or not. 862 * 863 * @param start Start offset of the range. 864 * @param end End offset +1 of the range. 865 * @return true if the target is matched to this regular expression. 866 */ 867 public boolean matches(String target, int start, int end) { 868 return this.matches(target, start, end, (Match)null); 869 } 870 871 /** 872 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 873 * 874 * @param match A Match instance for storing matching result. 875 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 876 */ 877 public boolean matches(String target, Match match) { 878 return this.matches(target, 0, target .length() , match); 879 } 880 881 /** 882 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 883 * in specified range or not. 884 * 885 * @param start Start offset of the range. 886 * @param end End offset +1 of the range. 887 * @param match A Match instance for storing matching result. 888 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 889 */ 890 public boolean matches(String target, int start, int end, Match match) { 891 892 synchronized (this) { 893 if (this.operations == null) 894 this.prepare(); 895 if (this.context == null) 896 this.context = new Context(); 897 } 898 Context con = null; 899 synchronized (this.context) { 900 con = this.context.inuse ? new Context() : this.context; 901 con.reset(target, start, end, this.numberOfClosures); 902 } 903 if (match != null) { 904 match.setNumberOfGroups(this.nofparen); 905 match.setSource(target); 906 } else if (this.hasBackReferences) { 907 match = new Match(); 908 match.setNumberOfGroups(this.nofparen); 909 // Need not to call setSource() because 910 // a caller can not access this match instance. 911 } 912 con.match = match; 913 914 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 915 if (DEBUG) { 916 System.err.println("target string="+target); 917 } 918 int matchEnd = this. match(con, this.operations, con.start, 1, this.options); 919 if (DEBUG) { 920 System.err.println("matchEnd="+matchEnd); 921 System.err.println("con.limit="+con.limit); 922 } 923 if (matchEnd == con.limit) { 924 if (con.match != null) { 925 con.match.setBeginning(0, con.start); 926 con.match.setEnd(0, matchEnd); 927 } 928 con.setInUse(false); 929 return true; 930 } 931 return false; 932 } 933 934 /* 935 * The pattern has only fixed string. 936 * The engine uses Boyer-Moore. 937 */ 938 if (this.fixedStringOnly) { 939 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 940 int o = this.fixedStringTable.matches(target, con.start, con.limit); 941 if (o >= 0) { 942 if (con.match != null) { 943 con.match.setBeginning(0, o); 944 con.match.setEnd(0, o+this.fixedString.length()); 945 } 946 con.setInUse(false); 947 return true; 948 } 949 con.setInUse(false); 950 return false; 951 } 952 953 /* 954 * The pattern contains a fixed string. 955 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 956 * If not, it return with false. 957 */ 958 if (this.fixedString != null) { 959 int o = this.fixedStringTable.matches(target, con.start, con.limit); 960 if (o < 0) { 961 //System.err.println("Non-match in fixed-string search."); 962 con.setInUse(false); 963 return false; 964 } 965 } 966 967 int limit = con.limit-this.minlength; 968 int matchStart; 969 int matchEnd = -1; 970 971 /* 972 * Checks whether the expression starts with ".*". 973 */ 974 if (this.operations != null 975 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 976 if (isSet(this.options, SINGLE_LINE)) { 977 matchStart = con.start; 978 matchEnd = this.match(con, this.operations, con.start, 1, this.options); 979 } else { 980 boolean previousIsEOL = true; 981 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 982 int ch = target .charAt( matchStart ) ; 983 if (isEOLChar(ch)) { 984 previousIsEOL = true; 985 } else { 986 if (previousIsEOL) { 987 if (0 <= (matchEnd = this.match(con, this.operations, 988 matchStart, 1, this.options))) 989 break; 990 } 991 previousIsEOL = false; 992 } 993 } 994 } 995 } 996 997 /* 998 * Optimization against the first character. 999 */ 1000 else if (this.firstChar != null) { 1001 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 1002 RangeToken range = this.firstChar; 1003 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1004 int ch = target .charAt( matchStart ) ; 1005 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 1006 ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1)); 1007 } 1008 if (!range.match(ch)) { 1009 continue; 1010 } 1011 if (0 <= (matchEnd = this.match(con, this.operations, 1012 matchStart, 1, this.options))) { 1013 break; 1014 } 1015 } 1016 } 1017 1018 /* 1019 * Straightforward matching. 1020 */ 1021 else { 1022 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1023 if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options))) 1024 break; 1025 } 1026 } 1027 1028 if (matchEnd >= 0) { 1029 if (con.match != null) { 1030 con.match.setBeginning(0, matchStart); 1031 con.match.setEnd(0, matchEnd); 1032 } 1033 con.setInUse(false); 1034 return true; 1035 } else { 1036 con.setInUse(false); 1037 return false; 1038 } 1039 } 1040 1041 /** 1042 * @return -1 when not match; offset of the end of matched string when match. 1043 */ 1044 private int match(Context con, Op op, int offset, int dx, int opts) { 1045 final ExpressionTarget target = con.target; 1046 final Stack opStack = new Stack(); 1047 final IntStack dataStack = new IntStack(); 1048 final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE); 1049 int retValue = -1; 1050 boolean returned = false; 1051 1052 for (;;) { 1053 if (op == null || offset > con.limit || offset < con.start) { 1054 if (op == null) { 1055 retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 1056 } 1057 else { 1058 retValue = -1; 1059 } 1060 returned = true; 1061 } 1062 else { 1063 retValue = -1; 1064 // dx value is either 1 or -1 1065 switch (op.type) { 1066 case Op.CHAR: 1067 { 1068 final int o1 = (dx > 0) ? offset : offset -1; 1069 if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) { 1070 returned = true; 1071 break; 1072 } 1073 offset += dx; 1074 op = op.next; 1075 } 1076 break; 1077 1078 case Op.DOT: 1079 { 1080 int o1 = (dx > 0) ? offset : offset - 1; 1081 if (o1 >= con.limit || o1 < 0) { 1082 returned = true; 1083 break; 1084 } 1085 if (isSet(opts, SINGLE_LINE)) { 1086 if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) { 1087 o1 += dx; 1088 } 1089 } 1090 else { 1091 int ch = target.charAt(o1); 1092 if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) { 1093 o1 += dx; 1094 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1)); 1095 } 1096 if (isEOLChar(ch)) { 1097 returned = true; 1098 break; 1099 } 1100 } 1101 offset = (dx > 0) ? o1 + 1 : o1; 1102 op = op.next; 1103 } 1104 break; 1105 1106 case Op.RANGE: 1107 case Op.NRANGE: 1108 { 1109 int o1 = (dx > 0) ? offset : offset -1; 1110 if (o1 >= con.limit || o1 < 0) { 1111 returned = true; 1112 break; 1113 } 1114 int ch = target.charAt(offset); 1115 if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) { 1116 o1 += dx; 1117 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1)); 1118 } 1119 final RangeToken tok = op.getToken(); 1120 if (!tok.match(ch)) { 1121 returned = true; 1122 break; 1123 } 1124 offset = (dx > 0) ? o1+1 : o1; 1125 op = op.next; 1126 } 1127 break; 1128 1129 case Op.ANCHOR: 1130 { 1131 if (!matchAnchor(target, op, con, offset, opts)) { 1132 returned = true; 1133 break; 1134 } 1135 op = op.next; 1136 } 1137 break; 1138 1139 case Op.BACKREFERENCE: 1140 { 1141 int refno = op.getData(); 1142 if (refno <= 0 || refno >= this.nofparen) { 1143 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 1144 } 1145 if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) { 1146 returned = true; 1147 break; 1148 } 1149 int o2 = con.match.getBeginning(refno); 1150 int literallen = con.match.getEnd(refno)-o2; 1151 if (dx > 0) { 1152 if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) { 1153 returned = true; 1154 break; 1155 } 1156 offset += literallen; 1157 } 1158 else { 1159 if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) { 1160 returned = true; 1161 break; 1162 } 1163 offset -= literallen; 1164 } 1165 op = op.next; 1166 } 1167 break; 1168 1169 case Op.STRING: 1170 { 1171 String literal = op.getString(); 1172 int literallen = literal.length(); 1173 if (dx > 0) { 1174 if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) { 1175 returned = true; 1176 break; 1177 } 1178 offset += literallen; 1179 } 1180 else { 1181 if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) { 1182 returned = true; 1183 break; 1184 } 1185 offset -= literallen; 1186 } 1187 op = op.next; 1188 } 1189 break; 1190 1191 case Op.CLOSURE: 1192 { 1193 // Saves current position to avoid zero-width repeats. 1194 final int id = op.getData(); 1195 if (con.closureContexts[id].contains(offset)) { 1196 returned = true; 1197 break; 1198 } 1199 1200 con.closureContexts[id].addOffset(offset); 1201 } 1202 // fall through 1203 1204 case Op.QUESTION: 1205 { 1206 opStack.push(op); 1207 dataStack.push(offset); 1208 op = op.getChild(); 1209 } 1210 break; 1211 1212 case Op.NONGREEDYCLOSURE: 1213 case Op.NONGREEDYQUESTION: 1214 { 1215 opStack.push(op); 1216 dataStack.push(offset); 1217 op = op.next; 1218 } 1219 break; 1220 1221 case Op.UNION: 1222 if (op.size() == 0) { 1223 returned = true; 1224 } 1225 else { 1226 opStack.push(op); 1227 dataStack.push(0); 1228 dataStack.push(offset); 1229 op = op.elementAt(0); 1230 } 1231 break; 1232 1233 case Op.CAPTURE: 1234 { 1235 final int refno = op.getData(); 1236 if (con.match != null) { 1237 if (refno > 0) { 1238 dataStack.push(con.match.getBeginning(refno)); 1239 con.match.setBeginning(refno, offset); 1240 } 1241 else { 1242 final int index = -refno; 1243 dataStack.push(con.match.getEnd(index)); 1244 con.match.setEnd(index, offset); 1245 } 1246 opStack.push(op); 1247 dataStack.push(offset); 1248 } 1249 op = op.next; 1250 } 1251 break; 1252 1253 case Op.LOOKAHEAD: 1254 case Op.NEGATIVELOOKAHEAD: 1255 case Op.LOOKBEHIND: 1256 case Op.NEGATIVELOOKBEHIND: 1257 { 1258 opStack.push(op); 1259 dataStack.push(dx); 1260 dataStack.push(offset); 1261 dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1; 1262 op = op.getChild(); 1263 } 1264 break; 1265 1266 case Op.INDEPENDENT: 1267 { 1268 opStack.push(op); 1269 dataStack.push(offset); 1270 op = op.getChild(); 1271 } 1272 break; 1273 1274 case Op.MODIFIER: 1275 { 1276 int localopts = opts; 1277 localopts |= op.getData(); 1278 localopts &= ~op.getData2(); 1279 opStack.push(op); 1280 dataStack.push(opts); 1281 dataStack.push(offset); 1282 opts = localopts; 1283 op = op.getChild(); 1284 } 1285 break; 1286 1287 case Op.CONDITION: 1288 { 1289 Op.ConditionOp cop = (Op.ConditionOp)op; 1290 if (cop.refNumber > 0) { 1291 if (cop.refNumber >= this.nofparen) { 1292 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 1293 } 1294 if (con.match.getBeginning(cop.refNumber) >= 0 1295 && con.match.getEnd(cop.refNumber) >= 0) { 1296 op = cop.yes; 1297 } 1298 else if (cop.no != null) { 1299 op = cop.no; 1300 } 1301 else { 1302 op = cop.next; 1303 } 1304 } 1305 else { 1306 opStack.push(op); 1307 dataStack.push(offset); 1308 op = cop.condition; 1309 } 1310 } 1311 break; 1312 1313 default: 1314 throw new RuntimeException("Unknown operation type: " + op.type); 1315 } 1316 } 1317 1318 // handle recursive operations 1319 while (returned) { 1320 // exhausted all the operations 1321 if (opStack.isEmpty()) { 1322 return retValue; 1323 } 1324 1325 op = (Op) opStack.pop(); 1326 offset = dataStack.pop(); 1327 1328 switch (op.type) { 1329 case Op.CLOSURE: 1330 case Op.QUESTION: 1331 if (retValue < 0) { 1332 op = op.next; 1333 returned = false; 1334 } 1335 break; 1336 1337 case Op.NONGREEDYCLOSURE: 1338 case Op.NONGREEDYQUESTION: 1339 if (retValue < 0) { 1340 op = op.getChild(); 1341 returned = false; 1342 } 1343 break; 1344 1345 case Op.UNION: 1346 { 1347 int unionIndex = dataStack.pop(); 1348 if (DEBUG) { 1349 System.err.println("UNION: "+unionIndex+", ret="+retValue); 1350 } 1351 1352 if (retValue < 0) { 1353 if (++unionIndex < op.size()) { 1354 opStack.push(op); 1355 dataStack.push(unionIndex); 1356 dataStack.push(offset); 1357 op = op.elementAt(unionIndex); 1358 returned = false; 1359 } 1360 else { 1361 retValue = -1; 1362 } 1363 } 1364 } 1365 break; 1366 1367 case Op.CAPTURE: 1368 final int refno = op.getData(); 1369 final int saved = dataStack.pop(); 1370 if (retValue < 0) { 1371 if (refno > 0) { 1372 con.match.setBeginning(refno, saved); 1373 } 1374 else { 1375 con.match.setEnd(-refno, saved); 1376 } 1377 } 1378 break; 1379 1380 case Op.LOOKAHEAD: 1381 case Op.LOOKBEHIND: 1382 { 1383 dx = dataStack.pop(); 1384 if (0 <= retValue) { 1385 op = op.next; 1386 returned = false; 1387 } 1388 retValue = -1; 1389 } 1390 break; 1391 1392 case Op.NEGATIVELOOKAHEAD: 1393 case Op.NEGATIVELOOKBEHIND: 1394 { 1395 dx = dataStack.pop(); 1396 if (0 > retValue) { 1397 op = op.next; 1398 returned = false; 1399 } 1400 retValue = -1; 1401 } 1402 break; 1403 1404 case Op.MODIFIER: 1405 opts = dataStack.pop(); 1406 // fall through 1407 1408 case Op.INDEPENDENT: 1409 if (retValue >= 0) { 1410 offset = retValue; 1411 op = op.next; 1412 returned = false; 1413 } 1414 break; 1415 1416 case Op.CONDITION: 1417 { 1418 final Op.ConditionOp cop = (Op.ConditionOp)op; 1419 if (0 <= retValue) { 1420 op = cop.yes; 1421 } 1422 else if (cop.no != null) { 1423 op = cop.no; 1424 } 1425 else { 1426 op = cop.next; 1427 } 1428 } 1429 returned = false; 1430 break; 1431 1432 default: 1433 break; 1434 } 1435 } 1436 } 1437 } 1438 1439 private boolean matchChar(int ch, int other, boolean ignoreCase) { 1440 return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other; 1441 } 1442 1443 boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) { 1444 boolean go = false; 1445 switch (op.getData()) { 1446 case '^': 1447 if (isSet(opts, MULTIPLE_LINES)) { 1448 if (!(offset == con.start 1449 || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1)))) 1450 return false; 1451 } else { 1452 if (offset != con.start) 1453 return false; 1454 } 1455 break; 1456 1457 case '@': // Internal use only. 1458 // The @ always matches line beginnings. 1459 if (!(offset == con.start 1460 || offset > con.start && isEOLChar(target.charAt(offset-1)))) 1461 return false; 1462 break; 1463 1464 case '$': 1465 if (isSet(opts, MULTIPLE_LINES)) { 1466 if (!(offset == con.limit 1467 || offset < con.limit && isEOLChar(target.charAt(offset)))) 1468 return false; 1469 } else { 1470 if (!(offset == con.limit 1471 || offset+1 == con.limit && isEOLChar(target.charAt(offset)) 1472 || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN 1473 && target.charAt(offset+1) == LINE_FEED)) 1474 return false; 1475 } 1476 break; 1477 1478 case 'A': 1479 if (offset != con.start) return false; 1480 break; 1481 1482 case 'Z': 1483 if (!(offset == con.limit 1484 || offset+1 == con.limit && isEOLChar(target.charAt(offset)) 1485 || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN 1486 && target.charAt(offset+1) == LINE_FEED)) 1487 return false; 1488 break; 1489 1490 case 'z': 1491 if (offset != con.limit) return false; 1492 break; 1493 1494 case 'b': 1495 if (con.length == 0) 1496 return false; 1497 { 1498 int after = getWordType(target, con.start, con.limit, offset, opts); 1499 if (after == WT_IGNORE) return false; 1500 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 1501 if (after == before) return false; 1502 } 1503 break; 1504 1505 case 'B': 1506 if (con.length == 0) 1507 go = true; 1508 else { 1509 int after = getWordType(target, con.start, con.limit, offset, opts); 1510 go = after == WT_IGNORE 1511 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 1512 } 1513 if (!go) return false; 1514 break; 1515 1516 case '<': 1517 if (con.length == 0 || offset == con.limit) return false; 1518 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 1519 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 1520 return false; 1521 break; 1522 1523 case '>': 1524 if (con.length == 0 || offset == con.start) return false; 1525 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 1526 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 1527 return false; 1528 break; 1529 } // switch anchor type 1530 1531 return true; 1532 } 1533 1534 private static final int getPreviousWordType(ExpressionTarget target, int begin, int end, 1535 int offset, int opts) { 1536 int ret = getWordType(target, begin, end, --offset, opts); 1537 while (ret == WT_IGNORE) 1538 ret = getWordType(target, begin, end, --offset, opts); 1539 return ret; 1540 } 1541 1542 private static final int getWordType(ExpressionTarget target, int begin, int end, 1543 int offset, int opts) { 1544 if (offset < begin || offset >= end) return WT_OTHER; 1545 return getWordType0(target.charAt(offset) , opts); 1546 } 1547 1548 1549 /** 1550 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1551 * 1552 * @return true if the target is matched to this regular expression. 1553 */ 1554 public boolean matches(CharacterIterator target) { 1555 return this.matches(target, (Match)null); 1556 } 1557 1558 1559 /** 1560 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1561 * 1562 * @param match A Match instance for storing matching result. 1563 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 1564 */ 1565 public boolean matches(CharacterIterator target, Match match) { 1566 int start = target.getBeginIndex(); 1567 int end = target.getEndIndex(); 1568 1569 1570 1571 synchronized (this) { 1572 if (this.operations == null) 1573 this.prepare(); 1574 if (this.context == null) 1575 this.context = new Context(); 1576 } 1577 Context con = null; 1578 synchronized (this.context) { 1579 con = this.context.inuse ? new Context() : this.context; 1580 con.reset(target, start, end, this.numberOfClosures); 1581 } 1582 if (match != null) { 1583 match.setNumberOfGroups(this.nofparen); 1584 match.setSource(target); 1585 } else if (this.hasBackReferences) { 1586 match = new Match(); 1587 match.setNumberOfGroups(this.nofparen); 1588 // Need not to call setSource() because 1589 // a caller can not access this match instance. 1590 } 1591 con.match = match; 1592 1593 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 1594 int matchEnd = this.match(con, this.operations, con.start, 1, this.options); 1595 //System.err.println("DEBUG: matchEnd="+matchEnd); 1596 if (matchEnd == con.limit) { 1597 if (con.match != null) { 1598 con.match.setBeginning(0, con.start); 1599 con.match.setEnd(0, matchEnd); 1600 } 1601 con.setInUse(false); 1602 return true; 1603 } 1604 return false; 1605 } 1606 1607 /* 1608 * The pattern has only fixed string. 1609 * The engine uses Boyer-Moore. 1610 */ 1611 if (this.fixedStringOnly) { 1612 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 1613 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1614 if (o >= 0) { 1615 if (con.match != null) { 1616 con.match.setBeginning(0, o); 1617 con.match.setEnd(0, o+this.fixedString.length()); 1618 } 1619 con.setInUse(false); 1620 return true; 1621 } 1622 con.setInUse(false); 1623 return false; 1624 } 1625 1626 /* 1627 * The pattern contains a fixed string. 1628 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 1629 * If not, it return with false. 1630 */ 1631 if (this.fixedString != null) { 1632 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1633 if (o < 0) { 1634 //System.err.println("Non-match in fixed-string search."); 1635 con.setInUse(false); 1636 return false; 1637 } 1638 } 1639 1640 int limit = con.limit-this.minlength; 1641 int matchStart; 1642 int matchEnd = -1; 1643 1644 /* 1645 * Checks whether the expression starts with ".*". 1646 */ 1647 if (this.operations != null 1648 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 1649 if (isSet(this.options, SINGLE_LINE)) { 1650 matchStart = con.start; 1651 matchEnd = this.match(con, this.operations, con.start, 1, this.options); 1652 } else { 1653 boolean previousIsEOL = true; 1654 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1655 int ch = target .setIndex( matchStart ) ; 1656 if (isEOLChar(ch)) { 1657 previousIsEOL = true; 1658 } else { 1659 if (previousIsEOL) { 1660 if (0 <= (matchEnd = this.match(con, this.operations, 1661 matchStart, 1, this.options))) 1662 break; 1663 } 1664 previousIsEOL = false; 1665 } 1666 } 1667 } 1668 } 1669 1670 /* 1671 * Optimization against the first character. 1672 */ 1673 else if (this.firstChar != null) { 1674 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 1675 RangeToken range = this.firstChar; 1676 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1677 int ch = target .setIndex( matchStart ) ; 1678 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 1679 ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1)); 1680 } 1681 if (!range.match(ch)) { 1682 continue; 1683 } 1684 if (0 <= (matchEnd = this.match(con, this.operations, 1685 matchStart, 1, this.options))) { 1686 break; 1687 } 1688 } 1689 } 1690 1691 /* 1692 * Straightforward matching. 1693 */ 1694 else { 1695 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1696 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) 1697 break; 1698 } 1699 } 1700 1701 if (matchEnd >= 0) { 1702 if (con.match != null) { 1703 con.match.setBeginning(0, matchStart); 1704 con.match.setEnd(0, matchEnd); 1705 } 1706 con.setInUse(false); 1707 return true; 1708 } else { 1709 con.setInUse(false); 1710 return false; 1711 } 1712 } 1713 1714 // ================================================================ 1715 1716 /** 1717 * A regular expression. 1718 * @serial 1719 */ 1720 String regex; 1721 /** 1722 * @serial 1723 */ 1724 int options; 1725 1726 /** 1727 * The number of parenthesis in the regular expression. 1728 * @serial 1729 */ 1730 int nofparen; 1731 /** 1732 * Internal representation of the regular expression. 1733 * @serial 1734 */ 1735 Token tokentree; 1736 1737 boolean hasBackReferences = false; 1738 1739 transient int minlength; 1740 transient Op operations = null; 1741 transient int numberOfClosures; 1742 transient Context context = null; 1743 transient RangeToken firstChar = null; 1744 1745 transient String fixedString = null; 1746 transient int fixedStringOptions; 1747 transient BMPattern fixedStringTable = null; 1748 transient boolean fixedStringOnly = false; 1749 1750 static abstract class ExpressionTarget { 1751 abstract char charAt(int index); 1752 abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen); 1753 abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen); 1754 } 1755 1756 static final class StringTarget extends ExpressionTarget { 1757 1758 private String target; 1759 1760 StringTarget(String target) { 1761 this.target = target; 1762 } 1763 1764 final void resetTarget(String target) { 1765 this.target = target; 1766 } 1767 1768 final char charAt(int index) { 1769 return target.charAt(index); 1770 } 1771 1772 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1773 String part, int partlen) { 1774 if (limit-offset < partlen) { 1775 return false; 1776 } 1777 return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen); 1778 } 1779 1780 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1781 int offset2, int partlen) { 1782 if (limit-offset < partlen) { 1783 return false; 1784 } 1785 return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen) 1786 : target.regionMatches(offset, target, offset2, partlen); 1787 } 1788 } 1789 1790 static final class CharArrayTarget extends ExpressionTarget { 1791 1792 char[] target; 1793 1794 CharArrayTarget(char[] target) { 1795 this.target = target; 1796 } 1797 1798 final void resetTarget(char[] target) { 1799 this.target = target; 1800 } 1801 1802 char charAt(int index) { 1803 return target[index]; 1804 } 1805 1806 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1807 String part, int partlen) { 1808 if (offset < 0 || limit-offset < partlen) { 1809 return false; 1810 } 1811 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen) 1812 : regionMatches(offset, limit, part, partlen); 1813 } 1814 1815 private final boolean regionMatches(int offset, int limit, String part, int partlen) { 1816 int i = 0; 1817 while (partlen-- > 0) { 1818 if (target[offset++] != part.charAt(i++)) { 1819 return false; 1820 } 1821 } 1822 return true; 1823 } 1824 1825 private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) { 1826 int i = 0; 1827 while (partlen-- > 0) { 1828 final char ch1 = target[offset++] ; 1829 final char ch2 = part.charAt(i++); 1830 if (ch1 == ch2) { 1831 continue; 1832 } 1833 final char uch1 = Character.toUpperCase(ch1); 1834 final char uch2 = Character.toUpperCase(ch2); 1835 if (uch1 == uch2) { 1836 continue; 1837 } 1838 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1839 return false; 1840 } 1841 } 1842 return true; 1843 } 1844 1845 final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) { 1846 if (offset < 0 || limit-offset < partlen) { 1847 return false; 1848 } 1849 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen) 1850 : regionMatches(offset, limit, offset2, partlen); 1851 } 1852 1853 private final boolean regionMatches(int offset, int limit, int offset2, int partlen) { 1854 int i = offset2; 1855 while (partlen-- > 0) { 1856 if ( target [ offset++ ] != target [ i++ ] ) 1857 return false; 1858 } 1859 return true; 1860 } 1861 1862 private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) { 1863 int i = offset2; 1864 while (partlen-- > 0) { 1865 final char ch1 = target[offset++] ; 1866 final char ch2 = target[i++] ; 1867 if (ch1 == ch2) { 1868 continue; 1869 } 1870 final char uch1 = Character.toUpperCase(ch1); 1871 final char uch2 = Character.toUpperCase(ch2); 1872 if (uch1 == uch2) { 1873 continue; 1874 } 1875 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1876 return false; 1877 } 1878 } 1879 return true; 1880 } 1881 } 1882 1883 static final class CharacterIteratorTarget extends ExpressionTarget { 1884 CharacterIterator target; 1885 1886 CharacterIteratorTarget(CharacterIterator target) { 1887 this.target = target; 1888 } 1889 1890 final void resetTarget(CharacterIterator target) { 1891 this.target = target; 1892 } 1893 1894 final char charAt(int index) { 1895 return target.setIndex(index); 1896 } 1897 1898 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1899 String part, int partlen) { 1900 if (offset < 0 || limit-offset < partlen) { 1901 return false; 1902 } 1903 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen) 1904 : regionMatches(offset, limit, part, partlen); 1905 } 1906 1907 private final boolean regionMatches(int offset, int limit, String part, int partlen) { 1908 int i = 0; 1909 while (partlen-- > 0) { 1910 if (target.setIndex(offset++) != part.charAt(i++)) { 1911 return false; 1912 } 1913 } 1914 return true; 1915 } 1916 1917 private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) { 1918 int i = 0; 1919 while (partlen-- > 0) { 1920 final char ch1 = target.setIndex(offset++) ; 1921 final char ch2 = part.charAt(i++); 1922 if (ch1 == ch2) { 1923 continue; 1924 } 1925 final char uch1 = Character.toUpperCase(ch1); 1926 final char uch2 = Character.toUpperCase(ch2); 1927 if (uch1 == uch2) { 1928 continue; 1929 } 1930 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1931 return false; 1932 } 1933 } 1934 return true; 1935 } 1936 1937 final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) { 1938 if (offset < 0 || limit-offset < partlen) { 1939 return false; 1940 } 1941 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen) 1942 : regionMatches(offset, limit, offset2, partlen); 1943 } 1944 1945 private final boolean regionMatches(int offset, int limit, int offset2, int partlen) { 1946 int i = offset2; 1947 while (partlen-- > 0) { 1948 if (target.setIndex(offset++) != target.setIndex(i++)) { 1949 return false; 1950 } 1951 } 1952 return true; 1953 } 1954 1955 private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) { 1956 int i = offset2; 1957 while (partlen-- > 0) { 1958 final char ch1 = target.setIndex(offset++) ; 1959 final char ch2 = target.setIndex(i++) ; 1960 if (ch1 == ch2) { 1961 continue; 1962 } 1963 final char uch1 = Character.toUpperCase(ch1); 1964 final char uch2 = Character.toUpperCase(ch2); 1965 if (uch1 == uch2) { 1966 continue; 1967 } 1968 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1969 return false; 1970 } 1971 } 1972 return true; 1973 } 1974 } 1975 1976 static final class ClosureContext { 1977 1978 int[] offsets = new int[4]; 1979 int currentIndex = 0; 1980 1981 boolean contains(int offset) { 1982 for (int i=0; i<currentIndex;++i) { 1983 if (offsets[i] == offset) { 1984 return true; 1985 } 1986 } 1987 return false; 1988 } 1989 1990 void reset() { 1991 currentIndex = 0; 1992 } 1993 1994 void addOffset(int offset) { 1995 // We do not check for duplicates, caller is responsible for that 1996 if (currentIndex == offsets.length) { 1997 offsets = expandOffsets(); 1998 } 1999 offsets[currentIndex++] = offset; 2000 } 2001 2002 private int[] expandOffsets() { 2003 final int len = offsets.length; 2004 final int newLen = len << 1; 2005 int[] newOffsets = new int[newLen]; 2006 2007 System.arraycopy(offsets, 0, newOffsets, 0, currentIndex); 2008 return newOffsets; 2009 } 2010 } 2011 2012 static final class Context { 2013 int start; 2014 int limit; 2015 int length; 2016 Match match; 2017 boolean inuse = false; 2018 ClosureContext[] closureContexts; 2019 2020 private StringTarget stringTarget; 2021 private CharArrayTarget charArrayTarget; 2022 private CharacterIteratorTarget characterIteratorTarget; 2023 2024 ExpressionTarget target; 2025 2026 Context() { 2027 } 2028 2029 private void resetCommon(int nofclosures) { 2030 this.length = this.limit-this.start; 2031 setInUse(true); 2032 this.match = null; 2033 if (this.closureContexts == null || this.closureContexts.length != nofclosures) { 2034 this.closureContexts = new ClosureContext[nofclosures]; 2035 } 2036 for (int i = 0; i < nofclosures; i ++) { 2037 if (this.closureContexts[i] == null) { 2038 this.closureContexts[i] = new ClosureContext(); 2039 } 2040 else { 2041 this.closureContexts[i].reset(); 2042 } 2043 } 2044 } 2045 2046 void reset(CharacterIterator target, int start, int limit, int nofclosures) { 2047 if (characterIteratorTarget == null) { 2048 characterIteratorTarget = new CharacterIteratorTarget(target); 2049 } 2050 else { 2051 characterIteratorTarget.resetTarget(target); 2052 } 2053 this.target = characterIteratorTarget; 2054 this.start = start; 2055 this.limit = limit; 2056 this.resetCommon(nofclosures); 2057 } 2058 2059 void reset(String target, int start, int limit, int nofclosures) { 2060 if (stringTarget == null) { 2061 stringTarget = new StringTarget(target); 2062 } 2063 else { 2064 stringTarget.resetTarget(target); 2065 } 2066 this.target = stringTarget; 2067 this.start = start; 2068 this.limit = limit; 2069 this.resetCommon(nofclosures); 2070 } 2071 2072 void reset(char[] target, int start, int limit, int nofclosures) { 2073 if (charArrayTarget == null) { 2074 charArrayTarget = new CharArrayTarget(target); 2075 } 2076 else { 2077 charArrayTarget.resetTarget(target); 2078 } 2079 this.target = charArrayTarget; 2080 this.start = start; 2081 this.limit = limit; 2082 this.resetCommon(nofclosures); 2083 } 2084 synchronized void setInUse(boolean inUse) { 2085 this.inuse = inUse; 2086 } 2087 } 2088 2089 /** 2090 * Prepares for matching. This method is called just before starting matching. 2091 */ 2092 void prepare() { 2093 if (Op.COUNT) Op.nofinstances = 0; 2094 this.compile(this.tokentree); 2095 /* 2096 if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .* 2097 Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@'); 2098 anchor.next = this.operations; 2099 this.operations = anchor; 2100 } 2101 */ 2102 if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances); 2103 2104 this.minlength = this.tokentree.getMinLength(); 2105 2106 this.firstChar = null; 2107 if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) 2108 && !isSet(this.options, XMLSCHEMA_MODE)) { 2109 RangeToken firstChar = Token.createRange(); 2110 int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options); 2111 if (fresult == Token.FC_TERMINAL) { 2112 firstChar.compactRanges(); 2113 this.firstChar = firstChar; 2114 if (DEBUG) 2115 System.err.println("DEBUG: Use the first character optimization: "+firstChar); 2116 } 2117 } 2118 2119 if (this.operations != null 2120 && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR) 2121 && this.operations.next == null) { 2122 if (DEBUG) 2123 System.err.print(" *** Only fixed string! *** "); 2124 this.fixedStringOnly = true; 2125 if (this.operations.type == Op.STRING) 2126 this.fixedString = this.operations.getString(); 2127 else if (this.operations.getData() >= 0x10000) { // Op.CHAR 2128 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData()); 2129 } else { 2130 char[] ac = new char[1]; 2131 ac[0] = (char)this.operations.getData(); 2132 this.fixedString = new String(ac); 2133 } 2134 this.fixedStringOptions = this.options; 2135 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2136 isSet(this.fixedStringOptions, IGNORE_CASE)); 2137 } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION) 2138 && !isSet(this.options, XMLSCHEMA_MODE)) { 2139 Token.FixedStringContainer container = new Token.FixedStringContainer(); 2140 this.tokentree.findFixedString(container, this.options); 2141 this.fixedString = container.token == null ? null : container.token.getString(); 2142 this.fixedStringOptions = container.options; 2143 if (this.fixedString != null && this.fixedString.length() < 2) 2144 this.fixedString = null; 2145 // This pattern has a fixed string of which length is more than one. 2146 if (this.fixedString != null) { 2147 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2148 isSet(this.fixedStringOptions, IGNORE_CASE)); 2149 if (DEBUG) { 2150 System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length() 2151 +"/" //+this.fixedString 2152 +"/"+REUtil.createOptionString(this.fixedStringOptions)); 2153 System.err.print("String: "); 2154 REUtil.dumpString(this.fixedString); 2155 } 2156 } 2157 } 2158 } 2159 2160 /** 2161 * An option. 2162 * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span> 2163 * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span> 2164 * does not capture. 2165 * 2166 * @see #RegularExpression(java.lang.String,int) 2167 * @see #setPattern(java.lang.String,int) 2168 static final int MARK_PARENS = 1<<0; 2169 */ 2170 2171 /** 2172 * "i" 2173 */ 2174 static final int IGNORE_CASE = 1<<1; 2175 2176 /** 2177 * "s" 2178 */ 2179 static final int SINGLE_LINE = 1<<2; 2180 2181 /** 2182 * "m" 2183 */ 2184 static final int MULTIPLE_LINES = 1<<3; 2185 2186 /** 2187 * "x" 2188 */ 2189 static final int EXTENDED_COMMENT = 1<<4; 2190 2191 /** 2192 * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>. 2193 * 2194 * @see #RegularExpression(java.lang.String,int) 2195 * @see #setPattern(java.lang.String,int) 2196 * @see #UNICODE_WORD_BOUNDARY 2197 */ 2198 static final int USE_UNICODE_CATEGORY = 1<<5; // "u" 2199 2200 /** 2201 * An option. 2202 * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>. 2203 * <p>By default, the engine considers a position between a word character 2204 * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character 2205 * is a word boundary. 2206 * <p>By this option, the engine checks word boundaries with the method of 2207 * 'Unicode Regular Expression Guidelines' Revision 4. 2208 * 2209 * @see #RegularExpression(java.lang.String,int) 2210 * @see #setPattern(java.lang.String,int) 2211 */ 2212 static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w" 2213 2214 /** 2215 * "H" 2216 */ 2217 static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7; 2218 /** 2219 * "F" 2220 */ 2221 static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8; 2222 /** 2223 * "X". XML Schema mode. 2224 */ 2225 static final int XMLSCHEMA_MODE = 1<<9; 2226 /** 2227 * ",". 2228 */ 2229 static final int SPECIAL_COMMA = 1<<10; 2230 2231 2232 private static final boolean isSet(int options, int flag) { 2233 return (options & flag) == flag; 2234 } 2235 2236 /** 2237 * Creates a new RegularExpression instance. 2238 * 2239 * @param regex A regular expression 2240 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2241 */ 2242 public RegularExpression(String regex) throws ParseException { 2243 this(regex, null); 2244 } 2245 2246 /** 2247 * Creates a new RegularExpression instance with options. 2248 * 2249 * @param regex A regular expression 2250 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" 2251 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2252 */ 2253 public RegularExpression(String regex, String options) throws ParseException { 2254 this.setPattern(regex, options); 2255 } 2256 2257 /** 2258 * Creates a new RegularExpression instance with options. 2259 * 2260 * @param regex A regular expression 2261 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" 2262 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2263 */ 2264 public RegularExpression(String regex, String options, Locale locale) throws ParseException { 2265 this.setPattern(regex, options, locale); 2266 } 2267 2268 RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) { 2269 this.regex = regex; 2270 this.tokentree = tok; 2271 this.nofparen = parens; 2272 this.options = options; 2273 this.hasBackReferences = hasBackReferences; 2274 } 2275 2276 /** 2277 * 2278 */ 2279 public void setPattern(String newPattern) throws ParseException { 2280 this.setPattern(newPattern, Locale.getDefault()); 2281 } 2282 2283 public void setPattern(String newPattern, Locale locale) throws ParseException { 2284 this.setPattern(newPattern, this.options, locale); 2285 } 2286 2287 private void setPattern(String newPattern, int options, Locale locale) throws ParseException { 2288 this.regex = newPattern; 2289 this.options = options; 2290 RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE) 2291 ? new ParserForXMLSchema(locale) : new RegexParser(locale); 2292 this.tokentree = rp.parse(this.regex, this.options); 2293 this.nofparen = rp.parennumber; 2294 this.hasBackReferences = rp.hasBackReferences; 2295 2296 this.operations = null; 2297 this.context = null; 2298 } 2299 /** 2300 * 2301 */ 2302 public void setPattern(String newPattern, String options) throws ParseException { 2303 this.setPattern(newPattern, options, Locale.getDefault()); 2304 } 2305 2306 public void setPattern(String newPattern, String options, Locale locale) throws ParseException { 2307 this.setPattern(newPattern, REUtil.parseOptions(options), locale); 2308 } 2309 2310 /** 2311 * 2312 */ 2313 public String getPattern() { 2314 return this.regex; 2315 } 2316 2317 /** 2318 * Represents this instence in String. 2319 */ 2320 public String toString() { 2321 return this.tokentree.toString(this.options); 2322 } 2323 2324 /** 2325 * Returns a option string. 2326 * The order of letters in it may be different from a string specified 2327 * in a constructor or <code>setPattern()</code>. 2328 * 2329 * @see #RegularExpression(java.lang.String,java.lang.String) 2330 * @see #setPattern(java.lang.String,java.lang.String) 2331 */ 2332 public String getOptions() { 2333 return REUtil.createOptionString(this.options); 2334 } 2335 2336 /** 2337 * Return true if patterns are the same and the options are equivalent. 2338 */ 2339 public boolean equals(Object obj) { 2340 if (obj == null) return false; 2341 if (!(obj instanceof RegularExpression)) 2342 return false; 2343 RegularExpression r = (RegularExpression)obj; 2344 return this.regex.equals(r.regex) && this.options == r.options; 2345 } 2346 2347 boolean equals(String pattern, int options) { 2348 return this.regex.equals(pattern) && this.options == options; 2349 } 2350 2351 /** 2352 * 2353 */ 2354 public int hashCode() { 2355 return (this.regex+"/"+this.getOptions()).hashCode(); 2356 } 2357 2358 /** 2359 * Return the number of regular expression groups. 2360 * This method returns 1 when the regular expression has no capturing-parenthesis. 2361 * 2362 */ 2363 public int getNumberOfGroups() { 2364 return this.nofparen; 2365 } 2366 2367 // ================================================================ 2368 2369 private static final int WT_IGNORE = 0; 2370 private static final int WT_LETTER = 1; 2371 private static final int WT_OTHER = 2; 2372 private static final int getWordType0(char ch, int opts) { 2373 if (!isSet(opts, UNICODE_WORD_BOUNDARY)) { 2374 if (isSet(opts, USE_UNICODE_CATEGORY)) { 2375 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER; 2376 } 2377 return isWordChar(ch) ? WT_LETTER : WT_OTHER; 2378 } 2379 2380 switch (Character.getType(ch)) { 2381 case Character.UPPERCASE_LETTER: // L 2382 case Character.LOWERCASE_LETTER: // L 2383 case Character.TITLECASE_LETTER: // L 2384 case Character.MODIFIER_LETTER: // L 2385 case Character.OTHER_LETTER: // L 2386 case Character.LETTER_NUMBER: // N 2387 case Character.DECIMAL_DIGIT_NUMBER: // N 2388 case Character.OTHER_NUMBER: // N 2389 case Character.COMBINING_SPACING_MARK: // Mc 2390 return WT_LETTER; 2391 2392 case Character.FORMAT: // Cf 2393 case Character.NON_SPACING_MARK: // Mn 2394 case Character.ENCLOSING_MARK: // Mc 2395 return WT_IGNORE; 2396 2397 case Character.CONTROL: // Cc 2398 switch (ch) { 2399 case '\t': 2400 case '\n': 2401 case '\u000B': 2402 case '\f': 2403 case '\r': 2404 return WT_OTHER; 2405 default: 2406 return WT_IGNORE; 2407 } 2408 2409 default: 2410 return WT_OTHER; 2411 } 2412 } 2413 2414 // ================================================================ 2415 2416 static final int LINE_FEED = 0x000A; 2417 static final int CARRIAGE_RETURN = 0x000D; 2418 static final int LINE_SEPARATOR = 0x2028; 2419 static final int PARAGRAPH_SEPARATOR = 0x2029; 2420 2421 private static final boolean isEOLChar(int ch) { 2422 return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR 2423 || ch == PARAGRAPH_SEPARATOR; 2424 } 2425 2426 private static final boolean isWordChar(int ch) { // Legacy word characters 2427 if (ch == '_') return true; 2428 if (ch < '0') return false; 2429 if (ch > 'z') return false; 2430 if (ch <= '9') return true; 2431 if (ch < 'A') return false; 2432 if (ch <= 'Z') return true; 2433 if (ch < 'a') return false; 2434 return true; 2435 } 2436 2437 private static final boolean matchIgnoreCase(int chardata, int ch) { 2438 if (chardata == ch) return true; 2439 if (chardata > 0xffff || ch > 0xffff) return false; 2440 char uch1 = Character.toUpperCase((char)chardata); 2441 char uch2 = Character.toUpperCase((char)ch); 2442 if (uch1 == uch2) return true; 2443 return Character.toLowerCase(uch1) == Character.toLowerCase(uch2); 2444 } 2445 }