1 /* 2 * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.util.regex; 27 28 import java.security.AccessController; 29 import java.security.PrivilegedAction; 30 import java.text.CharacterIterator; 31 import java.text.Normalizer; 32 import java.util.Locale; 33 import java.util.Map; 34 import java.util.ArrayList; 35 import java.util.HashMap; 36 import java.util.Arrays; 37 38 39 /** 40 * A compiled representation of a regular expression. 41 * 42 * <p> A regular expression, specified as a string, must first be compiled into 43 * an instance of this class. The resulting pattern can then be used to create 44 * a {@link Matcher} object that can match arbitrary {@link 45 * java.lang.CharSequence </code>character sequences<code>} against the regular 46 * expression. All of the state involved in performing a match resides in the 47 * matcher, so many matchers can share the same pattern. 48 * 49 * <p> A typical invocation sequence is thus 50 * 51 * <blockquote><pre> 52 * Pattern p = Pattern.{@link #compile compile}("a*b"); 53 * Matcher m = p.{@link #matcher matcher}("aaaaab"); 54 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote> 55 * 56 * <p> A {@link #matches matches} method is defined by this class as a 57 * convenience for when a regular expression is used just once. This method 58 * compiles an expression and matches an input sequence against it in a single 59 * invocation. The statement 60 * 61 * <blockquote><pre> 62 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote> 63 * 64 * is equivalent to the three statements above, though for repeated matches it 65 * is less efficient since it does not allow the compiled pattern to be reused. 66 * 67 * <p> Instances of this class are immutable and are safe for use by multiple 68 * concurrent threads. Instances of the {@link Matcher} class are not safe for 69 * such use. 70 * 71 * 72 * <a name="sum"> 73 * <h4> Summary of regular-expression constructs </h4> 74 * 75 * <table border="0" cellpadding="1" cellspacing="0" 76 * summary="Regular expression constructs, and what they match"> 77 * 78 * <tr align="left"> 79 * <th bgcolor="#CCCCFF" align="left" id="construct">Construct</th> 80 * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th> 81 * </tr> 82 * 83 * <tr><th> </th></tr> 84 * <tr align="left"><th colspan="2" id="characters">Characters</th></tr> 85 * 86 * <tr><td valign="top" headers="construct characters"><i>x</i></td> 87 * <td headers="matches">The character <i>x</i></td></tr> 88 * <tr><td valign="top" headers="construct characters"><tt>\\</tt></td> 89 * <td headers="matches">The backslash character</td></tr> 90 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>n</i></td> 91 * <td headers="matches">The character with octal value <tt>0</tt><i>n</i> 92 * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr> 93 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>nn</i></td> 94 * <td headers="matches">The character with octal value <tt>0</tt><i>nn</i> 95 * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr> 96 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>mnn</i></td> 97 * <td headers="matches">The character with octal value <tt>0</tt><i>mnn</i> 98 * (0 <tt><=</tt> <i>m</i> <tt><=</tt> 3, 99 * 0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr> 100 * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>hh</i></td> 101 * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hh</i></td></tr> 102 * <tr><td valign="top" headers="construct characters"><tt>\u</tt><i>hhhh</i></td> 103 * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hhhh</i></td></tr> 104 * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>{h...h}</i></td> 105 * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>h...h</i> 106 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} 107 * <= <tt>0x</tt><i>h...h</i> <=  108 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr> 109 * <tr><td valign="top" headers="matches"><tt>\t</tt></td> 110 * <td headers="matches">The tab character (<tt>'\u0009'</tt>)</td></tr> 111 * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td> 112 * <td headers="matches">The newline (line feed) character (<tt>'\u000A'</tt>)</td></tr> 113 * <tr><td valign="top" headers="construct characters"><tt>\r</tt></td> 114 * <td headers="matches">The carriage-return character (<tt>'\u000D'</tt>)</td></tr> 115 * <tr><td valign="top" headers="construct characters"><tt>\f</tt></td> 116 * <td headers="matches">The form-feed character (<tt>'\u000C'</tt>)</td></tr> 117 * <tr><td valign="top" headers="construct characters"><tt>\a</tt></td> 118 * <td headers="matches">The alert (bell) character (<tt>'\u0007'</tt>)</td></tr> 119 * <tr><td valign="top" headers="construct characters"><tt>\e</tt></td> 120 * <td headers="matches">The escape character (<tt>'\u001B'</tt>)</td></tr> 121 * <tr><td valign="top" headers="construct characters"><tt>\c</tt><i>x</i></td> 122 * <td headers="matches">The control character corresponding to <i>x</i></td></tr> 123 * 124 * <tr><th> </th></tr> 125 * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr> 126 * 127 * <tr><td valign="top" headers="construct classes"><tt>[abc]</tt></td> 128 * <td headers="matches"><tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (simple class)</td></tr> 129 * <tr><td valign="top" headers="construct classes"><tt>[^abc]</tt></td> 130 * <td headers="matches">Any character except <tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (negation)</td></tr> 131 * <tr><td valign="top" headers="construct classes"><tt>[a-zA-Z]</tt></td> 132 * <td headers="matches"><tt>a</tt> through <tt>z</tt> 133 * or <tt>A</tt> through <tt>Z</tt>, inclusive (range)</td></tr> 134 * <tr><td valign="top" headers="construct classes"><tt>[a-d[m-p]]</tt></td> 135 * <td headers="matches"><tt>a</tt> through <tt>d</tt>, 136 * or <tt>m</tt> through <tt>p</tt>: <tt>[a-dm-p]</tt> (union)</td></tr> 137 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[def]]</tt></td> 138 * <td headers="matches"><tt>d</tt>, <tt>e</tt>, or <tt>f</tt> (intersection)</tr> 139 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^bc]]</tt></td> 140 * <td headers="matches"><tt>a</tt> through <tt>z</tt>, 141 * except for <tt>b</tt> and <tt>c</tt>: <tt>[ad-z]</tt> (subtraction)</td></tr> 142 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^m-p]]</tt></td> 143 * <td headers="matches"><tt>a</tt> through <tt>z</tt>, 144 * and not <tt>m</tt> through <tt>p</tt>: <tt>[a-lq-z]</tt>(subtraction)</td></tr> 145 * <tr><th> </th></tr> 146 * 147 * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr> 148 * 149 * <tr><td valign="top" headers="construct predef"><tt>.</tt></td> 150 * <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr> 151 * <tr><td valign="top" headers="construct predef"><tt>\d</tt></td> 152 * <td headers="matches">A digit: <tt>[0-9]</tt></td></tr> 153 * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td> 154 * <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr> 155 * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td> 156 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr> 157 * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td> 158 * <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr> 159 * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td> 160 * <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr> 161 * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td> 162 * <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr> 163 * 164 * <tr><th> </th></tr> 165 * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr> 166 * 167 * <tr><td valign="top" headers="construct posix"><tt>\p{Lower}</tt></td> 168 * <td headers="matches">A lower-case alphabetic character: <tt>[a-z]</tt></td></tr> 169 * <tr><td valign="top" headers="construct posix"><tt>\p{Upper}</tt></td> 170 * <td headers="matches">An upper-case alphabetic character:<tt>[A-Z]</tt></td></tr> 171 * <tr><td valign="top" headers="construct posix"><tt>\p{ASCII}</tt></td> 172 * <td headers="matches">All ASCII:<tt>[\x00-\x7F]</tt></td></tr> 173 * <tr><td valign="top" headers="construct posix"><tt>\p{Alpha}</tt></td> 174 * <td headers="matches">An alphabetic character:<tt>[\p{Lower}\p{Upper}]</tt></td></tr> 175 * <tr><td valign="top" headers="construct posix"><tt>\p{Digit}</tt></td> 176 * <td headers="matches">A decimal digit: <tt>[0-9]</tt></td></tr> 177 * <tr><td valign="top" headers="construct posix"><tt>\p{Alnum}</tt></td> 178 * <td headers="matches">An alphanumeric character:<tt>[\p{Alpha}\p{Digit}]</tt></td></tr> 179 * <tr><td valign="top" headers="construct posix"><tt>\p{Punct}</tt></td> 180 * <td headers="matches">Punctuation: One of <tt>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</tt></td></tr> 181 * <!-- <tt>[\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]</tt> 182 * <tt>[\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]</tt> --> 183 * <tr><td valign="top" headers="construct posix"><tt>\p{Graph}</tt></td> 184 * <td headers="matches">A visible character: <tt>[\p{Alnum}\p{Punct}]</tt></td></tr> 185 * <tr><td valign="top" headers="construct posix"><tt>\p{Print}</tt></td> 186 * <td headers="matches">A printable character: <tt>[\p{Graph}\x20]</tt></td></tr> 187 * <tr><td valign="top" headers="construct posix"><tt>\p{Blank}</tt></td> 188 * <td headers="matches">A space or a tab: <tt>[ \t]</tt></td></tr> 189 * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td> 190 * <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr> 191 * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td> 192 * <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr> 193 * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td> 194 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr> 195 * 196 * <tr><th> </th></tr> 197 * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr> 198 * 199 * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td> 200 * <td>Equivalent to java.lang.Character.isLowerCase()</td></tr> 201 * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td> 202 * <td>Equivalent to java.lang.Character.isUpperCase()</td></tr> 203 * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td> 204 * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr> 205 * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td> 206 * <td>Equivalent to java.lang.Character.isMirrored()</td></tr> 207 * 208 * <tr><th> </th></tr> 209 * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr> 210 * * <tr><td valign="top" headers="construct unicode"><tt>\p{IsLatin}</tt></td> 211 * <td headers="matches">A Latin script character (<a href="#usc">script</a>)</td></tr> 212 * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td> 213 * <td headers="matches">A character in the Greek block (<a href="#ubc">block</a>)</td></tr> 214 * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td> 215 * <td headers="matches">An uppercase letter (<a href="#ucc">category</a>)</td></tr> 216 * <tr><td valign="top" headers="construct unicode"><tt>\p{IsAlphabetic}</tt></td> 217 * <td headers="matches">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr> 218 * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td> 219 * <td headers="matches">A currency symbol</td></tr> 220 * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td> 221 * <td headers="matches">Any character except one in the Greek block (negation)</td></tr> 222 * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td> 223 * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr> 224 * 225 * <tr><th> </th></tr> 226 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr> 227 * 228 * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td> 229 * <td headers="matches">The beginning of a line</td></tr> 230 * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td> 231 * <td headers="matches">The end of a line</td></tr> 232 * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td> 233 * <td headers="matches">A word boundary</td></tr> 234 * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td> 235 * <td headers="matches">A non-word boundary</td></tr> 236 * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td> 237 * <td headers="matches">The beginning of the input</td></tr> 238 * <tr><td valign="top" headers="construct bounds"><tt>\G</tt></td> 239 * <td headers="matches">The end of the previous match</td></tr> 240 * <tr><td valign="top" headers="construct bounds"><tt>\Z</tt></td> 241 * <td headers="matches">The end of the input but for the final 242 * <a href="#lt">terminator</a>, if any</td></tr> 243 * <tr><td valign="top" headers="construct bounds"><tt>\z</tt></td> 244 * <td headers="matches">The end of the input</td></tr> 245 * 246 * <tr><th> </th></tr> 247 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr> 248 * 249 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td> 250 * <td headers="matches"><i>X</i>, once or not at all</td></tr> 251 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>*</tt></td> 252 * <td headers="matches"><i>X</i>, zero or more times</td></tr> 253 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>+</tt></td> 254 * <td headers="matches"><i>X</i>, one or more times</td></tr> 255 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>}</tt></td> 256 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr> 257 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,}</tt></td> 258 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr> 259 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt></td> 260 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 261 * 262 * <tr><th> </th></tr> 263 * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr> 264 * 265 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>??</tt></td> 266 * <td headers="matches"><i>X</i>, once or not at all</td></tr> 267 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>*?</tt></td> 268 * <td headers="matches"><i>X</i>, zero or more times</td></tr> 269 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>+?</tt></td> 270 * <td headers="matches"><i>X</i>, one or more times</td></tr> 271 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>}?</tt></td> 272 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr> 273 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,}?</tt></td> 274 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr> 275 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}?</tt></td> 276 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 277 * 278 * <tr><th> </th></tr> 279 * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr> 280 * 281 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>?+</tt></td> 282 * <td headers="matches"><i>X</i>, once or not at all</td></tr> 283 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>*+</tt></td> 284 * <td headers="matches"><i>X</i>, zero or more times</td></tr> 285 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>++</tt></td> 286 * <td headers="matches"><i>X</i>, one or more times</td></tr> 287 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>}+</tt></td> 288 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr> 289 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,}+</tt></td> 290 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr> 291 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td> 292 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 293 * 294 * <tr><th> </th></tr> 295 * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr> 296 * 297 * <tr><td valign="top" headers="construct logical"><i>XY</i></td> 298 * <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr> 299 * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td> 300 * <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr> 301 * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td> 302 * <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr> 303 * 304 * <tr><th> </th></tr> 305 * <tr align="left"><th colspan="2" id="backref">Back references</th></tr> 306 * 307 * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td> 308 * <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup> 309 * <a href="#cg">capturing group</a> matched</td></tr> 310 * 311 * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>k</i><<i>name</i>></td> 312 * <td valign="bottom" headers="matches">Whatever the 313 * <a href="#groupname">named-capturing group</a> "name" matched</td></tr> 314 * 315 * <tr><th> </th></tr> 316 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr> 317 * 318 * <tr><td valign="top" headers="construct quot"><tt>\</tt></td> 319 * <td headers="matches">Nothing, but quotes the following character</td></tr> 320 * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td> 321 * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr> 322 * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td> 323 * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr> 324 * <!-- Metachars: !$()*+.<>?[\]^{|} --> 325 * 326 * <tr><th> </th></tr> 327 * <tr align="left"><th colspan="2" id="special">Special constructs (named-capturing and non-capturing)</th></tr> 328 * 329 * <tr><td valign="top" headers="construct special"><tt>(?<<a href="#groupname">name</a>></tt><i>X</i><tt>)</tt></td> 330 * <td headers="matches"><i>X</i>, as a named-capturing group</td></tr> 331 * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td> 332 * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr> 333 * <tr><td valign="top" headers="construct special"><tt>(?idmsuxU-idmsuxU) </tt></td> 334 * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a> 335 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> 336 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a> 337 * on - off</td></tr> 338 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt> </td> 339 * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the 340 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a> 341 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a > 342 * <a href="#COMMENTS">x</a> on - off</td></tr> 343 * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td> 344 * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr> 345 * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td> 346 * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr> 347 * <tr><td valign="top" headers="construct special"><tt>(?<=</tt><i>X</i><tt>)</tt></td> 348 * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr> 349 * <tr><td valign="top" headers="construct special"><tt>(?<!</tt><i>X</i><tt>)</tt></td> 350 * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr> 351 * <tr><td valign="top" headers="construct special"><tt>(?></tt><i>X</i><tt>)</tt></td> 352 * <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr> 353 * 354 * </table> 355 * 356 * <hr> 357 * 358 * 359 * <a name="bs"> 360 * <h4> Backslashes, escapes, and quoting </h4> 361 * 362 * <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped 363 * constructs, as defined in the table above, as well as to quote characters 364 * that otherwise would be interpreted as unescaped constructs. Thus the 365 * expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a 366 * left brace. 367 * 368 * <p> It is an error to use a backslash prior to any alphabetic character that 369 * does not denote an escaped construct; these are reserved for future 370 * extensions to the regular-expression language. A backslash may be used 371 * prior to a non-alphabetic character regardless of whether that character is 372 * part of an unescaped construct. 373 * 374 * <p> Backslashes within string literals in Java source code are interpreted 375 * as required by 376 * <cite>The Java™ Language Specification</cite> 377 * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6) 378 * It is therefore necessary to double backslashes in string 379 * literals that represent regular expressions to protect them from 380 * interpretation by the Java bytecode compiler. The string literal 381 * <tt>"\b"</tt>, for example, matches a single backspace character when 382 * interpreted as a regular expression, while <tt>"\\b"</tt> matches a 383 * word boundary. The string literal <tt>"\(hello\)"</tt> is illegal 384 * and leads to a compile-time error; in order to match the string 385 * <tt>(hello)</tt> the string literal <tt>"\\(hello\\)"</tt> 386 * must be used. 387 * 388 * <a name="cc"> 389 * <h4> Character Classes </h4> 390 * 391 * <p> Character classes may appear within other character classes, and 392 * may be composed by the union operator (implicit) and the intersection 393 * operator (<tt>&&</tt>). 394 * The union operator denotes a class that contains every character that is 395 * in at least one of its operand classes. The intersection operator 396 * denotes a class that contains every character that is in both of its 397 * operand classes. 398 * 399 * <p> The precedence of character-class operators is as follows, from 400 * highest to lowest: 401 * 402 * <blockquote><table border="0" cellpadding="1" cellspacing="0" 403 * summary="Precedence of character class operators."> 404 * <tr><th>1 </th> 405 * <td>Literal escape </td> 406 * <td><tt>\x</tt></td></tr> 407 * <tr><th>2 </th> 408 * <td>Grouping</td> 409 * <td><tt>[...]</tt></td></tr> 410 * <tr><th>3 </th> 411 * <td>Range</td> 412 * <td><tt>a-z</tt></td></tr> 413 * <tr><th>4 </th> 414 * <td>Union</td> 415 * <td><tt>[a-e][i-u]</tt></td></tr> 416 * <tr><th>5 </th> 417 * <td>Intersection</td> 418 * <td><tt>[a-z&&[aeiou]]</tt></td></tr> 419 * </table></blockquote> 420 * 421 * <p> Note that a different set of metacharacters are in effect inside 422 * a character class than outside a character class. For instance, the 423 * regular expression <tt>.</tt> loses its special meaning inside a 424 * character class, while the expression <tt>-</tt> becomes a range 425 * forming metacharacter. 426 * 427 * <a name="lt"> 428 * <h4> Line terminators </h4> 429 * 430 * <p> A <i>line terminator</i> is a one- or two-character sequence that marks 431 * the end of a line of the input character sequence. The following are 432 * recognized as line terminators: 433 * 434 * <ul> 435 * 436 * <li> A newline (line feed) character (<tt>'\n'</tt>), 437 * 438 * <li> A carriage-return character followed immediately by a newline 439 * character (<tt>"\r\n"</tt>), 440 * 441 * <li> A standalone carriage-return character (<tt>'\r'</tt>), 442 * 443 * <li> A next-line character (<tt>'\u0085'</tt>), 444 * 445 * <li> A line-separator character (<tt>'\u2028'</tt>), or 446 * 447 * <li> A paragraph-separator character (<tt>'\u2029</tt>). 448 * 449 * </ul> 450 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators 451 * recognized are newline characters. 452 * 453 * <p> The regular expression <tt>.</tt> matches any character except a line 454 * terminator unless the {@link #DOTALL} flag is specified. 455 * 456 * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore 457 * line terminators and only match at the beginning and the end, respectively, 458 * of the entire input sequence. If {@link #MULTILINE} mode is activated then 459 * <tt>^</tt> matches at the beginning of input and after any line terminator 460 * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt> 461 * matches just before a line terminator or the end of the input sequence. 462 * 463 * <a name="cg"> 464 * <h4> Groups and capturing </h4> 465 * 466 * <a name="gnumber"> 467 * <h5> Group number </h5> 468 * <p> Capturing groups are numbered by counting their opening parentheses from 469 * left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there 470 * are four such groups: </p> 471 * 472 * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings"> 473 * <tr><th>1 </th> 474 * <td><tt>((A)(B(C)))</tt></td></tr> 475 * <tr><th>2 </th> 476 * <td><tt>(A)</tt></td></tr> 477 * <tr><th>3 </th> 478 * <td><tt>(B(C))</tt></td></tr> 479 * <tr><th>4 </th> 480 * <td><tt>(C)</tt></td></tr> 481 * </table></blockquote> 482 * 483 * <p> Group zero always stands for the entire expression. 484 * 485 * <p> Capturing groups are so named because, during a match, each subsequence 486 * of the input sequence that matches such a group is saved. The captured 487 * subsequence may be used later in the expression, via a back reference, and 488 * may also be retrieved from the matcher once the match operation is complete. 489 * 490 * <a name="groupname"> 491 * <h5> Group name </h5> 492 * <p>A capturing group can also be assigned a "name", a <tt>named-capturing group</tt>, 493 * and then be back-referenced later by the "name". Group names are composed of 494 * the following characters. The first character must be a <tt>letter</tt>. 495 * 496 * <ul> 497 * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt> 498 * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>), 499 * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt> 500 * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>), 501 * <li> The digits <tt>'0'</tt> through <tt>'9'</tt> 502 * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>), 503 * </ul> 504 * 505 * <p> A <tt>named-capturing group</tt> is still numbered as described in 506 * <a href="#gnumber">Group number</a>. 507 * 508 * <p> The captured input associated with a group is always the subsequence 509 * that the group most recently matched. If a group is evaluated a second time 510 * because of quantification then its previously-captured value, if any, will 511 * be retained if the second evaluation fails. Matching the string 512 * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves 513 * group two set to <tt>"b"</tt>. All captured input is discarded at the 514 * beginning of each match. 515 * 516 * <p> Groups beginning with <tt>(?</tt> are either pure, <i>non-capturing</i> groups 517 * that do not capture text and do not count towards the group total, or 518 * <i>named-capturing</i> group. 519 * 520 * <h4> Unicode support </h4> 521 * 522 * <p> This class is in conformance with Level 1 of <a 523 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 524 * Standard #18: Unicode Regular Expression</i></a>, plus RL2.1 525 * Canonical Equivalents. 526 * <p> 527 * <b>Unicode escape sequences</b> such as <tt>\u2014</tt> in Java source code 528 * are processed as described in section 3.3 of 529 * <cite>The Java™ Language Specification</cite>. 530 * Such escape sequences are also implemented directly by the regular-expression 531 * parser so that Unicode escapes can be used in expressions that are read from 532 * files or from the keyboard. Thus the strings <tt>"\u2014"</tt> and 533 * <tt>"\\u2014"</tt>, while not equal, compile into the same pattern, which 534 * matches the character with hexadecimal value <tt>0x2014</tt>. 535 * <p> 536 * A Unicode character can also be represented in a regular-expression by 537 * using its <b>Hex notation</b>(hexadecimal code point value) directly as described in construct 538 * <tt>\x{...}</tt>, for example a supplementary character U+2011F 539 * can be specified as <tt>\x{2011F}</tt>, instead of two consecutive 540 * Unicode escape sequences of the surrogate pair 541 * <tt>\uD840</tt><tt>\uDD1F</tt>. 542 * <p> 543 * Unicode scripts, blocks, categories and binary properties are written with 544 * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl. 545 * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if 546 * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> 547 * does not match if the input has that property. 548 * <p> 549 * Scripts, blocks, categories and binary properties can be used both inside 550 * and outside of a character class. 551 * <a name="usc"> 552 * <p> 553 * <b>Scripts</b> are specified either with the prefix {@code Is}, as in 554 * {@code IsHiragana}, or by using the {@code script} keyword (or its short 555 * form {@code sc})as in {@code script=Hiragana} or {@code sc=Hiragana}. 556 * <p> 557 * The script names supported by <code>Pattern</code> are the valid script names 558 * accepted and defined by 559 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. 560 * <a name="ubc"> 561 * <p> 562 * <b>Blocks</b> are specified with the prefix {@code In}, as in 563 * {@code InMongolian}, or by using the keyword {@code block} (or its short 564 * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. 565 * <p> 566 * The block names supported by <code>Pattern</code> are the valid block names 567 * accepted and defined by 568 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. 569 * <p> 570 * <a name="ucc"> 571 * <b>Categories</b> may be specified with the optional prefix {@code Is}: 572 * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode 573 * letters. Same as scripts and blocks, categories can also be specified 574 * by using the keyword {@code general_category} (or its short form 575 * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. 576 * <p> 577 * The supported categories are those of 578 * <a href="http://www.unicode.org/unicode/standard/standard.html"> 579 * <i>The Unicode Standard</i></a> in the version specified by the 580 * {@link java.lang.Character Character} class. The category names are those 581 * defined in the Standard, both normative and informative. 582 * <p> 583 * <a name="ubpc"> 584 * <b>Binary properties</b> are specified with the prefix {@code Is}, as in 585 * {@code IsAlphabetic}. The supported binary properties by <code>Pattern</code> 586 * are 587 * <ul> 588 * <li> Alphabetic 589 * <li> Ideographic 590 * <li> Letter 591 * <li> Lowercase 592 * <li> Uppercase 593 * <li> Titlecase 594 * <li> Punctuation 595 * <Li> Control 596 * <li> White_Space 597 * <li> Digit 598 * <li> Hex_Digit 599 * <li> Noncharacter_Code_Point 600 * <li> Assigned 601 * </ul> 602 603 604 * <p> 605 * <b>Predefined Character classes</b> and <b>POSIX character classes</b> are in 606 * conformance with the recommendation of <i>Annex C: Compatibility Properties</i> 607 * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression 608 * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified. 609 * <p> 610 * <table border="0" cellpadding="1" cellspacing="0" 611 * summary="predefined and posix character classes in Unicode mode"> 612 * <tr align="left"> 613 * <th bgcolor="#CCCCFF" align="left" id="classes">Classes</th> 614 * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th> 615 *</tr> 616 * <tr><td><tt>\p{Lower}</tt></td> 617 * <td>A lowercase character:<tt>\p{IsLowercase}</tt></td></tr> 618 * <tr><td><tt>\p{Upper}</tt></td> 619 * <td>An uppercase character:<tt>\p{IsUppercase}</tt></td></tr> 620 * <tr><td><tt>\p{ASCII}</tt></td> 621 * <td>All ASCII:<tt>[\x00-\x7F]</tt></td></tr> 622 * <tr><td><tt>\p{Alpha}</tt></td> 623 * <td>An alphabetic character:<tt>\p{IsAlphabetic}</tt></td></tr> 624 * <tr><td><tt>\p{Digit}</tt></td> 625 * <td>A decimal digit character:<tt>p{IsDigit}</tt></td></tr> 626 * <tr><td><tt>\p{Alnum}</tt></td> 627 * <td>An alphanumeric character:<tt>[\p{IsAlphabetic}\p{IsDigit}]</tt></td></tr> 628 * <tr><td><tt>\p{Punct}</tt></td> 629 * <td>A punctuation character:<tt>p{IsPunctuation}</tt></td></tr> 630 * <tr><td><tt>\p{Graph}</tt></td> 631 * <td>A visible character: <tt>[^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]</tt></td></tr> 632 * <tr><td><tt>\p{Print}</tt></td> 633 * <td>A printable character: <tt>[\p{Graph}\p{Blank}&&[^\p{Cntrl}]]</tt></td></tr> 634 * <tr><td><tt>\p{Blank}</tt></td> 635 * <td>A space or a tab: <tt>[\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]</tt></td></tr> 636 * <tr><td><tt>\p{Cntrl}</tt></td> 637 * <td>A control character: <tt>\p{gc=Cc}</tt></td></tr> 638 * <tr><td><tt>\p{XDigit}</tt></td> 639 * <td>A hexadecimal digit: <tt>[\p{gc=Nd}\p{IsHex_Digit}]</tt></td></tr> 640 * <tr><td><tt>\p{Space}</tt></td> 641 * <td>A whitespace character:<tt>\p{IsWhite_Space}</tt></td></tr> 642 * <tr><td><tt>\d</tt></td> 643 * <td>A digit: <tt>\p{IsDigit}</tt></td></tr> 644 * <tr><td><tt>\D</tt></td> 645 * <td>A non-digit: <tt>[^\d]</tt></td></tr> 646 * <tr><td><tt>\s</tt></td> 647 * <td>A whitespace character: <tt>\p{IsWhite_Space}</tt></td></tr> 648 * <tr><td><tt>\S</tt></td> 649 * <td>A non-whitespace character: <tt>[^\s]</tt></td></tr> 650 * <tr><td><tt>\w</tt></td> 651 * <td>A word character: <tt>[\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}]</tt></td></tr> 652 * <tr><td><tt>\W</tt></td> 653 * <td>A non-word character: <tt>[^\w]</tt></td></tr> 654 * </table> 655 * <p> 656 * <a name="jcc"> 657 * Categories that behave like the java.lang.Character 658 * boolean is<i>methodname</i> methods (except for the deprecated ones) are 659 * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where 660 * the specified property has the name <tt>java<i>methodname</i></tt>. 661 * 662 * <h4> Comparison to Perl 5 </h4> 663 * 664 * <p>The <code>Pattern</code> engine performs traditional NFA-based matching 665 * with ordered alternation as occurs in Perl 5. 666 * 667 * <p> Perl constructs not supported by this class: </p> 668 * 669 * <ul> 670 * <li><p> Predefined character classes (Unicode character) 671 * <p><tt>\h </tt>A horizontal whitespace 672 * <p><tt>\H </tt>A non horizontal whitespace 673 * <p><tt>\v </tt>A vertical whitespace 674 * <p><tt>\V </tt>A non vertical whitespace 675 * <p><tt>\R </tt>Any Unicode linebreak sequence 676 * <tt>\u005cu000D\u005cu000A|[\u005cu000A\u005cu000B\u005cu000C\u005cu000D\u005cu0085\u005cu2028\u005cu2029]</tt> 677 * <p><tt>\X </tt>Match Unicode 678 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> 679 * <i>extended grapheme cluster</i></a> 680 * </p></li> 681 * 682 * <li><p> The backreference constructs, <tt>\g{</tt><i>n</i><tt>}</tt> for 683 * the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and 684 * <tt>\g{</tt><i>name</i><tt>}</tt> for 685 * <a href="#groupname">named-capturing group</a>. 686 * </p></li> 687 * 688 * <li><p> The named character construct, <tt>\N{</tt><i>name</i><tt>}</tt> 689 * for a Unicode character by its name. 690 * </p></li> 691 * 692 * <li><p> The conditional constructs 693 * <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>)</tt> and 694 * <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>, 695 * </p></li> 696 * 697 * <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt> 698 * and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li> 699 * 700 * <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li> 701 * 702 * <li><p> The preprocessing operations <tt>\l</tt> <tt>\u</tt>, 703 * <tt>\L</tt>, and <tt>\U</tt>. </p></li> 704 * 705 * </ul> 706 * 707 * <p> Constructs supported by this class but not by Perl: </p> 708 * 709 * <ul> 710 * 711 * <li><p> Character-class union and intersection as described 712 * <a href="#cc">above</a>.</p></li> 713 * 714 * </ul> 715 * 716 * <p> Notable differences from Perl: </p> 717 * 718 * <ul> 719 * 720 * <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted 721 * as back references; a backslash-escaped number greater than <tt>9</tt> is 722 * treated as a back reference if at least that many subexpressions exist, 723 * otherwise it is interpreted, if possible, as an octal escape. In this 724 * class octal escapes must always begin with a zero. In this class, 725 * <tt>\1</tt> through <tt>\9</tt> are always interpreted as back 726 * references, and a larger number is accepted as a back reference if at 727 * least that many subexpressions exist at that point in the regular 728 * expression, otherwise the parser will drop digits until the number is 729 * smaller or equal to the existing number of groups or it is one digit. 730 * </p></li> 731 * 732 * <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes 733 * where the last match left off. This functionality is provided implicitly 734 * by the {@link Matcher} class: Repeated invocations of the {@link 735 * Matcher#find find} method will resume where the last match left off, 736 * unless the matcher is reset. </p></li> 737 * 738 * <li><p> In Perl, embedded flags at the top level of an expression affect 739 * the whole expression. In this class, embedded flags always take effect 740 * at the point at which they appear, whether they are at the top level or 741 * within a group; in the latter case, flags are restored at the end of the 742 * group just as in Perl. </p></li> 743 * 744 * </ul> 745 * 746 * 747 * <p> For a more precise description of the behavior of regular expression 748 * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/"> 749 * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl, 750 * O'Reilly and Associates, 2006.</a> 751 * </p> 752 * 753 * @see java.lang.String#split(String, int) 754 * @see java.lang.String#split(String) 755 * 756 * @author Mike McCloskey 757 * @author Mark Reinhold 758 * @author JSR-51 Expert Group 759 * @since 1.4 760 * @spec JSR-51 761 */ 762 763 public final class Pattern 764 implements java.io.Serializable 765 { 766 767 /** 768 * Regular expression modifier values. Instead of being passed as 769 * arguments, they can also be passed as inline modifiers. 770 * For example, the following statements have the same effect. 771 * <pre> 772 * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M); 773 * RegExp r2 = RegExp.compile("(?im)abc", 0); 774 * </pre> 775 * 776 * The flags are duplicated so that the familiar Perl match flag 777 * names are available. 778 */ 779 780 /** 781 * Enables Unix lines mode. 782 * 783 * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized 784 * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>. 785 * 786 * <p> Unix lines mode can also be enabled via the embedded flag 787 * expression <tt>(?d)</tt>. 788 */ 789 public static final int UNIX_LINES = 0x01; 790 791 /** 792 * Enables case-insensitive matching. 793 * 794 * <p> By default, case-insensitive matching assumes that only characters 795 * in the US-ASCII charset are being matched. Unicode-aware 796 * case-insensitive matching can be enabled by specifying the {@link 797 * #UNICODE_CASE} flag in conjunction with this flag. 798 * 799 * <p> Case-insensitive matching can also be enabled via the embedded flag 800 * expression <tt>(?i)</tt>. 801 * 802 * <p> Specifying this flag may impose a slight performance penalty. </p> 803 */ 804 public static final int CASE_INSENSITIVE = 0x02; 805 806 /** 807 * Permits whitespace and comments in pattern. 808 * 809 * <p> In this mode, whitespace is ignored, and embedded comments starting 810 * with <tt>#</tt> are ignored until the end of a line. 811 * 812 * <p> Comments mode can also be enabled via the embedded flag 813 * expression <tt>(?x)</tt>. 814 */ 815 public static final int COMMENTS = 0x04; 816 817 /** 818 * Enables multiline mode. 819 * 820 * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match 821 * just after or just before, respectively, a line terminator or the end of 822 * the input sequence. By default these expressions only match at the 823 * beginning and the end of the entire input sequence. 824 * 825 * <p> Multiline mode can also be enabled via the embedded flag 826 * expression <tt>(?m)</tt>. </p> 827 */ 828 public static final int MULTILINE = 0x08; 829 830 /** 831 * Enables literal parsing of the pattern. 832 * 833 * <p> When this flag is specified then the input string that specifies 834 * the pattern is treated as a sequence of literal characters. 835 * Metacharacters or escape sequences in the input sequence will be 836 * given no special meaning. 837 * 838 * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on 839 * matching when used in conjunction with this flag. The other flags 840 * become superfluous. 841 * 842 * <p> There is no embedded flag character for enabling literal parsing. 843 * @since 1.5 844 */ 845 public static final int LITERAL = 0x10; 846 847 /** 848 * Enables dotall mode. 849 * 850 * <p> In dotall mode, the expression <tt>.</tt> matches any character, 851 * including a line terminator. By default this expression does not match 852 * line terminators. 853 * 854 * <p> Dotall mode can also be enabled via the embedded flag 855 * expression <tt>(?s)</tt>. (The <tt>s</tt> is a mnemonic for 856 * "single-line" mode, which is what this is called in Perl.) </p> 857 */ 858 public static final int DOTALL = 0x20; 859 860 /** 861 * Enables Unicode-aware case folding. 862 * 863 * <p> When this flag is specified then case-insensitive matching, when 864 * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner 865 * consistent with the Unicode Standard. By default, case-insensitive 866 * matching assumes that only characters in the US-ASCII charset are being 867 * matched. 868 * 869 * <p> Unicode-aware case folding can also be enabled via the embedded flag 870 * expression <tt>(?u)</tt>. 871 * 872 * <p> Specifying this flag may impose a performance penalty. </p> 873 */ 874 public static final int UNICODE_CASE = 0x40; 875 876 /** 877 * Enables canonical equivalence. 878 * 879 * <p> When this flag is specified then two characters will be considered 880 * to match if, and only if, their full canonical decompositions match. 881 * The expression <tt>"a\u030A"</tt>, for example, will match the 882 * string <tt>"\u00E5"</tt> when this flag is specified. By default, 883 * matching does not take canonical equivalence into account. 884 * 885 * <p> There is no embedded flag character for enabling canonical 886 * equivalence. 887 * 888 * <p> Specifying this flag may impose a performance penalty. </p> 889 */ 890 public static final int CANON_EQ = 0x80; 891 892 /** 893 * Enables the Unicode version of <i>Predefined character classes</i> and 894 * <i>POSIX character classes</i>. 895 * 896 * <p> When this flag is specified then the (US-ASCII only) 897 * <i>Predefined character classes</i> and <i>POSIX character classes</i> 898 * are in conformance with 899 * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 900 * Standard #18: Unicode Regular Expression</i></a> 901 * <i>Annex C: Compatibility Properties</i>. 902 * <p> 903 * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded 904 * flag expression <tt>(?U)</tt>. 905 * <p> 906 * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case 907 * folding. 908 * <p> 909 * Specifying this flag may impose a performance penalty. </p> 910 * @since 1.7 911 */ 912 public static final int UNICODE_CHARACTER_CLASS = 0x100; 913 914 /* Pattern has only two serialized components: The pattern string 915 * and the flags, which are all that is needed to recompile the pattern 916 * when it is deserialized. 917 */ 918 919 /** use serialVersionUID from Merlin b59 for interoperability */ 920 private static final long serialVersionUID = 5073258162644648461L; 921 922 /** 923 * The original regular-expression pattern string. 924 * 925 * @serial 926 */ 927 private String pattern; 928 929 /** 930 * The original pattern flags. 931 * 932 * @serial 933 */ 934 private int flags; 935 936 /** 937 * Boolean indicating this Pattern is compiled; this is necessary in order 938 * to lazily compile deserialized Patterns. 939 */ 940 private transient volatile boolean compiled = false; 941 942 /** 943 * The normalized pattern string. 944 */ 945 private transient String normalizedPattern; 946 947 /** 948 * The starting point of state machine for the find operation. This allows 949 * a match to start anywhere in the input. 950 */ 951 transient Node root; 952 953 /** 954 * The root of object tree for a match operation. The pattern is matched 955 * at the beginning. This may include a find that uses BnM or a First 956 * node. 957 */ 958 transient Node matchRoot; 959 960 /** 961 * Temporary storage used by parsing pattern slice. 962 */ 963 transient int[] buffer; 964 965 /** 966 * Map the "name" of the "named capturing group" to its group id 967 * node. 968 */ 969 transient volatile Map<String, Integer> namedGroups; 970 971 /** 972 * Temporary storage used while parsing group references. 973 */ 974 transient GroupHead[] groupNodes; 975 976 /** 977 * Temporary null terminated code point array used by pattern compiling. 978 */ 979 private transient int[] temp; 980 981 /** 982 * The number of capturing groups in this Pattern. Used by matchers to 983 * allocate storage needed to perform a match. 984 */ 985 transient int capturingGroupCount; 986 987 /** 988 * The local variable count used by parsing tree. Used by matchers to 989 * allocate storage needed to perform a match. 990 */ 991 transient int localCount; 992 993 /** 994 * Index into the pattern string that keeps track of how much has been 995 * parsed. 996 */ 997 private transient int cursor; 998 999 /** 1000 * Holds the length of the pattern string. 1001 */ 1002 private transient int patternLength; 1003 1004 /** 1005 * If the Start node might possibly match supplementary characters. 1006 * It is set to true during compiling if 1007 * (1) There is supplementary char in pattern, or 1008 * (2) There is complement node of Category or Block 1009 */ 1010 private transient boolean hasSupplementary; 1011 1012 /** 1013 * Compiles the given regular expression into a pattern. </p> 1014 * 1015 * @param regex 1016 * The expression to be compiled 1017 * 1018 * @throws PatternSyntaxException 1019 * If the expression's syntax is invalid 1020 */ 1021 public static Pattern compile(String regex) { 1022 return new Pattern(regex, 0); 1023 } 1024 1025 /** 1026 * Compiles the given regular expression into a pattern with the given 1027 * flags. </p> 1028 * 1029 * @param regex 1030 * The expression to be compiled 1031 * 1032 * @param flags 1033 * Match flags, a bit mask that may include 1034 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, 1035 * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, 1036 * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} 1037 * and {@link #COMMENTS} 1038 * 1039 * @throws IllegalArgumentException 1040 * If bit values other than those corresponding to the defined 1041 * match flags are set in <tt>flags</tt> 1042 * 1043 * @throws PatternSyntaxException 1044 * If the expression's syntax is invalid 1045 */ 1046 public static Pattern compile(String regex, int flags) { 1047 return new Pattern(regex, flags); 1048 } 1049 1050 /** 1051 * Returns the regular expression from which this pattern was compiled. 1052 * </p> 1053 * 1054 * @return The source of this pattern 1055 */ 1056 public String pattern() { 1057 return pattern; 1058 } 1059 1060 /** 1061 * <p>Returns the string representation of this pattern. This 1062 * is the regular expression from which this pattern was 1063 * compiled.</p> 1064 * 1065 * @return The string representation of this pattern 1066 * @since 1.5 1067 */ 1068 public String toString() { 1069 return pattern; 1070 } 1071 1072 /** 1073 * Creates a matcher that will match the given input against this pattern. 1074 * </p> 1075 * 1076 * @param input 1077 * The character sequence to be matched 1078 * 1079 * @return A new matcher for this pattern 1080 */ 1081 public Matcher matcher(CharSequence input) { 1082 if (!compiled) { 1083 synchronized(this) { 1084 if (!compiled) 1085 compile(); 1086 } 1087 } 1088 Matcher m = new Matcher(this, input); 1089 return m; 1090 } 1091 1092 /** 1093 * Returns this pattern's match flags. </p> 1094 * 1095 * @return The match flags specified when this pattern was compiled 1096 */ 1097 public int flags() { 1098 return flags; 1099 } 1100 1101 /** 1102 * Compiles the given regular expression and attempts to match the given 1103 * input against it. 1104 * 1105 * <p> An invocation of this convenience method of the form 1106 * 1107 * <blockquote><pre> 1108 * Pattern.matches(regex, input);</pre></blockquote> 1109 * 1110 * behaves in exactly the same way as the expression 1111 * 1112 * <blockquote><pre> 1113 * Pattern.compile(regex).matcher(input).matches()</pre></blockquote> 1114 * 1115 * <p> If a pattern is to be used multiple times, compiling it once and reusing 1116 * it will be more efficient than invoking this method each time. </p> 1117 * 1118 * @param regex 1119 * The expression to be compiled 1120 * 1121 * @param input 1122 * The character sequence to be matched 1123 * 1124 * @throws PatternSyntaxException 1125 * If the expression's syntax is invalid 1126 */ 1127 public static boolean matches(String regex, CharSequence input) { 1128 Pattern p = Pattern.compile(regex); 1129 Matcher m = p.matcher(input); 1130 return m.matches(); 1131 } 1132 1133 /** 1134 * Splits the given input sequence around matches of this pattern. 1135 * 1136 * <p> The array returned by this method contains each substring of the 1137 * input sequence that is terminated by another subsequence that matches 1138 * this pattern or is terminated by the end of the input sequence. The 1139 * substrings in the array are in the order in which they occur in the 1140 * input. If this pattern does not match any subsequence of the input then 1141 * the resulting array has just one element, namely the input sequence in 1142 * string form. 1143 * 1144 * <p> The <tt>limit</tt> parameter controls the number of times the 1145 * pattern is applied and therefore affects the length of the resulting 1146 * array. If the limit <i>n</i> is greater than zero then the pattern 1147 * will be applied at most <i>n</i> - 1 times, the array's 1148 * length will be no greater than <i>n</i>, and the array's last entry 1149 * will contain all input beyond the last matched delimiter. If <i>n</i> 1150 * is non-positive then the pattern will be applied as many times as 1151 * possible and the array can have any length. If <i>n</i> is zero then 1152 * the pattern will be applied as many times as possible, the array can 1153 * have any length, and trailing empty strings will be discarded. 1154 * 1155 * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following 1156 * results with these parameters: 1157 * 1158 * <blockquote><table cellpadding=1 cellspacing=0 1159 * summary="Split examples showing regex, limit, and result"> 1160 * <tr><th><P align="left"><i>Regex </i></th> 1161 * <th><P align="left"><i>Limit </i></th> 1162 * <th><P align="left"><i>Result </i></th></tr> 1163 * <tr><td align=center>:</td> 1164 * <td align=center>2</td> 1165 * <td><tt>{ "boo", "and:foo" }</tt></td></tr> 1166 * <tr><td align=center>:</td> 1167 * <td align=center>5</td> 1168 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr> 1169 * <tr><td align=center>:</td> 1170 * <td align=center>-2</td> 1171 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr> 1172 * <tr><td align=center>o</td> 1173 * <td align=center>5</td> 1174 * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr> 1175 * <tr><td align=center>o</td> 1176 * <td align=center>-2</td> 1177 * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr> 1178 * <tr><td align=center>o</td> 1179 * <td align=center>0</td> 1180 * <td><tt>{ "b", "", ":and:f" }</tt></td></tr> 1181 * </table></blockquote> 1182 * 1183 * 1184 * @param input 1185 * The character sequence to be split 1186 * 1187 * @param limit 1188 * The result threshold, as described above 1189 * 1190 * @return The array of strings computed by splitting the input 1191 * around matches of this pattern 1192 */ 1193 public String[] split(CharSequence input, int limit) { 1194 int index = 0; 1195 boolean matchLimited = limit > 0; 1196 ArrayList<String> matchList = new ArrayList<>(); 1197 Matcher m = matcher(input); 1198 1199 // Add segments before each match found 1200 while(m.find()) { 1201 if (!matchLimited || matchList.size() < limit - 1) { 1202 String match = input.subSequence(index, m.start()).toString(); 1203 matchList.add(match); 1204 index = m.end(); 1205 } else if (matchList.size() == limit - 1) { // last one 1206 String match = input.subSequence(index, 1207 input.length()).toString(); 1208 matchList.add(match); 1209 index = m.end(); 1210 } 1211 } 1212 1213 // If no match was found, return this 1214 if (index == 0) 1215 return new String[] {input.toString()}; 1216 1217 // Add remaining segment 1218 if (!matchLimited || matchList.size() < limit) 1219 matchList.add(input.subSequence(index, input.length()).toString()); 1220 1221 // Construct result 1222 int resultSize = matchList.size(); 1223 if (limit == 0) 1224 while (resultSize > 0 && matchList.get(resultSize-1).equals("")) 1225 resultSize--; 1226 String[] result = new String[resultSize]; 1227 return matchList.subList(0, resultSize).toArray(result); 1228 } 1229 1230 /** 1231 * Splits the given input sequence around matches of this pattern. 1232 * 1233 * <p> This method works as if by invoking the two-argument {@link 1234 * #split(java.lang.CharSequence, int) split} method with the given input 1235 * sequence and a limit argument of zero. Trailing empty strings are 1236 * therefore not included in the resulting array. </p> 1237 * 1238 * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following 1239 * results with these expressions: 1240 * 1241 * <blockquote><table cellpadding=1 cellspacing=0 1242 * summary="Split examples showing regex and result"> 1243 * <tr><th><P align="left"><i>Regex </i></th> 1244 * <th><P align="left"><i>Result</i></th></tr> 1245 * <tr><td align=center>:</td> 1246 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr> 1247 * <tr><td align=center>o</td> 1248 * <td><tt>{ "b", "", ":and:f" }</tt></td></tr> 1249 * </table></blockquote> 1250 * 1251 * 1252 * @param input 1253 * The character sequence to be split 1254 * 1255 * @return The array of strings computed by splitting the input 1256 * around matches of this pattern 1257 */ 1258 public String[] split(CharSequence input) { 1259 return split(input, 0); 1260 } 1261 1262 /** 1263 * Returns a literal pattern <code>String</code> for the specified 1264 * <code>String</code>. 1265 * 1266 * <p>This method produces a <code>String</code> that can be used to 1267 * create a <code>Pattern</code> that would match the string 1268 * <code>s</code> as if it were a literal pattern.</p> Metacharacters 1269 * or escape sequences in the input sequence will be given no special 1270 * meaning. 1271 * 1272 * @param s The string to be literalized 1273 * @return A literal string replacement 1274 * @since 1.5 1275 */ 1276 public static String quote(String s) { 1277 int slashEIndex = s.indexOf("\\E"); 1278 if (slashEIndex == -1) 1279 return "\\Q" + s + "\\E"; 1280 1281 StringBuilder sb = new StringBuilder(s.length() * 2); 1282 sb.append("\\Q"); 1283 slashEIndex = 0; 1284 int current = 0; 1285 while ((slashEIndex = s.indexOf("\\E", current)) != -1) { 1286 sb.append(s.substring(current, slashEIndex)); 1287 current = slashEIndex + 2; 1288 sb.append("\\E\\\\E\\Q"); 1289 } 1290 sb.append(s.substring(current, s.length())); 1291 sb.append("\\E"); 1292 return sb.toString(); 1293 } 1294 1295 /** 1296 * Recompile the Pattern instance from a stream. The original pattern 1297 * string is read in and the object tree is recompiled from it. 1298 */ 1299 private void readObject(java.io.ObjectInputStream s) 1300 throws java.io.IOException, ClassNotFoundException { 1301 1302 // Read in all fields 1303 s.defaultReadObject(); 1304 1305 // Initialize counts 1306 capturingGroupCount = 1; 1307 localCount = 0; 1308 1309 // if length > 0, the Pattern is lazily compiled 1310 compiled = false; 1311 if (pattern.length() == 0) { 1312 root = new Start(lastAccept); 1313 matchRoot = lastAccept; 1314 compiled = true; 1315 } 1316 } 1317 1318 /** 1319 * This private constructor is used to create all Patterns. The pattern 1320 * string and match flags are all that is needed to completely describe 1321 * a Pattern. An empty pattern string results in an object tree with 1322 * only a Start node and a LastNode node. 1323 */ 1324 private Pattern(String p, int f) { 1325 pattern = p; 1326 flags = f; 1327 1328 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present 1329 if ((flags & UNICODE_CHARACTER_CLASS) != 0) 1330 flags |= UNICODE_CASE; 1331 1332 // Reset group index count 1333 capturingGroupCount = 1; 1334 localCount = 0; 1335 1336 if (pattern.length() > 0) { 1337 compile(); 1338 } else { 1339 root = new Start(lastAccept); 1340 matchRoot = lastAccept; 1341 } 1342 } 1343 1344 /** 1345 * The pattern is converted to normalizedD form and then a pure group 1346 * is constructed to match canonical equivalences of the characters. 1347 */ 1348 private void normalize() { 1349 boolean inCharClass = false; 1350 int lastCodePoint = -1; 1351 1352 // Convert pattern into normalizedD form 1353 normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD); 1354 patternLength = normalizedPattern.length(); 1355 1356 // Modify pattern to match canonical equivalences 1357 StringBuilder newPattern = new StringBuilder(patternLength); 1358 for(int i=0; i<patternLength; ) { 1359 int c = normalizedPattern.codePointAt(i); 1360 StringBuilder sequenceBuffer; 1361 if ((Character.getType(c) == Character.NON_SPACING_MARK) 1362 && (lastCodePoint != -1)) { 1363 sequenceBuffer = new StringBuilder(); 1364 sequenceBuffer.appendCodePoint(lastCodePoint); 1365 sequenceBuffer.appendCodePoint(c); 1366 while(Character.getType(c) == Character.NON_SPACING_MARK) { 1367 i += Character.charCount(c); 1368 if (i >= patternLength) 1369 break; 1370 c = normalizedPattern.codePointAt(i); 1371 sequenceBuffer.appendCodePoint(c); 1372 } 1373 String ea = produceEquivalentAlternation( 1374 sequenceBuffer.toString()); 1375 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint)); 1376 newPattern.append("(?:").append(ea).append(")"); 1377 } else if (c == '[' && lastCodePoint != '\\') { 1378 i = normalizeCharClass(newPattern, i); 1379 } else { 1380 newPattern.appendCodePoint(c); 1381 } 1382 lastCodePoint = c; 1383 i += Character.charCount(c); 1384 } 1385 normalizedPattern = newPattern.toString(); 1386 } 1387 1388 /** 1389 * Complete the character class being parsed and add a set 1390 * of alternations to it that will match the canonical equivalences 1391 * of the characters within the class. 1392 */ 1393 private int normalizeCharClass(StringBuilder newPattern, int i) { 1394 StringBuilder charClass = new StringBuilder(); 1395 StringBuilder eq = null; 1396 int lastCodePoint = -1; 1397 String result; 1398 1399 i++; 1400 charClass.append("["); 1401 while(true) { 1402 int c = normalizedPattern.codePointAt(i); 1403 StringBuilder sequenceBuffer; 1404 1405 if (c == ']' && lastCodePoint != '\\') { 1406 charClass.append((char)c); 1407 break; 1408 } else if (Character.getType(c) == Character.NON_SPACING_MARK) { 1409 sequenceBuffer = new StringBuilder(); 1410 sequenceBuffer.appendCodePoint(lastCodePoint); 1411 while(Character.getType(c) == Character.NON_SPACING_MARK) { 1412 sequenceBuffer.appendCodePoint(c); 1413 i += Character.charCount(c); 1414 if (i >= normalizedPattern.length()) 1415 break; 1416 c = normalizedPattern.codePointAt(i); 1417 } 1418 String ea = produceEquivalentAlternation( 1419 sequenceBuffer.toString()); 1420 1421 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint)); 1422 if (eq == null) 1423 eq = new StringBuilder(); 1424 eq.append('|'); 1425 eq.append(ea); 1426 } else { 1427 charClass.appendCodePoint(c); 1428 i++; 1429 } 1430 if (i == normalizedPattern.length()) 1431 throw error("Unclosed character class"); 1432 lastCodePoint = c; 1433 } 1434 1435 if (eq != null) { 1436 result = "(?:"+charClass.toString()+eq.toString()+")"; 1437 } else { 1438 result = charClass.toString(); 1439 } 1440 1441 newPattern.append(result); 1442 return i; 1443 } 1444 1445 /** 1446 * Given a specific sequence composed of a regular character and 1447 * combining marks that follow it, produce the alternation that will 1448 * match all canonical equivalences of that sequence. 1449 */ 1450 private String produceEquivalentAlternation(String source) { 1451 int len = countChars(source, 0, 1); 1452 if (source.length() == len) 1453 // source has one character. 1454 return source; 1455 1456 String base = source.substring(0,len); 1457 String combiningMarks = source.substring(len); 1458 1459 String[] perms = producePermutations(combiningMarks); 1460 StringBuilder result = new StringBuilder(source); 1461 1462 // Add combined permutations 1463 for(int x=0; x<perms.length; x++) { 1464 String next = base + perms[x]; 1465 if (x>0) 1466 result.append("|"+next); 1467 next = composeOneStep(next); 1468 if (next != null) 1469 result.append("|"+produceEquivalentAlternation(next)); 1470 } 1471 return result.toString(); 1472 } 1473 1474 /** 1475 * Returns an array of strings that have all the possible 1476 * permutations of the characters in the input string. 1477 * This is used to get a list of all possible orderings 1478 * of a set of combining marks. Note that some of the permutations 1479 * are invalid because of combining class collisions, and these 1480 * possibilities must be removed because they are not canonically 1481 * equivalent. 1482 */ 1483 private String[] producePermutations(String input) { 1484 if (input.length() == countChars(input, 0, 1)) 1485 return new String[] {input}; 1486 1487 if (input.length() == countChars(input, 0, 2)) { 1488 int c0 = Character.codePointAt(input, 0); 1489 int c1 = Character.codePointAt(input, Character.charCount(c0)); 1490 if (getClass(c1) == getClass(c0)) { 1491 return new String[] {input}; 1492 } 1493 String[] result = new String[2]; 1494 result[0] = input; 1495 StringBuilder sb = new StringBuilder(2); 1496 sb.appendCodePoint(c1); 1497 sb.appendCodePoint(c0); 1498 result[1] = sb.toString(); 1499 return result; 1500 } 1501 1502 int length = 1; 1503 int nCodePoints = countCodePoints(input); 1504 for(int x=1; x<nCodePoints; x++) 1505 length = length * (x+1); 1506 1507 String[] temp = new String[length]; 1508 1509 int combClass[] = new int[nCodePoints]; 1510 for(int x=0, i=0; x<nCodePoints; x++) { 1511 int c = Character.codePointAt(input, i); 1512 combClass[x] = getClass(c); 1513 i += Character.charCount(c); 1514 } 1515 1516 // For each char, take it out and add the permutations 1517 // of the remaining chars 1518 int index = 0; 1519 int len; 1520 // offset maintains the index in code units. 1521 loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { 1522 len = countChars(input, offset, 1); 1523 boolean skip = false; 1524 for(int y=x-1; y>=0; y--) { 1525 if (combClass[y] == combClass[x]) { 1526 continue loop; 1527 } 1528 } 1529 StringBuilder sb = new StringBuilder(input); 1530 String otherChars = sb.delete(offset, offset+len).toString(); 1531 String[] subResult = producePermutations(otherChars); 1532 1533 String prefix = input.substring(offset, offset+len); 1534 for(int y=0; y<subResult.length; y++) 1535 temp[index++] = prefix + subResult[y]; 1536 } 1537 String[] result = new String[index]; 1538 for (int x=0; x<index; x++) 1539 result[x] = temp[x]; 1540 return result; 1541 } 1542 1543 private int getClass(int c) { 1544 return sun.text.Normalizer.getCombiningClass(c); 1545 } 1546 1547 /** 1548 * Attempts to compose input by combining the first character 1549 * with the first combining mark following it. Returns a String 1550 * that is the composition of the leading character with its first 1551 * combining mark followed by the remaining combining marks. Returns 1552 * null if the first two characters cannot be further composed. 1553 */ 1554 private String composeOneStep(String input) { 1555 int len = countChars(input, 0, 2); 1556 String firstTwoCharacters = input.substring(0, len); 1557 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); 1558 1559 if (result.equals(firstTwoCharacters)) 1560 return null; 1561 else { 1562 String remainder = input.substring(len); 1563 return result + remainder; 1564 } 1565 } 1566 1567 /** 1568 * Preprocess any \Q...\E sequences in `temp', meta-quoting them. 1569 * See the description of `quotemeta' in perlfunc(1). 1570 */ 1571 private void RemoveQEQuoting() { 1572 final int pLen = patternLength; 1573 int i = 0; 1574 while (i < pLen-1) { 1575 if (temp[i] != '\\') 1576 i += 1; 1577 else if (temp[i + 1] != 'Q') 1578 i += 2; 1579 else 1580 break; 1581 } 1582 if (i >= pLen - 1) // No \Q sequence found 1583 return; 1584 int j = i; 1585 i += 2; 1586 int[] newtemp = new int[j + 2*(pLen-i) + 2]; 1587 System.arraycopy(temp, 0, newtemp, 0, j); 1588 1589 boolean inQuote = true; 1590 while (i < pLen) { 1591 int c = temp[i++]; 1592 if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) { 1593 newtemp[j++] = c; 1594 } else if (c != '\\') { 1595 if (inQuote) newtemp[j++] = '\\'; 1596 newtemp[j++] = c; 1597 } else if (inQuote) { 1598 if (temp[i] == 'E') { 1599 i++; 1600 inQuote = false; 1601 } else { 1602 newtemp[j++] = '\\'; 1603 newtemp[j++] = '\\'; 1604 } 1605 } else { 1606 if (temp[i] == 'Q') { 1607 i++; 1608 inQuote = true; 1609 } else { 1610 newtemp[j++] = c; 1611 if (i != pLen) 1612 newtemp[j++] = temp[i++]; 1613 } 1614 } 1615 } 1616 1617 patternLength = j; 1618 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination 1619 } 1620 1621 /** 1622 * Copies regular expression to an int array and invokes the parsing 1623 * of the expression which will create the object tree. 1624 */ 1625 private void compile() { 1626 // Handle canonical equivalences 1627 if (has(CANON_EQ) && !has(LITERAL)) { 1628 normalize(); 1629 } else { 1630 normalizedPattern = pattern; 1631 } 1632 patternLength = normalizedPattern.length(); 1633 1634 // Copy pattern to int array for convenience 1635 // Use double zero to terminate pattern 1636 temp = new int[patternLength + 2]; 1637 1638 hasSupplementary = false; 1639 int c, count = 0; 1640 // Convert all chars into code points 1641 for (int x = 0; x < patternLength; x += Character.charCount(c)) { 1642 c = normalizedPattern.codePointAt(x); 1643 if (isSupplementary(c)) { 1644 hasSupplementary = true; 1645 } 1646 temp[count++] = c; 1647 } 1648 1649 patternLength = count; // patternLength now in code points 1650 1651 if (! has(LITERAL)) 1652 RemoveQEQuoting(); 1653 1654 // Allocate all temporary objects here. 1655 buffer = new int[32]; 1656 groupNodes = new GroupHead[10]; 1657 namedGroups = null; 1658 1659 if (has(LITERAL)) { 1660 // Literal pattern handling 1661 matchRoot = newSlice(temp, patternLength, hasSupplementary); 1662 matchRoot.next = lastAccept; 1663 } else { 1664 // Start recursive descent parsing 1665 matchRoot = expr(lastAccept); 1666 // Check extra pattern characters 1667 if (patternLength != cursor) { 1668 if (peek() == ')') { 1669 throw error("Unmatched closing ')'"); 1670 } else { 1671 throw error("Unexpected internal error"); 1672 } 1673 } 1674 } 1675 1676 // Peephole optimization 1677 if (matchRoot instanceof Slice) { 1678 root = BnM.optimize(matchRoot); 1679 if (root == matchRoot) { 1680 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1681 } 1682 } else if (matchRoot instanceof Begin || matchRoot instanceof First) { 1683 root = matchRoot; 1684 } else { 1685 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1686 } 1687 1688 // Release temporary storage 1689 temp = null; 1690 buffer = null; 1691 groupNodes = null; 1692 patternLength = 0; 1693 compiled = true; 1694 } 1695 1696 Map<String, Integer> namedGroups() { 1697 if (namedGroups == null) 1698 namedGroups = new HashMap<>(2); 1699 return namedGroups; 1700 } 1701 1702 /** 1703 * Used to print out a subtree of the Pattern to help with debugging. 1704 */ 1705 private static void printObjectTree(Node node) { 1706 while(node != null) { 1707 if (node instanceof Prolog) { 1708 System.out.println(node); 1709 printObjectTree(((Prolog)node).loop); 1710 System.out.println("**** end contents prolog loop"); 1711 } else if (node instanceof Loop) { 1712 System.out.println(node); 1713 printObjectTree(((Loop)node).body); 1714 System.out.println("**** end contents Loop body"); 1715 } else if (node instanceof Curly) { 1716 System.out.println(node); 1717 printObjectTree(((Curly)node).atom); 1718 System.out.println("**** end contents Curly body"); 1719 } else if (node instanceof GroupCurly) { 1720 System.out.println(node); 1721 printObjectTree(((GroupCurly)node).atom); 1722 System.out.println("**** end contents GroupCurly body"); 1723 } else if (node instanceof GroupTail) { 1724 System.out.println(node); 1725 System.out.println("Tail next is "+node.next); 1726 return; 1727 } else { 1728 System.out.println(node); 1729 } 1730 node = node.next; 1731 if (node != null) 1732 System.out.println("->next:"); 1733 if (node == Pattern.accept) { 1734 System.out.println("Accept Node"); 1735 node = null; 1736 } 1737 } 1738 } 1739 1740 /** 1741 * Used to accumulate information about a subtree of the object graph 1742 * so that optimizations can be applied to the subtree. 1743 */ 1744 static final class TreeInfo { 1745 int minLength; 1746 int maxLength; 1747 boolean maxValid; 1748 boolean deterministic; 1749 1750 TreeInfo() { 1751 reset(); 1752 } 1753 void reset() { 1754 minLength = 0; 1755 maxLength = 0; 1756 maxValid = true; 1757 deterministic = true; 1758 } 1759 } 1760 1761 /* 1762 * The following private methods are mainly used to improve the 1763 * readability of the code. In order to let the Java compiler easily 1764 * inline them, we should not put many assertions or error checks in them. 1765 */ 1766 1767 /** 1768 * Indicates whether a particular flag is set or not. 1769 */ 1770 private boolean has(int f) { 1771 return (flags & f) != 0; 1772 } 1773 1774 /** 1775 * Match next character, signal error if failed. 1776 */ 1777 private void accept(int ch, String s) { 1778 int testChar = temp[cursor++]; 1779 if (has(COMMENTS)) 1780 testChar = parsePastWhitespace(testChar); 1781 if (ch != testChar) { 1782 throw error(s); 1783 } 1784 } 1785 1786 /** 1787 * Mark the end of pattern with a specific character. 1788 */ 1789 private void mark(int c) { 1790 temp[patternLength] = c; 1791 } 1792 1793 /** 1794 * Peek the next character, and do not advance the cursor. 1795 */ 1796 private int peek() { 1797 int ch = temp[cursor]; 1798 if (has(COMMENTS)) 1799 ch = peekPastWhitespace(ch); 1800 return ch; 1801 } 1802 1803 /** 1804 * Read the next character, and advance the cursor by one. 1805 */ 1806 private int read() { 1807 int ch = temp[cursor++]; 1808 if (has(COMMENTS)) 1809 ch = parsePastWhitespace(ch); 1810 return ch; 1811 } 1812 1813 /** 1814 * Read the next character, and advance the cursor by one, 1815 * ignoring the COMMENTS setting 1816 */ 1817 private int readEscaped() { 1818 int ch = temp[cursor++]; 1819 return ch; 1820 } 1821 1822 /** 1823 * Advance the cursor by one, and peek the next character. 1824 */ 1825 private int next() { 1826 int ch = temp[++cursor]; 1827 if (has(COMMENTS)) 1828 ch = peekPastWhitespace(ch); 1829 return ch; 1830 } 1831 1832 /** 1833 * Advance the cursor by one, and peek the next character, 1834 * ignoring the COMMENTS setting 1835 */ 1836 private int nextEscaped() { 1837 int ch = temp[++cursor]; 1838 return ch; 1839 } 1840 1841 /** 1842 * If in xmode peek past whitespace and comments. 1843 */ 1844 private int peekPastWhitespace(int ch) { 1845 while (ASCII.isSpace(ch) || ch == '#') { 1846 while (ASCII.isSpace(ch)) 1847 ch = temp[++cursor]; 1848 if (ch == '#') { 1849 ch = peekPastLine(); 1850 } 1851 } 1852 return ch; 1853 } 1854 1855 /** 1856 * If in xmode parse past whitespace and comments. 1857 */ 1858 private int parsePastWhitespace(int ch) { 1859 while (ASCII.isSpace(ch) || ch == '#') { 1860 while (ASCII.isSpace(ch)) 1861 ch = temp[cursor++]; 1862 if (ch == '#') 1863 ch = parsePastLine(); 1864 } 1865 return ch; 1866 } 1867 1868 /** 1869 * xmode parse past comment to end of line. 1870 */ 1871 private int parsePastLine() { 1872 int ch = temp[cursor++]; 1873 while (ch != 0 && !isLineSeparator(ch)) 1874 ch = temp[cursor++]; 1875 return ch; 1876 } 1877 1878 /** 1879 * xmode peek past comment to end of line. 1880 */ 1881 private int peekPastLine() { 1882 int ch = temp[++cursor]; 1883 while (ch != 0 && !isLineSeparator(ch)) 1884 ch = temp[++cursor]; 1885 return ch; 1886 } 1887 1888 /** 1889 * Determines if character is a line separator in the current mode 1890 */ 1891 private boolean isLineSeparator(int ch) { 1892 if (has(UNIX_LINES)) { 1893 return ch == '\n'; 1894 } else { 1895 return (ch == '\n' || 1896 ch == '\r' || 1897 (ch|1) == '\u2029' || 1898 ch == '\u0085'); 1899 } 1900 } 1901 1902 /** 1903 * Read the character after the next one, and advance the cursor by two. 1904 */ 1905 private int skip() { 1906 int i = cursor; 1907 int ch = temp[i+1]; 1908 cursor = i + 2; 1909 return ch; 1910 } 1911 1912 /** 1913 * Unread one next character, and retreat cursor by one. 1914 */ 1915 private void unread() { 1916 cursor--; 1917 } 1918 1919 /** 1920 * Internal method used for handling all syntax errors. The pattern is 1921 * displayed with a pointer to aid in locating the syntax error. 1922 */ 1923 private PatternSyntaxException error(String s) { 1924 return new PatternSyntaxException(s, normalizedPattern, cursor - 1); 1925 } 1926 1927 /** 1928 * Determines if there is any supplementary character or unpaired 1929 * surrogate in the specified range. 1930 */ 1931 private boolean findSupplementary(int start, int end) { 1932 for (int i = start; i < end; i++) { 1933 if (isSupplementary(temp[i])) 1934 return true; 1935 } 1936 return false; 1937 } 1938 1939 /** 1940 * Determines if the specified code point is a supplementary 1941 * character or unpaired surrogate. 1942 */ 1943 private static final boolean isSupplementary(int ch) { 1944 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || 1945 Character.isSurrogate((char)ch); 1946 } 1947 1948 /** 1949 * The following methods handle the main parsing. They are sorted 1950 * according to their precedence order, the lowest one first. 1951 */ 1952 1953 /** 1954 * The expression is parsed with branch nodes added for alternations. 1955 * This may be called recursively to parse sub expressions that may 1956 * contain alternations. 1957 */ 1958 private Node expr(Node end) { 1959 Node prev = null; 1960 Node firstTail = null; 1961 Node branchConn = null; 1962 1963 for (;;) { 1964 Node node = sequence(end); 1965 Node nodeTail = root; //double return 1966 if (prev == null) { 1967 prev = node; 1968 firstTail = nodeTail; 1969 } else { 1970 // Branch 1971 if (branchConn == null) { 1972 branchConn = new BranchConn(); 1973 branchConn.next = end; 1974 } 1975 if (node == end) { 1976 // if the node returned from sequence() is "end" 1977 // we have an empty expr, set a null atom into 1978 // the branch to indicate to go "next" directly. 1979 node = null; 1980 } else { 1981 // the "tail.next" of each atom goes to branchConn 1982 nodeTail.next = branchConn; 1983 } 1984 if (prev instanceof Branch) { 1985 ((Branch)prev).add(node); 1986 } else { 1987 if (prev == end) { 1988 prev = null; 1989 } else { 1990 // replace the "end" with "branchConn" at its tail.next 1991 // when put the "prev" into the branch as the first atom. 1992 firstTail.next = branchConn; 1993 } 1994 prev = new Branch(prev, node, branchConn); 1995 } 1996 } 1997 if (peek() != '|') { 1998 return prev; 1999 } 2000 next(); 2001 } 2002 } 2003 2004 /** 2005 * Parsing of sequences between alternations. 2006 */ 2007 private Node sequence(Node end) { 2008 Node head = null; 2009 Node tail = null; 2010 Node node = null; 2011 LOOP: 2012 for (;;) { 2013 int ch = peek(); 2014 switch (ch) { 2015 case '(': 2016 // Because group handles its own closure, 2017 // we need to treat it differently 2018 node = group0(); 2019 // Check for comment or flag group 2020 if (node == null) 2021 continue; 2022 if (head == null) 2023 head = node; 2024 else 2025 tail.next = node; 2026 // Double return: Tail was returned in root 2027 tail = root; 2028 continue; 2029 case '[': 2030 node = clazz(true); 2031 break; 2032 case '\\': 2033 ch = nextEscaped(); 2034 if (ch == 'p' || ch == 'P') { 2035 boolean oneLetter = true; 2036 boolean comp = (ch == 'P'); 2037 ch = next(); // Consume { if present 2038 if (ch != '{') { 2039 unread(); 2040 } else { 2041 oneLetter = false; 2042 } 2043 node = family(oneLetter, comp); 2044 } else { 2045 unread(); 2046 node = atom(); 2047 } 2048 break; 2049 case '^': 2050 next(); 2051 if (has(MULTILINE)) { 2052 if (has(UNIX_LINES)) 2053 node = new UnixCaret(); 2054 else 2055 node = new Caret(); 2056 } else { 2057 node = new Begin(); 2058 } 2059 break; 2060 case '$': 2061 next(); 2062 if (has(UNIX_LINES)) 2063 node = new UnixDollar(has(MULTILINE)); 2064 else 2065 node = new Dollar(has(MULTILINE)); 2066 break; 2067 case '.': 2068 next(); 2069 if (has(DOTALL)) { 2070 node = new All(); 2071 } else { 2072 if (has(UNIX_LINES)) 2073 node = new UnixDot(); 2074 else { 2075 node = new Dot(); 2076 } 2077 } 2078 break; 2079 case '|': 2080 case ')': 2081 break LOOP; 2082 case ']': // Now interpreting dangling ] and } as literals 2083 case '}': 2084 node = atom(); 2085 break; 2086 case '?': 2087 case '*': 2088 case '+': 2089 next(); 2090 throw error("Dangling meta character '" + ((char)ch) + "'"); 2091 case 0: 2092 if (cursor >= patternLength) { 2093 break LOOP; 2094 } 2095 // Fall through 2096 default: 2097 node = atom(); 2098 break; 2099 } 2100 2101 node = closure(node); 2102 2103 if (head == null) { 2104 head = tail = node; 2105 } else { 2106 tail.next = node; 2107 tail = node; 2108 } 2109 } 2110 if (head == null) { 2111 return end; 2112 } 2113 tail.next = end; 2114 root = tail; //double return 2115 return head; 2116 } 2117 2118 /** 2119 * Parse and add a new Single or Slice. 2120 */ 2121 private Node atom() { 2122 int first = 0; 2123 int prev = -1; 2124 boolean hasSupplementary = false; 2125 int ch = peek(); 2126 for (;;) { 2127 switch (ch) { 2128 case '*': 2129 case '+': 2130 case '?': 2131 case '{': 2132 if (first > 1) { 2133 cursor = prev; // Unwind one character 2134 first--; 2135 } 2136 break; 2137 case '$': 2138 case '.': 2139 case '^': 2140 case '(': 2141 case '[': 2142 case '|': 2143 case ')': 2144 break; 2145 case '\\': 2146 ch = nextEscaped(); 2147 if (ch == 'p' || ch == 'P') { // Property 2148 if (first > 0) { // Slice is waiting; handle it first 2149 unread(); 2150 break; 2151 } else { // No slice; just return the family node 2152 boolean comp = (ch == 'P'); 2153 boolean oneLetter = true; 2154 ch = next(); // Consume { if present 2155 if (ch != '{') 2156 unread(); 2157 else 2158 oneLetter = false; 2159 return family(oneLetter, comp); 2160 } 2161 } 2162 unread(); 2163 prev = cursor; 2164 ch = escape(false, first == 0); 2165 if (ch >= 0) { 2166 append(ch, first); 2167 first++; 2168 if (isSupplementary(ch)) { 2169 hasSupplementary = true; 2170 } 2171 ch = peek(); 2172 continue; 2173 } else if (first == 0) { 2174 return root; 2175 } 2176 // Unwind meta escape sequence 2177 cursor = prev; 2178 break; 2179 case 0: 2180 if (cursor >= patternLength) { 2181 break; 2182 } 2183 // Fall through 2184 default: 2185 prev = cursor; 2186 append(ch, first); 2187 first++; 2188 if (isSupplementary(ch)) { 2189 hasSupplementary = true; 2190 } 2191 ch = next(); 2192 continue; 2193 } 2194 break; 2195 } 2196 if (first == 1) { 2197 return newSingle(buffer[0]); 2198 } else { 2199 return newSlice(buffer, first, hasSupplementary); 2200 } 2201 } 2202 2203 private void append(int ch, int len) { 2204 if (len >= buffer.length) { 2205 int[] tmp = new int[len+len]; 2206 System.arraycopy(buffer, 0, tmp, 0, len); 2207 buffer = tmp; 2208 } 2209 buffer[len] = ch; 2210 } 2211 2212 /** 2213 * Parses a backref greedily, taking as many numbers as it 2214 * can. The first digit is always treated as a backref, but 2215 * multi digit numbers are only treated as a backref if at 2216 * least that many backrefs exist at this point in the regex. 2217 */ 2218 private Node ref(int refNum) { 2219 boolean done = false; 2220 while(!done) { 2221 int ch = peek(); 2222 switch(ch) { 2223 case '0': 2224 case '1': 2225 case '2': 2226 case '3': 2227 case '4': 2228 case '5': 2229 case '6': 2230 case '7': 2231 case '8': 2232 case '9': 2233 int newRefNum = (refNum * 10) + (ch - '0'); 2234 // Add another number if it doesn't make a group 2235 // that doesn't exist 2236 if (capturingGroupCount - 1 < newRefNum) { 2237 done = true; 2238 break; 2239 } 2240 refNum = newRefNum; 2241 read(); 2242 break; 2243 default: 2244 done = true; 2245 break; 2246 } 2247 } 2248 if (has(CASE_INSENSITIVE)) 2249 return new CIBackRef(refNum, has(UNICODE_CASE)); 2250 else 2251 return new BackRef(refNum); 2252 } 2253 2254 /** 2255 * Parses an escape sequence to determine the actual value that needs 2256 * to be matched. 2257 * If -1 is returned and create was true a new object was added to the tree 2258 * to handle the escape sequence. 2259 * If the returned value is greater than zero, it is the value that 2260 * matches the escape sequence. 2261 */ 2262 private int escape(boolean inclass, boolean create) { 2263 int ch = skip(); 2264 switch (ch) { 2265 case '0': 2266 return o(); 2267 case '1': 2268 case '2': 2269 case '3': 2270 case '4': 2271 case '5': 2272 case '6': 2273 case '7': 2274 case '8': 2275 case '9': 2276 if (inclass) break; 2277 if (create) { 2278 root = ref((ch - '0')); 2279 } 2280 return -1; 2281 case 'A': 2282 if (inclass) break; 2283 if (create) root = new Begin(); 2284 return -1; 2285 case 'B': 2286 if (inclass) break; 2287 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); 2288 return -1; 2289 case 'C': 2290 break; 2291 case 'D': 2292 if (create) root = has(UNICODE_CHARACTER_CLASS) 2293 ? new Utype(UnicodeProp.DIGIT).complement() 2294 : new Ctype(ASCII.DIGIT).complement(); 2295 return -1; 2296 case 'E': 2297 case 'F': 2298 break; 2299 case 'G': 2300 if (inclass) break; 2301 if (create) root = new LastMatch(); 2302 return -1; 2303 case 'H': 2304 case 'I': 2305 case 'J': 2306 case 'K': 2307 case 'L': 2308 case 'M': 2309 case 'N': 2310 case 'O': 2311 case 'P': 2312 case 'Q': 2313 case 'R': 2314 break; 2315 case 'S': 2316 if (create) root = has(UNICODE_CHARACTER_CLASS) 2317 ? new Utype(UnicodeProp.WHITE_SPACE).complement() 2318 : new Ctype(ASCII.SPACE).complement(); 2319 return -1; 2320 case 'T': 2321 case 'U': 2322 case 'V': 2323 break; 2324 case 'W': 2325 if (create) root = has(UNICODE_CHARACTER_CLASS) 2326 ? new Utype(UnicodeProp.WORD).complement() 2327 : new Ctype(ASCII.WORD).complement(); 2328 return -1; 2329 case 'X': 2330 case 'Y': 2331 break; 2332 case 'Z': 2333 if (inclass) break; 2334 if (create) { 2335 if (has(UNIX_LINES)) 2336 root = new UnixDollar(false); 2337 else 2338 root = new Dollar(false); 2339 } 2340 return -1; 2341 case 'a': 2342 return '\007'; 2343 case 'b': 2344 if (inclass) break; 2345 if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); 2346 return -1; 2347 case 'c': 2348 return c(); 2349 case 'd': 2350 if (create) root = has(UNICODE_CHARACTER_CLASS) 2351 ? new Utype(UnicodeProp.DIGIT) 2352 : new Ctype(ASCII.DIGIT); 2353 return -1; 2354 case 'e': 2355 return '\033'; 2356 case 'f': 2357 return '\f'; 2358 case 'g': 2359 case 'h': 2360 case 'i': 2361 case 'j': 2362 break; 2363 case 'k': 2364 if (inclass) 2365 break; 2366 if (read() != '<') 2367 throw error("\\k is not followed by '<' for named capturing group"); 2368 String name = groupname(read()); 2369 if (!namedGroups().containsKey(name)) 2370 throw error("(named capturing group <"+ name+"> does not exit"); 2371 if (create) { 2372 if (has(CASE_INSENSITIVE)) 2373 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); 2374 else 2375 root = new BackRef(namedGroups().get(name)); 2376 } 2377 return -1; 2378 case 'l': 2379 case 'm': 2380 break; 2381 case 'n': 2382 return '\n'; 2383 case 'o': 2384 case 'p': 2385 case 'q': 2386 break; 2387 case 'r': 2388 return '\r'; 2389 case 's': 2390 if (create) root = has(UNICODE_CHARACTER_CLASS) 2391 ? new Utype(UnicodeProp.WHITE_SPACE) 2392 : new Ctype(ASCII.SPACE); 2393 return -1; 2394 case 't': 2395 return '\t'; 2396 case 'u': 2397 return u(); 2398 case 'v': 2399 return '\013'; 2400 case 'w': 2401 if (create) root = has(UNICODE_CHARACTER_CLASS) 2402 ? new Utype(UnicodeProp.WORD) 2403 : new Ctype(ASCII.WORD); 2404 return -1; 2405 case 'x': 2406 return x(); 2407 case 'y': 2408 break; 2409 case 'z': 2410 if (inclass) break; 2411 if (create) root = new End(); 2412 return -1; 2413 default: 2414 return ch; 2415 } 2416 throw error("Illegal/unsupported escape sequence"); 2417 } 2418 2419 /** 2420 * Parse a character class, and return the node that matches it. 2421 * 2422 * Consumes a ] on the way out if consume is true. Usually consume 2423 * is true except for the case of [abc&&def] where def is a separate 2424 * right hand node with "understood" brackets. 2425 */ 2426 private CharProperty clazz(boolean consume) { 2427 CharProperty prev = null; 2428 CharProperty node = null; 2429 BitClass bits = new BitClass(); 2430 boolean include = true; 2431 boolean firstInClass = true; 2432 int ch = next(); 2433 for (;;) { 2434 switch (ch) { 2435 case '^': 2436 // Negates if first char in a class, otherwise literal 2437 if (firstInClass) { 2438 if (temp[cursor-1] != '[') 2439 break; 2440 ch = next(); 2441 include = !include; 2442 continue; 2443 } else { 2444 // ^ not first in class, treat as literal 2445 break; 2446 } 2447 case '[': 2448 firstInClass = false; 2449 node = clazz(true); 2450 if (prev == null) 2451 prev = node; 2452 else 2453 prev = union(prev, node); 2454 ch = peek(); 2455 continue; 2456 case '&': 2457 firstInClass = false; 2458 ch = next(); 2459 if (ch == '&') { 2460 ch = next(); 2461 CharProperty rightNode = null; 2462 while (ch != ']' && ch != '&') { 2463 if (ch == '[') { 2464 if (rightNode == null) 2465 rightNode = clazz(true); 2466 else 2467 rightNode = union(rightNode, clazz(true)); 2468 } else { // abc&&def 2469 unread(); 2470 rightNode = clazz(false); 2471 } 2472 ch = peek(); 2473 } 2474 if (rightNode != null) 2475 node = rightNode; 2476 if (prev == null) { 2477 if (rightNode == null) 2478 throw error("Bad class syntax"); 2479 else 2480 prev = rightNode; 2481 } else { 2482 prev = intersection(prev, node); 2483 } 2484 } else { 2485 // treat as a literal & 2486 unread(); 2487 break; 2488 } 2489 continue; 2490 case 0: 2491 firstInClass = false; 2492 if (cursor >= patternLength) 2493 throw error("Unclosed character class"); 2494 break; 2495 case ']': 2496 firstInClass = false; 2497 if (prev != null) { 2498 if (consume) 2499 next(); 2500 return prev; 2501 } 2502 break; 2503 default: 2504 firstInClass = false; 2505 break; 2506 } 2507 node = range(bits); 2508 if (include) { 2509 if (prev == null) { 2510 prev = node; 2511 } else { 2512 if (prev != node) 2513 prev = union(prev, node); 2514 } 2515 } else { 2516 if (prev == null) { 2517 prev = node.complement(); 2518 } else { 2519 if (prev != node) 2520 prev = setDifference(prev, node); 2521 } 2522 } 2523 ch = peek(); 2524 } 2525 } 2526 2527 private CharProperty bitsOrSingle(BitClass bits, int ch) { 2528 /* Bits can only handle codepoints in [u+0000-u+00ff] range. 2529 Use "single" node instead of bits when dealing with unicode 2530 case folding for codepoints listed below. 2531 (1)Uppercase out of range: u+00ff, u+00b5 2532 toUpperCase(u+00ff) -> u+0178 2533 toUpperCase(u+00b5) -> u+039c 2534 (2)LatinSmallLetterLongS u+17f 2535 toUpperCase(u+017f) -> u+0053 2536 (3)LatinSmallLetterDotlessI u+131 2537 toUpperCase(u+0131) -> u+0049 2538 (4)LatinCapitalLetterIWithDotAbove u+0130 2539 toLowerCase(u+0130) -> u+0069 2540 (5)KelvinSign u+212a 2541 toLowerCase(u+212a) ==> u+006B 2542 (6)AngstromSign u+212b 2543 toLowerCase(u+212b) ==> u+00e5 2544 */ 2545 int d; 2546 if (ch < 256 && 2547 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && 2548 (ch == 0xff || ch == 0xb5 || 2549 ch == 0x49 || ch == 0x69 || //I and i 2550 ch == 0x53 || ch == 0x73 || //S and s 2551 ch == 0x4b || ch == 0x6b || //K and k 2552 ch == 0xc5 || ch == 0xe5))) //A+ring 2553 return bits.add(ch, flags()); 2554 return newSingle(ch); 2555 } 2556 2557 /** 2558 * Parse a single character or a character range in a character class 2559 * and return its representative node. 2560 */ 2561 private CharProperty range(BitClass bits) { 2562 int ch = peek(); 2563 if (ch == '\\') { 2564 ch = nextEscaped(); 2565 if (ch == 'p' || ch == 'P') { // A property 2566 boolean comp = (ch == 'P'); 2567 boolean oneLetter = true; 2568 // Consume { if present 2569 ch = next(); 2570 if (ch != '{') 2571 unread(); 2572 else 2573 oneLetter = false; 2574 return family(oneLetter, comp); 2575 } else { // ordinary escape 2576 unread(); 2577 ch = escape(true, true); 2578 if (ch == -1) 2579 return (CharProperty) root; 2580 } 2581 } else { 2582 ch = single(); 2583 } 2584 if (ch >= 0) { 2585 if (peek() == '-') { 2586 int endRange = temp[cursor+1]; 2587 if (endRange == '[') { 2588 return bitsOrSingle(bits, ch); 2589 } 2590 if (endRange != ']') { 2591 next(); 2592 int m = single(); 2593 if (m < ch) 2594 throw error("Illegal character range"); 2595 if (has(CASE_INSENSITIVE)) 2596 return caseInsensitiveRangeFor(ch, m); 2597 else 2598 return rangeFor(ch, m); 2599 } 2600 } 2601 return bitsOrSingle(bits, ch); 2602 } 2603 throw error("Unexpected character '"+((char)ch)+"'"); 2604 } 2605 2606 private int single() { 2607 int ch = peek(); 2608 switch (ch) { 2609 case '\\': 2610 return escape(true, false); 2611 default: 2612 next(); 2613 return ch; 2614 } 2615 } 2616 2617 /** 2618 * Parses a Unicode character family and returns its representative node. 2619 */ 2620 private CharProperty family(boolean singleLetter, 2621 boolean maybeComplement) 2622 { 2623 next(); 2624 String name; 2625 CharProperty node = null; 2626 2627 if (singleLetter) { 2628 int c = temp[cursor]; 2629 if (!Character.isSupplementaryCodePoint(c)) { 2630 name = String.valueOf((char)c); 2631 } else { 2632 name = new String(temp, cursor, 1); 2633 } 2634 read(); 2635 } else { 2636 int i = cursor; 2637 mark('}'); 2638 while(read() != '}') { 2639 } 2640 mark('\000'); 2641 int j = cursor; 2642 if (j > patternLength) 2643 throw error("Unclosed character family"); 2644 if (i + 1 >= j) 2645 throw error("Empty character family"); 2646 name = new String(temp, i, j-i-1); 2647 } 2648 2649 int i = name.indexOf('='); 2650 if (i != -1) { 2651 // property construct \p{name=value} 2652 String value = name.substring(i + 1); 2653 name = name.substring(0, i).toLowerCase(Locale.ENGLISH); 2654 if ("sc".equals(name) || "script".equals(name)) { 2655 node = unicodeScriptPropertyFor(value); 2656 } else if ("blk".equals(name) || "block".equals(name)) { 2657 node = unicodeBlockPropertyFor(value); 2658 } else if ("gc".equals(name) || "general_category".equals(name)) { 2659 node = charPropertyNodeFor(value); 2660 } else { 2661 throw error("Unknown Unicode property {name=<" + name + ">, " 2662 + "value=<" + value + ">}"); 2663 } 2664 } else { 2665 if (name.startsWith("In")) { 2666 // \p{inBlockName} 2667 node = unicodeBlockPropertyFor(name.substring(2)); 2668 } else if (name.startsWith("Is")) { 2669 // \p{isGeneralCategory} and \p{isScriptName} 2670 name = name.substring(2); 2671 UnicodeProp uprop = UnicodeProp.forName(name); 2672 if (uprop != null) 2673 node = new Utype(uprop); 2674 if (node == null) 2675 node = CharPropertyNames.charPropertyFor(name); 2676 if (node == null) 2677 node = unicodeScriptPropertyFor(name); 2678 } else { 2679 if (has(UNICODE_CHARACTER_CLASS)) { 2680 UnicodeProp uprop = UnicodeProp.forPOSIXName(name); 2681 if (uprop != null) 2682 node = new Utype(uprop); 2683 } 2684 if (node == null) 2685 node = charPropertyNodeFor(name); 2686 } 2687 } 2688 if (maybeComplement) { 2689 if (node instanceof Category || node instanceof Block) 2690 hasSupplementary = true; 2691 node = node.complement(); 2692 } 2693 return node; 2694 } 2695 2696 2697 /** 2698 * Returns a CharProperty matching all characters belong to 2699 * a UnicodeScript. 2700 */ 2701 private CharProperty unicodeScriptPropertyFor(String name) { 2702 final Character.UnicodeScript script; 2703 try { 2704 script = Character.UnicodeScript.forName(name); 2705 } catch (IllegalArgumentException iae) { 2706 throw error("Unknown character script name {" + name + "}"); 2707 } 2708 return new Script(script); 2709 } 2710 2711 /** 2712 * Returns a CharProperty matching all characters in a UnicodeBlock. 2713 */ 2714 private CharProperty unicodeBlockPropertyFor(String name) { 2715 final Character.UnicodeBlock block; 2716 try { 2717 block = Character.UnicodeBlock.forName(name); 2718 } catch (IllegalArgumentException iae) { 2719 throw error("Unknown character block name {" + name + "}"); 2720 } 2721 return new Block(block); 2722 } 2723 2724 /** 2725 * Returns a CharProperty matching all characters in a named property. 2726 */ 2727 private CharProperty charPropertyNodeFor(String name) { 2728 CharProperty p = CharPropertyNames.charPropertyFor(name); 2729 if (p == null) 2730 throw error("Unknown character property name {" + name + "}"); 2731 return p; 2732 } 2733 2734 /** 2735 * Parses and returns the name of a "named capturing group", the trailing 2736 * ">" is consumed after parsing. 2737 */ 2738 private String groupname(int ch) { 2739 StringBuilder sb = new StringBuilder(); 2740 sb.append(Character.toChars(ch)); 2741 while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) || 2742 ASCII.isDigit(ch)) { 2743 sb.append(Character.toChars(ch)); 2744 } 2745 if (sb.length() == 0) 2746 throw error("named capturing group has 0 length name"); 2747 if (ch != '>') 2748 throw error("named capturing group is missing trailing '>'"); 2749 return sb.toString(); 2750 } 2751 2752 /** 2753 * Parses a group and returns the head node of a set of nodes that process 2754 * the group. Sometimes a double return system is used where the tail is 2755 * returned in root. 2756 */ 2757 private Node group0() { 2758 boolean capturingGroup = false; 2759 Node head = null; 2760 Node tail = null; 2761 int save = flags; 2762 root = null; 2763 int ch = next(); 2764 if (ch == '?') { 2765 ch = skip(); 2766 switch (ch) { 2767 case ':': // (?:xxx) pure group 2768 head = createGroup(true); 2769 tail = root; 2770 head.next = expr(tail); 2771 break; 2772 case '=': // (?=xxx) and (?!xxx) lookahead 2773 case '!': 2774 head = createGroup(true); 2775 tail = root; 2776 head.next = expr(tail); 2777 if (ch == '=') { 2778 head = tail = new Pos(head); 2779 } else { 2780 head = tail = new Neg(head); 2781 } 2782 break; 2783 case '>': // (?>xxx) independent group 2784 head = createGroup(true); 2785 tail = root; 2786 head.next = expr(tail); 2787 head = tail = new Ques(head, INDEPENDENT); 2788 break; 2789 case '<': // (?<xxx) look behind 2790 ch = read(); 2791 if (ASCII.isLower(ch) || ASCII.isUpper(ch)) { 2792 // named captured group 2793 String name = groupname(ch); 2794 if (namedGroups().containsKey(name)) 2795 throw error("Named capturing group <" + name 2796 + "> is already defined"); 2797 capturingGroup = true; 2798 head = createGroup(false); 2799 tail = root; 2800 namedGroups().put(name, capturingGroupCount-1); 2801 head.next = expr(tail); 2802 break; 2803 } 2804 int start = cursor; 2805 head = createGroup(true); 2806 tail = root; 2807 head.next = expr(tail); 2808 tail.next = lookbehindEnd; 2809 TreeInfo info = new TreeInfo(); 2810 head.study(info); 2811 if (info.maxValid == false) { 2812 throw error("Look-behind group does not have " 2813 + "an obvious maximum length"); 2814 } 2815 boolean hasSupplementary = findSupplementary(start, patternLength); 2816 if (ch == '=') { 2817 head = tail = (hasSupplementary ? 2818 new BehindS(head, info.maxLength, 2819 info.minLength) : 2820 new Behind(head, info.maxLength, 2821 info.minLength)); 2822 } else if (ch == '!') { 2823 head = tail = (hasSupplementary ? 2824 new NotBehindS(head, info.maxLength, 2825 info.minLength) : 2826 new NotBehind(head, info.maxLength, 2827 info.minLength)); 2828 } else { 2829 throw error("Unknown look-behind group"); 2830 } 2831 break; 2832 case '$': 2833 case '@': 2834 throw error("Unknown group type"); 2835 default: // (?xxx:) inlined match flags 2836 unread(); 2837 addFlag(); 2838 ch = read(); 2839 if (ch == ')') { 2840 return null; // Inline modifier only 2841 } 2842 if (ch != ':') { 2843 throw error("Unknown inline modifier"); 2844 } 2845 head = createGroup(true); 2846 tail = root; 2847 head.next = expr(tail); 2848 break; 2849 } 2850 } else { // (xxx) a regular group 2851 capturingGroup = true; 2852 head = createGroup(false); 2853 tail = root; 2854 head.next = expr(tail); 2855 } 2856 2857 accept(')', "Unclosed group"); 2858 flags = save; 2859 2860 // Check for quantifiers 2861 Node node = closure(head); 2862 if (node == head) { // No closure 2863 root = tail; 2864 return node; // Dual return 2865 } 2866 if (head == tail) { // Zero length assertion 2867 root = node; 2868 return node; // Dual return 2869 } 2870 2871 if (node instanceof Ques) { 2872 Ques ques = (Ques) node; 2873 if (ques.type == POSSESSIVE) { 2874 root = node; 2875 return node; 2876 } 2877 tail.next = new BranchConn(); 2878 tail = tail.next; 2879 if (ques.type == GREEDY) { 2880 head = new Branch(head, null, tail); 2881 } else { // Reluctant quantifier 2882 head = new Branch(null, head, tail); 2883 } 2884 root = tail; 2885 return head; 2886 } else if (node instanceof Curly) { 2887 Curly curly = (Curly) node; 2888 if (curly.type == POSSESSIVE) { 2889 root = node; 2890 return node; 2891 } 2892 // Discover if the group is deterministic 2893 TreeInfo info = new TreeInfo(); 2894 if (head.study(info)) { // Deterministic 2895 GroupTail temp = (GroupTail) tail; 2896 head = root = new GroupCurly(head.next, curly.cmin, 2897 curly.cmax, curly.type, 2898 ((GroupTail)tail).localIndex, 2899 ((GroupTail)tail).groupIndex, 2900 capturingGroup); 2901 return head; 2902 } else { // Non-deterministic 2903 int temp = ((GroupHead) head).localIndex; 2904 Loop loop; 2905 if (curly.type == GREEDY) 2906 loop = new Loop(this.localCount, temp); 2907 else // Reluctant Curly 2908 loop = new LazyLoop(this.localCount, temp); 2909 Prolog prolog = new Prolog(loop); 2910 this.localCount += 1; 2911 loop.cmin = curly.cmin; 2912 loop.cmax = curly.cmax; 2913 loop.body = head; 2914 tail.next = loop; 2915 root = loop; 2916 return prolog; // Dual return 2917 } 2918 } 2919 throw error("Internal logic error"); 2920 } 2921 2922 /** 2923 * Create group head and tail nodes using double return. If the group is 2924 * created with anonymous true then it is a pure group and should not 2925 * affect group counting. 2926 */ 2927 private Node createGroup(boolean anonymous) { 2928 int localIndex = localCount++; 2929 int groupIndex = 0; 2930 if (!anonymous) 2931 groupIndex = capturingGroupCount++; 2932 GroupHead head = new GroupHead(localIndex); 2933 root = new GroupTail(localIndex, groupIndex); 2934 if (!anonymous && groupIndex < 10) 2935 groupNodes[groupIndex] = head; 2936 return head; 2937 } 2938 2939 /** 2940 * Parses inlined match flags and set them appropriately. 2941 */ 2942 private void addFlag() { 2943 int ch = peek(); 2944 for (;;) { 2945 switch (ch) { 2946 case 'i': 2947 flags |= CASE_INSENSITIVE; 2948 break; 2949 case 'm': 2950 flags |= MULTILINE; 2951 break; 2952 case 's': 2953 flags |= DOTALL; 2954 break; 2955 case 'd': 2956 flags |= UNIX_LINES; 2957 break; 2958 case 'u': 2959 flags |= UNICODE_CASE; 2960 break; 2961 case 'c': 2962 flags |= CANON_EQ; 2963 break; 2964 case 'x': 2965 flags |= COMMENTS; 2966 break; 2967 case 'U': 2968 flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); 2969 break; 2970 case '-': // subFlag then fall through 2971 ch = next(); 2972 subFlag(); 2973 default: 2974 return; 2975 } 2976 ch = next(); 2977 } 2978 } 2979 2980 /** 2981 * Parses the second part of inlined match flags and turns off 2982 * flags appropriately. 2983 */ 2984 private void subFlag() { 2985 int ch = peek(); 2986 for (;;) { 2987 switch (ch) { 2988 case 'i': 2989 flags &= ~CASE_INSENSITIVE; 2990 break; 2991 case 'm': 2992 flags &= ~MULTILINE; 2993 break; 2994 case 's': 2995 flags &= ~DOTALL; 2996 break; 2997 case 'd': 2998 flags &= ~UNIX_LINES; 2999 break; 3000 case 'u': 3001 flags &= ~UNICODE_CASE; 3002 break; 3003 case 'c': 3004 flags &= ~CANON_EQ; 3005 break; 3006 case 'x': 3007 flags &= ~COMMENTS; 3008 break; 3009 case 'U': 3010 flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); 3011 default: 3012 return; 3013 } 3014 ch = next(); 3015 } 3016 } 3017 3018 static final int MAX_REPS = 0x7FFFFFFF; 3019 3020 static final int GREEDY = 0; 3021 3022 static final int LAZY = 1; 3023 3024 static final int POSSESSIVE = 2; 3025 3026 static final int INDEPENDENT = 3; 3027 3028 /** 3029 * Processes repetition. If the next character peeked is a quantifier 3030 * then new nodes must be appended to handle the repetition. 3031 * Prev could be a single or a group, so it could be a chain of nodes. 3032 */ 3033 private Node closure(Node prev) { 3034 Node atom; 3035 int ch = peek(); 3036 switch (ch) { 3037 case '?': 3038 ch = next(); 3039 if (ch == '?') { 3040 next(); 3041 return new Ques(prev, LAZY); 3042 } else if (ch == '+') { 3043 next(); 3044 return new Ques(prev, POSSESSIVE); 3045 } 3046 return new Ques(prev, GREEDY); 3047 case '*': 3048 ch = next(); 3049 if (ch == '?') { 3050 next(); 3051 return new Curly(prev, 0, MAX_REPS, LAZY); 3052 } else if (ch == '+') { 3053 next(); 3054 return new Curly(prev, 0, MAX_REPS, POSSESSIVE); 3055 } 3056 return new Curly(prev, 0, MAX_REPS, GREEDY); 3057 case '+': 3058 ch = next(); 3059 if (ch == '?') { 3060 next(); 3061 return new Curly(prev, 1, MAX_REPS, LAZY); 3062 } else if (ch == '+') { 3063 next(); 3064 return new Curly(prev, 1, MAX_REPS, POSSESSIVE); 3065 } 3066 return new Curly(prev, 1, MAX_REPS, GREEDY); 3067 case '{': 3068 ch = temp[cursor+1]; 3069 if (ASCII.isDigit(ch)) { 3070 skip(); 3071 int cmin = 0; 3072 do { 3073 cmin = cmin * 10 + (ch - '0'); 3074 } while (ASCII.isDigit(ch = read())); 3075 int cmax = cmin; 3076 if (ch == ',') { 3077 ch = read(); 3078 cmax = MAX_REPS; 3079 if (ch != '}') { 3080 cmax = 0; 3081 while (ASCII.isDigit(ch)) { 3082 cmax = cmax * 10 + (ch - '0'); 3083 ch = read(); 3084 } 3085 } 3086 } 3087 if (ch != '}') 3088 throw error("Unclosed counted closure"); 3089 if (((cmin) | (cmax) | (cmax - cmin)) < 0) 3090 throw error("Illegal repetition range"); 3091 Curly curly; 3092 ch = peek(); 3093 if (ch == '?') { 3094 next(); 3095 curly = new Curly(prev, cmin, cmax, LAZY); 3096 } else if (ch == '+') { 3097 next(); 3098 curly = new Curly(prev, cmin, cmax, POSSESSIVE); 3099 } else { 3100 curly = new Curly(prev, cmin, cmax, GREEDY); 3101 } 3102 return curly; 3103 } else { 3104 throw error("Illegal repetition"); 3105 } 3106 default: 3107 return prev; 3108 } 3109 } 3110 3111 /** 3112 * Utility method for parsing control escape sequences. 3113 */ 3114 private int c() { 3115 if (cursor < patternLength) { 3116 return read() ^ 64; 3117 } 3118 throw error("Illegal control escape sequence"); 3119 } 3120 3121 /** 3122 * Utility method for parsing octal escape sequences. 3123 */ 3124 private int o() { 3125 int n = read(); 3126 if (((n-'0')|('7'-n)) >= 0) { 3127 int m = read(); 3128 if (((m-'0')|('7'-m)) >= 0) { 3129 int o = read(); 3130 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) { 3131 return (n - '0') * 64 + (m - '0') * 8 + (o - '0'); 3132 } 3133 unread(); 3134 return (n - '0') * 8 + (m - '0'); 3135 } 3136 unread(); 3137 return (n - '0'); 3138 } 3139 throw error("Illegal octal escape sequence"); 3140 } 3141 3142 /** 3143 * Utility method for parsing hexadecimal escape sequences. 3144 */ 3145 private int x() { 3146 int n = read(); 3147 if (ASCII.isHexDigit(n)) { 3148 int m = read(); 3149 if (ASCII.isHexDigit(m)) { 3150 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); 3151 } 3152 } else if (n == '{' && ASCII.isHexDigit(peek())) { 3153 int ch = 0; 3154 while (ASCII.isHexDigit(n = read())) { 3155 ch = (ch << 4) + ASCII.toDigit(n); 3156 if (ch > Character.MAX_CODE_POINT) 3157 throw error("Hexadecimal codepoint is too big"); 3158 } 3159 if (n != '}') 3160 throw error("Unclosed hexadecimal escape sequence"); 3161 return ch; 3162 } 3163 throw error("Illegal hexadecimal escape sequence"); 3164 } 3165 3166 /** 3167 * Utility method for parsing unicode escape sequences. 3168 */ 3169 private int cursor() { 3170 return cursor; 3171 } 3172 3173 private void setcursor(int pos) { 3174 cursor = pos; 3175 } 3176 3177 private int uxxxx() { 3178 int n = 0; 3179 for (int i = 0; i < 4; i++) { 3180 int ch = read(); 3181 if (!ASCII.isHexDigit(ch)) { 3182 throw error("Illegal Unicode escape sequence"); 3183 } 3184 n = n * 16 + ASCII.toDigit(ch); 3185 } 3186 return n; 3187 } 3188 3189 private int u() { 3190 int n = uxxxx(); 3191 if (Character.isHighSurrogate((char)n)) { 3192 int cur = cursor(); 3193 if (read() == '\\' && read() == 'u') { 3194 int n2 = uxxxx(); 3195 if (Character.isLowSurrogate((char)n2)) 3196 return Character.toCodePoint((char)n, (char)n2); 3197 } 3198 setcursor(cur); 3199 } 3200 return n; 3201 } 3202 3203 // 3204 // Utility methods for code point support 3205 // 3206 3207 private static final int countChars(CharSequence seq, int index, 3208 int lengthInCodePoints) { 3209 // optimization 3210 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) { 3211 assert (index >= 0 && index < seq.length()); 3212 return 1; 3213 } 3214 int length = seq.length(); 3215 int x = index; 3216 if (lengthInCodePoints >= 0) { 3217 assert (index >= 0 && index < length); 3218 for (int i = 0; x < length && i < lengthInCodePoints; i++) { 3219 if (Character.isHighSurrogate(seq.charAt(x++))) { 3220 if (x < length && Character.isLowSurrogate(seq.charAt(x))) { 3221 x++; 3222 } 3223 } 3224 } 3225 return x - index; 3226 } 3227 3228 assert (index >= 0 && index <= length); 3229 if (index == 0) { 3230 return 0; 3231 } 3232 int len = -lengthInCodePoints; 3233 for (int i = 0; x > 0 && i < len; i++) { 3234 if (Character.isLowSurrogate(seq.charAt(--x))) { 3235 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) { 3236 x--; 3237 } 3238 } 3239 } 3240 return index - x; 3241 } 3242 3243 private static final int countCodePoints(CharSequence seq) { 3244 int length = seq.length(); 3245 int n = 0; 3246 for (int i = 0; i < length; ) { 3247 n++; 3248 if (Character.isHighSurrogate(seq.charAt(i++))) { 3249 if (i < length && Character.isLowSurrogate(seq.charAt(i))) { 3250 i++; 3251 } 3252 } 3253 } 3254 return n; 3255 } 3256 3257 /** 3258 * Creates a bit vector for matching Latin-1 values. A normal BitClass 3259 * never matches values above Latin-1, and a complemented BitClass always 3260 * matches values above Latin-1. 3261 */ 3262 private static final class BitClass extends BmpCharProperty { 3263 final boolean[] bits; 3264 BitClass() { bits = new boolean[256]; } 3265 private BitClass(boolean[] bits) { this.bits = bits; } 3266 BitClass add(int c, int flags) { 3267 assert c >= 0 && c <= 255; 3268 if ((flags & CASE_INSENSITIVE) != 0) { 3269 if (ASCII.isAscii(c)) { 3270 bits[ASCII.toUpper(c)] = true; 3271 bits[ASCII.toLower(c)] = true; 3272 } else if ((flags & UNICODE_CASE) != 0) { 3273 bits[Character.toLowerCase(c)] = true; 3274 bits[Character.toUpperCase(c)] = true; 3275 } 3276 } 3277 bits[c] = true; 3278 return this; 3279 } 3280 boolean isSatisfiedBy(int ch) { 3281 return ch < 256 && bits[ch]; 3282 } 3283 } 3284 3285 /** 3286 * Returns a suitably optimized, single character matcher. 3287 */ 3288 private CharProperty newSingle(final int ch) { 3289 if (has(CASE_INSENSITIVE)) { 3290 int lower, upper; 3291 if (has(UNICODE_CASE)) { 3292 upper = Character.toUpperCase(ch); 3293 lower = Character.toLowerCase(upper); 3294 if (upper != lower) 3295 return new SingleU(lower); 3296 } else if (ASCII.isAscii(ch)) { 3297 lower = ASCII.toLower(ch); 3298 upper = ASCII.toUpper(ch); 3299 if (lower != upper) 3300 return new SingleI(lower, upper); 3301 } 3302 } 3303 if (isSupplementary(ch)) 3304 return new SingleS(ch); // Match a given Unicode character 3305 return new Single(ch); // Match a given BMP character 3306 } 3307 3308 /** 3309 * Utility method for creating a string slice matcher. 3310 */ 3311 private Node newSlice(int[] buf, int count, boolean hasSupplementary) { 3312 int[] tmp = new int[count]; 3313 if (has(CASE_INSENSITIVE)) { 3314 if (has(UNICODE_CASE)) { 3315 for (int i = 0; i < count; i++) { 3316 tmp[i] = Character.toLowerCase( 3317 Character.toUpperCase(buf[i])); 3318 } 3319 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp); 3320 } 3321 for (int i = 0; i < count; i++) { 3322 tmp[i] = ASCII.toLower(buf[i]); 3323 } 3324 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp); 3325 } 3326 for (int i = 0; i < count; i++) { 3327 tmp[i] = buf[i]; 3328 } 3329 return hasSupplementary ? new SliceS(tmp) : new Slice(tmp); 3330 } 3331 3332 /** 3333 * The following classes are the building components of the object 3334 * tree that represents a compiled regular expression. The object tree 3335 * is made of individual elements that handle constructs in the Pattern. 3336 * Each type of object knows how to match its equivalent construct with 3337 * the match() method. 3338 */ 3339 3340 /** 3341 * Base class for all node classes. Subclasses should override the match() 3342 * method as appropriate. This class is an accepting node, so its match() 3343 * always returns true. 3344 */ 3345 static class Node extends Object { 3346 Node next; 3347 Node() { 3348 next = Pattern.accept; 3349 } 3350 /** 3351 * This method implements the classic accept node. 3352 */ 3353 boolean match(Matcher matcher, int i, CharSequence seq) { 3354 matcher.last = i; 3355 matcher.groups[0] = matcher.first; 3356 matcher.groups[1] = matcher.last; 3357 return true; 3358 } 3359 /** 3360 * This method is good for all zero length assertions. 3361 */ 3362 boolean study(TreeInfo info) { 3363 if (next != null) { 3364 return next.study(info); 3365 } else { 3366 return info.deterministic; 3367 } 3368 } 3369 } 3370 3371 static class LastNode extends Node { 3372 /** 3373 * This method implements the classic accept node with 3374 * the addition of a check to see if the match occurred 3375 * using all of the input. 3376 */ 3377 boolean match(Matcher matcher, int i, CharSequence seq) { 3378 if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to) 3379 return false; 3380 matcher.last = i; 3381 matcher.groups[0] = matcher.first; 3382 matcher.groups[1] = matcher.last; 3383 return true; 3384 } 3385 } 3386 3387 /** 3388 * Used for REs that can start anywhere within the input string. 3389 * This basically tries to match repeatedly at each spot in the 3390 * input string, moving forward after each try. An anchored search 3391 * or a BnM will bypass this node completely. 3392 */ 3393 static class Start extends Node { 3394 int minLength; 3395 Start(Node node) { 3396 this.next = node; 3397 TreeInfo info = new TreeInfo(); 3398 next.study(info); 3399 minLength = info.minLength; 3400 } 3401 boolean match(Matcher matcher, int i, CharSequence seq) { 3402 if (i > matcher.to - minLength) { 3403 matcher.hitEnd = true; 3404 return false; 3405 } 3406 int guard = matcher.to - minLength; 3407 for (; i <= guard; i++) { 3408 if (next.match(matcher, i, seq)) { 3409 matcher.first = i; 3410 matcher.groups[0] = matcher.first; 3411 matcher.groups[1] = matcher.last; 3412 return true; 3413 } 3414 } 3415 matcher.hitEnd = true; 3416 return false; 3417 } 3418 boolean study(TreeInfo info) { 3419 next.study(info); 3420 info.maxValid = false; 3421 info.deterministic = false; 3422 return false; 3423 } 3424 } 3425 3426 /* 3427 * StartS supports supplementary characters, including unpaired surrogates. 3428 */ 3429 static final class StartS extends Start { 3430 StartS(Node node) { 3431 super(node); 3432 } 3433 boolean match(Matcher matcher, int i, CharSequence seq) { 3434 if (i > matcher.to - minLength) { 3435 matcher.hitEnd = true; 3436 return false; 3437 } 3438 int guard = matcher.to - minLength; 3439 while (i <= guard) { 3440 //if ((ret = next.match(matcher, i, seq)) || i == guard) 3441 if (next.match(matcher, i, seq)) { 3442 matcher.first = i; 3443 matcher.groups[0] = matcher.first; 3444 matcher.groups[1] = matcher.last; 3445 return true; 3446 } 3447 if (i == guard) 3448 break; 3449 // Optimization to move to the next character. This is 3450 // faster than countChars(seq, i, 1). 3451 if (Character.isHighSurrogate(seq.charAt(i++))) { 3452 if (i < seq.length() && 3453 Character.isLowSurrogate(seq.charAt(i))) { 3454 i++; 3455 } 3456 } 3457 } 3458 matcher.hitEnd = true; 3459 return false; 3460 } 3461 } 3462 3463 /** 3464 * Node to anchor at the beginning of input. This object implements the 3465 * match for a \A sequence, and the caret anchor will use this if not in 3466 * multiline mode. 3467 */ 3468 static final class Begin extends Node { 3469 boolean match(Matcher matcher, int i, CharSequence seq) { 3470 int fromIndex = (matcher.anchoringBounds) ? 3471 matcher.from : 0; 3472 if (i == fromIndex && next.match(matcher, i, seq)) { 3473 matcher.first = i; 3474 matcher.groups[0] = i; 3475 matcher.groups[1] = matcher.last; 3476 return true; 3477 } else { 3478 return false; 3479 } 3480 } 3481 } 3482 3483 /** 3484 * Node to anchor at the end of input. This is the absolute end, so this 3485 * should not match at the last newline before the end as $ will. 3486 */ 3487 static final class End extends Node { 3488 boolean match(Matcher matcher, int i, CharSequence seq) { 3489 int endIndex = (matcher.anchoringBounds) ? 3490 matcher.to : matcher.getTextLength(); 3491 if (i == endIndex) { 3492 matcher.hitEnd = true; 3493 return next.match(matcher, i, seq); 3494 } 3495 return false; 3496 } 3497 } 3498 3499 /** 3500 * Node to anchor at the beginning of a line. This is essentially the 3501 * object to match for the multiline ^. 3502 */ 3503 static final class Caret extends Node { 3504 boolean match(Matcher matcher, int i, CharSequence seq) { 3505 int startIndex = matcher.from; 3506 int endIndex = matcher.to; 3507 if (!matcher.anchoringBounds) { 3508 startIndex = 0; 3509 endIndex = matcher.getTextLength(); 3510 } 3511 // Perl does not match ^ at end of input even after newline 3512 if (i == endIndex) { 3513 matcher.hitEnd = true; 3514 return false; 3515 } 3516 if (i > startIndex) { 3517 char ch = seq.charAt(i-1); 3518 if (ch != '\n' && ch != '\r' 3519 && (ch|1) != '\u2029' 3520 && ch != '\u0085' ) { 3521 return false; 3522 } 3523 // Should treat /r/n as one newline 3524 if (ch == '\r' && seq.charAt(i) == '\n') 3525 return false; 3526 } 3527 return next.match(matcher, i, seq); 3528 } 3529 } 3530 3531 /** 3532 * Node to anchor at the beginning of a line when in unixdot mode. 3533 */ 3534 static final class UnixCaret extends Node { 3535 boolean match(Matcher matcher, int i, CharSequence seq) { 3536 int startIndex = matcher.from; 3537 int endIndex = matcher.to; 3538 if (!matcher.anchoringBounds) { 3539 startIndex = 0; 3540 endIndex = matcher.getTextLength(); 3541 } 3542 // Perl does not match ^ at end of input even after newline 3543 if (i == endIndex) { 3544 matcher.hitEnd = true; 3545 return false; 3546 } 3547 if (i > startIndex) { 3548 char ch = seq.charAt(i-1); 3549 if (ch != '\n') { 3550 return false; 3551 } 3552 } 3553 return next.match(matcher, i, seq); 3554 } 3555 } 3556 3557 /** 3558 * Node to match the location where the last match ended. 3559 * This is used for the \G construct. 3560 */ 3561 static final class LastMatch extends Node { 3562 boolean match(Matcher matcher, int i, CharSequence seq) { 3563 if (i != matcher.oldLast) 3564 return false; 3565 return next.match(matcher, i, seq); 3566 } 3567 } 3568 3569 /** 3570 * Node to anchor at the end of a line or the end of input based on the 3571 * multiline mode. 3572 * 3573 * When not in multiline mode, the $ can only match at the very end 3574 * of the input, unless the input ends in a line terminator in which 3575 * it matches right before the last line terminator. 3576 * 3577 * Note that \r\n is considered an atomic line terminator. 3578 * 3579 * Like ^ the $ operator matches at a position, it does not match the 3580 * line terminators themselves. 3581 */ 3582 static final class Dollar extends Node { 3583 boolean multiline; 3584 Dollar(boolean mul) { 3585 multiline = mul; 3586 } 3587 boolean match(Matcher matcher, int i, CharSequence seq) { 3588 int endIndex = (matcher.anchoringBounds) ? 3589 matcher.to : matcher.getTextLength(); 3590 if (!multiline) { 3591 if (i < endIndex - 2) 3592 return false; 3593 if (i == endIndex - 2) { 3594 char ch = seq.charAt(i); 3595 if (ch != '\r') 3596 return false; 3597 ch = seq.charAt(i + 1); 3598 if (ch != '\n') 3599 return false; 3600 } 3601 } 3602 // Matches before any line terminator; also matches at the 3603 // end of input 3604 // Before line terminator: 3605 // If multiline, we match here no matter what 3606 // If not multiline, fall through so that the end 3607 // is marked as hit; this must be a /r/n or a /n 3608 // at the very end so the end was hit; more input 3609 // could make this not match here 3610 if (i < endIndex) { 3611 char ch = seq.charAt(i); 3612 if (ch == '\n') { 3613 // No match between \r\n 3614 if (i > 0 && seq.charAt(i-1) == '\r') 3615 return false; 3616 if (multiline) 3617 return next.match(matcher, i, seq); 3618 } else if (ch == '\r' || ch == '\u0085' || 3619 (ch|1) == '\u2029') { 3620 if (multiline) 3621 return next.match(matcher, i, seq); 3622 } else { // No line terminator, no match 3623 return false; 3624 } 3625 } 3626 // Matched at current end so hit end 3627 matcher.hitEnd = true; 3628 // If a $ matches because of end of input, then more input 3629 // could cause it to fail! 3630 matcher.requireEnd = true; 3631 return next.match(matcher, i, seq); 3632 } 3633 boolean study(TreeInfo info) { 3634 next.study(info); 3635 return info.deterministic; 3636 } 3637 } 3638 3639 /** 3640 * Node to anchor at the end of a line or the end of input based on the 3641 * multiline mode when in unix lines mode. 3642 */ 3643 static final class UnixDollar extends Node { 3644 boolean multiline; 3645 UnixDollar(boolean mul) { 3646 multiline = mul; 3647 } 3648 boolean match(Matcher matcher, int i, CharSequence seq) { 3649 int endIndex = (matcher.anchoringBounds) ? 3650 matcher.to : matcher.getTextLength(); 3651 if (i < endIndex) { 3652 char ch = seq.charAt(i); 3653 if (ch == '\n') { 3654 // If not multiline, then only possible to 3655 // match at very end or one before end 3656 if (multiline == false && i != endIndex - 1) 3657 return false; 3658 // If multiline return next.match without setting 3659 // matcher.hitEnd 3660 if (multiline) 3661 return next.match(matcher, i, seq); 3662 } else { 3663 return false; 3664 } 3665 } 3666 // Matching because at the end or 1 before the end; 3667 // more input could change this so set hitEnd 3668 matcher.hitEnd = true; 3669 // If a $ matches because of end of input, then more input 3670 // could cause it to fail! 3671 matcher.requireEnd = true; 3672 return next.match(matcher, i, seq); 3673 } 3674 boolean study(TreeInfo info) { 3675 next.study(info); 3676 return info.deterministic; 3677 } 3678 } 3679 3680 /** 3681 * Abstract node class to match one character satisfying some 3682 * boolean property. 3683 */ 3684 private static abstract class CharProperty extends Node { 3685 abstract boolean isSatisfiedBy(int ch); 3686 CharProperty complement() { 3687 return new CharProperty() { 3688 boolean isSatisfiedBy(int ch) { 3689 return ! CharProperty.this.isSatisfiedBy(ch);}}; 3690 } 3691 boolean match(Matcher matcher, int i, CharSequence seq) { 3692 if (i < matcher.to) { 3693 int ch = Character.codePointAt(seq, i); 3694 return isSatisfiedBy(ch) 3695 && next.match(matcher, i+Character.charCount(ch), seq); 3696 } else { 3697 matcher.hitEnd = true; 3698 return false; 3699 } 3700 } 3701 boolean study(TreeInfo info) { 3702 info.minLength++; 3703 info.maxLength++; 3704 return next.study(info); 3705 } 3706 } 3707 3708 /** 3709 * Optimized version of CharProperty that works only for 3710 * properties never satisfied by Supplementary characters. 3711 */ 3712 private static abstract class BmpCharProperty extends CharProperty { 3713 boolean match(Matcher matcher, int i, CharSequence seq) { 3714 if (i < matcher.to) { 3715 return isSatisfiedBy(seq.charAt(i)) 3716 && next.match(matcher, i+1, seq); 3717 } else { 3718 matcher.hitEnd = true; 3719 return false; 3720 } 3721 } 3722 } 3723 3724 /** 3725 * Node class that matches a Supplementary Unicode character 3726 */ 3727 static final class SingleS extends CharProperty { 3728 final int c; 3729 SingleS(int c) { this.c = c; } 3730 boolean isSatisfiedBy(int ch) { 3731 return ch == c; 3732 } 3733 } 3734 3735 /** 3736 * Optimization -- matches a given BMP character 3737 */ 3738 static final class Single extends BmpCharProperty { 3739 final int c; 3740 Single(int c) { this.c = c; } 3741 boolean isSatisfiedBy(int ch) { 3742 return ch == c; 3743 } 3744 } 3745 3746 /** 3747 * Case insensitive matches a given BMP character 3748 */ 3749 static final class SingleI extends BmpCharProperty { 3750 final int lower; 3751 final int upper; 3752 SingleI(int lower, int upper) { 3753 this.lower = lower; 3754 this.upper = upper; 3755 } 3756 boolean isSatisfiedBy(int ch) { 3757 return ch == lower || ch == upper; 3758 } 3759 } 3760 3761 /** 3762 * Unicode case insensitive matches a given Unicode character 3763 */ 3764 static final class SingleU extends CharProperty { 3765 final int lower; 3766 SingleU(int lower) { 3767 this.lower = lower; 3768 } 3769 boolean isSatisfiedBy(int ch) { 3770 return lower == ch || 3771 lower == Character.toLowerCase(Character.toUpperCase(ch)); 3772 } 3773 } 3774 3775 3776 /** 3777 * Node class that matches a Unicode block. 3778 */ 3779 static final class Block extends CharProperty { 3780 final Character.UnicodeBlock block; 3781 Block(Character.UnicodeBlock block) { 3782 this.block = block; 3783 } 3784 boolean isSatisfiedBy(int ch) { 3785 return block == Character.UnicodeBlock.of(ch); 3786 } 3787 } 3788 3789 /** 3790 * Node class that matches a Unicode script 3791 */ 3792 static final class Script extends CharProperty { 3793 final Character.UnicodeScript script; 3794 Script(Character.UnicodeScript script) { 3795 this.script = script; 3796 } 3797 boolean isSatisfiedBy(int ch) { 3798 return script == Character.UnicodeScript.of(ch); 3799 } 3800 } 3801 3802 /** 3803 * Node class that matches a Unicode category. 3804 */ 3805 static final class Category extends CharProperty { 3806 final int typeMask; 3807 Category(int typeMask) { this.typeMask = typeMask; } 3808 boolean isSatisfiedBy(int ch) { 3809 return (typeMask & (1 << Character.getType(ch))) != 0; 3810 } 3811 } 3812 3813 /** 3814 * Node class that matches a Unicode "type" 3815 */ 3816 static final class Utype extends CharProperty { 3817 final UnicodeProp uprop; 3818 Utype(UnicodeProp uprop) { this.uprop = uprop; } 3819 boolean isSatisfiedBy(int ch) { 3820 return uprop.is(ch); 3821 } 3822 } 3823 3824 3825 /** 3826 * Node class that matches a POSIX type. 3827 */ 3828 static final class Ctype extends BmpCharProperty { 3829 final int ctype; 3830 Ctype(int ctype) { this.ctype = ctype; } 3831 boolean isSatisfiedBy(int ch) { 3832 return ch < 128 && ASCII.isType(ch, ctype); 3833 } 3834 } 3835 3836 /** 3837 * Base class for all Slice nodes 3838 */ 3839 static class SliceNode extends Node { 3840 int[] buffer; 3841 SliceNode(int[] buf) { 3842 buffer = buf; 3843 } 3844 boolean study(TreeInfo info) { 3845 info.minLength += buffer.length; 3846 info.maxLength += buffer.length; 3847 return next.study(info); 3848 } 3849 } 3850 3851 /** 3852 * Node class for a case sensitive/BMP-only sequence of literal 3853 * characters. 3854 */ 3855 static final class Slice extends SliceNode { 3856 Slice(int[] buf) { 3857 super(buf); 3858 } 3859 boolean match(Matcher matcher, int i, CharSequence seq) { 3860 int[] buf = buffer; 3861 int len = buf.length; 3862 for (int j=0; j<len; j++) { 3863 if ((i+j) >= matcher.to) { 3864 matcher.hitEnd = true; 3865 return false; 3866 } 3867 if (buf[j] != seq.charAt(i+j)) 3868 return false; 3869 } 3870 return next.match(matcher, i+len, seq); 3871 } 3872 } 3873 3874 /** 3875 * Node class for a case_insensitive/BMP-only sequence of literal 3876 * characters. 3877 */ 3878 static class SliceI extends SliceNode { 3879 SliceI(int[] buf) { 3880 super(buf); 3881 } 3882 boolean match(Matcher matcher, int i, CharSequence seq) { 3883 int[] buf = buffer; 3884 int len = buf.length; 3885 for (int j=0; j<len; j++) { 3886 if ((i+j) >= matcher.to) { 3887 matcher.hitEnd = true; 3888 return false; 3889 } 3890 int c = seq.charAt(i+j); 3891 if (buf[j] != c && 3892 buf[j] != ASCII.toLower(c)) 3893 return false; 3894 } 3895 return next.match(matcher, i+len, seq); 3896 } 3897 } 3898 3899 /** 3900 * Node class for a unicode_case_insensitive/BMP-only sequence of 3901 * literal characters. Uses unicode case folding. 3902 */ 3903 static final class SliceU extends SliceNode { 3904 SliceU(int[] buf) { 3905 super(buf); 3906 } 3907 boolean match(Matcher matcher, int i, CharSequence seq) { 3908 int[] buf = buffer; 3909 int len = buf.length; 3910 for (int j=0; j<len; j++) { 3911 if ((i+j) >= matcher.to) { 3912 matcher.hitEnd = true; 3913 return false; 3914 } 3915 int c = seq.charAt(i+j); 3916 if (buf[j] != c && 3917 buf[j] != Character.toLowerCase(Character.toUpperCase(c))) 3918 return false; 3919 } 3920 return next.match(matcher, i+len, seq); 3921 } 3922 } 3923 3924 /** 3925 * Node class for a case sensitive sequence of literal characters 3926 * including supplementary characters. 3927 */ 3928 static final class SliceS extends SliceNode { 3929 SliceS(int[] buf) { 3930 super(buf); 3931 } 3932 boolean match(Matcher matcher, int i, CharSequence seq) { 3933 int[] buf = buffer; 3934 int x = i; 3935 for (int j = 0; j < buf.length; j++) { 3936 if (x >= matcher.to) { 3937 matcher.hitEnd = true; 3938 return false; 3939 } 3940 int c = Character.codePointAt(seq, x); 3941 if (buf[j] != c) 3942 return false; 3943 x += Character.charCount(c); 3944 if (x > matcher.to) { 3945 matcher.hitEnd = true; 3946 return false; 3947 } 3948 } 3949 return next.match(matcher, x, seq); 3950 } 3951 } 3952 3953 /** 3954 * Node class for a case insensitive sequence of literal characters 3955 * including supplementary characters. 3956 */ 3957 static class SliceIS extends SliceNode { 3958 SliceIS(int[] buf) { 3959 super(buf); 3960 } 3961 int toLower(int c) { 3962 return ASCII.toLower(c); 3963 } 3964 boolean match(Matcher matcher, int i, CharSequence seq) { 3965 int[] buf = buffer; 3966 int x = i; 3967 for (int j = 0; j < buf.length; j++) { 3968 if (x >= matcher.to) { 3969 matcher.hitEnd = true; 3970 return false; 3971 } 3972 int c = Character.codePointAt(seq, x); 3973 if (buf[j] != c && buf[j] != toLower(c)) 3974 return false; 3975 x += Character.charCount(c); 3976 if (x > matcher.to) { 3977 matcher.hitEnd = true; 3978 return false; 3979 } 3980 } 3981 return next.match(matcher, x, seq); 3982 } 3983 } 3984 3985 /** 3986 * Node class for a case insensitive sequence of literal characters. 3987 * Uses unicode case folding. 3988 */ 3989 static final class SliceUS extends SliceIS { 3990 SliceUS(int[] buf) { 3991 super(buf); 3992 } 3993 int toLower(int c) { 3994 return Character.toLowerCase(Character.toUpperCase(c)); 3995 } 3996 } 3997 3998 private static boolean inRange(int lower, int ch, int upper) { 3999 return lower <= ch && ch <= upper; 4000 } 4001 4002 /** 4003 * Returns node for matching characters within an explicit value range. 4004 */ 4005 private static CharProperty rangeFor(final int lower, 4006 final int upper) { 4007 return new CharProperty() { 4008 boolean isSatisfiedBy(int ch) { 4009 return inRange(lower, ch, upper);}}; 4010 } 4011 4012 /** 4013 * Returns node for matching characters within an explicit value 4014 * range in a case insensitive manner. 4015 */ 4016 private CharProperty caseInsensitiveRangeFor(final int lower, 4017 final int upper) { 4018 if (has(UNICODE_CASE)) 4019 return new CharProperty() { 4020 boolean isSatisfiedBy(int ch) { 4021 if (inRange(lower, ch, upper)) 4022 return true; 4023 int up = Character.toUpperCase(ch); 4024 return inRange(lower, up, upper) || 4025 inRange(lower, Character.toLowerCase(up), upper);}}; 4026 return new CharProperty() { 4027 boolean isSatisfiedBy(int ch) { 4028 return inRange(lower, ch, upper) || 4029 ASCII.isAscii(ch) && 4030 (inRange(lower, ASCII.toUpper(ch), upper) || 4031 inRange(lower, ASCII.toLower(ch), upper)); 4032 }}; 4033 } 4034 4035 /** 4036 * Implements the Unicode category ALL and the dot metacharacter when 4037 * in dotall mode. 4038 */ 4039 static final class All extends CharProperty { 4040 boolean isSatisfiedBy(int ch) { 4041 return true; 4042 } 4043 } 4044 4045 /** 4046 * Node class for the dot metacharacter when dotall is not enabled. 4047 */ 4048 static final class Dot extends CharProperty { 4049 boolean isSatisfiedBy(int ch) { 4050 return (ch != '\n' && ch != '\r' 4051 && (ch|1) != '\u2029' 4052 && ch != '\u0085'); 4053 } 4054 } 4055 4056 /** 4057 * Node class for the dot metacharacter when dotall is not enabled 4058 * but UNIX_LINES is enabled. 4059 */ 4060 static final class UnixDot extends CharProperty { 4061 boolean isSatisfiedBy(int ch) { 4062 return ch != '\n'; 4063 } 4064 } 4065 4066 /** 4067 * The 0 or 1 quantifier. This one class implements all three types. 4068 */ 4069 static final class Ques extends Node { 4070 Node atom; 4071 int type; 4072 Ques(Node node, int type) { 4073 this.atom = node; 4074 this.type = type; 4075 } 4076 boolean match(Matcher matcher, int i, CharSequence seq) { 4077 switch (type) { 4078 case GREEDY: 4079 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)) 4080 || next.match(matcher, i, seq); 4081 case LAZY: 4082 return next.match(matcher, i, seq) 4083 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)); 4084 case POSSESSIVE: 4085 if (atom.match(matcher, i, seq)) i = matcher.last; 4086 return next.match(matcher, i, seq); 4087 default: 4088 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); 4089 } 4090 } 4091 boolean study(TreeInfo info) { 4092 if (type != INDEPENDENT) { 4093 int minL = info.minLength; 4094 atom.study(info); 4095 info.minLength = minL; 4096 info.deterministic = false; 4097 return next.study(info); 4098 } else { 4099 atom.study(info); 4100 return next.study(info); 4101 } 4102 } 4103 } 4104 4105 /** 4106 * Handles the curly-brace style repetition with a specified minimum and 4107 * maximum occurrences. The * quantifier is handled as a special case. 4108 * This class handles the three types. 4109 */ 4110 static final class Curly extends Node { 4111 Node atom; 4112 int type; 4113 int cmin; 4114 int cmax; 4115 4116 Curly(Node node, int cmin, int cmax, int type) { 4117 this.atom = node; 4118 this.type = type; 4119 this.cmin = cmin; 4120 this.cmax = cmax; 4121 } 4122 boolean match(Matcher matcher, int i, CharSequence seq) { 4123 int j; 4124 for (j = 0; j < cmin; j++) { 4125 if (atom.match(matcher, i, seq)) { 4126 i = matcher.last; 4127 continue; 4128 } 4129 return false; 4130 } 4131 if (type == GREEDY) 4132 return match0(matcher, i, j, seq); 4133 else if (type == LAZY) 4134 return match1(matcher, i, j, seq); 4135 else 4136 return match2(matcher, i, j, seq); 4137 } 4138 // Greedy match. 4139 // i is the index to start matching at 4140 // j is the number of atoms that have matched 4141 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4142 if (j >= cmax) { 4143 // We have matched the maximum... continue with the rest of 4144 // the regular expression 4145 return next.match(matcher, i, seq); 4146 } 4147 int backLimit = j; 4148 while (atom.match(matcher, i, seq)) { 4149 // k is the length of this match 4150 int k = matcher.last - i; 4151 if (k == 0) // Zero length match 4152 break; 4153 // Move up index and number matched 4154 i = matcher.last; 4155 j++; 4156 // We are greedy so match as many as we can 4157 while (j < cmax) { 4158 if (!atom.match(matcher, i, seq)) 4159 break; 4160 if (i + k != matcher.last) { 4161 if (match0(matcher, matcher.last, j+1, seq)) 4162 return true; 4163 break; 4164 } 4165 i += k; 4166 j++; 4167 } 4168 // Handle backing off if match fails 4169 while (j >= backLimit) { 4170 if (next.match(matcher, i, seq)) 4171 return true; 4172 i -= k; 4173 j--; 4174 } 4175 return false; 4176 } 4177 return next.match(matcher, i, seq); 4178 } 4179 // Reluctant match. At this point, the minimum has been satisfied. 4180 // i is the index to start matching at 4181 // j is the number of atoms that have matched 4182 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4183 for (;;) { 4184 // Try finishing match without consuming any more 4185 if (next.match(matcher, i, seq)) 4186 return true; 4187 // At the maximum, no match found 4188 if (j >= cmax) 4189 return false; 4190 // Okay, must try one more atom 4191 if (!atom.match(matcher, i, seq)) 4192 return false; 4193 // If we haven't moved forward then must break out 4194 if (i == matcher.last) 4195 return false; 4196 // Move up index and number matched 4197 i = matcher.last; 4198 j++; 4199 } 4200 } 4201 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4202 for (; j < cmax; j++) { 4203 if (!atom.match(matcher, i, seq)) 4204 break; 4205 if (i == matcher.last) 4206 break; 4207 i = matcher.last; 4208 } 4209 return next.match(matcher, i, seq); 4210 } 4211 boolean study(TreeInfo info) { 4212 // Save original info 4213 int minL = info.minLength; 4214 int maxL = info.maxLength; 4215 boolean maxV = info.maxValid; 4216 boolean detm = info.deterministic; 4217 info.reset(); 4218 4219 atom.study(info); 4220 4221 int temp = info.minLength * cmin + minL; 4222 if (temp < minL) { 4223 temp = 0xFFFFFFF; // arbitrary large number 4224 } 4225 info.minLength = temp; 4226 4227 if (maxV & info.maxValid) { 4228 temp = info.maxLength * cmax + maxL; 4229 info.maxLength = temp; 4230 if (temp < maxL) { 4231 info.maxValid = false; 4232 } 4233 } else { 4234 info.maxValid = false; 4235 } 4236 4237 if (info.deterministic && cmin == cmax) 4238 info.deterministic = detm; 4239 else 4240 info.deterministic = false; 4241 4242 return next.study(info); 4243 } 4244 } 4245 4246 /** 4247 * Handles the curly-brace style repetition with a specified minimum and 4248 * maximum occurrences in deterministic cases. This is an iterative 4249 * optimization over the Prolog and Loop system which would handle this 4250 * in a recursive way. The * quantifier is handled as a special case. 4251 * If capture is true then this class saves group settings and ensures 4252 * that groups are unset when backing off of a group match. 4253 */ 4254 static final class GroupCurly extends Node { 4255 Node atom; 4256 int type; 4257 int cmin; 4258 int cmax; 4259 int localIndex; 4260 int groupIndex; 4261 boolean capture; 4262 4263 GroupCurly(Node node, int cmin, int cmax, int type, int local, 4264 int group, boolean capture) { 4265 this.atom = node; 4266 this.type = type; 4267 this.cmin = cmin; 4268 this.cmax = cmax; 4269 this.localIndex = local; 4270 this.groupIndex = group; 4271 this.capture = capture; 4272 } 4273 boolean match(Matcher matcher, int i, CharSequence seq) { 4274 int[] groups = matcher.groups; 4275 int[] locals = matcher.locals; 4276 int save0 = locals[localIndex]; 4277 int save1 = 0; 4278 int save2 = 0; 4279 4280 if (capture) { 4281 save1 = groups[groupIndex]; 4282 save2 = groups[groupIndex+1]; 4283 } 4284 4285 // Notify GroupTail there is no need to setup group info 4286 // because it will be set here 4287 locals[localIndex] = -1; 4288 4289 boolean ret = true; 4290 for (int j = 0; j < cmin; j++) { 4291 if (atom.match(matcher, i, seq)) { 4292 if (capture) { 4293 groups[groupIndex] = i; 4294 groups[groupIndex+1] = matcher.last; 4295 } 4296 i = matcher.last; 4297 } else { 4298 ret = false; 4299 break; 4300 } 4301 } 4302 if (ret) { 4303 if (type == GREEDY) { 4304 ret = match0(matcher, i, cmin, seq); 4305 } else if (type == LAZY) { 4306 ret = match1(matcher, i, cmin, seq); 4307 } else { 4308 ret = match2(matcher, i, cmin, seq); 4309 } 4310 } 4311 if (!ret) { 4312 locals[localIndex] = save0; 4313 if (capture) { 4314 groups[groupIndex] = save1; 4315 groups[groupIndex+1] = save2; 4316 } 4317 } 4318 return ret; 4319 } 4320 // Aggressive group match 4321 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4322 int[] groups = matcher.groups; 4323 int save0 = 0; 4324 int save1 = 0; 4325 if (capture) { 4326 save0 = groups[groupIndex]; 4327 save1 = groups[groupIndex+1]; 4328 } 4329 for (;;) { 4330 if (j >= cmax) 4331 break; 4332 if (!atom.match(matcher, i, seq)) 4333 break; 4334 int k = matcher.last - i; 4335 if (k <= 0) { 4336 if (capture) { 4337 groups[groupIndex] = i; 4338 groups[groupIndex+1] = i + k; 4339 } 4340 i = i + k; 4341 break; 4342 } 4343 for (;;) { 4344 if (capture) { 4345 groups[groupIndex] = i; 4346 groups[groupIndex+1] = i + k; 4347 } 4348 i = i + k; 4349 if (++j >= cmax) 4350 break; 4351 if (!atom.match(matcher, i, seq)) 4352 break; 4353 if (i + k != matcher.last) { 4354 if (match0(matcher, i, j, seq)) 4355 return true; 4356 break; 4357 } 4358 } 4359 while (j > cmin) { 4360 if (next.match(matcher, i, seq)) { 4361 if (capture) { 4362 groups[groupIndex+1] = i; 4363 groups[groupIndex] = i - k; 4364 } 4365 i = i - k; 4366 return true; 4367 } 4368 // backing off 4369 if (capture) { 4370 groups[groupIndex+1] = i; 4371 groups[groupIndex] = i - k; 4372 } 4373 i = i - k; 4374 j--; 4375 } 4376 break; 4377 } 4378 if (capture) { 4379 groups[groupIndex] = save0; 4380 groups[groupIndex+1] = save1; 4381 } 4382 return next.match(matcher, i, seq); 4383 } 4384 // Reluctant matching 4385 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4386 for (;;) { 4387 if (next.match(matcher, i, seq)) 4388 return true; 4389 if (j >= cmax) 4390 return false; 4391 if (!atom.match(matcher, i, seq)) 4392 return false; 4393 if (i == matcher.last) 4394 return false; 4395 if (capture) { 4396 matcher.groups[groupIndex] = i; 4397 matcher.groups[groupIndex+1] = matcher.last; 4398 } 4399 i = matcher.last; 4400 j++; 4401 } 4402 } 4403 // Possessive matching 4404 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4405 for (; j < cmax; j++) { 4406 if (!atom.match(matcher, i, seq)) { 4407 break; 4408 } 4409 if (capture) { 4410 matcher.groups[groupIndex] = i; 4411 matcher.groups[groupIndex+1] = matcher.last; 4412 } 4413 if (i == matcher.last) { 4414 break; 4415 } 4416 i = matcher.last; 4417 } 4418 return next.match(matcher, i, seq); 4419 } 4420 boolean study(TreeInfo info) { 4421 // Save original info 4422 int minL = info.minLength; 4423 int maxL = info.maxLength; 4424 boolean maxV = info.maxValid; 4425 boolean detm = info.deterministic; 4426 info.reset(); 4427 4428 atom.study(info); 4429 4430 int temp = info.minLength * cmin + minL; 4431 if (temp < minL) { 4432 temp = 0xFFFFFFF; // Arbitrary large number 4433 } 4434 info.minLength = temp; 4435 4436 if (maxV & info.maxValid) { 4437 temp = info.maxLength * cmax + maxL; 4438 info.maxLength = temp; 4439 if (temp < maxL) { 4440 info.maxValid = false; 4441 } 4442 } else { 4443 info.maxValid = false; 4444 } 4445 4446 if (info.deterministic && cmin == cmax) { 4447 info.deterministic = detm; 4448 } else { 4449 info.deterministic = false; 4450 } 4451 4452 return next.study(info); 4453 } 4454 } 4455 4456 /** 4457 * A Guard node at the end of each atom node in a Branch. It 4458 * serves the purpose of chaining the "match" operation to 4459 * "next" but not the "study", so we can collect the TreeInfo 4460 * of each atom node without including the TreeInfo of the 4461 * "next". 4462 */ 4463 static final class BranchConn extends Node { 4464 BranchConn() {}; 4465 boolean match(Matcher matcher, int i, CharSequence seq) { 4466 return next.match(matcher, i, seq); 4467 } 4468 boolean study(TreeInfo info) { 4469 return info.deterministic; 4470 } 4471 } 4472 4473 /** 4474 * Handles the branching of alternations. Note this is also used for 4475 * the ? quantifier to branch between the case where it matches once 4476 * and where it does not occur. 4477 */ 4478 static final class Branch extends Node { 4479 Node[] atoms = new Node[2]; 4480 int size = 2; 4481 Node conn; 4482 Branch(Node first, Node second, Node branchConn) { 4483 conn = branchConn; 4484 atoms[0] = first; 4485 atoms[1] = second; 4486 } 4487 4488 void add(Node node) { 4489 if (size >= atoms.length) { 4490 Node[] tmp = new Node[atoms.length*2]; 4491 System.arraycopy(atoms, 0, tmp, 0, atoms.length); 4492 atoms = tmp; 4493 } 4494 atoms[size++] = node; 4495 } 4496 4497 boolean match(Matcher matcher, int i, CharSequence seq) { 4498 for (int n = 0; n < size; n++) { 4499 if (atoms[n] == null) { 4500 if (conn.next.match(matcher, i, seq)) 4501 return true; 4502 } else if (atoms[n].match(matcher, i, seq)) { 4503 return true; 4504 } 4505 } 4506 return false; 4507 } 4508 4509 boolean study(TreeInfo info) { 4510 int minL = info.minLength; 4511 int maxL = info.maxLength; 4512 boolean maxV = info.maxValid; 4513 4514 int minL2 = Integer.MAX_VALUE; //arbitrary large enough num 4515 int maxL2 = -1; 4516 for (int n = 0; n < size; n++) { 4517 info.reset(); 4518 if (atoms[n] != null) 4519 atoms[n].study(info); 4520 minL2 = Math.min(minL2, info.minLength); 4521 maxL2 = Math.max(maxL2, info.maxLength); 4522 maxV = (maxV & info.maxValid); 4523 } 4524 4525 minL += minL2; 4526 maxL += maxL2; 4527 4528 info.reset(); 4529 conn.next.study(info); 4530 4531 info.minLength += minL; 4532 info.maxLength += maxL; 4533 info.maxValid &= maxV; 4534 info.deterministic = false; 4535 return false; 4536 } 4537 } 4538 4539 /** 4540 * The GroupHead saves the location where the group begins in the locals 4541 * and restores them when the match is done. 4542 * 4543 * The matchRef is used when a reference to this group is accessed later 4544 * in the expression. The locals will have a negative value in them to 4545 * indicate that we do not want to unset the group if the reference 4546 * doesn't match. 4547 */ 4548 static final class GroupHead extends Node { 4549 int localIndex; 4550 GroupHead(int localCount) { 4551 localIndex = localCount; 4552 } 4553 boolean match(Matcher matcher, int i, CharSequence seq) { 4554 int save = matcher.locals[localIndex]; 4555 matcher.locals[localIndex] = i; 4556 boolean ret = next.match(matcher, i, seq); 4557 matcher.locals[localIndex] = save; 4558 return ret; 4559 } 4560 boolean matchRef(Matcher matcher, int i, CharSequence seq) { 4561 int save = matcher.locals[localIndex]; 4562 matcher.locals[localIndex] = ~i; // HACK 4563 boolean ret = next.match(matcher, i, seq); 4564 matcher.locals[localIndex] = save; 4565 return ret; 4566 } 4567 } 4568 4569 /** 4570 * Recursive reference to a group in the regular expression. It calls 4571 * matchRef because if the reference fails to match we would not unset 4572 * the group. 4573 */ 4574 static final class GroupRef extends Node { 4575 GroupHead head; 4576 GroupRef(GroupHead head) { 4577 this.head = head; 4578 } 4579 boolean match(Matcher matcher, int i, CharSequence seq) { 4580 return head.matchRef(matcher, i, seq) 4581 && next.match(matcher, matcher.last, seq); 4582 } 4583 boolean study(TreeInfo info) { 4584 info.maxValid = false; 4585 info.deterministic = false; 4586 return next.study(info); 4587 } 4588 } 4589 4590 /** 4591 * The GroupTail handles the setting of group beginning and ending 4592 * locations when groups are successfully matched. It must also be able to 4593 * unset groups that have to be backed off of. 4594 * 4595 * The GroupTail node is also used when a previous group is referenced, 4596 * and in that case no group information needs to be set. 4597 */ 4598 static final class GroupTail extends Node { 4599 int localIndex; 4600 int groupIndex; 4601 GroupTail(int localCount, int groupCount) { 4602 localIndex = localCount; 4603 groupIndex = groupCount + groupCount; 4604 } 4605 boolean match(Matcher matcher, int i, CharSequence seq) { 4606 int tmp = matcher.locals[localIndex]; 4607 if (tmp >= 0) { // This is the normal group case. 4608 // Save the group so we can unset it if it 4609 // backs off of a match. 4610 int groupStart = matcher.groups[groupIndex]; 4611 int groupEnd = matcher.groups[groupIndex+1]; 4612 4613 matcher.groups[groupIndex] = tmp; 4614 matcher.groups[groupIndex+1] = i; 4615 if (next.match(matcher, i, seq)) { 4616 return true; 4617 } 4618 matcher.groups[groupIndex] = groupStart; 4619 matcher.groups[groupIndex+1] = groupEnd; 4620 return false; 4621 } else { 4622 // This is a group reference case. We don't need to save any 4623 // group info because it isn't really a group. 4624 matcher.last = i; 4625 return true; 4626 } 4627 } 4628 } 4629 4630 /** 4631 * This sets up a loop to handle a recursive quantifier structure. 4632 */ 4633 static final class Prolog extends Node { 4634 Loop loop; 4635 Prolog(Loop loop) { 4636 this.loop = loop; 4637 } 4638 boolean match(Matcher matcher, int i, CharSequence seq) { 4639 return loop.matchInit(matcher, i, seq); 4640 } 4641 boolean study(TreeInfo info) { 4642 return loop.study(info); 4643 } 4644 } 4645 4646 /** 4647 * Handles the repetition count for a greedy Curly. The matchInit 4648 * is called from the Prolog to save the index of where the group 4649 * beginning is stored. A zero length group check occurs in the 4650 * normal match but is skipped in the matchInit. 4651 */ 4652 static class Loop extends Node { 4653 Node body; 4654 int countIndex; // local count index in matcher locals 4655 int beginIndex; // group beginning index 4656 int cmin, cmax; 4657 Loop(int countIndex, int beginIndex) { 4658 this.countIndex = countIndex; 4659 this.beginIndex = beginIndex; 4660 } 4661 boolean match(Matcher matcher, int i, CharSequence seq) { 4662 // Avoid infinite loop in zero-length case. 4663 if (i > matcher.locals[beginIndex]) { 4664 int count = matcher.locals[countIndex]; 4665 4666 // This block is for before we reach the minimum 4667 // iterations required for the loop to match 4668 if (count < cmin) { 4669 matcher.locals[countIndex] = count + 1; 4670 boolean b = body.match(matcher, i, seq); 4671 // If match failed we must backtrack, so 4672 // the loop count should NOT be incremented 4673 if (!b) 4674 matcher.locals[countIndex] = count; 4675 // Return success or failure since we are under 4676 // minimum 4677 return b; 4678 } 4679 // This block is for after we have the minimum 4680 // iterations required for the loop to match 4681 if (count < cmax) { 4682 matcher.locals[countIndex] = count + 1; 4683 boolean b = body.match(matcher, i, seq); 4684 // If match failed we must backtrack, so 4685 // the loop count should NOT be incremented 4686 if (!b) 4687 matcher.locals[countIndex] = count; 4688 else 4689 return true; 4690 } 4691 } 4692 return next.match(matcher, i, seq); 4693 } 4694 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4695 int save = matcher.locals[countIndex]; 4696 boolean ret = false; 4697 if (0 < cmin) { 4698 matcher.locals[countIndex] = 1; 4699 ret = body.match(matcher, i, seq); 4700 } else if (0 < cmax) { 4701 matcher.locals[countIndex] = 1; 4702 ret = body.match(matcher, i, seq); 4703 if (ret == false) 4704 ret = next.match(matcher, i, seq); 4705 } else { 4706 ret = next.match(matcher, i, seq); 4707 } 4708 matcher.locals[countIndex] = save; 4709 return ret; 4710 } 4711 boolean study(TreeInfo info) { 4712 info.maxValid = false; 4713 info.deterministic = false; 4714 return false; 4715 } 4716 } 4717 4718 /** 4719 * Handles the repetition count for a reluctant Curly. The matchInit 4720 * is called from the Prolog to save the index of where the group 4721 * beginning is stored. A zero length group check occurs in the 4722 * normal match but is skipped in the matchInit. 4723 */ 4724 static final class LazyLoop extends Loop { 4725 LazyLoop(int countIndex, int beginIndex) { 4726 super(countIndex, beginIndex); 4727 } 4728 boolean match(Matcher matcher, int i, CharSequence seq) { 4729 // Check for zero length group 4730 if (i > matcher.locals[beginIndex]) { 4731 int count = matcher.locals[countIndex]; 4732 if (count < cmin) { 4733 matcher.locals[countIndex] = count + 1; 4734 boolean result = body.match(matcher, i, seq); 4735 // If match failed we must backtrack, so 4736 // the loop count should NOT be incremented 4737 if (!result) 4738 matcher.locals[countIndex] = count; 4739 return result; 4740 } 4741 if (next.match(matcher, i, seq)) 4742 return true; 4743 if (count < cmax) { 4744 matcher.locals[countIndex] = count + 1; 4745 boolean result = body.match(matcher, i, seq); 4746 // If match failed we must backtrack, so 4747 // the loop count should NOT be incremented 4748 if (!result) 4749 matcher.locals[countIndex] = count; 4750 return result; 4751 } 4752 return false; 4753 } 4754 return next.match(matcher, i, seq); 4755 } 4756 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4757 int save = matcher.locals[countIndex]; 4758 boolean ret = false; 4759 if (0 < cmin) { 4760 matcher.locals[countIndex] = 1; 4761 ret = body.match(matcher, i, seq); 4762 } else if (next.match(matcher, i, seq)) { 4763 ret = true; 4764 } else if (0 < cmax) { 4765 matcher.locals[countIndex] = 1; 4766 ret = body.match(matcher, i, seq); 4767 } 4768 matcher.locals[countIndex] = save; 4769 return ret; 4770 } 4771 boolean study(TreeInfo info) { 4772 info.maxValid = false; 4773 info.deterministic = false; 4774 return false; 4775 } 4776 } 4777 4778 /** 4779 * Refers to a group in the regular expression. Attempts to match 4780 * whatever the group referred to last matched. 4781 */ 4782 static class BackRef extends Node { 4783 int groupIndex; 4784 BackRef(int groupCount) { 4785 super(); 4786 groupIndex = groupCount + groupCount; 4787 } 4788 boolean match(Matcher matcher, int i, CharSequence seq) { 4789 int j = matcher.groups[groupIndex]; 4790 int k = matcher.groups[groupIndex+1]; 4791 4792 int groupSize = k - j; 4793 4794 // If the referenced group didn't match, neither can this 4795 if (j < 0) 4796 return false; 4797 4798 // If there isn't enough input left no match 4799 if (i + groupSize > matcher.to) { 4800 matcher.hitEnd = true; 4801 return false; 4802 } 4803 4804 // Check each new char to make sure it matches what the group 4805 // referenced matched last time around 4806 for (int index=0; index<groupSize; index++) 4807 if (seq.charAt(i+index) != seq.charAt(j+index)) 4808 return false; 4809 4810 return next.match(matcher, i+groupSize, seq); 4811 } 4812 boolean study(TreeInfo info) { 4813 info.maxValid = false; 4814 return next.study(info); 4815 } 4816 } 4817 4818 static class CIBackRef extends Node { 4819 int groupIndex; 4820 boolean doUnicodeCase; 4821 CIBackRef(int groupCount, boolean doUnicodeCase) { 4822 super(); 4823 groupIndex = groupCount + groupCount; 4824 this.doUnicodeCase = doUnicodeCase; 4825 } 4826 boolean match(Matcher matcher, int i, CharSequence seq) { 4827 int j = matcher.groups[groupIndex]; 4828 int k = matcher.groups[groupIndex+1]; 4829 4830 int groupSize = k - j; 4831 4832 // If the referenced group didn't match, neither can this 4833 if (j < 0) 4834 return false; 4835 4836 // If there isn't enough input left no match 4837 if (i + groupSize > matcher.to) { 4838 matcher.hitEnd = true; 4839 return false; 4840 } 4841 4842 // Check each new char to make sure it matches what the group 4843 // referenced matched last time around 4844 int x = i; 4845 for (int index=0; index<groupSize; index++) { 4846 int c1 = Character.codePointAt(seq, x); 4847 int c2 = Character.codePointAt(seq, j); 4848 if (c1 != c2) { 4849 if (doUnicodeCase) { 4850 int cc1 = Character.toUpperCase(c1); 4851 int cc2 = Character.toUpperCase(c2); 4852 if (cc1 != cc2 && 4853 Character.toLowerCase(cc1) != 4854 Character.toLowerCase(cc2)) 4855 return false; 4856 } else { 4857 if (ASCII.toLower(c1) != ASCII.toLower(c2)) 4858 return false; 4859 } 4860 } 4861 x += Character.charCount(c1); 4862 j += Character.charCount(c2); 4863 } 4864 4865 return next.match(matcher, i+groupSize, seq); 4866 } 4867 boolean study(TreeInfo info) { 4868 info.maxValid = false; 4869 return next.study(info); 4870 } 4871 } 4872 4873 /** 4874 * Searches until the next instance of its atom. This is useful for 4875 * finding the atom efficiently without passing an instance of it 4876 * (greedy problem) and without a lot of wasted search time (reluctant 4877 * problem). 4878 */ 4879 static final class First extends Node { 4880 Node atom; 4881 First(Node node) { 4882 this.atom = BnM.optimize(node); 4883 } 4884 boolean match(Matcher matcher, int i, CharSequence seq) { 4885 if (atom instanceof BnM) { 4886 return atom.match(matcher, i, seq) 4887 && next.match(matcher, matcher.last, seq); 4888 } 4889 for (;;) { 4890 if (i > matcher.to) { 4891 matcher.hitEnd = true; 4892 return false; 4893 } 4894 if (atom.match(matcher, i, seq)) { 4895 return next.match(matcher, matcher.last, seq); 4896 } 4897 i += countChars(seq, i, 1); 4898 matcher.first++; 4899 } 4900 } 4901 boolean study(TreeInfo info) { 4902 atom.study(info); 4903 info.maxValid = false; 4904 info.deterministic = false; 4905 return next.study(info); 4906 } 4907 } 4908 4909 static final class Conditional extends Node { 4910 Node cond, yes, not; 4911 Conditional(Node cond, Node yes, Node not) { 4912 this.cond = cond; 4913 this.yes = yes; 4914 this.not = not; 4915 } 4916 boolean match(Matcher matcher, int i, CharSequence seq) { 4917 if (cond.match(matcher, i, seq)) { 4918 return yes.match(matcher, i, seq); 4919 } else { 4920 return not.match(matcher, i, seq); 4921 } 4922 } 4923 boolean study(TreeInfo info) { 4924 int minL = info.minLength; 4925 int maxL = info.maxLength; 4926 boolean maxV = info.maxValid; 4927 info.reset(); 4928 yes.study(info); 4929 4930 int minL2 = info.minLength; 4931 int maxL2 = info.maxLength; 4932 boolean maxV2 = info.maxValid; 4933 info.reset(); 4934 not.study(info); 4935 4936 info.minLength = minL + Math.min(minL2, info.minLength); 4937 info.maxLength = maxL + Math.max(maxL2, info.maxLength); 4938 info.maxValid = (maxV & maxV2 & info.maxValid); 4939 info.deterministic = false; 4940 return next.study(info); 4941 } 4942 } 4943 4944 /** 4945 * Zero width positive lookahead. 4946 */ 4947 static final class Pos extends Node { 4948 Node cond; 4949 Pos(Node cond) { 4950 this.cond = cond; 4951 } 4952 boolean match(Matcher matcher, int i, CharSequence seq) { 4953 int savedTo = matcher.to; 4954 boolean conditionMatched = false; 4955 4956 // Relax transparent region boundaries for lookahead 4957 if (matcher.transparentBounds) 4958 matcher.to = matcher.getTextLength(); 4959 try { 4960 conditionMatched = cond.match(matcher, i, seq); 4961 } finally { 4962 // Reinstate region boundaries 4963 matcher.to = savedTo; 4964 } 4965 return conditionMatched && next.match(matcher, i, seq); 4966 } 4967 } 4968 4969 /** 4970 * Zero width negative lookahead. 4971 */ 4972 static final class Neg extends Node { 4973 Node cond; 4974 Neg(Node cond) { 4975 this.cond = cond; 4976 } 4977 boolean match(Matcher matcher, int i, CharSequence seq) { 4978 int savedTo = matcher.to; 4979 boolean conditionMatched = false; 4980 4981 // Relax transparent region boundaries for lookahead 4982 if (matcher.transparentBounds) 4983 matcher.to = matcher.getTextLength(); 4984 try { 4985 if (i < matcher.to) { 4986 conditionMatched = !cond.match(matcher, i, seq); 4987 } else { 4988 // If a negative lookahead succeeds then more input 4989 // could cause it to fail! 4990 matcher.requireEnd = true; 4991 conditionMatched = !cond.match(matcher, i, seq); 4992 } 4993 } finally { 4994 // Reinstate region boundaries 4995 matcher.to = savedTo; 4996 } 4997 return conditionMatched && next.match(matcher, i, seq); 4998 } 4999 } 5000 5001 /** 5002 * For use with lookbehinds; matches the position where the lookbehind 5003 * was encountered. 5004 */ 5005 static Node lookbehindEnd = new Node() { 5006 boolean match(Matcher matcher, int i, CharSequence seq) { 5007 return i == matcher.lookbehindTo; 5008 } 5009 }; 5010 5011 /** 5012 * Zero width positive lookbehind. 5013 */ 5014 static class Behind extends Node { 5015 Node cond; 5016 int rmax, rmin; 5017 Behind(Node cond, int rmax, int rmin) { 5018 this.cond = cond; 5019 this.rmax = rmax; 5020 this.rmin = rmin; 5021 } 5022 5023 boolean match(Matcher matcher, int i, CharSequence seq) { 5024 int savedFrom = matcher.from; 5025 boolean conditionMatched = false; 5026 int startIndex = (!matcher.transparentBounds) ? 5027 matcher.from : 0; 5028 int from = Math.max(i - rmax, startIndex); 5029 // Set end boundary 5030 int savedLBT = matcher.lookbehindTo; 5031 matcher.lookbehindTo = i; 5032 // Relax transparent region boundaries for lookbehind 5033 if (matcher.transparentBounds) 5034 matcher.from = 0; 5035 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5036 conditionMatched = cond.match(matcher, j, seq); 5037 } 5038 matcher.from = savedFrom; 5039 matcher.lookbehindTo = savedLBT; 5040 return conditionMatched && next.match(matcher, i, seq); 5041 } 5042 } 5043 5044 /** 5045 * Zero width positive lookbehind, including supplementary 5046 * characters or unpaired surrogates. 5047 */ 5048 static final class BehindS extends Behind { 5049 BehindS(Node cond, int rmax, int rmin) { 5050 super(cond, rmax, rmin); 5051 } 5052 boolean match(Matcher matcher, int i, CharSequence seq) { 5053 int rmaxChars = countChars(seq, i, -rmax); 5054 int rminChars = countChars(seq, i, -rmin); 5055 int savedFrom = matcher.from; 5056 int startIndex = (!matcher.transparentBounds) ? 5057 matcher.from : 0; 5058 boolean conditionMatched = false; 5059 int from = Math.max(i - rmaxChars, startIndex); 5060 // Set end boundary 5061 int savedLBT = matcher.lookbehindTo; 5062 matcher.lookbehindTo = i; 5063 // Relax transparent region boundaries for lookbehind 5064 if (matcher.transparentBounds) 5065 matcher.from = 0; 5066 5067 for (int j = i - rminChars; 5068 !conditionMatched && j >= from; 5069 j -= j>from ? countChars(seq, j, -1) : 1) { 5070 conditionMatched = cond.match(matcher, j, seq); 5071 } 5072 matcher.from = savedFrom; 5073 matcher.lookbehindTo = savedLBT; 5074 return conditionMatched && next.match(matcher, i, seq); 5075 } 5076 } 5077 5078 /** 5079 * Zero width negative lookbehind. 5080 */ 5081 static class NotBehind extends Node { 5082 Node cond; 5083 int rmax, rmin; 5084 NotBehind(Node cond, int rmax, int rmin) { 5085 this.cond = cond; 5086 this.rmax = rmax; 5087 this.rmin = rmin; 5088 } 5089 5090 boolean match(Matcher matcher, int i, CharSequence seq) { 5091 int savedLBT = matcher.lookbehindTo; 5092 int savedFrom = matcher.from; 5093 boolean conditionMatched = false; 5094 int startIndex = (!matcher.transparentBounds) ? 5095 matcher.from : 0; 5096 int from = Math.max(i - rmax, startIndex); 5097 matcher.lookbehindTo = i; 5098 // Relax transparent region boundaries for lookbehind 5099 if (matcher.transparentBounds) 5100 matcher.from = 0; 5101 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5102 conditionMatched = cond.match(matcher, j, seq); 5103 } 5104 // Reinstate region boundaries 5105 matcher.from = savedFrom; 5106 matcher.lookbehindTo = savedLBT; 5107 return !conditionMatched && next.match(matcher, i, seq); 5108 } 5109 } 5110 5111 /** 5112 * Zero width negative lookbehind, including supplementary 5113 * characters or unpaired surrogates. 5114 */ 5115 static final class NotBehindS extends NotBehind { 5116 NotBehindS(Node cond, int rmax, int rmin) { 5117 super(cond, rmax, rmin); 5118 } 5119 boolean match(Matcher matcher, int i, CharSequence seq) { 5120 int rmaxChars = countChars(seq, i, -rmax); 5121 int rminChars = countChars(seq, i, -rmin); 5122 int savedFrom = matcher.from; 5123 int savedLBT = matcher.lookbehindTo; 5124 boolean conditionMatched = false; 5125 int startIndex = (!matcher.transparentBounds) ? 5126 matcher.from : 0; 5127 int from = Math.max(i - rmaxChars, startIndex); 5128 matcher.lookbehindTo = i; 5129 // Relax transparent region boundaries for lookbehind 5130 if (matcher.transparentBounds) 5131 matcher.from = 0; 5132 for (int j = i - rminChars; 5133 !conditionMatched && j >= from; 5134 j -= j>from ? countChars(seq, j, -1) : 1) { 5135 conditionMatched = cond.match(matcher, j, seq); 5136 } 5137 //Reinstate region boundaries 5138 matcher.from = savedFrom; 5139 matcher.lookbehindTo = savedLBT; 5140 return !conditionMatched && next.match(matcher, i, seq); 5141 } 5142 } 5143 5144 /** 5145 * Returns the set union of two CharProperty nodes. 5146 */ 5147 private static CharProperty union(final CharProperty lhs, 5148 final CharProperty rhs) { 5149 return new CharProperty() { 5150 boolean isSatisfiedBy(int ch) { 5151 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; 5152 } 5153 5154 /** 5155 * Returns the set intersection of two CharProperty nodes. 5156 */ 5157 private static CharProperty intersection(final CharProperty lhs, 5158 final CharProperty rhs) { 5159 return new CharProperty() { 5160 boolean isSatisfiedBy(int ch) { 5161 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; 5162 } 5163 5164 /** 5165 * Returns the set difference of two CharProperty nodes. 5166 */ 5167 private static CharProperty setDifference(final CharProperty lhs, 5168 final CharProperty rhs) { 5169 return new CharProperty() { 5170 boolean isSatisfiedBy(int ch) { 5171 return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}}; 5172 } 5173 5174 /** 5175 * Handles word boundaries. Includes a field to allow this one class to 5176 * deal with the different types of word boundaries we can match. The word 5177 * characters include underscores, letters, and digits. Non spacing marks 5178 * can are also part of a word if they have a base character, otherwise 5179 * they are ignored for purposes of finding word boundaries. 5180 */ 5181 static final class Bound extends Node { 5182 static int LEFT = 0x1; 5183 static int RIGHT= 0x2; 5184 static int BOTH = 0x3; 5185 static int NONE = 0x4; 5186 int type; 5187 boolean useUWORD; 5188 Bound(int n, boolean useUWORD) { 5189 type = n; 5190 this.useUWORD = useUWORD; 5191 } 5192 5193 boolean isWord(int ch) { 5194 return useUWORD ? UnicodeProp.WORD.is(ch) 5195 : (ch == '_' || Character.isLetterOrDigit(ch)); 5196 } 5197 5198 int check(Matcher matcher, int i, CharSequence seq) { 5199 int ch; 5200 boolean left = false; 5201 int startIndex = matcher.from; 5202 int endIndex = matcher.to; 5203 if (matcher.transparentBounds) { 5204 startIndex = 0; 5205 endIndex = matcher.getTextLength(); 5206 } 5207 if (i > startIndex) { 5208 ch = Character.codePointBefore(seq, i); 5209 left = (isWord(ch) || 5210 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5211 && hasBaseCharacter(matcher, i-1, seq))); 5212 } 5213 boolean right = false; 5214 if (i < endIndex) { 5215 ch = Character.codePointAt(seq, i); 5216 right = (isWord(ch) || 5217 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5218 && hasBaseCharacter(matcher, i, seq))); 5219 } else { 5220 // Tried to access char past the end 5221 matcher.hitEnd = true; 5222 // The addition of another char could wreck a boundary 5223 matcher.requireEnd = true; 5224 } 5225 return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE); 5226 } 5227 boolean match(Matcher matcher, int i, CharSequence seq) { 5228 return (check(matcher, i, seq) & type) > 0 5229 && next.match(matcher, i, seq); 5230 } 5231 } 5232 5233 /** 5234 * Non spacing marks only count as word characters in bounds calculations 5235 * if they have a base character. 5236 */ 5237 private static boolean hasBaseCharacter(Matcher matcher, int i, 5238 CharSequence seq) 5239 { 5240 int start = (!matcher.transparentBounds) ? 5241 matcher.from : 0; 5242 for (int x=i; x >= start; x--) { 5243 int ch = Character.codePointAt(seq, x); 5244 if (Character.isLetterOrDigit(ch)) 5245 return true; 5246 if (Character.getType(ch) == Character.NON_SPACING_MARK) 5247 continue; 5248 return false; 5249 } 5250 return false; 5251 } 5252 5253 /** 5254 * Attempts to match a slice in the input using the Boyer-Moore string 5255 * matching algorithm. The algorithm is based on the idea that the 5256 * pattern can be shifted farther ahead in the search text if it is 5257 * matched right to left. 5258 * <p> 5259 * The pattern is compared to the input one character at a time, from 5260 * the rightmost character in the pattern to the left. If the characters 5261 * all match the pattern has been found. If a character does not match, 5262 * the pattern is shifted right a distance that is the maximum of two 5263 * functions, the bad character shift and the good suffix shift. This 5264 * shift moves the attempted match position through the input more 5265 * quickly than a naive one position at a time check. 5266 * <p> 5267 * The bad character shift is based on the character from the text that 5268 * did not match. If the character does not appear in the pattern, the 5269 * pattern can be shifted completely beyond the bad character. If the 5270 * character does occur in the pattern, the pattern can be shifted to 5271 * line the pattern up with the next occurrence of that character. 5272 * <p> 5273 * The good suffix shift is based on the idea that some subset on the right 5274 * side of the pattern has matched. When a bad character is found, the 5275 * pattern can be shifted right by the pattern length if the subset does 5276 * not occur again in pattern, or by the amount of distance to the 5277 * next occurrence of the subset in the pattern. 5278 * 5279 * Boyer-Moore search methods adapted from code by Amy Yu. 5280 */ 5281 static class BnM extends Node { 5282 int[] buffer; 5283 int[] lastOcc; 5284 int[] optoSft; 5285 5286 /** 5287 * Pre calculates arrays needed to generate the bad character 5288 * shift and the good suffix shift. Only the last seven bits 5289 * are used to see if chars match; This keeps the tables small 5290 * and covers the heavily used ASCII range, but occasionally 5291 * results in an aliased match for the bad character shift. 5292 */ 5293 static Node optimize(Node node) { 5294 if (!(node instanceof Slice)) { 5295 return node; 5296 } 5297 5298 int[] src = ((Slice) node).buffer; 5299 int patternLength = src.length; 5300 // The BM algorithm requires a bit of overhead; 5301 // If the pattern is short don't use it, since 5302 // a shift larger than the pattern length cannot 5303 // be used anyway. 5304 if (patternLength < 4) { 5305 return node; 5306 } 5307 int i, j, k; 5308 int[] lastOcc = new int[128]; 5309 int[] optoSft = new int[patternLength]; 5310 // Precalculate part of the bad character shift 5311 // It is a table for where in the pattern each 5312 // lower 7-bit value occurs 5313 for (i = 0; i < patternLength; i++) { 5314 lastOcc[src[i]&0x7F] = i + 1; 5315 } 5316 // Precalculate the good suffix shift 5317 // i is the shift amount being considered 5318 NEXT: for (i = patternLength; i > 0; i--) { 5319 // j is the beginning index of suffix being considered 5320 for (j = patternLength - 1; j >= i; j--) { 5321 // Testing for good suffix 5322 if (src[j] == src[j-i]) { 5323 // src[j..len] is a good suffix 5324 optoSft[j-1] = i; 5325 } else { 5326 // No match. The array has already been 5327 // filled up with correct values before. 5328 continue NEXT; 5329 } 5330 } 5331 // This fills up the remaining of optoSft 5332 // any suffix can not have larger shift amount 5333 // then its sub-suffix. Why??? 5334 while (j > 0) { 5335 optoSft[--j] = i; 5336 } 5337 } 5338 // Set the guard value because of unicode compression 5339 optoSft[patternLength-1] = 1; 5340 if (node instanceof SliceS) 5341 return new BnMS(src, lastOcc, optoSft, node.next); 5342 return new BnM(src, lastOcc, optoSft, node.next); 5343 } 5344 BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5345 this.buffer = src; 5346 this.lastOcc = lastOcc; 5347 this.optoSft = optoSft; 5348 this.next = next; 5349 } 5350 boolean match(Matcher matcher, int i, CharSequence seq) { 5351 int[] src = buffer; 5352 int patternLength = src.length; 5353 int last = matcher.to - patternLength; 5354 5355 // Loop over all possible match positions in text 5356 NEXT: while (i <= last) { 5357 // Loop over pattern from right to left 5358 for (int j = patternLength - 1; j >= 0; j--) { 5359 int ch = seq.charAt(i+j); 5360 if (ch != src[j]) { 5361 // Shift search to the right by the maximum of the 5362 // bad character shift and the good suffix shift 5363 i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]); 5364 continue NEXT; 5365 } 5366 } 5367 // Entire pattern matched starting at i 5368 matcher.first = i; 5369 boolean ret = next.match(matcher, i + patternLength, seq); 5370 if (ret) { 5371 matcher.first = i; 5372 matcher.groups[0] = matcher.first; 5373 matcher.groups[1] = matcher.last; 5374 return true; 5375 } 5376 i++; 5377 } 5378 // BnM is only used as the leading node in the unanchored case, 5379 // and it replaced its Start() which always searches to the end 5380 // if it doesn't find what it's looking for, so hitEnd is true. 5381 matcher.hitEnd = true; 5382 return false; 5383 } 5384 boolean study(TreeInfo info) { 5385 info.minLength += buffer.length; 5386 info.maxValid = false; 5387 return next.study(info); 5388 } 5389 } 5390 5391 /** 5392 * Supplementary support version of BnM(). Unpaired surrogates are 5393 * also handled by this class. 5394 */ 5395 static final class BnMS extends BnM { 5396 int lengthInChars; 5397 5398 BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5399 super(src, lastOcc, optoSft, next); 5400 for (int x = 0; x < buffer.length; x++) { 5401 lengthInChars += Character.charCount(buffer[x]); 5402 } 5403 } 5404 boolean match(Matcher matcher, int i, CharSequence seq) { 5405 int[] src = buffer; 5406 int patternLength = src.length; 5407 int last = matcher.to - lengthInChars; 5408 5409 // Loop over all possible match positions in text 5410 NEXT: while (i <= last) { 5411 // Loop over pattern from right to left 5412 int ch; 5413 for (int j = countChars(seq, i, patternLength), x = patternLength - 1; 5414 j > 0; j -= Character.charCount(ch), x--) { 5415 ch = Character.codePointBefore(seq, i+j); 5416 if (ch != src[x]) { 5417 // Shift search to the right by the maximum of the 5418 // bad character shift and the good suffix shift 5419 int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]); 5420 i += countChars(seq, i, n); 5421 continue NEXT; 5422 } 5423 } 5424 // Entire pattern matched starting at i 5425 matcher.first = i; 5426 boolean ret = next.match(matcher, i + lengthInChars, seq); 5427 if (ret) { 5428 matcher.first = i; 5429 matcher.groups[0] = matcher.first; 5430 matcher.groups[1] = matcher.last; 5431 return true; 5432 } 5433 i += countChars(seq, i, 1); 5434 } 5435 matcher.hitEnd = true; 5436 return false; 5437 } 5438 } 5439 5440 /////////////////////////////////////////////////////////////////////////////// 5441 /////////////////////////////////////////////////////////////////////////////// 5442 5443 /** 5444 * This must be the very first initializer. 5445 */ 5446 static Node accept = new Node(); 5447 5448 static Node lastAccept = new LastNode(); 5449 5450 private static class CharPropertyNames { 5451 5452 static CharProperty charPropertyFor(String name) { 5453 CharPropertyFactory m = map.get(name); 5454 return m == null ? null : m.make(); 5455 } 5456 5457 private static abstract class CharPropertyFactory { 5458 abstract CharProperty make(); 5459 } 5460 5461 private static void defCategory(String name, 5462 final int typeMask) { 5463 map.put(name, new CharPropertyFactory() { 5464 CharProperty make() { return new Category(typeMask);}}); 5465 } 5466 5467 private static void defRange(String name, 5468 final int lower, final int upper) { 5469 map.put(name, new CharPropertyFactory() { 5470 CharProperty make() { return rangeFor(lower, upper);}}); 5471 } 5472 5473 private static void defCtype(String name, 5474 final int ctype) { 5475 map.put(name, new CharPropertyFactory() { 5476 CharProperty make() { return new Ctype(ctype);}}); 5477 } 5478 5479 private static abstract class CloneableProperty 5480 extends CharProperty implements Cloneable 5481 { 5482 public CloneableProperty clone() { 5483 try { 5484 return (CloneableProperty) super.clone(); 5485 } catch (CloneNotSupportedException e) { 5486 throw new AssertionError(e); 5487 } 5488 } 5489 } 5490 5491 private static void defClone(String name, 5492 final CloneableProperty p) { 5493 map.put(name, new CharPropertyFactory() { 5494 CharProperty make() { return p.clone();}}); 5495 } 5496 5497 private static final HashMap<String, CharPropertyFactory> map 5498 = new HashMap<>(); 5499 5500 static { 5501 // Unicode character property aliases, defined in 5502 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt 5503 defCategory("Cn", 1<<Character.UNASSIGNED); 5504 defCategory("Lu", 1<<Character.UPPERCASE_LETTER); 5505 defCategory("Ll", 1<<Character.LOWERCASE_LETTER); 5506 defCategory("Lt", 1<<Character.TITLECASE_LETTER); 5507 defCategory("Lm", 1<<Character.MODIFIER_LETTER); 5508 defCategory("Lo", 1<<Character.OTHER_LETTER); 5509 defCategory("Mn", 1<<Character.NON_SPACING_MARK); 5510 defCategory("Me", 1<<Character.ENCLOSING_MARK); 5511 defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK); 5512 defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER); 5513 defCategory("Nl", 1<<Character.LETTER_NUMBER); 5514 defCategory("No", 1<<Character.OTHER_NUMBER); 5515 defCategory("Zs", 1<<Character.SPACE_SEPARATOR); 5516 defCategory("Zl", 1<<Character.LINE_SEPARATOR); 5517 defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR); 5518 defCategory("Cc", 1<<Character.CONTROL); 5519 defCategory("Cf", 1<<Character.FORMAT); 5520 defCategory("Co", 1<<Character.PRIVATE_USE); 5521 defCategory("Cs", 1<<Character.SURROGATE); 5522 defCategory("Pd", 1<<Character.DASH_PUNCTUATION); 5523 defCategory("Ps", 1<<Character.START_PUNCTUATION); 5524 defCategory("Pe", 1<<Character.END_PUNCTUATION); 5525 defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION); 5526 defCategory("Po", 1<<Character.OTHER_PUNCTUATION); 5527 defCategory("Sm", 1<<Character.MATH_SYMBOL); 5528 defCategory("Sc", 1<<Character.CURRENCY_SYMBOL); 5529 defCategory("Sk", 1<<Character.MODIFIER_SYMBOL); 5530 defCategory("So", 1<<Character.OTHER_SYMBOL); 5531 defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION); 5532 defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION); 5533 defCategory("L", ((1<<Character.UPPERCASE_LETTER) | 5534 (1<<Character.LOWERCASE_LETTER) | 5535 (1<<Character.TITLECASE_LETTER) | 5536 (1<<Character.MODIFIER_LETTER) | 5537 (1<<Character.OTHER_LETTER))); 5538 defCategory("M", ((1<<Character.NON_SPACING_MARK) | 5539 (1<<Character.ENCLOSING_MARK) | 5540 (1<<Character.COMBINING_SPACING_MARK))); 5541 defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) | 5542 (1<<Character.LETTER_NUMBER) | 5543 (1<<Character.OTHER_NUMBER))); 5544 defCategory("Z", ((1<<Character.SPACE_SEPARATOR) | 5545 (1<<Character.LINE_SEPARATOR) | 5546 (1<<Character.PARAGRAPH_SEPARATOR))); 5547 defCategory("C", ((1<<Character.CONTROL) | 5548 (1<<Character.FORMAT) | 5549 (1<<Character.PRIVATE_USE) | 5550 (1<<Character.SURROGATE))); // Other 5551 defCategory("P", ((1<<Character.DASH_PUNCTUATION) | 5552 (1<<Character.START_PUNCTUATION) | 5553 (1<<Character.END_PUNCTUATION) | 5554 (1<<Character.CONNECTOR_PUNCTUATION) | 5555 (1<<Character.OTHER_PUNCTUATION) | 5556 (1<<Character.INITIAL_QUOTE_PUNCTUATION) | 5557 (1<<Character.FINAL_QUOTE_PUNCTUATION))); 5558 defCategory("S", ((1<<Character.MATH_SYMBOL) | 5559 (1<<Character.CURRENCY_SYMBOL) | 5560 (1<<Character.MODIFIER_SYMBOL) | 5561 (1<<Character.OTHER_SYMBOL))); 5562 defCategory("LC", ((1<<Character.UPPERCASE_LETTER) | 5563 (1<<Character.LOWERCASE_LETTER) | 5564 (1<<Character.TITLECASE_LETTER))); 5565 defCategory("LD", ((1<<Character.UPPERCASE_LETTER) | 5566 (1<<Character.LOWERCASE_LETTER) | 5567 (1<<Character.TITLECASE_LETTER) | 5568 (1<<Character.MODIFIER_LETTER) | 5569 (1<<Character.OTHER_LETTER) | 5570 (1<<Character.DECIMAL_DIGIT_NUMBER))); 5571 defRange("L1", 0x00, 0xFF); // Latin-1 5572 map.put("all", new CharPropertyFactory() { 5573 CharProperty make() { return new All(); }}); 5574 5575 // Posix regular expression character classes, defined in 5576 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html 5577 defRange("ASCII", 0x00, 0x7F); // ASCII 5578 defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters 5579 defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters 5580 defCtype("Blank", ASCII.BLANK); // Space and tab characters 5581 defCtype("Cntrl", ASCII.CNTRL); // Control characters 5582 defRange("Digit", '0', '9'); // Numeric characters 5583 defCtype("Graph", ASCII.GRAPH); // printable and visible 5584 defRange("Lower", 'a', 'z'); // Lower-case alphabetic 5585 defRange("Print", 0x20, 0x7E); // Printable characters 5586 defCtype("Punct", ASCII.PUNCT); // Punctuation characters 5587 defCtype("Space", ASCII.SPACE); // Space characters 5588 defRange("Upper", 'A', 'Z'); // Upper-case alphabetic 5589 defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits 5590 5591 // Java character properties, defined by methods in Character.java 5592 defClone("javaLowerCase", new CloneableProperty() { 5593 boolean isSatisfiedBy(int ch) { 5594 return Character.isLowerCase(ch);}}); 5595 defClone("javaUpperCase", new CloneableProperty() { 5596 boolean isSatisfiedBy(int ch) { 5597 return Character.isUpperCase(ch);}}); 5598 defClone("javaAlphabetic", new CloneableProperty() { 5599 boolean isSatisfiedBy(int ch) { 5600 return Character.isAlphabetic(ch);}}); 5601 defClone("javaIdeographic", new CloneableProperty() { 5602 boolean isSatisfiedBy(int ch) { 5603 return Character.isIdeographic(ch);}}); 5604 defClone("javaTitleCase", new CloneableProperty() { 5605 boolean isSatisfiedBy(int ch) { 5606 return Character.isTitleCase(ch);}}); 5607 defClone("javaDigit", new CloneableProperty() { 5608 boolean isSatisfiedBy(int ch) { 5609 return Character.isDigit(ch);}}); 5610 defClone("javaDefined", new CloneableProperty() { 5611 boolean isSatisfiedBy(int ch) { 5612 return Character.isDefined(ch);}}); 5613 defClone("javaLetter", new CloneableProperty() { 5614 boolean isSatisfiedBy(int ch) { 5615 return Character.isLetter(ch);}}); 5616 defClone("javaLetterOrDigit", new CloneableProperty() { 5617 boolean isSatisfiedBy(int ch) { 5618 return Character.isLetterOrDigit(ch);}}); 5619 defClone("javaJavaIdentifierStart", new CloneableProperty() { 5620 boolean isSatisfiedBy(int ch) { 5621 return Character.isJavaIdentifierStart(ch);}}); 5622 defClone("javaJavaIdentifierPart", new CloneableProperty() { 5623 boolean isSatisfiedBy(int ch) { 5624 return Character.isJavaIdentifierPart(ch);}}); 5625 defClone("javaUnicodeIdentifierStart", new CloneableProperty() { 5626 boolean isSatisfiedBy(int ch) { 5627 return Character.isUnicodeIdentifierStart(ch);}}); 5628 defClone("javaUnicodeIdentifierPart", new CloneableProperty() { 5629 boolean isSatisfiedBy(int ch) { 5630 return Character.isUnicodeIdentifierPart(ch);}}); 5631 defClone("javaIdentifierIgnorable", new CloneableProperty() { 5632 boolean isSatisfiedBy(int ch) { 5633 return Character.isIdentifierIgnorable(ch);}}); 5634 defClone("javaSpaceChar", new CloneableProperty() { 5635 boolean isSatisfiedBy(int ch) { 5636 return Character.isSpaceChar(ch);}}); 5637 defClone("javaWhitespace", new CloneableProperty() { 5638 boolean isSatisfiedBy(int ch) { 5639 return Character.isWhitespace(ch);}}); 5640 defClone("javaISOControl", new CloneableProperty() { 5641 boolean isSatisfiedBy(int ch) { 5642 return Character.isISOControl(ch);}}); 5643 defClone("javaMirrored", new CloneableProperty() { 5644 boolean isSatisfiedBy(int ch) { 5645 return Character.isMirrored(ch);}}); 5646 } 5647 } 5648 }