1 /* 2 * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.util.regex; 27 28 import java.text.Normalizer; 29 import java.text.Normalizer.Form; 30 import java.util.Locale; 31 import java.util.Iterator; 32 import java.util.Map; 33 import java.util.ArrayList; 34 import java.util.HashMap; 35 import java.util.LinkedHashSet; 36 import java.util.List; 37 import java.util.Set; 38 import java.util.Arrays; 39 import java.util.NoSuchElementException; 40 import java.util.Spliterator; 41 import java.util.Spliterators; 42 import java.util.function.Predicate; 43 import java.util.stream.Stream; 44 import java.util.stream.StreamSupport; 45 46 47 /** 48 * A compiled representation of a regular expression. 49 * 50 * <p> A regular expression, specified as a string, must first be compiled into 51 * an instance of this class. The resulting pattern can then be used to create 52 * a {@link Matcher} object that can match arbitrary {@linkplain 53 * java.lang.CharSequence character sequences} against the regular 54 * expression. All of the state involved in performing a match resides in the 55 * matcher, so many matchers can share the same pattern. 56 * 57 * <p> A typical invocation sequence is thus 58 * 59 * <blockquote><pre> 60 * Pattern p = Pattern.{@link #compile compile}("a*b"); 61 * Matcher m = p.{@link #matcher matcher}("aaaaab"); 62 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote> 63 * 64 * <p> A {@link #matches matches} method is defined by this class as a 65 * convenience for when a regular expression is used just once. This method 66 * compiles an expression and matches an input sequence against it in a single 67 * invocation. The statement 68 * 69 * <blockquote><pre> 70 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote> 71 * 72 * is equivalent to the three statements above, though for repeated matches it 73 * is less efficient since it does not allow the compiled pattern to be reused. 74 * 75 * <p> Instances of this class are immutable and are safe for use by multiple 76 * concurrent threads. Instances of the {@link Matcher} class are not safe for 77 * such use. 78 * 79 * 80 * <h3><a id="sum">Summary of regular-expression constructs</a></h3> 81 * 82 * <table class="borderless"> 83 * <caption style="display:none">Regular expression constructs, and what they match</caption> 84 * <thead style="text-align:left"> 85 * <tr> 86 * <th id="construct">Construct</th> 87 * <th id="matches">Matches</th> 88 * </tr> 89 * </thead> 90 * <tbody style="text-align:left"> 91 * 92 * <tr><th colspan="2" style="padding-top:20px" id="characters">Characters</th></tr> 93 * 94 * <tr><th style="vertical-align:top; font-weight: normal" id="x"><i>x</i></th> 95 * <td headers="matches characters x">The character <i>x</i></td></tr> 96 * <tr><th style="vertical-align:top; font-weight: normal" id="backslash">{@code \\}</th> 97 * <td headers="matches characters backslash">The backslash character</td></tr> 98 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_n">{@code \0}<i>n</i></th> 99 * <td headers="matches characters octal_n">The character with octal value {@code 0}<i>n</i> 100 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 101 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nn">{@code \0}<i>nn</i></th> 102 * <td headers="matches characters octal_nn">The character with octal value {@code 0}<i>nn</i> 103 * (0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 104 * <tr><th style="vertical-align:top; font-weight: normal" id="octal_nnn">{@code \0}<i>mnn</i></th> 105 * <td headers="matches characters octal_nnn">The character with octal value {@code 0}<i>mnn</i> 106 * (0 {@code <=} <i>m</i> {@code <=} 3, 107 * 0 {@code <=} <i>n</i> {@code <=} 7)</td></tr> 108 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hh">{@code \x}<i>hh</i></th> 109 * <td headers="matches characters hex_hh">The character with hexadecimal value {@code 0x}<i>hh</i></td></tr> 110 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_hhhh"><code>\u</code><i>hhhh</i></th> 111 * <td headers="matches characters hex_hhhh">The character with hexadecimal value {@code 0x}<i>hhhh</i></td></tr> 112 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>\x</code><i>{h...h}</i></th> 113 * <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i> 114 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT} 115 * <= {@code 0x}<i>h...h</i> <= 116 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr> 117 * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>\N{</code><i>name</i><code>}</code></th> 118 * <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr> 119 * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th> 120 * <td headers="matches characters tab">The tab character (<code>'\u0009'</code>)</td></tr> 121 * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th> 122 * <td headers="matches characters newline">The newline (line feed) character (<code>'\u000A'</code>)</td></tr> 123 * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th> 124 * <td headers="matches characters return">The carriage-return character (<code>'\u000D'</code>)</td></tr> 125 * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th> 126 * <td headers="matches characters form_feed">The form-feed character (<code>'\u000C'</code>)</td></tr> 127 * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th> 128 * <td headers="matches characters bell">The alert (bell) character (<code>'\u0007'</code>)</td></tr> 129 * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th> 130 * <td headers="matches characters escape">The escape character (<code>'\u001B'</code>)</td></tr> 131 * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th> 132 * <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i></td></tr> 133 * 134 * <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr> 135 * 136 * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th> 137 * <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr> 138 * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th> 139 * <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr> 140 * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th> 141 * <td headers="matches classes range">{@code a} through {@code z} 142 * or {@code A} through {@code Z}, inclusive (range)</td></tr> 143 * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th> 144 * <td headers="matches classes union">{@code a} through {@code d}, 145 * or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr> 146 * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th> 147 * <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</tr> 148 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th> 149 * <td headers="matches classes subtraction1">{@code a} through {@code z}, 150 * except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr> 151 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th> 152 * <td headers="matches classes subtraction2">{@code a} through {@code z}, 153 * and not {@code m} through {@code p}: {@code [a-lq-z]}(subtraction)</td></tr> 154 * 155 * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr> 156 * 157 * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th> 158 * <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr> 159 * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th> 160 * <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr> 161 * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th> 162 * <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr> 163 * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th> 164 * <td headers="matches predef horiz_white">A horizontal whitespace character: 165 * <code>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</code></td></tr> 166 * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th> 167 * <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr> 168 * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th> 169 * <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr> 170 * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th> 171 * <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr> 172 * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th> 173 * <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85\u2028\u2029]</code> 174 * </td></tr> 175 * <tr><th style="vertical-align:top; font-weight:normal" id="non_vert_white">{@code \V}</th> 176 * <td headers="matches predef non_vert_white">A non-vertical whitespace character: {@code [^\v]}</td></tr> 177 * <tr><th style="vertical-align:top; font-weight:normal" id="word">{@code \w}</th> 178 * <td headers="matches predef word">A word character: {@code [a-zA-Z_0-9]}</td></tr> 179 * <tr><th style="vertical-align:top; font-weight:normal" id="non_word">{@code \W}</th> 180 * <td headers="matches predef non_word">A non-word character: {@code [^\w]}</td></tr> 181 * 182 * <tr><th colspan="2" style="padding-top:20px" id="posix"><b>POSIX character classes (US-ASCII only)</b></th></tr> 183 * 184 * <tr><th style="vertical-align:top; font-weight:normal" id="Lower">{@code \p{Lower}}</th> 185 * <td headers="matches posix Lower">A lower-case alphabetic character: {@code [a-z]}</td></tr> 186 * <tr><th style="vertical-align:top; font-weight:normal" id="Upper">{@code \p{Upper}}</th> 187 * <td headers="matches posix Upper">An upper-case alphabetic character:{@code [A-Z]}</td></tr> 188 * <tr><th style="vertical-align:top; font-weight:normal" id="ASCII">{@code \p{ASCII}}</th> 189 * <td headers="matches posix ASCII">All ASCII:{@code [\x00-\x7F]}</td></tr> 190 * <tr><th style="vertical-align:top; font-weight:normal" id="Alpha">{@code \p{Alpha}}</th> 191 * <td headers="matches posix Alpha">An alphabetic character:{@code [\p{Lower}\p{Upper}]}</td></tr> 192 * <tr><th style="vertical-align:top; font-weight:normal" id="Digit">{@code \p{Digit}}</th> 193 * <td headers="matches posix Digit">A decimal digit: {@code [0-9]}</td></tr> 194 * <tr><th style="vertical-align:top; font-weight:normal" id="Alnum">{@code \p{Alnum}}</th> 195 * <td headers="matches posix Alnum">An alphanumeric character:{@code [\p{Alpha}\p{Digit}]}</td></tr> 196 * <tr><th style="vertical-align:top; font-weight:normal" id="Punct">{@code \p{Punct}}</th> 197 * <td headers="matches posix Punct">Punctuation: One of {@code !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~}</td></tr> 198 * <!-- {@code [\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]} 199 * {@code [\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]} --> 200 * <tr><th style="vertical-align:top; font-weight:normal" id="Graph">{@code \p{Graph}}</th> 201 * <td headers="matches posix Graph">A visible character: {@code [\p{Alnum}\p{Punct}]}</td></tr> 202 * <tr><th style="vertical-align:top; font-weight:normal" id="Print">{@code \p{Print}}</th> 203 * <td headers="matches posix Print">A printable character: {@code [\p{Graph}\x20]}</td></tr> 204 * <tr><th style="vertical-align:top; font-weight:normal" id="Blank">{@code \p{Blank}}</th> 205 * <td headers="matches posix Blank">A space or a tab: {@code [ \t]}</td></tr> 206 * <tr><th style="vertical-align:top; font-weight:normal" id="Cntrl">{@code \p{Cntrl}}</th> 207 * <td headers="matches posix Cntrl">A control character: {@code [\x00-\x1F\x7F]}</td></tr> 208 * <tr><th style="vertical-align:top; font-weight:normal" id="XDigit">{@code \p{XDigit}}</th> 209 * <td headers="matches posix XDigit">A hexadecimal digit: {@code [0-9a-fA-F]}</td></tr> 210 * <tr><th style="vertical-align:top; font-weight:normal" id="Space">{@code \p{Space}}</th> 211 * <td headers="matches posix Space">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr> 212 * 213 * <tr><th colspan="2" style="padding-top:20px" id="java">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr> 214 * 215 * <tr><th style="vertical-align:top; font-weight:normal" id="javaLowerCase">{@code \p{javaLowerCase}}</th> 216 * <td headers="matches java javaLowerCase">Equivalent to java.lang.Character.isLowerCase()</td></tr> 217 * <tr><th style="vertical-align:top; font-weight:normal" id="javaUpperCase">{@code \p{javaUpperCase}}</th> 218 * <td headers="matches java javaUpperCase">Equivalent to java.lang.Character.isUpperCase()</td></tr> 219 * <tr><th style="vertical-align:top; font-weight:normal" id="javaWhitespace">{@code \p{javaWhitespace}}</th> 220 * <td headers="matches java javaWhitespace">Equivalent to java.lang.Character.isWhitespace()</td></tr> 221 * <tr><th style="vertical-align:top; font-weight:normal" id="javaMirrored">{@code \p{javaMirrored}}</th> 222 * <td headers="matches java javaMirrored">Equivalent to java.lang.Character.isMirrored()</td></tr> 223 * 224 * <tr><th colspan="2" style="padding-top:20px" id="unicode">Classes for Unicode scripts, blocks, categories and binary properties</th></tr> 225 * 226 * <tr><th style="vertical-align:top; font-weight:normal" id="IsLatin">{@code \p{IsLatin}}</th> 227 * <td headers="matches unicode IsLatin">A Latin script character (<a href="#usc">script</a>)</td></tr> 228 * <tr><th style="vertical-align:top; font-weight:normal" id="InGreek">{@code \p{InGreek}}</th> 229 * <td headers="matches unicode InGreek">A character in the Greek block (<a href="#ubc">block</a>)</td></tr> 230 * <tr><th style="vertical-align:top; font-weight:normal" id="Lu">{@code \p{Lu}}</th> 231 * <td headers="matches unicode Lu">An uppercase letter (<a href="#ucc">category</a>)</td></tr> 232 * <tr><th style="vertical-align:top; font-weight:normal" id="IsAlphabetic">{@code \p{IsAlphabetic}}</th> 233 * <td headers="matches unicode IsAlphabetic">An alphabetic character (<a href="#ubpc">binary property</a>)</td></tr> 234 * <tr><th style="vertical-align:top; font-weight:normal" id="Sc">{@code \p{Sc}}</th> 235 * <td headers="matches unicode Sc">A currency symbol</td></tr> 236 * <tr><th style="vertical-align:top; font-weight:normal" id="not_InGreek">{@code \P{InGreek}}</th> 237 * <td headers="matches unicode not_InGreek">Any character except one in the Greek block (negation)</td></tr> 238 * <tr><th style="vertical-align:top; font-weight:normal" id="not_uppercase">{@code [\p{L}&&[^\p{Lu}]]}</th> 239 * <td headers="matches unicode not_uppercase">Any letter except an uppercase letter (subtraction)</td></tr> 240 * 241 * <tr><th colspan="2" style="padding-top:20px" id="bounds">Boundary matchers</th></tr> 242 * 243 * <tr><th style="vertical-align:top; font-weight:normal" id="begin_line">{@code ^}</th> 244 * <td headers="matches bounds begin_line">The beginning of a line</td></tr> 245 * <tr><th style="vertical-align:top; font-weight:normal" id="end_line">{@code $}</th> 246 * <td headers="matches bounds end_line">The end of a line</td></tr> 247 * <tr><th style="vertical-align:top; font-weight:normal" id="word_boundary">{@code \b}</th> 248 * <td headers="matches bounds word_boundary">A word boundary</td></tr> 249 * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_cluster_boundary">{@code \b{g}}</th> 250 * <td headers="matches bounds grapheme_cluster_boundary">A Unicode extended grapheme cluster boundary</td></tr> 251 * <tr><th style="vertical-align:top; font-weight:normal" id="non_word_boundary">{@code \B}</th> 252 * <td headers="matches bounds non_word_boundary">A non-word boundary</td></tr> 253 * <tr><th style="vertical-align:top; font-weight:normal" id="begin_input">{@code \A}</th> 254 * <td headers="matches bounds begin_input">The beginning of the input</td></tr> 255 * <tr><th style="vertical-align:top; font-weight:normal" id="end_prev_match">{@code \G}</th> 256 * <td headers="matches bounds end_prev_match">The end of the previous match</td></tr> 257 * <tr><th style="vertical-align:top; font-weight:normal" id="end_input_except_term">{@code \Z}</th> 258 * <td headers="matches bounds end_input_except_term">The end of the input but for the final 259 * <a href="#lt">terminator</a>, if any</td></tr> 260 * <tr><th style="vertical-align:top; font-weight:normal" id="end_input">{@code \z}</th> 261 * <td headers="matches bounds end_input">The end of the input</td></tr> 262 * 263 * <tr><th colspan="2" style="padding-top:20px" id="linebreak">Linebreak matcher</th></tr> 264 * 265 * <tr><th style="vertical-align:top; font-weight:normal" id="any_unicode_linebreak">{@code \R}</th> 266 * <td headers="matches linebreak any_unicode_linebreak">Any Unicode linebreak sequence, is equivalent to 267 * <code>\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029] 268 * </code></td></tr> 269 * 270 * <tr><th colspan="2" style="padding-top:20px" id="grapheme">Unicode Extended Grapheme matcher</th></tr> 271 * 272 * <tr><th style="vertical-align:top; font-weight:normal" id="grapheme_any">{@code \X}</th> 273 * <td headers="matches grapheme grapheme_any">Any Unicode extended grapheme cluster</td></tr> 274 * 275 * <tr><th colspan="2" style="padding-top:20px" id="greedy">Greedy quantifiers</th></tr> 276 * 277 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_once_or_not"><i>X</i>{@code ?}</th> 278 * <td headers="matches greedy greedy_once_or_not"><i>X</i>, once or not at all</td></tr> 279 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_zero_or_more"><i>X</i>{@code *}</th> 280 * <td headers="matches greedy greedy_zero_or_more"><i>X</i>, zero or more times</td></tr> 281 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_one_or_more"><i>X</i>{@code +}</th> 282 * <td headers="matches greedy greedy_one_or_more"><i>X</i>, one or more times</td></tr> 283 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_exactly"><i>X</i><code>{</code><i>n</i><code>}</code></th> 284 * <td headers="matches greedy greedy_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 285 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least"><i>X</i><code>{</code><i>n</i>{@code ,}}</th> 286 * <td headers="matches greedy greedy_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 287 * <tr><th style="vertical-align:top; font-weight:normal" id="greedy_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}</code></th> 288 * <td headers="matches greedy greedy_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 289 * 290 * <tr><th colspan="2" style="padding-top:20px" id="reluc">Reluctant quantifiers</th></tr> 291 * 292 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_once_or_not"><i>X</i>{@code ??}</th> 293 * <td headers="matches reluc reluc_once_or_not"><i>X</i>, once or not at all</td></tr> 294 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_zero_or_more"><i>X</i>{@code *?}</th> 295 * <td headers="matches reluc reluc_zero_or_more"><i>X</i>, zero or more times</td></tr> 296 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_one_or_more"><i>X</i>{@code +?}</th> 297 * <td headers="matches reluc reluc_one_or_more"><i>X</i>, one or more times</td></tr> 298 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_exactly"><i>X</i><code>{</code><i>n</i><code>}?</code></th> 299 * <td headers="matches reluc reluc_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 300 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least"><i>X</i><code>{</code><i>n</i><code>,}?</code></th> 301 * <td headers="matches reluc reluc_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 302 * <tr><th style="vertical-align:top; font-weight:normal" id="reluc_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}?</code></th> 303 * <td headers="matches reluc reluc_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 304 * 305 * <tr><th colspan="2" style="padding-top:20px" id="poss">Possessive quantifiers</th></tr> 306 * 307 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_once_or_not"><i>X</i>{@code ?+}</th> 308 * <td headers="matches poss poss_once_or_not"><i>X</i>, once or not at all</td></tr> 309 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_zero_or_more"><i>X</i>{@code *+}</th> 310 * <td headers="matches poss poss_zero_or_more"><i>X</i>, zero or more times</td></tr> 311 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_one_or_more"><i>X</i>{@code ++}</th> 312 * <td headers="matches poss poss_one_or_more"><i>X</i>, one or more times</td></tr> 313 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_exactly"><i>X</i><code>{</code><i>n</i><code>}+</code></th> 314 * <td headers="matches poss poss_exactly"><i>X</i>, exactly <i>n</i> times</td></tr> 315 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least"><i>X</i><code>{</code><i>n</i><code>,}+</code></th> 316 * <td headers="matches poss poss_at_least"><i>X</i>, at least <i>n</i> times</td></tr> 317 * <tr><th style="vertical-align:top; font-weight:normal" id="poss_at_least_up_to"><i>X</i><code>{</code><i>n</i>{@code ,}<i>m</i><code>}+</code></th> 318 * <td headers="matches poss poss_at_least_up_to"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr> 319 * 320 * <tr><th colspan="2" style="padding-top:20px" id="logical">Logical operators</th></tr> 321 * 322 * <tr><th style="vertical-align:top; font-weight:normal" id="concat"><i>XY</i></th> 323 * <td headers="matches logical concat"><i>X</i> followed by <i>Y</i></td></tr> 324 * <tr><th style="vertical-align:top; font-weight:normal" id="alternate"><i>X</i>{@code |}<i>Y</i></th> 325 * <td headers="matches logical alternate">Either <i>X</i> or <i>Y</i></td></tr> 326 * <tr><th style="vertical-align:top; font-weight:normal" id="group">{@code (}<i>X</i>{@code )}</th> 327 * <td headers="matches logical group">X, as a <a href="#cg">capturing group</a></td></tr> 328 * 329 * <tr><th colspan="2" style="padding-top:20px" id="backref">Back references</th></tr> 330 * 331 * <tr><th style="vertical-align:top; font-weight:normal" id="back_nth">{@code \}<i>n</i></th> 332 * <td headers="matches backref back_nth">Whatever the <i>n</i><sup>th</sup> 333 * <a href="#cg">capturing group</a> matched</td></tr> 334 * <tr><th style="vertical-align:top; font-weight:normal" id="back_named">{@code \}<i>k</i><<i>name</i>></th> 335 * <td headers="matches backref back_named">Whatever the 336 * <a href="#groupname">named-capturing group</a> "name" matched</td></tr> 337 * 338 * <tr><th colspan="2" style="padding-top:20px" id="quote">Quotation</th></tr> 339 * 340 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_follow">{@code \}</th> 341 * <td headers="matches quote quote_follow">Nothing, but quotes the following character</td></tr> 342 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_begin">{@code \Q}</th> 343 * <td headers="matches quote quote_begin">Nothing, but quotes all characters until {@code \E}</td></tr> 344 * <tr><th style="vertical-align:top; font-weight:normal" id="quote_end">{@code \E}</th> 345 * <td headers="matches quote quote_end">Nothing, but ends quoting started by {@code \Q}</td></tr> 346 * <!-- Metachars: !$()*+.<>?[\]^{|} --> 347 * 348 * <tr><th colspan="2" style="padding-top:20px" id="special">Special constructs (named-capturing and non-capturing)</th></tr> 349 * 350 * <tr><th style="vertical-align:top; font-weight:normal" id="named_group"><code>(?<<a href="#groupname">name</a>></code><i>X</i>{@code )}</th> 351 * <td headers="matches special named_group"><i>X</i>, as a named-capturing group</td></tr> 352 * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group">{@code (?:}<i>X</i>{@code )}</th> 353 * <td headers="matches special non_capture_group"><i>X</i>, as a non-capturing group</td></tr> 354 * <tr><th style="vertical-align:top; font-weight:normal" id="flags"><code>(?idmsuxU-idmsuxU) </code></th> 355 * <td headers="matches special flags">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a> 356 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> 357 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> <a href="#UNICODE_CHARACTER_CLASS">U</a> 358 * on - off</td></tr> 359 * <tr><th style="vertical-align:top; font-weight:normal" id="non_capture_group_flags"><code>(?idmsux-idmsux:</code><i>X</i>{@code )} </th> 360 * <td headers="matches special non_capture_group_flags"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the 361 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a> 362 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a > 363 * <a href="#COMMENTS">x</a> on - off</td></tr> 364 * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookahead">{@code (?=}<i>X</i>{@code )}</th> 365 * <td headers="matches special pos_lookahead"><i>X</i>, via zero-width positive lookahead</td></tr> 366 * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookahead">{@code (?!}<i>X</i>{@code )}</th> 367 * <td headers="matches special neg_lookahead"><i>X</i>, via zero-width negative lookahead</td></tr> 368 * <tr><th style="vertical-align:top; font-weight:normal" id="pos_lookbehind">{@code (?<=}<i>X</i>{@code )}</th> 369 * <td headers="matches special pos_lookbehind"><i>X</i>, via zero-width positive lookbehind</td></tr> 370 * <tr><th style="vertical-align:top; font-weight:normal" id="neg_lookbehind">{@code (?<!}<i>X</i>{@code )}</th> 371 * <td headers="matches special neg_lookbehind"><i>X</i>, via zero-width negative lookbehind</td></tr> 372 * <tr><th style="vertical-align:top; font-weight:normal" id="indep_non_capture_group">{@code (?>}<i>X</i>{@code )}</th> 373 * <td headers="matches special indep_non_capture_group"><i>X</i>, as an independent, non-capturing group</td></tr> 374 * 375 * </tbody> 376 * </table> 377 * 378 * <hr> 379 * 380 * 381 * <h3><a id="bs">Backslashes, escapes, and quoting</a></h3> 382 * 383 * <p> The backslash character ({@code '\'}) serves to introduce escaped 384 * constructs, as defined in the table above, as well as to quote characters 385 * that otherwise would be interpreted as unescaped constructs. Thus the 386 * expression {@code \\} matches a single backslash and <code>\{</code> matches a 387 * left brace. 388 * 389 * <p> It is an error to use a backslash prior to any alphabetic character that 390 * does not denote an escaped construct; these are reserved for future 391 * extensions to the regular-expression language. A backslash may be used 392 * prior to a non-alphabetic character regardless of whether that character is 393 * part of an unescaped construct. 394 * 395 * <p> Backslashes within string literals in Java source code are interpreted 396 * as required by 397 * <cite>The Java™ Language Specification</cite> 398 * as either Unicode escapes (section 3.3) or other character escapes (section 3.10.6) 399 * It is therefore necessary to double backslashes in string 400 * literals that represent regular expressions to protect them from 401 * interpretation by the Java bytecode compiler. The string literal 402 * <code>"\b"</code>, for example, matches a single backspace character when 403 * interpreted as a regular expression, while {@code "\\b"} matches a 404 * word boundary. The string literal {@code "\(hello\)"} is illegal 405 * and leads to a compile-time error; in order to match the string 406 * {@code (hello)} the string literal {@code "\\(hello\\)"} 407 * must be used. 408 * 409 * <h3><a id="cc">Character Classes</a></h3> 410 * 411 * <p> Character classes may appear within other character classes, and 412 * may be composed by the union operator (implicit) and the intersection 413 * operator ({@code &&}). 414 * The union operator denotes a class that contains every character that is 415 * in at least one of its operand classes. The intersection operator 416 * denotes a class that contains every character that is in both of its 417 * operand classes. 418 * 419 * <p> The precedence of character-class operators is as follows, from 420 * highest to lowest: 421 * 422 * <table class="striped" style="margin-left: 2em;"> 423 * <caption style="display:none">Precedence of character class operators.</caption> 424 * <thead> 425 * <tr><th scope="col">Precedence<th scope="col">Name<th scope="col">Example 426 * </thead> 427 * <tbody> 428 * <tr><th scope="row">1</th> 429 * <td>Literal escape </td> 430 * <td>{@code \x}</td></tr> 431 * <tr><th scope="row">2</th> 432 * <td>Grouping</td> 433 * <td>{@code [...]}</td></tr> 434 * <tr><th scope="row">3</th> 435 * <td>Range</td> 436 * <td>{@code a-z}</td></tr> 437 * <tr><th scope="row">4</th> 438 * <td>Union</td> 439 * <td>{@code [a-e][i-u]}</td></tr> 440 * <tr><th scope="row">5</th> 441 * <td>Intersection</td> 442 * <td>{@code [a-z&&[aeiou]]}</td></tr> 443 * </tbody> 444 * </table> 445 * 446 * <p> Note that a different set of metacharacters are in effect inside 447 * a character class than outside a character class. For instance, the 448 * regular expression {@code .} loses its special meaning inside a 449 * character class, while the expression {@code -} becomes a range 450 * forming metacharacter. 451 * 452 * <h3><a id="lt">Line terminators</a></h3> 453 * 454 * <p> A <i>line terminator</i> is a one- or two-character sequence that marks 455 * the end of a line of the input character sequence. The following are 456 * recognized as line terminators: 457 * 458 * <ul> 459 * 460 * <li> A newline (line feed) character ({@code '\n'}), 461 * 462 * <li> A carriage-return character followed immediately by a newline 463 * character ({@code "\r\n"}), 464 * 465 * <li> A standalone carriage-return character ({@code '\r'}), 466 * 467 * <li> A next-line character (<code>'\u0085'</code>), 468 * 469 * <li> A line-separator character (<code>'\u2028'</code>), or 470 * 471 * <li> A paragraph-separator character (<code>'\u2029'</code>). 472 * 473 * </ul> 474 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators 475 * recognized are newline characters. 476 * 477 * <p> The regular expression {@code .} matches any character except a line 478 * terminator unless the {@link #DOTALL} flag is specified. 479 * 480 * <p> By default, the regular expressions {@code ^} and {@code $} ignore 481 * line terminators and only match at the beginning and the end, respectively, 482 * of the entire input sequence. If {@link #MULTILINE} mode is activated then 483 * {@code ^} matches at the beginning of input and after any line terminator 484 * except at the end of input. When in {@link #MULTILINE} mode {@code $} 485 * matches just before a line terminator or the end of the input sequence. 486 * 487 * <h3><a id="cg">Groups and capturing</a></h3> 488 * 489 * <h4><a id="gnumber">Group number</a></h4> 490 * <p> Capturing groups are numbered by counting their opening parentheses from 491 * left to right. In the expression {@code ((A)(B(C)))}, for example, there 492 * are four such groups: </p> 493 * 494 * <ol style="margin-left:2em;"> 495 * <li> {@code ((A)(B(C)))} 496 * <li> {@code (A)} 497 * <li> {@code (B(C))} 498 * <li> {@code (C)} 499 * </ol> 500 * 501 * <p> Group zero always stands for the entire expression. 502 * 503 * <p> Capturing groups are so named because, during a match, each subsequence 504 * of the input sequence that matches such a group is saved. The captured 505 * subsequence may be used later in the expression, via a back reference, and 506 * may also be retrieved from the matcher once the match operation is complete. 507 * 508 * <h4><a id="groupname">Group name</a></h4> 509 * <p>A capturing group can also be assigned a "name", a {@code named-capturing group}, 510 * and then be back-referenced later by the "name". Group names are composed of 511 * the following characters. The first character must be a {@code letter}. 512 * 513 * <ul> 514 * <li> The uppercase letters {@code 'A'} through {@code 'Z'} 515 * (<code>'\u0041'</code> through <code>'\u005a'</code>), 516 * <li> The lowercase letters {@code 'a'} through {@code 'z'} 517 * (<code>'\u0061'</code> through <code>'\u007a'</code>), 518 * <li> The digits {@code '0'} through {@code '9'} 519 * (<code>'\u0030'</code> through <code>'\u0039'</code>), 520 * </ul> 521 * 522 * <p> A {@code named-capturing group} is still numbered as described in 523 * <a href="#gnumber">Group number</a>. 524 * 525 * <p> The captured input associated with a group is always the subsequence 526 * that the group most recently matched. If a group is evaluated a second time 527 * because of quantification then its previously-captured value, if any, will 528 * be retained if the second evaluation fails. Matching the string 529 * {@code "aba"} against the expression {@code (a(b)?)+}, for example, leaves 530 * group two set to {@code "b"}. All captured input is discarded at the 531 * beginning of each match. 532 * 533 * <p> Groups beginning with {@code (?} are either pure, <i>non-capturing</i> groups 534 * that do not capture text and do not count towards the group total, or 535 * <i>named-capturing</i> group. 536 * 537 * <h3> Unicode support </h3> 538 * 539 * <p> This class is in conformance with Level 1 of <a 540 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 541 * Standard #18: Unicode Regular Expression</i></a>, plus RL2.1 542 * Canonical Equivalents. 543 * <p> 544 * <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code 545 * are processed as described in section 3.3 of 546 * <cite>The Java™ Language Specification</cite>. 547 * Such escape sequences are also implemented directly by the regular-expression 548 * parser so that Unicode escapes can be used in expressions that are read from 549 * files or from the keyboard. Thus the strings <code>"\u2014"</code> and 550 * {@code "\\u2014"}, while not equal, compile into the same pattern, which 551 * matches the character with hexadecimal value {@code 0x2014}. 552 * <p> 553 * A Unicode character can also be represented by using its <b>Hex notation</b> 554 * (hexadecimal code point value) directly as described in construct 555 * <code>\x{...}</code>, for example a supplementary character U+2011F can be 556 * specified as <code>\x{2011F}</code>, instead of two consecutive Unicode escape 557 * sequences of the surrogate pair <code>\uD840</code><code>\uDD1F</code>. 558 * <p> 559 * <b>Unicode character names</b> are supported by the named character construct 560 * <code>\N{</code>...<code>}</code>, for example, <code>\N{WHITE SMILING FACE}</code> 561 * specifies character <code>\u263A</code>. The character names supported 562 * by this class are the valid Unicode character names matched by 563 * {@link java.lang.Character#codePointOf(String) Character.codePointOf(name)}. 564 * <p> 565 * <a href="http://www.unicode.org/reports/tr18/#Default_Grapheme_Clusters"> 566 * <b>Unicode extended grapheme clusters</b></a> are supported by the grapheme 567 * cluster matcher {@code \X} and the corresponding boundary matcher {@code \b{g}}. 568 * <p> 569 * Unicode scripts, blocks, categories and binary properties are written with 570 * the {@code \p} and {@code \P} constructs as in Perl. 571 * <code>\p{</code><i>prop</i><code>}</code> matches if 572 * the input has the property <i>prop</i>, while <code>\P{</code><i>prop</i><code>}</code> 573 * does not match if the input has that property. 574 * <p> 575 * Scripts, blocks, categories and binary properties can be used both inside 576 * and outside of a character class. 577 * 578 * <p> 579 * <b><a id="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in 580 * {@code IsHiragana}, or by using the {@code script} keyword (or its short 581 * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}. 582 * <p> 583 * The script names supported by {@code Pattern} are the valid script names 584 * accepted and defined by 585 * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}. 586 * 587 * <p> 588 * <b><a id="ubc">Blocks</a></b> are specified with the prefix {@code In}, as in 589 * {@code InMongolian}, or by using the keyword {@code block} (or its short 590 * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}. 591 * <p> 592 * The block names supported by {@code Pattern} are the valid block names 593 * accepted and defined by 594 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}. 595 * <p> 596 * 597 * <b><a id="ucc">Categories</a></b> may be specified with the optional prefix {@code Is}: 598 * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode 599 * letters. Same as scripts and blocks, categories can also be specified 600 * by using the keyword {@code general_category} (or its short form 601 * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}. 602 * <p> 603 * The supported categories are those of 604 * <a href="http://www.unicode.org/unicode/standard/standard.html"> 605 * <i>The Unicode Standard</i></a> in the version specified by the 606 * {@link java.lang.Character Character} class. The category names are those 607 * defined in the Standard, both normative and informative. 608 * <p> 609 * 610 * <b><a id="ubpc">Binary properties</a></b> are specified with the prefix {@code Is}, as in 611 * {@code IsAlphabetic}. The supported binary properties by {@code Pattern} 612 * are 613 * <ul> 614 * <li> Alphabetic 615 * <li> Ideographic 616 * <li> Letter 617 * <li> Lowercase 618 * <li> Uppercase 619 * <li> Titlecase 620 * <li> Punctuation 621 * <Li> Control 622 * <li> White_Space 623 * <li> Digit 624 * <li> Hex_Digit 625 * <li> Join_Control 626 * <li> Noncharacter_Code_Point 627 * <li> Assigned 628 * </ul> 629 * <p> 630 * The following <b>Predefined Character classes</b> and <b>POSIX character classes</b> 631 * are in conformance with the recommendation of <i>Annex C: Compatibility Properties</i> 632 * of <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Regular Expression 633 * </i></a>, when {@link #UNICODE_CHARACTER_CLASS} flag is specified. 634 * 635 * <table class="striped"> 636 * <caption style="display:none">predefined and posix character classes in Unicode mode</caption> 637 * <thead> 638 * <tr> 639 * <th scope="col" id="predef_classes">Classes</th> 640 * <th scope="col" id="predef_matches">Matches</th> 641 * </tr> 642 * </thead> 643 * <tbody> 644 * <tr><th scope="row">{@code \p{Lower}}</th> 645 * <td>A lowercase character:{@code \p{IsLowercase}}</td></tr> 646 * <tr><th scope="row">{@code \p{Upper}}</th> 647 * <td>An uppercase character:{@code \p{IsUppercase}}</td></tr> 648 * <tr><th scope="row">{@code \p{ASCII}}</th> 649 * <td>All ASCII:{@code [\x00-\x7F]}</td></tr> 650 * <tr><th scope="row">{@code \p{Alpha}}</th> 651 * <td>An alphabetic character:{@code \p{IsAlphabetic}}</td></tr> 652 * <tr><th scope="row">{@code \p{Digit}}</th> 653 * <td>A decimal digit character:{@code \p{IsDigit}}</td></tr> 654 * <tr><th scope="row">{@code \p{Alnum}}</th> 655 * <td>An alphanumeric character:{@code [\p{IsAlphabetic}\p{IsDigit}]}</td></tr> 656 * <tr><th scope="row">{@code \p{Punct}}</th> 657 * <td>A punctuation character:{@code \p{IsPunctuation}}</td></tr> 658 * <tr><th scope="row">{@code \p{Graph}}</th> 659 * <td>A visible character: {@code [^\p{IsWhite_Space}\p{gc=Cc}\p{gc=Cs}\p{gc=Cn}]}</td></tr> 660 * <tr><th scope="row">{@code \p{Print}}</th> 661 * <td>A printable character: {@code [\p{Graph}\p{Blank}&&[^\p{Cntrl}]]}</td></tr> 662 * <tr><th scope="row">{@code \p{Blank}}</th> 663 * <td>A space or a tab: {@code [\p{IsWhite_Space}&&[^\p{gc=Zl}\p{gc=Zp}\x0a\x0b\x0c\x0d\x85]]}</td></tr> 664 * <tr><th scope="row">{@code \p{Cntrl}}</th> 665 * <td>A control character: {@code \p{gc=Cc}}</td></tr> 666 * <tr><th scope="row">{@code \p{XDigit}}</th> 667 * <td>A hexadecimal digit: {@code [\p{gc=Nd}\p{IsHex_Digit}]}</td></tr> 668 * <tr><th scope="row">{@code \p{Space}}</th> 669 * <td>A whitespace character:{@code \p{IsWhite_Space}}</td></tr> 670 * <tr><th scope="row">{@code \d}</th> 671 * <td>A digit: {@code \p{IsDigit}}</td></tr> 672 * <tr><th scope="row">{@code \D}</th> 673 * <td>A non-digit: {@code [^\d]}</td></tr> 674 * <tr><th scope="row">{@code \s}</th> 675 * <td>A whitespace character: {@code \p{IsWhite_Space}}</td></tr> 676 * <tr><th scope="row">{@code \S}</th> 677 * <td>A non-whitespace character: {@code [^\s]}</td></tr> 678 * <tr><th scope="row">{@code \w}</th> 679 * <td>A word character: {@code [\p{Alpha}\p{gc=Mn}\p{gc=Me}\p{gc=Mc}\p{Digit}\p{gc=Pc}\p{IsJoin_Control}]}</td></tr> 680 * <tr><th scope="row">{@code \W}</th> 681 * <td>A non-word character: {@code [^\w]}</td></tr> 682 * </tbody> 683 * </table> 684 * <p> 685 * <a id="jcc"> 686 * Categories that behave like the java.lang.Character 687 * boolean is<i>methodname</i> methods (except for the deprecated ones) are 688 * available through the same <code>\p{</code><i>prop</i><code>}</code> syntax where 689 * the specified property has the name <code>java<i>methodname</i></code></a>. 690 * 691 * <h3> Comparison to Perl 5 </h3> 692 * 693 * <p>The {@code Pattern} engine performs traditional NFA-based matching 694 * with ordered alternation as occurs in Perl 5. 695 * 696 * <p> Perl constructs not supported by this class: </p> 697 * 698 * <ul> 699 * <li><p> The backreference constructs, <code>\g{</code><i>n</i><code>}</code> for 700 * the <i>n</i><sup>th</sup><a href="#cg">capturing group</a> and 701 * <code>\g{</code><i>name</i><code>}</code> for 702 * <a href="#groupname">named-capturing group</a>. 703 * </p></li> 704 * 705 * <li><p> The conditional constructs 706 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code )} and 707 * {@code (?(}<i>condition</i>{@code )}<i>X</i>{@code |}<i>Y</i>{@code )}, 708 * </p></li> 709 * 710 * <li><p> The embedded code constructs <code>(?{</code><i>code</i><code>})</code> 711 * and <code>(??{</code><i>code</i><code>})</code>,</p></li> 712 * 713 * <li><p> The embedded comment syntax {@code (?#comment)}, and </p></li> 714 * 715 * <li><p> The preprocessing operations {@code \l} <code>\u</code>, 716 * {@code \L}, and {@code \U}. </p></li> 717 * 718 * </ul> 719 * 720 * <p> Constructs supported by this class but not by Perl: </p> 721 * 722 * <ul> 723 * 724 * <li><p> Character-class union and intersection as described 725 * <a href="#cc">above</a>.</p></li> 726 * 727 * </ul> 728 * 729 * <p> Notable differences from Perl: </p> 730 * 731 * <ul> 732 * 733 * <li><p> In Perl, {@code \1} through {@code \9} are always interpreted 734 * as back references; a backslash-escaped number greater than {@code 9} is 735 * treated as a back reference if at least that many subexpressions exist, 736 * otherwise it is interpreted, if possible, as an octal escape. In this 737 * class octal escapes must always begin with a zero. In this class, 738 * {@code \1} through {@code \9} are always interpreted as back 739 * references, and a larger number is accepted as a back reference if at 740 * least that many subexpressions exist at that point in the regular 741 * expression, otherwise the parser will drop digits until the number is 742 * smaller or equal to the existing number of groups or it is one digit. 743 * </p></li> 744 * 745 * <li><p> Perl uses the {@code g} flag to request a match that resumes 746 * where the last match left off. This functionality is provided implicitly 747 * by the {@link Matcher} class: Repeated invocations of the {@link 748 * Matcher#find find} method will resume where the last match left off, 749 * unless the matcher is reset. </p></li> 750 * 751 * <li><p> In Perl, embedded flags at the top level of an expression affect 752 * the whole expression. In this class, embedded flags always take effect 753 * at the point at which they appear, whether they are at the top level or 754 * within a group; in the latter case, flags are restored at the end of the 755 * group just as in Perl. </p></li> 756 * 757 * </ul> 758 * 759 * 760 * <p> For a more precise description of the behavior of regular expression 761 * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/"> 762 * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl, 763 * O'Reilly and Associates, 2006.</a> 764 * </p> 765 * 766 * @see java.lang.String#split(String, int) 767 * @see java.lang.String#split(String) 768 * 769 * @author Mike McCloskey 770 * @author Mark Reinhold 771 * @author JSR-51 Expert Group 772 * @since 1.4 773 * @spec JSR-51 774 */ 775 776 public final class Pattern 777 implements java.io.Serializable 778 { 779 780 /** 781 * Regular expression modifier values. Instead of being passed as 782 * arguments, they can also be passed as inline modifiers. 783 * For example, the following statements have the same effect. 784 * <pre> 785 * Pattern p1 = Pattern.compile("abc", Pattern.CASE_INSENSITIVE|Pattern.MULTILINE); 786 * Pattern p2 = Pattern.compile("(?im)abc", 0); 787 * </pre> 788 */ 789 790 /** 791 * Enables Unix lines mode. 792 * 793 * <p> In this mode, only the {@code '\n'} line terminator is recognized 794 * in the behavior of {@code .}, {@code ^}, and {@code $}. 795 * 796 * <p> Unix lines mode can also be enabled via the embedded flag 797 * expression {@code (?d)}. 798 */ 799 public static final int UNIX_LINES = 0x01; 800 801 /** 802 * Enables case-insensitive matching. 803 * 804 * <p> By default, case-insensitive matching assumes that only characters 805 * in the US-ASCII charset are being matched. Unicode-aware 806 * case-insensitive matching can be enabled by specifying the {@link 807 * #UNICODE_CASE} flag in conjunction with this flag. 808 * 809 * <p> Case-insensitive matching can also be enabled via the embedded flag 810 * expression {@code (?i)}. 811 * 812 * <p> Specifying this flag may impose a slight performance penalty. </p> 813 */ 814 public static final int CASE_INSENSITIVE = 0x02; 815 816 /** 817 * Permits whitespace and comments in pattern. 818 * 819 * <p> In this mode, whitespace is ignored, and embedded comments starting 820 * with {@code #} are ignored until the end of a line. 821 * 822 * <p> Comments mode can also be enabled via the embedded flag 823 * expression {@code (?x)}. 824 */ 825 public static final int COMMENTS = 0x04; 826 827 /** 828 * Enables multiline mode. 829 * 830 * <p> In multiline mode the expressions {@code ^} and {@code $} match 831 * just after or just before, respectively, a line terminator or the end of 832 * the input sequence. By default these expressions only match at the 833 * beginning and the end of the entire input sequence. 834 * 835 * <p> Multiline mode can also be enabled via the embedded flag 836 * expression {@code (?m)}. </p> 837 */ 838 public static final int MULTILINE = 0x08; 839 840 /** 841 * Enables literal parsing of the pattern. 842 * 843 * <p> When this flag is specified then the input string that specifies 844 * the pattern is treated as a sequence of literal characters. 845 * Metacharacters or escape sequences in the input sequence will be 846 * given no special meaning. 847 * 848 * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on 849 * matching when used in conjunction with this flag. The other flags 850 * become superfluous. 851 * 852 * <p> There is no embedded flag character for enabling literal parsing. 853 * @since 1.5 854 */ 855 public static final int LITERAL = 0x10; 856 857 /** 858 * Enables dotall mode. 859 * 860 * <p> In dotall mode, the expression {@code .} matches any character, 861 * including a line terminator. By default this expression does not match 862 * line terminators. 863 * 864 * <p> Dotall mode can also be enabled via the embedded flag 865 * expression {@code (?s)}. (The {@code s} is a mnemonic for 866 * "single-line" mode, which is what this is called in Perl.) </p> 867 */ 868 public static final int DOTALL = 0x20; 869 870 /** 871 * Enables Unicode-aware case folding. 872 * 873 * <p> When this flag is specified then case-insensitive matching, when 874 * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner 875 * consistent with the Unicode Standard. By default, case-insensitive 876 * matching assumes that only characters in the US-ASCII charset are being 877 * matched. 878 * 879 * <p> Unicode-aware case folding can also be enabled via the embedded flag 880 * expression {@code (?u)}. 881 * 882 * <p> Specifying this flag may impose a performance penalty. </p> 883 */ 884 public static final int UNICODE_CASE = 0x40; 885 886 /** 887 * Enables canonical equivalence. 888 * 889 * <p> When this flag is specified then two characters will be considered 890 * to match if, and only if, their full canonical decompositions match. 891 * The expression <code>"a\u030A"</code>, for example, will match the 892 * string <code>"\u00E5"</code> when this flag is specified. By default, 893 * matching does not take canonical equivalence into account. 894 * 895 * <p> There is no embedded flag character for enabling canonical 896 * equivalence. 897 * 898 * <p> Specifying this flag may impose a performance penalty. </p> 899 */ 900 public static final int CANON_EQ = 0x80; 901 902 /** 903 * Enables the Unicode version of <i>Predefined character classes</i> and 904 * <i>POSIX character classes</i>. 905 * 906 * <p> When this flag is specified then the (US-ASCII only) 907 * <i>Predefined character classes</i> and <i>POSIX character classes</i> 908 * are in conformance with 909 * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical 910 * Standard #18: Unicode Regular Expression</i></a> 911 * <i>Annex C: Compatibility Properties</i>. 912 * <p> 913 * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded 914 * flag expression {@code (?U)}. 915 * <p> 916 * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case 917 * folding. 918 * <p> 919 * Specifying this flag may impose a performance penalty. </p> 920 * @since 1.7 921 */ 922 public static final int UNICODE_CHARACTER_CLASS = 0x100; 923 924 /** 925 * Contains all possible flags for compile(regex, flags). 926 */ 927 private static final int ALL_FLAGS = CASE_INSENSITIVE | MULTILINE | 928 DOTALL | UNICODE_CASE | CANON_EQ | UNIX_LINES | LITERAL | 929 UNICODE_CHARACTER_CLASS | COMMENTS; 930 931 /* Pattern has only two serialized components: The pattern string 932 * and the flags, which are all that is needed to recompile the pattern 933 * when it is deserialized. 934 */ 935 936 /** use serialVersionUID from Merlin b59 for interoperability */ 937 private static final long serialVersionUID = 5073258162644648461L; 938 939 /** 940 * The original regular-expression pattern string. 941 * 942 * @serial 943 */ 944 private String pattern; 945 946 /** 947 * The original pattern flags. 948 * 949 * @serial 950 */ 951 private int flags; 952 953 /** 954 * The temporary pattern flags used during compiling. The flags might be turn 955 * on and off by embedded flag. 956 */ 957 private transient int flags0; 958 959 /** 960 * Boolean indicating this Pattern is compiled; this is necessary in order 961 * to lazily compile deserialized Patterns. 962 */ 963 private transient volatile boolean compiled; 964 965 /** 966 * The normalized pattern string. 967 */ 968 private transient String normalizedPattern; 969 970 /** 971 * The starting point of state machine for the find operation. This allows 972 * a match to start anywhere in the input. 973 */ 974 transient Node root; 975 976 /** 977 * The root of object tree for a match operation. The pattern is matched 978 * at the beginning. This may include a find that uses BnM or a First 979 * node. 980 */ 981 transient Node matchRoot; 982 983 /** 984 * Temporary storage used by parsing pattern slice. 985 */ 986 transient int[] buffer; 987 988 /** 989 * A temporary storage used for predicate for double return. 990 */ 991 transient CharPredicate predicate; 992 993 /** 994 * Map the "name" of the "named capturing group" to its group id 995 * node. 996 */ 997 transient volatile Map<String, Integer> namedGroups; 998 999 /** 1000 * Temporary storage used while parsing group references. 1001 */ 1002 transient GroupHead[] groupNodes; 1003 1004 /** 1005 * Temporary storage used to store the top level closure nodes. 1006 */ 1007 transient List<Node> topClosureNodes; 1008 1009 /** 1010 * The number of top greedy closure nodes in this Pattern. Used by 1011 * matchers to allocate storage needed for a IntHashSet to keep the 1012 * beginning pos {@code i} of all failed match. 1013 */ 1014 transient int localTCNCount; 1015 1016 /* 1017 * Turn off the stop-exponential-backtracking optimization if there 1018 * is a group ref in the pattern. 1019 */ 1020 transient boolean hasGroupRef; 1021 1022 /** 1023 * Temporary null terminated code point array used by pattern compiling. 1024 */ 1025 private transient int[] temp; 1026 1027 /** 1028 * The number of capturing groups in this Pattern. Used by matchers to 1029 * allocate storage needed to perform a match. 1030 */ 1031 transient int capturingGroupCount; 1032 1033 /** 1034 * The local variable count used by parsing tree. Used by matchers to 1035 * allocate storage needed to perform a match. 1036 */ 1037 transient int localCount; 1038 1039 /** 1040 * Index into the pattern string that keeps track of how much has been 1041 * parsed. 1042 */ 1043 private transient int cursor; 1044 1045 /** 1046 * Holds the length of the pattern string. 1047 */ 1048 private transient int patternLength; 1049 1050 /** 1051 * If the Start node might possibly match supplementary characters. 1052 * It is set to true during compiling if 1053 * (1) There is supplementary char in pattern, or 1054 * (2) There is complement node of a "family" CharProperty 1055 */ 1056 private transient boolean hasSupplementary; 1057 1058 /** 1059 * Compiles the given regular expression into a pattern. 1060 * 1061 * @param regex 1062 * The expression to be compiled 1063 * @return the given regular expression compiled into a pattern 1064 * @throws PatternSyntaxException 1065 * If the expression's syntax is invalid 1066 */ 1067 public static Pattern compile(String regex) { 1068 return new Pattern(regex, 0); 1069 } 1070 1071 /** 1072 * Compiles the given regular expression into a pattern with the given 1073 * flags. 1074 * 1075 * @param regex 1076 * The expression to be compiled 1077 * 1078 * @param flags 1079 * Match flags, a bit mask that may include 1080 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL}, 1081 * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES}, 1082 * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS} 1083 * and {@link #COMMENTS} 1084 * 1085 * @return the given regular expression compiled into a pattern with the given flags 1086 * @throws IllegalArgumentException 1087 * If bit values other than those corresponding to the defined 1088 * match flags are set in {@code flags} 1089 * 1090 * @throws PatternSyntaxException 1091 * If the expression's syntax is invalid 1092 */ 1093 public static Pattern compile(String regex, int flags) { 1094 return new Pattern(regex, flags); 1095 } 1096 1097 /** 1098 * Returns the regular expression from which this pattern was compiled. 1099 * 1100 * @return The source of this pattern 1101 */ 1102 public String pattern() { 1103 return pattern; 1104 } 1105 1106 /** 1107 * <p>Returns the string representation of this pattern. This 1108 * is the regular expression from which this pattern was 1109 * compiled.</p> 1110 * 1111 * @return The string representation of this pattern 1112 * @since 1.5 1113 */ 1114 public String toString() { 1115 return pattern; 1116 } 1117 1118 /** 1119 * Creates a matcher that will match the given input against this pattern. 1120 * 1121 * @param input 1122 * The character sequence to be matched 1123 * 1124 * @return A new matcher for this pattern 1125 */ 1126 public Matcher matcher(CharSequence input) { 1127 if (!compiled) { 1128 synchronized(this) { 1129 if (!compiled) 1130 compile(); 1131 } 1132 } 1133 Matcher m = new Matcher(this, input); 1134 return m; 1135 } 1136 1137 /** 1138 * Returns this pattern's match flags. 1139 * 1140 * @return The match flags specified when this pattern was compiled 1141 */ 1142 public int flags() { 1143 return flags0; 1144 } 1145 1146 /** 1147 * Compiles the given regular expression and attempts to match the given 1148 * input against it. 1149 * 1150 * <p> An invocation of this convenience method of the form 1151 * 1152 * <blockquote><pre> 1153 * Pattern.matches(regex, input);</pre></blockquote> 1154 * 1155 * behaves in exactly the same way as the expression 1156 * 1157 * <blockquote><pre> 1158 * Pattern.compile(regex).matcher(input).matches()</pre></blockquote> 1159 * 1160 * <p> If a pattern is to be used multiple times, compiling it once and reusing 1161 * it will be more efficient than invoking this method each time. </p> 1162 * 1163 * @param regex 1164 * The expression to be compiled 1165 * 1166 * @param input 1167 * The character sequence to be matched 1168 * @return whether or not the regular expression matches on the input 1169 * @throws PatternSyntaxException 1170 * If the expression's syntax is invalid 1171 */ 1172 public static boolean matches(String regex, CharSequence input) { 1173 Pattern p = Pattern.compile(regex); 1174 Matcher m = p.matcher(input); 1175 return m.matches(); 1176 } 1177 1178 /** 1179 * Splits the given input sequence around matches of this pattern. 1180 * 1181 * <p> The array returned by this method contains each substring of the 1182 * input sequence that is terminated by another subsequence that matches 1183 * this pattern or is terminated by the end of the input sequence. The 1184 * substrings in the array are in the order in which they occur in the 1185 * input. If this pattern does not match any subsequence of the input then 1186 * the resulting array has just one element, namely the input sequence in 1187 * string form. 1188 * 1189 * <p> When there is a positive-width match at the beginning of the input 1190 * sequence then an empty leading substring is included at the beginning 1191 * of the resulting array. A zero-width match at the beginning however 1192 * never produces such empty leading substring. 1193 * 1194 * <p> The {@code limit} parameter controls the number of times the 1195 * pattern is applied and therefore affects the length of the resulting 1196 * array. If the limit <i>n</i> is greater than zero then the pattern 1197 * will be applied at most <i>n</i> - 1 times, the array's 1198 * length will be no greater than <i>n</i>, and the array's last entry 1199 * will contain all input beyond the last matched delimiter. If <i>n</i> 1200 * is non-positive then the pattern will be applied as many times as 1201 * possible and the array can have any length. If <i>n</i> is zero then 1202 * the pattern will be applied as many times as possible, the array can 1203 * have any length, and trailing empty strings will be discarded. 1204 * 1205 * <p> The input {@code "boo:and:foo"}, for example, yields the following 1206 * results with these parameters: 1207 * 1208 * <table class="plain" style="margin-left:2em;"> 1209 * <caption style="display:none">Split example showing regex, limit, and result</caption> 1210 * <thead> 1211 * <tr> 1212 * <th scope="col">Regex</th> 1213 * <th scope="col">Limit</th> 1214 * <th scope="col">Result</th> 1215 * </tr> 1216 * </thead> 1217 * <tbody> 1218 * <tr><th scope="row" rowspan="3" style="font-weight:normal">:</th> 1219 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th> 1220 * <td>{@code { "boo", "and:foo" }}</td></tr> 1221 * <tr><!-- : --> 1222 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 1223 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1224 * <tr><!-- : --> 1225 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 1226 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1227 * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th> 1228 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 1229 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 1230 * <tr><!-- o --> 1231 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 1232 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 1233 * <tr><!-- o --> 1234 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th> 1235 * <td>{@code { "b", "", ":and:f" }}</td></tr> 1236 * </tbody> 1237 * </table> 1238 * 1239 * @param input 1240 * The character sequence to be split 1241 * 1242 * @param limit 1243 * The result threshold, as described above 1244 * 1245 * @return The array of strings computed by splitting the input 1246 * around matches of this pattern 1247 */ 1248 public String[] split(CharSequence input, int limit) { 1249 int index = 0; 1250 boolean matchLimited = limit > 0; 1251 ArrayList<String> matchList = new ArrayList<>(); 1252 Matcher m = matcher(input); 1253 1254 // Add segments before each match found 1255 while(m.find()) { 1256 if (!matchLimited || matchList.size() < limit - 1) { 1257 if (index == 0 && index == m.start() && m.start() == m.end()) { 1258 // no empty leading substring included for zero-width match 1259 // at the beginning of the input char sequence. 1260 continue; 1261 } 1262 String match = input.subSequence(index, m.start()).toString(); 1263 matchList.add(match); 1264 index = m.end(); 1265 } else if (matchList.size() == limit - 1) { // last one 1266 String match = input.subSequence(index, 1267 input.length()).toString(); 1268 matchList.add(match); 1269 index = m.end(); 1270 } 1271 } 1272 1273 // If no match was found, return this 1274 if (index == 0) 1275 return new String[] {input.toString()}; 1276 1277 // Add remaining segment 1278 if (!matchLimited || matchList.size() < limit) 1279 matchList.add(input.subSequence(index, input.length()).toString()); 1280 1281 // Construct result 1282 int resultSize = matchList.size(); 1283 if (limit == 0) 1284 while (resultSize > 0 && matchList.get(resultSize-1).equals("")) 1285 resultSize--; 1286 String[] result = new String[resultSize]; 1287 return matchList.subList(0, resultSize).toArray(result); 1288 } 1289 1290 /** 1291 * Splits the given input sequence around matches of this pattern. 1292 * 1293 * <p> This method works as if by invoking the two-argument {@link 1294 * #split(java.lang.CharSequence, int) split} method with the given input 1295 * sequence and a limit argument of zero. Trailing empty strings are 1296 * therefore not included in the resulting array. </p> 1297 * 1298 * <p> The input {@code "boo:and:foo"}, for example, yields the following 1299 * results with these expressions: 1300 * 1301 * <table class="plain" style="margin-left:2em"> 1302 * <caption style="display:none">Split examples showing regex and result</caption> 1303 * <thead> 1304 * <tr> 1305 * <th scope="col">Regex</th> 1306 * <th scope="col">Result</th> 1307 * </tr> 1308 * </thead> 1309 * <tbody> 1310 * <tr><th scope="row" style="text-weight:normal">:</th> 1311 * <td>{@code { "boo", "and", "foo" }}</td></tr> 1312 * <tr><th scope="row" style="text-weight:normal">o</th> 1313 * <td>{@code { "b", "", ":and:f" }}</td></tr> 1314 * </tbody> 1315 * </table> 1316 * 1317 * 1318 * @param input 1319 * The character sequence to be split 1320 * 1321 * @return The array of strings computed by splitting the input 1322 * around matches of this pattern 1323 */ 1324 public String[] split(CharSequence input) { 1325 return split(input, 0); 1326 } 1327 1328 /** 1329 * Returns a literal pattern {@code String} for the specified 1330 * {@code String}. 1331 * 1332 * <p>This method produces a {@code String} that can be used to 1333 * create a {@code Pattern} that would match the string 1334 * {@code s} as if it were a literal pattern.</p> Metacharacters 1335 * or escape sequences in the input sequence will be given no special 1336 * meaning. 1337 * 1338 * @param s The string to be literalized 1339 * @return A literal string replacement 1340 * @since 1.5 1341 */ 1342 public static String quote(String s) { 1343 int slashEIndex = s.indexOf("\\E"); 1344 if (slashEIndex == -1) 1345 return "\\Q" + s + "\\E"; 1346 1347 int lenHint = s.length(); 1348 lenHint = (lenHint < Integer.MAX_VALUE - 8 - lenHint) ? 1349 (lenHint << 1) : (Integer.MAX_VALUE - 8); 1350 1351 StringBuilder sb = new StringBuilder(lenHint); 1352 sb.append("\\Q"); 1353 int current = 0; 1354 do { 1355 sb.append(s, current, slashEIndex) 1356 .append("\\E\\\\E\\Q"); 1357 current = slashEIndex + 2; 1358 } while ((slashEIndex = s.indexOf("\\E", current)) != -1); 1359 1360 return sb.append(s, current, s.length()) 1361 .append("\\E") 1362 .toString(); 1363 } 1364 1365 /** 1366 * Recompile the Pattern instance from a stream. The original pattern 1367 * string is read in and the object tree is recompiled from it. 1368 */ 1369 private void readObject(java.io.ObjectInputStream s) 1370 throws java.io.IOException, ClassNotFoundException { 1371 1372 // Read in all fields 1373 s.defaultReadObject(); 1374 1375 // reset the flags 1376 flags0 = flags; 1377 1378 // Initialize counts 1379 capturingGroupCount = 1; 1380 localCount = 0; 1381 localTCNCount = 0; 1382 1383 // if length > 0, the Pattern is lazily compiled 1384 if (pattern.length() == 0) { 1385 root = new Start(lastAccept); 1386 matchRoot = lastAccept; 1387 compiled = true; 1388 } 1389 } 1390 1391 /** 1392 * This private constructor is used to create all Patterns. The pattern 1393 * string and match flags are all that is needed to completely describe 1394 * a Pattern. An empty pattern string results in an object tree with 1395 * only a Start node and a LastNode node. 1396 */ 1397 private Pattern(String p, int f) { 1398 if ((f & ~ALL_FLAGS) != 0) { 1399 throw new IllegalArgumentException("Unknown flag 0x" 1400 + Integer.toHexString(f)); 1401 } 1402 pattern = p; 1403 flags = f; 1404 1405 // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present 1406 if ((flags & UNICODE_CHARACTER_CLASS) != 0) 1407 flags |= UNICODE_CASE; 1408 1409 // 'flags' for compiling 1410 flags0 = flags; 1411 1412 // Reset group index count 1413 capturingGroupCount = 1; 1414 localCount = 0; 1415 localTCNCount = 0; 1416 1417 if (pattern.length() > 0) { 1418 compile(); 1419 } else { 1420 root = new Start(lastAccept); 1421 matchRoot = lastAccept; 1422 } 1423 } 1424 1425 /** 1426 * The pattern is converted to normalized form ({@link 1427 * java.text.Normalizer.Form.NFC NFC}, canonical decomposition, 1428 * followed by canonical composition for the character class 1429 * part, and {@link java.text.Normalizer.Form.NFD NFD}, 1430 * canonical decomposition) for the rest), and then a pure 1431 * group is constructed to match canonical equivalences of the 1432 * characters. 1433 */ 1434 private static String normalize(String pattern) { 1435 int plen = pattern.length(); 1436 StringBuilder pbuf = new StringBuilder(plen); 1437 char last = 0; 1438 int lastStart = 0; 1439 char cc = 0; 1440 for (int i = 0; i < plen;) { 1441 char c = pattern.charAt(i); 1442 if (cc == 0 && // top level 1443 c == '\\' && i + 1 < plen && pattern.charAt(i + 1) == '\\') { 1444 i += 2; last = 0; 1445 continue; 1446 } 1447 if (c == '[' && last != '\\') { 1448 if (cc == 0) { 1449 if (lastStart < i) 1450 normalizeSlice(pattern, lastStart, i, pbuf); 1451 lastStart = i; 1452 } 1453 cc++; 1454 } else if (c == ']' && last != '\\') { 1455 cc--; 1456 if (cc == 0) { 1457 normalizeClazz(pattern, lastStart, i + 1, pbuf); 1458 lastStart = i + 1; 1459 } 1460 } 1461 last = c; 1462 i++; 1463 } 1464 assert (cc == 0); 1465 if (lastStart < plen) 1466 normalizeSlice(pattern, lastStart, plen, pbuf); 1467 return pbuf.toString(); 1468 } 1469 1470 private static void normalizeSlice(String src, int off, int limit, 1471 StringBuilder dst) 1472 { 1473 int len = src.length(); 1474 int off0 = off; 1475 while (off < limit && ASCII.isAscii(src.charAt(off))) { 1476 off++; 1477 } 1478 if (off == limit) { 1479 dst.append(src, off0, limit); 1480 return; 1481 } 1482 off--; 1483 if (off < off0) 1484 off = off0; 1485 else 1486 dst.append(src, off0, off); 1487 while (off < limit) { 1488 int ch0 = src.codePointAt(off); 1489 if (".$|()[]{}^?*+\\".indexOf(ch0) != -1) { 1490 dst.append((char)ch0); 1491 off++; 1492 continue; 1493 } 1494 int j = off + Character.charCount(ch0); 1495 int ch1; 1496 while (j < limit) { 1497 ch1 = src.codePointAt(j); 1498 if (Grapheme.isBoundary(ch0, ch1)) 1499 break; 1500 ch0 = ch1; 1501 j += Character.charCount(ch1); 1502 } 1503 String seq = src.substring(off, j); 1504 String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD); 1505 off = j; 1506 if (nfd.length() > 1) { 1507 ch0 = nfd.codePointAt(0); 1508 ch1 = nfd.codePointAt(Character.charCount(ch0)); 1509 if (Character.getType(ch1) == Character.NON_SPACING_MARK) { 1510 Set<String> altns = new LinkedHashSet<>(); 1511 altns.add(seq); 1512 produceEquivalentAlternation(nfd, altns); 1513 dst.append("(?:"); 1514 altns.forEach( s -> dst.append(s).append('|')); 1515 dst.delete(dst.length() - 1, dst.length()); 1516 dst.append(")"); 1517 continue; 1518 } 1519 } 1520 String nfc = Normalizer.normalize(seq, Normalizer.Form.NFC); 1521 if (!seq.equals(nfc) && !nfd.equals(nfc)) 1522 dst.append("(?:" + seq + "|" + nfd + "|" + nfc + ")"); 1523 else if (!seq.equals(nfd)) 1524 dst.append("(?:" + seq + "|" + nfd + ")"); 1525 else 1526 dst.append(seq); 1527 } 1528 } 1529 1530 private static void normalizeClazz(String src, int off, int limit, 1531 StringBuilder dst) 1532 { 1533 dst.append(Normalizer.normalize(src.substring(off, limit), Form.NFC)); 1534 } 1535 1536 /** 1537 * Given a specific sequence composed of a regular character and 1538 * combining marks that follow it, produce the alternation that will 1539 * match all canonical equivalences of that sequence. 1540 */ 1541 private static void produceEquivalentAlternation(String src, 1542 Set<String> dst) 1543 { 1544 int len = countChars(src, 0, 1); 1545 if (src.length() == len) { 1546 dst.add(src); // source has one character. 1547 return; 1548 } 1549 String base = src.substring(0,len); 1550 String combiningMarks = src.substring(len); 1551 String[] perms = producePermutations(combiningMarks); 1552 // Add combined permutations 1553 for(int x = 0; x < perms.length; x++) { 1554 String next = base + perms[x]; 1555 dst.add(next); 1556 next = composeOneStep(next); 1557 if (next != null) { 1558 produceEquivalentAlternation(next, dst); 1559 } 1560 } 1561 } 1562 1563 /** 1564 * Returns an array of strings that have all the possible 1565 * permutations of the characters in the input string. 1566 * This is used to get a list of all possible orderings 1567 * of a set of combining marks. Note that some of the permutations 1568 * are invalid because of combining class collisions, and these 1569 * possibilities must be removed because they are not canonically 1570 * equivalent. 1571 */ 1572 private static String[] producePermutations(String input) { 1573 if (input.length() == countChars(input, 0, 1)) 1574 return new String[] {input}; 1575 1576 if (input.length() == countChars(input, 0, 2)) { 1577 int c0 = Character.codePointAt(input, 0); 1578 int c1 = Character.codePointAt(input, Character.charCount(c0)); 1579 if (getClass(c1) == getClass(c0)) { 1580 return new String[] {input}; 1581 } 1582 String[] result = new String[2]; 1583 result[0] = input; 1584 StringBuilder sb = new StringBuilder(2); 1585 sb.appendCodePoint(c1); 1586 sb.appendCodePoint(c0); 1587 result[1] = sb.toString(); 1588 return result; 1589 } 1590 1591 int length = 1; 1592 int nCodePoints = countCodePoints(input); 1593 for(int x=1; x<nCodePoints; x++) 1594 length = length * (x+1); 1595 1596 String[] temp = new String[length]; 1597 1598 int combClass[] = new int[nCodePoints]; 1599 for(int x=0, i=0; x<nCodePoints; x++) { 1600 int c = Character.codePointAt(input, i); 1601 combClass[x] = getClass(c); 1602 i += Character.charCount(c); 1603 } 1604 1605 // For each char, take it out and add the permutations 1606 // of the remaining chars 1607 int index = 0; 1608 int len; 1609 // offset maintains the index in code units. 1610 loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) { 1611 len = countChars(input, offset, 1); 1612 for(int y=x-1; y>=0; y--) { 1613 if (combClass[y] == combClass[x]) { 1614 continue loop; 1615 } 1616 } 1617 StringBuilder sb = new StringBuilder(input); 1618 String otherChars = sb.delete(offset, offset+len).toString(); 1619 String[] subResult = producePermutations(otherChars); 1620 1621 String prefix = input.substring(offset, offset+len); 1622 for (String sre : subResult) 1623 temp[index++] = prefix + sre; 1624 } 1625 String[] result = new String[index]; 1626 System.arraycopy(temp, 0, result, 0, index); 1627 return result; 1628 } 1629 1630 private static int getClass(int c) { 1631 return sun.text.Normalizer.getCombiningClass(c); 1632 } 1633 1634 /** 1635 * Attempts to compose input by combining the first character 1636 * with the first combining mark following it. Returns a String 1637 * that is the composition of the leading character with its first 1638 * combining mark followed by the remaining combining marks. Returns 1639 * null if the first two characters cannot be further composed. 1640 */ 1641 private static String composeOneStep(String input) { 1642 int len = countChars(input, 0, 2); 1643 String firstTwoCharacters = input.substring(0, len); 1644 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC); 1645 if (result.equals(firstTwoCharacters)) 1646 return null; 1647 else { 1648 String remainder = input.substring(len); 1649 return result + remainder; 1650 } 1651 } 1652 1653 /** 1654 * Preprocess any \Q...\E sequences in `temp', meta-quoting them. 1655 * See the description of `quotemeta' in perlfunc(1). 1656 */ 1657 private void RemoveQEQuoting() { 1658 final int pLen = patternLength; 1659 int i = 0; 1660 while (i < pLen-1) { 1661 if (temp[i] != '\\') 1662 i += 1; 1663 else if (temp[i + 1] != 'Q') 1664 i += 2; 1665 else 1666 break; 1667 } 1668 if (i >= pLen - 1) // No \Q sequence found 1669 return; 1670 int j = i; 1671 i += 2; 1672 int[] newtemp = new int[j + 3*(pLen-i) + 2]; 1673 System.arraycopy(temp, 0, newtemp, 0, j); 1674 1675 boolean inQuote = true; 1676 boolean beginQuote = true; 1677 while (i < pLen) { 1678 int c = temp[i++]; 1679 if (!ASCII.isAscii(c) || ASCII.isAlpha(c)) { 1680 newtemp[j++] = c; 1681 } else if (ASCII.isDigit(c)) { 1682 if (beginQuote) { 1683 /* 1684 * A unicode escape \[0xu] could be before this quote, 1685 * and we don't want this numeric char to processed as 1686 * part of the escape. 1687 */ 1688 newtemp[j++] = '\\'; 1689 newtemp[j++] = 'x'; 1690 newtemp[j++] = '3'; 1691 } 1692 newtemp[j++] = c; 1693 } else if (c != '\\') { 1694 if (inQuote) newtemp[j++] = '\\'; 1695 newtemp[j++] = c; 1696 } else if (inQuote) { 1697 if (temp[i] == 'E') { 1698 i++; 1699 inQuote = false; 1700 } else { 1701 newtemp[j++] = '\\'; 1702 newtemp[j++] = '\\'; 1703 } 1704 } else { 1705 if (temp[i] == 'Q') { 1706 i++; 1707 inQuote = true; 1708 beginQuote = true; 1709 continue; 1710 } else { 1711 newtemp[j++] = c; 1712 if (i != pLen) 1713 newtemp[j++] = temp[i++]; 1714 } 1715 } 1716 1717 beginQuote = false; 1718 } 1719 1720 patternLength = j; 1721 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination 1722 } 1723 1724 /** 1725 * Copies regular expression to an int array and invokes the parsing 1726 * of the expression which will create the object tree. 1727 */ 1728 private void compile() { 1729 // Handle canonical equivalences 1730 if (has(CANON_EQ) && !has(LITERAL)) { 1731 normalizedPattern = normalize(pattern); 1732 } else { 1733 normalizedPattern = pattern; 1734 } 1735 patternLength = normalizedPattern.length(); 1736 1737 // Copy pattern to int array for convenience 1738 // Use double zero to terminate pattern 1739 temp = new int[patternLength + 2]; 1740 1741 hasSupplementary = false; 1742 int c, count = 0; 1743 // Convert all chars into code points 1744 for (int x = 0; x < patternLength; x += Character.charCount(c)) { 1745 c = normalizedPattern.codePointAt(x); 1746 if (isSupplementary(c)) { 1747 hasSupplementary = true; 1748 } 1749 temp[count++] = c; 1750 } 1751 1752 patternLength = count; // patternLength now in code points 1753 1754 if (! has(LITERAL)) 1755 RemoveQEQuoting(); 1756 1757 // Allocate all temporary objects here. 1758 buffer = new int[32]; 1759 groupNodes = new GroupHead[10]; 1760 namedGroups = null; 1761 topClosureNodes = new ArrayList<>(10); 1762 1763 if (has(LITERAL)) { 1764 // Literal pattern handling 1765 matchRoot = newSlice(temp, patternLength, hasSupplementary); 1766 matchRoot.next = lastAccept; 1767 } else { 1768 // Start recursive descent parsing 1769 matchRoot = expr(lastAccept); 1770 // Check extra pattern characters 1771 if (patternLength != cursor) { 1772 if (peek() == ')') { 1773 throw error("Unmatched closing ')'"); 1774 } else { 1775 throw error("Unexpected internal error"); 1776 } 1777 } 1778 } 1779 1780 // Peephole optimization 1781 if (matchRoot instanceof Slice) { 1782 root = BnM.optimize(matchRoot); 1783 if (root == matchRoot) { 1784 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1785 } 1786 } else if (matchRoot instanceof Begin || matchRoot instanceof First) { 1787 root = matchRoot; 1788 } else { 1789 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot); 1790 } 1791 1792 // Optimize the greedy Loop to prevent exponential backtracking, IF there 1793 // is no group ref in this pattern. With a non-negative localTCNCount value, 1794 // the greedy type Loop, Curly will skip the backtracking for any starting 1795 // position "i" that failed in the past. 1796 if (!hasGroupRef) { 1797 for (Node node : topClosureNodes) { 1798 if (node instanceof Loop) { 1799 // non-deterministic-greedy-group 1800 ((Loop)node).posIndex = localTCNCount++; 1801 } 1802 } 1803 } 1804 1805 // Release temporary storage 1806 temp = null; 1807 buffer = null; 1808 groupNodes = null; 1809 patternLength = 0; 1810 compiled = true; 1811 topClosureNodes = null; 1812 } 1813 1814 Map<String, Integer> namedGroups() { 1815 Map<String, Integer> groups = namedGroups; 1816 if (groups == null) { 1817 namedGroups = groups = new HashMap<>(2); 1818 } 1819 return groups; 1820 } 1821 1822 /** 1823 * Used to accumulate information about a subtree of the object graph 1824 * so that optimizations can be applied to the subtree. 1825 */ 1826 static final class TreeInfo { 1827 int minLength; 1828 int maxLength; 1829 boolean maxValid; 1830 boolean deterministic; 1831 1832 TreeInfo() { 1833 reset(); 1834 } 1835 void reset() { 1836 minLength = 0; 1837 maxLength = 0; 1838 maxValid = true; 1839 deterministic = true; 1840 } 1841 } 1842 1843 /* 1844 * The following private methods are mainly used to improve the 1845 * readability of the code. In order to let the Java compiler easily 1846 * inline them, we should not put many assertions or error checks in them. 1847 */ 1848 1849 /** 1850 * Indicates whether a particular flag is set or not. 1851 */ 1852 private boolean has(int f) { 1853 return (flags0 & f) != 0; 1854 } 1855 1856 /** 1857 * Match next character, signal error if failed. 1858 */ 1859 private void accept(int ch, String s) { 1860 int testChar = temp[cursor++]; 1861 if (has(COMMENTS)) 1862 testChar = parsePastWhitespace(testChar); 1863 if (ch != testChar) { 1864 throw error(s); 1865 } 1866 } 1867 1868 /** 1869 * Mark the end of pattern with a specific character. 1870 */ 1871 private void mark(int c) { 1872 temp[patternLength] = c; 1873 } 1874 1875 /** 1876 * Peek the next character, and do not advance the cursor. 1877 */ 1878 private int peek() { 1879 int ch = temp[cursor]; 1880 if (has(COMMENTS)) 1881 ch = peekPastWhitespace(ch); 1882 return ch; 1883 } 1884 1885 /** 1886 * Read the next character, and advance the cursor by one. 1887 */ 1888 private int read() { 1889 int ch = temp[cursor++]; 1890 if (has(COMMENTS)) 1891 ch = parsePastWhitespace(ch); 1892 return ch; 1893 } 1894 1895 /** 1896 * Read the next character, and advance the cursor by one, 1897 * ignoring the COMMENTS setting 1898 */ 1899 private int readEscaped() { 1900 int ch = temp[cursor++]; 1901 return ch; 1902 } 1903 1904 /** 1905 * Advance the cursor by one, and peek the next character. 1906 */ 1907 private int next() { 1908 int ch = temp[++cursor]; 1909 if (has(COMMENTS)) 1910 ch = peekPastWhitespace(ch); 1911 return ch; 1912 } 1913 1914 /** 1915 * Advance the cursor by one, and peek the next character, 1916 * ignoring the COMMENTS setting 1917 */ 1918 private int nextEscaped() { 1919 int ch = temp[++cursor]; 1920 return ch; 1921 } 1922 1923 /** 1924 * If in xmode peek past whitespace and comments. 1925 */ 1926 private int peekPastWhitespace(int ch) { 1927 while (ASCII.isSpace(ch) || ch == '#') { 1928 while (ASCII.isSpace(ch)) 1929 ch = temp[++cursor]; 1930 if (ch == '#') { 1931 ch = peekPastLine(); 1932 } 1933 } 1934 return ch; 1935 } 1936 1937 /** 1938 * If in xmode parse past whitespace and comments. 1939 */ 1940 private int parsePastWhitespace(int ch) { 1941 while (ASCII.isSpace(ch) || ch == '#') { 1942 while (ASCII.isSpace(ch)) 1943 ch = temp[cursor++]; 1944 if (ch == '#') 1945 ch = parsePastLine(); 1946 } 1947 return ch; 1948 } 1949 1950 /** 1951 * xmode parse past comment to end of line. 1952 */ 1953 private int parsePastLine() { 1954 int ch = temp[cursor++]; 1955 while (ch != 0 && !isLineSeparator(ch)) 1956 ch = temp[cursor++]; 1957 return ch; 1958 } 1959 1960 /** 1961 * xmode peek past comment to end of line. 1962 */ 1963 private int peekPastLine() { 1964 int ch = temp[++cursor]; 1965 while (ch != 0 && !isLineSeparator(ch)) 1966 ch = temp[++cursor]; 1967 return ch; 1968 } 1969 1970 /** 1971 * Determines if character is a line separator in the current mode 1972 */ 1973 private boolean isLineSeparator(int ch) { 1974 if (has(UNIX_LINES)) { 1975 return ch == '\n'; 1976 } else { 1977 return (ch == '\n' || 1978 ch == '\r' || 1979 (ch|1) == '\u2029' || 1980 ch == '\u0085'); 1981 } 1982 } 1983 1984 /** 1985 * Read the character after the next one, and advance the cursor by two. 1986 */ 1987 private int skip() { 1988 int i = cursor; 1989 int ch = temp[i+1]; 1990 cursor = i + 2; 1991 return ch; 1992 } 1993 1994 /** 1995 * Unread one next character, and retreat cursor by one. 1996 */ 1997 private void unread() { 1998 cursor--; 1999 } 2000 2001 /** 2002 * Internal method used for handling all syntax errors. The pattern is 2003 * displayed with a pointer to aid in locating the syntax error. 2004 */ 2005 private PatternSyntaxException error(String s) { 2006 return new PatternSyntaxException(s, normalizedPattern, cursor - 1); 2007 } 2008 2009 /** 2010 * Determines if there is any supplementary character or unpaired 2011 * surrogate in the specified range. 2012 */ 2013 private boolean findSupplementary(int start, int end) { 2014 for (int i = start; i < end; i++) { 2015 if (isSupplementary(temp[i])) 2016 return true; 2017 } 2018 return false; 2019 } 2020 2021 /** 2022 * Determines if the specified code point is a supplementary 2023 * character or unpaired surrogate. 2024 */ 2025 private static final boolean isSupplementary(int ch) { 2026 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || 2027 Character.isSurrogate((char)ch); 2028 } 2029 2030 /** 2031 * The following methods handle the main parsing. They are sorted 2032 * according to their precedence order, the lowest one first. 2033 */ 2034 2035 /** 2036 * The expression is parsed with branch nodes added for alternations. 2037 * This may be called recursively to parse sub expressions that may 2038 * contain alternations. 2039 */ 2040 private Node expr(Node end) { 2041 Node prev = null; 2042 Node firstTail = null; 2043 Branch branch = null; 2044 Node branchConn = null; 2045 2046 for (;;) { 2047 Node node = sequence(end); 2048 Node nodeTail = root; //double return 2049 if (prev == null) { 2050 prev = node; 2051 firstTail = nodeTail; 2052 } else { 2053 // Branch 2054 if (branchConn == null) { 2055 branchConn = new BranchConn(); 2056 branchConn.next = end; 2057 } 2058 if (node == end) { 2059 // if the node returned from sequence() is "end" 2060 // we have an empty expr, set a null atom into 2061 // the branch to indicate to go "next" directly. 2062 node = null; 2063 } else { 2064 // the "tail.next" of each atom goes to branchConn 2065 nodeTail.next = branchConn; 2066 } 2067 if (prev == branch) { 2068 branch.add(node); 2069 } else { 2070 if (prev == end) { 2071 prev = null; 2072 } else { 2073 // replace the "end" with "branchConn" at its tail.next 2074 // when put the "prev" into the branch as the first atom. 2075 firstTail.next = branchConn; 2076 } 2077 prev = branch = new Branch(prev, node, branchConn); 2078 } 2079 } 2080 if (peek() != '|') { 2081 return prev; 2082 } 2083 next(); 2084 } 2085 } 2086 2087 @SuppressWarnings("fallthrough") 2088 /** 2089 * Parsing of sequences between alternations. 2090 */ 2091 private Node sequence(Node end) { 2092 Node head = null; 2093 Node tail = null; 2094 Node node = null; 2095 LOOP: 2096 for (;;) { 2097 int ch = peek(); 2098 switch (ch) { 2099 case '(': 2100 // Because group handles its own closure, 2101 // we need to treat it differently 2102 node = group0(); 2103 // Check for comment or flag group 2104 if (node == null) 2105 continue; 2106 if (head == null) 2107 head = node; 2108 else 2109 tail.next = node; 2110 // Double return: Tail was returned in root 2111 tail = root; 2112 continue; 2113 case '[': 2114 if (has(CANON_EQ) && !has(LITERAL)) 2115 node = new NFCCharProperty(clazz(true)); 2116 else 2117 node = newCharProperty(clazz(true)); 2118 break; 2119 case '\\': 2120 ch = nextEscaped(); 2121 if (ch == 'p' || ch == 'P') { 2122 boolean oneLetter = true; 2123 boolean comp = (ch == 'P'); 2124 ch = next(); // Consume { if present 2125 if (ch != '{') { 2126 unread(); 2127 } else { 2128 oneLetter = false; 2129 } 2130 // node = newCharProperty(family(oneLetter, comp)); 2131 if (has(CANON_EQ) && !has(LITERAL)) 2132 node = new NFCCharProperty(family(oneLetter, comp)); 2133 else 2134 node = newCharProperty(family(oneLetter, comp)); 2135 } else { 2136 unread(); 2137 node = atom(); 2138 } 2139 break; 2140 case '^': 2141 next(); 2142 if (has(MULTILINE)) { 2143 if (has(UNIX_LINES)) 2144 node = new UnixCaret(); 2145 else 2146 node = new Caret(); 2147 } else { 2148 node = new Begin(); 2149 } 2150 break; 2151 case '$': 2152 next(); 2153 if (has(UNIX_LINES)) 2154 node = new UnixDollar(has(MULTILINE)); 2155 else 2156 node = new Dollar(has(MULTILINE)); 2157 break; 2158 case '.': 2159 next(); 2160 if (has(DOTALL)) { 2161 node = new CharProperty(ALL()); 2162 } else { 2163 if (has(UNIX_LINES)) { 2164 node = new CharProperty(UNIXDOT()); 2165 } else { 2166 node = new CharProperty(DOT()); 2167 } 2168 } 2169 break; 2170 case '|': 2171 case ')': 2172 break LOOP; 2173 case ']': // Now interpreting dangling ] and } as literals 2174 case '}': 2175 node = atom(); 2176 break; 2177 case '?': 2178 case '*': 2179 case '+': 2180 next(); 2181 throw error("Dangling meta character '" + ((char)ch) + "'"); 2182 case 0: 2183 if (cursor >= patternLength) { 2184 break LOOP; 2185 } 2186 // Fall through 2187 default: 2188 node = atom(); 2189 break; 2190 } 2191 2192 node = closure(node); 2193 /* save the top dot-greedy nodes (.*, .+) as well 2194 if (node instanceof GreedyCharProperty && 2195 ((GreedyCharProperty)node).cp instanceof Dot) { 2196 topClosureNodes.add(node); 2197 } 2198 */ 2199 if (head == null) { 2200 head = tail = node; 2201 } else { 2202 tail.next = node; 2203 tail = node; 2204 } 2205 } 2206 if (head == null) { 2207 return end; 2208 } 2209 tail.next = end; 2210 root = tail; //double return 2211 return head; 2212 } 2213 2214 @SuppressWarnings("fallthrough") 2215 /** 2216 * Parse and add a new Single or Slice. 2217 */ 2218 private Node atom() { 2219 int first = 0; 2220 int prev = -1; 2221 boolean hasSupplementary = false; 2222 int ch = peek(); 2223 for (;;) { 2224 switch (ch) { 2225 case '*': 2226 case '+': 2227 case '?': 2228 case '{': 2229 if (first > 1) { 2230 cursor = prev; // Unwind one character 2231 first--; 2232 } 2233 break; 2234 case '$': 2235 case '.': 2236 case '^': 2237 case '(': 2238 case '[': 2239 case '|': 2240 case ')': 2241 break; 2242 case '\\': 2243 ch = nextEscaped(); 2244 if (ch == 'p' || ch == 'P') { // Property 2245 if (first > 0) { // Slice is waiting; handle it first 2246 unread(); 2247 break; 2248 } else { // No slice; just return the family node 2249 boolean comp = (ch == 'P'); 2250 boolean oneLetter = true; 2251 ch = next(); // Consume { if present 2252 if (ch != '{') 2253 unread(); 2254 else 2255 oneLetter = false; 2256 if (has(CANON_EQ) && !has(LITERAL)) 2257 return new NFCCharProperty(family(oneLetter, comp)); 2258 else 2259 return newCharProperty(family(oneLetter, comp)); 2260 } 2261 } 2262 unread(); 2263 prev = cursor; 2264 ch = escape(false, first == 0, false); 2265 if (ch >= 0) { 2266 append(ch, first); 2267 first++; 2268 if (isSupplementary(ch)) { 2269 hasSupplementary = true; 2270 } 2271 ch = peek(); 2272 continue; 2273 } else if (first == 0) { 2274 return root; 2275 } 2276 // Unwind meta escape sequence 2277 cursor = prev; 2278 break; 2279 case 0: 2280 if (cursor >= patternLength) { 2281 break; 2282 } 2283 // Fall through 2284 default: 2285 prev = cursor; 2286 append(ch, first); 2287 first++; 2288 if (isSupplementary(ch)) { 2289 hasSupplementary = true; 2290 } 2291 ch = next(); 2292 continue; 2293 } 2294 break; 2295 } 2296 if (first == 1) { 2297 return newCharProperty(single(buffer[0])); 2298 } else { 2299 return newSlice(buffer, first, hasSupplementary); 2300 } 2301 } 2302 2303 private void append(int ch, int len) { 2304 if (len >= buffer.length) { 2305 int[] tmp = new int[len+len]; 2306 System.arraycopy(buffer, 0, tmp, 0, len); 2307 buffer = tmp; 2308 } 2309 buffer[len] = ch; 2310 } 2311 2312 /** 2313 * Parses a backref greedily, taking as many numbers as it 2314 * can. The first digit is always treated as a backref, but 2315 * multi digit numbers are only treated as a backref if at 2316 * least that many backrefs exist at this point in the regex. 2317 */ 2318 private Node ref(int refNum) { 2319 boolean done = false; 2320 while(!done) { 2321 int ch = peek(); 2322 switch(ch) { 2323 case '0': 2324 case '1': 2325 case '2': 2326 case '3': 2327 case '4': 2328 case '5': 2329 case '6': 2330 case '7': 2331 case '8': 2332 case '9': 2333 int newRefNum = (refNum * 10) + (ch - '0'); 2334 // Add another number if it doesn't make a group 2335 // that doesn't exist 2336 if (capturingGroupCount - 1 < newRefNum) { 2337 done = true; 2338 break; 2339 } 2340 refNum = newRefNum; 2341 read(); 2342 break; 2343 default: 2344 done = true; 2345 break; 2346 } 2347 } 2348 hasGroupRef = true; 2349 if (has(CASE_INSENSITIVE)) 2350 return new CIBackRef(refNum, has(UNICODE_CASE)); 2351 else 2352 return new BackRef(refNum); 2353 } 2354 2355 /** 2356 * Parses an escape sequence to determine the actual value that needs 2357 * to be matched. 2358 * If -1 is returned and create was true a new object was added to the tree 2359 * to handle the escape sequence. 2360 * If the returned value is greater than zero, it is the value that 2361 * matches the escape sequence. 2362 */ 2363 private int escape(boolean inclass, boolean create, boolean isrange) { 2364 int ch = skip(); 2365 switch (ch) { 2366 case '0': 2367 return o(); 2368 case '1': 2369 case '2': 2370 case '3': 2371 case '4': 2372 case '5': 2373 case '6': 2374 case '7': 2375 case '8': 2376 case '9': 2377 if (inclass) break; 2378 if (create) { 2379 root = ref((ch - '0')); 2380 } 2381 return -1; 2382 case 'A': 2383 if (inclass) break; 2384 if (create) root = new Begin(); 2385 return -1; 2386 case 'B': 2387 if (inclass) break; 2388 if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS)); 2389 return -1; 2390 case 'C': 2391 break; 2392 case 'D': 2393 if (create) { 2394 predicate = has(UNICODE_CHARACTER_CLASS) ? 2395 CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT(); 2396 predicate = predicate.negate(); 2397 if (!inclass) 2398 root = newCharProperty(predicate); 2399 } 2400 return -1; 2401 case 'E': 2402 case 'F': 2403 break; 2404 case 'G': 2405 if (inclass) break; 2406 if (create) root = new LastMatch(); 2407 return -1; 2408 case 'H': 2409 if (create) { 2410 predicate = HorizWS().negate(); 2411 if (!inclass) 2412 root = newCharProperty(predicate); 2413 } 2414 return -1; 2415 case 'I': 2416 case 'J': 2417 case 'K': 2418 case 'L': 2419 case 'M': 2420 break; 2421 case 'N': 2422 return N(); 2423 case 'O': 2424 case 'P': 2425 case 'Q': 2426 break; 2427 case 'R': 2428 if (inclass) break; 2429 if (create) root = new LineEnding(); 2430 return -1; 2431 case 'S': 2432 if (create) { 2433 predicate = has(UNICODE_CHARACTER_CLASS) ? 2434 CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE(); 2435 predicate = predicate.negate(); 2436 if (!inclass) 2437 root = newCharProperty(predicate); 2438 } 2439 return -1; 2440 case 'T': 2441 case 'U': 2442 break; 2443 case 'V': 2444 if (create) { 2445 predicate = VertWS().negate(); 2446 if (!inclass) 2447 root = newCharProperty(predicate); 2448 } 2449 return -1; 2450 case 'W': 2451 if (create) { 2452 predicate = has(UNICODE_CHARACTER_CLASS) ? 2453 CharPredicates.WORD() : CharPredicates.ASCII_WORD(); 2454 predicate = predicate.negate(); 2455 if (!inclass) 2456 root = newCharProperty(predicate); 2457 } 2458 return -1; 2459 case 'X': 2460 if (inclass) break; 2461 if (create) { 2462 root = new XGrapheme(); 2463 } 2464 return -1; 2465 case 'Y': 2466 break; 2467 case 'Z': 2468 if (inclass) break; 2469 if (create) { 2470 if (has(UNIX_LINES)) 2471 root = new UnixDollar(false); 2472 else 2473 root = new Dollar(false); 2474 } 2475 return -1; 2476 case 'a': 2477 return '\007'; 2478 case 'b': 2479 if (inclass) break; 2480 if (create) { 2481 if (peek() == '{') { 2482 if (skip() == 'g') { 2483 if (read() == '}') { 2484 root = new GraphemeBound(); 2485 return -1; 2486 } 2487 break; // error missing trailing } 2488 } 2489 unread(); unread(); 2490 } 2491 root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS)); 2492 } 2493 return -1; 2494 case 'c': 2495 return c(); 2496 case 'd': 2497 if (create) { 2498 predicate = has(UNICODE_CHARACTER_CLASS) ? 2499 CharPredicates.DIGIT() : CharPredicates.ASCII_DIGIT(); 2500 if (!inclass) 2501 root = newCharProperty(predicate); 2502 } 2503 return -1; 2504 case 'e': 2505 return '\033'; 2506 case 'f': 2507 return '\f'; 2508 case 'g': 2509 break; 2510 case 'h': 2511 if (create) { 2512 predicate = HorizWS(); 2513 if (!inclass) 2514 root = newCharProperty(predicate); 2515 } 2516 return -1; 2517 case 'i': 2518 case 'j': 2519 break; 2520 case 'k': 2521 if (inclass) 2522 break; 2523 if (read() != '<') 2524 throw error("\\k is not followed by '<' for named capturing group"); 2525 String name = groupname(read()); 2526 if (!namedGroups().containsKey(name)) 2527 throw error("named capturing group <" + name + "> does not exist"); 2528 if (create) { 2529 hasGroupRef = true; 2530 if (has(CASE_INSENSITIVE)) 2531 root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE)); 2532 else 2533 root = new BackRef(namedGroups().get(name)); 2534 } 2535 return -1; 2536 case 'l': 2537 case 'm': 2538 break; 2539 case 'n': 2540 return '\n'; 2541 case 'o': 2542 case 'p': 2543 case 'q': 2544 break; 2545 case 'r': 2546 return '\r'; 2547 case 's': 2548 if (create) { 2549 predicate = has(UNICODE_CHARACTER_CLASS) ? 2550 CharPredicates.WHITE_SPACE() : CharPredicates.ASCII_SPACE(); 2551 if (!inclass) 2552 root = newCharProperty(predicate); 2553 } 2554 return -1; 2555 case 't': 2556 return '\t'; 2557 case 'u': 2558 return u(); 2559 case 'v': 2560 // '\v' was implemented as VT/0x0B in releases < 1.8 (though 2561 // undocumented). In JDK8 '\v' is specified as a predefined 2562 // character class for all vertical whitespace characters. 2563 // So [-1, root=VertWS node] pair is returned (instead of a 2564 // single 0x0B). This breaks the range if '\v' is used as 2565 // the start or end value, such as [\v-...] or [...-\v], in 2566 // which a single definite value (0x0B) is expected. For 2567 // compatibility concern '\013'/0x0B is returned if isrange. 2568 if (isrange) 2569 return '\013'; 2570 if (create) { 2571 predicate = VertWS(); 2572 if (!inclass) 2573 root = newCharProperty(predicate); 2574 } 2575 return -1; 2576 case 'w': 2577 if (create) { 2578 predicate = has(UNICODE_CHARACTER_CLASS) ? 2579 CharPredicates.WORD() : CharPredicates.ASCII_WORD(); 2580 if (!inclass) 2581 root = newCharProperty(predicate); 2582 } 2583 return -1; 2584 case 'x': 2585 return x(); 2586 case 'y': 2587 break; 2588 case 'z': 2589 if (inclass) break; 2590 if (create) root = new End(); 2591 return -1; 2592 default: 2593 return ch; 2594 } 2595 throw error("Illegal/unsupported escape sequence"); 2596 } 2597 2598 /** 2599 * Parse a character class, and return the node that matches it. 2600 * 2601 * Consumes a ] on the way out if consume is true. Usually consume 2602 * is true except for the case of [abc&&def] where def is a separate 2603 * right hand node with "understood" brackets. 2604 */ 2605 private CharPredicate clazz(boolean consume) { 2606 CharPredicate prev = null; 2607 CharPredicate curr = null; 2608 BitClass bits = new BitClass(); 2609 BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch]; 2610 2611 boolean isNeg = false; 2612 boolean hasBits = false; 2613 int ch = next(); 2614 2615 // Negates if first char in a class, otherwise literal 2616 if (ch == '^' && temp[cursor-1] == '[') { 2617 ch = next(); 2618 isNeg = true; 2619 } 2620 for (;;) { 2621 switch (ch) { 2622 case '[': 2623 curr = clazz(true); 2624 if (prev == null) 2625 prev = curr; 2626 else 2627 prev = prev.union(curr); 2628 ch = peek(); 2629 continue; 2630 case '&': 2631 ch = next(); 2632 if (ch == '&') { 2633 ch = next(); 2634 CharPredicate right = null; 2635 while (ch != ']' && ch != '&') { 2636 if (ch == '[') { 2637 if (right == null) 2638 right = clazz(true); 2639 else 2640 right = right.union(clazz(true)); 2641 } else { // abc&&def 2642 unread(); 2643 right = clazz(false); 2644 } 2645 ch = peek(); 2646 } 2647 if (hasBits) { 2648 // bits used, union has high precedence 2649 if (prev == null) { 2650 prev = curr = bitsP; 2651 } else { 2652 prev = prev.union(bitsP); 2653 } 2654 hasBits = false; 2655 } 2656 if (right != null) 2657 curr = right; 2658 if (prev == null) { 2659 if (right == null) 2660 throw error("Bad class syntax"); 2661 else 2662 prev = right; 2663 } else { 2664 prev = prev.and(curr); 2665 } 2666 } else { 2667 // treat as a literal & 2668 unread(); 2669 break; 2670 } 2671 continue; 2672 case 0: 2673 if (cursor >= patternLength) 2674 throw error("Unclosed character class"); 2675 break; 2676 case ']': 2677 if (prev != null || hasBits) { 2678 if (consume) 2679 next(); 2680 if (prev == null) 2681 prev = bitsP; 2682 else if (hasBits) 2683 prev = prev.union(bitsP); 2684 if (isNeg) 2685 return prev.negate(); 2686 return prev; 2687 } 2688 break; 2689 default: 2690 break; 2691 } 2692 curr = range(bits); 2693 if (curr == null) { // the bits used 2694 hasBits = true; 2695 } else { 2696 if (prev == null) 2697 prev = curr; 2698 else if (prev != curr) 2699 prev = prev.union(curr); 2700 } 2701 ch = peek(); 2702 } 2703 } 2704 2705 private CharPredicate bitsOrSingle(BitClass bits, int ch) { 2706 /* Bits can only handle codepoints in [u+0000-u+00ff] range. 2707 Use "single" node instead of bits when dealing with unicode 2708 case folding for codepoints listed below. 2709 (1)Uppercase out of range: u+00ff, u+00b5 2710 toUpperCase(u+00ff) -> u+0178 2711 toUpperCase(u+00b5) -> u+039c 2712 (2)LatinSmallLetterLongS u+17f 2713 toUpperCase(u+017f) -> u+0053 2714 (3)LatinSmallLetterDotlessI u+131 2715 toUpperCase(u+0131) -> u+0049 2716 (4)LatinCapitalLetterIWithDotAbove u+0130 2717 toLowerCase(u+0130) -> u+0069 2718 (5)KelvinSign u+212a 2719 toLowerCase(u+212a) ==> u+006B 2720 (6)AngstromSign u+212b 2721 toLowerCase(u+212b) ==> u+00e5 2722 */ 2723 if (ch < 256 && 2724 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) && 2725 (ch == 0xff || ch == 0xb5 || 2726 ch == 0x49 || ch == 0x69 || //I and i 2727 ch == 0x53 || ch == 0x73 || //S and s 2728 ch == 0x4b || ch == 0x6b || //K and k 2729 ch == 0xc5 || ch == 0xe5))) { //A+ring 2730 bits.add(ch, flags0); 2731 return null; 2732 } 2733 return single(ch); 2734 } 2735 2736 /** 2737 * Returns a suitably optimized, single character predicate 2738 */ 2739 private CharPredicate single(final int ch) { 2740 if (has(CASE_INSENSITIVE)) { 2741 int lower, upper; 2742 if (has(UNICODE_CASE)) { 2743 upper = Character.toUpperCase(ch); 2744 lower = Character.toLowerCase(upper); 2745 // Unicode case insensitive matches 2746 if (upper != lower) 2747 return SingleU(lower); 2748 } else if (ASCII.isAscii(ch)) { 2749 lower = ASCII.toLower(ch); 2750 upper = ASCII.toUpper(ch); 2751 // Case insensitive matches a given BMP character 2752 if (lower != upper) 2753 return SingleI(lower, upper); 2754 } 2755 } 2756 if (isSupplementary(ch)) 2757 return SingleS(ch); 2758 return Single(ch); // Match a given BMP character 2759 } 2760 2761 /** 2762 * Parse a single character or a character range in a character class 2763 * and return its representative node. 2764 */ 2765 private CharPredicate range(BitClass bits) { 2766 int ch = peek(); 2767 if (ch == '\\') { 2768 ch = nextEscaped(); 2769 if (ch == 'p' || ch == 'P') { // A property 2770 boolean comp = (ch == 'P'); 2771 boolean oneLetter = true; 2772 // Consume { if present 2773 ch = next(); 2774 if (ch != '{') 2775 unread(); 2776 else 2777 oneLetter = false; 2778 return family(oneLetter, comp); 2779 } else { // ordinary escape 2780 boolean isrange = temp[cursor+1] == '-'; 2781 unread(); 2782 ch = escape(true, true, isrange); 2783 if (ch == -1) 2784 return predicate; 2785 } 2786 } else { 2787 next(); 2788 } 2789 if (ch >= 0) { 2790 if (peek() == '-') { 2791 int endRange = temp[cursor+1]; 2792 if (endRange == '[') { 2793 return bitsOrSingle(bits, ch); 2794 } 2795 if (endRange != ']') { 2796 next(); 2797 int m = peek(); 2798 if (m == '\\') { 2799 m = escape(true, false, true); 2800 } else { 2801 next(); 2802 } 2803 if (m < ch) { 2804 throw error("Illegal character range"); 2805 } 2806 if (has(CASE_INSENSITIVE)) { 2807 if (has(UNICODE_CASE)) 2808 return CIRangeU(ch, m); 2809 return CIRange(ch, m); 2810 } else { 2811 return Range(ch, m); 2812 } 2813 } 2814 } 2815 return bitsOrSingle(bits, ch); 2816 } 2817 throw error("Unexpected character '"+((char)ch)+"'"); 2818 } 2819 2820 /** 2821 * Parses a Unicode character family and returns its representative node. 2822 */ 2823 private CharPredicate family(boolean singleLetter, boolean isComplement) { 2824 next(); 2825 String name; 2826 CharPredicate p = null; 2827 2828 if (singleLetter) { 2829 int c = temp[cursor]; 2830 if (!Character.isSupplementaryCodePoint(c)) { 2831 name = String.valueOf((char)c); 2832 } else { 2833 name = new String(temp, cursor, 1); 2834 } 2835 read(); 2836 } else { 2837 int i = cursor; 2838 mark('}'); 2839 while(read() != '}') { 2840 } 2841 mark('\000'); 2842 int j = cursor; 2843 if (j > patternLength) 2844 throw error("Unclosed character family"); 2845 if (i + 1 >= j) 2846 throw error("Empty character family"); 2847 name = new String(temp, i, j-i-1); 2848 } 2849 2850 int i = name.indexOf('='); 2851 if (i != -1) { 2852 // property construct \p{name=value} 2853 String value = name.substring(i + 1); 2854 name = name.substring(0, i).toLowerCase(Locale.ENGLISH); 2855 switch (name) { 2856 case "sc": 2857 case "script": 2858 p = CharPredicates.forUnicodeScript(value); 2859 break; 2860 case "blk": 2861 case "block": 2862 p = CharPredicates.forUnicodeBlock(value); 2863 break; 2864 case "gc": 2865 case "general_category": 2866 p = CharPredicates.forProperty(value); 2867 break; 2868 default: 2869 break; 2870 } 2871 if (p == null) 2872 throw error("Unknown Unicode property {name=<" + name + ">, " 2873 + "value=<" + value + ">}"); 2874 2875 } else { 2876 if (name.startsWith("In")) { 2877 // \p{InBlockName} 2878 p = CharPredicates.forUnicodeBlock(name.substring(2)); 2879 } else if (name.startsWith("Is")) { 2880 // \p{IsGeneralCategory} and \p{IsScriptName} 2881 name = name.substring(2); 2882 p = CharPredicates.forUnicodeProperty(name); 2883 if (p == null) 2884 p = CharPredicates.forProperty(name); 2885 if (p == null) 2886 p = CharPredicates.forUnicodeScript(name); 2887 } else { 2888 if (has(UNICODE_CHARACTER_CLASS)) { 2889 p = CharPredicates.forPOSIXName(name); 2890 } 2891 if (p == null) 2892 p = CharPredicates.forProperty(name); 2893 } 2894 if (p == null) 2895 throw error("Unknown character property name {In/Is" + name + "}"); 2896 } 2897 if (isComplement) { 2898 // it might be too expensive to detect if a complement of 2899 // CharProperty can match "certain" supplementary. So just 2900 // go with StartS. 2901 hasSupplementary = true; 2902 p = p.negate(); 2903 } 2904 return p; 2905 } 2906 2907 private CharProperty newCharProperty(CharPredicate p) { 2908 if (p == null) 2909 return null; 2910 if (p instanceof BmpCharPredicate) 2911 return new BmpCharProperty((BmpCharPredicate)p); 2912 else 2913 return new CharProperty(p); 2914 } 2915 2916 /** 2917 * Parses and returns the name of a "named capturing group", the trailing 2918 * ">" is consumed after parsing. 2919 */ 2920 private String groupname(int ch) { 2921 StringBuilder sb = new StringBuilder(); 2922 if (!ASCII.isAlpha(ch)) 2923 throw error("capturing group name does not start with a Latin letter"); 2924 do { 2925 sb.append((char) ch); 2926 } while (ASCII.isAlnum(ch=read())); 2927 if (ch != '>') 2928 throw error("named capturing group is missing trailing '>'"); 2929 return sb.toString(); 2930 } 2931 2932 /** 2933 * Parses a group and returns the head node of a set of nodes that process 2934 * the group. Sometimes a double return system is used where the tail is 2935 * returned in root. 2936 */ 2937 private Node group0() { 2938 boolean capturingGroup = false; 2939 Node head = null; 2940 Node tail = null; 2941 int save = flags0; 2942 int saveTCNCount = topClosureNodes.size(); 2943 root = null; 2944 int ch = next(); 2945 if (ch == '?') { 2946 ch = skip(); 2947 switch (ch) { 2948 case ':': // (?:xxx) pure group 2949 head = createGroup(true); 2950 tail = root; 2951 head.next = expr(tail); 2952 break; 2953 case '=': // (?=xxx) and (?!xxx) lookahead 2954 case '!': 2955 head = createGroup(true); 2956 tail = root; 2957 head.next = expr(tail); 2958 if (ch == '=') { 2959 head = tail = new Pos(head); 2960 } else { 2961 head = tail = new Neg(head); 2962 } 2963 break; 2964 case '>': // (?>xxx) independent group 2965 head = createGroup(true); 2966 tail = root; 2967 head.next = expr(tail); 2968 head = tail = new Ques(head, Qtype.INDEPENDENT); 2969 break; 2970 case '<': // (?<xxx) look behind 2971 ch = read(); 2972 if (ch != '=' && ch != '!') { 2973 // named captured group 2974 String name = groupname(ch); 2975 if (namedGroups().containsKey(name)) 2976 throw error("Named capturing group <" + name 2977 + "> is already defined"); 2978 capturingGroup = true; 2979 head = createGroup(false); 2980 tail = root; 2981 namedGroups().put(name, capturingGroupCount-1); 2982 head.next = expr(tail); 2983 break; 2984 } 2985 int start = cursor; 2986 head = createGroup(true); 2987 tail = root; 2988 head.next = expr(tail); 2989 tail.next = lookbehindEnd; 2990 TreeInfo info = new TreeInfo(); 2991 head.study(info); 2992 if (info.maxValid == false) { 2993 throw error("Look-behind group does not have " 2994 + "an obvious maximum length"); 2995 } 2996 boolean hasSupplementary = findSupplementary(start, patternLength); 2997 if (ch == '=') { 2998 head = tail = (hasSupplementary ? 2999 new BehindS(head, info.maxLength, 3000 info.minLength) : 3001 new Behind(head, info.maxLength, 3002 info.minLength)); 3003 } else { // if (ch == '!') 3004 head = tail = (hasSupplementary ? 3005 new NotBehindS(head, info.maxLength, 3006 info.minLength) : 3007 new NotBehind(head, info.maxLength, 3008 info.minLength)); 3009 } 3010 // clear all top-closure-nodes inside lookbehind 3011 if (saveTCNCount < topClosureNodes.size()) 3012 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); 3013 break; 3014 case '$': 3015 case '@': 3016 throw error("Unknown group type"); 3017 default: // (?xxx:) inlined match flags 3018 unread(); 3019 addFlag(); 3020 ch = read(); 3021 if (ch == ')') { 3022 return null; // Inline modifier only 3023 } 3024 if (ch != ':') { 3025 throw error("Unknown inline modifier"); 3026 } 3027 head = createGroup(true); 3028 tail = root; 3029 head.next = expr(tail); 3030 break; 3031 } 3032 } else { // (xxx) a regular group 3033 capturingGroup = true; 3034 head = createGroup(false); 3035 tail = root; 3036 head.next = expr(tail); 3037 } 3038 3039 accept(')', "Unclosed group"); 3040 flags0 = save; 3041 3042 // Check for quantifiers 3043 Node node = closure(head); 3044 if (node == head) { // No closure 3045 root = tail; 3046 return node; // Dual return 3047 } 3048 if (head == tail) { // Zero length assertion 3049 root = node; 3050 return node; // Dual return 3051 } 3052 3053 // have group closure, clear all inner closure nodes from the 3054 // top list (no backtracking stopper optimization for inner 3055 if (saveTCNCount < topClosureNodes.size()) 3056 topClosureNodes.subList(saveTCNCount, topClosureNodes.size()).clear(); 3057 3058 if (node instanceof Ques) { 3059 Ques ques = (Ques) node; 3060 if (ques.type == Qtype.POSSESSIVE) { 3061 root = node; 3062 return node; 3063 } 3064 tail.next = new BranchConn(); 3065 tail = tail.next; 3066 if (ques.type == Qtype.GREEDY) { 3067 head = new Branch(head, null, tail); 3068 } else { // Reluctant quantifier 3069 head = new Branch(null, head, tail); 3070 } 3071 root = tail; 3072 return head; 3073 } else if (node instanceof Curly) { 3074 Curly curly = (Curly) node; 3075 if (curly.type == Qtype.POSSESSIVE) { 3076 root = node; 3077 return node; 3078 } 3079 // Discover if the group is deterministic 3080 TreeInfo info = new TreeInfo(); 3081 if (head.study(info)) { // Deterministic 3082 GroupTail temp = (GroupTail) tail; 3083 head = root = new GroupCurly(head.next, curly.cmin, 3084 curly.cmax, curly.type, 3085 ((GroupTail)tail).localIndex, 3086 ((GroupTail)tail).groupIndex, 3087 capturingGroup); 3088 return head; 3089 } else { // Non-deterministic 3090 int temp = ((GroupHead) head).localIndex; 3091 Loop loop; 3092 if (curly.type == Qtype.GREEDY) { 3093 loop = new Loop(this.localCount, temp); 3094 // add the max_reps greedy to the top-closure-node list 3095 if (curly.cmax == MAX_REPS) 3096 topClosureNodes.add(loop); 3097 } else { // Reluctant Curly 3098 loop = new LazyLoop(this.localCount, temp); 3099 } 3100 Prolog prolog = new Prolog(loop); 3101 this.localCount += 1; 3102 loop.cmin = curly.cmin; 3103 loop.cmax = curly.cmax; 3104 loop.body = head; 3105 tail.next = loop; 3106 root = loop; 3107 return prolog; // Dual return 3108 } 3109 } 3110 throw error("Internal logic error"); 3111 } 3112 3113 /** 3114 * Create group head and tail nodes using double return. If the group is 3115 * created with anonymous true then it is a pure group and should not 3116 * affect group counting. 3117 */ 3118 private Node createGroup(boolean anonymous) { 3119 int localIndex = localCount++; 3120 int groupIndex = 0; 3121 if (!anonymous) 3122 groupIndex = capturingGroupCount++; 3123 GroupHead head = new GroupHead(localIndex); 3124 root = new GroupTail(localIndex, groupIndex); 3125 3126 // for debug/print only, head.match does NOT need the "tail" info 3127 head.tail = (GroupTail)root; 3128 3129 if (!anonymous && groupIndex < 10) 3130 groupNodes[groupIndex] = head; 3131 return head; 3132 } 3133 3134 @SuppressWarnings("fallthrough") 3135 /** 3136 * Parses inlined match flags and set them appropriately. 3137 */ 3138 private void addFlag() { 3139 int ch = peek(); 3140 for (;;) { 3141 switch (ch) { 3142 case 'i': 3143 flags0 |= CASE_INSENSITIVE; 3144 break; 3145 case 'm': 3146 flags0 |= MULTILINE; 3147 break; 3148 case 's': 3149 flags0 |= DOTALL; 3150 break; 3151 case 'd': 3152 flags0 |= UNIX_LINES; 3153 break; 3154 case 'u': 3155 flags0 |= UNICODE_CASE; 3156 break; 3157 case 'c': 3158 flags0 |= CANON_EQ; 3159 break; 3160 case 'x': 3161 flags0 |= COMMENTS; 3162 break; 3163 case 'U': 3164 flags0 |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE); 3165 break; 3166 case '-': // subFlag then fall through 3167 ch = next(); 3168 subFlag(); 3169 default: 3170 return; 3171 } 3172 ch = next(); 3173 } 3174 } 3175 3176 @SuppressWarnings("fallthrough") 3177 /** 3178 * Parses the second part of inlined match flags and turns off 3179 * flags appropriately. 3180 */ 3181 private void subFlag() { 3182 int ch = peek(); 3183 for (;;) { 3184 switch (ch) { 3185 case 'i': 3186 flags0 &= ~CASE_INSENSITIVE; 3187 break; 3188 case 'm': 3189 flags0 &= ~MULTILINE; 3190 break; 3191 case 's': 3192 flags0 &= ~DOTALL; 3193 break; 3194 case 'd': 3195 flags0 &= ~UNIX_LINES; 3196 break; 3197 case 'u': 3198 flags0 &= ~UNICODE_CASE; 3199 break; 3200 case 'c': 3201 flags0 &= ~CANON_EQ; 3202 break; 3203 case 'x': 3204 flags0 &= ~COMMENTS; 3205 break; 3206 case 'U': 3207 flags0 &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE); 3208 break; 3209 default: 3210 return; 3211 } 3212 ch = next(); 3213 } 3214 } 3215 3216 static final int MAX_REPS = 0x7FFFFFFF; 3217 3218 static enum Qtype { 3219 GREEDY, LAZY, POSSESSIVE, INDEPENDENT 3220 } 3221 3222 private Node curly(Node prev, int cmin) { 3223 int ch = next(); 3224 if (ch == '?') { 3225 next(); 3226 return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY); 3227 } else if (ch == '+') { 3228 next(); 3229 return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE); 3230 } 3231 if (prev instanceof BmpCharProperty) { 3232 return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin); 3233 } else if (prev instanceof CharProperty) { 3234 return new CharPropertyGreedy((CharProperty)prev, cmin); 3235 } 3236 return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY); 3237 } 3238 3239 /** 3240 * Processes repetition. If the next character peeked is a quantifier 3241 * then new nodes must be appended to handle the repetition. 3242 * Prev could be a single or a group, so it could be a chain of nodes. 3243 */ 3244 private Node closure(Node prev) { 3245 Node atom; 3246 int ch = peek(); 3247 switch (ch) { 3248 case '?': 3249 ch = next(); 3250 if (ch == '?') { 3251 next(); 3252 return new Ques(prev, Qtype.LAZY); 3253 } else if (ch == '+') { 3254 next(); 3255 return new Ques(prev, Qtype.POSSESSIVE); 3256 } 3257 return new Ques(prev, Qtype.GREEDY); 3258 case '*': 3259 return curly(prev, 0); 3260 case '+': 3261 return curly(prev, 1); 3262 case '{': 3263 ch = temp[cursor+1]; 3264 if (ASCII.isDigit(ch)) { 3265 skip(); 3266 int cmin = 0; 3267 do { 3268 cmin = cmin * 10 + (ch - '0'); 3269 } while (ASCII.isDigit(ch = read())); 3270 int cmax = cmin; 3271 if (ch == ',') { 3272 ch = read(); 3273 cmax = MAX_REPS; 3274 if (ch != '}') { 3275 cmax = 0; 3276 while (ASCII.isDigit(ch)) { 3277 cmax = cmax * 10 + (ch - '0'); 3278 ch = read(); 3279 } 3280 } 3281 } 3282 if (ch != '}') 3283 throw error("Unclosed counted closure"); 3284 if (((cmin) | (cmax) | (cmax - cmin)) < 0) 3285 throw error("Illegal repetition range"); 3286 Curly curly; 3287 ch = peek(); 3288 if (ch == '?') { 3289 next(); 3290 curly = new Curly(prev, cmin, cmax, Qtype.LAZY); 3291 } else if (ch == '+') { 3292 next(); 3293 curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE); 3294 } else { 3295 curly = new Curly(prev, cmin, cmax, Qtype.GREEDY); 3296 } 3297 return curly; 3298 } else { 3299 throw error("Illegal repetition"); 3300 } 3301 default: 3302 return prev; 3303 } 3304 } 3305 3306 /** 3307 * Utility method for parsing control escape sequences. 3308 */ 3309 private int c() { 3310 if (cursor < patternLength) { 3311 return read() ^ 64; 3312 } 3313 throw error("Illegal control escape sequence"); 3314 } 3315 3316 /** 3317 * Utility method for parsing octal escape sequences. 3318 */ 3319 private int o() { 3320 int n = read(); 3321 if (((n-'0')|('7'-n)) >= 0) { 3322 int m = read(); 3323 if (((m-'0')|('7'-m)) >= 0) { 3324 int o = read(); 3325 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) { 3326 return (n - '0') * 64 + (m - '0') * 8 + (o - '0'); 3327 } 3328 unread(); 3329 return (n - '0') * 8 + (m - '0'); 3330 } 3331 unread(); 3332 return (n - '0'); 3333 } 3334 throw error("Illegal octal escape sequence"); 3335 } 3336 3337 /** 3338 * Utility method for parsing hexadecimal escape sequences. 3339 */ 3340 private int x() { 3341 int n = read(); 3342 if (ASCII.isHexDigit(n)) { 3343 int m = read(); 3344 if (ASCII.isHexDigit(m)) { 3345 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m); 3346 } 3347 } else if (n == '{' && ASCII.isHexDigit(peek())) { 3348 int ch = 0; 3349 while (ASCII.isHexDigit(n = read())) { 3350 ch = (ch << 4) + ASCII.toDigit(n); 3351 if (ch > Character.MAX_CODE_POINT) 3352 throw error("Hexadecimal codepoint is too big"); 3353 } 3354 if (n != '}') 3355 throw error("Unclosed hexadecimal escape sequence"); 3356 return ch; 3357 } 3358 throw error("Illegal hexadecimal escape sequence"); 3359 } 3360 3361 /** 3362 * Utility method for parsing unicode escape sequences. 3363 */ 3364 private int cursor() { 3365 return cursor; 3366 } 3367 3368 private void setcursor(int pos) { 3369 cursor = pos; 3370 } 3371 3372 private int uxxxx() { 3373 int n = 0; 3374 for (int i = 0; i < 4; i++) { 3375 int ch = read(); 3376 if (!ASCII.isHexDigit(ch)) { 3377 throw error("Illegal Unicode escape sequence"); 3378 } 3379 n = n * 16 + ASCII.toDigit(ch); 3380 } 3381 return n; 3382 } 3383 3384 private int u() { 3385 int n = uxxxx(); 3386 if (Character.isHighSurrogate((char)n)) { 3387 int cur = cursor(); 3388 if (read() == '\\' && read() == 'u') { 3389 int n2 = uxxxx(); 3390 if (Character.isLowSurrogate((char)n2)) 3391 return Character.toCodePoint((char)n, (char)n2); 3392 } 3393 setcursor(cur); 3394 } 3395 return n; 3396 } 3397 3398 private int N() { 3399 if (read() == '{') { 3400 int i = cursor; 3401 while (cursor < patternLength && read() != '}') {} 3402 if (cursor > patternLength) 3403 throw error("Unclosed character name escape sequence"); 3404 String name = new String(temp, i, cursor - i - 1); 3405 try { 3406 return Character.codePointOf(name); 3407 } catch (IllegalArgumentException x) { 3408 throw error("Unknown character name [" + name + "]"); 3409 } 3410 } 3411 throw error("Illegal character name escape sequence"); 3412 } 3413 3414 // 3415 // Utility methods for code point support 3416 // 3417 private static final int countChars(CharSequence seq, int index, 3418 int lengthInCodePoints) { 3419 // optimization 3420 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) { 3421 assert (index >= 0 && index < seq.length()); 3422 return 1; 3423 } 3424 int length = seq.length(); 3425 int x = index; 3426 if (lengthInCodePoints >= 0) { 3427 assert (index >= 0 && index < length); 3428 for (int i = 0; x < length && i < lengthInCodePoints; i++) { 3429 if (Character.isHighSurrogate(seq.charAt(x++))) { 3430 if (x < length && Character.isLowSurrogate(seq.charAt(x))) { 3431 x++; 3432 } 3433 } 3434 } 3435 return x - index; 3436 } 3437 3438 assert (index >= 0 && index <= length); 3439 if (index == 0) { 3440 return 0; 3441 } 3442 int len = -lengthInCodePoints; 3443 for (int i = 0; x > 0 && i < len; i++) { 3444 if (Character.isLowSurrogate(seq.charAt(--x))) { 3445 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) { 3446 x--; 3447 } 3448 } 3449 } 3450 return index - x; 3451 } 3452 3453 private static final int countCodePoints(CharSequence seq) { 3454 int length = seq.length(); 3455 int n = 0; 3456 for (int i = 0; i < length; ) { 3457 n++; 3458 if (Character.isHighSurrogate(seq.charAt(i++))) { 3459 if (i < length && Character.isLowSurrogate(seq.charAt(i))) { 3460 i++; 3461 } 3462 } 3463 } 3464 return n; 3465 } 3466 3467 /** 3468 * Creates a bit vector for matching Latin-1 values. A normal BitClass 3469 * never matches values above Latin-1, and a complemented BitClass always 3470 * matches values above Latin-1. 3471 */ 3472 static final class BitClass extends BmpCharProperty { 3473 final boolean[] bits; 3474 BitClass() { 3475 this(new boolean[256]); 3476 } 3477 private BitClass(boolean[] bits) { 3478 super( ch -> ch < 256 && bits[ch]); 3479 this.bits = bits; 3480 } 3481 BitClass add(int c, int flags) { 3482 assert c >= 0 && c <= 255; 3483 if ((flags & CASE_INSENSITIVE) != 0) { 3484 if (ASCII.isAscii(c)) { 3485 bits[ASCII.toUpper(c)] = true; 3486 bits[ASCII.toLower(c)] = true; 3487 } else if ((flags & UNICODE_CASE) != 0) { 3488 bits[Character.toLowerCase(c)] = true; 3489 bits[Character.toUpperCase(c)] = true; 3490 } 3491 } 3492 bits[c] = true; 3493 return this; 3494 } 3495 } 3496 3497 /** 3498 * Utility method for creating a string slice matcher. 3499 */ 3500 private Node newSlice(int[] buf, int count, boolean hasSupplementary) { 3501 int[] tmp = new int[count]; 3502 if (has(CASE_INSENSITIVE)) { 3503 if (has(UNICODE_CASE)) { 3504 for (int i = 0; i < count; i++) { 3505 tmp[i] = Character.toLowerCase( 3506 Character.toUpperCase(buf[i])); 3507 } 3508 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp); 3509 } 3510 for (int i = 0; i < count; i++) { 3511 tmp[i] = ASCII.toLower(buf[i]); 3512 } 3513 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp); 3514 } 3515 for (int i = 0; i < count; i++) { 3516 tmp[i] = buf[i]; 3517 } 3518 return hasSupplementary ? new SliceS(tmp) : new Slice(tmp); 3519 } 3520 3521 /** 3522 * The following classes are the building components of the object 3523 * tree that represents a compiled regular expression. The object tree 3524 * is made of individual elements that handle constructs in the Pattern. 3525 * Each type of object knows how to match its equivalent construct with 3526 * the match() method. 3527 */ 3528 3529 /** 3530 * Base class for all node classes. Subclasses should override the match() 3531 * method as appropriate. This class is an accepting node, so its match() 3532 * always returns true. 3533 */ 3534 static class Node extends Object { 3535 Node next; 3536 Node() { 3537 next = Pattern.accept; 3538 } 3539 /** 3540 * This method implements the classic accept node. 3541 */ 3542 boolean match(Matcher matcher, int i, CharSequence seq) { 3543 matcher.last = i; 3544 matcher.groups[0] = matcher.first; 3545 matcher.groups[1] = matcher.last; 3546 return true; 3547 } 3548 /** 3549 * This method is good for all zero length assertions. 3550 */ 3551 boolean study(TreeInfo info) { 3552 if (next != null) { 3553 return next.study(info); 3554 } else { 3555 return info.deterministic; 3556 } 3557 } 3558 } 3559 3560 static class LastNode extends Node { 3561 /** 3562 * This method implements the classic accept node with 3563 * the addition of a check to see if the match occurred 3564 * using all of the input. 3565 */ 3566 boolean match(Matcher matcher, int i, CharSequence seq) { 3567 if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to) 3568 return false; 3569 matcher.last = i; 3570 matcher.groups[0] = matcher.first; 3571 matcher.groups[1] = matcher.last; 3572 return true; 3573 } 3574 } 3575 3576 /** 3577 * Used for REs that can start anywhere within the input string. 3578 * This basically tries to match repeatedly at each spot in the 3579 * input string, moving forward after each try. An anchored search 3580 * or a BnM will bypass this node completely. 3581 */ 3582 static class Start extends Node { 3583 int minLength; 3584 Start(Node node) { 3585 this.next = node; 3586 TreeInfo info = new TreeInfo(); 3587 next.study(info); 3588 minLength = info.minLength; 3589 } 3590 boolean match(Matcher matcher, int i, CharSequence seq) { 3591 if (i > matcher.to - minLength) { 3592 matcher.hitEnd = true; 3593 return false; 3594 } 3595 int guard = matcher.to - minLength; 3596 for (; i <= guard; i++) { 3597 if (next.match(matcher, i, seq)) { 3598 matcher.first = i; 3599 matcher.groups[0] = matcher.first; 3600 matcher.groups[1] = matcher.last; 3601 return true; 3602 } 3603 } 3604 matcher.hitEnd = true; 3605 return false; 3606 } 3607 boolean study(TreeInfo info) { 3608 next.study(info); 3609 info.maxValid = false; 3610 info.deterministic = false; 3611 return false; 3612 } 3613 } 3614 3615 /* 3616 * StartS supports supplementary characters, including unpaired surrogates. 3617 */ 3618 static final class StartS extends Start { 3619 StartS(Node node) { 3620 super(node); 3621 } 3622 boolean match(Matcher matcher, int i, CharSequence seq) { 3623 if (i > matcher.to - minLength) { 3624 matcher.hitEnd = true; 3625 return false; 3626 } 3627 int guard = matcher.to - minLength; 3628 while (i <= guard) { 3629 //if ((ret = next.match(matcher, i, seq)) || i == guard) 3630 if (next.match(matcher, i, seq)) { 3631 matcher.first = i; 3632 matcher.groups[0] = matcher.first; 3633 matcher.groups[1] = matcher.last; 3634 return true; 3635 } 3636 if (i == guard) 3637 break; 3638 // Optimization to move to the next character. This is 3639 // faster than countChars(seq, i, 1). 3640 if (Character.isHighSurrogate(seq.charAt(i++))) { 3641 if (i < seq.length() && 3642 Character.isLowSurrogate(seq.charAt(i))) { 3643 i++; 3644 } 3645 } 3646 } 3647 matcher.hitEnd = true; 3648 return false; 3649 } 3650 } 3651 3652 /** 3653 * Node to anchor at the beginning of input. This object implements the 3654 * match for a \A sequence, and the caret anchor will use this if not in 3655 * multiline mode. 3656 */ 3657 static final class Begin extends Node { 3658 boolean match(Matcher matcher, int i, CharSequence seq) { 3659 int fromIndex = (matcher.anchoringBounds) ? 3660 matcher.from : 0; 3661 if (i == fromIndex && next.match(matcher, i, seq)) { 3662 matcher.first = i; 3663 matcher.groups[0] = i; 3664 matcher.groups[1] = matcher.last; 3665 return true; 3666 } else { 3667 return false; 3668 } 3669 } 3670 } 3671 3672 /** 3673 * Node to anchor at the end of input. This is the absolute end, so this 3674 * should not match at the last newline before the end as $ will. 3675 */ 3676 static final class End extends Node { 3677 boolean match(Matcher matcher, int i, CharSequence seq) { 3678 int endIndex = (matcher.anchoringBounds) ? 3679 matcher.to : matcher.getTextLength(); 3680 if (i == endIndex) { 3681 matcher.hitEnd = true; 3682 return next.match(matcher, i, seq); 3683 } 3684 return false; 3685 } 3686 } 3687 3688 /** 3689 * Node to anchor at the beginning of a line. This is essentially the 3690 * object to match for the multiline ^. 3691 */ 3692 static final class Caret extends Node { 3693 boolean match(Matcher matcher, int i, CharSequence seq) { 3694 int startIndex = matcher.from; 3695 int endIndex = matcher.to; 3696 if (!matcher.anchoringBounds) { 3697 startIndex = 0; 3698 endIndex = matcher.getTextLength(); 3699 } 3700 // Perl does not match ^ at end of input even after newline 3701 if (i == endIndex) { 3702 matcher.hitEnd = true; 3703 return false; 3704 } 3705 if (i > startIndex) { 3706 char ch = seq.charAt(i-1); 3707 if (ch != '\n' && ch != '\r' 3708 && (ch|1) != '\u2029' 3709 && ch != '\u0085' ) { 3710 return false; 3711 } 3712 // Should treat /r/n as one newline 3713 if (ch == '\r' && seq.charAt(i) == '\n') 3714 return false; 3715 } 3716 return next.match(matcher, i, seq); 3717 } 3718 } 3719 3720 /** 3721 * Node to anchor at the beginning of a line when in unixdot mode. 3722 */ 3723 static final class UnixCaret extends Node { 3724 boolean match(Matcher matcher, int i, CharSequence seq) { 3725 int startIndex = matcher.from; 3726 int endIndex = matcher.to; 3727 if (!matcher.anchoringBounds) { 3728 startIndex = 0; 3729 endIndex = matcher.getTextLength(); 3730 } 3731 // Perl does not match ^ at end of input even after newline 3732 if (i == endIndex) { 3733 matcher.hitEnd = true; 3734 return false; 3735 } 3736 if (i > startIndex) { 3737 char ch = seq.charAt(i-1); 3738 if (ch != '\n') { 3739 return false; 3740 } 3741 } 3742 return next.match(matcher, i, seq); 3743 } 3744 } 3745 3746 /** 3747 * Node to match the location where the last match ended. 3748 * This is used for the \G construct. 3749 */ 3750 static final class LastMatch extends Node { 3751 boolean match(Matcher matcher, int i, CharSequence seq) { 3752 if (i != matcher.oldLast) 3753 return false; 3754 return next.match(matcher, i, seq); 3755 } 3756 } 3757 3758 /** 3759 * Node to anchor at the end of a line or the end of input based on the 3760 * multiline mode. 3761 * 3762 * When not in multiline mode, the $ can only match at the very end 3763 * of the input, unless the input ends in a line terminator in which 3764 * it matches right before the last line terminator. 3765 * 3766 * Note that \r\n is considered an atomic line terminator. 3767 * 3768 * Like ^ the $ operator matches at a position, it does not match the 3769 * line terminators themselves. 3770 */ 3771 static final class Dollar extends Node { 3772 boolean multiline; 3773 Dollar(boolean mul) { 3774 multiline = mul; 3775 } 3776 boolean match(Matcher matcher, int i, CharSequence seq) { 3777 int endIndex = (matcher.anchoringBounds) ? 3778 matcher.to : matcher.getTextLength(); 3779 if (!multiline) { 3780 if (i < endIndex - 2) 3781 return false; 3782 if (i == endIndex - 2) { 3783 char ch = seq.charAt(i); 3784 if (ch != '\r') 3785 return false; 3786 ch = seq.charAt(i + 1); 3787 if (ch != '\n') 3788 return false; 3789 } 3790 } 3791 // Matches before any line terminator; also matches at the 3792 // end of input 3793 // Before line terminator: 3794 // If multiline, we match here no matter what 3795 // If not multiline, fall through so that the end 3796 // is marked as hit; this must be a /r/n or a /n 3797 // at the very end so the end was hit; more input 3798 // could make this not match here 3799 if (i < endIndex) { 3800 char ch = seq.charAt(i); 3801 if (ch == '\n') { 3802 // No match between \r\n 3803 if (i > 0 && seq.charAt(i-1) == '\r') 3804 return false; 3805 if (multiline) 3806 return next.match(matcher, i, seq); 3807 } else if (ch == '\r' || ch == '\u0085' || 3808 (ch|1) == '\u2029') { 3809 if (multiline) 3810 return next.match(matcher, i, seq); 3811 } else { // No line terminator, no match 3812 return false; 3813 } 3814 } 3815 // Matched at current end so hit end 3816 matcher.hitEnd = true; 3817 // If a $ matches because of end of input, then more input 3818 // could cause it to fail! 3819 matcher.requireEnd = true; 3820 return next.match(matcher, i, seq); 3821 } 3822 boolean study(TreeInfo info) { 3823 next.study(info); 3824 return info.deterministic; 3825 } 3826 } 3827 3828 /** 3829 * Node to anchor at the end of a line or the end of input based on the 3830 * multiline mode when in unix lines mode. 3831 */ 3832 static final class UnixDollar extends Node { 3833 boolean multiline; 3834 UnixDollar(boolean mul) { 3835 multiline = mul; 3836 } 3837 boolean match(Matcher matcher, int i, CharSequence seq) { 3838 int endIndex = (matcher.anchoringBounds) ? 3839 matcher.to : matcher.getTextLength(); 3840 if (i < endIndex) { 3841 char ch = seq.charAt(i); 3842 if (ch == '\n') { 3843 // If not multiline, then only possible to 3844 // match at very end or one before end 3845 if (multiline == false && i != endIndex - 1) 3846 return false; 3847 // If multiline return next.match without setting 3848 // matcher.hitEnd 3849 if (multiline) 3850 return next.match(matcher, i, seq); 3851 } else { 3852 return false; 3853 } 3854 } 3855 // Matching because at the end or 1 before the end; 3856 // more input could change this so set hitEnd 3857 matcher.hitEnd = true; 3858 // If a $ matches because of end of input, then more input 3859 // could cause it to fail! 3860 matcher.requireEnd = true; 3861 return next.match(matcher, i, seq); 3862 } 3863 boolean study(TreeInfo info) { 3864 next.study(info); 3865 return info.deterministic; 3866 } 3867 } 3868 3869 /** 3870 * Node class that matches a Unicode line ending '\R' 3871 */ 3872 static final class LineEnding extends Node { 3873 boolean match(Matcher matcher, int i, CharSequence seq) { 3874 // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029]) 3875 if (i < matcher.to) { 3876 int ch = seq.charAt(i); 3877 if (ch == 0x0A || ch == 0x0B || ch == 0x0C || 3878 ch == 0x85 || ch == 0x2028 || ch == 0x2029) 3879 return next.match(matcher, i + 1, seq); 3880 if (ch == 0x0D) { 3881 i++; 3882 if (i < matcher.to) { 3883 if (seq.charAt(i) == 0x0A && 3884 next.match(matcher, i + 1, seq)) { 3885 return true; 3886 } 3887 } else { 3888 matcher.hitEnd = true; 3889 } 3890 return next.match(matcher, i, seq); 3891 } 3892 } else { 3893 matcher.hitEnd = true; 3894 } 3895 return false; 3896 } 3897 boolean study(TreeInfo info) { 3898 info.minLength++; 3899 info.maxLength += 2; 3900 return next.study(info); 3901 } 3902 } 3903 3904 /** 3905 * Abstract node class to match one character satisfying some 3906 * boolean property. 3907 */ 3908 static class CharProperty extends Node { 3909 CharPredicate predicate; 3910 3911 CharProperty (CharPredicate predicate) { 3912 this.predicate = predicate; 3913 } 3914 boolean match(Matcher matcher, int i, CharSequence seq) { 3915 if (i < matcher.to) { 3916 int ch = Character.codePointAt(seq, i); 3917 return predicate.is(ch) && 3918 next.match(matcher, i + Character.charCount(ch), seq); 3919 } else { 3920 matcher.hitEnd = true; 3921 return false; 3922 } 3923 } 3924 boolean study(TreeInfo info) { 3925 info.minLength++; 3926 info.maxLength++; 3927 return next.study(info); 3928 } 3929 } 3930 3931 /** 3932 * Optimized version of CharProperty that works only for 3933 * properties never satisfied by Supplementary characters. 3934 */ 3935 private static class BmpCharProperty extends CharProperty { 3936 BmpCharProperty (BmpCharPredicate predicate) { 3937 super(predicate); 3938 } 3939 boolean match(Matcher matcher, int i, CharSequence seq) { 3940 if (i < matcher.to) { 3941 return predicate.is(seq.charAt(i)) && 3942 next.match(matcher, i + 1, seq); 3943 } else { 3944 matcher.hitEnd = true; 3945 return false; 3946 } 3947 } 3948 } 3949 3950 private static class NFCCharProperty extends Node { 3951 CharPredicate predicate; 3952 NFCCharProperty (CharPredicate predicate) { 3953 this.predicate = predicate; 3954 } 3955 3956 boolean match(Matcher matcher, int i, CharSequence seq) { 3957 if (i < matcher.to) { 3958 int ch0 = Character.codePointAt(seq, i); 3959 int n = Character.charCount(ch0); 3960 int j = i + n; 3961 while (j < matcher.to) { 3962 int ch1 = Character.codePointAt(seq, j); 3963 if (Grapheme.isBoundary(ch0, ch1)) 3964 break; 3965 ch0 = ch1; 3966 j += Character.charCount(ch1); 3967 } 3968 if (i + n == j) { // single, assume nfc cp 3969 if (predicate.is(ch0)) 3970 return next.match(matcher, j, seq); 3971 } else { 3972 while (i + n < j) { 3973 String nfc = Normalizer.normalize( 3974 seq.toString().substring(i, j), Normalizer.Form.NFC); 3975 if (nfc.codePointCount(0, nfc.length()) == 1) { 3976 if (predicate.is(nfc.codePointAt(0)) && 3977 next.match(matcher, j, seq)) { 3978 return true; 3979 } 3980 } 3981 3982 ch0 = Character.codePointBefore(seq, j); 3983 j -= Character.charCount(ch0); 3984 } 3985 } 3986 if (j < matcher.to) 3987 return false; 3988 } 3989 matcher.hitEnd = true; 3990 return false; 3991 } 3992 3993 boolean study(TreeInfo info) { 3994 info.minLength++; 3995 info.deterministic = false; 3996 return next.study(info); 3997 } 3998 } 3999 4000 /** 4001 * Node class that matches an unicode extended grapheme cluster 4002 */ 4003 static class XGrapheme extends Node { 4004 boolean match(Matcher matcher, int i, CharSequence seq) { 4005 if (i < matcher.to) { 4006 int ch0 = Character.codePointAt(seq, i); 4007 i += Character.charCount(ch0); 4008 while (i < matcher.to) { 4009 int ch1 = Character.codePointAt(seq, i); 4010 if (Grapheme.isBoundary(ch0, ch1)) 4011 break; 4012 ch0 = ch1; 4013 i += Character.charCount(ch1); 4014 } 4015 return next.match(matcher, i, seq); 4016 } 4017 matcher.hitEnd = true; 4018 return false; 4019 } 4020 4021 boolean study(TreeInfo info) { 4022 info.minLength++; 4023 info.deterministic = false; 4024 return next.study(info); 4025 } 4026 } 4027 4028 /** 4029 * Node class that handles grapheme boundaries 4030 */ 4031 static class GraphemeBound extends Node { 4032 boolean match(Matcher matcher, int i, CharSequence seq) { 4033 int startIndex = matcher.from; 4034 int endIndex = matcher.to; 4035 if (matcher.transparentBounds) { 4036 startIndex = 0; 4037 endIndex = matcher.getTextLength(); 4038 } 4039 if (i == startIndex) { 4040 return next.match(matcher, i, seq); 4041 } 4042 if (i < endIndex) { 4043 if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) || 4044 !Grapheme.isBoundary(Character.codePointBefore(seq, i), 4045 Character.codePointAt(seq, i))) { 4046 return false; 4047 } 4048 } else { 4049 matcher.hitEnd = true; 4050 matcher.requireEnd = true; 4051 } 4052 return next.match(matcher, i, seq); 4053 } 4054 } 4055 4056 /** 4057 * Base class for all Slice nodes 4058 */ 4059 static class SliceNode extends Node { 4060 int[] buffer; 4061 SliceNode(int[] buf) { 4062 buffer = buf; 4063 } 4064 boolean study(TreeInfo info) { 4065 info.minLength += buffer.length; 4066 info.maxLength += buffer.length; 4067 return next.study(info); 4068 } 4069 } 4070 4071 /** 4072 * Node class for a case sensitive/BMP-only sequence of literal 4073 * characters. 4074 */ 4075 static class Slice extends SliceNode { 4076 Slice(int[] buf) { 4077 super(buf); 4078 } 4079 boolean match(Matcher matcher, int i, CharSequence seq) { 4080 int[] buf = buffer; 4081 int len = buf.length; 4082 for (int j=0; j<len; j++) { 4083 if ((i+j) >= matcher.to) { 4084 matcher.hitEnd = true; 4085 return false; 4086 } 4087 if (buf[j] != seq.charAt(i+j)) 4088 return false; 4089 } 4090 return next.match(matcher, i+len, seq); 4091 } 4092 } 4093 4094 /** 4095 * Node class for a case_insensitive/BMP-only sequence of literal 4096 * characters. 4097 */ 4098 static class SliceI extends SliceNode { 4099 SliceI(int[] buf) { 4100 super(buf); 4101 } 4102 boolean match(Matcher matcher, int i, CharSequence seq) { 4103 int[] buf = buffer; 4104 int len = buf.length; 4105 for (int j=0; j<len; j++) { 4106 if ((i+j) >= matcher.to) { 4107 matcher.hitEnd = true; 4108 return false; 4109 } 4110 int c = seq.charAt(i+j); 4111 if (buf[j] != c && 4112 buf[j] != ASCII.toLower(c)) 4113 return false; 4114 } 4115 return next.match(matcher, i+len, seq); 4116 } 4117 } 4118 4119 /** 4120 * Node class for a unicode_case_insensitive/BMP-only sequence of 4121 * literal characters. Uses unicode case folding. 4122 */ 4123 static final class SliceU extends SliceNode { 4124 SliceU(int[] buf) { 4125 super(buf); 4126 } 4127 boolean match(Matcher matcher, int i, CharSequence seq) { 4128 int[] buf = buffer; 4129 int len = buf.length; 4130 for (int j=0; j<len; j++) { 4131 if ((i+j) >= matcher.to) { 4132 matcher.hitEnd = true; 4133 return false; 4134 } 4135 int c = seq.charAt(i+j); 4136 if (buf[j] != c && 4137 buf[j] != Character.toLowerCase(Character.toUpperCase(c))) 4138 return false; 4139 } 4140 return next.match(matcher, i+len, seq); 4141 } 4142 } 4143 4144 /** 4145 * Node class for a case sensitive sequence of literal characters 4146 * including supplementary characters. 4147 */ 4148 static final class SliceS extends Slice { 4149 SliceS(int[] buf) { 4150 super(buf); 4151 } 4152 boolean match(Matcher matcher, int i, CharSequence seq) { 4153 int[] buf = buffer; 4154 int x = i; 4155 for (int j = 0; j < buf.length; j++) { 4156 if (x >= matcher.to) { 4157 matcher.hitEnd = true; 4158 return false; 4159 } 4160 int c = Character.codePointAt(seq, x); 4161 if (buf[j] != c) 4162 return false; 4163 x += Character.charCount(c); 4164 if (x > matcher.to) { 4165 matcher.hitEnd = true; 4166 return false; 4167 } 4168 } 4169 return next.match(matcher, x, seq); 4170 } 4171 } 4172 4173 /** 4174 * Node class for a case insensitive sequence of literal characters 4175 * including supplementary characters. 4176 */ 4177 static class SliceIS extends SliceNode { 4178 SliceIS(int[] buf) { 4179 super(buf); 4180 } 4181 int toLower(int c) { 4182 return ASCII.toLower(c); 4183 } 4184 boolean match(Matcher matcher, int i, CharSequence seq) { 4185 int[] buf = buffer; 4186 int x = i; 4187 for (int j = 0; j < buf.length; j++) { 4188 if (x >= matcher.to) { 4189 matcher.hitEnd = true; 4190 return false; 4191 } 4192 int c = Character.codePointAt(seq, x); 4193 if (buf[j] != c && buf[j] != toLower(c)) 4194 return false; 4195 x += Character.charCount(c); 4196 if (x > matcher.to) { 4197 matcher.hitEnd = true; 4198 return false; 4199 } 4200 } 4201 return next.match(matcher, x, seq); 4202 } 4203 } 4204 4205 /** 4206 * Node class for a case insensitive sequence of literal characters. 4207 * Uses unicode case folding. 4208 */ 4209 static final class SliceUS extends SliceIS { 4210 SliceUS(int[] buf) { 4211 super(buf); 4212 } 4213 int toLower(int c) { 4214 return Character.toLowerCase(Character.toUpperCase(c)); 4215 } 4216 } 4217 4218 /** 4219 * The 0 or 1 quantifier. This one class implements all three types. 4220 */ 4221 static final class Ques extends Node { 4222 Node atom; 4223 Qtype type; 4224 Ques(Node node, Qtype type) { 4225 this.atom = node; 4226 this.type = type; 4227 } 4228 boolean match(Matcher matcher, int i, CharSequence seq) { 4229 switch (type) { 4230 case GREEDY: 4231 return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)) 4232 || next.match(matcher, i, seq); 4233 case LAZY: 4234 return next.match(matcher, i, seq) 4235 || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq)); 4236 case POSSESSIVE: 4237 if (atom.match(matcher, i, seq)) i = matcher.last; 4238 return next.match(matcher, i, seq); 4239 default: 4240 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq); 4241 } 4242 } 4243 boolean study(TreeInfo info) { 4244 if (type != Qtype.INDEPENDENT) { 4245 int minL = info.minLength; 4246 atom.study(info); 4247 info.minLength = minL; 4248 info.deterministic = false; 4249 return next.study(info); 4250 } else { 4251 atom.study(info); 4252 return next.study(info); 4253 } 4254 } 4255 } 4256 4257 /** 4258 * Handles the greedy style repetition with the minimum either be 4259 * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier. 4260 */ 4261 static class CharPropertyGreedy extends Node { 4262 final CharPredicate predicate; 4263 final int cmin; 4264 4265 CharPropertyGreedy(CharProperty cp, int cmin) { 4266 this.predicate = cp.predicate; 4267 this.cmin = cmin; 4268 } 4269 boolean match(Matcher matcher, int i, CharSequence seq) { 4270 int n = 0; 4271 int to = matcher.to; 4272 // greedy, all the way down 4273 while (i < to) { 4274 int ch = Character.codePointAt(seq, i); 4275 if (!predicate.is(ch)) 4276 break; 4277 i += Character.charCount(ch); 4278 n++; 4279 } 4280 if (i >= to) { 4281 matcher.hitEnd = true; 4282 } 4283 while (n >= cmin) { 4284 if (next.match(matcher, i, seq)) 4285 return true; 4286 if (n == cmin) 4287 return false; 4288 // backing off if match fails 4289 int ch = Character.codePointBefore(seq, i); 4290 i -= Character.charCount(ch); 4291 n--; 4292 } 4293 return false; 4294 } 4295 4296 boolean study(TreeInfo info) { 4297 info.minLength += cmin; 4298 if (info.maxValid) { 4299 info.maxLength += MAX_REPS; 4300 } 4301 info.deterministic = false; 4302 return next.study(info); 4303 } 4304 } 4305 4306 static final class BmpCharPropertyGreedy extends CharPropertyGreedy { 4307 4308 BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) { 4309 super(bcp, cmin); 4310 } 4311 4312 boolean match(Matcher matcher, int i, CharSequence seq) { 4313 int n = 0; 4314 int to = matcher.to; 4315 while (i < to && predicate.is(seq.charAt(i))) { 4316 i++; n++; 4317 } 4318 if (i >= to) { 4319 matcher.hitEnd = true; 4320 } 4321 while (n >= cmin) { 4322 if (next.match(matcher, i, seq)) 4323 return true; 4324 i--; n--; // backing off if match fails 4325 } 4326 return false; 4327 } 4328 } 4329 4330 /** 4331 * Handles the curly-brace style repetition with a specified minimum and 4332 * maximum occurrences. The * quantifier is handled as a special case. 4333 * This class handles the three types. 4334 */ 4335 static final class Curly extends Node { 4336 Node atom; 4337 Qtype type; 4338 int cmin; 4339 int cmax; 4340 4341 Curly(Node node, int cmin, int cmax, Qtype type) { 4342 this.atom = node; 4343 this.type = type; 4344 this.cmin = cmin; 4345 this.cmax = cmax; 4346 } 4347 boolean match(Matcher matcher, int i, CharSequence seq) { 4348 int j; 4349 for (j = 0; j < cmin; j++) { 4350 if (atom.match(matcher, i, seq)) { 4351 i = matcher.last; 4352 continue; 4353 } 4354 return false; 4355 } 4356 if (type == Qtype.GREEDY) 4357 return match0(matcher, i, j, seq); 4358 else if (type == Qtype.LAZY) 4359 return match1(matcher, i, j, seq); 4360 else 4361 return match2(matcher, i, j, seq); 4362 } 4363 // Greedy match. 4364 // i is the index to start matching at 4365 // j is the number of atoms that have matched 4366 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4367 if (j >= cmax) { 4368 // We have matched the maximum... continue with the rest of 4369 // the regular expression 4370 return next.match(matcher, i, seq); 4371 } 4372 int backLimit = j; 4373 while (atom.match(matcher, i, seq)) { 4374 // k is the length of this match 4375 int k = matcher.last - i; 4376 if (k == 0) // Zero length match 4377 break; 4378 // Move up index and number matched 4379 i = matcher.last; 4380 j++; 4381 // We are greedy so match as many as we can 4382 while (j < cmax) { 4383 if (!atom.match(matcher, i, seq)) 4384 break; 4385 if (i + k != matcher.last) { 4386 if (match0(matcher, matcher.last, j+1, seq)) 4387 return true; 4388 break; 4389 } 4390 i += k; 4391 j++; 4392 } 4393 // Handle backing off if match fails 4394 while (j >= backLimit) { 4395 if (next.match(matcher, i, seq)) 4396 return true; 4397 i -= k; 4398 j--; 4399 } 4400 return false; 4401 } 4402 return next.match(matcher, i, seq); 4403 } 4404 // Reluctant match. At this point, the minimum has been satisfied. 4405 // i is the index to start matching at 4406 // j is the number of atoms that have matched 4407 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4408 for (;;) { 4409 // Try finishing match without consuming any more 4410 if (next.match(matcher, i, seq)) 4411 return true; 4412 // At the maximum, no match found 4413 if (j >= cmax) 4414 return false; 4415 // Okay, must try one more atom 4416 if (!atom.match(matcher, i, seq)) 4417 return false; 4418 // If we haven't moved forward then must break out 4419 if (i == matcher.last) 4420 return false; 4421 // Move up index and number matched 4422 i = matcher.last; 4423 j++; 4424 } 4425 } 4426 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4427 for (; j < cmax; j++) { 4428 if (!atom.match(matcher, i, seq)) 4429 break; 4430 if (i == matcher.last) 4431 break; 4432 i = matcher.last; 4433 } 4434 return next.match(matcher, i, seq); 4435 } 4436 boolean study(TreeInfo info) { 4437 // Save original info 4438 int minL = info.minLength; 4439 int maxL = info.maxLength; 4440 boolean maxV = info.maxValid; 4441 boolean detm = info.deterministic; 4442 info.reset(); 4443 4444 atom.study(info); 4445 4446 int temp = info.minLength * cmin + minL; 4447 if (temp < minL) { 4448 temp = 0xFFFFFFF; // arbitrary large number 4449 } 4450 info.minLength = temp; 4451 4452 if (maxV & info.maxValid) { 4453 temp = info.maxLength * cmax + maxL; 4454 info.maxLength = temp; 4455 if (temp < maxL) { 4456 info.maxValid = false; 4457 } 4458 } else { 4459 info.maxValid = false; 4460 } 4461 4462 if (info.deterministic && cmin == cmax) 4463 info.deterministic = detm; 4464 else 4465 info.deterministic = false; 4466 return next.study(info); 4467 } 4468 } 4469 4470 /** 4471 * Handles the curly-brace style repetition with a specified minimum and 4472 * maximum occurrences in deterministic cases. This is an iterative 4473 * optimization over the Prolog and Loop system which would handle this 4474 * in a recursive way. The * quantifier is handled as a special case. 4475 * If capture is true then this class saves group settings and ensures 4476 * that groups are unset when backing off of a group match. 4477 */ 4478 static final class GroupCurly extends Node { 4479 Node atom; 4480 Qtype type; 4481 int cmin; 4482 int cmax; 4483 int localIndex; 4484 int groupIndex; 4485 boolean capture; 4486 4487 GroupCurly(Node node, int cmin, int cmax, Qtype type, int local, 4488 int group, boolean capture) { 4489 this.atom = node; 4490 this.type = type; 4491 this.cmin = cmin; 4492 this.cmax = cmax; 4493 this.localIndex = local; 4494 this.groupIndex = group; 4495 this.capture = capture; 4496 } 4497 boolean match(Matcher matcher, int i, CharSequence seq) { 4498 int[] groups = matcher.groups; 4499 int[] locals = matcher.locals; 4500 int save0 = locals[localIndex]; 4501 int save1 = 0; 4502 int save2 = 0; 4503 4504 if (capture) { 4505 save1 = groups[groupIndex]; 4506 save2 = groups[groupIndex+1]; 4507 } 4508 4509 // Notify GroupTail there is no need to setup group info 4510 // because it will be set here 4511 locals[localIndex] = -1; 4512 4513 boolean ret = true; 4514 for (int j = 0; j < cmin; j++) { 4515 if (atom.match(matcher, i, seq)) { 4516 if (capture) { 4517 groups[groupIndex] = i; 4518 groups[groupIndex+1] = matcher.last; 4519 } 4520 i = matcher.last; 4521 } else { 4522 ret = false; 4523 break; 4524 } 4525 } 4526 if (ret) { 4527 if (type == Qtype.GREEDY) { 4528 ret = match0(matcher, i, cmin, seq); 4529 } else if (type == Qtype.LAZY) { 4530 ret = match1(matcher, i, cmin, seq); 4531 } else { 4532 ret = match2(matcher, i, cmin, seq); 4533 } 4534 } 4535 if (!ret) { 4536 locals[localIndex] = save0; 4537 if (capture) { 4538 groups[groupIndex] = save1; 4539 groups[groupIndex+1] = save2; 4540 } 4541 } 4542 return ret; 4543 } 4544 // Aggressive group match 4545 boolean match0(Matcher matcher, int i, int j, CharSequence seq) { 4546 // don't back off passing the starting "j" 4547 int min = j; 4548 int[] groups = matcher.groups; 4549 int save0 = 0; 4550 int save1 = 0; 4551 if (capture) { 4552 save0 = groups[groupIndex]; 4553 save1 = groups[groupIndex+1]; 4554 } 4555 for (;;) { 4556 if (j >= cmax) 4557 break; 4558 if (!atom.match(matcher, i, seq)) 4559 break; 4560 int k = matcher.last - i; 4561 if (k <= 0) { 4562 if (capture) { 4563 groups[groupIndex] = i; 4564 groups[groupIndex+1] = i + k; 4565 } 4566 i = i + k; 4567 break; 4568 } 4569 for (;;) { 4570 if (capture) { 4571 groups[groupIndex] = i; 4572 groups[groupIndex+1] = i + k; 4573 } 4574 i = i + k; 4575 if (++j >= cmax) 4576 break; 4577 if (!atom.match(matcher, i, seq)) 4578 break; 4579 if (i + k != matcher.last) { 4580 if (match0(matcher, i, j, seq)) 4581 return true; 4582 break; 4583 } 4584 } 4585 while (j > min) { 4586 if (next.match(matcher, i, seq)) { 4587 if (capture) { 4588 groups[groupIndex+1] = i; 4589 groups[groupIndex] = i - k; 4590 } 4591 return true; 4592 } 4593 // backing off 4594 i = i - k; 4595 if (capture) { 4596 groups[groupIndex+1] = i; 4597 groups[groupIndex] = i - k; 4598 } 4599 j--; 4600 4601 } 4602 break; 4603 } 4604 if (capture) { 4605 groups[groupIndex] = save0; 4606 groups[groupIndex+1] = save1; 4607 } 4608 return next.match(matcher, i, seq); 4609 } 4610 // Reluctant matching 4611 boolean match1(Matcher matcher, int i, int j, CharSequence seq) { 4612 for (;;) { 4613 if (next.match(matcher, i, seq)) 4614 return true; 4615 if (j >= cmax) 4616 return false; 4617 if (!atom.match(matcher, i, seq)) 4618 return false; 4619 if (i == matcher.last) 4620 return false; 4621 if (capture) { 4622 matcher.groups[groupIndex] = i; 4623 matcher.groups[groupIndex+1] = matcher.last; 4624 } 4625 i = matcher.last; 4626 j++; 4627 } 4628 } 4629 // Possessive matching 4630 boolean match2(Matcher matcher, int i, int j, CharSequence seq) { 4631 for (; j < cmax; j++) { 4632 if (!atom.match(matcher, i, seq)) { 4633 break; 4634 } 4635 if (capture) { 4636 matcher.groups[groupIndex] = i; 4637 matcher.groups[groupIndex+1] = matcher.last; 4638 } 4639 if (i == matcher.last) { 4640 break; 4641 } 4642 i = matcher.last; 4643 } 4644 return next.match(matcher, i, seq); 4645 } 4646 boolean study(TreeInfo info) { 4647 // Save original info 4648 int minL = info.minLength; 4649 int maxL = info.maxLength; 4650 boolean maxV = info.maxValid; 4651 boolean detm = info.deterministic; 4652 info.reset(); 4653 4654 atom.study(info); 4655 4656 int temp = info.minLength * cmin + minL; 4657 if (temp < minL) { 4658 temp = 0xFFFFFFF; // Arbitrary large number 4659 } 4660 info.minLength = temp; 4661 4662 if (maxV & info.maxValid) { 4663 temp = info.maxLength * cmax + maxL; 4664 info.maxLength = temp; 4665 if (temp < maxL) { 4666 info.maxValid = false; 4667 } 4668 } else { 4669 info.maxValid = false; 4670 } 4671 4672 if (info.deterministic && cmin == cmax) { 4673 info.deterministic = detm; 4674 } else { 4675 info.deterministic = false; 4676 } 4677 return next.study(info); 4678 } 4679 } 4680 4681 /** 4682 * A Guard node at the end of each atom node in a Branch. It 4683 * serves the purpose of chaining the "match" operation to 4684 * "next" but not the "study", so we can collect the TreeInfo 4685 * of each atom node without including the TreeInfo of the 4686 * "next". 4687 */ 4688 static final class BranchConn extends Node { 4689 BranchConn() {}; 4690 boolean match(Matcher matcher, int i, CharSequence seq) { 4691 return next.match(matcher, i, seq); 4692 } 4693 boolean study(TreeInfo info) { 4694 return info.deterministic; 4695 } 4696 } 4697 4698 /** 4699 * Handles the branching of alternations. Note this is also used for 4700 * the ? quantifier to branch between the case where it matches once 4701 * and where it does not occur. 4702 */ 4703 static final class Branch extends Node { 4704 Node[] atoms = new Node[2]; 4705 int size = 2; 4706 Node conn; 4707 Branch(Node first, Node second, Node branchConn) { 4708 conn = branchConn; 4709 atoms[0] = first; 4710 atoms[1] = second; 4711 } 4712 4713 void add(Node node) { 4714 if (size >= atoms.length) { 4715 Node[] tmp = new Node[atoms.length*2]; 4716 System.arraycopy(atoms, 0, tmp, 0, atoms.length); 4717 atoms = tmp; 4718 } 4719 atoms[size++] = node; 4720 } 4721 4722 boolean match(Matcher matcher, int i, CharSequence seq) { 4723 for (int n = 0; n < size; n++) { 4724 if (atoms[n] == null) { 4725 if (conn.next.match(matcher, i, seq)) 4726 return true; 4727 } else if (atoms[n].match(matcher, i, seq)) { 4728 return true; 4729 } 4730 } 4731 return false; 4732 } 4733 4734 boolean study(TreeInfo info) { 4735 int minL = info.minLength; 4736 int maxL = info.maxLength; 4737 boolean maxV = info.maxValid; 4738 4739 int minL2 = Integer.MAX_VALUE; //arbitrary large enough num 4740 int maxL2 = -1; 4741 for (int n = 0; n < size; n++) { 4742 info.reset(); 4743 if (atoms[n] != null) 4744 atoms[n].study(info); 4745 minL2 = Math.min(minL2, info.minLength); 4746 maxL2 = Math.max(maxL2, info.maxLength); 4747 maxV = (maxV & info.maxValid); 4748 } 4749 4750 minL += minL2; 4751 maxL += maxL2; 4752 4753 info.reset(); 4754 conn.next.study(info); 4755 4756 info.minLength += minL; 4757 info.maxLength += maxL; 4758 info.maxValid &= maxV; 4759 info.deterministic = false; 4760 return false; 4761 } 4762 } 4763 4764 /** 4765 * The GroupHead saves the location where the group begins in the locals 4766 * and restores them when the match is done. 4767 * 4768 * The matchRef is used when a reference to this group is accessed later 4769 * in the expression. The locals will have a negative value in them to 4770 * indicate that we do not want to unset the group if the reference 4771 * doesn't match. 4772 */ 4773 static final class GroupHead extends Node { 4774 int localIndex; 4775 GroupTail tail; // for debug/print only, match does not need to know 4776 GroupHead(int localCount) { 4777 localIndex = localCount; 4778 } 4779 boolean match(Matcher matcher, int i, CharSequence seq) { 4780 int save = matcher.locals[localIndex]; 4781 matcher.locals[localIndex] = i; 4782 boolean ret = next.match(matcher, i, seq); 4783 matcher.locals[localIndex] = save; 4784 return ret; 4785 } 4786 boolean matchRef(Matcher matcher, int i, CharSequence seq) { 4787 int save = matcher.locals[localIndex]; 4788 matcher.locals[localIndex] = ~i; // HACK 4789 boolean ret = next.match(matcher, i, seq); 4790 matcher.locals[localIndex] = save; 4791 return ret; 4792 } 4793 } 4794 4795 /** 4796 * Recursive reference to a group in the regular expression. It calls 4797 * matchRef because if the reference fails to match we would not unset 4798 * the group. 4799 */ 4800 static final class GroupRef extends Node { 4801 GroupHead head; 4802 GroupRef(GroupHead head) { 4803 this.head = head; 4804 } 4805 boolean match(Matcher matcher, int i, CharSequence seq) { 4806 return head.matchRef(matcher, i, seq) 4807 && next.match(matcher, matcher.last, seq); 4808 } 4809 boolean study(TreeInfo info) { 4810 info.maxValid = false; 4811 info.deterministic = false; 4812 return next.study(info); 4813 } 4814 } 4815 4816 /** 4817 * The GroupTail handles the setting of group beginning and ending 4818 * locations when groups are successfully matched. It must also be able to 4819 * unset groups that have to be backed off of. 4820 * 4821 * The GroupTail node is also used when a previous group is referenced, 4822 * and in that case no group information needs to be set. 4823 */ 4824 static final class GroupTail extends Node { 4825 int localIndex; 4826 int groupIndex; 4827 GroupTail(int localCount, int groupCount) { 4828 localIndex = localCount; 4829 groupIndex = groupCount + groupCount; 4830 } 4831 boolean match(Matcher matcher, int i, CharSequence seq) { 4832 int tmp = matcher.locals[localIndex]; 4833 if (tmp >= 0) { // This is the normal group case. 4834 // Save the group so we can unset it if it 4835 // backs off of a match. 4836 int groupStart = matcher.groups[groupIndex]; 4837 int groupEnd = matcher.groups[groupIndex+1]; 4838 4839 matcher.groups[groupIndex] = tmp; 4840 matcher.groups[groupIndex+1] = i; 4841 if (next.match(matcher, i, seq)) { 4842 return true; 4843 } 4844 matcher.groups[groupIndex] = groupStart; 4845 matcher.groups[groupIndex+1] = groupEnd; 4846 return false; 4847 } else { 4848 // This is a group reference case. We don't need to save any 4849 // group info because it isn't really a group. 4850 matcher.last = i; 4851 return true; 4852 } 4853 } 4854 } 4855 4856 /** 4857 * This sets up a loop to handle a recursive quantifier structure. 4858 */ 4859 static final class Prolog extends Node { 4860 Loop loop; 4861 Prolog(Loop loop) { 4862 this.loop = loop; 4863 } 4864 boolean match(Matcher matcher, int i, CharSequence seq) { 4865 return loop.matchInit(matcher, i, seq); 4866 } 4867 boolean study(TreeInfo info) { 4868 return loop.study(info); 4869 } 4870 } 4871 4872 /** 4873 * Handles the repetition count for a greedy Curly. The matchInit 4874 * is called from the Prolog to save the index of where the group 4875 * beginning is stored. A zero length group check occurs in the 4876 * normal match but is skipped in the matchInit. 4877 */ 4878 static class Loop extends Node { 4879 Node body; 4880 int countIndex; // local count index in matcher locals 4881 int beginIndex; // group beginning index 4882 int cmin, cmax; 4883 int posIndex; 4884 Loop(int countIndex, int beginIndex) { 4885 this.countIndex = countIndex; 4886 this.beginIndex = beginIndex; 4887 this.posIndex = -1; 4888 } 4889 boolean match(Matcher matcher, int i, CharSequence seq) { 4890 // Avoid infinite loop in zero-length case. 4891 if (i > matcher.locals[beginIndex]) { 4892 int count = matcher.locals[countIndex]; 4893 4894 // This block is for before we reach the minimum 4895 // iterations required for the loop to match 4896 if (count < cmin) { 4897 matcher.locals[countIndex] = count + 1; 4898 boolean b = body.match(matcher, i, seq); 4899 // If match failed we must backtrack, so 4900 // the loop count should NOT be incremented 4901 if (!b) 4902 matcher.locals[countIndex] = count; 4903 // Return success or failure since we are under 4904 // minimum 4905 return b; 4906 } 4907 // This block is for after we have the minimum 4908 // iterations required for the loop to match 4909 if (count < cmax) { 4910 // Let's check if we have already tried and failed 4911 // at this starting position "i" in the past. 4912 // If yes, then just return false wihtout trying 4913 // again, to stop the exponential backtracking. 4914 if (posIndex != -1 && 4915 matcher.localsPos[posIndex].contains(i)) { 4916 return next.match(matcher, i, seq); 4917 } 4918 matcher.locals[countIndex] = count + 1; 4919 boolean b = body.match(matcher, i, seq); 4920 // If match failed we must backtrack, so 4921 // the loop count should NOT be incremented 4922 if (b) 4923 return true; 4924 matcher.locals[countIndex] = count; 4925 // save the failed position 4926 if (posIndex != -1) { 4927 matcher.localsPos[posIndex].add(i); 4928 } 4929 } 4930 } 4931 return next.match(matcher, i, seq); 4932 } 4933 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4934 int save = matcher.locals[countIndex]; 4935 boolean ret = false; 4936 if (posIndex != -1 && matcher.localsPos[posIndex] == null) { 4937 matcher.localsPos[posIndex] = new IntHashSet(); 4938 } 4939 if (0 < cmin) { 4940 matcher.locals[countIndex] = 1; 4941 ret = body.match(matcher, i, seq); 4942 } else if (0 < cmax) { 4943 matcher.locals[countIndex] = 1; 4944 ret = body.match(matcher, i, seq); 4945 if (ret == false) 4946 ret = next.match(matcher, i, seq); 4947 } else { 4948 ret = next.match(matcher, i, seq); 4949 } 4950 matcher.locals[countIndex] = save; 4951 return ret; 4952 } 4953 boolean study(TreeInfo info) { 4954 info.maxValid = false; 4955 info.deterministic = false; 4956 return false; 4957 } 4958 } 4959 4960 /** 4961 * Handles the repetition count for a reluctant Curly. The matchInit 4962 * is called from the Prolog to save the index of where the group 4963 * beginning is stored. A zero length group check occurs in the 4964 * normal match but is skipped in the matchInit. 4965 */ 4966 static final class LazyLoop extends Loop { 4967 LazyLoop(int countIndex, int beginIndex) { 4968 super(countIndex, beginIndex); 4969 } 4970 boolean match(Matcher matcher, int i, CharSequence seq) { 4971 // Check for zero length group 4972 if (i > matcher.locals[beginIndex]) { 4973 int count = matcher.locals[countIndex]; 4974 if (count < cmin) { 4975 matcher.locals[countIndex] = count + 1; 4976 boolean result = body.match(matcher, i, seq); 4977 // If match failed we must backtrack, so 4978 // the loop count should NOT be incremented 4979 if (!result) 4980 matcher.locals[countIndex] = count; 4981 return result; 4982 } 4983 if (next.match(matcher, i, seq)) 4984 return true; 4985 if (count < cmax) { 4986 matcher.locals[countIndex] = count + 1; 4987 boolean result = body.match(matcher, i, seq); 4988 // If match failed we must backtrack, so 4989 // the loop count should NOT be incremented 4990 if (!result) 4991 matcher.locals[countIndex] = count; 4992 return result; 4993 } 4994 return false; 4995 } 4996 return next.match(matcher, i, seq); 4997 } 4998 boolean matchInit(Matcher matcher, int i, CharSequence seq) { 4999 int save = matcher.locals[countIndex]; 5000 boolean ret = false; 5001 if (0 < cmin) { 5002 matcher.locals[countIndex] = 1; 5003 ret = body.match(matcher, i, seq); 5004 } else if (next.match(matcher, i, seq)) { 5005 ret = true; 5006 } else if (0 < cmax) { 5007 matcher.locals[countIndex] = 1; 5008 ret = body.match(matcher, i, seq); 5009 } 5010 matcher.locals[countIndex] = save; 5011 return ret; 5012 } 5013 boolean study(TreeInfo info) { 5014 info.maxValid = false; 5015 info.deterministic = false; 5016 return false; 5017 } 5018 } 5019 5020 /** 5021 * Refers to a group in the regular expression. Attempts to match 5022 * whatever the group referred to last matched. 5023 */ 5024 static class BackRef extends Node { 5025 int groupIndex; 5026 BackRef(int groupCount) { 5027 super(); 5028 groupIndex = groupCount + groupCount; 5029 } 5030 boolean match(Matcher matcher, int i, CharSequence seq) { 5031 int j = matcher.groups[groupIndex]; 5032 int k = matcher.groups[groupIndex+1]; 5033 5034 int groupSize = k - j; 5035 // If the referenced group didn't match, neither can this 5036 if (j < 0) 5037 return false; 5038 5039 // If there isn't enough input left no match 5040 if (i + groupSize > matcher.to) { 5041 matcher.hitEnd = true; 5042 return false; 5043 } 5044 // Check each new char to make sure it matches what the group 5045 // referenced matched last time around 5046 for (int index=0; index<groupSize; index++) 5047 if (seq.charAt(i+index) != seq.charAt(j+index)) 5048 return false; 5049 5050 return next.match(matcher, i+groupSize, seq); 5051 } 5052 boolean study(TreeInfo info) { 5053 info.maxValid = false; 5054 return next.study(info); 5055 } 5056 } 5057 5058 static class CIBackRef extends Node { 5059 int groupIndex; 5060 boolean doUnicodeCase; 5061 CIBackRef(int groupCount, boolean doUnicodeCase) { 5062 super(); 5063 groupIndex = groupCount + groupCount; 5064 this.doUnicodeCase = doUnicodeCase; 5065 } 5066 boolean match(Matcher matcher, int i, CharSequence seq) { 5067 int j = matcher.groups[groupIndex]; 5068 int k = matcher.groups[groupIndex+1]; 5069 5070 int groupSize = k - j; 5071 5072 // If the referenced group didn't match, neither can this 5073 if (j < 0) 5074 return false; 5075 5076 // If there isn't enough input left no match 5077 if (i + groupSize > matcher.to) { 5078 matcher.hitEnd = true; 5079 return false; 5080 } 5081 5082 // Check each new char to make sure it matches what the group 5083 // referenced matched last time around 5084 int x = i; 5085 for (int index=0; index<groupSize; index++) { 5086 int c1 = Character.codePointAt(seq, x); 5087 int c2 = Character.codePointAt(seq, j); 5088 if (c1 != c2) { 5089 if (doUnicodeCase) { 5090 int cc1 = Character.toUpperCase(c1); 5091 int cc2 = Character.toUpperCase(c2); 5092 if (cc1 != cc2 && 5093 Character.toLowerCase(cc1) != 5094 Character.toLowerCase(cc2)) 5095 return false; 5096 } else { 5097 if (ASCII.toLower(c1) != ASCII.toLower(c2)) 5098 return false; 5099 } 5100 } 5101 x += Character.charCount(c1); 5102 j += Character.charCount(c2); 5103 } 5104 5105 return next.match(matcher, i+groupSize, seq); 5106 } 5107 boolean study(TreeInfo info) { 5108 info.maxValid = false; 5109 return next.study(info); 5110 } 5111 } 5112 5113 /** 5114 * Searches until the next instance of its atom. This is useful for 5115 * finding the atom efficiently without passing an instance of it 5116 * (greedy problem) and without a lot of wasted search time (reluctant 5117 * problem). 5118 */ 5119 static final class First extends Node { 5120 Node atom; 5121 First(Node node) { 5122 this.atom = BnM.optimize(node); 5123 } 5124 boolean match(Matcher matcher, int i, CharSequence seq) { 5125 if (atom instanceof BnM) { 5126 return atom.match(matcher, i, seq) 5127 && next.match(matcher, matcher.last, seq); 5128 } 5129 for (;;) { 5130 if (i > matcher.to) { 5131 matcher.hitEnd = true; 5132 return false; 5133 } 5134 if (atom.match(matcher, i, seq)) { 5135 return next.match(matcher, matcher.last, seq); 5136 } 5137 i += countChars(seq, i, 1); 5138 matcher.first++; 5139 } 5140 } 5141 boolean study(TreeInfo info) { 5142 atom.study(info); 5143 info.maxValid = false; 5144 info.deterministic = false; 5145 return next.study(info); 5146 } 5147 } 5148 5149 static final class Conditional extends Node { 5150 Node cond, yes, not; 5151 Conditional(Node cond, Node yes, Node not) { 5152 this.cond = cond; 5153 this.yes = yes; 5154 this.not = not; 5155 } 5156 boolean match(Matcher matcher, int i, CharSequence seq) { 5157 if (cond.match(matcher, i, seq)) { 5158 return yes.match(matcher, i, seq); 5159 } else { 5160 return not.match(matcher, i, seq); 5161 } 5162 } 5163 boolean study(TreeInfo info) { 5164 int minL = info.minLength; 5165 int maxL = info.maxLength; 5166 boolean maxV = info.maxValid; 5167 info.reset(); 5168 yes.study(info); 5169 5170 int minL2 = info.minLength; 5171 int maxL2 = info.maxLength; 5172 boolean maxV2 = info.maxValid; 5173 info.reset(); 5174 not.study(info); 5175 5176 info.minLength = minL + Math.min(minL2, info.minLength); 5177 info.maxLength = maxL + Math.max(maxL2, info.maxLength); 5178 info.maxValid = (maxV & maxV2 & info.maxValid); 5179 info.deterministic = false; 5180 return next.study(info); 5181 } 5182 } 5183 5184 /** 5185 * Zero width positive lookahead. 5186 */ 5187 static final class Pos extends Node { 5188 Node cond; 5189 Pos(Node cond) { 5190 this.cond = cond; 5191 } 5192 boolean match(Matcher matcher, int i, CharSequence seq) { 5193 int savedTo = matcher.to; 5194 boolean conditionMatched = false; 5195 5196 // Relax transparent region boundaries for lookahead 5197 if (matcher.transparentBounds) 5198 matcher.to = matcher.getTextLength(); 5199 try { 5200 conditionMatched = cond.match(matcher, i, seq); 5201 } finally { 5202 // Reinstate region boundaries 5203 matcher.to = savedTo; 5204 } 5205 return conditionMatched && next.match(matcher, i, seq); 5206 } 5207 } 5208 5209 /** 5210 * Zero width negative lookahead. 5211 */ 5212 static final class Neg extends Node { 5213 Node cond; 5214 Neg(Node cond) { 5215 this.cond = cond; 5216 } 5217 boolean match(Matcher matcher, int i, CharSequence seq) { 5218 int savedTo = matcher.to; 5219 boolean conditionMatched = false; 5220 5221 // Relax transparent region boundaries for lookahead 5222 if (matcher.transparentBounds) 5223 matcher.to = matcher.getTextLength(); 5224 try { 5225 if (i < matcher.to) { 5226 conditionMatched = !cond.match(matcher, i, seq); 5227 } else { 5228 // If a negative lookahead succeeds then more input 5229 // could cause it to fail! 5230 matcher.requireEnd = true; 5231 conditionMatched = !cond.match(matcher, i, seq); 5232 } 5233 } finally { 5234 // Reinstate region boundaries 5235 matcher.to = savedTo; 5236 } 5237 return conditionMatched && next.match(matcher, i, seq); 5238 } 5239 } 5240 5241 /** 5242 * For use with lookbehinds; matches the position where the lookbehind 5243 * was encountered. 5244 */ 5245 static Node lookbehindEnd = new Node() { 5246 boolean match(Matcher matcher, int i, CharSequence seq) { 5247 return i == matcher.lookbehindTo; 5248 } 5249 }; 5250 5251 /** 5252 * Zero width positive lookbehind. 5253 */ 5254 static class Behind extends Node { 5255 Node cond; 5256 int rmax, rmin; 5257 Behind(Node cond, int rmax, int rmin) { 5258 this.cond = cond; 5259 this.rmax = rmax; 5260 this.rmin = rmin; 5261 } 5262 5263 boolean match(Matcher matcher, int i, CharSequence seq) { 5264 int savedFrom = matcher.from; 5265 boolean conditionMatched = false; 5266 int startIndex = (!matcher.transparentBounds) ? 5267 matcher.from : 0; 5268 int from = Math.max(i - rmax, startIndex); 5269 // Set end boundary 5270 int savedLBT = matcher.lookbehindTo; 5271 matcher.lookbehindTo = i; 5272 // Relax transparent region boundaries for lookbehind 5273 if (matcher.transparentBounds) 5274 matcher.from = 0; 5275 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5276 conditionMatched = cond.match(matcher, j, seq); 5277 } 5278 matcher.from = savedFrom; 5279 matcher.lookbehindTo = savedLBT; 5280 return conditionMatched && next.match(matcher, i, seq); 5281 } 5282 } 5283 5284 /** 5285 * Zero width positive lookbehind, including supplementary 5286 * characters or unpaired surrogates. 5287 */ 5288 static final class BehindS extends Behind { 5289 BehindS(Node cond, int rmax, int rmin) { 5290 super(cond, rmax, rmin); 5291 } 5292 boolean match(Matcher matcher, int i, CharSequence seq) { 5293 int rmaxChars = countChars(seq, i, -rmax); 5294 int rminChars = countChars(seq, i, -rmin); 5295 int savedFrom = matcher.from; 5296 int startIndex = (!matcher.transparentBounds) ? 5297 matcher.from : 0; 5298 boolean conditionMatched = false; 5299 int from = Math.max(i - rmaxChars, startIndex); 5300 // Set end boundary 5301 int savedLBT = matcher.lookbehindTo; 5302 matcher.lookbehindTo = i; 5303 // Relax transparent region boundaries for lookbehind 5304 if (matcher.transparentBounds) 5305 matcher.from = 0; 5306 5307 for (int j = i - rminChars; 5308 !conditionMatched && j >= from; 5309 j -= j>from ? countChars(seq, j, -1) : 1) { 5310 conditionMatched = cond.match(matcher, j, seq); 5311 } 5312 matcher.from = savedFrom; 5313 matcher.lookbehindTo = savedLBT; 5314 return conditionMatched && next.match(matcher, i, seq); 5315 } 5316 } 5317 5318 /** 5319 * Zero width negative lookbehind. 5320 */ 5321 static class NotBehind extends Node { 5322 Node cond; 5323 int rmax, rmin; 5324 NotBehind(Node cond, int rmax, int rmin) { 5325 this.cond = cond; 5326 this.rmax = rmax; 5327 this.rmin = rmin; 5328 } 5329 5330 boolean match(Matcher matcher, int i, CharSequence seq) { 5331 int savedLBT = matcher.lookbehindTo; 5332 int savedFrom = matcher.from; 5333 boolean conditionMatched = false; 5334 int startIndex = (!matcher.transparentBounds) ? 5335 matcher.from : 0; 5336 int from = Math.max(i - rmax, startIndex); 5337 matcher.lookbehindTo = i; 5338 // Relax transparent region boundaries for lookbehind 5339 if (matcher.transparentBounds) 5340 matcher.from = 0; 5341 for (int j = i - rmin; !conditionMatched && j >= from; j--) { 5342 conditionMatched = cond.match(matcher, j, seq); 5343 } 5344 // Reinstate region boundaries 5345 matcher.from = savedFrom; 5346 matcher.lookbehindTo = savedLBT; 5347 return !conditionMatched && next.match(matcher, i, seq); 5348 } 5349 } 5350 5351 /** 5352 * Zero width negative lookbehind, including supplementary 5353 * characters or unpaired surrogates. 5354 */ 5355 static final class NotBehindS extends NotBehind { 5356 NotBehindS(Node cond, int rmax, int rmin) { 5357 super(cond, rmax, rmin); 5358 } 5359 boolean match(Matcher matcher, int i, CharSequence seq) { 5360 int rmaxChars = countChars(seq, i, -rmax); 5361 int rminChars = countChars(seq, i, -rmin); 5362 int savedFrom = matcher.from; 5363 int savedLBT = matcher.lookbehindTo; 5364 boolean conditionMatched = false; 5365 int startIndex = (!matcher.transparentBounds) ? 5366 matcher.from : 0; 5367 int from = Math.max(i - rmaxChars, startIndex); 5368 matcher.lookbehindTo = i; 5369 // Relax transparent region boundaries for lookbehind 5370 if (matcher.transparentBounds) 5371 matcher.from = 0; 5372 for (int j = i - rminChars; 5373 !conditionMatched && j >= from; 5374 j -= j>from ? countChars(seq, j, -1) : 1) { 5375 conditionMatched = cond.match(matcher, j, seq); 5376 } 5377 //Reinstate region boundaries 5378 matcher.from = savedFrom; 5379 matcher.lookbehindTo = savedLBT; 5380 return !conditionMatched && next.match(matcher, i, seq); 5381 } 5382 } 5383 5384 /** 5385 * Handles word boundaries. Includes a field to allow this one class to 5386 * deal with the different types of word boundaries we can match. The word 5387 * characters include underscores, letters, and digits. Non spacing marks 5388 * can are also part of a word if they have a base character, otherwise 5389 * they are ignored for purposes of finding word boundaries. 5390 */ 5391 static final class Bound extends Node { 5392 static int LEFT = 0x1; 5393 static int RIGHT= 0x2; 5394 static int BOTH = 0x3; 5395 static int NONE = 0x4; 5396 int type; 5397 boolean useUWORD; 5398 Bound(int n, boolean useUWORD) { 5399 type = n; 5400 this.useUWORD = useUWORD; 5401 } 5402 5403 boolean isWord(int ch) { 5404 return useUWORD ? CharPredicates.WORD().is(ch) 5405 : (ch == '_' || Character.isLetterOrDigit(ch)); 5406 } 5407 5408 int check(Matcher matcher, int i, CharSequence seq) { 5409 int ch; 5410 boolean left = false; 5411 int startIndex = matcher.from; 5412 int endIndex = matcher.to; 5413 if (matcher.transparentBounds) { 5414 startIndex = 0; 5415 endIndex = matcher.getTextLength(); 5416 } 5417 if (i > startIndex) { 5418 ch = Character.codePointBefore(seq, i); 5419 left = (isWord(ch) || 5420 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5421 && hasBaseCharacter(matcher, i-1, seq))); 5422 } 5423 boolean right = false; 5424 if (i < endIndex) { 5425 ch = Character.codePointAt(seq, i); 5426 right = (isWord(ch) || 5427 ((Character.getType(ch) == Character.NON_SPACING_MARK) 5428 && hasBaseCharacter(matcher, i, seq))); 5429 } else { 5430 // Tried to access char past the end 5431 matcher.hitEnd = true; 5432 // The addition of another char could wreck a boundary 5433 matcher.requireEnd = true; 5434 } 5435 return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE); 5436 } 5437 boolean match(Matcher matcher, int i, CharSequence seq) { 5438 return (check(matcher, i, seq) & type) > 0 5439 && next.match(matcher, i, seq); 5440 } 5441 } 5442 5443 /** 5444 * Non spacing marks only count as word characters in bounds calculations 5445 * if they have a base character. 5446 */ 5447 private static boolean hasBaseCharacter(Matcher matcher, int i, 5448 CharSequence seq) 5449 { 5450 int start = (!matcher.transparentBounds) ? 5451 matcher.from : 0; 5452 for (int x=i; x >= start; x--) { 5453 int ch = Character.codePointAt(seq, x); 5454 if (Character.isLetterOrDigit(ch)) 5455 return true; 5456 if (Character.getType(ch) == Character.NON_SPACING_MARK) 5457 continue; 5458 return false; 5459 } 5460 return false; 5461 } 5462 5463 /** 5464 * Attempts to match a slice in the input using the Boyer-Moore string 5465 * matching algorithm. The algorithm is based on the idea that the 5466 * pattern can be shifted farther ahead in the search text if it is 5467 * matched right to left. 5468 * <p> 5469 * The pattern is compared to the input one character at a time, from 5470 * the rightmost character in the pattern to the left. If the characters 5471 * all match the pattern has been found. If a character does not match, 5472 * the pattern is shifted right a distance that is the maximum of two 5473 * functions, the bad character shift and the good suffix shift. This 5474 * shift moves the attempted match position through the input more 5475 * quickly than a naive one position at a time check. 5476 * <p> 5477 * The bad character shift is based on the character from the text that 5478 * did not match. If the character does not appear in the pattern, the 5479 * pattern can be shifted completely beyond the bad character. If the 5480 * character does occur in the pattern, the pattern can be shifted to 5481 * line the pattern up with the next occurrence of that character. 5482 * <p> 5483 * The good suffix shift is based on the idea that some subset on the right 5484 * side of the pattern has matched. When a bad character is found, the 5485 * pattern can be shifted right by the pattern length if the subset does 5486 * not occur again in pattern, or by the amount of distance to the 5487 * next occurrence of the subset in the pattern. 5488 * 5489 * Boyer-Moore search methods adapted from code by Amy Yu. 5490 */ 5491 static class BnM extends Node { 5492 int[] buffer; 5493 int[] lastOcc; 5494 int[] optoSft; 5495 5496 /** 5497 * Pre calculates arrays needed to generate the bad character 5498 * shift and the good suffix shift. Only the last seven bits 5499 * are used to see if chars match; This keeps the tables small 5500 * and covers the heavily used ASCII range, but occasionally 5501 * results in an aliased match for the bad character shift. 5502 */ 5503 static Node optimize(Node node) { 5504 if (!(node instanceof Slice)) { 5505 return node; 5506 } 5507 5508 int[] src = ((Slice) node).buffer; 5509 int patternLength = src.length; 5510 // The BM algorithm requires a bit of overhead; 5511 // If the pattern is short don't use it, since 5512 // a shift larger than the pattern length cannot 5513 // be used anyway. 5514 if (patternLength < 4) { 5515 return node; 5516 } 5517 int i, j, k; 5518 int[] lastOcc = new int[128]; 5519 int[] optoSft = new int[patternLength]; 5520 // Precalculate part of the bad character shift 5521 // It is a table for where in the pattern each 5522 // lower 7-bit value occurs 5523 for (i = 0; i < patternLength; i++) { 5524 lastOcc[src[i]&0x7F] = i + 1; 5525 } 5526 // Precalculate the good suffix shift 5527 // i is the shift amount being considered 5528 NEXT: for (i = patternLength; i > 0; i--) { 5529 // j is the beginning index of suffix being considered 5530 for (j = patternLength - 1; j >= i; j--) { 5531 // Testing for good suffix 5532 if (src[j] == src[j-i]) { 5533 // src[j..len] is a good suffix 5534 optoSft[j-1] = i; 5535 } else { 5536 // No match. The array has already been 5537 // filled up with correct values before. 5538 continue NEXT; 5539 } 5540 } 5541 // This fills up the remaining of optoSft 5542 // any suffix can not have larger shift amount 5543 // then its sub-suffix. Why??? 5544 while (j > 0) { 5545 optoSft[--j] = i; 5546 } 5547 } 5548 // Set the guard value because of unicode compression 5549 optoSft[patternLength-1] = 1; 5550 if (node instanceof SliceS) 5551 return new BnMS(src, lastOcc, optoSft, node.next); 5552 return new BnM(src, lastOcc, optoSft, node.next); 5553 } 5554 BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5555 this.buffer = src; 5556 this.lastOcc = lastOcc; 5557 this.optoSft = optoSft; 5558 this.next = next; 5559 } 5560 boolean match(Matcher matcher, int i, CharSequence seq) { 5561 int[] src = buffer; 5562 int patternLength = src.length; 5563 int last = matcher.to - patternLength; 5564 5565 // Loop over all possible match positions in text 5566 NEXT: while (i <= last) { 5567 // Loop over pattern from right to left 5568 for (int j = patternLength - 1; j >= 0; j--) { 5569 int ch = seq.charAt(i+j); 5570 if (ch != src[j]) { 5571 // Shift search to the right by the maximum of the 5572 // bad character shift and the good suffix shift 5573 i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]); 5574 continue NEXT; 5575 } 5576 } 5577 // Entire pattern matched starting at i 5578 matcher.first = i; 5579 boolean ret = next.match(matcher, i + patternLength, seq); 5580 if (ret) { 5581 matcher.first = i; 5582 matcher.groups[0] = matcher.first; 5583 matcher.groups[1] = matcher.last; 5584 return true; 5585 } 5586 i++; 5587 } 5588 // BnM is only used as the leading node in the unanchored case, 5589 // and it replaced its Start() which always searches to the end 5590 // if it doesn't find what it's looking for, so hitEnd is true. 5591 matcher.hitEnd = true; 5592 return false; 5593 } 5594 boolean study(TreeInfo info) { 5595 info.minLength += buffer.length; 5596 info.maxValid = false; 5597 return next.study(info); 5598 } 5599 } 5600 5601 /** 5602 * Supplementary support version of BnM(). Unpaired surrogates are 5603 * also handled by this class. 5604 */ 5605 static final class BnMS extends BnM { 5606 int lengthInChars; 5607 5608 BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) { 5609 super(src, lastOcc, optoSft, next); 5610 for (int cp : buffer) { 5611 lengthInChars += Character.charCount(cp); 5612 } 5613 } 5614 boolean match(Matcher matcher, int i, CharSequence seq) { 5615 int[] src = buffer; 5616 int patternLength = src.length; 5617 int last = matcher.to - lengthInChars; 5618 5619 // Loop over all possible match positions in text 5620 NEXT: while (i <= last) { 5621 // Loop over pattern from right to left 5622 int ch; 5623 for (int j = countChars(seq, i, patternLength), x = patternLength - 1; 5624 j > 0; j -= Character.charCount(ch), x--) { 5625 ch = Character.codePointBefore(seq, i+j); 5626 if (ch != src[x]) { 5627 // Shift search to the right by the maximum of the 5628 // bad character shift and the good suffix shift 5629 int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]); 5630 i += countChars(seq, i, n); 5631 continue NEXT; 5632 } 5633 } 5634 // Entire pattern matched starting at i 5635 matcher.first = i; 5636 boolean ret = next.match(matcher, i + lengthInChars, seq); 5637 if (ret) { 5638 matcher.first = i; 5639 matcher.groups[0] = matcher.first; 5640 matcher.groups[1] = matcher.last; 5641 return true; 5642 } 5643 i += countChars(seq, i, 1); 5644 } 5645 matcher.hitEnd = true; 5646 return false; 5647 } 5648 } 5649 5650 @FunctionalInterface 5651 static interface CharPredicate { 5652 boolean is(int ch); 5653 5654 default CharPredicate and(CharPredicate p) { 5655 return ch -> is(ch) && p.is(ch); 5656 } 5657 default CharPredicate union(CharPredicate p) { 5658 return ch -> is(ch) || p.is(ch); 5659 } 5660 default CharPredicate union(CharPredicate p1, 5661 CharPredicate p2 ) { 5662 return ch -> is(ch) || p1.is(ch) || p2.is(ch); 5663 } 5664 default CharPredicate negate() { 5665 return ch -> !is(ch); 5666 } 5667 } 5668 5669 static interface BmpCharPredicate extends CharPredicate { 5670 5671 default CharPredicate and(CharPredicate p) { 5672 if(p instanceof BmpCharPredicate) 5673 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch)); 5674 return ch -> is(ch) && p.is(ch); 5675 } 5676 default CharPredicate union(CharPredicate p) { 5677 if (p instanceof BmpCharPredicate) 5678 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch)); 5679 return ch -> is(ch) || p.is(ch); 5680 } 5681 static CharPredicate union(CharPredicate... predicates) { 5682 CharPredicate cp = ch -> { 5683 for (CharPredicate p : predicates) { 5684 if (!p.is(ch)) 5685 return false; 5686 } 5687 return true; 5688 }; 5689 for (CharPredicate p : predicates) { 5690 if (! (p instanceof BmpCharPredicate)) 5691 return cp; 5692 } 5693 return (BmpCharPredicate)cp; 5694 } 5695 } 5696 5697 /** 5698 * matches a Perl vertical whitespace 5699 */ 5700 static BmpCharPredicate VertWS() { 5701 return cp -> (cp >= 0x0A && cp <= 0x0D) || 5702 cp == 0x85 || cp == 0x2028 || cp == 0x2029; 5703 } 5704 5705 /** 5706 * matches a Perl horizontal whitespace 5707 */ 5708 static BmpCharPredicate HorizWS() { 5709 return cp -> 5710 cp == 0x09 || cp == 0x20 || cp == 0xa0 || cp == 0x1680 || 5711 cp == 0x180e || cp >= 0x2000 && cp <= 0x200a || cp == 0x202f || 5712 cp == 0x205f || cp == 0x3000; 5713 } 5714 5715 /** 5716 * for the Unicode category ALL and the dot metacharacter when 5717 * in dotall mode. 5718 */ 5719 static CharPredicate ALL() { 5720 return ch -> true; 5721 } 5722 5723 /** 5724 * for the dot metacharacter when dotall is not enabled. 5725 */ 5726 static CharPredicate DOT() { 5727 return ch -> 5728 (ch != '\n' && ch != '\r' 5729 && (ch|1) != '\u2029' 5730 && ch != '\u0085'); 5731 } 5732 5733 /** 5734 * the dot metacharacter when dotall is not enabled but UNIX_LINES is enabled. 5735 */ 5736 static CharPredicate UNIXDOT() { 5737 return ch -> ch != '\n'; 5738 } 5739 5740 /** 5741 * Indicate that matches a Supplementary Unicode character 5742 */ 5743 static CharPredicate SingleS(int c) { 5744 return ch -> ch == c; 5745 } 5746 5747 /** 5748 * A bmp/optimized predicate of single 5749 */ 5750 static BmpCharPredicate Single(int c) { 5751 return ch -> ch == c; 5752 } 5753 5754 /** 5755 * Case insensitive matches a given BMP character 5756 */ 5757 static BmpCharPredicate SingleI(int lower, int upper) { 5758 return ch -> ch == lower || ch == upper; 5759 } 5760 5761 /** 5762 * Unicode case insensitive matches a given Unicode character 5763 */ 5764 static CharPredicate SingleU(int lower) { 5765 return ch -> lower == ch || 5766 lower == Character.toLowerCase(Character.toUpperCase(ch)); 5767 } 5768 5769 private static boolean inRange(int lower, int ch, int upper) { 5770 return lower <= ch && ch <= upper; 5771 } 5772 5773 /** 5774 * Charactrs within a explicit value range 5775 */ 5776 static CharPredicate Range(int lower, int upper) { 5777 if (upper < Character.MIN_HIGH_SURROGATE || 5778 lower > Character.MAX_HIGH_SURROGATE && 5779 upper < Character.MIN_SUPPLEMENTARY_CODE_POINT) 5780 return (BmpCharPredicate)(ch -> inRange(lower, ch, upper)); 5781 return ch -> inRange(lower, ch, upper); 5782 } 5783 5784 /** 5785 * Charactrs within a explicit value range in a case insensitive manner. 5786 */ 5787 static CharPredicate CIRange(int lower, int upper) { 5788 return ch -> inRange(lower, ch, upper) || 5789 ASCII.isAscii(ch) && 5790 (inRange(lower, ASCII.toUpper(ch), upper) || 5791 inRange(lower, ASCII.toLower(ch), upper)); 5792 } 5793 5794 static CharPredicate CIRangeU(int lower, int upper) { 5795 return ch -> { 5796 if (inRange(lower, ch, upper)) 5797 return true; 5798 int up = Character.toUpperCase(ch); 5799 return inRange(lower, up, upper) || 5800 inRange(lower, Character.toLowerCase(up), upper); 5801 }; 5802 } 5803 5804 /** 5805 * This must be the very first initializer. 5806 */ 5807 static final Node accept = new Node(); 5808 5809 static final Node lastAccept = new LastNode(); 5810 5811 /** 5812 * Creates a predicate that tests if for a given input string argument, an 5813 * input sequence, there exists a subsequence that matches this pattern. 5814 * 5815 * @apiNote 5816 * This method creates a predicate that behaves as if it creates a matcher 5817 * from the input sequence and then calls {@code find}, for example a 5818 * predicate of the form: 5819 * <pre>{@code 5820 * s -> matcher(s).find(); 5821 * }</pre> 5822 * 5823 * @return The predicate which can be used for finding a match on a subsequence of a string 5824 * @since 1.8 5825 * @see Matcher#find 5826 */ 5827 public Predicate<String> asPredicate() { 5828 return s -> matcher(s).find(); 5829 } 5830 5831 /** 5832 * Creates a stream from the given input sequence around matches of this 5833 * pattern. 5834 * 5835 * <p> The stream returned by this method contains each substring of the 5836 * input sequence that is terminated by another subsequence that matches 5837 * this pattern or is terminated by the end of the input sequence. The 5838 * substrings in the stream are in the order in which they occur in the 5839 * input. Trailing empty strings will be discarded and not encountered in 5840 * the stream. 5841 * 5842 * <p> If this pattern does not match any subsequence of the input then 5843 * the resulting stream has just one element, namely the input sequence in 5844 * string form. 5845 * 5846 * <p> When there is a positive-width match at the beginning of the input 5847 * sequence then an empty leading substring is included at the beginning 5848 * of the stream. A zero-width match at the beginning however never produces 5849 * such empty leading substring. 5850 * 5851 * <p> If the input sequence is mutable, it must remain constant during the 5852 * execution of the terminal stream operation. Otherwise, the result of the 5853 * terminal stream operation is undefined. 5854 * 5855 * @param input 5856 * The character sequence to be split 5857 * 5858 * @return The stream of strings computed by splitting the input 5859 * around matches of this pattern 5860 * @see #split(CharSequence) 5861 * @since 1.8 5862 */ 5863 public Stream<String> splitAsStream(final CharSequence input) { 5864 class MatcherIterator implements Iterator<String> { 5865 private Matcher matcher; 5866 // The start position of the next sub-sequence of input 5867 // when current == input.length there are no more elements 5868 private int current; 5869 // null if the next element, if any, needs to obtained 5870 private String nextElement; 5871 // > 0 if there are N next empty elements 5872 private int emptyElementCount; 5873 5874 public String next() { 5875 if (!hasNext()) 5876 throw new NoSuchElementException(); 5877 5878 if (emptyElementCount == 0) { 5879 String n = nextElement; 5880 nextElement = null; 5881 return n; 5882 } else { 5883 emptyElementCount--; 5884 return ""; 5885 } 5886 } 5887 5888 public boolean hasNext() { 5889 if (matcher == null) { 5890 matcher = matcher(input); 5891 // If the input is an empty string then the result can only be a 5892 // stream of the input. Induce that by setting the empty 5893 // element count to 1 5894 emptyElementCount = input.length() == 0 ? 1 : 0; 5895 } 5896 if (nextElement != null || emptyElementCount > 0) 5897 return true; 5898 5899 if (current == input.length()) 5900 return false; 5901 5902 // Consume the next matching element 5903 // Count sequence of matching empty elements 5904 while (matcher.find()) { 5905 nextElement = input.subSequence(current, matcher.start()).toString(); 5906 current = matcher.end(); 5907 if (!nextElement.isEmpty()) { 5908 return true; 5909 } else if (current > 0) { // no empty leading substring for zero-width 5910 // match at the beginning of the input 5911 emptyElementCount++; 5912 } 5913 } 5914 5915 // Consume last matching element 5916 nextElement = input.subSequence(current, input.length()).toString(); 5917 current = input.length(); 5918 if (!nextElement.isEmpty()) { 5919 return true; 5920 } else { 5921 // Ignore a terminal sequence of matching empty elements 5922 emptyElementCount = 0; 5923 nextElement = null; 5924 return false; 5925 } 5926 } 5927 } 5928 return StreamSupport.stream(Spliterators.spliteratorUnknownSize( 5929 new MatcherIterator(), Spliterator.ORDERED | Spliterator.NONNULL), false); 5930 } 5931 }