28 import java.text.Normalizer;
29 import java.text.Normalizer.Form;
30 import java.util.Locale;
31 import java.util.Iterator;
32 import java.util.Map;
33 import java.util.ArrayList;
34 import java.util.HashMap;
35 import java.util.LinkedHashSet;
36 import java.util.List;
37 import java.util.Set;
38 import java.util.Arrays;
39 import java.util.NoSuchElementException;
40 import java.util.Spliterator;
41 import java.util.Spliterators;
42 import java.util.function.Predicate;
43 import java.util.stream.Stream;
44 import java.util.stream.StreamSupport;
45
46 import jdk.internal.util.ArraysSupport;
47
48 /**
49 * A compiled representation of a regular expression.
50 *
51 * <p> A regular expression, specified as a string, must first be compiled into
52 * an instance of this class. The resulting pattern can then be used to create
53 * a {@link Matcher} object that can match arbitrary {@linkplain
54 * java.lang.CharSequence character sequences} against the regular
55 * expression. All of the state involved in performing a match resides in the
56 * matcher, so many matchers can share the same pattern.
57 *
58 * <p> A typical invocation sequence is thus
59 *
60 * <blockquote><pre>
61 * Pattern p = Pattern.{@link #compile compile}("a*b");
62 * Matcher m = p.{@link #matcher matcher}("aaaaab");
63 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
64 *
65 * <p> A {@link #matches matches} method is defined by this class as a
66 * convenience for when a regular expression is used just once. This method
67 * compiles an expression and matches an input sequence against it in a single
113 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>\x</code><i>{h...h}</i></th>
114 * <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i>
115 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
116 * &lt;= {@code 0x}<i>h...h</i> &lt;=
117 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
118 * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>\N{</code><i>name</i><code>}</code></th>
119 * <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr>
120 * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th>
121 * <td headers="matches characters tab">The tab character (<code>'\u0009'</code>)</td></tr>
122 * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th>
123 * <td headers="matches characters newline">The newline (line feed) character (<code>'\u000A'</code>)</td></tr>
124 * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th>
125 * <td headers="matches characters return">The carriage-return character (<code>'\u000D'</code>)</td></tr>
126 * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th>
127 * <td headers="matches characters form_feed">The form-feed character (<code>'\u000C'</code>)</td></tr>
128 * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th>
129 * <td headers="matches characters bell">The alert (bell) character (<code>'\u0007'</code>)</td></tr>
130 * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th>
131 * <td headers="matches characters escape">The escape character (<code>'\u001B'</code>)</td></tr>
132 * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th>
133 * <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i></td></tr>
134 *
135 * <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr>
136 *
137 * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th>
138 * <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>
139 * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th>
140 * <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr>
141 * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th>
142 * <td headers="matches classes range">{@code a} through {@code z}
143 * or {@code A} through {@code Z}, inclusive (range)</td></tr>
144 * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th>
145 * <td headers="matches classes union">{@code a} through {@code d},
146 * or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr>
147 * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th>
148 * <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</td></tr>
149 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th>
150 * <td headers="matches classes subtraction1">{@code a} through {@code z},
151 * except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr>
152 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th>
153 * <td headers="matches classes subtraction2">{@code a} through {@code z},
154 * and not {@code m} through {@code p}: {@code [a-lq-z]} (subtraction)</td></tr>
155 *
156 * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr>
157 *
158 * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
159 * <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
160 * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
161 * <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
162 * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
163 * <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
164 * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
165 * <td headers="matches predef horiz_white">A horizontal whitespace character:
166 * <code>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</code></td></tr>
167 * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
168 * <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
169 * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
170 * <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
171 * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
172 * <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
173 * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
174 * <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85\u2028\u2029]</code>
1041 /**
1042 * Index into the pattern string that keeps track of how much of the
1043 * pattern has been parsed so far.
1044 */
1045 private transient int cursor;
1046
1047 /**
1048 * Holds the length of the pattern string.
1049 */
1050 private transient int patternLength;
1051
1052 /**
1053 * Whether the Start node might possibly match supplementary characters.
1054 * It is set to true during compiling if
1055 * (1) there is a supplementary char in the pattern, or
1056 * (2) there is a complement node of a "family" CharProperty.
1057 */
1058 private transient boolean hasSupplementary;
1059
1060 /**
1061 * Compiles the given regular expression into a pattern.
1062 *
1063 * @param regex
1064 * The expression to be compiled
1065 * @return the given regular expression compiled into a pattern
1066 * @throws PatternSyntaxException
1067 * If the expression's syntax is invalid
1068 */
1069 public static Pattern compile(String regex) {
1070 return new Pattern(regex, 0);
1071 }
1072
1073 /**
1074 * Compiles the given regular expression into a pattern with the given
1075 * flags.
1076 *
1077 * @param regex
1078 * The expression to be compiled
1079 *
1080 * @param flags
3308 return new Curly(prev, cmin, cmax, Qtype.LAZY);
3309 } else if (ch == '+') {
3310 next();
3311 return new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3312 } else {
3313 return new Curly(prev, cmin, cmax, Qtype.GREEDY);
3314 }
3315 } else {
3316 throw error("Illegal repetition");
3317 }
3318 default:
3319 return prev;
3320 }
3321 }
3322
3323 /**
3324 * Utility method for parsing control escape sequences.
3325 */
3326 private int c() {
3327 if (cursor < patternLength) {
3328 return read() ^ 64;
3329 }
3330 throw error("Illegal control escape sequence");
3331 }
3332
3333 /**
3334 * Utility method for parsing octal escape sequences.
3335 */
3336 private int o() {
3337 int n = read();
3338 if (((n-'0')|('7'-n)) >= 0) {
3339 int m = read();
3340 if (((m-'0')|('7'-m)) >= 0) {
3341 int o = read();
3342 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
3343 return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
3344 }
3345 unread();
3346 return (n - '0') * 8 + (m - '0');
3347 }
3348 unread();
|
28 import java.text.Normalizer;
29 import java.text.Normalizer.Form;
30 import java.util.Locale;
31 import java.util.Iterator;
32 import java.util.Map;
33 import java.util.ArrayList;
34 import java.util.HashMap;
35 import java.util.LinkedHashSet;
36 import java.util.List;
37 import java.util.Set;
38 import java.util.Arrays;
39 import java.util.NoSuchElementException;
40 import java.util.Spliterator;
41 import java.util.Spliterators;
42 import java.util.function.Predicate;
43 import java.util.stream.Stream;
44 import java.util.stream.StreamSupport;
45
46 import jdk.internal.util.ArraysSupport;
47
48 import sun.security.action.GetPropertyAction;
49
50 /**
51 * A compiled representation of a regular expression.
52 *
53 * <p> A regular expression, specified as a string, must first be compiled into
54 * an instance of this class. The resulting pattern can then be used to create
55 * a {@link Matcher} object that can match arbitrary {@linkplain
56 * java.lang.CharSequence character sequences} against the regular
57 * expression. All of the state involved in performing a match resides in the
58 * matcher, so many matchers can share the same pattern.
59 *
60 * <p> A typical invocation sequence is thus
61 *
62 * <blockquote><pre>
63 * Pattern p = Pattern.{@link #compile compile}("a*b");
64 * Matcher m = p.{@link #matcher matcher}("aaaaab");
65 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
66 *
67 * <p> A {@link #matches matches} method is defined by this class as a
68 * convenience for when a regular expression is used just once. This method
69 * compiles an expression and matches an input sequence against it in a single
115 * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>\x</code><i>{h...h}</i></th>
116 * <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i>
117 * ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
118 * &lt;= {@code 0x}<i>h...h</i> &lt;=
119 * {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
120 * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>\N{</code><i>name</i><code>}</code></th>
121 * <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr>
122 * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th>
123 * <td headers="matches characters tab">The tab character (<code>'\u0009'</code>)</td></tr>
124 * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th>
125 * <td headers="matches characters newline">The newline (line feed) character (<code>'\u000A'</code>)</td></tr>
126 * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th>
127 * <td headers="matches characters return">The carriage-return character (<code>'\u000D'</code>)</td></tr>
128 * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th>
129 * <td headers="matches characters form_feed">The form-feed character (<code>'\u000C'</code>)</td></tr>
130 * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th>
131 * <td headers="matches characters bell">The alert (bell) character (<code>'\u0007'</code>)</td></tr>
132 * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th>
133 * <td headers="matches characters escape">The escape character (<code>'\u001B'</code>)</td></tr>
134 * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th>
135 * <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i>
136 * (<i>x</i> is either {@code A} through {@code Z} or one of
137 * {@code ?}, {@code @}, {@code [}, {@code \\}, {@code ]}, {@code ^}, {@code _})</td></tr>
138 *
139 * <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr>
140 *
141 * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th>
142 * <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>
143 * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th>
144 * <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr>
145 * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th>
146 * <td headers="matches classes range">{@code a} through {@code z}
147 * or {@code A} through {@code Z}, inclusive (range)</td></tr>
148 * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th>
149 * <td headers="matches classes union">{@code a} through {@code d},
150 * or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr>
151 * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th>
152 * <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</td></tr>
153 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th>
154 * <td headers="matches classes subtraction1">{@code a} through {@code z},
155 * except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr>
156 * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th>
157 * <td headers="matches classes subtraction2">{@code a} through {@code z},
158 * and not {@code m} through {@code p}: {@code [a-lq-z]} (subtraction)</td></tr>
159 *
160 * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr>
161 *
162 * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
163 * <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
164 * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
165 * <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
166 * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
167 * <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
168 * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
169 * <td headers="matches predef horiz_white">A horizontal whitespace character:
170 * <code>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</code></td></tr>
171 * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
172 * <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
173 * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
174 * <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
175 * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
176 * <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
177 * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
178 * <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85\u2028\u2029]</code>
1045 /**
1046 * Index into the pattern string that keeps track of how much of the
1047 * pattern has been parsed so far.
1048 */
1049 private transient int cursor;
1050
1051 /**
1052 * Holds the length of the pattern string.
1053 */
1054 private transient int patternLength;
1055
1056 /**
1057 * Whether the Start node might possibly match supplementary characters.
1058 * It is set to true during compiling if
1059 * (1) there is a supplementary char in the pattern, or
1060 * (2) there is a complement node of a "family" CharProperty.
1061 */
1062 private transient boolean hasSupplementary;
1063
1064 /**
1065 * If {@code true} then only limited list of chars is accepted as
1066 * control-character IDs in regular expressions of form "\\cX":
1067 * 'A' through 'Z', '?', '@', '[', '\\', ']', '^', '_'.
1068 * Otherwise, no restrictions on the IDs are exposed.
1069 */
1070 private static final boolean RESTRICTED_CONTROL_CHAR_IDS = Boolean.valueOf(
1071 GetPropertyAction.privilegedGetProperty(
1072 "jdk.util.regex.restrictedControlCharIds", "true"));
1073
1074 /**
1075 * If {@code true} then lower-case control-character ids are mapped to the
1076 * their upper-case counterparts.
1077 * For example, "\\ca" will be the same as "\\cA".
1078 */
1079 private static final boolean ALLOW_LOWERCASE_CONTROL_CHAR_IDS = Boolean.valueOf(
1080 GetPropertyAction.privilegedGetProperty(
1081 "jdk.util.regex.allowLowerCaseControlCharIds", "false"));
1082
1083 /**
1084 * Compiles the given regular expression into a pattern.
1085 *
1086 * @param regex
1087 * The expression to be compiled
1088 * @return the given regular expression compiled into a pattern
1089 * @throws PatternSyntaxException
1090 * If the expression's syntax is invalid
1091 */
1092 public static Pattern compile(String regex) {
1093 return new Pattern(regex, 0);
1094 }
1095
1096 /**
1097 * Compiles the given regular expression into a pattern with the given
1098 * flags.
1099 *
1100 * @param regex
1101 * The expression to be compiled
1102 *
1103 * @param flags
3331 return new Curly(prev, cmin, cmax, Qtype.LAZY);
3332 } else if (ch == '+') {
3333 next();
3334 return new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3335 } else {
3336 return new Curly(prev, cmin, cmax, Qtype.GREEDY);
3337 }
3338 } else {
3339 throw error("Illegal repetition");
3340 }
3341 default:
3342 return prev;
3343 }
3344 }
3345
3346 /**
3347 * Utility method for parsing control escape sequences.
3348 */
3349 private int c() {
3350 if (cursor < patternLength) {
3351 int ch = read();
3352 if (ALLOW_LOWERCASE_CONTROL_CHAR_IDS && ASCII.isLower(ch))
3353 return ch ^ 0x60;
3354 if (!RESTRICTED_CONTROL_CHAR_IDS || ASCII.isCntrlId(ch))
3355 return ch ^ 0x40;
3356 }
3357 throw error("Illegal control escape sequence");
3358 }
3359
3360 /**
3361 * Utility method for parsing octal escape sequences.
3362 */
3363 private int o() {
3364 int n = read();
3365 if (((n-'0')|('7'-n)) >= 0) {
3366 int m = read();
3367 if (((m-'0')|('7'-m)) >= 0) {
3368 int o = read();
3369 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
3370 return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
3371 }
3372 unread();
3373 return (n - '0') * 8 + (m - '0');
3374 }
3375 unread();
|