open Sdiff src/java.base/share/classes/java/util/regex

src/java.base/share/classes/java/util/regex/Pattern.java

rev 56177 : [mq]: 8230365-Pattern-for-a-control-char-matches-non-control-characters

  28 import java.text.Normalizer;
  29 import java.text.Normalizer.Form;
  30 import java.util.Locale;
  31 import java.util.Iterator;
  32 import java.util.Map;
  33 import java.util.ArrayList;
  34 import java.util.HashMap;
  35 import java.util.LinkedHashSet;
  36 import java.util.List;
  37 import java.util.Set;
  38 import java.util.Arrays;
  39 import java.util.NoSuchElementException;
  40 import java.util.Spliterator;
  41 import java.util.Spliterators;
  42 import java.util.function.Predicate;
  43 import java.util.stream.Stream;
  44 import java.util.stream.StreamSupport;
  45 
  46 import jdk.internal.util.ArraysSupport;
  47 


  48 /**
  49  * A compiled representation of a regular expression.
  50  *
  51  * <p> A regular expression, specified as a string, must first be compiled into
  52  * an instance of this class.  The resulting pattern can then be used to create
  53  * a {@link Matcher} object that can match arbitrary {@linkplain
  54  * java.lang.CharSequence character sequences} against the regular
  55  * expression.  All of the state involved in performing a match resides in the
  56  * matcher, so many matchers can share the same pattern.
  57  *
  58  * <p> A typical invocation sequence is thus
  59  *
  60  * <blockquote><pre>
  61  * Pattern p = Pattern.{@link #compile compile}("a*b");
  62  * Matcher m = p.{@link #matcher matcher}("aaaaab");
  63  * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
  64  *
  65  * <p> A {@link #matches matches} method is defined by this class as a
  66  * convenience for when a regular expression is used just once.  This method
  67  * compiles an expression and matches an input sequence against it in a single

 113  * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>\x</code><i>{h...h}</i></th>
 114  *     <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i>
 115  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
 116  *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;
 117  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
 118  * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>\N{</code><i>name</i><code>}</code></th>
 119  *     <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr>
 120  * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th>
 121  *     <td headers="matches characters tab">The tab character (<code>'\u0009'</code>)</td></tr>
 122  * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th>
 123  *     <td headers="matches characters newline">The newline (line feed) character (<code>'\u000A'</code>)</td></tr>
 124  * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th>
 125  *     <td headers="matches characters return">The carriage-return character (<code>'\u000D'</code>)</td></tr>
 126  * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th>
 127  *     <td headers="matches characters form_feed">The form-feed character (<code>'\u000C'</code>)</td></tr>
 128  * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th>
 129  *     <td headers="matches characters bell">The alert (bell) character (<code>'\u0007'</code>)</td></tr>
 130  * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th>
 131  *     <td headers="matches characters escape">The escape character (<code>'\u001B'</code>)</td></tr>
 132  * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th>
 133  *     <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i></td></tr>


 134  *
 135  *  <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr>
 136  *
 137  * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th>
 138  *     <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>
 139  * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th>
 140  *     <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr>
 141  * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th>
 142  *     <td headers="matches classes range">{@code a} through {@code z}
 143  *         or {@code A} through {@code Z}, inclusive (range)</td></tr>
 144  * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th>
 145  *     <td headers="matches classes union">{@code a} through {@code d},
 146  *      or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr>
 147  * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th>
 148  *     <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</td></tr>
 149  * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th>
 150  *     <td headers="matches classes subtraction1">{@code a} through {@code z},
 151  *         except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr>
 152  * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th>
 153  *     <td headers="matches classes subtraction2">{@code a} through {@code z},
 154  *          and not {@code m} through {@code p}: {@code [a-lq-z]} (subtraction)</td></tr>
 155  *
 156  * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr>
 157  *
 158  * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
 159  *     <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
 160  * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
 161  *     <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
 162  * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
 163  *     <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
 164  * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
 165  *     <td headers="matches predef horiz_white">A horizontal whitespace character:
 166  *     <code>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</code></td></tr>
 167  * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
 168  *     <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
 169  * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
 170  *     <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
 171  * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
 172  *     <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
 173  * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
 174  *     <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85\u2028\u2029]</code>

1041     /**
1042      * Index into the pattern string that keeps track of how much has been
1043      * parsed.
1044      */
1045     private transient int cursor;
1046 
1047     /**
1048      * Holds the length of the pattern string. Parsing code compares
          * {@code cursor} against this value to detect the end of the pattern.
1049      */
1050     private transient int patternLength;
1051 
1052     /**
1053      * Whether the Start node might possibly match supplementary characters.
1054      * It is set to true during compilation if
1055      * (1) there is a supplementary character in the pattern, or
1056      * (2) there is a complement node of a "family" CharProperty.
1057      */
1058     private transient boolean hasSupplementary;
1059 
1060     /**



















1061      * Compiles the given regular expression into a pattern.
1062      *
1063      * @param  regex
1064      *         The expression to be compiled
1065      * @return the given regular expression compiled into a pattern
1066      * @throws  PatternSyntaxException
1067      *          If the expression's syntax is invalid
1068      */
1069     public static Pattern compile(String regex) {
1070         return new Pattern(regex, 0); // second argument 0: presumably "no flags" -- confirm against the constructor
1071     }
1072 
1073     /**
1074      * Compiles the given regular expression into a pattern with the given
1075      * flags.
1076      *
1077      * @param  regex
1078      *         The expression to be compiled
1079      *
1080      * @param  flags

3308                     return new Curly(prev, cmin, cmax, Qtype.LAZY);
3309                 } else if (ch == '+') {
3310                     next();
3311                     return new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3312                 } else {
3313                     return new Curly(prev, cmin, cmax, Qtype.GREEDY);
3314                 }
3315             } else {
3316                 throw error("Illegal repetition");
3317             }
3318         default:
3319             return prev;
3320         }
3321     }
3322 
3323     /**
3324      *  Utility method for parsing control escape sequences.
          *
          *  Reads the next pattern character (the control-character id) and
          *  returns its code XORed with 64. The id itself is not validated:
          *  whatever character read() returns is used.
          *  If the cursor is already at the end of the pattern, the exception
          *  built by error(...) is thrown instead (presumably a
          *  PatternSyntaxException -- confirm against error()).
3325      */
3326     private int c() {
3327         if (cursor < patternLength) {
3328             return read() ^ 64;




3329         }
3330         throw error("Illegal control escape sequence");
3331     }
3332 
3333     /**
3334      *  Utility method for parsing octal escape sequences.
3335      */
3336     private int o() {
3337         int n = read();
3338         if (((n-'0')|('7'-n)) >= 0) {
3339             int m = read();
3340             if (((m-'0')|('7'-m)) >= 0) {
3341                 int o = read();
3342                 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
3343                     return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
3344                 }
3345                 unread();
3346                 return (n - '0') * 8 + (m - '0');
3347             }
3348             unread();

  28 import java.text.Normalizer;
  29 import java.text.Normalizer.Form;
  30 import java.util.Locale;
  31 import java.util.Iterator;
  32 import java.util.Map;
  33 import java.util.ArrayList;
  34 import java.util.HashMap;
  35 import java.util.LinkedHashSet;
  36 import java.util.List;
  37 import java.util.Set;
  38 import java.util.Arrays;
  39 import java.util.NoSuchElementException;
  40 import java.util.Spliterator;
  41 import java.util.Spliterators;
  42 import java.util.function.Predicate;
  43 import java.util.stream.Stream;
  44 import java.util.stream.StreamSupport;
  45 
  46 import jdk.internal.util.ArraysSupport;
  47 
  48 import sun.security.action.GetPropertyAction;
  49 
  50 /**
  51  * A compiled representation of a regular expression.
  52  *
  53  * <p> A regular expression, specified as a string, must first be compiled into
  54  * an instance of this class.  The resulting pattern can then be used to create
  55  * a {@link Matcher} object that can match arbitrary {@linkplain
  56  * java.lang.CharSequence character sequences} against the regular
  57  * expression.  All of the state involved in performing a match resides in the
  58  * matcher, so many matchers can share the same pattern.
  59  *
  60  * <p> A typical invocation sequence is thus
  61  *
  62  * <blockquote><pre>
  63  * Pattern p = Pattern.{@link #compile compile}("a*b");
  64  * Matcher m = p.{@link #matcher matcher}("aaaaab");
  65  * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
  66  *
  67  * <p> A {@link #matches matches} method is defined by this class as a
  68  * convenience for when a regular expression is used just once.  This method
  69  * compiles an expression and matches an input sequence against it in a single

 115  * <tr><th style="vertical-align:top; font-weight: normal" id="hex_h_h"><code>\x</code><i>{h...h}</i></th>
 116  *     <td headers="matches characters hex_h_h">The character with hexadecimal value {@code 0x}<i>h...h</i>
 117  *         ({@link java.lang.Character#MIN_CODE_POINT Character.MIN_CODE_POINT}
 118  *         &nbsp;&lt;=&nbsp;{@code 0x}<i>h...h</i>&nbsp;&lt;=&nbsp;
 119  *          {@link java.lang.Character#MAX_CODE_POINT Character.MAX_CODE_POINT})</td></tr>
 120  * <tr><th style="vertical-align:top; font-weight: normal" id="unicode_name"><code>\N{</code><i>name</i><code>}</code></th>
 121  *     <td headers="matches characters unicode_name">The character with Unicode character name <i>'name'</i></td></tr>
 122  * <tr><th style="vertical-align:top; font-weight:normal" id="tab">{@code \t}</th>
 123  *     <td headers="matches characters tab">The tab character (<code>'\u0009'</code>)</td></tr>
 124  * <tr><th style="vertical-align:top; font-weight:normal" id="newline">{@code \n}</th>
 125  *     <td headers="matches characters newline">The newline (line feed) character (<code>'\u000A'</code>)</td></tr>
 126  * <tr><th style="vertical-align:top; font-weight:normal" id="return">{@code \r}</th>
 127  *     <td headers="matches characters return">The carriage-return character (<code>'\u000D'</code>)</td></tr>
 128  * <tr><th style="vertical-align:top; font-weight:normal" id="form_feed">{@code \f}</th>
 129  *     <td headers="matches characters form_feed">The form-feed character (<code>'\u000C'</code>)</td></tr>
 130  * <tr><th style="vertical-align:top; font-weight:normal" id="bell">{@code \a}</th>
 131  *     <td headers="matches characters bell">The alert (bell) character (<code>'\u0007'</code>)</td></tr>
 132  * <tr><th style="vertical-align:top; font-weight:normal" id="escape">{@code \e}</th>
 133  *     <td headers="matches characters escape">The escape character (<code>'\u001B'</code>)</td></tr>
 134  * <tr><th style="vertical-align:top; font-weight:normal" id="ctrl_x">{@code \c}<i>x</i></th>
 135  *     <td headers="matches characters ctrl_x">The control character corresponding to <i>x</i>
 136  *         (<i>x</i> is either {@code A} through {@code Z} or one of
 137  *          {@code ?}, {@code @}, {@code [}, {@code \}, {@code ]}, {@code ^}, {@code _})</td></tr>
 138  *
 139  *  <tr><th colspan="2" style="padding-top:20px" id="classes">Character classes</th></tr>
 140  *
 141  * <tr><th style="vertical-align:top; font-weight:normal" id="simple">{@code [abc]}</th>
 142  *     <td headers="matches classes simple">{@code a}, {@code b}, or {@code c} (simple class)</td></tr>
 143  * <tr><th style="vertical-align:top; font-weight:normal" id="negation">{@code [^abc]}</th>
 144  *     <td headers="matches classes negation">Any character except {@code a}, {@code b}, or {@code c} (negation)</td></tr>
 145  * <tr><th style="vertical-align:top; font-weight:normal" id="range">{@code [a-zA-Z]}</th>
 146  *     <td headers="matches classes range">{@code a} through {@code z}
 147  *         or {@code A} through {@code Z}, inclusive (range)</td></tr>
 148  * <tr><th style="vertical-align:top; font-weight:normal" id="union">{@code [a-d[m-p]]}</th>
 149  *     <td headers="matches classes union">{@code a} through {@code d},
 150  *      or {@code m} through {@code p}: {@code [a-dm-p]} (union)</td></tr>
 151  * <tr><th style="vertical-align:top; font-weight:normal" id="intersection">{@code [a-z&&[def]]}</th>
 152  *     <td headers="matches classes intersection">{@code d}, {@code e}, or {@code f} (intersection)</td></tr>
 153  * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction1">{@code [a-z&&[^bc]]}</th>
 154  *     <td headers="matches classes subtraction1">{@code a} through {@code z},
 155  *         except for {@code b} and {@code c}: {@code [ad-z]} (subtraction)</td></tr>
 156  * <tr><th style="vertical-align:top; font-weight:normal" id="subtraction2">{@code [a-z&&[^m-p]]}</th>
 157  *     <td headers="matches classes subtraction2">{@code a} through {@code z},
 158  *          and not {@code m} through {@code p}: {@code [a-lq-z]} (subtraction)</td></tr>
 159  *
 160  * <tr><th colspan="2" style="padding-top:20px" id="predef">Predefined character classes</th></tr>
 161  *
 162  * <tr><th style="vertical-align:top; font-weight:normal" id="any">{@code .}</th>
 163  *     <td headers="matches predef any">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
 164  * <tr><th style="vertical-align:top; font-weight:normal" id="digit">{@code \d}</th>
 165  *     <td headers="matches predef digit">A digit: {@code [0-9]}</td></tr>
 166  * <tr><th style="vertical-align:top; font-weight:normal" id="non_digit">{@code \D}</th>
 167  *     <td headers="matches predef non_digit">A non-digit: {@code [^0-9]}</td></tr>
 168  * <tr><th style="vertical-align:top; font-weight:normal" id="horiz_white">{@code \h}</th>
 169  *     <td headers="matches predef horiz_white">A horizontal whitespace character:
 170  *     <code>[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]</code></td></tr>
 171  * <tr><th style="vertical-align:top; font-weight:normal" id="non_horiz_white">{@code \H}</th>
 172  *     <td headers="matches predef non_horiz_white">A non-horizontal whitespace character: {@code [^\h]}</td></tr>
 173  * <tr><th style="vertical-align:top; font-weight:normal" id="white">{@code \s}</th>
 174  *     <td headers="matches predef white">A whitespace character: {@code [ \t\n\x0B\f\r]}</td></tr>
 175  * <tr><th style="vertical-align:top; font-weight:normal" id="non_white">{@code \S}</th>
 176  *     <td headers="matches predef non_white">A non-whitespace character: {@code [^\s]}</td></tr>
 177  * <tr><th style="vertical-align:top; font-weight:normal" id="vert_white">{@code \v}</th>
 178  *     <td headers="matches predef vert_white">A vertical whitespace character: <code>[\n\x0B\f\r\x85\u2028\u2029]</code>

1045     /**
1046      * Index into the pattern string that keeps track of how much has been
1047      * parsed.
1048      */
1049     private transient int cursor;
1050 
1051     /**
1052      * Holds the length of the pattern string. Parsing code compares
          * {@code cursor} against this value to detect the end of the pattern.
1053      */
1054     private transient int patternLength;
1055 
1056     /**
1057      * Whether the Start node might possibly match supplementary characters.
1058      * It is set to true during compilation if
1059      * (1) there is a supplementary character in the pattern, or
1060      * (2) there is a complement node of a "family" CharProperty.
1061      */
1062     private transient boolean hasSupplementary;
1063 
1064     /**
1065      * If {@code true}, only a limited set of characters is accepted as
1066      * control-character IDs in regular expressions of the form "\\cX":
1067      * 'A' through 'Z', '?', '@', '[', '\\', ']', '^', '_'.
1068      * Otherwise, no restrictions are imposed on the IDs.
          * Initialized once from the "jdk.util.regex.restrictedControlCharIds"
          * property via GetPropertyAction.privilegedGetProperty, with "true"
          * passed as the default.
1069      */
1070     private static final boolean RESTRICTED_CONTROL_CHAR_IDS = Boolean.parseBoolean(
1071             GetPropertyAction.privilegedGetProperty(
1072                     "jdk.util.regex.restrictedControlCharIds", "true"));
1073 
1074     /**
1075      * If {@code true}, lower-case control-character IDs are mapped to
1076      * their upper-case counterparts.
1077      * For example, "\\ca" will be the same as "\\cA".
          * Initialized once from the "jdk.util.regex.allowLowerCaseControlCharIds"
          * property via GetPropertyAction.privilegedGetProperty, with "false"
          * passed as the default.
1078      */
1079     private static final boolean ALLOW_LOWERCASE_CONTROL_CHAR_IDS = Boolean.parseBoolean(
1080             GetPropertyAction.privilegedGetProperty(
1081                     "jdk.util.regex.allowLowerCaseControlCharIds", "false"));
1082 
1083     /**
1084      * Compiles the given regular expression into a pattern.
1085      *
1086      * @param  regex
1087      *         The expression to be compiled
1088      * @return the given regular expression compiled into a pattern
1089      * @throws  PatternSyntaxException
1090      *          If the expression's syntax is invalid
1091      */
1092     public static Pattern compile(String regex) {
1093         return new Pattern(regex, 0); // second argument 0: presumably "no flags" -- confirm against the constructor
1094     }
1095 
1096     /**
1097      * Compiles the given regular expression into a pattern with the given
1098      * flags.
1099      *
1100      * @param  regex
1101      *         The expression to be compiled
1102      *
1103      * @param  flags

3331                     return new Curly(prev, cmin, cmax, Qtype.LAZY);
3332                 } else if (ch == '+') {
3333                     next();
3334                     return new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3335                 } else {
3336                     return new Curly(prev, cmin, cmax, Qtype.GREEDY);
3337                 }
3338             } else {
3339                 throw error("Illegal repetition");
3340             }
3341         default:
3342             return prev;
3343         }
3344     }
3345 
3346     /**
3347      *  Utility method for parsing control escape sequences.
          *
          *  Reads the next pattern character (the control-character id) and
          *  derives the result by flipping bits of its code:
          *  if ALLOW_LOWERCASE_CONTROL_CHAR_IDS is set and ASCII.isLower accepts
          *  the id, the id is XORed with 0x60 (so 0x61 would yield 0x01);
          *  otherwise, if RESTRICTED_CONTROL_CHAR_IDS is not set or
          *  ASCII.isCntrlId accepts the id, the id is XORed with 0x40.
          *  In every other case, including when the cursor is already at the
          *  end of the pattern, the exception built by error(...) is thrown
          *  (presumably a PatternSyntaxException -- confirm against error()).
3348      */
3349     private int c() {
3350         if (cursor < patternLength) {
3351             int ch = read();
                 // Lower-case ids are honoured only when explicitly enabled.
3352             if (ALLOW_LOWERCASE_CONTROL_CHAR_IDS && ASCII.isLower(ch))
3353                 return ch ^ 0x60;
                 // Unrestricted mode accepts any id; restricted mode defers to ASCII.isCntrlId.
3354             if (!RESTRICTED_CONTROL_CHAR_IDS || ASCII.isCntrlId(ch))
3355                 return ch ^ 0x40;
                 // Otherwise fall through: the id was rejected.
3356         }
3357         throw error("Illegal control escape sequence");
3358     }
3359 
3360     /**
3361      *  Utility method for parsing octal escape sequences.
3362      */
3363     private int o() {
3364         int n = read();
3365         if (((n-'0')|('7'-n)) >= 0) {
3366             int m = read();
3367             if (((m-'0')|('7'-m)) >= 0) {
3368                 int o = read();
3369                 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
3370                     return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
3371                 }
3372                 unread();
3373                 return (n - '0') * 8 + (m - '0');
3374             }
3375             unread();

< prev index next >