< prev index next >

src/java.base/share/classes/java/util/regex/Pattern.java

Print this page
rev 12318 : [mq]: 8131034-Cleanup-in-j.u.regex.Pattern.quote


 548  * <p>
 549  * A Unicode character can also be represented in a regular-expression by
 550  * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
 551  * <tt>\x{...}</tt>, for example a supplementary character U+2011F
 552  * can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
 553  * Unicode escape sequences of the surrogate pair
 554  * <tt>\uD840</tt><tt>\uDD1F</tt>.
 555  * <p>
 556  * Unicode scripts, blocks, categories and binary properties are written with
 557  * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
 558  * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
 559  * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
 560  * does not match if the input has that property.
 561  * <p>
 562  * Scripts, blocks, categories and binary properties can be used both inside
 563  * and outside of a character class.
 564  *
 565  * <p>
 566  * <b><a name="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in
 567  * {@code IsHiragana}, or by using the {@code script} keyword (or its short
 568  * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
 569  * <p>
 570  * The script names supported by <code>Pattern</code> are the valid script names
 571  * accepted and defined by
 572  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
 573  *
 574  * <p>
 575  * <b><a name="ubc">Blocks</a></b> are specified with the prefix {@code In}, as in
 576  * {@code InMongolian}, or by using the keyword {@code block} (or its short
 577  * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
 578  * <p>
 579  * The block names supported by <code>Pattern</code> are the valid block names
 580  * accepted and defined by
 581  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
 582  * <p>
 583  *
 584  * <b><a name="ucc">Categories</a></b> may be specified with the optional prefix {@code Is}:
 585  * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
 586  * letters. Same as scripts and blocks, categories can also be specified
 587  * by using the keyword {@code general_category} (or its short form
 588  * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.


1282 
1283     /**
1284      * Returns a literal pattern <code>String</code> for the specified
1285      * <code>String</code>.
1286      *
1287      * <p>This method produces a <code>String</code> that can be used to
1288      * create a <code>Pattern</code> that would match the string
1289      * <code>s</code> as if it were a literal pattern.</p> Metacharacters
1290      * or escape sequences in the input sequence will be given no special
1291      * meaning.
1292      *
1293      * @param  s The string to be literalized
1294      * @return  A literal string replacement
1295      * @since 1.5
1296      */
1297     public static String quote(String s) {
1298         int slashEIndex = s.indexOf("\\E");
1299         if (slashEIndex == -1)
1300             return "\\Q" + s + "\\E";
1301 
1302         StringBuilder sb = new StringBuilder(s.length() * 2);




1303         sb.append("\\Q");
1304         slashEIndex = 0;
1305         int current = 0;
1306         while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
1307             sb.append(s.substring(current, slashEIndex));

1308             current = slashEIndex + 2;
1309             sb.append("\\E\\\\E\\Q");
1310         }
1311         sb.append(s.substring(current, s.length()));
1312         sb.append("\\E");
1313         return sb.toString();
1314     }
1315 
1316     /**
1317      * Recompile the Pattern instance from a stream.  The original pattern
1318      * string is read in and the object tree is recompiled from it.
1319      */
1320     private void readObject(java.io.ObjectInputStream s)
1321         throws java.io.IOException, ClassNotFoundException {
1322 
1323         // Read in all fields
1324         s.defaultReadObject();
1325 
1326         // Initialize counts
1327         capturingGroupCount = 1;
1328         localCount = 0;
1329 
1330         // if length > 0, the Pattern is lazily compiled
1331         compiled = false;
1332         if (pattern.length() == 0) {
1333             root = new Start(lastAccept);


1350         pattern = p;
1351         flags = f;
1352 
1353         // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
1354         if ((flags & UNICODE_CHARACTER_CLASS) != 0)
1355             flags |= UNICODE_CASE;
1356 
1357         // Reset group index count
1358         capturingGroupCount = 1;
1359         localCount = 0;
1360 
1361         if (pattern.length() > 0) {
1362             compile();
1363         } else {
1364             root = new Start(lastAccept);
1365             matchRoot = lastAccept;
1366         }
1367     }
1368 
1369     /**
1370      * The pattern is converted to normalized form D (NFD) and then a pure group
1371      * is constructed to match canonical equivalences of the characters.
1372      */
1373     private void normalize() {
1374         boolean inCharClass = false;
1375         int lastCodePoint = -1;
1376 
1377         // Convert pattern into normalizedD form
1378         normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
1379         patternLength = normalizedPattern.length();
1380 
1381         // Modify pattern to match canonical equivalences
1382         StringBuilder newPattern = new StringBuilder(patternLength);
1383         for(int i=0; i<patternLength; ) {
1384             int c = normalizedPattern.codePointAt(i);
1385             StringBuilder sequenceBuffer;
1386             if ((Character.getType(c) == Character.NON_SPACING_MARK)
1387                 && (lastCodePoint != -1)) {
1388                 sequenceBuffer = new StringBuilder();
1389                 sequenceBuffer.appendCodePoint(lastCodePoint);
1390                 sequenceBuffer.appendCodePoint(c);
1391                 while(Character.getType(c) == Character.NON_SPACING_MARK) {
1392                     i += Character.charCount(c);
1393                     if (i >= patternLength)
1394                         break;
1395                     c = normalizedPattern.codePointAt(i);
1396                     sequenceBuffer.appendCodePoint(c);
1397                 }




 548  * <p>
 549  * A Unicode character can also be represented in a regular-expression by
 550  * using its <b>Hex notation</b> (hexadecimal code point value) directly as described in construct
 551  * <tt>\x{...}</tt>, for example a supplementary character U+2011F
 552  * can be specified as <tt>\x{2011F}</tt>, instead of two consecutive
 553  * Unicode escape sequences of the surrogate pair
 554  * <tt>\uD840</tt><tt>\uDD1F</tt>.
 555  * <p>
 556  * Unicode scripts, blocks, categories and binary properties are written with
 557  * the <tt>\p</tt> and <tt>\P</tt> constructs as in Perl.
 558  * <tt>\p{</tt><i>prop</i><tt>}</tt> matches if
 559  * the input has the property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt>
 560  * does not match if the input has that property.
 561  * <p>
 562  * Scripts, blocks, categories and binary properties can be used both inside
 563  * and outside of a character class.
 564  *
 565  * <p>
 566  * <b><a name="usc">Scripts</a></b> are specified either with the prefix {@code Is}, as in
 567  * {@code IsHiragana}, or by using the {@code script} keyword (or its short
 568  * form {@code sc}) as in {@code script=Hiragana} or {@code sc=Hiragana}.
 569  * <p>
 570  * The script names supported by <code>Pattern</code> are the valid script names
 571  * accepted and defined by
 572  * {@link java.lang.Character.UnicodeScript#forName(String) UnicodeScript.forName}.
 573  *
 574  * <p>
 575  * <b><a name="ubc">Blocks</a></b> are specified with the prefix {@code In}, as in
 576  * {@code InMongolian}, or by using the keyword {@code block} (or its short
 577  * form {@code blk}) as in {@code block=Mongolian} or {@code blk=Mongolian}.
 578  * <p>
 579  * The block names supported by <code>Pattern</code> are the valid block names
 580  * accepted and defined by
 581  * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
 582  * <p>
 583  *
 584  * <b><a name="ucc">Categories</a></b> may be specified with the optional prefix {@code Is}:
 585  * Both {@code \p{L}} and {@code \p{IsL}} denote the category of Unicode
 586  * letters. Same as scripts and blocks, categories can also be specified
 587  * by using the keyword {@code general_category} (or its short form
 588  * {@code gc}) as in {@code general_category=Lu} or {@code gc=Lu}.


1282 
1283     /**
1284      * Returns a literal pattern <code>String</code> for the specified
1285      * <code>String</code>.
1286      *
1287      * <p>This method produces a <code>String</code> that can be used to
1288      * create a <code>Pattern</code> that would match the string
1289      * <code>s</code> as if it were a literal pattern.</p> Metacharacters
1290      * or escape sequences in the input sequence will be given no special
1291      * meaning.
1292      *
1293      * @param  s The string to be literalized
1294      * @return  A literal string replacement
1295      * @since 1.5
1296      */
1297     public static String quote(String s) {
1298         int slashEIndex = s.indexOf("\\E");
1299         if (slashEIndex == -1)
1300             return "\\Q" + s + "\\E";
1301 
1302         int lenHint = s.length();
1303         lenHint = (lenHint < Integer.MAX_VALUE - 8 - lenHint) ?
1304                 (lenHint << 1) : (Integer.MAX_VALUE - 8);
1305 
1306         StringBuilder sb = new StringBuilder(lenHint);
1307         sb.append("\\Q");

1308         int current = 0;
1309         do {
1310             sb.append(s, current, slashEIndex)
1311                     .append("\\E\\\\E\\Q");
1312             current = slashEIndex + 2;
1313         } while ((slashEIndex = s.indexOf("\\E", current)) != -1);
1314 
1315         return sb.append(s, current, s.length())
1316                 .append("\\E")
1317                 .toString();
1318     }
1319 
1320     /**
1321      * Recompile the Pattern instance from a stream.  The original pattern
1322      * string is read in and the object tree is recompiled from it.
1323      */
1324     private void readObject(java.io.ObjectInputStream s)
1325         throws java.io.IOException, ClassNotFoundException {
1326 
1327         // Read in all fields
1328         s.defaultReadObject();
1329 
1330         // Initialize counts
1331         capturingGroupCount = 1;
1332         localCount = 0;
1333 
1334         // if length > 0, the Pattern is lazily compiled
1335         compiled = false;
1336         if (pattern.length() == 0) {
1337             root = new Start(lastAccept);


1354         pattern = p;
1355         flags = f;
1356 
1357         // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
1358         if ((flags & UNICODE_CHARACTER_CLASS) != 0)
1359             flags |= UNICODE_CASE;
1360 
1361         // Reset group index count
1362         capturingGroupCount = 1;
1363         localCount = 0;
1364 
1365         if (pattern.length() > 0) {
1366             compile();
1367         } else {
1368             root = new Start(lastAccept);
1369             matchRoot = lastAccept;
1370         }
1371     }
1372 
1373     /**
1374      * The pattern is converted to normalized form and then a pure group
1375      * is constructed to match canonical equivalences of the characters.
1376      */
1377     private void normalize() {
1378         boolean inCharClass = false;
1379         int lastCodePoint = -1;
1380 
1381         // Convert pattern into normalized form
1382         normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
1383         patternLength = normalizedPattern.length();
1384 
1385         // Modify pattern to match canonical equivalences
1386         StringBuilder newPattern = new StringBuilder(patternLength);
1387         for(int i=0; i<patternLength; ) {
1388             int c = normalizedPattern.codePointAt(i);
1389             StringBuilder sequenceBuffer;
1390             if ((Character.getType(c) == Character.NON_SPACING_MARK)
1391                 && (lastCodePoint != -1)) {
1392                 sequenceBuffer = new StringBuilder();
1393                 sequenceBuffer.appendCodePoint(lastCodePoint);
1394                 sequenceBuffer.appendCodePoint(c);
1395                 while(Character.getType(c) == Character.NON_SPACING_MARK) {
1396                     i += Character.charCount(c);
1397                     if (i >= patternLength)
1398                         break;
1399                     c = normalizedPattern.codePointAt(i);
1400                     sequenceBuffer.appendCodePoint(c);
1401                 }


< prev index next >