test/java/util/regex/RegExTest.java

Print this page




  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /**
  25  * @test
  26  * @summary tests RegExp framework (use -Dseed=X to set PRNG seed)
  27  * @author Mike McCloskey
  28  * @bug 4481568 4482696 4495089 4504687 4527731 4599621 4631553 4619345
  29  * 4630911 4672616 4711773 4727935 4750573 4792284 4803197 4757029 4808962
  30  * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
  31  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  32  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
  33  * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
  34  * 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647 6559590
  35  * 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819

  36  * @library /lib/testlibrary
  37  * @build jdk.testlibrary.*
  38  * @run main RegExTest
  39  * @key randomness
  40  */
  41 
  42 import java.util.function.Function;
  43 import java.util.regex.*;
  44 import java.util.Random;
  45 import java.util.Scanner;
  46 import java.io.*;
  47 import java.nio.file.*;
  48 import java.util.*;
  49 import java.nio.CharBuffer;
  50 import java.util.function.Predicate;
  51 import jdk.testlibrary.RandomFactory;
  52 
  53 /**
  54  * This is a test class created to check the operation of
  55  * the Pattern and Matcher classes.


2642             failCount++;
2643 
2644         // Marks that cannot legally change order and be equivalent
2645         p = Pattern.compile("testa\u0308\u0300", Pattern.CANON_EQ);
2646         check(p, "testa\u0308\u0300", true);
2647         check(p, "testa\u0300\u0308", false);
2648 
2649         // Marks that can legally change order and be equivalent
2650         p = Pattern.compile("testa\u0308\u0323", Pattern.CANON_EQ);
2651         check(p, "testa\u0308\u0323", true);
2652         check(p, "testa\u0323\u0308", true);
2653 
2654         // Test all equivalences of the sequence a\u0308\u0323\u0300
2655         p = Pattern.compile("testa\u0308\u0323\u0300", Pattern.CANON_EQ);
2656         check(p, "testa\u0308\u0323\u0300", true);
2657         check(p, "testa\u0323\u0308\u0300", true);
2658         check(p, "testa\u0308\u0300\u0323", true);
2659         check(p, "test\u00e4\u0323\u0300", true);
2660         check(p, "test\u00e4\u0300\u0323", true);
2661 
2662         /*
2663          * The following canonical equivalence tests don't work. Bug id: 4916384.
2664          *
2665         // Decomposed hangul (jamos)
2666         p = Pattern.compile("\u1100\u1161", Pattern.CANON_EQ);
2667         m = p.matcher("\u1100\u1161");
2668         if (!m.matches())
2669             failCount++;
2670 
2671         m.reset("\uac00");
2672         if (!m.matches())
2673             failCount++;


















































2674 
2675         // Composed hangul
2676         p = Pattern.compile("\uac00", Pattern.CANON_EQ);
2677         m = p.matcher("\u1100\u1161");
2678         if (!m.matches())
2679             failCount++;
2680 
2681         m.reset("\uac00");
2682         if (!m.matches())
2683             failCount++;
2684 
2685         // Decomposed supplementary outside char classes
2686         p = Pattern.compile("test\ud834\uddbc\ud834\udd6f", Pattern.CANON_EQ);
2687         m = p.matcher("test\ud834\uddc0");
2688         if (!m.matches())
2689             failCount++;
2690 
2691         m.reset("test\ud834\uddbc\ud834\udd6f");
2692         if (!m.matches())
2693             failCount++;

2694 


2695         // Composed supplementary outside char classes
2696         p = Pattern.compile("test\ud834\uddc0", Pattern.CANON_EQ);
2697         m.reset("test\ud834\uddbc\ud834\udd6f");
2698         if (!m.matches())
2699             failCount++;
2700 
2701         m = p.matcher("test\ud834\uddc0");
2702         if (!m.matches())
2703             failCount++;
2704 
2705         */


2706 

















2707         report("Canonical Equivalence");
2708     }
2709 
2710     /**
2711      * A basic sanity test of Matcher.replaceAll().
2712      */
2713     private static void globalSubstitute() throws Exception {
2714         // Global substitution with a literal
2715         Pattern p = Pattern.compile("(ab)(c*)");
2716         Matcher m = p.matcher("abccczzzabcczzzabccc");
2717         if (!m.replaceAll("test").equals("testzzztestzzztest"))
2718             failCount++;
2719 
2720         m.reset("zzzabccczzzabcczzzabccczzz");
2721         if (!m.replaceAll("test").equals("zzztestzzztestzzztestzzz"))
2722             failCount++;
2723 
2724         // Global substitution with groups
2725         m.reset("zzzabccczzzabcczzzabccczzz");
2726         String result = m.replaceAll("$1");


3829         if (m.start() != m.start(0))
3830             failCount++;
3831         //assert(m.end() = m.end(0);
3832         if (m.start() != m.start(0))
3833             failCount++;
3834         //assert(m.group() = m.group(0);
3835         if (!m.group().equals(m.group(0)))
3836             failCount++;
3837         try {
3838             m.group(50);
3839             failCount++;
3840         } catch (IndexOutOfBoundsException ise) {}
3841 
3842         return failCount;
3843     }
3844 
3845     private static Pattern compileTestPattern(String patternString) {
3846         if (!patternString.startsWith("'")) {
3847             return Pattern.compile(patternString);
3848         }
3849 
3850         int break1 = patternString.lastIndexOf("'");
3851         String flagString = patternString.substring(
3852                                           break1+1, patternString.length());
3853         patternString = patternString.substring(1, break1);
3854 
3855         if (flagString.equals("i"))
3856             return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
3857 
3858         if (flagString.equals("m"))
3859             return Pattern.compile(patternString, Pattern.MULTILINE);
3860 
3861         return Pattern.compile(patternString);
3862     }
3863 
3864     /**
3865      * Reads a line from the input file. Keeps reading lines until a non
3866      * empty non comment line is read. If the line contains a \n then
3867      * these two characters are replaced by a newline char. If a \\uxxxx
3868      * sequence is read then the sequence is replaced by the unicode char.
3869      */


4075                           "${dog}",
4076                           "zzzDogzzzDogzzz");
4077 
4078         // backref in Matcher & String
4079         if (!"abcdefghij".replaceFirst("cd(?<gn>ef)gh", "${gn}").equals("abefij") ||
4080             !"abbbcbdbefgh".replaceAll("(?<gn>[a-e])b", "${gn}").equals("abcdefgh"))
4081             failCount++;
4082 
4083         // negative
4084         checkExpectedFail("(?<groupnamehasnoascii.in>abc)(def)");
4085         checkExpectedFail("(?<groupnamehasnoascii_in>abc)(def)");
4086         checkExpectedFail("(?<6groupnamestartswithdigit>abc)(def)");
4087         checkExpectedFail("(?<gname>abc)(def)\\k<gnameX>");
4088         checkExpectedFail("(?<gname>abc)(?<gname>def)\\k<gnameX>");
4089         checkExpectedIAE(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"),
4090                          "gnameX");
4091         checkExpectedNPE(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"));
4092         report("NamedGroupCapture");
4093     }
4094 
4095     // This is for bug 6969132
4096     private static void nonBmpClassComplementTest() throws Exception {
4097         Pattern p = Pattern.compile("\\P{Lu}");
4098         Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));

4099         if (m.find() && m.start() == 1)
4100             failCount++;
4101 
4102         // from a unicode category
4103         p = Pattern.compile("\\P{Lu}");
4104         m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
4105         if (m.find())
4106             failCount++;
4107         if (!m.hitEnd())
4108             failCount++;
4109 
4110         // block
4111         p = Pattern.compile("\\P{InMathematicalAlphanumericSymbols}");
4112         m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
4113         if (m.find() && m.start() == 1)
4114             failCount++;
4115 





4116         report("NonBmpClassComplement");
4117     }
4118 
4119     private static void unicodePropertiesTest() throws Exception {
4120         // different forms
4121         if (!Pattern.compile("\\p{IsLu}").matcher("A").matches() ||
4122             !Pattern.compile("\\p{Lu}").matcher("A").matches() ||
4123             !Pattern.compile("\\p{gc=Lu}").matcher("A").matches() ||
4124             !Pattern.compile("\\p{general_category=Lu}").matcher("A").matches() ||
4125             !Pattern.compile("\\p{IsLatin}").matcher("B").matches() ||
4126             !Pattern.compile("\\p{sc=Latin}").matcher("B").matches() ||
4127             !Pattern.compile("\\p{script=Latin}").matcher("B").matches() ||
4128             !Pattern.compile("\\p{InBasicLatin}").matcher("c").matches() ||
4129             !Pattern.compile("\\p{blk=BasicLatin}").matcher("c").matches() ||
4130             !Pattern.compile("\\p{block=BasicLatin}").matcher("c").matches())
4131             failCount++;
4132 
4133         Matcher common  = Pattern.compile("\\p{script=Common}").matcher("");
4134         Matcher unknown = Pattern.compile("\\p{IsUnknown}").matcher("");
4135         Matcher lastSM  = common;




  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /**
  25  * @test
  26  * @summary tests RegExp framework (use -Dseed=X to set PRNG seed)
  27  * @author Mike McCloskey
  28  * @bug 4481568 4482696 4495089 4504687 4527731 4599621 4631553 4619345
  29  * 4630911 4672616 4711773 4727935 4750573 4792284 4803197 4757029 4808962
  30  * 4872664 4803179 4892980 4900747 4945394 4938995 4979006 4994840 4997476
  31  * 5013885 5003322 4988891 5098443 5110268 6173522 4829857 5027748 6376940
  32  * 6358731 6178785 6284152 6231989 6497148 6486934 6233084 6504326 6635133
  33  * 6350801 6676425 6878475 6919132 6931676 6948903 6990617 7014645 7039066
  34  * 7067045 7014640 7189363 8007395 8013252 8013254 8012646 8023647 6559590
  35  * 8027645 8035076 8039124 8035975 8074678 6854417 8143854 8147531 7071819
  36  * 8151481 4867170 7080302 6728861 6995635 6736245 4916384
  37  * @library /lib/testlibrary
  38  * @build jdk.testlibrary.*
  39  * @run main RegExTest
  40  * @key randomness
  41  */
  42 
  43 import java.util.function.Function;
  44 import java.util.regex.*;
  45 import java.util.Random;
  46 import java.util.Scanner;
  47 import java.io.*;
  48 import java.nio.file.*;
  49 import java.util.*;
  50 import java.nio.CharBuffer;
  51 import java.util.function.Predicate;
  52 import jdk.testlibrary.RandomFactory;
  53 
  54 /**
  55  * This is a test class created to check the operation of
  56  * the Pattern and Matcher classes.


2643             failCount++;
2644 
2645         // Marks that cannot legally change order and be equivalent
2646         p = Pattern.compile("testa\u0308\u0300", Pattern.CANON_EQ);
2647         check(p, "testa\u0308\u0300", true);
2648         check(p, "testa\u0300\u0308", false);
2649 
2650         // Marks that can legally change order and be equivalent
2651         p = Pattern.compile("testa\u0308\u0323", Pattern.CANON_EQ);
2652         check(p, "testa\u0308\u0323", true);
2653         check(p, "testa\u0323\u0308", true);
2654 
2655         // Test all equivalences of the sequence a\u0308\u0323\u0300
2656         p = Pattern.compile("testa\u0308\u0323\u0300", Pattern.CANON_EQ);
2657         check(p, "testa\u0308\u0323\u0300", true);
2658         check(p, "testa\u0323\u0308\u0300", true);
2659         check(p, "testa\u0308\u0300\u0323", true);
2660         check(p, "test\u00e4\u0323\u0300", true);
2661         check(p, "test\u00e4\u0300\u0323", true);
2662 
2663         Object[][] data = new Object[][] {







2664 
2665         // JDK-4867170
2666         { "[\u1f80-\u1f82]", "ab\u1f80cd",             "f", true },
2667         { "[\u1f80-\u1f82]", "ab\u1f81cd",             "f", true },
2668         { "[\u1f80-\u1f82]", "ab\u1f82cd",             "f", true },
2669         { "[\u1f80-\u1f82]", "ab\u03b1\u0314\u0345cd", "f", true },
2670         { "[\u1f80-\u1f82]", "ab\u03b1\u0345\u0314cd", "f", true },
2671         { "[\u1f80-\u1f82]", "ab\u1f01\u0345cd",       "f", true },
2672         { "[\u1f80-\u1f82]", "ab\u1f00\u0345cd",       "f", true },
2673 
2674         { "\\p{IsGreek}",    "ab\u1f80cd",             "f", true },
2675         { "\\p{IsGreek}",    "ab\u1f81cd",             "f", true },
2676         { "\\p{IsGreek}",    "ab\u1f82cd",             "f", true },
2677         { "\\p{IsGreek}",    "ab\u03b1\u0314\u0345cd", "f", true },
2678         { "\\p{IsGreek}",    "ab\u1f01\u0345cd",       "f", true },
2679 
2680         // backtracking, force to match "\u1f80", instead of \u1f82"
2681         { "ab\\p{IsGreek}\u0300cd", "ab\u03b1\u0313\u0345\u0300cd", "m", true },
2682 
2683         { "[\\p{IsGreek}]",  "\u03b1\u0314\u0345",     "m", true },
2684         { "\\p{IsGreek}",    "\u03b1\u0314\u0345",     "m", true },
2685  
2686         { "[^\u1f80-\u1f82]","\u1f81",                 "m", false },
2687         { "[^\u1f80-\u1f82]","\u03b1\u0314\u0345",     "m", false },
2688         { "[^\u1f01\u0345]", "\u1f81",                 "f", false },
2689  
2690         { "[^\u1f81]+",      "\u1f80\u1f82",           "f", true },
2691         { "[\u1f80]",        "ab\u1f80cd",             "f", true },
2692         { "\u1f80",          "ab\u1f80cd",             "f", true },
2693         { "\u1f00\u0345\u0300",  "\u1f82", "m", true },
2694         { "\u1f80",          "-\u1f00\u0345\u0300-",   "f", true },
2695         { "\u1f82",          "\u1f00\u0345\u0300",     "m", true },
2696         { "\u1f82",          "\u1f80\u0300",           "m", true },
2697  
2698         // JDK-7080302       # compile failed
2699         { "a(\u0041\u0301\u0328)", "a\u0041\u0301\u0328", "m", true},
2700 
2701         // JDK-6728861, same cause as above one
2702         // Pattern pt = Pattern.compile("één", Pattern.CANON_EQ);
2703         { "\u00e9\u00e9n", "e\u0301e\u0301n", "m", true},
2704 
2705         // JDK-6995635
2706         // Pattern patternThatIsGonnaBug=Pattern.compile("(ë)",Pattern.CANON_EQ);
2707         { "(\u00e9)", "e\u0301", "m", true },
2708 
2709         // JDK-6736245
2710         // interesting special case, nfc(u2add+u0338) -> u2add+u0338, NOT u2adc
2711         { "\u2ADC", "\u2ADC", "m", true},          // NFC
2712         { "\u2ADC", "\u2ADD\u0338", "m", true},    // NFD 
2713 
2714         //  4916384.
2715         // Decomposed hangul (jamos) works inside clazz
2716         { "[\u1100\u1161]", "\u1100\u1161", "m", true},
2717         { "[\u1100\u1161]", "\uac00", "m", true},
2718 
2719         { "[\uac00]", "\u1100\u1161", "m", true},
2720         { "[\uac00]", "\uac00", "m", true},



2721 
2722         // Decomposed hangul (jamos)
2723         { "\u1100\u1161", "\u1100\u1161", "m", true},
2724         { "\u1100\u1161", "\uac00", "m", true},
2725 
2726         // Composed hangul
2727         { "\uac00",  "\u1100\u1161", "m", true },
2728         { "\uac00",  "\uac00", "m", true },


2729 
2730         /* Need a NFDSlice to nfd the source to solve this issue
2731            u+1d1c0 -> nfd: <u+1d1ba><u+1d165><u+1d16f>  -> nfc: <u+1d1ba><u+1d165><u+1d16f>
2732            u+1d1bc -> nfd: <u+1d1ba><u+1d165>           -> nfc: <u+1d1ba><u+1d165>
2733            <u+1d1bc><u+1d16f> -> nfd: <u+1d1ba><u+1d165><u+1d16f> -> nfc: <u+1d1ba><u+1d165><u+1d16f>
2734 
2735         // Decomposed supplementary outside char classes
2736         // { "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddc0", "m", true },
2737         // Composed supplementary outside char classes
2738         // { "test\ud834\uddc0", "test\ud834\uddbc\ud834\udd6f", "m", true },








2739         */
2740         { "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddbc\ud834\udd6f", "m", true },
2741         { "test\ud834\uddc0",             "test\ud834\uddbc\ud834\udd6f", "m", true },
2742 
2743         { "test\ud834\uddc0",             "test\ud834\uddc0",             "m", true },
2744         { "test\ud834\uddbc\ud834\udd6f", "test\ud834\uddc0",             "m", true },
2745         };
2746 
2747         int failCount = 0;
2748         for (Object[] d : data) {
2749             String pn = (String)d[0];
2750             String tt = (String)d[1];
2751             boolean isFind = "f".equals(((String)d[2]));
2752             boolean expected = (boolean)d[3];
2753             boolean ret = isFind ? Pattern.compile(pn, Pattern.CANON_EQ).matcher(tt).find()
2754                                  : Pattern.compile(pn, Pattern.CANON_EQ).matcher(tt).matches();
2755             if (ret != expected) {
2756                 failCount++;
2757                 continue;
2758             }
2759         }
2760         report("Canonical Equivalence");
2761     }
2762 
2763     /**
2764      * A basic sanity test of Matcher.replaceAll().
2765      */
2766     private static void globalSubstitute() throws Exception {
2767         // Global substitution with a literal
2768         Pattern p = Pattern.compile("(ab)(c*)");
2769         Matcher m = p.matcher("abccczzzabcczzzabccc");
2770         if (!m.replaceAll("test").equals("testzzztestzzztest"))
2771             failCount++;
2772 
2773         m.reset("zzzabccczzzabcczzzabccczzz");
2774         if (!m.replaceAll("test").equals("zzztestzzztestzzztestzzz"))
2775             failCount++;
2776 
2777         // Global substitution with groups
2778         m.reset("zzzabccczzzabcczzzabccczzz");
2779         String result = m.replaceAll("$1");


3882         if (m.start() != m.start(0))
3883             failCount++;
3884         //assert(m.end() = m.end(0);
3885         if (m.start() != m.start(0))
3886             failCount++;
3887         //assert(m.group() = m.group(0);
3888         if (!m.group().equals(m.group(0)))
3889             failCount++;
3890         try {
3891             m.group(50);
3892             failCount++;
3893         } catch (IndexOutOfBoundsException ise) {}
3894 
3895         return failCount;
3896     }
3897 
3898     private static Pattern compileTestPattern(String patternString) {
3899         if (!patternString.startsWith("'")) {
3900             return Pattern.compile(patternString);
3901         }

3902         int break1 = patternString.lastIndexOf("'");
3903         String flagString = patternString.substring(
3904                                           break1+1, patternString.length());
3905         patternString = patternString.substring(1, break1);
3906 
3907         if (flagString.equals("i"))
3908             return Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
3909 
3910         if (flagString.equals("m"))
3911             return Pattern.compile(patternString, Pattern.MULTILINE);
3912 
3913         return Pattern.compile(patternString);
3914     }
3915 
3916     /**
3917      * Reads a line from the input file. Keeps reading lines until a non
3918      * empty non comment line is read. If the line contains a \n then
3919      * these two characters are replaced by a newline char. If a \\uxxxx
3920      * sequence is read then the sequence is replaced by the unicode char.
3921      */


4127                           "${dog}",
4128                           "zzzDogzzzDogzzz");
4129 
4130         // backref in Matcher & String
4131         if (!"abcdefghij".replaceFirst("cd(?<gn>ef)gh", "${gn}").equals("abefij") ||
4132             !"abbbcbdbefgh".replaceAll("(?<gn>[a-e])b", "${gn}").equals("abcdefgh"))
4133             failCount++;
4134 
4135         // negative
4136         checkExpectedFail("(?<groupnamehasnoascii.in>abc)(def)");
4137         checkExpectedFail("(?<groupnamehasnoascii_in>abc)(def)");
4138         checkExpectedFail("(?<6groupnamestartswithdigit>abc)(def)");
4139         checkExpectedFail("(?<gname>abc)(def)\\k<gnameX>");
4140         checkExpectedFail("(?<gname>abc)(?<gname>def)\\k<gnameX>");
4141         checkExpectedIAE(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"),
4142                          "gnameX");
4143         checkExpectedNPE(Pattern.compile("(?<gname>abc)(def)").matcher("abcdef"));
4144         report("NamedGroupCapture");
4145     }
4146 
4147     // This is for bug 6919132
4148     private static void nonBmpClassComplementTest() throws Exception {
4149         Pattern p = Pattern.compile("\\P{Lu}");
4150         Matcher m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
4151 
4152         if (m.find() && m.start() == 1)
4153             failCount++;
4154 
4155         // from a unicode category
4156         p = Pattern.compile("\\P{Lu}");
4157         m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
4158         if (m.find())
4159             failCount++;
4160         if (!m.hitEnd())
4161             failCount++;
4162 
4163         // block
4164         p = Pattern.compile("\\P{InMathematicalAlphanumericSymbols}");
4165         m = p.matcher(new String(new int[] {0x1d400}, 0, 1));
4166         if (m.find() && m.start() == 1)
4167             failCount++;
4168 
4169         p = Pattern.compile("\\P{sc=GRANTHA}");
4170         m = p.matcher(new String(new int[] {0x11350}, 0, 1));
4171         if (m.find() && m.start() == 1)
4172             failCount++;
4173 
4174         report("NonBmpClassComplement");
4175     }
4176 
4177     private static void unicodePropertiesTest() throws Exception {
4178         // different forms
4179         if (!Pattern.compile("\\p{IsLu}").matcher("A").matches() ||
4180             !Pattern.compile("\\p{Lu}").matcher("A").matches() ||
4181             !Pattern.compile("\\p{gc=Lu}").matcher("A").matches() ||
4182             !Pattern.compile("\\p{general_category=Lu}").matcher("A").matches() ||
4183             !Pattern.compile("\\p{IsLatin}").matcher("B").matches() ||
4184             !Pattern.compile("\\p{sc=Latin}").matcher("B").matches() ||
4185             !Pattern.compile("\\p{script=Latin}").matcher("B").matches() ||
4186             !Pattern.compile("\\p{InBasicLatin}").matcher("c").matches() ||
4187             !Pattern.compile("\\p{blk=BasicLatin}").matcher("c").matches() ||
4188             !Pattern.compile("\\p{block=BasicLatin}").matcher("c").matches())
4189             failCount++;
4190 
4191         Matcher common  = Pattern.compile("\\p{script=Common}").matcher("");
4192         Matcher unknown = Pattern.compile("\\p{IsUnknown}").matcher("");
4193         Matcher lastSM  = common;