< prev index next >

src/java.base/share/classes/java/util/regex/Grapheme.java

Print this page
rev 55125 : 8225061: Performance regression in Regex
Reviewed-by: TBD


  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 import java.util.Objects;
  29 
  30 final class Grapheme {
  31 
  32     /**













  33      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
  34      * the start of the char sequence is a boundary.
  35      * <p>
  36      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
  37      * for the extended grapheme cluster boundary rules. The following implementation
  38      * is based on version 12.0 of the annex.
  39      * (http://www.unicode.org/reports/tr29/tr29-35.html)
  40      *
  41      * @param src the {@code CharSequence} to be scanned
  42      * @param off offset to start looking for the next boundary in the src
  43      * @param limit limit offset in the src (exclusive)
  44      * @return the next possible boundary
  45      */
  46     static int nextBoundary(CharSequence src, int off, int limit) {
  47         Objects.checkFromToIndex(off, limit, src.length());
  48 
  49         int ch0 = Character.codePointAt(src, 0);
  50         int ret = Character.charCount(ch0);
  51         int ch1;
  52         // indicates whether gb11 or gb12 is underway
  53         boolean gb11 = EmojiData.isExtendedPictographic(ch0);
  54         int riCount = getType(ch0) == RI ? 1 : 0;

  55         while (ret < limit) {
  56             ch1 = Character.codePointAt(src, ret);
  57             int t0 = getType(ch0);
  58             int t1 = getType(ch1);
  59 
  60             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
  61                 gb11 = false;
  62             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
  63                 // continue for gb12
  64             } else if (rules[t0][t1]) {
  65                 if (ret > off) {
  66                     break;
  67                 } else {
  68                     gb11 = EmojiData.isExtendedPictographic(ch1);
  69                     riCount = 0;
  70                 }
  71             }
  72 
  73             riCount += getType(ch1) == RI ? 1 : 0;
  74             ch0 = ch1;

  75             ret += Character.charCount(ch1);
  76         }
  77         return ret;
  78     }
  79 
  80     // types
  81     private static final int OTHER = 0;
  82     private static final int CR = 1;
  83     private static final int LF = 2;
  84     private static final int CONTROL = 3;
  85     private static final int EXTEND = 4;
  86     private static final int ZWJ = 5;
  87     private static final int RI = 6;
  88     private static final int PREPEND = 7;
  89     private static final int SPACINGMARK = 8;
  90     private static final int L = 9;
  91     private static final int V = 10;
  92     private static final int T = 11;
  93     private static final int LV = 12;
  94     private static final int LVT = 13;


 146     private static final int VCOUNT = 21;
 147     private static final int TCOUNT = 28;
 148     private static final int NCOUNT = VCOUNT * TCOUNT; // 588
 149     private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
 150 
 151     // #tr29: SpacingMark exceptions: The following (which have
 152     // General_Category = Spacing_Mark and would otherwise be included)
 153     // are specifically excluded
 154     private static boolean isExcludedSpacingMark(int cp) {
 155        return  cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
 156                cp >= 0x1062 && cp <= 0x1064 ||
 157                cp >= 0x1062 && cp <= 0x106D ||
 158                cp == 0x1083 ||
 159                cp >= 0x1087 && cp <= 0x108C ||
 160                cp == 0x108F ||
 161                cp >= 0x109A && cp <= 0x109C ||
 162                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
 163                cp == 0xAA7B || cp == 0xAA7D;
 164     }
 165 














 166     @SuppressWarnings("fallthrough")
 167     private static int getType(int cp) {
 168         if (EmojiData.isExtendedPictographic(cp)) {
 169             return EXTENDED_PICTOGRAPHIC;
 170         }
 171 
 172         int type = Character.getType(cp);
 173         switch(type) {
 174         case Character.CONTROL:
 175             if (cp == 0x000D)
 176                 return CR;
 177             if (cp == 0x000A)
 178                 return LF;
 179             return CONTROL;
 180         case Character.UNASSIGNED:
 181             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
 182             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
 183             // so type it as "Other" to make the test happy
 184             if (cp == 0x0378)
 185                 return OTHER;
 186 

 187         case Character.LINE_SEPARATOR:
 188         case Character.PARAGRAPH_SEPARATOR:
 189         case Character.SURROGATE:
 190             return CONTROL;
 191         case Character.FORMAT:
 192             if (cp == 0x200C ||
 193                 cp >= 0xE0020 && cp <= 0xE007F)
 194                 return EXTEND;
 195             if (cp == 0x200D)
 196                 return ZWJ;
 197             if (cp >= 0x0600 && cp <= 0x0605 ||
 198                 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
 199                 cp == 0x110BD || cp == 0x110CD)
 200                 return PREPEND;
 201             return CONTROL;
 202         case Character.NON_SPACING_MARK:
 203         case Character.ENCLOSING_MARK:
 204             // NOTE:
 205             // #tr29 "plus a few General_Category = Spacing_Mark needed for
 206             // canonical equivalence."




  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 import java.util.Objects;
  29 
  30 final class Grapheme {
  31 
  32     /**
  33      * Determines if there is an extended  grapheme cluster boundary between two
  34      * continuing characters {@code cp1} and {@code cp2}.
  35      * <p>
  36      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
  37      * for the extended grapheme cluster boundary rules
  38      * <p>
  39      * Note: this method does not take care of stateful breaking.
  40      */
  41     static boolean isBoundary(int cp1, int cp2) {
  42         return rules[getType(cp1)][getType(cp2)];
  43     }
  44 
  45     /**
  46      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
  47      * the start of the char sequence is a boundary.
  48      * <p>
  49      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
  50      * for the extended grapheme cluster boundary rules. The following implementation
  51      * is based on version 12.0 of the annex.
  52      * (http://www.unicode.org/reports/tr29/tr29-35.html)
  53      *
  54      * @param src the {@code CharSequence} to be scanned
  55      * @param off offset to start looking for the next boundary in the src
  56      * @param limit limit offset in the src (exclusive)
  57      * @return the next possible boundary
  58      */
  59     static int nextBoundary(CharSequence src, int off, int limit) {
  60         Objects.checkFromToIndex(off, limit, src.length());
  61 
  62         int ch0 = Character.codePointAt(src, 0);
  63         int ret = Character.charCount(ch0);
  64         int ch1;
  65         // indicates whether gb11 or gb12 is underway
  66         int t0 = getGraphemeType(ch0);
  67         int riCount = t0 == RI ? 1 : 0;
  68         boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
  69         while (ret < limit) {
  70             ch1 = Character.codePointAt(src, ret);
  71             int t1 = getGraphemeType(ch1);

  72 
  73             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
  74                 gb11 = false;
  75             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
  76                 // continue for gb12
  77             } else if (rules[t0][t1]) {
  78                 if (ret > off) {
  79                     break;
  80                 } else {
  81                     gb11 = t1 == EXTENDED_PICTOGRAPHIC;
  82                     riCount = 0;
  83                 }
  84             }
  85 
  86             riCount += (t1 == RI) ? 1 : 0;
  87             t0 = t1;
  88 
  89             ret += Character.charCount(ch1);
  90         }
  91         return ret;
  92     }
  93 
  94     // types
  95     private static final int OTHER = 0;
  96     private static final int CR = 1;
  97     private static final int LF = 2;
  98     private static final int CONTROL = 3;
  99     private static final int EXTEND = 4;
 100     private static final int ZWJ = 5;
 101     private static final int RI = 6;
 102     private static final int PREPEND = 7;
 103     private static final int SPACINGMARK = 8;
 104     private static final int L = 9;
 105     private static final int V = 10;
 106     private static final int T = 11;
 107     private static final int LV = 12;
 108     private static final int LVT = 13;


 160     private static final int VCOUNT = 21;
 161     private static final int TCOUNT = 28;
 162     private static final int NCOUNT = VCOUNT * TCOUNT; // 588
 163     private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
 164 
 165     // #tr29: SpacingMark exceptions: The following (which have
 166     // General_Category = Spacing_Mark and would otherwise be included)
 167     // are specifically excluded
 168     private static boolean isExcludedSpacingMark(int cp) {
 169        return  cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
 170                cp >= 0x1062 && cp <= 0x1064 ||
 171                cp >= 0x1062 && cp <= 0x106D ||
 172                cp == 0x1083 ||
 173                cp >= 0x1087 && cp <= 0x108C ||
 174                cp == 0x108F ||
 175                cp >= 0x109A && cp <= 0x109C ||
 176                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
 177                cp == 0xAA7B || cp == 0xAA7D;
 178     }
 179 
 180     private static int getGraphemeType(int cp) {
 181         if (cp < 0x007F) { // ASCII
 182             if (cp < 32) { // Control characters
 183                 if (cp == 0x000D)
 184                     return CR;
 185                 if (cp == 0x000A)
 186                     return LF;
 187                 return CONTROL;
 188             }
 189             return OTHER;
 190         }
 191         return getType(cp);
 192     }
 193 
 194     @SuppressWarnings("fallthrough")
 195     private static int getType(int cp) {
 196         if (EmojiData.isExtendedPictographic(cp)) {
 197             return EXTENDED_PICTOGRAPHIC;
 198         }
 199 
 200         int type = Character.getType(cp);
 201         switch(type) {






 202         case Character.UNASSIGNED:
 203             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
 204             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
 205             // so type it as "Other" to make the test happy
 206             if (cp == 0x0378)
 207                 return OTHER;
 208 
 209         case Character.CONTROL:
 210         case Character.LINE_SEPARATOR:
 211         case Character.PARAGRAPH_SEPARATOR:
 212         case Character.SURROGATE:
 213             return CONTROL;
 214         case Character.FORMAT:
 215             if (cp == 0x200C ||
 216                 cp >= 0xE0020 && cp <= 0xE007F)
 217                 return EXTEND;
 218             if (cp == 0x200D)
 219                 return ZWJ;
 220             if (cp >= 0x0600 && cp <= 0x0605 ||
 221                 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
 222                 cp == 0x110BD || cp == 0x110CD)
 223                 return PREPEND;
 224             return CONTROL;
 225         case Character.NON_SPACING_MARK:
 226         case Character.ENCLOSING_MARK:
 227             // NOTE:
 228             // #tr29 "plus a few General_Category = Spacing_Mark needed for
 229             // canonical equivalence."


< prev index next >