1 /*
   2  * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 import java.util.Objects;
  29 
  30 final class Grapheme {
  31 
  32     /**
  33      * Determines if there is an extended  grapheme cluster boundary between two
  34      * continuing characters {@code cp1} and {@code cp2}.
  35      * <p>
  36      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
  37      * for the extended grapheme cluster boundary rules
  38      * <p>
  39      * Note: this method does not take care of stateful breaking.
  40      */
  41     static boolean isBoundary(int cp1, int cp2) {
  42         return rules[getType(cp1)][getType(cp2)];
  43     }
  44 
  45     /**
  46      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
  47      * the start of the char sequence is a boundary.
  48      * <p>
  49      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
  50      * for the extended grapheme cluster boundary rules. The following implementation
  51      * is based on version 12.0 of the annex.
  52      * (http://www.unicode.org/reports/tr29/tr29-35.html)
  53      *
  54      * @param src the {@code CharSequence} to be scanned
  55      * @param off offset to start looking for the next boundary in the src
  56      * @param limit limit offset in the src (exclusive)
  57      * @return the next possible boundary
  58      */
  59     static int nextBoundary(CharSequence src, int off, int limit) {
  60         Objects.checkFromToIndex(off, limit, src.length());
  61 
  62         int ch0 = Character.codePointAt(src, 0);
  63         int ret = Character.charCount(ch0);
  64         int ch1;
  65         // indicates whether gb11 or gb12 is underway
  66         int t0 = getGraphemeType(ch0);
  67         int riCount = t0 == RI ? 1 : 0;
  68         boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
  69         while (ret < limit) {
  70             ch1 = Character.codePointAt(src, ret);
  71             int t1 = getGraphemeType(ch1);
  72 
  73             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
  74                 gb11 = false;
  75             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
  76                 // continue for gb12
  77             } else if (rules[t0][t1]) {
  78                 if (ret > off) {
  79                     break;
  80                 } else {
  81                     gb11 = t1 == EXTENDED_PICTOGRAPHIC;
  82                     riCount = 0;
  83                 }
  84             }
  85 
  86             riCount += (t1 == RI) ? 1 : 0;
  87             t0 = t1;
  88 
  89             ret += Character.charCount(ch1);
  90         }
  91         return ret;
  92     }
  93 
  94     // types
  95     private static final int OTHER = 0;
  96     private static final int CR = 1;
  97     private static final int LF = 2;
  98     private static final int CONTROL = 3;
  99     private static final int EXTEND = 4;
 100     private static final int ZWJ = 5;
 101     private static final int RI = 6;
 102     private static final int PREPEND = 7;
 103     private static final int SPACINGMARK = 8;
 104     private static final int L = 9;
 105     private static final int V = 10;
 106     private static final int T = 11;
 107     private static final int LV = 12;
 108     private static final int LVT = 13;
 109     private static final int EXTENDED_PICTOGRAPHIC = 14;
 110 
 111     private static final int FIRST_TYPE = 0;
 112     private static final int LAST_TYPE = 14;
 113 
 114     private static boolean[][] rules;
 115     static {
 116         rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
 117         // GB 999 Any + Any  -> default
 118         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
 119             for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
 120                 rules[i][j] = true;
 121         // GB 6 L x (L | V | LV | VT)
 122         rules[L][L] = false;
 123         rules[L][V] = false;
 124         rules[L][LV] = false;
 125         rules[L][LVT] = false;
 126         // GB 7 (LV | V) x (V | T)
 127         rules[LV][V] = false;
 128         rules[LV][T] = false;
 129         rules[V][V] = false;
 130         rules[V][T] = false;
 131         // GB 8 (LVT | T) x T
 132         rules[LVT][T] = false;
 133         rules[T][T] = false;
 134         // GB 9 x (Extend|ZWJ)
 135         // GB 9a x Spacing Mark
 136         // GB 9b Prepend x
 137         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
 138             rules[i][EXTEND] = false;
 139             rules[i][ZWJ] = false;
 140             rules[i][SPACINGMARK] = false;
 141             rules[PREPEND][i] = false;
 142         }
 143         // GB 4  (Control | CR | LF) +
 144         // GB 5  + (Control | CR | LF)
 145         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
 146             for (int j = CR; j <= CONTROL; j++) {
 147                 rules[i][j] = true;
 148                 rules[j][i] = true;
 149             }
 150         // GB 3 CR x LF
 151         rules[CR][LF] = false;
 152         // GB 11 Exended_Pictographic x (Extend|ZWJ)
 153         rules[EXTENDED_PICTOGRAPHIC][EXTEND] = false;
 154         rules[EXTENDED_PICTOGRAPHIC][ZWJ] = false;
 155     }
 156 
 157     // Hangul syllables
 158     private static final int SYLLABLE_BASE = 0xAC00;
 159     private static final int LCOUNT = 19;
 160     private static final int VCOUNT = 21;
 161     private static final int TCOUNT = 28;
 162     private static final int NCOUNT = VCOUNT * TCOUNT; // 588
 163     private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
 164 
 165     // #tr29: SpacingMark exceptions: The following (which have
 166     // General_Category = Spacing_Mark and would otherwise be included)
 167     // are specifically excluded
 168     private static boolean isExcludedSpacingMark(int cp) {
 169        return  cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
 170                cp >= 0x1062 && cp <= 0x1064 ||
 171                cp >= 0x1062 && cp <= 0x106D ||
 172                cp == 0x1083 ||
 173                cp >= 0x1087 && cp <= 0x108C ||
 174                cp == 0x108F ||
 175                cp >= 0x109A && cp <= 0x109C ||
 176                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
 177                cp == 0xAA7B || cp == 0xAA7D;
 178     }
 179 
 180     private static int getGraphemeType(int cp) {
 181         if (cp < 0x007F) { // ASCII
 182             if (cp < 32) { // Control characters
 183                 if (cp == 0x000D)
 184                     return CR;
 185                 if (cp == 0x000A)
 186                     return LF;
 187                 return CONTROL;
 188             }
 189             return OTHER;
 190         }
 191         return getType(cp);
 192     }
 193 
 194     @SuppressWarnings("fallthrough")
 195     private static int getType(int cp) {
 196         if (EmojiData.isExtendedPictographic(cp)) {
 197             return EXTENDED_PICTOGRAPHIC;
 198         }
 199 
 200         int type = Character.getType(cp);
 201         switch(type) {
 202         case Character.UNASSIGNED:
 203             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
 204             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
 205             // so type it as "Other" to make the test happy
 206             if (cp == 0x0378)
 207                 return OTHER;
 208 
 209         case Character.CONTROL:
 210         case Character.LINE_SEPARATOR:
 211         case Character.PARAGRAPH_SEPARATOR:
 212         case Character.SURROGATE:
 213             return CONTROL;
 214         case Character.FORMAT:
 215             if (cp == 0x200C ||
 216                 cp >= 0xE0020 && cp <= 0xE007F)
 217                 return EXTEND;
 218             if (cp == 0x200D)
 219                 return ZWJ;
 220             if (cp >= 0x0600 && cp <= 0x0605 ||
 221                 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
 222                 cp == 0x110BD || cp == 0x110CD)
 223                 return PREPEND;
 224             return CONTROL;
 225         case Character.NON_SPACING_MARK:
 226         case Character.ENCLOSING_MARK:
 227             // NOTE:
 228             // #tr29 "plus a few General_Category = Spacing_Mark needed for
 229             // canonical equivalence."
 230             // but for "extended grapheme clusters" support, there is no
 231             // need actually to diff "extend" and "spackmark" given GB9, GB9a
 232             return EXTEND;
 233         case  Character.COMBINING_SPACING_MARK:
 234             if (isExcludedSpacingMark(cp))
 235                 return OTHER;
 236             // NOTE:
 237             // 0x11720 and 0x11721 are mentioned in #tr29 as
 238             // OTHER_LETTER but it appears their category has been updated to
 239             // COMBING_SPACING_MARK already (verified in ver.8)
 240             return SPACINGMARK;
 241         case Character.OTHER_SYMBOL:
 242             if (cp >= 0x1F1E6 && cp <= 0x1F1FF)
 243                 return RI;
 244             return OTHER;
 245         case Character.MODIFIER_LETTER:
 246         case Character.MODIFIER_SYMBOL:
 247             // WARNING:
 248             // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
 249             if (cp == 0xFF9E || cp == 0xFF9F ||
 250                 cp >= 0x1F3FB && cp <= 0x1F3FF)
 251                 return EXTEND;
 252             return OTHER;
 253         case Character.OTHER_LETTER:
 254             if (cp == 0x0E33 || cp == 0x0EB3)
 255                 return SPACINGMARK;
 256             // hangul jamo
 257             if (cp >= 0x1100 && cp <= 0x11FF) {
 258                 if (cp <= 0x115F)
 259                     return L;
 260                 if (cp <= 0x11A7)
 261                     return V;
 262                 return T;
 263             }
 264             // hangul syllables
 265             int sindex = cp - SYLLABLE_BASE;
 266             if (sindex >= 0 && sindex < SCOUNT) {
 267 
 268                 if (sindex % TCOUNT == 0)
 269                     return LV;
 270                 return LVT;
 271             }
 272             //  hangul jamo_extended A
 273             if (cp >= 0xA960 && cp <= 0xA97C)
 274                 return L;
 275             //  hangul jamo_extended B
 276             if (cp >= 0xD7B0 && cp <= 0xD7C6)
 277                 return V;
 278             if (cp >= 0xD7CB && cp <= 0xD7FB)
 279                 return T;
 280 
 281             // Prepend
 282             switch (cp) {
 283                 case 0x0D4E:
 284                 case 0x111C2:
 285                 case 0x111C3:
 286                 case 0x11A3A:
 287                 case 0x11A84:
 288                 case 0x11A85:
 289                 case 0x11A86:
 290                 case 0x11A87:
 291                 case 0x11A88:
 292                 case 0x11A89:
 293                 case 0x11D46:
 294                     return PREPEND;
 295             }
 296         }
 297         return OTHER;
 298     }
 299 }