1 /*
   2  * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 import java.util.Objects;
  29 
  30 final class Grapheme {
  31 
  32     /**
  33      * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
  34      * the start of the char sequence is a boundary.
  35      * <p>
  36      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
  37      * for the extended grapheme cluster boundary rules. The following implementation
  38      * is based on version 12.0 of the annex.
  39      * (http://www.unicode.org/reports/tr29/tr29-35.html)
  40      *
  41      * @param src the {@code CharSequence} to be scanned
  42      * @param off offset to start looking for the next boundary in the src
  43      * @param limit limit offset in the src (exclusive)
  44      * @return the next possible boundary
  45      */
  46     static int nextBoundary(CharSequence src, int off, int limit) {
  47         Objects.checkFromToIndex(off, limit, src.length());
  48 
  49         int ch0 = Character.codePointAt(src, 0);
  50         int ret = Character.charCount(ch0);
  51         int ch1;
  52         // indicates whether gb11 or gb12 is underway
  53         boolean gb11 = EmojiData.isExtendedPictographic(ch0);
  54         int riCount = getType(ch0) == RI ? 1 : 0;
  55         while (ret < limit) {
  56             ch1 = Character.codePointAt(src, ret);
  57             int t0 = getType(ch0);
  58             int t1 = getType(ch1);
  59 
  60             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
  61                 gb11 = false;
  62             } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
  63                 // continue for gb12
  64             } else if (rules[t0][t1]) {
  65                 if (ret > off) {
  66                     break;
  67                 } else {
  68                     gb11 = EmojiData.isExtendedPictographic(ch1);
  69                     riCount = 0;
  70                 }
  71             }
  72 
  73             riCount += getType(ch1) == RI ? 1 : 0;
  74             ch0 = ch1;
  75             ret += Character.charCount(ch1);
  76         }
  77         return ret;
  78     }
  79 
  80     // types
  81     private static final int OTHER = 0;
  82     private static final int CR = 1;
  83     private static final int LF = 2;
  84     private static final int CONTROL = 3;
  85     private static final int EXTEND = 4;
  86     private static final int ZWJ = 5;
  87     private static final int RI = 6;
  88     private static final int PREPEND = 7;
  89     private static final int SPACINGMARK = 8;
  90     private static final int L = 9;
  91     private static final int V = 10;
  92     private static final int T = 11;
  93     private static final int LV = 12;
  94     private static final int LVT = 13;
  95     private static final int EXTENDED_PICTOGRAPHIC = 14;
  96 
  97     private static final int FIRST_TYPE = 0;
  98     private static final int LAST_TYPE = 14;
  99 
 100     private static boolean[][] rules;
 101     static {
 102         rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
 103         // GB 999 Any + Any  -> default
 104         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
 105             for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
 106                 rules[i][j] = true;
 107         // GB 6 L x (L | V | LV | VT)
 108         rules[L][L] = false;
 109         rules[L][V] = false;
 110         rules[L][LV] = false;
 111         rules[L][LVT] = false;
 112         // GB 7 (LV | V) x (V | T)
 113         rules[LV][V] = false;
 114         rules[LV][T] = false;
 115         rules[V][V] = false;
 116         rules[V][T] = false;
 117         // GB 8 (LVT | T) x T
 118         rules[LVT][T] = false;
 119         rules[T][T] = false;
 120         // GB 9 x (Extend|ZWJ)
 121         // GB 9a x Spacing Mark
 122         // GB 9b Prepend x
 123         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
 124             rules[i][EXTEND] = false;
 125             rules[i][ZWJ] = false;
 126             rules[i][SPACINGMARK] = false;
 127             rules[PREPEND][i] = false;
 128         }
 129         // GB 4  (Control | CR | LF) +
 130         // GB 5  + (Control | CR | LF)
 131         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
 132             for (int j = CR; j <= CONTROL; j++) {
 133                 rules[i][j] = true;
 134                 rules[j][i] = true;
 135             }
 136         // GB 3 CR x LF
 137         rules[CR][LF] = false;
 138         // GB 11 Exended_Pictographic x (Extend|ZWJ)
 139         rules[EXTENDED_PICTOGRAPHIC][EXTEND] = false;
 140         rules[EXTENDED_PICTOGRAPHIC][ZWJ] = false;
 141     }
 142 
 143     // Hangul syllables
 144     private static final int SYLLABLE_BASE = 0xAC00;
 145     private static final int LCOUNT = 19;
 146     private static final int VCOUNT = 21;
 147     private static final int TCOUNT = 28;
 148     private static final int NCOUNT = VCOUNT * TCOUNT; // 588
 149     private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
 150 
 151     // #tr29: SpacingMark exceptions: The following (which have
 152     // General_Category = Spacing_Mark and would otherwise be included)
 153     // are specifically excluded
 154     private static boolean isExcludedSpacingMark(int cp) {
 155        return  cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
 156                cp >= 0x1062 && cp <= 0x1064 ||
 157                cp >= 0x1062 && cp <= 0x106D ||
 158                cp == 0x1083 ||
 159                cp >= 0x1087 && cp <= 0x108C ||
 160                cp == 0x108F ||
 161                cp >= 0x109A && cp <= 0x109C ||
 162                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
 163                cp == 0xAA7B || cp == 0xAA7D;
 164     }
 165 
 166     @SuppressWarnings("fallthrough")
 167     private static int getType(int cp) {
 168         if (EmojiData.isExtendedPictographic(cp)) {
 169             return EXTENDED_PICTOGRAPHIC;
 170         }
 171 
 172         int type = Character.getType(cp);
 173         switch(type) {
 174         case Character.CONTROL:
 175             if (cp == 0x000D)
 176                 return CR;
 177             if (cp == 0x000A)
 178                 return LF;
 179             return CONTROL;
 180         case Character.UNASSIGNED:
 181             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
 182             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
 183             // so type it as "Other" to make the test happy
 184             if (cp == 0x0378)
 185                 return OTHER;
 186 
 187         case Character.LINE_SEPARATOR:
 188         case Character.PARAGRAPH_SEPARATOR:
 189         case Character.SURROGATE:
 190             return CONTROL;
 191         case Character.FORMAT:
 192             if (cp == 0x200C ||
 193                 cp >= 0xE0020 && cp <= 0xE007F)
 194                 return EXTEND;
 195             if (cp == 0x200D)
 196                 return ZWJ;
 197             if (cp >= 0x0600 && cp <= 0x0605 ||
 198                 cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
 199                 cp == 0x110BD || cp == 0x110CD)
 200                 return PREPEND;
 201             return CONTROL;
 202         case Character.NON_SPACING_MARK:
 203         case Character.ENCLOSING_MARK:
 204             // NOTE:
 205             // #tr29 "plus a few General_Category = Spacing_Mark needed for
 206             // canonical equivalence."
 207             // but for "extended grapheme clusters" support, there is no
 208             // need actually to diff "extend" and "spackmark" given GB9, GB9a
 209             return EXTEND;
 210         case  Character.COMBINING_SPACING_MARK:
 211             if (isExcludedSpacingMark(cp))
 212                 return OTHER;
 213             // NOTE:
 214             // 0x11720 and 0x11721 are mentioned in #tr29 as
 215             // OTHER_LETTER but it appears their category has been updated to
 216             // COMBING_SPACING_MARK already (verified in ver.8)
 217             return SPACINGMARK;
 218         case Character.OTHER_SYMBOL:
 219             if (cp >= 0x1F1E6 && cp <= 0x1F1FF)
 220                 return RI;
 221             return OTHER;
 222         case Character.MODIFIER_LETTER:
 223         case Character.MODIFIER_SYMBOL:
 224             // WARNING:
 225             // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
 226             if (cp == 0xFF9E || cp == 0xFF9F ||
 227                 cp >= 0x1F3FB && cp <= 0x1F3FF)
 228                 return EXTEND;
 229             return OTHER;
 230         case Character.OTHER_LETTER:
 231             if (cp == 0x0E33 || cp == 0x0EB3)
 232                 return SPACINGMARK;
 233             // hangul jamo
 234             if (cp >= 0x1100 && cp <= 0x11FF) {
 235                 if (cp <= 0x115F)
 236                     return L;
 237                 if (cp <= 0x11A7)
 238                     return V;
 239                 return T;
 240             }
 241             // hangul syllables
 242             int sindex = cp - SYLLABLE_BASE;
 243             if (sindex >= 0 && sindex < SCOUNT) {
 244 
 245                 if (sindex % TCOUNT == 0)
 246                     return LV;
 247                 return LVT;
 248             }
 249             //  hangul jamo_extended A
 250             if (cp >= 0xA960 && cp <= 0xA97C)
 251                 return L;
 252             //  hangul jamo_extended B
 253             if (cp >= 0xD7B0 && cp <= 0xD7C6)
 254                 return V;
 255             if (cp >= 0xD7CB && cp <= 0xD7FB)
 256                 return T;
 257 
 258             // Prepend
 259             switch (cp) {
 260                 case 0x0D4E:
 261                 case 0x111C2:
 262                 case 0x111C3:
 263                 case 0x11A3A:
 264                 case 0x11A84:
 265                 case 0x11A85:
 266                 case 0x11A86:
 267                 case 0x11A87:
 268                 case 0x11A88:
 269                 case 0x11A89:
 270                 case 0x11D46:
 271                     return PREPEND;
 272             }
 273         }
 274         return OTHER;
 275     }
 276 }