1 /*
   2  * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.util.regex;
  27 
  28 final class Grapheme {
  29 
  30     /**
  31      * Determines if there is an extended  grapheme cluster boundary between two
  32      * continuing characters {@code cp1} and {@code cp2}.
  33      * <p>
  34      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
  35      * for the extended grapheme cluster boundary rules
  36      */
  37     static boolean isBoundary(int cp1, int cp2) {
  38         return rules[getType(cp1)][getType(cp2)];
  39     }
  40 
  41     // types
  42     private static final int OTHER = 0;
  43     private static final int CR = 1;
  44     private static final int LF = 2;
  45     private static final int CONTROL = 3;
  46     private static final int EXTEND = 4;
  47     private static final int RI = 5;
  48     private static final int PREPEND = 6;
  49     private static final int SPACINGMARK = 7;
  50     private static final int L = 8;
  51     private static final int V = 9;
  52     private static final int T = 10;
  53     private static final int LV = 11;
  54     private static final int LVT = 12;
  55 
  56     private static final int FIRST_TYPE = 0;
  57     private static final int LAST_TYPE = 12;
  58 
  59     private static boolean[][] rules;
  60     static {
  61         rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
  62         // default, any + any
  63         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
  64             for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
  65                 rules[i][j] = true;
  66         // GB 6 L x (L | V | LV | VT)
  67         rules[L][L] = false;
  68         rules[L][V] = false;
  69         rules[L][LV] = false;
  70         rules[L][LVT] = false;
  71         // GB 7 (LV | V) x (V | T)
  72         rules[LV][V] = false;
  73         rules[LV][T] = false;
  74         rules[V][V] = false;
  75         rules[V][T] = false;
  76         // GB 8 (LVT | T) x T
  77         rules[LVT][T] = false;
  78         rules[T][T] = false;
  79         // GB 8a RI x RI
  80         rules[RI][RI] = false;
  81         // GB 9 x Extend
  82         // GB 9a x Spacing Mark
  83         // GB 9b Prepend x
  84         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
  85             rules[i][EXTEND] = false;
  86             rules[i][SPACINGMARK] = false;
  87             rules[PREPEND][i] = false;
  88         }
  89         // GB 4  (Control | CR | LF) +
  90         // GB 5  + (Control | CR | LF)
  91         for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
  92             for (int j = CR; j <= CONTROL; j++) {
  93                 rules[i][j] = true;
  94                 rules[j][i] = true;
  95             }
  96         // GB 3 CR x LF
  97         rules[CR][LF] = false;
  98         // GB 10 Any + Any  -> default
  99     }
 100 
 101     // Hangul syllables
 102     private static final int SYLLABLE_BASE = 0xAC00;
 103     private static final int LCOUNT = 19;
 104     private static final int VCOUNT = 21;
 105     private static final int TCOUNT = 28;
 106     private static final int NCOUNT = VCOUNT * TCOUNT; // 588
 107     private static final int SCOUNT = LCOUNT * NCOUNT; // 11172
 108 
 109     // #tr29: SpacingMark exceptions: The following (which have
 110     // General_Category = Spacing_Mark and would otherwise be included)
 111     // are specifically excluded
 112     private static boolean isExcludedSpacingMark(int cp) {
 113        return  cp == 0x102B || cp == 0x102C || cp == 0x1038 ||
 114                cp >= 0x1062 && cp <= 0x1064 ||
 115                cp >= 0x1062 && cp <= 0x106D ||
 116                cp == 0x1083 ||
 117                cp >= 0x1087 && cp <= 0x108C ||
 118                cp == 0x108F ||
 119                cp >= 0x109A && cp <= 0x109C ||
 120                cp == 0x1A61 || cp == 0x1A63 || cp == 0x1A64 ||
 121                cp == 0xAA7B || cp == 0xAA7D;
 122     }
 123 
 124     @SuppressWarnings("fallthrough")
 125     private static int getType(int cp) {
 126         int type = Character.getType(cp);
 127         switch(type) {
 128         case Character.CONTROL:
 129             if (cp == 0x000D)
 130                 return CR;
 131             if (cp == 0x000A)
 132                 return LF;
 133             return CONTROL;
 134          case Character.UNASSIGNED:
 135             // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
 136             // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
 137             // so type it as "Other" to make the test happy
 138              if (cp == 0x0378)
 139                  return OTHER;
 140 
 141         case Character.LINE_SEPARATOR:
 142         case Character.PARAGRAPH_SEPARATOR:
 143         case Character.SURROGATE:
 144             return CONTROL;
 145         case Character.FORMAT:
 146             if (cp == 0x200C || cp == 0x200D)
 147                 return EXTEND;
 148             return CONTROL;
 149         case Character.NON_SPACING_MARK:
 150         case Character.ENCLOSING_MARK:
 151              // NOTE:
 152              // #tr29 "plus a few General_Category = Spacing_Mark needed for
 153              // canonical equivalence."
 154              // but for "extended grapheme clusters" support, there is no
 155              // need actually to diff "extend" and "spackmark" given GB9, GB9a
 156              return EXTEND;
 157         case  Character.COMBINING_SPACING_MARK:
 158             if (isExcludedSpacingMark(cp))
 159                 return OTHER;
 160             // NOTE:
 161             // 0x11720 and 0x11721 are mentioned in #tr29 as
 162             // OTHER_LETTER but it appears their category has been updated to
 163             // COMBING_SPACING_MARK already (verified in ver.8)
 164             return SPACINGMARK;
 165         case Character.OTHER_SYMBOL:
 166             if (cp >= 0x1F1E6 && cp <= 0x1F1FF)
 167                 return RI;
 168             return OTHER;
 169         case Character.MODIFIER_LETTER:
 170             // WARNING:
 171             // not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
 172             if (cp == 0xFF9E || cp == 0xFF9F)
 173                 return EXTEND;
 174             return OTHER;
 175         case Character.OTHER_LETTER:
 176             if (cp == 0x0E33 || cp == 0x0EB3)
 177                 return SPACINGMARK;
 178             // hangul jamo
 179             if (cp >= 0x1100 && cp <= 0x11FF) {
 180                 if (cp <= 0x115F)
 181                     return L;
 182                 if (cp <= 0x11A7)
 183                     return V;
 184                 return T;
 185             }
 186             // hangul syllables
 187             int sindex = cp - SYLLABLE_BASE;
 188             if (sindex >= 0 && sindex < SCOUNT) {
 189 
 190                 if (sindex % TCOUNT == 0)
 191                     return LV;
 192                 return LVT;
 193             }
 194             //  hangul jamo_extended A
 195             if (cp >= 0xA960 && cp <= 0xA97C)
 196                 return L;
 197             //  hangul jamo_extended B
 198             if (cp >= 0xD7B0 && cp <= 0xD7C6)
 199                 return V;
 200             if (cp >= 0xD7CB && cp <= 0xD7FB)
 201                 return T;
 202         }
 203         return OTHER;
 204     }
 205 }