1 /*
   2  * Copyright © 2012  Google, Inc.
   3  *
   4  *  This is part of HarfBuzz, a text shaping library.
   5  *
   6  * Permission is hereby granted, without written agreement and without
   7  * license or royalty fees, to use, copy, modify, and distribute this
   8  * software and its documentation for any purpose, provided that the
   9  * above copyright notice and the following two paragraphs appear in
  10  * all copies of this software.
  11  *
  12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
  13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
  15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
  16  * DAMAGE.
  17  *
  18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
  19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
  20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
  21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
  22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
  23  *
  24  * Google Author(s): Behdad Esfahbod
  25  */
  26 
  27 #ifndef HB_OT_SHAPE_COMPLEX_INDIC_HH
  28 #define HB_OT_SHAPE_COMPLEX_INDIC_HH
  29 
  30 #include "hb.hh"
  31 
  32 #include "hb-ot-shape-complex.hh"
  33 
  34 
/* buffer var allocations
 *
 * Accessors for the two per-glyph 8-bit slots this shaper uses.
 * set_indic_properties() writes the glyph's category into the first and its
 * position into the second; is_one_of() reads the category back. */
#define indic_category() complex_var_u8_0() /* indic_category_t */
#define indic_position() complex_var_u8_1() /* indic_position_t */


/* Return type of hb_indic_get_categories().  set_indic_properties() unpacks
 * a value of this type as: category in the low bits (masked with 0x7F),
 * position in bits 8 and up. */
#define INDIC_TABLE_ELEMENT_TYPE uint16_t
  41 
/* Categories used in the OpenType spec:
  43  * https://docs.microsoft.com/en-us/typography/script-development/devanagari
  44  */
/* Note: This enum is duplicated in the -machine.rl source file.
 * Not sure how to avoid duplication. */
/* Shaping category of a glyph.  set_indic_properties() stores one of these
 * in indic_category(), and is_one_of() tests it as a bit flag built with
 * FLAG_UNSAFE().  The numeric values are spelled out because they are
 * mirrored elsewhere (see the note above); do not renumber them.
 * The per-member remarks below name the UCD syllabic categories that
 * indic_syllabic_category_t maps onto each value. */
enum indic_category_t {
  OT_X = 0, /* Other; also modifying/tone letters and prefixed consonants. */
  OT_C = 1, /* Consonant (including dead and head-letter consonants). */
  OT_V = 2, /* Vowel / independent vowel. */
  OT_N = 3, /* Nukta; tone marks map here too. */
  OT_H = 4, /* Virama (halant). */
  OT_ZWNJ = 5, /* Non-joiner. */
  OT_ZWJ = 6, /* Joiner. */
  OT_M = 7, /* Dependent vowel (matra); killers map here too. */
  OT_SM = 8, /* Bindu, visarga, syllable modifier, gemination mark. */
  /* OT_VD = 9, UNUSED; we use OT_A instead. */
  OT_A = 10, /* Cantillation mark. */
  OT_PLACEHOLDER = 11, /* Numbers and consonant placeholders. */
  OT_DOTTEDCIRCLE = 12, /* Assigned to U+25CC by set_indic_properties(). */
  OT_RS = 13, /* Register Shifter, used in Khmer OT spec. */
  OT_Coeng = 14, /* Khmer-style Virama. */
  OT_Repha = 15, /* Atomically-encoded logical or visual repha. */
  OT_Ra = 16, /* Assigned by set_indic_properties() to consonants listed in ra_chars. */
  OT_CM = 17,  /* Consonant-Medial; Unused by Indic shaper. */
  OT_Symbol = 18, /* Avagraha, etc that take marks (SM,A,VD). */
  OT_CS = 19 /* Consonant with stacker. */
};
  69 
/* Note:
 *
 * We treat Vowels and placeholders as if they were consonants.  This is safe because Vowels
 * cannot happen in a consonant syllable.  The plus side however is, we can call the
 * consonant syllable logic from the vowel syllable function and get it all right! */
/* Bit mask of the categories is_consonant() accepts; set_indic_properties()
 * also uses it to decide which glyphs start out at POS_BASE_C. */
#define CONSONANT_FLAGS (FLAG (OT_C) | FLAG (OT_CS) | FLAG (OT_Ra) | FLAG (OT_V) | FLAG (OT_PLACEHOLDER) | FLAG (OT_DOTTEDCIRCLE))
/* Bit mask of the categories is_joiner() accepts: ZWJ and ZWNJ. */
#define JOINER_FLAGS (FLAG (OT_ZWJ) | FLAG (OT_ZWNJ))
  77 
  78 
/* Visual positions in a syllable from left to right.
 *
 * set_indic_properties() stores one of these in indic_position().  The
 * values are listed in increasing left-to-right order and several are
 * aliased by indic_matra_category_t, so keep the numbering intact. */
enum indic_position_t {
  POS_START = 0,

  POS_RA_TO_BECOME_REPH = 1,
  POS_PRE_M = 2, /* What matra_position_indic() returns for left matras. */
  POS_PRE_C = 3,

  POS_BASE_C = 4, /* Initial position of everything matching CONSONANT_FLAGS. */
  POS_AFTER_MAIN = 5,

  POS_ABOVE_C = 6,

  POS_BEFORE_SUB = 7,
  POS_BELOW_C = 8,
  POS_AFTER_SUB = 9,

  POS_BEFORE_POST = 10,
  POS_POST_C = 11,
  POS_AFTER_POST = 12,

  POS_FINAL_C = 13,
  POS_SMVD = 14, /* Assigned to OT_SM, OT_A and OT_Symbol glyphs. */

  POS_END = 15 /* Also serves as "no matra position" (INDIC_MATRA_CATEGORY_NOT_APPLICABLE). */
};
 105 
/* Categories used in IndicSyllabicCategory.txt from UCD.
 *
 * Every enumerator is an alias for an indic_category_t value, so a UCD
 * category name can be fed to INDIC_COMBINE_CATEGORIES() and the low bits of
 * the result are directly the shaping category that set_indic_properties()
 * reads back.  Several UCD categories intentionally share one OT_* value. */
enum indic_syllabic_category_t {
  INDIC_SYLLABIC_CATEGORY_OTHER                         = OT_X,

  INDIC_SYLLABIC_CATEGORY_AVAGRAHA                      = OT_Symbol,
  INDIC_SYLLABIC_CATEGORY_BINDU                         = OT_SM,
  INDIC_SYLLABIC_CATEGORY_BRAHMI_JOINING_NUMBER         = OT_PLACEHOLDER, /* Don't care. */
  INDIC_SYLLABIC_CATEGORY_CANTILLATION_MARK             = OT_A,
  INDIC_SYLLABIC_CATEGORY_CONSONANT                     = OT_C,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_DEAD                = OT_C,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_FINAL               = OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_HEAD_LETTER         = OT_C,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_KILLER              = OT_M, /* U+17CD only. */
  INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL              = OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_PLACEHOLDER         = OT_PLACEHOLDER,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_PRECEDING_REPHA     = OT_Repha,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_PREFIXED            = OT_X, /* Don't care. */
  INDIC_SYLLABIC_CATEGORY_CONSONANT_SUBJOINED           = OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA    = OT_CM,
  INDIC_SYLLABIC_CATEGORY_CONSONANT_WITH_STACKER        = OT_CS,
  INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK               = OT_SM, /* https://github.com/harfbuzz/harfbuzz/issues/552 */
  INDIC_SYLLABIC_CATEGORY_INVISIBLE_STACKER             = OT_Coeng,
  INDIC_SYLLABIC_CATEGORY_JOINER                        = OT_ZWJ,
  INDIC_SYLLABIC_CATEGORY_MODIFYING_LETTER              = OT_X,
  INDIC_SYLLABIC_CATEGORY_NON_JOINER                    = OT_ZWNJ,
  INDIC_SYLLABIC_CATEGORY_NUKTA                         = OT_N,
  INDIC_SYLLABIC_CATEGORY_NUMBER                        = OT_PLACEHOLDER,
  INDIC_SYLLABIC_CATEGORY_NUMBER_JOINER                 = OT_PLACEHOLDER, /* Don't care. */
  INDIC_SYLLABIC_CATEGORY_PURE_KILLER                   = OT_M, /* Is like a vowel matra. */
  INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER              = OT_RS,
  INDIC_SYLLABIC_CATEGORY_SYLLABLE_MODIFIER             = OT_SM,
  INDIC_SYLLABIC_CATEGORY_TONE_LETTER                   = OT_X,
  INDIC_SYLLABIC_CATEGORY_TONE_MARK                     = OT_N,
  INDIC_SYLLABIC_CATEGORY_VIRAMA                        = OT_H,
  INDIC_SYLLABIC_CATEGORY_VISARGA                       = OT_SM,
  INDIC_SYLLABIC_CATEGORY_VOWEL                         = OT_V,
  INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT               = OT_M,
  INDIC_SYLLABIC_CATEGORY_VOWEL_INDEPENDENT             = OT_V
};
 145 
/* Categories used in IndicMatraCategory.txt from UCD (the file is named
 * IndicPositionalCategory.txt in newer UCD releases).
 *
 * Every enumerator is an alias for an indic_position_t value, so the matra
 * category packed into the high bits by INDIC_COMBINE_CATEGORIES() can be
 * read back directly as a position by set_indic_properties(). */
enum indic_matra_category_t {
  INDIC_MATRA_CATEGORY_NOT_APPLICABLE                   = POS_END,

  /* The four basic sides; matra_position_indic() later refines these
   * per script. */
  INDIC_MATRA_CATEGORY_LEFT                             = POS_PRE_C,
  INDIC_MATRA_CATEGORY_TOP                              = POS_ABOVE_C,
  INDIC_MATRA_CATEGORY_BOTTOM                           = POS_BELOW_C,
  INDIC_MATRA_CATEGORY_RIGHT                            = POS_POST_C,

  /* These should resolve to the position of the last part of the split sequence. */
  INDIC_MATRA_CATEGORY_BOTTOM_AND_RIGHT                 = INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_LEFT_AND_RIGHT                   = INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM                   = INDIC_MATRA_CATEGORY_BOTTOM,
  INDIC_MATRA_CATEGORY_TOP_AND_BOTTOM_AND_RIGHT         = INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_TOP_AND_LEFT                     = INDIC_MATRA_CATEGORY_TOP,
  INDIC_MATRA_CATEGORY_TOP_AND_LEFT_AND_RIGHT           = INDIC_MATRA_CATEGORY_RIGHT,
  INDIC_MATRA_CATEGORY_TOP_AND_RIGHT                    = INDIC_MATRA_CATEGORY_RIGHT,

  INDIC_MATRA_CATEGORY_OVERSTRUCK                       = POS_AFTER_MAIN,
  INDIC_MATRA_CATEGORY_VISUAL_ORDER_LEFT                = POS_PRE_M
};
 167 
 168 #define INDIC_COMBINE_CATEGORIES(S,M) \
 169   ( \
 170     ASSERT_STATIC_EXPR_ZERO (S < 255 && M < 255) + \
 171     ( S | \
 172      ( \
 173       ( \
 174        S == INDIC_SYLLABIC_CATEGORY_CONSONANT_MEDIAL || \
 175        S == INDIC_SYLLABIC_CATEGORY_GEMINATION_MARK || \
 176        S == INDIC_SYLLABIC_CATEGORY_REGISTER_SHIFTER || \
 177        S == INDIC_SYLLABIC_CATEGORY_CONSONANT_SUCCEEDING_REPHA || \
 178        S == INDIC_SYLLABIC_CATEGORY_VIRAMA || \
 179        S == INDIC_SYLLABIC_CATEGORY_VOWEL_DEPENDENT || \
 180        false \
 181        ? M : INDIC_MATRA_CATEGORY_NOT_APPLICABLE \
 182       ) << 8 \
 183      ) \
 184     ) \
 185    )
 186 
/* Look up the packed category/position value for codepoint u.
 * Defined elsewhere; set_indic_properties() unpacks the result as
 * category = value & 0x7F and position = value >> 8. */
HB_INTERNAL INDIC_TABLE_ELEMENT_TYPE
hb_indic_get_categories (hb_codepoint_t u);
 189 
 190 
 191 static inline bool
 192 is_one_of (const hb_glyph_info_t &info, unsigned int flags)
 193 {
 194   /* If it ligated, all bets are off. */
 195   if (_hb_glyph_info_ligated (&info)) return false;
 196   return !!(FLAG_UNSAFE (info.indic_category()) & flags);
 197 }
 198 
/* True when the glyph is an unligated ZWJ or ZWNJ (see JOINER_FLAGS). */
static inline bool
is_joiner (const hb_glyph_info_t &info)
{
  return is_one_of (info, JOINER_FLAGS);
}
 204 
/* True when the glyph is unligated and its category is in CONSONANT_FLAGS,
 * which also covers vowels, placeholders and the dotted circle. */
static inline bool
is_consonant (const hb_glyph_info_t &info)
{
  return is_one_of (info, CONSONANT_FLAGS);
}
 210 
/* True when the glyph is an unligated OT_H (virama). */
static inline bool
is_halant (const hb_glyph_info_t &info)
{
  return is_one_of (info, FLAG (OT_H));
}
 216 
/* True when u lies in the 128-codepoint range starting at Base: the low
 * seven bits of u are cleared and the remainder compared with Base.
 * Base must therefore itself have its low seven bits clear. */
#define IN_HALF_BLOCK(u, Base) (((u) & ~0x7Fu) == (Base))

/* Per-script range tests, one 128-codepoint range each. */
#define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900u)) /* Devanagari */
#define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980u)) /* Bengali */
#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00u)) /* Gurmukhi */
#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80u)) /* Gujarati */
#define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00u)) /* Oriya */
#define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80u)) /* Tamil */
#define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00u)) /* Telugu */
#define IS_KNDA(u) (IN_HALF_BLOCK (u, 0x0C80u)) /* Kannada */
#define IS_MLYM(u) (IN_HALF_BLOCK (u, 0x0D00u)) /* Malayalam */
#define IS_SINH(u) (IN_HALF_BLOCK (u, 0x0D80u)) /* Sinhala */
 229 
 230 
 231 #define MATRA_POS_LEFT(u)       POS_PRE_M
 232 #define MATRA_POS_RIGHT(u)      ( \
 233                                   IS_DEVA(u) ? POS_AFTER_SUB  : \
 234                                   IS_BENG(u) ? POS_AFTER_POST : \
 235                                   IS_GURU(u) ? POS_AFTER_POST : \
 236                                   IS_GUJR(u) ? POS_AFTER_POST : \
 237                                   IS_ORYA(u) ? POS_AFTER_POST : \
 238                                   IS_TAML(u) ? POS_AFTER_POST : \
 239                                   IS_TELU(u) ? (u <= 0x0C42u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
 240                                   IS_KNDA(u) ? (u < 0x0CC3u || u > 0xCD6u ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
 241                                   IS_MLYM(u) ? POS_AFTER_POST : \
 242                                   IS_SINH(u) ? POS_AFTER_SUB  : \
 243                                   /*default*/  POS_AFTER_SUB    \
 244                                 )
 245 #define MATRA_POS_TOP(u)        ( /* BENG and MLYM don't have top matras. */ \
 246                                   IS_DEVA(u) ? POS_AFTER_SUB  : \
 247                                   IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \
 248                                   IS_GUJR(u) ? POS_AFTER_SUB  : \
 249                                   IS_ORYA(u) ? POS_AFTER_MAIN : \
 250                                   IS_TAML(u) ? POS_AFTER_SUB  : \
 251                                   IS_TELU(u) ? POS_BEFORE_SUB : \
 252                                   IS_KNDA(u) ? POS_BEFORE_SUB : \
 253                                   IS_SINH(u) ? POS_AFTER_SUB  : \
 254                                   /*default*/  POS_AFTER_SUB    \
 255                                 )
 256 #define MATRA_POS_BOTTOM(u)     ( \
 257                                   IS_DEVA(u) ? POS_AFTER_SUB  : \
 258                                   IS_BENG(u) ? POS_AFTER_SUB  : \
 259                                   IS_GURU(u) ? POS_AFTER_POST : \
 260                                   IS_GUJR(u) ? POS_AFTER_POST : \
 261                                   IS_ORYA(u) ? POS_AFTER_SUB  : \
 262                                   IS_TAML(u) ? POS_AFTER_POST : \
 263                                   IS_TELU(u) ? POS_BEFORE_SUB : \
 264                                   IS_KNDA(u) ? POS_BEFORE_SUB : \
 265                                   IS_MLYM(u) ? POS_AFTER_POST : \
 266                                   IS_SINH(u) ? POS_AFTER_SUB  : \
 267                                   /*default*/  POS_AFTER_SUB    \
 268                                 )
 269 
 270 static inline indic_position_t
 271 matra_position_indic (hb_codepoint_t u, indic_position_t side)
 272 {
 273   switch ((int) side)
 274   {
 275     case POS_PRE_C:     return MATRA_POS_LEFT (u);
 276     case POS_POST_C:    return MATRA_POS_RIGHT (u);
 277     case POS_ABOVE_C:   return MATRA_POS_TOP (u);
 278     case POS_BELOW_C:   return MATRA_POS_BOTTOM (u);
 279   };
 280   return side;
 281 }
 282 
/* XXX
 * This is a hack for now.  We should move this data into the main Indic table.
 * Or completely remove it and just check in the tables.
 */
/* Codepoints that is_ra() recognizes; set_indic_properties() re-categorizes
 * a consonant found here as OT_Ra.  The trailing remarks record per-script
 * reph behavior. */
static const hb_codepoint_t ra_chars[] = {
  0x0930u, /* Devanagari */
  0x09B0u, /* Bengali */
  0x09F0u, /* Bengali */
  0x0A30u, /* Gurmukhi */       /* No Reph */
  0x0AB0u, /* Gujarati */
  0x0B30u, /* Oriya */
  0x0BB0u, /* Tamil */          /* No Reph */
  0x0C30u, /* Telugu */         /* Reph formed only with ZWJ */
  0x0CB0u, /* Kannada */
  0x0D30u, /* Malayalam */      /* No Reph, Logical Repha */

  0x0DBBu, /* Sinhala */        /* Reph formed only with ZWJ */

  0x179Au, /* Khmer */
};
 303 
 304 static inline bool
 305 is_ra (hb_codepoint_t u)
 306 {
 307   for (unsigned int i = 0; i < ARRAY_LENGTH (ra_chars); i++)
 308     if (u == ra_chars[i])
 309       return true;
 310   return false;
 311 }
 312 
 313 static inline void
 314 set_indic_properties (hb_glyph_info_t &info)
 315 {
 316   hb_codepoint_t u = info.codepoint;
 317   unsigned int type = hb_indic_get_categories (u);
 318   indic_category_t cat = (indic_category_t) (type & 0x7Fu);
 319   indic_position_t pos = (indic_position_t) (type >> 8);
 320 
 321 
 322   /*
 323    * Re-assign category
 324    */
 325 
 326   /* The following act more like the Bindus. */
 327   if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x0953u, 0x0954u)))
 328     cat = OT_SM;
 329   /* The following act like consonants. */
 330   else if (unlikely (hb_in_ranges<hb_codepoint_t> (u, 0x0A72u, 0x0A73u,
 331                                       0x1CF5u, 0x1CF6u)))
 332     cat = OT_C;
 333   /* TODO: The following should only be allowed after a Visarga.
 334    * For now, just treat them like regular tone marks. */
 335   else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x1CE2u, 0x1CE8u)))
 336     cat = OT_A;
 337   /* TODO: The following should only be allowed after some of
 338    * the nasalization marks, maybe only for U+1CE9..U+1CF1.
 339    * For now, just treat them like tone marks. */
 340   else if (unlikely (u == 0x1CEDu))
 341     cat = OT_A;
 342   /* The following take marks in standalone clusters, similar to Avagraha. */
 343   else if (unlikely (hb_in_ranges<hb_codepoint_t> (u, 0xA8F2u, 0xA8F7u,
 344                                       0x1CE9u, 0x1CECu,
 345                                       0x1CEEu, 0x1CF1u)))
 346   {
 347     cat = OT_Symbol;
 348     static_assert (((int) INDIC_SYLLABIC_CATEGORY_AVAGRAHA == OT_Symbol), "");
 349   }
 350   else if (unlikely (u == 0x0A51u))
 351   {
 352     /* https://github.com/harfbuzz/harfbuzz/issues/524 */
 353     cat = OT_M;
 354     pos = POS_BELOW_C;
 355   }
 356 
 357   /* According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
 358    * so the Indic shaper needs to know their categories. */
 359   else if (unlikely (u == 0x11301u || u == 0x11303u)) cat = OT_SM;
 360   else if (unlikely (u == 0x1133cu)) cat = OT_N;
 361 
 362   else if (unlikely (u == 0x0AFBu)) cat = OT_N; /* https://github.com/harfbuzz/harfbuzz/issues/552 */
 363 
 364   else if (unlikely (u == 0x0980u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/issues/538 */
 365   else if (unlikely (u == 0x0C80u)) cat = OT_PLACEHOLDER; /* https://github.com/harfbuzz/harfbuzz/pull/623 */
 366   else if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x2010u, 0x2011u)))
 367                                     cat = OT_PLACEHOLDER;
 368   else if (unlikely (u == 0x25CCu)) cat = OT_DOTTEDCIRCLE;
 369 
 370 
 371   /*
 372    * Re-assign position.
 373    */
 374 
 375   if ((FLAG_UNSAFE (cat) & CONSONANT_FLAGS))
 376   {
 377     pos = POS_BASE_C;
 378     if (is_ra (u))
 379       cat = OT_Ra;
 380   }
 381   else if (cat == OT_M)
 382   {
 383     pos = matra_position_indic (u, pos);
 384   }
 385   else if ((FLAG_UNSAFE (cat) & (FLAG (OT_SM) /* | FLAG (OT_VD) */ | FLAG (OT_A) | FLAG (OT_Symbol))))
 386   {
 387     pos = POS_SMVD;
 388   }
 389 
 390   if (unlikely (u == 0x0B01u)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */
 391 
 392 
 393 
 394   info.indic_category() = cat;
 395   info.indic_position() = pos;
 396 }
 397 
 398 
 399 #endif /* HB_OT_SHAPE_COMPLEX_INDIC_HH */