1 /*
   2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   3  *
   4  * This code is free software; you can redistribute it and/or modify it
   5  * under the terms of the GNU General Public License version 2 only, as
   6  * published by the Free Software Foundation.  Oracle designates this
   7  * particular file as subject to the "Classpath" exception as provided
   8  * by Oracle in the LICENSE file that accompanied this code.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 /*
  27  *******************************************************************************
  28  *
  29  *   Copyright (C) 1999-2003, International Business Machines
  30  *   Corporation and others.  All Rights Reserved.
  31  *
  32  *******************************************************************************
  33  */
  34 
  35 package sun.font;
  36 
  37 /**
  38  * <code>ScriptRun</code> is used to find runs of characters in
  39  * the same script, as defined in the <code>Script</code> class.
  40  * It implements a simple iterator over an array of characters.
  41  * The iterator will assign <code>COMMON</code> and <code>INHERITED</code>
  42  * characters to the same script as the preceding characters. If the
  43  * COMMON and INHERITED characters are first, they will be assigned to
  44  * the same script as the following characters.
  45  *
  46  * The iterator will try to match paired punctuation. If it sees an
  47  * opening punctuation character, it will remember the script that
  48  * was assigned to that character, and assign the same script to the
  49  * matching closing punctuation.
  50  *
  51  * No attempt is made to combine related scripts into a single run. In
  52  * particular, Hiragana, Katakana, and Han characters will appear in seperate
  53  * runs.
  54 
  55  * Here is an example of how to iterate over script runs:
  56  * <pre>
  57  * void printScriptRuns(char[] text)
  58  * {
  59  *     ScriptRun scriptRun = new ScriptRun(text, 0, text.length);
  60  *
  61  *     while (scriptRun.next()) {
  62  *         int start  = scriptRun.getScriptStart();
  63  *         int limit  = scriptRun.getScriptLimit();
  64  *         int script = scriptRun.getScriptCode();
  65  *
  66  *         System.out.println("Script \"" + Script.getName(script) + "\" from " +
  67  *                            start + " to " + limit + ".");
  68  *     }
  69  *  }
  70  * </pre>
  71  *
  72  */
  73 public final class ScriptRun
  74 {
  75     private char[] text;   // fixed once set by constructor
  76     private int textStart;
  77     private int textLimit;
  78 
  79     private int scriptStart;     // change during iteration
  80     private int scriptLimit;
  81     private int scriptCode;
  82 
  83     private int stack[];         // stack used to handle paired punctuation if encountered
  84     private int parenSP;
  85 
  86     public ScriptRun() {
  87         // must call init later or we die.
  88     }
  89 
  90     /**
  91      * Construct a <code>ScriptRun</code> object which iterates over a subrange
  92      * of the given characetrs.
  93      *
  94      * @param chars the array of characters over which to iterate.
  95      * @param start the index of the first character over which to iterate
  96      * @param count the number of characters over which to iterate
  97      */
  98     public ScriptRun(char[] chars, int start, int count)
  99     {
 100         init(chars, start, count);
 101     }
 102 
 103     public void init(char[] chars, int start, int count)
 104     {
 105         if (chars == null || start < 0 || count < 0 || count > chars.length - start) {
 106             throw new IllegalArgumentException();
 107         }
 108 
 109         text = chars;
 110         textStart = start;
 111         textLimit = start + count;
 112 
 113         scriptStart = textStart;
 114         scriptLimit = textStart;
 115         scriptCode = Script.INVALID_CODE;
 116         parenSP = 0;
 117     }
 118 
 119     /**
 120      * Get the starting index of the current script run.
 121      *
 122      * @return the index of the first character in the current script run.
 123      */
 124     public int getScriptStart() {
 125         return scriptStart;
 126     }
 127 
 128     /**
 129      * Get the index of the first character after the current script run.
 130      *
 131      * @return the index of the first character after the current script run.
 132      */
 133     public int getScriptLimit() {
 134         return scriptLimit;
 135     }
 136 
 137     /**
 138      * Get the script code for the script of the current script run.
 139      *
 140      * @return the script code for the script of the current script run.
 141      * @see #Script
 142      */
 143     public int getScriptCode() {
 144         return scriptCode;
 145     }
 146 
 147     /**
 148      * Find the next script run. Returns <code>false</code> if there
 149      * isn't another run, returns <code>true</code> if there is.
 150      *
 151      * @return <code>false</code> if there isn't another run, <code>true</code> if there is.
 152      */
 153     public boolean next() {
 154         int startSP  = parenSP;  // used to find the first new open character
 155 
 156         // if we've fallen off the end of the text, we're done
 157         if (scriptLimit >= textLimit) {
 158             return false;
 159         }
 160 
 161         scriptCode  = Script.COMMON;
 162         scriptStart = scriptLimit;
 163 
 164         int ch;
 165 
 166         while ((ch = nextCodePoint()) != DONE) {
 167             int sc = ScriptRunData.getScript(ch);
 168             int pairIndex = sc == Script.COMMON ? getPairIndex(ch) : -1;
 169 
 170             // Paired character handling:
 171             //
 172             // if it's an open character, push it onto the stack.
 173             // if it's a close character, find the matching open on the
 174             // stack, and use that script code. Any non-matching open
 175             // characters above it on the stack will be popped.
 176             if (pairIndex >= 0) {
 177                 if ((pairIndex & 1) == 0) {
 178                     if (stack == null) {
 179                         stack = new int[32];
 180                     } else if (parenSP == stack.length) {
 181                         int[] newstack = new int[stack.length + 32];
 182                         System.arraycopy(stack, 0, newstack, 0, stack.length);
 183                         stack = newstack;
 184                     }
 185 
 186                     stack[parenSP++] = pairIndex;
 187                     stack[parenSP++] = scriptCode;
 188                 } else if (parenSP > 0) {
 189                     int pi = pairIndex & ~1;
 190 
 191                     while ((parenSP -= 2) >= 0 && stack[parenSP] != pi);
 192 
 193                     if (parenSP >= 0) {
 194                         sc = stack[parenSP+1];
 195                     } else {
 196                       parenSP = 0;
 197                     }
 198                     if (parenSP < startSP) {
 199                         startSP = parenSP;
 200                     }
 201                }
 202             }
 203 
 204             if (sameScript(scriptCode, sc)) {
 205                 if (scriptCode <= Script.INHERITED && sc > Script.INHERITED) {
 206                     scriptCode = sc;
 207 
 208                     // now that we have a final script code, fix any open
 209                     // characters we pushed before we knew the script code.
 210                     while (startSP < parenSP) {
 211                         stack[startSP+1] = scriptCode;
 212                         startSP += 2;
 213                     }
 214                 }
 215 
 216                 // if this character is a close paired character,
 217                 // pop it from the stack
 218                 if (pairIndex > 0 && (pairIndex & 1) != 0 && parenSP > 0) {
 219                     parenSP -= 2;
 220                 }
 221             } else {
 222                 // We've just seen the first character of
 223                 // the next run. Back over it so we'll see
 224                 // it again the next time.
 225                 pushback(ch);
 226 
 227                 // we're outta here
 228                 break;
 229             }
 230         }
 231 
 232         return true;
 233     }
 234 
 235     static final int SURROGATE_START = 0x10000;
 236     static final int LEAD_START = 0xd800;
 237     static final int LEAD_LIMIT = 0xdc00;
 238     static final int TAIL_START = 0xdc00;
 239     static final int TAIL_LIMIT = 0xe000;
 240     static final int LEAD_SURROGATE_SHIFT = 10;
 241     static final int SURROGATE_OFFSET = SURROGATE_START - (LEAD_START << LEAD_SURROGATE_SHIFT) - TAIL_START;
 242 
 243     static final int DONE = -1;
 244 
 245     private int nextCodePoint() {
 246         if (scriptLimit >= textLimit) {
 247             return DONE;
 248         }
 249         int ch = text[scriptLimit++];
 250         if (ch >= LEAD_START && ch < LEAD_LIMIT && scriptLimit < textLimit) {
 251             int nch = text[scriptLimit];
 252             if (nch >= TAIL_START && nch < TAIL_LIMIT) {
 253                 ++scriptLimit;
 254                 ch = (ch << LEAD_SURROGATE_SHIFT) + nch + SURROGATE_OFFSET;
 255             }
 256         }
 257         return ch;
 258     }
 259 
 260     private void pushback(int ch) {
 261         if (ch >= 0) {
 262             if (ch >= 0x10000) {
 263                 scriptLimit -= 2;
 264             } else {
 265                 scriptLimit -= 1;
 266             }
 267         }
 268     }
 269 
 270     /**
 271      * Compare two script codes to see if they are in the same script. If one script is
 272      * a strong script, and the other is INHERITED or COMMON, it will compare equal.
 273      *
 274      * @param scriptOne one of the script codes.
 275      * @param scriptTwo the other script code.
 276      * @return <code>true</code> if the two scripts are the same.
 277      * @see com.ibm.icu.lang.Script
 278      */
 279     private static boolean sameScript(int scriptOne, int scriptTwo) {
 280         return scriptOne == scriptTwo || scriptOne <= Script.INHERITED || scriptTwo <= Script.INHERITED;
 281     }
 282 
 283     /**
 284      * Find the highest bit that's set in a word. Uses a binary search through
 285      * the bits.
 286      *
 287      * @param n the word in which to find the highest bit that's set.
 288      * @return the bit number (counting from the low order bit) of the highest bit.
 289      */
 290     private static byte highBit(int n)
 291     {
 292         if (n <= 0) {
 293             return -32;
 294         }
 295 
 296         byte bit = 0;
 297 
 298         if (n >= 1 << 16) {
 299             n >>= 16;
 300             bit += 16;
 301         }
 302 
 303         if (n >= 1 << 8) {
 304             n >>= 8;
 305             bit += 8;
 306         }
 307 
 308         if (n >= 1 << 4) {
 309             n >>= 4;
 310             bit += 4;
 311         }
 312 
 313         if (n >= 1 << 2) {
 314             n >>= 2;
 315             bit += 2;
 316         }
 317 
 318         if (n >= 1 << 1) {
 319             n >>= 1;
 320             bit += 1;
 321         }
 322 
 323         return bit;
 324     }
 325 
 326     /**
 327      * Search the pairedChars array for the given character.
 328      *
 329      * @param ch the character for which to search.
 330      * @return the index of the character in the table, or -1 if it's not there.
 331      */
 332     private static int getPairIndex(int ch)
 333     {
 334         int probe = pairedCharPower;
 335         int index = 0;
 336 
 337         if (ch >= pairedChars[pairedCharExtra]) {
 338             index = pairedCharExtra;
 339         }
 340 
 341         while (probe > (1 << 0)) {
 342             probe >>= 1;
 343 
 344             if (ch >= pairedChars[index + probe]) {
 345                 index += probe;
 346             }
 347         }
 348 
 349         if (pairedChars[index] != ch) {
 350             index = -1;
 351         }
 352 
 353         return index;
 354     }
 355 
 356     // all common
 357     private static int pairedChars[] = {
 358         0x0028, 0x0029, // ascii paired punctuation  // common
 359         0x003c, 0x003e, // common
 360         0x005b, 0x005d, // common
 361         0x007b, 0x007d, // common
 362         0x00ab, 0x00bb, // guillemets // common
 363         0x2018, 0x2019, // general punctuation // common
 364         0x201c, 0x201d, // common
 365         0x2039, 0x203a, // common
 366         0x3008, 0x3009, // chinese paired punctuation // common
 367         0x300a, 0x300b,
 368         0x300c, 0x300d,
 369         0x300e, 0x300f,
 370         0x3010, 0x3011,
 371         0x3014, 0x3015,
 372         0x3016, 0x3017,
 373         0x3018, 0x3019,
 374         0x301a, 0x301b
 375     };
 376 
 377     private static final int pairedCharPower = 1 << highBit(pairedChars.length);
 378     private static final int pairedCharExtra = pairedChars.length - pairedCharPower;
 379 
 380 }