1 /* 2 * Copyright (c) 1999, 2008, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 41 package java.text; 42 43 import java.util.Vector; 44 import java.util.Stack; 45 import java.util.Hashtable; 46 import java.text.CharacterIterator; 47 import java.io.InputStream; 48 import java.io.IOException; 49 50 /** 51 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary 52 * to further subdivide ranges of text beyond what is possible using just the 53 * state-table-based algorithm. This is necessary, for example, to handle 54 * word and line breaking in Thai, which doesn't use spaces between words. The 55 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide 56 * up text as far as possible, and then contiguous ranges of letters are 57 * repeatedly compared against a list of known words (i.e., the dictionary) 58 * to divide them up into words. 59 * 60 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator, 61 * but adds one more special substitution name: <dictionary>. This substitution 62 * name is used to identify characters in words in the dictionary. The idea is that 63 * if the iterator passes over a chunk of text that includes two or more characters 64 * in a row that are included in <dictionary>, it goes back through that range and 65 * derives additional break positions (if possible) using the dictionary. 66 * 67 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary 68 * file. It follows a prescribed search path to locate the dictionary (right now, 69 * it looks for it in /com/ibm/text/resources in each directory in the classpath, 70 * and won't find it in JAR files, but this location is likely to change). The 71 * dictionary file is in a serialized binary format. We have a very primitive (and 72 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't 73 * currently making it public. Contact us for help. 74 */ 75 class DictionaryBasedBreakIterator extends RuleBasedBreakIterator { 76 77 /** 78 * a list of known words that is used to divide up contiguous ranges of letters, 79 * stored in a compressed, indexed, format that offers fast access 80 */ 81 private BreakDictionary dictionary; 82 83 /** 84 * a list of flags indicating which character categories are contained in 85 * the dictionary file (this is used to determine which ranges of characters 86 * to apply the dictionary to) 87 */ 88 private boolean[] categoryFlags; 89 90 /** 91 * a temporary hiding place for the number of dictionary characters in the 92 * last range passed over by next() 93 */ 94 private int dictionaryCharCount; 95 96 /** 97 * when a range of characters is divided up using the dictionary, the break 98 * positions that are discovered are stored here, preventing us from having 99 * to use either the dictionary or the state table again until the iterator 100 * leaves this range of text 101 */ 102 private int[] cachedBreakPositions; 103 104 /** 105 * if cachedBreakPositions is not null, this indicates which item in the 106 * cache the current iteration position refers to 107 */ 108 private int positionInCache; 109 110 /** 111 * Constructs a DictionaryBasedBreakIterator. 112 * @param description Same as the description parameter on RuleBasedBreakIterator, 113 * except for the special meaning of "<dictionary>". This parameter is just 114 * passed through to RuleBasedBreakIterator's constructor. 115 * @param dictionaryFilename The filename of the dictionary file to use 116 */ 117 public DictionaryBasedBreakIterator(String dataFile, String dictionaryFile) 118 throws IOException { 119 super(dataFile); 120 byte[] tmp = super.getAdditionalData(); 121 if (tmp != null) { 122 prepareCategoryFlags(tmp); 123 super.setAdditionalData(null); 124 } 125 dictionary = new BreakDictionary(dictionaryFile); 126 } 127 128 private void prepareCategoryFlags(byte[] data) { 129 categoryFlags = new boolean[data.length]; 130 for (int i = 0; i < data.length; i++) { 131 categoryFlags[i] = (data[i] == (byte)1) ? true : false; 132 } 133 } 134 135 public void setText(CharacterIterator newText) { 136 super.setText(newText); 137 cachedBreakPositions = null; 138 dictionaryCharCount = 0; 139 positionInCache = 0; 140 } 141 142 /** 143 * Sets the current iteration position to the beginning of the text. 144 * (i.e., the CharacterIterator's starting offset). 145 * @return The offset of the beginning of the text. 146 */ 147 public int first() { 148 cachedBreakPositions = null; 149 dictionaryCharCount = 0; 150 positionInCache = 0; 151 return super.first(); 152 } 153 154 /** 155 * Sets the current iteration position to the end of the text. 156 * (i.e., the CharacterIterator's ending offset). 157 * @return The text's past-the-end offset. 158 */ 159 public int last() { 160 cachedBreakPositions = null; 161 dictionaryCharCount = 0; 162 positionInCache = 0; 163 return super.last(); 164 } 165 166 /** 167 * Advances the iterator one step backwards. 168 * @return The position of the last boundary position before the 169 * current iteration position 170 */ 171 public int previous() { 172 CharacterIterator text = getText(); 173 174 // if we have cached break positions and we're still in the range 175 // covered by them, just move one step backward in the cache 176 if (cachedBreakPositions != null && positionInCache > 0) { 177 --positionInCache; 178 text.setIndex(cachedBreakPositions[positionInCache]); 179 return cachedBreakPositions[positionInCache]; 180 } 181 182 // otherwise, dump the cache and use the inherited previous() method to move 183 // backward. This may fill up the cache with new break positions, in which 184 // case we have to mark our position in the cache 185 else { 186 cachedBreakPositions = null; 187 int result = super.previous(); 188 if (cachedBreakPositions != null) { 189 positionInCache = cachedBreakPositions.length - 2; 190 } 191 return result; 192 } 193 } 194 195 /** 196 * Sets the current iteration position to the last boundary position 197 * before the specified position. 198 * @param offset The position to begin searching from 199 * @return The position of the last boundary before "offset" 200 */ 201 public int preceding(int offset) { 202 CharacterIterator text = getText(); 203 checkOffset(offset, text); 204 205 // if we have no cached break positions, or "offset" is outside the 206 // range covered by the cache, we can just call the inherited routine 207 // (which will eventually call other routines in this class that may 208 // refresh the cache) 209 if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] || 210 offset > cachedBreakPositions[cachedBreakPositions.length - 1]) { 211 cachedBreakPositions = null; 212 return super.preceding(offset); 213 } 214 215 // on the other hand, if "offset" is within the range covered by the cache, 216 // then all we have to do is search the cache for the last break position 217 // before "offset" 218 else { 219 positionInCache = 0; 220 while (positionInCache < cachedBreakPositions.length 221 && offset > cachedBreakPositions[positionInCache]) { 222 ++positionInCache; 223 } 224 --positionInCache; 225 text.setIndex(cachedBreakPositions[positionInCache]); 226 return text.getIndex(); 227 } 228 } 229 230 /** 231 * Sets the current iteration position to the first boundary position after 232 * the specified position. 233 * @param offset The position to begin searching forward from 234 * @return The position of the first boundary after "offset" 235 */ 236 public int following(int offset) { 237 CharacterIterator text = getText(); 238 checkOffset(offset, text); 239 240 // if we have no cached break positions, or if "offset" is outside the 241 // range covered by the cache, then dump the cache and call our 242 // inherited following() method. This will call other methods in this 243 // class that may refresh the cache. 244 if (cachedBreakPositions == null || offset < cachedBreakPositions[0] || 245 offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) { 246 cachedBreakPositions = null; 247 return super.following(offset); 248 } 249 250 // on the other hand, if "offset" is within the range covered by the 251 // cache, then just search the cache for the first break position 252 // after "offset" 253 else { 254 positionInCache = 0; 255 while (positionInCache < cachedBreakPositions.length 256 && offset >= cachedBreakPositions[positionInCache]) { 257 ++positionInCache; 258 } 259 text.setIndex(cachedBreakPositions[positionInCache]); 260 return text.getIndex(); 261 } 262 } 263 264 /** 265 * This is the implementation function for next(). 266 */ 267 protected int handleNext() { 268 CharacterIterator text = getText(); 269 270 // if there are no cached break positions, or if we've just moved 271 // off the end of the range covered by the cache, we have to dump 272 // and possibly regenerate the cache 273 if (cachedBreakPositions == null || 274 positionInCache == cachedBreakPositions.length - 1) { 275 276 // start by using the inherited handleNext() to find a tentative return 277 // value. dictionaryCharCount tells us how many dictionary characters 278 // we passed over on our way to the tentative return value 279 int startPos = text.getIndex(); 280 dictionaryCharCount = 0; 281 int result = super.handleNext(); 282 283 // if we passed over more than one dictionary character, then we use 284 // divideUpDictionaryRange() to regenerate the cached break positions 285 // for the new range 286 if (dictionaryCharCount > 1 && result - startPos > 1) { 287 divideUpDictionaryRange(startPos, result); 288 } 289 290 // otherwise, the value we got back from the inherited fuction 291 // is our return value, and we can dump the cache 292 else { 293 cachedBreakPositions = null; 294 return result; 295 } 296 } 297 298 // if the cache of break positions has been regenerated (or existed all 299 // along), then just advance to the next break position in the cache 300 // and return it 301 if (cachedBreakPositions != null) { 302 ++positionInCache; 303 text.setIndex(cachedBreakPositions[positionInCache]); 304 return cachedBreakPositions[positionInCache]; 305 } 306 return -9999; // SHOULD NEVER GET HERE! 307 } 308 309 /** 310 * Looks up a character category for a character. 311 */ 312 protected int lookupCategory(int c) { 313 // this override of lookupCategory() exists only to keep track of whether we've 314 // passed over any dictionary characters. It calls the inherited lookupCategory() 315 // to do the real work, and then checks whether its return value is one of the 316 // categories represented in the dictionary. If it is, bump the dictionary- 317 // character count. 318 int result = super.lookupCategory(c); 319 if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) { 320 ++dictionaryCharCount; 321 } 322 return result; 323 } 324 325 /** 326 * This is the function that actually implements the dictionary-based 327 * algorithm. Given the endpoints of a range of text, it uses the 328 * dictionary to determine the positions of any boundaries in this 329 * range. It stores all the boundary positions it discovers in 330 * cachedBreakPositions so that we only have to do this work once 331 * for each time we enter the range. 332 */ 333 private void divideUpDictionaryRange(int startPos, int endPos) { 334 CharacterIterator text = getText(); 335 336 // the range we're dividing may begin or end with non-dictionary characters 337 // (i.e., for line breaking, we may have leading or trailing punctuation 338 // that needs to be kept with the word). Seek from the beginning of the 339 // range to the first dictionary character 340 text.setIndex(startPos); 341 int c = getCurrent(); 342 int category = lookupCategory(c); 343 while (category == IGNORE || !categoryFlags[category]) { 344 c = getNext(); 345 category = lookupCategory(c); 346 } 347 348 // initialize. We maintain two stacks: currentBreakPositions contains 349 // the list of break positions that will be returned if we successfully 350 // finish traversing the whole range now. possibleBreakPositions lists 351 // all other possible word ends we've passed along the way. (Whenever 352 // we reach an error [a sequence of characters that can't begin any word 353 // in the dictionary], we back up, possibly delete some breaks from 354 // currentBreakPositions, move a break from possibleBreakPositions 355 // to currentBreakPositions, and start over from there. This process 356 // continues in this way until we either successfully make it all the way 357 // across the range, or exhaust all of our combinations of break 358 // positions.) 359 Stack<Integer> currentBreakPositions = new Stack<>(); 360 Stack<Integer> possibleBreakPositions = new Stack<>(); 361 Vector<Integer> wrongBreakPositions = new Vector<>(); 362 363 // the dictionary is implemented as a trie, which is treated as a state 364 // machine. -1 represents the end of a legal word. Every word in the 365 // dictionary is represented by a path from the root node to -1. A path 366 // that ends in state 0 is an illegal combination of characters. 367 int state = 0; 368 369 // these two variables are used for error handling. We keep track of the 370 // farthest we've gotten through the range being divided, and the combination 371 // of breaks that got us that far. If we use up all possible break 372 // combinations, the text contains an error or a word that's not in the 373 // dictionary. In this case, we "bless" the break positions that got us the 374 // farthest as real break positions, and then start over from scratch with 375 // the character where the error occurred. 376 int farthestEndPoint = text.getIndex(); 377 Stack<Integer> bestBreakPositions = null; 378 379 // initialize (we always exit the loop with a break statement) 380 c = getCurrent(); 381 while (true) { 382 383 // if we can transition to state "-1" from our current state, we're 384 // on the last character of a legal word. Push that position onto 385 // the possible-break-positions stack 386 if (dictionary.getNextState(state, 0) == -1) { 387 possibleBreakPositions.push(Integer.valueOf(text.getIndex())); 388 } 389 390 // look up the new state to transition to in the dictionary 391 state = dictionary.getNextStateFromCharacter(state, c); 392 393 // if the character we're sitting on causes us to transition to 394 // the "end of word" state, then it was a non-dictionary character 395 // and we've successfully traversed the whole range. Drop out 396 // of the loop. 397 if (state == -1) { 398 currentBreakPositions.push(Integer.valueOf(text.getIndex())); 399 break; 400 } 401 402 // if the character we're sitting on causes us to transition to 403 // the error state, or if we've gone off the end of the range 404 // without transitioning to the "end of word" state, we've hit 405 // an error... 406 else if (state == 0 || text.getIndex() >= endPos) { 407 408 // if this is the farthest we've gotten, take note of it in 409 // case there's an error in the text 410 if (text.getIndex() > farthestEndPoint) { 411 farthestEndPoint = text.getIndex(); 412 413 @SuppressWarnings("unchecked") 414 Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone(); 415 416 bestBreakPositions = currentBreakPositionsCopy; 417 } 418 419 // wrongBreakPositions is a list of all break positions 420 // we've tried starting that didn't allow us to traverse 421 // all the way through the text. Every time we pop a 422 //break position off of currentBreakPositions, we put it 423 // into wrongBreakPositions to avoid trying it again later. 424 // If we make it to this spot, we're either going to back 425 // up to a break in possibleBreakPositions and try starting 426 // over from there, or we've exhausted all possible break 427 // positions and are going to do the fallback procedure. 428 // This loop prevents us from messing with anything in 429 // possibleBreakPositions that didn't work as a starting 430 // point the last time we tried it (this is to prevent a bunch of 431 // repetitive checks from slowing down some extreme cases) 432 Integer newStartingSpot = null; 433 while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains( 434 possibleBreakPositions.peek())) { 435 possibleBreakPositions.pop(); 436 } 437 438 // if we've used up all possible break-position combinations, there's 439 // an error or an unknown word in the text. In this case, we start 440 // over, treating the farthest character we've reached as the beginning 441 // of the range, and "blessing" the break positions that got us that 442 // far as real break positions 443 if (possibleBreakPositions.isEmpty()) { 444 if (bestBreakPositions != null) { 445 currentBreakPositions = bestBreakPositions; 446 if (farthestEndPoint < endPos) { 447 text.setIndex(farthestEndPoint + 1); 448 } 449 else { 450 break; 451 } 452 } 453 else { 454 if ((currentBreakPositions.size() == 0 || 455 currentBreakPositions.peek().intValue() != text.getIndex()) 456 && text.getIndex() != startPos) { 457 currentBreakPositions.push(new Integer(text.getIndex())); 458 } 459 getNext(); 460 currentBreakPositions.push(new Integer(text.getIndex())); 461 } 462 } 463 464 // if we still have more break positions we can try, then promote the 465 // last break in possibleBreakPositions into currentBreakPositions, 466 // and get rid of all entries in currentBreakPositions that come after 467 // it. Then back up to that position and start over from there (i.e., 468 // treat that position as the beginning of a new word) 469 else { 470 Integer temp = possibleBreakPositions.pop(); 471 Integer temp2 = null; 472 while (!currentBreakPositions.isEmpty() && temp.intValue() < 473 currentBreakPositions.peek().intValue()) { 474 temp2 = currentBreakPositions.pop(); 475 wrongBreakPositions.addElement(temp2); 476 } 477 currentBreakPositions.push(temp); 478 text.setIndex(currentBreakPositions.peek().intValue()); 479 } 480 481 // re-sync "c" for the next go-round, and drop out of the loop if 482 // we've made it off the end of the range 483 c = getCurrent(); 484 if (text.getIndex() >= endPos) { 485 break; 486 } 487 } 488 489 // if we didn't hit any exceptional conditions on this last iteration, 490 // just advance to the next character and loop 491 else { 492 c = getNext(); 493 } 494 } 495 496 // dump the last break position in the list, and replace it with the actual 497 // end of the range (which may be the same character, or may be further on 498 // because the range actually ended with non-dictionary characters we want to 499 // keep with the word) 500 if (!currentBreakPositions.isEmpty()) { 501 currentBreakPositions.pop(); 502 } 503 currentBreakPositions.push(Integer.valueOf(endPos)); 504 505 // create a regular array to hold the break positions and copy 506 // the break positions from the stack to the array (in addition, 507 // our starting position goes into this array as a break position). 508 // This array becomes the cache of break positions used by next() 509 // and previous(), so this is where we actually refresh the cache. 510 cachedBreakPositions = new int[currentBreakPositions.size() + 1]; 511 cachedBreakPositions[0] = startPos; 512 513 for (int i = 0; i < currentBreakPositions.size(); i++) { 514 cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue(); 515 } 516 positionInCache = 0; 517 } 518 }