1 /*
2 * Copyright (c) 1999, 2008, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 /*
27 *
28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
30 *
31 * The original version of this source code and documentation
32 * is copyrighted and owned by Taligent, Inc., a wholly-owned
33 * subsidiary of IBM. These materials are provided under terms
34 * of a License Agreement between Taligent and Sun. This technology
35 * is protected by multiple US and International patents.
36 *
37 * This notice and attribution to Taligent may not be removed.
38 * Taligent is a registered trademark of Taligent, Inc.
39 */
40
41 package java.text;
42
43 import java.util.Vector;
44 import java.util.Stack;
45 import java.util.Hashtable;
46 import java.text.CharacterIterator;
47 import java.io.InputStream;
48 import java.io.IOException;
49
50 /**
51 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
52 * to further subdivide ranges of text beyond what is possible using just the
53 * state-table-based algorithm. This is necessary, for example, to handle
54 * word and line breaking in Thai, which doesn't use spaces between words. The
55 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
56 * up text as far as possible, and then contiguous ranges of letters are
57 * repeatedly compared against a list of known words (i.e., the dictionary)
58 * to divide them up into words.
59 *
60 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
61 * but adds one more special substitution name: <dictionary>. This substitution
62 * name is used to identify characters in words in the dictionary. The idea is that
63 * if the iterator passes over a chunk of text that includes two or more characters
64 * in a row that are included in <dictionary>, it goes back through that range and
65 * derives additional break positions (if possible) using the dictionary.
66 *
67 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
68 * file. It follows a prescribed search path to locate the dictionary (right now,
97 * when a range of characters is divided up using the dictionary, the break
98 * positions that are discovered are stored here, preventing us from having
99 * to use either the dictionary or the state table again until the iterator
100 * leaves this range of text
101 */
102 private int[] cachedBreakPositions;
103
104 /**
105 * if cachedBreakPositions is not null, this indicates which item in the
106 * cache the current iteration position refers to
107 */
108 private int positionInCache;
109
110 /**
111 * Constructs a DictionaryBasedBreakIterator.
112 * @param description Same as the description parameter on RuleBasedBreakIterator,
113 * except for the special meaning of "<dictionary>". This parameter is just
114 * passed through to RuleBasedBreakIterator's constructor.
115 * @param dictionaryFilename The filename of the dictionary file to use
116 */
117 public DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
118 throws IOException {
119 super(dataFile);
120 byte[] tmp = super.getAdditionalData();
121 if (tmp != null) {
122 prepareCategoryFlags(tmp);
123 super.setAdditionalData(null);
124 }
125 dictionary = new BreakDictionary(dictionaryFile);
126 }
127
128 private void prepareCategoryFlags(byte[] data) {
129 categoryFlags = new boolean[data.length];
130 for (int i = 0; i < data.length; i++) {
131 categoryFlags[i] = (data[i] == (byte)1) ? true : false;
132 }
133 }
134
135 public void setText(CharacterIterator newText) {
136 super.setText(newText);
137 cachedBreakPositions = null;
138 dictionaryCharCount = 0;
139 positionInCache = 0;
140 }
141
142 /**
143 * Sets the current iteration position to the beginning of the text.
144 * (i.e., the CharacterIterator's starting offset).
145 * @return The offset of the beginning of the text.
146 */
147 public int first() {
148 cachedBreakPositions = null;
149 dictionaryCharCount = 0;
150 positionInCache = 0;
151 return super.first();
152 }
153
154 /**
155 * Sets the current iteration position to the end of the text.
156 * (i.e., the CharacterIterator's ending offset).
157 * @return The text's past-the-end offset.
158 */
159 public int last() {
160 cachedBreakPositions = null;
161 dictionaryCharCount = 0;
162 positionInCache = 0;
163 return super.last();
164 }
165
166 /**
167 * Advances the iterator one step backwards.
168 * @return The position of the last boundary position before the
169 * current iteration position
170 */
171 public int previous() {
172 CharacterIterator text = getText();
173
174 // if we have cached break positions and we're still in the range
175 // covered by them, just move one step backward in the cache
176 if (cachedBreakPositions != null && positionInCache > 0) {
177 --positionInCache;
178 text.setIndex(cachedBreakPositions[positionInCache]);
179 return cachedBreakPositions[positionInCache];
180 }
181
182 // otherwise, dump the cache and use the inherited previous() method to move
183 // backward. This may fill up the cache with new break positions, in which
184 // case we have to mark our position in the cache
185 else {
186 cachedBreakPositions = null;
187 int result = super.previous();
188 if (cachedBreakPositions != null) {
189 positionInCache = cachedBreakPositions.length - 2;
190 }
191 return result;
192 }
193 }
194
195 /**
196 * Sets the current iteration position to the last boundary position
197 * before the specified position.
198 * @param offset The position to begin searching from
199 * @return The position of the last boundary before "offset"
200 */
201 public int preceding(int offset) {
202 CharacterIterator text = getText();
203 checkOffset(offset, text);
204
205 // if we have no cached break positions, or "offset" is outside the
206 // range covered by the cache, we can just call the inherited routine
207 // (which will eventually call other routines in this class that may
208 // refresh the cache)
209 if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
210 offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
211 cachedBreakPositions = null;
212 return super.preceding(offset);
213 }
214
215 // on the other hand, if "offset" is within the range covered by the cache,
216 // then all we have to do is search the cache for the last break position
217 // before "offset"
218 else {
219 positionInCache = 0;
220 while (positionInCache < cachedBreakPositions.length
221 && offset > cachedBreakPositions[positionInCache]) {
222 ++positionInCache;
223 }
224 --positionInCache;
225 text.setIndex(cachedBreakPositions[positionInCache]);
226 return text.getIndex();
227 }
228 }
229
230 /**
231 * Sets the current iteration position to the first boundary position after
232 * the specified position.
233 * @param offset The position to begin searching forward from
234 * @return The position of the first boundary after "offset"
235 */
236 public int following(int offset) {
237 CharacterIterator text = getText();
238 checkOffset(offset, text);
239
240 // if we have no cached break positions, or if "offset" is outside the
241 // range covered by the cache, then dump the cache and call our
242 // inherited following() method. This will call other methods in this
243 // class that may refresh the cache.
244 if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
245 offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
246 cachedBreakPositions = null;
247 return super.following(offset);
248 }
249
250 // on the other hand, if "offset" is within the range covered by the
251 // cache, then just search the cache for the first break position
252 // after "offset"
253 else {
254 positionInCache = 0;
255 while (positionInCache < cachedBreakPositions.length
256 && offset >= cachedBreakPositions[positionInCache]) {
257 ++positionInCache;
258 }
259 text.setIndex(cachedBreakPositions[positionInCache]);
260 return text.getIndex();
261 }
262 }
263
264 /**
265 * This is the implementation function for next().
266 */
267 protected int handleNext() {
268 CharacterIterator text = getText();
269
270 // if there are no cached break positions, or if we've just moved
271 // off the end of the range covered by the cache, we have to dump
272 // and possibly regenerate the cache
273 if (cachedBreakPositions == null ||
274 positionInCache == cachedBreakPositions.length - 1) {
275
276 // start by using the inherited handleNext() to find a tentative return
277 // value. dictionaryCharCount tells us how many dictionary characters
278 // we passed over on our way to the tentative return value
279 int startPos = text.getIndex();
280 dictionaryCharCount = 0;
281 int result = super.handleNext();
282
283 // if we passed over more than one dictionary character, then we use
284 // divideUpDictionaryRange() to regenerate the cached break positions
285 // for the new range
286 if (dictionaryCharCount > 1 && result - startPos > 1) {
292 else {
293 cachedBreakPositions = null;
294 return result;
295 }
296 }
297
298 // if the cache of break positions has been regenerated (or existed all
299 // along), then just advance to the next break position in the cache
300 // and return it
301 if (cachedBreakPositions != null) {
302 ++positionInCache;
303 text.setIndex(cachedBreakPositions[positionInCache]);
304 return cachedBreakPositions[positionInCache];
305 }
306 return -9999; // SHOULD NEVER GET HERE!
307 }
308
309 /**
310 * Looks up a character category for a character.
311 */
312 protected int lookupCategory(int c) {
313 // this override of lookupCategory() exists only to keep track of whether we've
314 // passed over any dictionary characters. It calls the inherited lookupCategory()
315 // to do the real work, and then checks whether its return value is one of the
316 // categories represented in the dictionary. If it is, bump the dictionary-
317 // character count.
318 int result = super.lookupCategory(c);
319 if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
320 ++dictionaryCharCount;
321 }
322 return result;
323 }
324
325 /**
326 * This is the function that actually implements the dictionary-based
327 * algorithm. Given the endpoints of a range of text, it uses the
328 * dictionary to determine the positions of any boundaries in this
329 * range. It stores all the boundary positions it discovers in
330 * cachedBreakPositions so that we only have to do this work once
331 * for each time we enter the range.
332 */
333 private void divideUpDictionaryRange(int startPos, int endPos) {
334 CharacterIterator text = getText();
335
336 // the range we're dividing may begin or end with non-dictionary characters
337 // (i.e., for line breaking, we may have leading or trailing punctuation
338 // that needs to be kept with the word). Seek from the beginning of the
339 // range to the first dictionary character
340 text.setIndex(startPos);
341 int c = getCurrent();
342 int category = lookupCategory(c);
343 while (category == IGNORE || !categoryFlags[category]) {
344 c = getNext();
345 category = lookupCategory(c);
346 }
347
348 // initialize. We maintain two stacks: currentBreakPositions contains
349 // the list of break positions that will be returned if we successfully
350 // finish traversing the whole range now. possibleBreakPositions lists
351 // all other possible word ends we've passed along the way. (Whenever
352 // we reach an error [a sequence of characters that can't begin any word
353 // in the dictionary], we back up, possibly delete some breaks from
354 // currentBreakPositions, move a break from possibleBreakPositions
355 // to currentBreakPositions, and start over from there. This process
356 // continues in this way until we either successfully make it all the way
357 // across the range, or exhaust all of our combinations of break
358 // positions.)
359 Stack<Integer> currentBreakPositions = new Stack<>();
360 Stack<Integer> possibleBreakPositions = new Stack<>();
361 Vector<Integer> wrongBreakPositions = new Vector<>();
362
363 // the dictionary is implemented as a trie, which is treated as a state
364 // machine. -1 represents the end of a legal word. Every word in the
365 // dictionary is represented by a path from the root node to -1. A path
366 // that ends in state 0 is an illegal combination of characters.
367 int state = 0;
368
369 // these two variables are used for error handling. We keep track of the
370 // farthest we've gotten through the range being divided, and the combination
371 // of breaks that got us that far. If we use up all possible break
372 // combinations, the text contains an error or a word that's not in the
373 // dictionary. In this case, we "bless" the break positions that got us the
374 // farthest as real break positions, and then start over from scratch with
375 // the character where the error occurred.
376 int farthestEndPoint = text.getIndex();
377 Stack<Integer> bestBreakPositions = null;
378
379 // initialize (we always exit the loop with a break statement)
380 c = getCurrent();
381 while (true) {
382
383 // if we can transition to state "-1" from our current state, we're
384 // on the last character of a legal word. Push that position onto
385 // the possible-break-positions stack
386 if (dictionary.getNextState(state, 0) == -1) {
387 possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
388 }
389
390 // look up the new state to transition to in the dictionary
391 state = dictionary.getNextStateFromCharacter(state, c);
392
393 // if the character we're sitting on causes us to transition to
394 // the "end of word" state, then it was a non-dictionary character
395 // and we've successfully traversed the whole range. Drop out
396 // of the loop.
397 if (state == -1) {
398 currentBreakPositions.push(Integer.valueOf(text.getIndex()));
399 break;
400 }
401
402 // if the character we're sitting on causes us to transition to
403 // the error state, or if we've gone off the end of the range
404 // without transitioning to the "end of word" state, we've hit
405 // an error...
406 else if (state == 0 || text.getIndex() >= endPos) {
407
408 // if this is the farthest we've gotten, take note of it in
409 // case there's an error in the text
410 if (text.getIndex() > farthestEndPoint) {
411 farthestEndPoint = text.getIndex();
412
413 @SuppressWarnings("unchecked")
414 Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();
415
416 bestBreakPositions = currentBreakPositionsCopy;
417 }
418
419 // wrongBreakPositions is a list of all break positions
420 // we've tried starting that didn't allow us to traverse
421 // all the way through the text. Every time we pop a
422 //break position off of currentBreakPositions, we put it
423 // into wrongBreakPositions to avoid trying it again later.
424 // If we make it to this spot, we're either going to back
425 // up to a break in possibleBreakPositions and try starting
426 // over from there, or we've exhausted all possible break
427 // positions and are going to do the fallback procedure.
428 // This loop prevents us from messing with anything in
429 // possibleBreakPositions that didn't work as a starting
430 // point the last time we tried it (this is to prevent a bunch of
431 // repetitive checks from slowing down some extreme cases)
432 Integer newStartingSpot = null;
433 while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
434 possibleBreakPositions.peek())) {
435 possibleBreakPositions.pop();
436 }
437
438 // if we've used up all possible break-position combinations, there's
439 // an error or an unknown word in the text. In this case, we start
440 // over, treating the farthest character we've reached as the beginning
441 // of the range, and "blessing" the break positions that got us that
442 // far as real break positions
443 if (possibleBreakPositions.isEmpty()) {
444 if (bestBreakPositions != null) {
445 currentBreakPositions = bestBreakPositions;
446 if (farthestEndPoint < endPos) {
447 text.setIndex(farthestEndPoint + 1);
448 }
449 else {
450 break;
451 }
452 }
453 else {
454 if ((currentBreakPositions.size() == 0 ||
455 currentBreakPositions.peek().intValue() != text.getIndex())
456 && text.getIndex() != startPos) {
457 currentBreakPositions.push(new Integer(text.getIndex()));
458 }
459 getNext();
460 currentBreakPositions.push(new Integer(text.getIndex()));
461 }
462 }
463
464 // if we still have more break positions we can try, then promote the
465 // last break in possibleBreakPositions into currentBreakPositions,
466 // and get rid of all entries in currentBreakPositions that come after
467 // it. Then back up to that position and start over from there (i.e.,
468 // treat that position as the beginning of a new word)
469 else {
470 Integer temp = possibleBreakPositions.pop();
471 Integer temp2 = null;
472 while (!currentBreakPositions.isEmpty() && temp.intValue() <
473 currentBreakPositions.peek().intValue()) {
474 temp2 = currentBreakPositions.pop();
475 wrongBreakPositions.addElement(temp2);
476 }
477 currentBreakPositions.push(temp);
478 text.setIndex(currentBreakPositions.peek().intValue());
479 }
480
481 // re-sync "c" for the next go-round, and drop out of the loop if
482 // we've made it off the end of the range
483 c = getCurrent();
484 if (text.getIndex() >= endPos) {
485 break;
486 }
487 }
488
489 // if we didn't hit any exceptional conditions on this last iteration,
490 // just advance to the next character and loop
491 else {
492 c = getNext();
493 }
494 }
495
496 // dump the last break position in the list, and replace it with the actual
497 // end of the range (which may be the same character, or may be further on
498 // because the range actually ended with non-dictionary characters we want to
499 // keep with the word)
500 if (!currentBreakPositions.isEmpty()) {
501 currentBreakPositions.pop();
502 }
503 currentBreakPositions.push(Integer.valueOf(endPos));
504
505 // create a regular array to hold the break positions and copy
506 // the break positions from the stack to the array (in addition,
507 // our starting position goes into this array as a break position).
508 // This array becomes the cache of break positions used by next()
509 // and previous(), so this is where we actually refresh the cache.
510 cachedBreakPositions = new int[currentBreakPositions.size() + 1];
511 cachedBreakPositions[0] = startPos;
512
513 for (int i = 0; i < currentBreakPositions.size(); i++) {
514 cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
515 }
516 positionInCache = 0;
517 }
518 }
|
1 /*
2 * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 /*
27 *
28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
30 *
31 * The original version of this source code and documentation
32 * is copyrighted and owned by Taligent, Inc., a wholly-owned
33 * subsidiary of IBM. These materials are provided under terms
34 * of a License Agreement between Taligent and Sun. This technology
35 * is protected by multiple US and International patents.
36 *
37 * This notice and attribution to Taligent may not be removed.
38 * Taligent is a registered trademark of Taligent, Inc.
39 */
40
41 package sun.util.locale.provider;
42
43 import java.io.IOException;
44 import java.text.CharacterIterator;
45 import java.util.ArrayList;
46 import java.util.List;
47 import java.util.Stack;
48
49 /**
50 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
51 * to further subdivide ranges of text beyond what is possible using just the
52 * state-table-based algorithm. This is necessary, for example, to handle
53 * word and line breaking in Thai, which doesn't use spaces between words. The
54 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
55 * up text as far as possible, and then contiguous ranges of letters are
56 * repeatedly compared against a list of known words (i.e., the dictionary)
57 * to divide them up into words.
58 *
59 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
60 * but adds one more special substitution name: <dictionary>. This substitution
61 * name is used to identify characters in words in the dictionary. The idea is that
62 * if the iterator passes over a chunk of text that includes two or more characters
63 * in a row that are included in <dictionary>, it goes back through that range and
64 * derives additional break positions (if possible) using the dictionary.
65 *
66 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
67 * file. It follows a prescribed search path to locate the dictionary (right now,
96 * when a range of characters is divided up using the dictionary, the break
97 * positions that are discovered are stored here, preventing us from having
98 * to use either the dictionary or the state table again until the iterator
99 * leaves this range of text
100 */
101 private int[] cachedBreakPositions;
102
103 /**
104 * if cachedBreakPositions is not null, this indicates which item in the
105 * cache the current iteration position refers to
106 */
107 private int positionInCache;
108
109 /**
110 * Constructs a DictionaryBasedBreakIterator.
111 * @param description Same as the description parameter on RuleBasedBreakIterator,
112 * except for the special meaning of "<dictionary>". This parameter is just
113 * passed through to RuleBasedBreakIterator's constructor.
114 * @param dictionaryFilename The filename of the dictionary file to use
115 */
116 DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
117 throws IOException {
118 super(dataFile);
119 byte[] tmp = super.getAdditionalData();
120 if (tmp != null) {
121 prepareCategoryFlags(tmp);
122 super.setAdditionalData(null);
123 }
124 dictionary = new BreakDictionary(dictionaryFile);
125 }
126
127 private void prepareCategoryFlags(byte[] data) {
128 categoryFlags = new boolean[data.length];
129 for (int i = 0; i < data.length; i++) {
130 categoryFlags[i] = (data[i] == (byte)1) ? true : false;
131 }
132 }
133
134 @Override
135 public void setText(CharacterIterator newText) {
136 super.setText(newText);
137 cachedBreakPositions = null;
138 dictionaryCharCount = 0;
139 positionInCache = 0;
140 }
141
142 /**
143 * Sets the current iteration position to the beginning of the text.
144 * (i.e., the CharacterIterator's starting offset).
145 * @return The offset of the beginning of the text.
146 */
147 @Override
148 public int first() {
149 cachedBreakPositions = null;
150 dictionaryCharCount = 0;
151 positionInCache = 0;
152 return super.first();
153 }
154
155 /**
156 * Sets the current iteration position to the end of the text.
157 * (i.e., the CharacterIterator's ending offset).
158 * @return The text's past-the-end offset.
159 */
160 @Override
161 public int last() {
162 cachedBreakPositions = null;
163 dictionaryCharCount = 0;
164 positionInCache = 0;
165 return super.last();
166 }
167
168 /**
169 * Advances the iterator one step backwards.
170 * @return The position of the last boundary position before the
171 * current iteration position
172 */
173 @Override
174 public int previous() {
175 CharacterIterator text = getText();
176
177 // if we have cached break positions and we're still in the range
178 // covered by them, just move one step backward in the cache
179 if (cachedBreakPositions != null && positionInCache > 0) {
180 --positionInCache;
181 text.setIndex(cachedBreakPositions[positionInCache]);
182 return cachedBreakPositions[positionInCache];
183 }
184
185 // otherwise, dump the cache and use the inherited previous() method to move
186 // backward. This may fill up the cache with new break positions, in which
187 // case we have to mark our position in the cache
188 else {
189 cachedBreakPositions = null;
190 int result = super.previous();
191 if (cachedBreakPositions != null) {
192 positionInCache = cachedBreakPositions.length - 2;
193 }
194 return result;
195 }
196 }
197
198 /**
199 * Sets the current iteration position to the last boundary position
200 * before the specified position.
201 * @param offset The position to begin searching from
202 * @return The position of the last boundary before "offset"
203 */
204 @Override
205 public int preceding(int offset) {
206 CharacterIterator text = getText();
207 checkOffset(offset, text);
208
209 // if we have no cached break positions, or "offset" is outside the
210 // range covered by the cache, we can just call the inherited routine
211 // (which will eventually call other routines in this class that may
212 // refresh the cache)
213 if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] ||
214 offset > cachedBreakPositions[cachedBreakPositions.length - 1]) {
215 cachedBreakPositions = null;
216 return super.preceding(offset);
217 }
218
219 // on the other hand, if "offset" is within the range covered by the cache,
220 // then all we have to do is search the cache for the last break position
221 // before "offset"
222 else {
223 positionInCache = 0;
224 while (positionInCache < cachedBreakPositions.length
225 && offset > cachedBreakPositions[positionInCache]) {
226 ++positionInCache;
227 }
228 --positionInCache;
229 text.setIndex(cachedBreakPositions[positionInCache]);
230 return text.getIndex();
231 }
232 }
233
234 /**
235 * Sets the current iteration position to the first boundary position after
236 * the specified position.
237 * @param offset The position to begin searching forward from
238 * @return The position of the first boundary after "offset"
239 */
240 @Override
241 public int following(int offset) {
242 CharacterIterator text = getText();
243 checkOffset(offset, text);
244
245 // if we have no cached break positions, or if "offset" is outside the
246 // range covered by the cache, then dump the cache and call our
247 // inherited following() method. This will call other methods in this
248 // class that may refresh the cache.
249 if (cachedBreakPositions == null || offset < cachedBreakPositions[0] ||
250 offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) {
251 cachedBreakPositions = null;
252 return super.following(offset);
253 }
254
255 // on the other hand, if "offset" is within the range covered by the
256 // cache, then just search the cache for the first break position
257 // after "offset"
258 else {
259 positionInCache = 0;
260 while (positionInCache < cachedBreakPositions.length
261 && offset >= cachedBreakPositions[positionInCache]) {
262 ++positionInCache;
263 }
264 text.setIndex(cachedBreakPositions[positionInCache]);
265 return text.getIndex();
266 }
267 }
268
269 /**
270 * This is the implementation function for next().
271 */
272 @Override
273 protected int handleNext() {
274 CharacterIterator text = getText();
275
276 // if there are no cached break positions, or if we've just moved
277 // off the end of the range covered by the cache, we have to dump
278 // and possibly regenerate the cache
279 if (cachedBreakPositions == null ||
280 positionInCache == cachedBreakPositions.length - 1) {
281
282 // start by using the inherited handleNext() to find a tentative return
283 // value. dictionaryCharCount tells us how many dictionary characters
284 // we passed over on our way to the tentative return value
285 int startPos = text.getIndex();
286 dictionaryCharCount = 0;
287 int result = super.handleNext();
288
289 // if we passed over more than one dictionary character, then we use
290 // divideUpDictionaryRange() to regenerate the cached break positions
291 // for the new range
292 if (dictionaryCharCount > 1 && result - startPos > 1) {
298 else {
299 cachedBreakPositions = null;
300 return result;
301 }
302 }
303
304 // if the cache of break positions has been regenerated (or existed all
305 // along), then just advance to the next break position in the cache
306 // and return it
307 if (cachedBreakPositions != null) {
308 ++positionInCache;
309 text.setIndex(cachedBreakPositions[positionInCache]);
310 return cachedBreakPositions[positionInCache];
311 }
312 return -9999; // SHOULD NEVER GET HERE!
313 }
314
315 /**
316 * Looks up a character category for a character.
317 */
318 @Override
319 protected int lookupCategory(int c) {
320 // this override of lookupCategory() exists only to keep track of whether we've
321 // passed over any dictionary characters. It calls the inherited lookupCategory()
322 // to do the real work, and then checks whether its return value is one of the
323 // categories represented in the dictionary. If it is, bump the dictionary-
324 // character count.
325 int result = super.lookupCategory(c);
326 if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
327 ++dictionaryCharCount;
328 }
329 return result;
330 }
331
332 /**
333 * This is the function that actually implements the dictionary-based
334 * algorithm. Given the endpoints of a range of text, it uses the
335 * dictionary to determine the positions of any boundaries in this
336 * range. It stores all the boundary positions it discovers in
337 * cachedBreakPositions so that we only have to do this work once
338 * for each time we enter the range.
339 */
340 @SuppressWarnings("unchecked")
341 private void divideUpDictionaryRange(int startPos, int endPos) {
342 CharacterIterator text = getText();
343
344 // the range we're dividing may begin or end with non-dictionary characters
345 // (i.e., for line breaking, we may have leading or trailing punctuation
346 // that needs to be kept with the word). Seek from the beginning of the
347 // range to the first dictionary character
348 text.setIndex(startPos);
349 int c = getCurrent();
350 int category = lookupCategory(c);
351 while (category == IGNORE || !categoryFlags[category]) {
352 c = getNext();
353 category = lookupCategory(c);
354 }
355
356 // initialize. We maintain two stacks: currentBreakPositions contains
357 // the list of break positions that will be returned if we successfully
358 // finish traversing the whole range now. possibleBreakPositions lists
359 // all other possible word ends we've passed along the way. (Whenever
360 // we reach an error [a sequence of characters that can't begin any word
361 // in the dictionary], we back up, possibly delete some breaks from
362 // currentBreakPositions, move a break from possibleBreakPositions
363 // to currentBreakPositions, and start over from there. This process
364 // continues in this way until we either successfully make it all the way
365 // across the range, or exhaust all of our combinations of break
366 // positions.)
367 Stack<Integer> currentBreakPositions = new Stack<>();
368 Stack<Integer> possibleBreakPositions = new Stack<>();
369 List<Integer> wrongBreakPositions = new ArrayList<>();
370
371 // the dictionary is implemented as a trie, which is treated as a state
372 // machine. -1 represents the end of a legal word. Every word in the
373 // dictionary is represented by a path from the root node to -1. A path
374 // that ends in state 0 is an illegal combination of characters.
375 int state = 0;
376
377 // these two variables are used for error handling. We keep track of the
378 // farthest we've gotten through the range being divided, and the combination
379 // of breaks that got us that far. If we use up all possible break
380 // combinations, the text contains an error or a word that's not in the
381 // dictionary. In this case, we "bless" the break positions that got us the
382 // farthest as real break positions, and then start over from scratch with
383 // the character where the error occurred.
384 int farthestEndPoint = text.getIndex();
385 Stack<Integer> bestBreakPositions = null;
386
387 // initialize (we always exit the loop with a break statement)
388 c = getCurrent();
389 while (true) {
390
391 // if we can transition to state "-1" from our current state, we're
392 // on the last character of a legal word. Push that position onto
393 // the possible-break-positions stack
394 if (dictionary.getNextState(state, 0) == -1) {
395 possibleBreakPositions.push(text.getIndex());
396 }
397
398 // look up the new state to transition to in the dictionary
399 state = dictionary.getNextStateFromCharacter(state, c);
400
401 // if the character we're sitting on causes us to transition to
402 // the "end of word" state, then it was a non-dictionary character
403 // and we've successfully traversed the whole range. Drop out
404 // of the loop.
405 if (state == -1) {
406 currentBreakPositions.push(text.getIndex());
407 break;
408 }
409
410 // if the character we're sitting on causes us to transition to
411 // the error state, or if we've gone off the end of the range
412 // without transitioning to the "end of word" state, we've hit
413 // an error...
414 else if (state == 0 || text.getIndex() >= endPos) {
415
416 // if this is the farthest we've gotten, take note of it in
417 // case there's an error in the text
418 if (text.getIndex() > farthestEndPoint) {
419 farthestEndPoint = text.getIndex();
420
421 @SuppressWarnings("unchecked")
422 Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();
423
424 bestBreakPositions = currentBreakPositionsCopy;
425 }
426
427 // wrongBreakPositions is a list of all break positions
428 // we've tried starting that didn't allow us to traverse
429 // all the way through the text. Every time we pop a
430 // break position off of currentBreakPositions, we put it
431 // into wrongBreakPositions to avoid trying it again later.
432 // If we make it to this spot, we're either going to back
433 // up to a break in possibleBreakPositions and try starting
434 // over from there, or we've exhausted all possible break
435 // positions and are going to do the fallback procedure.
436 // This loop prevents us from messing with anything in
437 // possibleBreakPositions that didn't work as a starting
438 // point the last time we tried it (this is to prevent a bunch of
439 // repetitive checks from slowing down some extreme cases)
440 while (!possibleBreakPositions.isEmpty()
441 && wrongBreakPositions.contains(possibleBreakPositions.peek())) {
442 possibleBreakPositions.pop();
443 }
444
445 // if we've used up all possible break-position combinations, there's
446 // an error or an unknown word in the text. In this case, we start
447 // over, treating the farthest character we've reached as the beginning
448 // of the range, and "blessing" the break positions that got us that
449 // far as real break positions
450 if (possibleBreakPositions.isEmpty()) {
451 if (bestBreakPositions != null) {
452 currentBreakPositions = bestBreakPositions;
453 if (farthestEndPoint < endPos) {
454 text.setIndex(farthestEndPoint + 1);
455 }
456 else {
457 break;
458 }
459 }
460 else {
461 if ((currentBreakPositions.size() == 0 ||
462 currentBreakPositions.peek().intValue() != text.getIndex())
463 && text.getIndex() != startPos) {
464 currentBreakPositions.push(new Integer(text.getIndex()));
465 }
466 getNext();
467 currentBreakPositions.push(new Integer(text.getIndex()));
468 }
469 }
470
471 // if we still have more break positions we can try, then promote the
472 // last break in possibleBreakPositions into currentBreakPositions,
473 // and get rid of all entries in currentBreakPositions that come after
474 // it. Then back up to that position and start over from there (i.e.,
475 // treat that position as the beginning of a new word)
476 else {
477 Integer temp = possibleBreakPositions.pop();
478 Integer temp2 = null;
479 while (!currentBreakPositions.isEmpty() && temp.intValue() <
480 currentBreakPositions.peek().intValue()) {
481 temp2 = currentBreakPositions.pop();
482 wrongBreakPositions.add(temp2);
483 }
484 currentBreakPositions.push(temp);
485 text.setIndex(currentBreakPositions.peek().intValue());
486 }
487
488 // re-sync "c" for the next go-round, and drop out of the loop if
489 // we've made it off the end of the range
490 c = getCurrent();
491 if (text.getIndex() >= endPos) {
492 break;
493 }
494 }
495
496 // if we didn't hit any exceptional conditions on this last iteration,
497 // just advance to the next character and loop
498 else {
499 c = getNext();
500 }
501 }
502
503 // dump the last break position in the list, and replace it with the actual
504 // end of the range (which may be the same character, or may be further on
505 // because the range actually ended with non-dictionary characters we want to
506 // keep with the word)
507 if (!currentBreakPositions.isEmpty()) {
508 currentBreakPositions.pop();
509 }
510 currentBreakPositions.push(endPos);
511
512 // create a regular array to hold the break positions and copy
513 // the break positions from the stack to the array (in addition,
514 // our starting position goes into this array as a break position).
515 // This array becomes the cache of break positions used by next()
516 // and previous(), so this is where we actually refresh the cache.
517 cachedBreakPositions = new int[currentBreakPositions.size() + 1];
518 cachedBreakPositions[0] = startPos;
519
520 for (int i = 0; i < currentBreakPositions.size(); i++) {
521 cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
522 }
523 positionInCache = 0;
524 }
525 }
|