src/share/classes/sun/util/locale/provider/DictionaryBasedBreakIterator.java
Print this page
rev 5696 : 6336885: RFE: Locale Data Deployment Enhancements
4609153: Provide locale data for Indic locales
5104387: Support for gl_ES locale (galician language)
6337471: desktop/system locale preferences support
7056139: (cal) SPI support for locale-dependent Calendar parameters
7058206: Provide CalendarData SPI for week params and display field value names
7073852: Support multiple scripts for digits and decimal symbols per locale
7079560: [Fmt-Da] Context dependent month names support in SimpleDateFormat
7171324: getAvailableLocales() of locale sensitive services should return the actual availability of locales
7151414: (cal) Support calendar type identification
7168528: LocaleServiceProvider needs to be aware of Locale extensions
7171372: (cal) locale's default Calendar should be created if unknown calendar is specified
Summary: JEP 127: Improve Locale Data Packaging and Adopt Unicode CLDR Data (part 1 w/o packaging changes. by Naoto Sato and Masayoshi Okutsu)
@@ -1,7 +1,7 @@
/*
- * Copyright (c) 1999, 2008, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
@@ -36,18 +36,17 @@
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*/
-package java.text;
+package sun.util.locale.provider;
-import java.util.Vector;
-import java.util.Stack;
-import java.util.Hashtable;
-import java.text.CharacterIterator;
-import java.io.InputStream;
import java.io.IOException;
+import java.text.CharacterIterator;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Stack;
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
@@ -112,11 +111,11 @@
* @param description Same as the description parameter on RuleBasedBreakIterator,
* except for the special meaning of "<dictionary>". This parameter is just
* passed through to RuleBasedBreakIterator's constructor.
* @param dictionaryFilename The filename of the dictionary file to use
*/
- public DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
+ DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
throws IOException {
super(dataFile);
byte[] tmp = super.getAdditionalData();
if (tmp != null) {
prepareCategoryFlags(tmp);
@@ -130,10 +129,11 @@
for (int i = 0; i < data.length; i++) {
categoryFlags[i] = (data[i] == (byte)1) ? true : false;
}
}
+ @Override
public void setText(CharacterIterator newText) {
super.setText(newText);
cachedBreakPositions = null;
dictionaryCharCount = 0;
positionInCache = 0;
@@ -142,10 +142,11 @@
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
*/
+ @Override
public int first() {
cachedBreakPositions = null;
dictionaryCharCount = 0;
positionInCache = 0;
return super.first();
@@ -154,10 +155,11 @@
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @return The text's past-the-end offset.
*/
+ @Override
public int last() {
cachedBreakPositions = null;
dictionaryCharCount = 0;
positionInCache = 0;
return super.last();
@@ -166,10 +168,11 @@
/**
* Advances the iterator one step backwards.
* @return The position of the last boundary position before the
* current iteration position
*/
+ @Override
public int previous() {
CharacterIterator text = getText();
// if we have cached break positions and we're still in the range
// covered by them, just move one step backward in the cache
@@ -196,10 +199,11 @@
* Sets the current iteration position to the last boundary position
* before the specified position.
* @param offset The position to begin searching from
* @return The position of the last boundary before "offset"
*/
+ @Override
public int preceding(int offset) {
CharacterIterator text = getText();
checkOffset(offset, text);
// if we have no cached break positions, or "offset" is outside the
@@ -231,10 +235,11 @@
* Sets the current iteration position to the first boundary position after
* the specified position.
* @param offset The position to begin searching forward from
* @return The position of the first boundary after "offset"
*/
+ @Override
public int following(int offset) {
CharacterIterator text = getText();
checkOffset(offset, text);
// if we have no cached break positions, or if "offset" is outside the
@@ -262,10 +267,11 @@
}
/**
* This is the implementation function for next().
*/
+ @Override
protected int handleNext() {
CharacterIterator text = getText();
// if there are no cached break positions, or if we've just moved
// off the end of the range covered by the cache, we have to dump
@@ -307,10 +313,11 @@
}
/**
* Looks up a character category for a character.
*/
+ @Override
protected int lookupCategory(int c) {
// this override of lookupCategory() exists only to keep track of whether we've
// passed over any dictionary characters. It calls the inherited lookupCategory()
// to do the real work, and then checks whether its return value is one of the
// categories represented in the dictionary. If it is, bump the dictionary-
@@ -328,10 +335,11 @@
* dictionary to determine the positions of any boundaries in this
* range. It stores all the boundary positions it discovers in
* cachedBreakPositions so that we only have to do this work once
* for each time we enter the range.
*/
+ @SuppressWarnings("unchecked")
private void divideUpDictionaryRange(int startPos, int endPos) {
CharacterIterator text = getText();
// the range we're dividing may begin or end with non-dictionary characters
// (i.e., for line breaking, we may have leading or trailing punctuation
@@ -356,11 +364,11 @@
// continues in this way until we either successfully make it all the way
// across the range, or exhaust all of our combinations of break
// positions.)
Stack<Integer> currentBreakPositions = new Stack<>();
Stack<Integer> possibleBreakPositions = new Stack<>();
- Vector<Integer> wrongBreakPositions = new Vector<>();
+ List<Integer> wrongBreakPositions = new ArrayList<>();
// the dictionary is implemented as a trie, which is treated as a state
// machine. -1 represents the end of a legal word. Every word in the
// dictionary is represented by a path from the root node to -1. A path
// that ends in state 0 is an illegal combination of characters.
@@ -382,11 +390,11 @@
// if we can transition to state "-1" from our current state, we're
// on the last character of a legal word. Push that position onto
// the possible-break-positions stack
if (dictionary.getNextState(state, 0) == -1) {
- possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
+ possibleBreakPositions.push(text.getIndex());
}
// look up the new state to transition to in the dictionary
state = dictionary.getNextStateFromCharacter(state, c);
@@ -393,11 +401,11 @@
// if the character we're sitting on causes us to transition to
// the "end of word" state, then it was a non-dictionary character
// and we've successfully traversed the whole range. Drop out
// of the loop.
if (state == -1) {
- currentBreakPositions.push(Integer.valueOf(text.getIndex()));
+ currentBreakPositions.push(text.getIndex());
break;
}
// if the character we're sitting on causes us to transition to
// the error state, or if we've gone off the end of the range
@@ -417,23 +425,22 @@
}
// wrongBreakPositions is a list of all break positions
// we've tried starting that didn't allow us to traverse
// all the way through the text. Every time we pop a
- //break position off of currentBreakPositions, we put it
+ // break position off of currentBreakPositions, we put it
// into wrongBreakPositions to avoid trying it again later.
// If we make it to this spot, we're either going to back
// up to a break in possibleBreakPositions and try starting
// over from there, or we've exhausted all possible break
// positions and are going to do the fallback procedure.
// This loop prevents us from messing with anything in
// possibleBreakPositions that didn't work as a starting
// point the last time we tried it (this is to prevent a bunch of
// repetitive checks from slowing down some extreme cases)
- Integer newStartingSpot = null;
- while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
- possibleBreakPositions.peek())) {
+ while (!possibleBreakPositions.isEmpty()
+ && wrongBreakPositions.contains(possibleBreakPositions.peek())) {
possibleBreakPositions.pop();
}
// if we've used up all possible break-position combinations, there's
// an error or an unknown word in the text. In this case, we start
@@ -470,11 +477,11 @@
Integer temp = possibleBreakPositions.pop();
Integer temp2 = null;
while (!currentBreakPositions.isEmpty() && temp.intValue() <
currentBreakPositions.peek().intValue()) {
temp2 = currentBreakPositions.pop();
- wrongBreakPositions.addElement(temp2);
+ wrongBreakPositions.add(temp2);
}
currentBreakPositions.push(temp);
text.setIndex(currentBreakPositions.peek().intValue());
}
@@ -498,11 +505,11 @@
// because the range actually ended with non-dictionary characters we want to
// keep with the word)
if (!currentBreakPositions.isEmpty()) {
currentBreakPositions.pop();
}
- currentBreakPositions.push(Integer.valueOf(endPos));
+ currentBreakPositions.push(endPos);
// create a regular array to hold the break positions and copy
// the break positions from the stack to the array (in addition,
// our starting position goes into this array as a break position).
// This array becomes the cache of break positions used by next()