src/share/classes/sun/util/locale/provider/DictionaryBasedBreakIterator.java
Print this page
rev 5615 : 6336885: RFE: Locale Data Deployment Enhancements
4609153: Provide locale data for Indic locales
5104387: Support for gl_ES locale (galician language)
6337471: desktop/system locale preferences support
7056139: (cal) SPI support for locale-dependent Calendar parameters
7058206: Provide CalendarData SPI for week params and display field value names
7073852: Support multiple scripts for digits and decimal symbols per locale
7079560: [Fmt-Da] Context dependent month names support in SimpleDateFormat
7171324: getAvailableLocales() of locale sensitive services should return the actual availability of locales
7151414: (cal) Support calendar type identification
7168528: LocaleServiceProvider needs to be aware of Locale extensions
7171372: (cal) locale's default Calendar should be created if unknown calendar is specified
Summary: JEP 127: Improve Locale Data Packaging and Adopt Unicode CLDR Data (part 1 w/o Jigsaw. by Naoto Sato and Masayoshi Okutsu)
*** 1,7 ****
/*
! * Copyright (c) 1999, 2008, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
--- 1,7 ----
/*
! * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
*** 36,53 ****
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*/
! package java.text;
- import java.util.Vector;
- import java.util.Stack;
- import java.util.Hashtable;
- import java.text.CharacterIterator;
- import java.io.InputStream;
import java.io.IOException;
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
--- 36,52 ----
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*/
! package sun.util.locale.provider;
import java.io.IOException;
+ import java.text.CharacterIterator;
+ import java.util.ArrayList;
+ import java.util.List;
+ import java.util.Stack;
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
*** 112,122 ****
* @param description Same as the description parameter on RuleBasedBreakIterator,
* except for the special meaning of "<dictionary>". This parameter is just
* passed through to RuleBasedBreakIterator's constructor.
* @param dictionaryFilename The filename of the dictionary file to use
*/
! public DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
throws IOException {
super(dataFile);
byte[] tmp = super.getAdditionalData();
if (tmp != null) {
prepareCategoryFlags(tmp);
--- 111,121 ----
* @param dataFile The name of the data file; it is just passed through to
* RuleBasedBreakIterator's constructor. (NOTE(review): older text described this
* parameter as a "description" with a special meaning for "<dictionary>" --
* confirm whether that still applies.)
* @param dictionaryFile The filename of the dictionary file to use
*/
! DictionaryBasedBreakIterator(String dataFile, String dictionaryFile)
throws IOException {
super(dataFile);
byte[] tmp = super.getAdditionalData();
if (tmp != null) {
prepareCategoryFlags(tmp);
*** 130,139 ****
--- 129,139 ----
for (int i = 0; i < data.length; i++) {
categoryFlags[i] = (data[i] == (byte)1) ? true : false;
}
}
+ @Override
public void setText(CharacterIterator newText) {
super.setText(newText);
cachedBreakPositions = null;
dictionaryCharCount = 0;
positionInCache = 0;
*** 142,151 ****
--- 142,152 ----
/**
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
*/
+ @Override
public int first() {
cachedBreakPositions = null;
dictionaryCharCount = 0;
positionInCache = 0;
return super.first();
*** 154,163 ****
--- 155,165 ----
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @return The text's past-the-end offset.
*/
+ @Override
public int last() {
cachedBreakPositions = null;
dictionaryCharCount = 0;
positionInCache = 0;
return super.last();
*** 166,175 ****
--- 168,178 ----
/**
* Advances the iterator one step backwards.
* @return The position of the last boundary position before the
* current iteration position
*/
+ @Override
public int previous() {
CharacterIterator text = getText();
// if we have cached break positions and we're still in the range
// covered by them, just move one step backward in the cache
*** 196,205 ****
--- 199,209 ----
* Sets the current iteration position to the last boundary position
* before the specified position.
* @param offset The position to begin searching from
* @return The position of the last boundary before "offset"
*/
+ @Override
public int preceding(int offset) {
CharacterIterator text = getText();
checkOffset(offset, text);
// if we have no cached break positions, or "offset" is outside the
*** 231,240 ****
--- 235,245 ----
* Sets the current iteration position to the first boundary position after
* the specified position.
* @param offset The position to begin searching forward from
* @return The position of the first boundary after "offset"
*/
+ @Override
public int following(int offset) {
CharacterIterator text = getText();
checkOffset(offset, text);
// if we have no cached break positions, or if "offset" is outside the
*** 262,271 ****
--- 267,277 ----
}
/**
* This is the implementation function for next().
*/
+ @Override
protected int handleNext() {
CharacterIterator text = getText();
// if there are no cached break positions, or if we've just moved
// off the end of the range covered by the cache, we have to dump
*** 307,316 ****
--- 313,323 ----
}
/**
* Looks up a character category for a character.
*/
+ @Override
protected int lookupCategory(int c) {
// this override of lookupCategory() exists only to keep track of whether we've
// passed over any dictionary characters. It calls the inherited lookupCategory()
// to do the real work, and then checks whether its return value is one of the
// categories represented in the dictionary. If it is, bump the dictionary-
*** 328,337 ****
--- 335,345 ----
* dictionary to determine the positions of any boundaries in this
* range. It stores all the boundary positions it discovers in
* cachedBreakPositions so that we only have to do this work once
* for each time we enter the range.
*/
+ @SuppressWarnings("unchecked")
private void divideUpDictionaryRange(int startPos, int endPos) {
CharacterIterator text = getText();
// the range we're dividing may begin or end with non-dictionary characters
// (i.e., for line breaking, we may have leading or trailing punctuation
*** 356,366 ****
// continues in this way until we either successfully make it all the way
// across the range, or exhaust all of our combinations of break
// positions.)
Stack<Integer> currentBreakPositions = new Stack<>();
Stack<Integer> possibleBreakPositions = new Stack<>();
! Vector<Integer> wrongBreakPositions = new Vector<>();
// the dictionary is implemented as a trie, which is treated as a state
// machine. -1 represents the end of a legal word. Every word in the
// dictionary is represented by a path from the root node to -1. A path
// that ends in state 0 is an illegal combination of characters.
--- 364,374 ----
// continues in this way until we either successfully make it all the way
// across the range, or exhaust all of our combinations of break
// positions.)
Stack<Integer> currentBreakPositions = new Stack<>();
Stack<Integer> possibleBreakPositions = new Stack<>();
! List<Integer> wrongBreakPositions = new ArrayList<>();
// the dictionary is implemented as a trie, which is treated as a state
// machine. -1 represents the end of a legal word. Every word in the
// dictionary is represented by a path from the root node to -1. A path
// that ends in state 0 is an illegal combination of characters.
*** 382,392 ****
// if we can transition to state "-1" from our current state, we're
// on the last character of a legal word. Push that position onto
// the possible-break-positions stack
if (dictionary.getNextState(state, 0) == -1) {
! possibleBreakPositions.push(Integer.valueOf(text.getIndex()));
}
// look up the new state to transition to in the dictionary
state = dictionary.getNextStateFromCharacter(state, c);
--- 390,400 ----
// if we can transition to state "-1" from our current state, we're
// on the last character of a legal word. Push that position onto
// the possible-break-positions stack
if (dictionary.getNextState(state, 0) == -1) {
! possibleBreakPositions.push(text.getIndex());
}
// look up the new state to transition to in the dictionary
state = dictionary.getNextStateFromCharacter(state, c);
*** 393,403 ****
// if the character we're sitting on causes us to transition to
// the "end of word" state, then it was a non-dictionary character
// and we've successfully traversed the whole range. Drop out
// of the loop.
if (state == -1) {
! currentBreakPositions.push(Integer.valueOf(text.getIndex()));
break;
}
// if the character we're sitting on causes us to transition to
// the error state, or if we've gone off the end of the range
--- 401,411 ----
// if the character we're sitting on causes us to transition to
// the "end of word" state, then it was a non-dictionary character
// and we've successfully traversed the whole range. Drop out
// of the loop.
if (state == -1) {
! currentBreakPositions.push(text.getIndex());
break;
}
// if the character we're sitting on causes us to transition to
// the error state, or if we've gone off the end of the range
*** 417,439 ****
}
// wrongBreakPositions is a list of all break positions
// we've tried starting that didn't allow us to traverse
// all the way through the text. Every time we pop a
! //break position off of currentBreakPositions, we put it
// into wrongBreakPositions to avoid trying it again later.
// If we make it to this spot, we're either going to back
// up to a break in possibleBreakPositions and try starting
// over from there, or we've exhausted all possible break
// positions and are going to do the fallback procedure.
// This loop prevents us from messing with anything in
// possibleBreakPositions that didn't work as a starting
// point the last time we tried it (this is to prevent a bunch of
// repetitive checks from slowing down some extreme cases)
! Integer newStartingSpot = null;
! while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
! possibleBreakPositions.peek())) {
possibleBreakPositions.pop();
}
// if we've used up all possible break-position combinations, there's
// an error or an unknown word in the text. In this case, we start
--- 425,446 ----
}
// wrongBreakPositions is a list of all break positions
// we've tried starting that didn't allow us to traverse
// all the way through the text. Every time we pop a
! // break position off of currentBreakPositions, we put it
// into wrongBreakPositions to avoid trying it again later.
// If we make it to this spot, we're either going to back
// up to a break in possibleBreakPositions and try starting
// over from there, or we've exhausted all possible break
// positions and are going to do the fallback procedure.
// This loop prevents us from messing with anything in
// possibleBreakPositions that didn't work as a starting
// point the last time we tried it (this is to prevent a bunch of
// repetitive checks from slowing down some extreme cases)
! while (!possibleBreakPositions.isEmpty()
! && wrongBreakPositions.contains(possibleBreakPositions.peek())) {
possibleBreakPositions.pop();
}
// if we've used up all possible break-position combinations, there's
// an error or an unknown word in the text. In this case, we start
*** 470,480 ****
Integer temp = possibleBreakPositions.pop();
Integer temp2 = null;
while (!currentBreakPositions.isEmpty() && temp.intValue() <
currentBreakPositions.peek().intValue()) {
temp2 = currentBreakPositions.pop();
! wrongBreakPositions.addElement(temp2);
}
currentBreakPositions.push(temp);
text.setIndex(currentBreakPositions.peek().intValue());
}
--- 477,487 ----
Integer temp = possibleBreakPositions.pop();
Integer temp2 = null;
while (!currentBreakPositions.isEmpty() && temp.intValue() <
currentBreakPositions.peek().intValue()) {
temp2 = currentBreakPositions.pop();
! wrongBreakPositions.add(temp2);
}
currentBreakPositions.push(temp);
text.setIndex(currentBreakPositions.peek().intValue());
}
*** 498,508 ****
// because the range actually ended with non-dictionary characters we want to
// keep with the word)
if (!currentBreakPositions.isEmpty()) {
currentBreakPositions.pop();
}
! currentBreakPositions.push(Integer.valueOf(endPos));
// create a regular array to hold the break positions and copy
// the break positions from the stack to the array (in addition,
// our starting position goes into this array as a break position).
// This array becomes the cache of break positions used by next()
--- 505,515 ----
// because the range actually ended with non-dictionary characters we want to
// keep with the word)
if (!currentBreakPositions.isEmpty()) {
currentBreakPositions.pop();
}
! currentBreakPositions.push(endPos);
// create a regular array to hold the break positions and copy
// the break positions from the stack to the array (in addition,
// our starting position goes into this array as a break position).
// This array becomes the cache of break positions used by next()