src/share/classes/sun/util/locale/provider/DictionaryBasedBreakIterator.java

Print this page
rev 5615 : 6336885: RFE: Locale Data Deployment Enhancements
4609153: Provide locale data for Indic locales
5104387: Support for gl_ES locale (galician language)
6337471: desktop/system locale preferences support
7056139: (cal) SPI support for locale-dependent Calendar parameters
7058206: Provide CalendarData SPI for week params and display field value names
7073852: Support multiple scripts for digits and decimal symbols per locale
7079560: [Fmt-Da] Context dependent month names support in SimpleDateFormat
7171324: getAvailableLocales() of locale sensitive services should return the actual availability of locales
7151414: (cal) Support calendar type identification
7168528: LocaleServiceProvider needs to be aware of Locale extensions
7171372: (cal) locale's default Calendar should be created if unknown calendar is specified
Summary: JEP 127: Improve Locale Data Packaging and Adopt Unicode CLDR Data (part 1, without Jigsaw; by Naoto Sato and Masayoshi Okutsu)

*** 1,7 **** /* ! * Copyright (c) 1999, 2008, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 36,53 **** * * This notice and attribution to Taligent may not be removed. * Taligent is a registered trademark of Taligent, Inc. */ ! package java.text; - import java.util.Vector; - import java.util.Stack; - import java.util.Hashtable; - import java.text.CharacterIterator; - import java.io.InputStream; import java.io.IOException; /** * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary * to further subdivide ranges of text beyond what is possible using just the * state-table-based algorithm. This is necessary, for example, to handle --- 36,52 ---- * * This notice and attribution to Taligent may not be removed. * Taligent is a registered trademark of Taligent, Inc. */ ! package sun.util.locale.provider; import java.io.IOException; + import java.text.CharacterIterator; + import java.util.ArrayList; + import java.util.List; + import java.util.Stack; /** * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary * to further subdivide ranges of text beyond what is possible using just the * state-table-based algorithm. This is necessary, for example, to handle
*** 112,122 **** * @param description Same as the description parameter on RuleBasedBreakIterator, * except for the special meaning of "<dictionary>". This parameter is just * passed through to RuleBasedBreakIterator's constructor. * @param dictionaryFilename The filename of the dictionary file to use */ ! public DictionaryBasedBreakIterator(String dataFile, String dictionaryFile) throws IOException { super(dataFile); byte[] tmp = super.getAdditionalData(); if (tmp != null) { prepareCategoryFlags(tmp); --- 111,121 ---- * @param description Same as the description parameter on RuleBasedBreakIterator, * except for the special meaning of "<dictionary>". This parameter is just * passed through to RuleBasedBreakIterator's constructor. * @param dictionaryFilename The filename of the dictionary file to use */ ! DictionaryBasedBreakIterator(String dataFile, String dictionaryFile) throws IOException { super(dataFile); byte[] tmp = super.getAdditionalData(); if (tmp != null) { prepareCategoryFlags(tmp);
*** 130,139 **** --- 129,139 ---- for (int i = 0; i < data.length; i++) { categoryFlags[i] = (data[i] == (byte)1) ? true : false; } } + @Override public void setText(CharacterIterator newText) { super.setText(newText); cachedBreakPositions = null; dictionaryCharCount = 0; positionInCache = 0;
*** 142,151 **** --- 142,152 ---- /** * Sets the current iteration position to the beginning of the text. * (i.e., the CharacterIterator's starting offset). * @return The offset of the beginning of the text. */ + @Override public int first() { cachedBreakPositions = null; dictionaryCharCount = 0; positionInCache = 0; return super.first();
*** 154,163 **** --- 155,165 ---- /** * Sets the current iteration position to the end of the text. * (i.e., the CharacterIterator's ending offset). * @return The text's past-the-end offset. */ + @Override public int last() { cachedBreakPositions = null; dictionaryCharCount = 0; positionInCache = 0; return super.last();
*** 166,175 **** --- 168,178 ---- /** * Advances the iterator one step backwards. * @return The position of the last boundary position before the * current iteration position */ + @Override public int previous() { CharacterIterator text = getText(); // if we have cached break positions and we're still in the range // covered by them, just move one step backward in the cache
*** 196,205 **** --- 199,209 ---- * Sets the current iteration position to the last boundary position * before the specified position. * @param offset The position to begin searching from * @return The position of the last boundary before "offset" */ + @Override public int preceding(int offset) { CharacterIterator text = getText(); checkOffset(offset, text); // if we have no cached break positions, or "offset" is outside the
*** 231,240 **** --- 235,245 ---- * Sets the current iteration position to the first boundary position after * the specified position. * @param offset The position to begin searching forward from * @return The position of the first boundary after "offset" */ + @Override public int following(int offset) { CharacterIterator text = getText(); checkOffset(offset, text); // if we have no cached break positions, or if "offset" is outside the
*** 262,271 **** --- 267,277 ---- } /** * This is the implementation function for next(). */ + @Override protected int handleNext() { CharacterIterator text = getText(); // if there are no cached break positions, or if we've just moved // off the end of the range covered by the cache, we have to dump
*** 307,316 **** --- 313,323 ---- } /** * Looks up a character category for a character. */ + @Override protected int lookupCategory(int c) { // this override of lookupCategory() exists only to keep track of whether we've // passed over any dictionary characters. It calls the inherited lookupCategory() // to do the real work, and then checks whether its return value is one of the // categories represented in the dictionary. If it is, bump the dictionary-
*** 328,337 **** --- 335,345 ---- * dictionary to determine the positions of any boundaries in this * range. It stores all the boundary positions it discovers in * cachedBreakPositions so that we only have to do this work once * for each time we enter the range. */ + @SuppressWarnings("unchecked") private void divideUpDictionaryRange(int startPos, int endPos) { CharacterIterator text = getText(); // the range we're dividing may begin or end with non-dictionary characters // (i.e., for line breaking, we may have leading or trailing punctuation
*** 356,366 **** // continues in this way until we either successfully make it all the way // across the range, or exhaust all of our combinations of break // positions.) Stack<Integer> currentBreakPositions = new Stack<>(); Stack<Integer> possibleBreakPositions = new Stack<>(); ! Vector<Integer> wrongBreakPositions = new Vector<>(); // the dictionary is implemented as a trie, which is treated as a state // machine. -1 represents the end of a legal word. Every word in the // dictionary is represented by a path from the root node to -1. A path // that ends in state 0 is an illegal combination of characters. --- 364,374 ---- // continues in this way until we either successfully make it all the way // across the range, or exhaust all of our combinations of break // positions.) Stack<Integer> currentBreakPositions = new Stack<>(); Stack<Integer> possibleBreakPositions = new Stack<>(); ! List<Integer> wrongBreakPositions = new ArrayList<>(); // the dictionary is implemented as a trie, which is treated as a state // machine. -1 represents the end of a legal word. Every word in the // dictionary is represented by a path from the root node to -1. A path // that ends in state 0 is an illegal combination of characters.
*** 382,392 **** // if we can transition to state "-1" from our current state, we're // on the last character of a legal word. Push that position onto // the possible-break-positions stack if (dictionary.getNextState(state, 0) == -1) { ! possibleBreakPositions.push(Integer.valueOf(text.getIndex())); } // look up the new state to transition to in the dictionary state = dictionary.getNextStateFromCharacter(state, c); --- 390,400 ---- // if we can transition to state "-1" from our current state, we're // on the last character of a legal word. Push that position onto // the possible-break-positions stack if (dictionary.getNextState(state, 0) == -1) { ! possibleBreakPositions.push(text.getIndex()); } // look up the new state to transition to in the dictionary state = dictionary.getNextStateFromCharacter(state, c);
*** 393,403 **** // if the character we're sitting on causes us to transition to // the "end of word" state, then it was a non-dictionary character // and we've successfully traversed the whole range. Drop out // of the loop. if (state == -1) { ! currentBreakPositions.push(Integer.valueOf(text.getIndex())); break; } // if the character we're sitting on causes us to transition to // the error state, or if we've gone off the end of the range --- 401,411 ---- // if the character we're sitting on causes us to transition to // the "end of word" state, then it was a non-dictionary character // and we've successfully traversed the whole range. Drop out // of the loop. if (state == -1) { ! currentBreakPositions.push(text.getIndex()); break; } // if the character we're sitting on causes us to transition to // the error state, or if we've gone off the end of the range
*** 417,439 **** } // wrongBreakPositions is a list of all break positions // we've tried starting that didn't allow us to traverse // all the way through the text. Every time we pop a ! //break position off of currentBreakPositions, we put it // into wrongBreakPositions to avoid trying it again later. // If we make it to this spot, we're either going to back // up to a break in possibleBreakPositions and try starting // over from there, or we've exhausted all possible break // positions and are going to do the fallback procedure. // This loop prevents us from messing with anything in // possibleBreakPositions that didn't work as a starting // point the last time we tried it (this is to prevent a bunch of // repetitive checks from slowing down some extreme cases) ! Integer newStartingSpot = null; ! while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains( ! possibleBreakPositions.peek())) { possibleBreakPositions.pop(); } // if we've used up all possible break-position combinations, there's // an error or an unknown word in the text. In this case, we start --- 425,446 ---- } // wrongBreakPositions is a list of all break positions // we've tried starting that didn't allow us to traverse // all the way through the text. Every time we pop a ! // break position off of currentBreakPositions, we put it // into wrongBreakPositions to avoid trying it again later. // If we make it to this spot, we're either going to back // up to a break in possibleBreakPositions and try starting // over from there, or we've exhausted all possible break // positions and are going to do the fallback procedure. // This loop prevents us from messing with anything in // possibleBreakPositions that didn't work as a starting // point the last time we tried it (this is to prevent a bunch of // repetitive checks from slowing down some extreme cases) ! while (!possibleBreakPositions.isEmpty() ! 
&& wrongBreakPositions.contains(possibleBreakPositions.peek())) { possibleBreakPositions.pop(); } // if we've used up all possible break-position combinations, there's // an error or an unknown word in the text. In this case, we start
*** 470,480 **** Integer temp = possibleBreakPositions.pop(); Integer temp2 = null; while (!currentBreakPositions.isEmpty() && temp.intValue() < currentBreakPositions.peek().intValue()) { temp2 = currentBreakPositions.pop(); ! wrongBreakPositions.addElement(temp2); } currentBreakPositions.push(temp); text.setIndex(currentBreakPositions.peek().intValue()); } --- 477,487 ---- Integer temp = possibleBreakPositions.pop(); Integer temp2 = null; while (!currentBreakPositions.isEmpty() && temp.intValue() < currentBreakPositions.peek().intValue()) { temp2 = currentBreakPositions.pop(); ! wrongBreakPositions.add(temp2); } currentBreakPositions.push(temp); text.setIndex(currentBreakPositions.peek().intValue()); }
*** 498,508 **** // because the range actually ended with non-dictionary characters we want to // keep with the word) if (!currentBreakPositions.isEmpty()) { currentBreakPositions.pop(); } ! currentBreakPositions.push(Integer.valueOf(endPos)); // create a regular array to hold the break positions and copy // the break positions from the stack to the array (in addition, // our starting position goes into this array as a break position). // This array becomes the cache of break positions used by next() --- 505,515 ---- // because the range actually ended with non-dictionary characters we want to // keep with the word) if (!currentBreakPositions.isEmpty()) { currentBreakPositions.pop(); } ! currentBreakPositions.push(endPos); // create a regular array to hold the break positions and copy // the break positions from the stack to the array (in addition, // our starting position goes into this array as a break position). // This array becomes the cache of break positions used by next()