--- old/make/gendata/GendataBreakIterator.gmk 2016-10-20 12:41:25.709817595 +0900 +++ new/make/gendata/GendataBreakIterator.gmk 2016-10-20 12:41:25.581814726 +0900 @@ -55,7 +55,6 @@ $(eval $(call SetupJavaCompilation,BUILD_BREAKITERATOR_LD, \ SETUP := GENERATE_OLDBYTECODE, \ SRC := $(JDK_TOPDIR)/src/jdk.localedata/share/classes, \ - INCLUDES := $(TEXT_PKG_LD), \ INCLUDE_FILES := \ $(TEXT_PKG_LD)/BreakIteratorRules_th.java \ $(TEXT_PKG_LD)/BreakIteratorInfo_th.java, \ --- old/src/java.base/share/classes/sun/util/locale/provider/BreakIteratorProviderImpl.java 2016-10-20 12:41:26.173827988 +0900 +++ new/src/java.base/share/classes/sun/util/locale/provider/BreakIteratorProviderImpl.java 2016-10-20 12:41:26.037824941 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -32,6 +32,8 @@ import java.util.MissingResourceException; import java.util.Objects; import java.util.Set; +import sun.text.DictionaryBasedBreakIterator; +import sun.text.RuleBasedBreakIterator; /** * Concrete implementation of the {@link java.text.spi.BreakIteratorProvider @@ -153,29 +155,31 @@ } private BreakIterator getBreakInstance(Locale locale, - int type, - String dataName, - String dictionaryName) { + int type, + String ruleName, + String dictionaryName) { Objects.requireNonNull(locale); LocaleResources lr = LocaleProviderAdapter.forJRE().getLocaleResources(locale); String[] classNames = (String[]) lr.getBreakIteratorInfo("BreakIteratorClasses"); - String dataFile = (String) lr.getBreakIteratorInfo(dataName); + String ruleFile = (String) lr.getBreakIteratorInfo(ruleName); + byte[] ruleData = lr.getBreakIteratorResources(ruleName); try { switch (classNames[type]) { case "RuleBasedBreakIterator": - return new 
RuleBasedBreakIterator( - lr.getBreakIteratorDataModule(), dataFile); + return new RuleBasedBreakIterator(ruleFile, ruleData); + case "DictionaryBasedBreakIterator": String dictionaryFile = (String) lr.getBreakIteratorInfo(dictionaryName); - return new DictionaryBasedBreakIterator( - lr.getBreakIteratorDataModule(), dataFile, dictionaryFile); + byte[] dictionaryData = lr.getBreakIteratorResources(dictionaryName); + return new DictionaryBasedBreakIterator(ruleFile, ruleData, + dictionaryFile, dictionaryData); default: throw new IllegalArgumentException("Invalid break iterator class \"" + classNames[type] + "\""); } - } catch (IOException | MissingResourceException | IllegalArgumentException e) { + } catch (MissingResourceException | IllegalArgumentException e) { throw new InternalError(e.toString(), e); } } --- old/src/java.base/share/classes/sun/util/locale/provider/LocaleResources.java 2016-10-20 12:41:26.449834170 +0900 +++ new/src/java.base/share/classes/sun/util/locale/provider/LocaleResources.java 2016-10-20 12:41:26.345831840 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -42,7 +42,6 @@ import java.lang.ref.ReferenceQueue; import java.lang.ref.SoftReference; -import java.lang.reflect.Module; import java.text.MessageFormat; import java.util.Calendar; import java.util.LinkedHashSet; @@ -113,13 +112,14 @@ if (data == null || ((biInfo = data.get()) == null)) { biInfo = localeData.getBreakIteratorInfo(locale).getObject(key); cache.put(cacheKey, new ResourceReference(cacheKey, biInfo, referenceQueue)); - } + } return biInfo; } - Module getBreakIteratorDataModule() { - return localeData.getBreakIteratorInfo(locale).getClass().getModule(); + @SuppressWarnings("unchecked") + byte[] getBreakIteratorResources(String key) { + return (byte[]) localeData.getBreakIteratorResources(locale).getObject(key); } int getCalendarData(String key) { --- old/src/java.base/share/classes/sun/util/resources/LocaleData.java 2016-10-20 12:41:26.717840174 +0900 +++ new/src/java.base/share/classes/sun/util/resources/LocaleData.java 2016-10-20 12:41:26.613837844 +0900 @@ -123,6 +123,14 @@ } /** + * Gets a break iterator resources resource bundle, using + * privileges to allow accessing a sun.* package. + */ + public ResourceBundle getBreakIteratorResources(Locale locale) { + return getBundle(type.getTextResourcesPackage() + ".BreakIteratorResources", locale); + } + + /** * Gets a collation data resource bundle, using privileges * to allow accessing a sun.* package. */ --- old/src/java.base/share/classes/sun/util/locale/provider/RuleBasedBreakIterator.java 2016-10-20 12:41:27.049847612 +0900 +++ /dev/null 2016-10-11 12:01:27.741739134 +0900 @@ -1,1198 +0,0 @@ -/* - * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - * - * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved - * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved - * - * The original version of this source code and documentation - * is copyrighted and owned by Taligent, Inc., a wholly-owned - * subsidiary of IBM. These materials are provided under terms - * of a License Agreement between Taligent and Sun. This technology - * is protected by multiple US and International patents. - * - * This notice and attribution to Taligent may not be removed. - * Taligent is a registered trademark of Taligent, Inc. 
- */ - -package sun.util.locale.provider; - -import java.io.BufferedInputStream; -import java.io.InputStream; -import java.io.IOException; -import java.lang.reflect.Module; -import java.security.AccessController; -import java.security.PrivilegedActionException; -import java.security.PrivilegedExceptionAction; -import java.text.BreakIterator; -import java.text.CharacterIterator; -import java.text.StringCharacterIterator; -import java.util.MissingResourceException; -import sun.text.CompactByteArray; -import sun.text.SupplementaryCharacterData; - -/** - *

A subclass of BreakIterator whose behavior is specified using a list of rules.

- * - *

There are two kinds of rules, which are separated by semicolons: substitutions - * and regular expressions.

- * - *

A substitution rule defines a name that can be used in place of an expression. It - * consists of a name, which is a string of characters contained in angle brackets, an equals - * sign, and an expression. (There can be no whitespace on either side of the equals sign.) - * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or - * square brackets. A substitution is visible after its definition, and is filled in using - * simple textual substitution. Substitution definitions can contain other substitutions, as - * long as those substitutions have been defined first. Substitutions are generally used to - * make the regular expressions (which can get quite complex) shorter and easier to read. - * They typically define either character categories or commonly-used subexpressions.

- * - *

There is one special substitution.  If the description defines a substitution - * called "<ignore>", the expression must be a [] expression, and the - * expression defines a set of characters (the "ignore characters") that - * will be transparent to the BreakIterator.  A sequence of characters will break the - * same way it would if any ignore characters it contains are taken out.  Break - * positions never occur before ignore characters.

- * - *

A regular expression uses a subset of the normal Unix regular-expression syntax, and - * defines a sequence of characters to be kept together. With one significant exception, the - * iterator uses a longest-possible-match algorithm when matching text to regular - * expressions. The iterator also treats descriptions containing multiple regular expressions - * as if they were ORed together (i.e., as if they were separated by |).

- * - *

The special characters recognized by the regular-expression parser are as follows:

- * - *
- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
*Specifies that the expression preceding the asterisk may occur any number - * of times (including not at all).
{}Encloses a sequence of characters that is optional.
()Encloses a sequence of characters.  If followed by *, the sequence - * repeats.  Otherwise, the parentheses are just a grouping device and a way to delimit - * the ends of expressions containing |.
|Separates two alternative sequences of characters.  Either one - * sequence or the other, but not both, matches this expression.  The | character can - * only occur inside ().
.Matches any character.
*?Specifies a non-greedy asterisk.  *? works the same way as *, except - * when there is overlap between the last group of characters in the expression preceding the - * * and the first group of characters following the *.  When there is this kind of - * overlap, * will match the longest sequence of characters that match the expression before - * the *, and *? will match the shortest sequence of characters matching the expression - * before the *?.  For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text, - * "x[xy]*x" will match through to the last x (i.e., the leading "xxyxyyyxyxyxxyxyx" of "xxyxyyyxyxyxxyxyxyy"), - * but "x[xy]*?x" will only match the first two xes (the leading "xx" of "xxyxyyyxyxyxxyxyxyy").
[]Specifies a group of alternative characters.  A [] expression will - * match any single character that is specified in the [] expression.  For more on the - * syntax of [] expressions, see below.
/Specifies where the break position should go if text matches this - * expression.  (e.g., "[a-z]*/[:Zs:]*[0-9]" will match if the iterator sees a run - * of letters, followed by a run of whitespace, followed by a digit, but the break position - * will actually go before the whitespace).  Expressions that don't contain / put the - * break position at the end of the matching text.
\Escape character.  The \ itself is ignored, but causes the next - * character to be treated as literal character.  This has no effect for many - * characters, but for the characters listed above, this deprives them of their special - * meaning.  (There are no special escape sequences for Unicode characters, or tabs and - * newlines; these are all handled by a higher-level protocol.  In a Java string, - * "\n" will be converted to a literal newline character by the time the - * regular-expression parser sees it.  Of course, this means that \ sequences that are - * visible to the regexp parser must be written as \\ when inside a Java string.)  All - * characters in the ASCII range except for letters, digits, and control characters are - * reserved characters to the parser and must be preceded by \ even if they currently don't - * mean anything.
!If ! appears at the beginning of a regular expression, it tells the regexp - * parser that this expression specifies the backwards-iteration behavior of the iterator, - * and not its normal iteration behavior.  This is generally only used in situations - * where the automatically-generated backwards-iteration behavior doesn't produce - * satisfactory results and must be supplemented with extra client-specified rules.
(all others)All other characters are treated as literal characters, which must match - * the corresponding character(s) in the text exactly.
- *
- * - *

Within a [] expression, a number of other special characters can be used to specify - * groups of characters:

- * - *
- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
-Specifies a range of matching characters.  For example - * "[a-p]" matches all lowercase Latin letters from a to p (inclusive).  The - - * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a - * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does - * it include accented letters such as a-umlaut.
::A pair of colons containing a one- or two-letter code matches all - * characters in the corresponding Unicode category.  The two-letter codes are the same - * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]" - * matches all currency symbols and all math symbols).  Specifying a one-letter code is - * the same as specifying all two-letter codes that begin with that letter (for example, - * "[:L:]" matches all letters, and is equivalent to - * "[:Lu::Ll::Lo::Lm::Lt:]").  Anything other than a valid two-letter Unicode - * category code or a single letter that begins a Unicode category code is illegal within - * colons.
[][] expressions can nest.  This has no effect, except when used in - * conjunction with the ^ token.
^Excludes the character (or the characters in the [] expression) following - * it from the group of characters.  For example, "[a-z^p]" matches all Latin - * lowercase letters except p.  "[:L:^[\u4e00-\u9fff]]" matches all letters - * except the Han ideographs.
(all others)All other characters are treated as literal characters.  (For - * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)
- *
- * - *

For a more complete explanation, see http://www.ibm.com/java/education/boundaries/boundaries.html. - *   For examples, see the resource data (which is annotated).

- * - * @author Richard Gillam - */ -class RuleBasedBreakIterator extends BreakIterator { - - /** - * A token used as a character-category value to identify ignore characters - */ - protected static final byte IGNORE = -1; - - /** - * The state number of the starting state - */ - private static final short START_STATE = 1; - - /** - * The state-transition value indicating "stop" - */ - private static final short STOP_STATE = 0; - - /** - * Magic number for the BreakIterator data file format. - */ - static final byte[] LABEL = { - (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a', - (byte)'\0' - }; - static final int LABEL_LENGTH = LABEL.length; - - /** - * Version number of the dictionary that was read in. - */ - static final byte supportedVersion = 1; - - /** - * Header size in byte count - */ - private static final int HEADER_LENGTH = 36; - - /** - * An array length of indices for BMP characters - */ - private static final int BMP_INDICES_LENGTH = 512; - - /** - * Tables that indexes from character values to character category numbers - */ - private CompactByteArray charCategoryTable = null; - private SupplementaryCharacterData supplementaryCharCategoryTable = null; - - /** - * The table of state transitions used for forward iteration - */ - private short[] stateTable = null; - - /** - * The table of state transitions used to sync up the iterator with the - * text in backwards and random-access iteration - */ - private short[] backwardsStateTable = null; - - /** - * A list of flags indicating which states in the state table are accepting - * ("end") states - */ - private boolean[] endStates = null; - - /** - * A list of flags indicating which states in the state table are - * lookahead states (states which turn lookahead on and off) - */ - private boolean[] lookaheadStates = null; - - /** - * A table for additional data. May be used by a subclass of - * RuleBasedBreakIterator. 
- */ - private byte[] additionalData = null; - - /** - * The number of character categories (and, thus, the number of columns in - * the state tables) - */ - private int numCategories; - - /** - * The character iterator through which this BreakIterator accesses the text - */ - private CharacterIterator text = null; - - /** - * A CRC32 value of all data in datafile - */ - private long checksum; - - //======================================================================= - // constructors - //======================================================================= - - /** - * Constructs a RuleBasedBreakIterator according to the module and the datafile - * provided. - */ - RuleBasedBreakIterator(Module module, String datafile) - throws IOException, MissingResourceException { - readTables(module, datafile); - } - - /** - * Read datafile. The datafile's format is as follows: - *
-     *   BreakIteratorData {
-     *       u1           magic[7];
-     *       u1           version;
-     *       u4           totalDataSize;
-     *       header_info  header;
-     *       body         value;
-     *   }
-     * 
- * totalDataSize is the summation of the size of - * header_info and body in byte count. - *

- * In header, each field except for checksum implies the - * length of each field. Since BMPdataLength is a fixed-length - * data(512 entries), its length isn't included in header. - * checksum is a CRC32 value of all in body. - *

-     *   header_info {
-     *       u4           stateTableLength;
-     *       u4           backwardsStateTableLength;
-     *       u4           endStatesLength;
-     *       u4           lookaheadStatesLength;
-     *       u4           BMPdataLength;
-     *       u4           nonBMPdataLength;
-     *       u4           additionalDataLength;
-     *       u8           checksum;
-     *   }
-     * 
- *

- * - * Finally, BMPindices and BMPdata are set to - * charCategoryTable. nonBMPdata is set to - * supplementaryCharCategoryTable. - *

-     *   body {
-     *       u2           stateTable[stateTableLength];
-     *       u2           backwardsStateTable[backwardsStateTableLength];
-     *       u1           endStates[endStatesLength];
-     *       u1           lookaheadStates[lookaheadStatesLength];
-     *       u2           BMPindices[512];
-     *       u1           BMPdata[BMPdataLength];
-     *       u4           nonBMPdata[numNonBMPdataLength];
-     *       u1           additionalData[additionalDataLength];
-     *   }
-     * 
- */ - protected final void readTables(Module module, String datafile) - throws IOException, MissingResourceException { - - byte[] buffer = readFile(module, datafile); - - /* Read header_info. */ - int stateTableLength = getInt(buffer, 0); - int backwardsStateTableLength = getInt(buffer, 4); - int endStatesLength = getInt(buffer, 8); - int lookaheadStatesLength = getInt(buffer, 12); - int BMPdataLength = getInt(buffer, 16); - int nonBMPdataLength = getInt(buffer, 20); - int additionalDataLength = getInt(buffer, 24); - checksum = getLong(buffer, 28); - - /* Read stateTable[numCategories * numRows] */ - stateTable = new short[stateTableLength]; - int offset = HEADER_LENGTH; - for (int i = 0; i < stateTableLength; i++, offset+=2) { - stateTable[i] = getShort(buffer, offset); - } - - /* Read backwardsStateTable[numCategories * numRows] */ - backwardsStateTable = new short[backwardsStateTableLength]; - for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) { - backwardsStateTable[i] = getShort(buffer, offset); - } - - /* Read endStates[numRows] */ - endStates = new boolean[endStatesLength]; - for (int i = 0; i < endStatesLength; i++, offset++) { - endStates[i] = buffer[offset] == 1; - } - - /* Read lookaheadStates[numRows] */ - lookaheadStates = new boolean[lookaheadStatesLength]; - for (int i = 0; i < lookaheadStatesLength; i++, offset++) { - lookaheadStates[i] = buffer[offset] == 1; - } - - /* Read a category table and indices for BMP characters. */ - short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices - for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) { - temp1[i] = getShort(buffer, offset); - } - byte[] temp2 = new byte[BMPdataLength]; // BMPdata - System.arraycopy(buffer, offset, temp2, 0, BMPdataLength); - offset += BMPdataLength; - charCategoryTable = new CompactByteArray(temp1, temp2); - - /* Read a category table for non-BMP characters. 
*/ - int[] temp3 = new int[nonBMPdataLength]; - for (int i = 0; i < nonBMPdataLength; i++, offset+=4) { - temp3[i] = getInt(buffer, offset); - } - supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3); - - /* Read additional data */ - if (additionalDataLength > 0) { - additionalData = new byte[additionalDataLength]; - System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength); - } - - /* Set numCategories */ - numCategories = stateTable.length / endStates.length; - } - - protected byte[] readFile(final Module module, final String datafile) - throws IOException, MissingResourceException { - - BufferedInputStream is; - try { - PrivilegedExceptionAction pa = () -> { - String pathName = "jdk.localedata".equals(module.getName()) ? - "sun/text/resources/ext/" : - "sun/text/resources/"; - InputStream in = module.getResourceAsStream(pathName + datafile); - if (in == null) { - // Try to load the file with "java.base" module instance. Assumption - // here is that the fall back data files to be read should reside in - // java.base. - in = RuleBasedBreakIterator.class.getModule().getResourceAsStream("sun/text/resources/" + datafile); - } - - return new BufferedInputStream(in); - }; - is = AccessController.doPrivileged(pa); - } catch (PrivilegedActionException e) { - throw new InternalError(e.toString(), e); - } - - int offset = 0; - - /* First, read magic, version, and header_info. */ - int len = LABEL_LENGTH + 5; - byte[] buf = new byte[len]; - if (is.read(buf) != len) { - throw new MissingResourceException("Wrong header length", - datafile, ""); - } - - /* Validate the magic number. */ - for (int i = 0; i < LABEL_LENGTH; i++, offset++) { - if (buf[offset] != LABEL[offset]) { - throw new MissingResourceException("Wrong magic number", - datafile, ""); - } - } - - /* Validate the version number. 
*/ - if (buf[offset] != supportedVersion) { - throw new MissingResourceException("Unsupported version(" + buf[offset] + ")", - datafile, ""); - } - - /* Read data: totalDataSize + 8(for checksum) */ - len = getInt(buf, ++offset); - buf = new byte[len]; - if (is.read(buf) != len) { - throw new MissingResourceException("Wrong data length", - datafile, ""); - } - - is.close(); - - return buf; - } - - byte[] getAdditionalData() { - return additionalData; - } - - void setAdditionalData(byte[] b) { - additionalData = b; - } - - //======================================================================= - // boilerplate - //======================================================================= - /** - * Clones this iterator. - * @return A newly-constructed RuleBasedBreakIterator with the same - * behavior as this one. - */ - @Override - public Object clone() { - RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone(); - if (text != null) { - result.text = (CharacterIterator) text.clone(); - } - return result; - } - - /** - * Returns true if both BreakIterators are of the same class, have the same - * rules, and iterate over the same text. 
- */ - @Override - public boolean equals(Object that) { - try { - if (that == null) { - return false; - } - - RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; - if (checksum != other.checksum) { - return false; - } - if (text == null) { - return other.text == null; - } else { - return text.equals(other.text); - } - } - catch(ClassCastException e) { - return false; - } - } - - /** - * Returns text - */ - @Override - public String toString() { - return "[checksum=0x" + Long.toHexString(checksum) + ']'; - } - - /** - * Compute a hashcode for this BreakIterator - * @return A hash code - */ - @Override - public int hashCode() { - return (int)checksum; - } - - //======================================================================= - // BreakIterator overrides - //======================================================================= - - /** - * Sets the current iteration position to the beginning of the text. - * (i.e., the CharacterIterator's starting offset). - * @return The offset of the beginning of the text. - */ - @Override - public int first() { - CharacterIterator t = getText(); - - t.first(); - return t.getIndex(); - } - - /** - * Sets the current iteration position to the end of the text. - * (i.e., the CharacterIterator's ending offset). - * @return The text's past-the-end offset. - */ - @Override - public int last() { - CharacterIterator t = getText(); - - // I'm not sure why, but t.last() returns the offset of the last character, - // rather than the past-the-end offset - t.setIndex(t.getEndIndex()); - return t.getIndex(); - } - - /** - * Advances the iterator either forward or backward the specified number of steps. - * Negative values move backward, and positive values move forward. This is - * equivalent to repeatedly calling next() or previous(). - * @param n The number of steps to move. The sign indicates the direction - * (negative is backwards, and positive is forwards). 
- * @return The character offset of the boundary position n boundaries away from - * the current one. - */ - @Override - public int next(int n) { - int result = current(); - while (n > 0) { - result = handleNext(); - --n; - } - while (n < 0) { - result = previous(); - ++n; - } - return result; - } - - /** - * Advances the iterator to the next boundary position. - * @return The position of the first boundary after this one. - */ - @Override - public int next() { - return handleNext(); - } - - private int cachedLastKnownBreak = BreakIterator.DONE; - - /** - * Advances the iterator backwards, to the last boundary preceding this one. - * @return The position of the last boundary position preceding this one. - */ - @Override - public int previous() { - // if we're already sitting at the beginning of the text, return DONE - CharacterIterator text = getText(); - if (current() == text.getBeginIndex()) { - return BreakIterator.DONE; - } - - // set things up. handlePrevious() will back us up to some valid - // break position before the current position (we back our internal - // iterator up one step to prevent handlePrevious() from returning - // the current position), but not necessarily the last one before - // where we started - int start = current(); - int lastResult = cachedLastKnownBreak; - if (lastResult >= start || lastResult <= BreakIterator.DONE) { - getPrevious(); - lastResult = handlePrevious(); - } else { - //it might be better to check if handlePrevious() give us closer - //safe value but handlePrevious() is slow too - //So, this has to be done carefully - text.setIndex(lastResult); - } - int result = lastResult; - - // iterate forward from the known break position until we pass our - // starting point. 
The last break position before the starting - // point is our return value - while (result != BreakIterator.DONE && result < start) { - lastResult = result; - result = handleNext(); - } - - // set the current iteration position to be the last break position - // before where we started, and then return that value - text.setIndex(lastResult); - cachedLastKnownBreak = lastResult; - return lastResult; - } - - /** - * Returns previous character - */ - private int getPrevious() { - char c2 = text.previous(); - if (Character.isLowSurrogate(c2) && - text.getIndex() > text.getBeginIndex()) { - char c1 = text.previous(); - if (Character.isHighSurrogate(c1)) { - return Character.toCodePoint(c1, c2); - } else { - text.next(); - } - } - return (int)c2; - } - - /** - * Returns current character - */ - int getCurrent() { - char c1 = text.current(); - if (Character.isHighSurrogate(c1) && - text.getIndex() < text.getEndIndex()) { - char c2 = text.next(); - text.previous(); - if (Character.isLowSurrogate(c2)) { - return Character.toCodePoint(c1, c2); - } - } - return (int)c1; - } - - /** - * Returns the count of next character. - */ - private int getCurrentCodePointCount() { - char c1 = text.current(); - if (Character.isHighSurrogate(c1) && - text.getIndex() < text.getEndIndex()) { - char c2 = text.next(); - text.previous(); - if (Character.isLowSurrogate(c2)) { - return 2; - } - } - return 1; - } - - /** - * Returns next character - */ - int getNext() { - int index = text.getIndex(); - int endIndex = text.getEndIndex(); - if (index == endIndex || - (index += getCurrentCodePointCount()) >= endIndex) { - return CharacterIterator.DONE; - } - text.setIndex(index); - return getCurrent(); - } - - /** - * Returns the position of next character. 
- */ - private int getNextIndex() { - int index = text.getIndex() + getCurrentCodePointCount(); - int endIndex = text.getEndIndex(); - if (index > endIndex) { - return endIndex; - } else { - return index; - } - } - - /** - * Throw IllegalArgumentException unless begin <= offset < end. - */ - protected static final void checkOffset(int offset, CharacterIterator text) { - if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { - throw new IllegalArgumentException("offset out of bounds"); - } - } - - /** - * Sets the iterator to refer to the first boundary position following - * the specified position. - * @offset The position from which to begin searching for a break position. - * @return The position of the first break after the current position. - */ - @Override - public int following(int offset) { - - CharacterIterator text = getText(); - checkOffset(offset, text); - - // Set our internal iteration position (temporarily) - // to the position passed in. If this is the _beginning_ position, - // then we can just use next() to get our return value - text.setIndex(offset); - if (offset == text.getBeginIndex()) { - cachedLastKnownBreak = handleNext(); - return cachedLastKnownBreak; - } - - // otherwise, we have to sync up first. Use handlePrevious() to back - // us up to a known break position before the specified position (if - // we can determine that the specified position is a break position, - // we don't back up at all). This may or may not be the last break - // position at or before our starting position. Advance forward - // from here until we've passed the starting position. The position - // we stop on will be the first break position after the specified one. 
- int result = cachedLastKnownBreak; - if (result >= offset || result <= BreakIterator.DONE) { - result = handlePrevious(); - } else { - //it might be better to check if handlePrevious() give us closer - //safe value but handlePrevious() is slow too - //So, this has to be done carefully - text.setIndex(result); - } - while (result != BreakIterator.DONE && result <= offset) { - result = handleNext(); - } - cachedLastKnownBreak = result; - return result; - } - - /** - * Sets the iterator to refer to the last boundary position before the - * specified position. - * @offset The position to begin searching for a break from. - * @return The position of the last boundary before the starting position. - */ - @Override - public int preceding(int offset) { - // if we start by updating the current iteration position to the - // position specified by the caller, we can just use previous() - // to carry out this operation - CharacterIterator text = getText(); - checkOffset(offset, text); - text.setIndex(offset); - return previous(); - } - - /** - * Returns true if the specified position is a boundary position. As a side - * effect, leaves the iterator pointing to the first boundary position at - * or after "offset". - * @param offset the offset to check. - * @return True if "offset" is a boundary position. - */ - @Override - public boolean isBoundary(int offset) { - CharacterIterator text = getText(); - checkOffset(offset, text); - if (offset == text.getBeginIndex()) { - return true; - } - - // to check whether this is a boundary, we can use following() on the - // position before the specified one and return true if the position we - // get back is the one the user specified - else { - return following(offset - 1) == offset; - } - } - - /** - * Returns the current iteration position. - * @return The current iteration position. - */ - @Override - public int current() { - return getText().getIndex(); - } - - /** - * Return a CharacterIterator over the text being analyzed. 
This version - * of this method returns the actual CharacterIterator we're using internally. - * Changing the state of this iterator can have undefined consequences. If - * you need to change it, clone it first. - * @return An iterator over the text being analyzed. - */ - @Override - public CharacterIterator getText() { - // The iterator is initialized pointing to no text at all, so if this - // function is called while we're in that state, we have to fudge an - // iterator to return. - if (text == null) { - text = new StringCharacterIterator(""); - } - return text; - } - - /** - * Set the iterator to analyze a new piece of text. This function resets - * the current iteration position to the beginning of the text. - * @param newText An iterator over the text to analyze. - */ - @Override - public void setText(CharacterIterator newText) { - // Test iterator to see if we need to wrap it in a SafeCharIterator. - // The correct behavior for CharacterIterators is to allow the - // position to be set to the endpoint of the iterator. Many - // CharacterIterators do not uphold this, so this is a workaround - // to permit them to use this class. - int end = newText.getEndIndex(); - boolean goodIterator; - try { - newText.setIndex(end); // some buggy iterators throw an exception here - goodIterator = newText.getIndex() == end; - } - catch(IllegalArgumentException e) { - goodIterator = false; - } - - if (goodIterator) { - text = newText; - } - else { - text = new SafeCharIterator(newText); - } - text.first(); - - cachedLastKnownBreak = BreakIterator.DONE; - } - - - //======================================================================= - // implementation - //======================================================================= - - /** - * This method is the actual implementation of the next() method. All iteration - * vectors through here. 
This method initializes the state machine to state 1 - * and advances through the text character by character until we reach the end - * of the text or the state machine transitions to state 0. We update our return - * value every time the state machine passes through a possible end state. - */ - protected int handleNext() { - // if we're already at the end of the text, return DONE. - CharacterIterator text = getText(); - if (text.getIndex() == text.getEndIndex()) { - return BreakIterator.DONE; - } - - // no matter what, we always advance at least one character forward - int result = getNextIndex(); - int lookaheadResult = 0; - - // begin in state 1 - int state = START_STATE; - int category; - int c = getCurrent(); - - // loop until we reach the end of the text or transition to state 0 - while (c != CharacterIterator.DONE && state != STOP_STATE) { - - // look up the current character's character category (which tells us - // which column in the state table to look at) - category = lookupCategory(c); - - // if the character isn't an ignore character, look up a state - // transition in the state table - if (category != IGNORE) { - state = lookupState(state, category); - } - - // if the state we've just transitioned to is a lookahead state, - // (but not also an end state), save its position. 
If it's - // both a lookahead state and an end state, update the break position - // to the last saved lookup-state position - if (lookaheadStates[state]) { - if (endStates[state]) { - result = lookaheadResult; - } - else { - lookaheadResult = getNextIndex(); - } - } - - // otherwise, if the state we've just transitioned to is an accepting - // state, update the break position to be the current iteration position - else { - if (endStates[state]) { - result = getNextIndex(); - } - } - - c = getNext(); - } - - // if we've run off the end of the text, and the very last character took us into - // a lookahead state, advance the break position to the lookahead position - // (the theory here is that if there are no characters at all after the lookahead - // position, that always matches the lookahead criteria) - if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) { - result = lookaheadResult; - } - - text.setIndex(result); - return result; - } - - /** - * This method backs the iterator back up to a "safe position" in the text. - * This is a position that we know, without any context, must be a break position. - * The various calling methods then iterate forward from this safe position to - * the appropriate position to return. (For more information, see the description - * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) 
- */ - protected int handlePrevious() { - CharacterIterator text = getText(); - int state = START_STATE; - int category = 0; - int lastCategory = 0; - int c = getCurrent(); - - // loop until we reach the beginning of the text or transition to state 0 - while (c != CharacterIterator.DONE && state != STOP_STATE) { - - // save the last character's category and look up the current - // character's category - lastCategory = category; - category = lookupCategory(c); - - // if the current character isn't an ignore character, look up a - // state transition in the backwards state table - if (category != IGNORE) { - state = lookupBackwardState(state, category); - } - - // then advance one character backwards - c = getPrevious(); - } - - // if we didn't march off the beginning of the text, we're either one or two - // positions away from the real break position. (One because of the call to - // previous() at the end of the loop above, and another because the character - // that takes us into the stop state will always be the character BEFORE - // the break position.) - if (c != CharacterIterator.DONE) { - if (lastCategory != IGNORE) { - getNext(); - getNext(); - } - else { - getNext(); - } - } - return text.getIndex(); - } - - /** - * Looks up a character's category (i.e., its category for breaking purposes, - * not its Unicode category) - */ - protected int lookupCategory(int c) { - if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) { - return charCategoryTable.elementAt((char)c); - } else { - return supplementaryCharCategoryTable.getValue(c); - } - } - - /** - * Given a current state and a character category, looks up the - * next state to transition to in the state table. - */ - protected int lookupState(int state, int category) { - return stateTable[state * numCategories + category]; - } - - /** - * Given a current state and a character category, looks up the - * next state to transition to in the backwards state table. 
- */ - protected int lookupBackwardState(int state, int category) { - return backwardsStateTable[state * numCategories + category]; - } - - static long getLong(byte[] buf, int offset) { - long num = buf[offset]&0xFF; - for (int i = 1; i < 8; i++) { - num = num<<8 | (buf[offset+i]&0xFF); - } - return num; - } - - static int getInt(byte[] buf, int offset) { - int num = buf[offset]&0xFF; - for (int i = 1; i < 4; i++) { - num = num<<8 | (buf[offset+i]&0xFF); - } - return num; - } - - static short getShort(byte[] buf, int offset) { - short num = (short)(buf[offset]&0xFF); - num = (short)(num<<8 | (buf[offset+1]&0xFF)); - return num; - } - - /* - * This class exists to work around a bug in incorrect implementations - * of CharacterIterator, which incorrectly handle setIndex(endIndex). - * This iterator relies only on base.setIndex(n) where n is less than - * endIndex. - * - * One caveat: if the base iterator's begin and end indices change - * the change will not be reflected by this wrapper. Does that matter? - */ - // TODO: Review this class to see if it's still required. 
- private static final class SafeCharIterator implements CharacterIterator, - Cloneable { - - private CharacterIterator base; - private int rangeStart; - private int rangeLimit; - private int currentIndex; - - SafeCharIterator(CharacterIterator base) { - this.base = base; - this.rangeStart = base.getBeginIndex(); - this.rangeLimit = base.getEndIndex(); - this.currentIndex = base.getIndex(); - } - - @Override - public char first() { - return setIndex(rangeStart); - } - - @Override - public char last() { - return setIndex(rangeLimit - 1); - } - - @Override - public char current() { - if (currentIndex < rangeStart || currentIndex >= rangeLimit) { - return DONE; - } - else { - return base.setIndex(currentIndex); - } - } - - @Override - public char next() { - - currentIndex++; - if (currentIndex >= rangeLimit) { - currentIndex = rangeLimit; - return DONE; - } - else { - return base.setIndex(currentIndex); - } - } - - @Override - public char previous() { - - currentIndex--; - if (currentIndex < rangeStart) { - currentIndex = rangeStart; - return DONE; - } - else { - return base.setIndex(currentIndex); - } - } - - @Override - public char setIndex(int i) { - - if (i < rangeStart || i > rangeLimit) { - throw new IllegalArgumentException("Invalid position"); - } - currentIndex = i; - return current(); - } - - @Override - public int getBeginIndex() { - return rangeStart; - } - - @Override - public int getEndIndex() { - return rangeLimit; - } - - @Override - public int getIndex() { - return currentIndex; - } - - @Override - public Object clone() { - - SafeCharIterator copy = null; - try { - copy = (SafeCharIterator) super.clone(); - } - catch(CloneNotSupportedException e) { - throw new Error("Clone not supported: " + e); - } - - CharacterIterator copyOfBase = (CharacterIterator) base.clone(); - copy.base = copyOfBase; - return copy; - } - } -} --- /dev/null 2016-10-11 12:01:27.741739134 +0900 +++ new/src/java.base/share/classes/sun/text/RuleBasedBreakIterator.java 2016-10-20 
12:41:26.909844475 +0900 @@ -0,0 +1,1144 @@ +/* + * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved + * + * The original version of this source code and documentation + * is copyrighted and owned by Taligent, Inc., a wholly-owned + * subsidiary of IBM. These materials are provided under terms + * of a License Agreement between Taligent and Sun. This technology + * is protected by multiple US and International patents. + * + * This notice and attribution to Taligent may not be removed. + * Taligent is a registered trademark of Taligent, Inc. 
+ */ + +package sun.text; + +import java.nio.BufferUnderflowException; +import java.nio.ByteBuffer; +import java.text.BreakIterator; +import java.text.CharacterIterator; +import java.text.StringCharacterIterator; +import java.util.MissingResourceException; +import sun.text.CompactByteArray; +import sun.text.SupplementaryCharacterData; + +/** + *

A subclass of BreakIterator whose behavior is specified using a list of rules.

+ * + *

There are two kinds of rules, which are separated by semicolons: substitutions + * and regular expressions.

+ * + *

A substitution rule defines a name that can be used in place of an expression. It + * consists of a name, which is a string of characters contained in angle brackets, an equals + * sign, and an expression. (There can be no whitespace on either side of the equals sign.) + * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or + * square brackets. A substitution is visible after its definition, and is filled in using + * simple textual substitution. Substitution definitions can contain other substitutions, as + * long as those substitutions have been defined first. Substitutions are generally used to + * make the regular expressions (which can get quite complex) shorter and easier to read. + * They typically define either character categories or commonly-used subexpressions.

+ * + *

There is one special substitution.  If the description defines a substitution + * called "<ignore>", the expression must be a [] expression, and the + * expression defines a set of characters (the "ignore characters") that + * will be transparent to the BreakIterator.  A sequence of characters will break the + * same way it would if any ignore characters it contains are taken out.  Break + * positions never occur before ignore characters.

+ * + *

A regular expression uses a subset of the normal Unix regular-expression syntax, and + * defines a sequence of characters to be kept together. With one significant exception, the + * iterator uses a longest-possible-match algorithm when matching text to regular + * expressions. The iterator also treats descriptions containing multiple regular expressions + * as if they were ORed together (i.e., as if they were separated by |).

+ * + *

The special characters recognized by the regular-expression parser are as follows:

+ * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
*Specifies that the expression preceding the asterisk may occur any number + * of times (including not at all).
{}Encloses a sequence of characters that is optional.
()Encloses a sequence of characters.  If followed by *, the sequence + * repeats.  Otherwise, the parentheses are just a grouping device and a way to delimit + * the ends of expressions containing |.
|Separates two alternative sequences of characters.  Either one + * sequence or the other, but not both, matches this expression.  The | character can + * only occur inside ().
.Matches any character.
*?Specifies a non-greedy asterisk.  *? works the same way as *, except + * when there is overlap between the last group of characters in the expression preceding the + * * and the first group of characters following the *.  When there is this kind of + * overlap, * will match the longest sequence of characters that match the expression before + * the *, and *? will match the shortest sequence of characters matching the expression + * before the *?.  For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text, + * "x[xy]*x" will match through to the last x (i.e., "xxyxyyyxyxyxxyxyx"), + * but "x[xy]*?x" will only match the first two xes (the leading "xx").
[]Specifies a group of alternative characters.  A [] expression will + * match any single character that is specified in the [] expression.  For more on the + * syntax of [] expressions, see below.
/Specifies where the break position should go if text matches this + * expression.  (e.g., "[a-z]*/[:Zs:]*[0-9]" will match if the iterator sees a run + * of letters, followed by a run of whitespace, followed by a digit, but the break position + * will actually go before the whitespace).  Expressions that don't contain / put the + * break position at the end of the matching text.
\Escape character.  The \ itself is ignored, but causes the next + * character to be treated as literal character.  This has no effect for many + * characters, but for the characters listed above, this deprives them of their special + * meaning.  (There are no special escape sequences for Unicode characters, or tabs and + * newlines; these are all handled by a higher-level protocol.  In a Java string, + * "\n" will be converted to a literal newline character by the time the + * regular-expression parser sees it.  Of course, this means that \ sequences that are + * visible to the regexp parser must be written as \\ when inside a Java string.)  All + * characters in the ASCII range except for letters, digits, and control characters are + * reserved characters to the parser and must be preceded by \ even if they currently don't + * mean anything.
!If ! appears at the beginning of a regular expression, it tells the regexp + * parser that this expression specifies the backwards-iteration behavior of the iterator, + * and not its normal iteration behavior.  This is generally only used in situations + * where the automatically-generated backwards-iteration behavior doesn't produce + * satisfactory results and must be supplemented with extra client-specified rules.
(all others)All other characters are treated as literal characters, which must match + * the corresponding character(s) in the text exactly.
+ *
+ * + *

Within a [] expression, a number of other special characters can be used to specify + * groups of characters:

+ * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
-Specifies a range of matching characters.  For example + * "[a-p]" matches all lowercase Latin letters from a to p (inclusive).  The - + * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a + * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does + * it include accented letters such as a-umlaut.
::A pair of colons containing a one- or two-letter code matches all + * characters in the corresponding Unicode category.  The two-letter codes are the same + * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]" + * matches all currency symbols and all math symbols).  Specifying a one-letter code is + * the same as specifying all two-letter codes that begin with that letter (for example, + * "[:L:]" matches all letters, and is equivalent to + * "[:Lu::Ll::Lo::Lm::Lt:]").  Anything other than a valid two-letter Unicode + * category code or a single letter that begins a Unicode category code is illegal within + * colons.
[][] expressions can nest.  This has no effect, except when used in + * conjunction with the ^ token.
^Excludes the character (or the characters in the [] expression) following + * it from the group of characters.  For example, "[a-z^p]" matches all Latin + * lowercase letters except p.  "[:L:^[\u4e00-\u9fff]]" matches all letters + * except the Han ideographs.
(all others)All other characters are treated as literal characters.  (For + * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)
+ *
+ * + *

For a more complete explanation, see http://www.ibm.com/java/education/boundaries/boundaries.html. + *   For examples, see the resource data (which is annotated).

+ * + * @author Richard Gillam + */ +public class RuleBasedBreakIterator extends BreakIterator { + + /** + * A token used as a character-category value to identify ignore characters + */ + protected static final byte IGNORE = -1; + + /** + * The state number of the starting state + */ + private static final short START_STATE = 1; + + /** + * The state-transition value indicating "stop" + */ + private static final short STOP_STATE = 0; + + /** + * Magic number for the BreakIterator data file format. + */ + static final byte[] LABEL = { + (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a', + (byte)'\0' + }; + static final int LABEL_LENGTH = LABEL.length; + + /** + * Version number of the dictionary that was read in. + */ + static final byte supportedVersion = 1; + + /** + * An array length of indices for BMP characters + */ + private static final int BMP_INDICES_LENGTH = 512; + + /** + * Tables that indexes from character values to character category numbers + */ + private CompactByteArray charCategoryTable = null; + private SupplementaryCharacterData supplementaryCharCategoryTable = null; + + /** + * The table of state transitions used for forward iteration + */ + private short[] stateTable = null; + + /** + * The table of state transitions used to sync up the iterator with the + * text in backwards and random-access iteration + */ + private short[] backwardsStateTable = null; + + /** + * A list of flags indicating which states in the state table are accepting + * ("end") states + */ + private boolean[] endStates = null; + + /** + * A list of flags indicating which states in the state table are + * lookahead states (states which turn lookahead on and off) + */ + private boolean[] lookaheadStates = null; + + /** + * A table for additional data. May be used by a subclass of + * RuleBasedBreakIterator. 
+ */ + private byte[] additionalData = null; + + /** + * The number of character categories (and, thus, the number of columns in + * the state tables) + */ + private int numCategories; + + /** + * The character iterator through which this BreakIterator accesses the text + */ + private CharacterIterator text = null; + + /** + * A CRC32 value of all data in datafile + */ + private long checksum; + + //======================================================================= + // constructors + //======================================================================= + + /** + * Constructs a RuleBasedBreakIterator using the given rule data. + * + * @throws MissingResourceException if the rule data is invalid or corrupted + */ + public RuleBasedBreakIterator(String ruleFile, byte[] ruleData) { + ByteBuffer bb = ByteBuffer.wrap(ruleData); + try { + validateRuleData(ruleFile, bb); + setupTables(ruleFile, bb); + } catch (BufferUnderflowException bue) { + MissingResourceException e; + e = new MissingResourceException("Corrupted rule data file", ruleFile, ""); + e.initCause(bue); + throw e; + } + } + + /** + * Initializes the fields with the given rule data. + * The data format is as follows: + *
+     *   BreakIteratorData {
+     *       u1           magic[7];
+     *       u1           version;
+     *       u4           totalDataSize;
+     *       header_info  header;
+     *       body         value;
+     *   }
+     * 
+ * totalDataSize is the summation of the size of + * header_info and body in byte count. + *

+ * In header, each field except for checksum gives the + * length of the corresponding table in body. Since BMPindices is a fixed-length + * table (512 entries), its length isn't included in header. + * checksum is a CRC32 value of all data in body. + *

+     *   header_info {
+     *       u4           stateTableLength;
+     *       u4           backwardsStateTableLength;
+     *       u4           endStatesLength;
+     *       u4           lookaheadStatesLength;
+     *       u4           BMPdataLength;
+     *       u4           nonBMPdataLength;
+     *       u4           additionalDataLength;
+     *       u8           checksum;
+     *   }
+     * 
+ *

+ * + * Finally, BMPindices and BMPdata are set to + * charCategoryTable. nonBMPdata is set to + * supplementaryCharCategoryTable. + *

+     *   body {
+     *       u2           stateTable[stateTableLength];
+     *       u2           backwardsStateTable[backwardsStateTableLength];
+     *       u1           endStates[endStatesLength];
+     *       u1           lookaheadStates[lookaheadStatesLength];
+     *       u2           BMPindices[512];
+     *       u1           BMPdata[BMPdataLength];
+     *       u4           nonBMPdata[nonBMPdataLength];
+     *       u1           additionalData[additionalDataLength];
+     *   }
+     * 
+ * + * @throws BufferUnderflowException if the end-of-data is reached before + * setting up all the tables + */ + private void setupTables(String ruleFile, ByteBuffer bb) { + /* Read header_info. */ + int stateTableLength = bb.getInt(); + int backwardsStateTableLength = bb.getInt(); + int endStatesLength = bb.getInt(); + int lookaheadStatesLength = bb.getInt(); + int BMPdataLength = bb.getInt(); + int nonBMPdataLength = bb.getInt(); + int additionalDataLength = bb.getInt(); + checksum = bb.getLong(); + + /* Read stateTable[numCategories * numRows] */ + stateTable = new short[stateTableLength]; + for (int i = 0; i < stateTableLength; i++) { + stateTable[i] = bb.getShort(); + } + + /* Read backwardsStateTable[numCategories * numRows] */ + backwardsStateTable = new short[backwardsStateTableLength]; + for (int i = 0; i < backwardsStateTableLength; i++) { + backwardsStateTable[i] = bb.getShort(); + } + + /* Read endStates[numRows] */ + endStates = new boolean[endStatesLength]; + for (int i = 0; i < endStatesLength; i++) { + endStates[i] = bb.get() == 1; + } + + /* Read lookaheadStates[numRows] */ + lookaheadStates = new boolean[lookaheadStatesLength]; + for (int i = 0; i < lookaheadStatesLength; i++) { + lookaheadStates[i] = bb.get() == 1; + } + + /* Read a category table and indices for BMP characters. */ + short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices + for (int i = 0; i < BMP_INDICES_LENGTH; i++) { + temp1[i] = bb.getShort(); + } + byte[] temp2 = new byte[BMPdataLength]; // BMPdata + bb.get(temp2); + charCategoryTable = new CompactByteArray(temp1, temp2); + + /* Read a category table for non-BMP characters. 
*/ + int[] temp3 = new int[nonBMPdataLength]; + for (int i = 0; i < nonBMPdataLength; i++) { + temp3[i] = bb.getInt(); + } + supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3); + + /* Read additional data */ + if (additionalDataLength > 0) { + additionalData = new byte[additionalDataLength]; + bb.get(additionalData); + } + assert bb.position() == bb.limit(); + + /* Set numCategories */ + numCategories = stateTable.length / endStates.length; + } + + /** + * Validates the magic number, version, and the length of the given data. + * + * @throws BufferUnderflowException if the end-of-data is reached while + * validating data + * @throws MissingResourceException if valification failed + */ + void validateRuleData(String ruleFile, ByteBuffer bb) { + /* Verify the magic number. */ + for (int i = 0; i < LABEL_LENGTH; i++) { + if (bb.get() != LABEL[i]) { + throw new MissingResourceException("Wrong magic number", + ruleFile, ""); + } + } + + /* Verify the version number. */ + byte version = bb.get(); + if (version != supportedVersion) { + throw new MissingResourceException("Unsupported version(" + version + ")", + ruleFile, ""); + } + + // Check the length of the rest of data + int len = bb.getInt(); + if (bb.position() + len != bb.limit()) { + throw new MissingResourceException("Wrong data length", + ruleFile, ""); + } + } + + byte[] getAdditionalData() { + return additionalData; + } + + void setAdditionalData(byte[] b) { + additionalData = b; + } + + //======================================================================= + // boilerplate + //======================================================================= + /** + * Clones this iterator. + * @return A newly-constructed RuleBasedBreakIterator with the same + * behavior as this one. 
+ */ + @Override + public Object clone() { + RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone(); + if (text != null) { + result.text = (CharacterIterator) text.clone(); + } + return result; + } + + /** + * Returns true if both BreakIterators are of the same class, have the same + * rules, and iterate over the same text. + */ + @Override + public boolean equals(Object that) { + try { + if (that == null) { + return false; + } + + RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; + if (checksum != other.checksum) { + return false; + } + if (text == null) { + return other.text == null; + } else { + return text.equals(other.text); + } + } + catch(ClassCastException e) { + return false; + } + } + + /** + * Returns text + */ + @Override + public String toString() { + return "[checksum=0x" + Long.toHexString(checksum) + ']'; + } + + /** + * Compute a hashcode for this BreakIterator + * @return A hash code + */ + @Override + public int hashCode() { + return (int)checksum; + } + + //======================================================================= + // BreakIterator overrides + //======================================================================= + + /** + * Sets the current iteration position to the beginning of the text. + * (i.e., the CharacterIterator's starting offset). + * @return The offset of the beginning of the text. + */ + @Override + public int first() { + CharacterIterator t = getText(); + + t.first(); + return t.getIndex(); + } + + /** + * Sets the current iteration position to the end of the text. + * (i.e., the CharacterIterator's ending offset). + * @return The text's past-the-end offset. 
+ */ + @Override + public int last() { + CharacterIterator t = getText(); + + // I'm not sure why, but t.last() returns the offset of the last character, + // rather than the past-the-end offset + t.setIndex(t.getEndIndex()); + return t.getIndex(); + } + + /** + * Advances the iterator either forward or backward the specified number of steps. + * Negative values move backward, and positive values move forward. This is + * equivalent to repeatedly calling next() or previous(). + * @param n The number of steps to move. The sign indicates the direction + * (negative is backwards, and positive is forwards). + * @return The character offset of the boundary position n boundaries away from + * the current one. + */ + @Override + public int next(int n) { + int result = current(); + while (n > 0) { + result = handleNext(); + --n; + } + while (n < 0) { + result = previous(); + ++n; + } + return result; + } + + /** + * Advances the iterator to the next boundary position. + * @return The position of the first boundary after this one. + */ + @Override + public int next() { + return handleNext(); + } + + private int cachedLastKnownBreak = BreakIterator.DONE; + + /** + * Advances the iterator backwards, to the last boundary preceding this one. + * @return The position of the last boundary position preceding this one. + */ + @Override + public int previous() { + // if we're already sitting at the beginning of the text, return DONE + CharacterIterator text = getText(); + if (current() == text.getBeginIndex()) { + return BreakIterator.DONE; + } + + // set things up. 
handlePrevious() will back us up to some valid + // break position before the current position (we back our internal + // iterator up one step to prevent handlePrevious() from returning + // the current position), but not necessarily the last one before + // where we started + int start = current(); + int lastResult = cachedLastKnownBreak; + if (lastResult >= start || lastResult <= BreakIterator.DONE) { + getPrevious(); + lastResult = handlePrevious(); + } else { + //it might be better to check if handlePrevious() give us closer + //safe value but handlePrevious() is slow too + //So, this has to be done carefully + text.setIndex(lastResult); + } + int result = lastResult; + + // iterate forward from the known break position until we pass our + // starting point. The last break position before the starting + // point is our return value + while (result != BreakIterator.DONE && result < start) { + lastResult = result; + result = handleNext(); + } + + // set the current iteration position to be the last break position + // before where we started, and then return that value + text.setIndex(lastResult); + cachedLastKnownBreak = lastResult; + return lastResult; + } + + /** + * Returns previous character + */ + private int getPrevious() { + char c2 = text.previous(); + if (Character.isLowSurrogate(c2) && + text.getIndex() > text.getBeginIndex()) { + char c1 = text.previous(); + if (Character.isHighSurrogate(c1)) { + return Character.toCodePoint(c1, c2); + } else { + text.next(); + } + } + return (int)c2; + } + + /** + * Returns current character + */ + int getCurrent() { + char c1 = text.current(); + if (Character.isHighSurrogate(c1) && + text.getIndex() < text.getEndIndex()) { + char c2 = text.next(); + text.previous(); + if (Character.isLowSurrogate(c2)) { + return Character.toCodePoint(c1, c2); + } + } + return (int)c1; + } + + /** + * Returns the count of next character. 
+ */ + private int getCurrentCodePointCount() { + char c1 = text.current(); + if (Character.isHighSurrogate(c1) && + text.getIndex() < text.getEndIndex()) { + char c2 = text.next(); + text.previous(); + if (Character.isLowSurrogate(c2)) { + return 2; + } + } + return 1; + } + + /** + * Advances the text iterator by one code point and returns the code point + * at the new position. Returns CharacterIterator.DONE, without moving the + * iterator, when it is already at the end of the text or when the step + * would reach the end. + */ + int getNext() { + int index = text.getIndex(); + int endIndex = text.getEndIndex(); + if (index == endIndex || + (index += getCurrentCodePointCount()) >= endIndex) { + return CharacterIterator.DONE; + } + text.setIndex(index); + return getCurrent(); + } + + /** + * Returns the index just past the code point at the current position, + * limited to the end index of the text. + */ + private int getNextIndex() { + int index = text.getIndex() + getCurrentCodePointCount(); + int endIndex = text.getEndIndex(); + if (index > endIndex) { + return endIndex; + } else { + return index; + } + } + + /** + * Throws IllegalArgumentException unless begin <= offset <= end. + * @param offset the offset to validate + * @param text the iterator whose begin and end indices bound the offset + */ + protected static final void checkOffset(int offset, CharacterIterator text) { + if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { + throw new IllegalArgumentException("offset out of bounds"); + } + } + + /** + * Sets the iterator to refer to the first boundary position following + * the specified position. + * @param offset The position from which to begin searching for a break position. + * @return The position of the first break after the specified position. + */ + @Override + public int following(int offset) { + + CharacterIterator text = getText(); + checkOffset(offset, text); + + // Set our internal iteration position (temporarily) + // to the position passed in. If this is the _beginning_ position, + // then we can just use handleNext() to get our return value + text.setIndex(offset); + if (offset == text.getBeginIndex()) { + cachedLastKnownBreak = handleNext(); + return cachedLastKnownBreak; + } + + // otherwise, we have to sync up first.
Use handlePrevious() to back + // us up to a known break position before the specified position (if + // we can determine that the specified position is a break position, + // we don't back up at all). This may or may not be the last break + // position at or before our starting position. Advance forward + // from here until we've passed the starting position. The position + // we stop on will be the first break position after the specified one. + int result = cachedLastKnownBreak; + if (result >= offset || result <= BreakIterator.DONE) { + result = handlePrevious(); + } else { + // a cached break position before offset is available, so resume + // from it instead of calling handlePrevious(). handlePrevious() might + // give a closer safe position, but it is slow too, so changing this + // has to be done carefully + text.setIndex(result); + } + while (result != BreakIterator.DONE && result <= offset) { + result = handleNext(); + } + cachedLastKnownBreak = result; + return result; + } + + /** + * Sets the iterator to refer to the last boundary position before the + * specified position. + * @param offset The position to begin searching for a break from. + * @return The position of the last boundary before the starting position, + * or BreakIterator.DONE if offset is the beginning of the text. + */ + @Override + public int preceding(int offset) { + // if we start by updating the current iteration position to the + // position specified by the caller, we can just use previous() + // to carry out this operation + CharacterIterator text = getText(); + checkOffset(offset, text); + text.setIndex(offset); + return previous(); + } + + /** + * Returns true if the specified position is a boundary position. As a side + * effect, leaves the iterator pointing to the first boundary position at + * or after "offset". + * @param offset the offset to check. + * @return True if "offset" is a boundary position.
+ */ + @Override + public boolean isBoundary(int offset) { + CharacterIterator text = getText(); + checkOffset(offset, text); + if (offset == text.getBeginIndex()) { + return true; + } + + // to check whether this is a boundary, we can use following() on the + // position before the specified one and return true if the position we + // get back is the one the user specified + else { + return following(offset - 1) == offset; + } + } + + /** + * Returns the current iteration position. + * @return The current iteration position. + */ + @Override + public int current() { + return getText().getIndex(); + } + + /** + * Returns a CharacterIterator over the text being analyzed. This version + * of this method returns the actual CharacterIterator we're using internally. + * Changing the state of this iterator can have undefined consequences. If + * you need to change it, clone it first. + * @return An iterator over the text being analyzed. + */ + @Override + public CharacterIterator getText() { + // The iterator is initialized pointing to no text at all, so if this + // function is called while we're in that state, we have to fudge an + // iterator over the empty string to return. + if (text == null) { + text = new StringCharacterIterator(""); + } + return text; + } + + /** + * Set the iterator to analyze a new piece of text. This function resets + * the current iteration position to the beginning of the text and + * discards the cached break position. + * @param newText An iterator over the text to analyze. + */ + @Override + public void setText(CharacterIterator newText) { + // Test iterator to see if we need to wrap it in a SafeCharIterator. + // The correct behavior for CharacterIterators is to allow the + // position to be set to the endpoint of the iterator. Many + // CharacterIterators do not uphold this, so this is a workaround + // to permit them to use this class.
+ int end = newText.getEndIndex(); + boolean goodIterator; + try { + newText.setIndex(end); // some buggy iterators throw an exception here + goodIterator = newText.getIndex() == end; + } + catch(IllegalArgumentException e) { + goodIterator = false; + } + + // an iterator that can be positioned at its end index is used directly; + // any other is wrapped + if (goodIterator) { + text = newText; + } + else { + text = new SafeCharIterator(newText); + } + // start iterating from the beginning of the text + text.first(); + + // a break cached for the previous text is no longer valid + cachedLastKnownBreak = BreakIterator.DONE; + } + + + //======================================================================= + // implementation + //======================================================================= + + /** + * This method is the actual implementation of the next() method. All iteration + * vectors through here. This method initializes the state machine to state 1 + * and advances through the text character by character until we reach the end + * of the text or the state machine transitions to state 0. We update our return + * value every time the state machine passes through a possible end state. + * @return The next boundary position, or BreakIterator.DONE if the iterator + * is already at the end of the text. + */ + protected int handleNext() { + // if we're already at the end of the text, return DONE.
+ CharacterIterator text = getText(); + if (text.getIndex() == text.getEndIndex()) { + return BreakIterator.DONE; + } + + // no matter what, we always advance at least one character forward + int result = getNextIndex(); + int lookaheadResult = 0; + + // begin in state 1 + int state = START_STATE; + int category; + int c = getCurrent(); + + // loop until we reach the end of the text or transition to state 0 + while (c != CharacterIterator.DONE && state != STOP_STATE) { + + // look up the current character's character category (which tells us + // which column in the state table to look at) + category = lookupCategory(c); + + // if the character isn't an ignore character, look up a state + // transition in the state table + if (category != IGNORE) { + state = lookupState(state, category); + } + + // if the state we've just transitioned to is a lookahead state, + // (but not also an end state), save its position. If it's + // both a lookahead state and an end state, update the break position + // to the last saved lookahead-state position + if (lookaheadStates[state]) { + if (endStates[state]) { + result = lookaheadResult; + } + else { + lookaheadResult = getNextIndex(); + } + } + + // otherwise, if the state we've just transitioned to is an accepting + // state, update the break position to be the position just past the + // current character + else { + if (endStates[state]) { + result = getNextIndex(); + } + } + + c = getNext(); + } + + // if we've run off the end of the text, and the very last character took us into + // a lookahead state, advance the break position to the lookahead position + // (the theory here is that if there are no characters at all after the lookahead + // position, that always matches the lookahead criteria) + if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) { + result = lookaheadResult; + } + + // leave the iterator on the boundary being returned + text.setIndex(result); + return result; + } + + /** + * This method backs the iterator back up to a "safe position" in the text.
+ * This is a position that we know, without any context, must be a break position. + * The various calling methods then iterate forward from this safe position to + * the appropriate position to return. (For more information, see the description + * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) + */ + protected int handlePrevious() { + CharacterIterator text = getText(); + int state = START_STATE; + int category = 0; + int lastCategory = 0; + int c = getCurrent(); + + // loop until we reach the beginning of the text or transition to state 0 + while (c != CharacterIterator.DONE && state != STOP_STATE) { + + // save the last character's category and look up the current + // character's category + lastCategory = category; + category = lookupCategory(c); + + // if the current character isn't an ignore character, look up a + // state transition in the backwards state table + if (category != IGNORE) { + state = lookupBackwardState(state, category); + } + + // then advance one character backwards + c = getPrevious(); + } + + // if we didn't march off the beginning of the text, we're either one or two + // positions away from the real break position. (One because of the call to + // getPrevious() at the end of the loop above, and another because the character + // that takes us into the stop state will always be the character BEFORE + // the break position.)
+ if (c != CharacterIterator.DONE) { + if (lastCategory != IGNORE) { + getNext(); + getNext(); + } + else { + getNext(); + } + } + return text.getIndex(); + } + + /** + * Looks up a character's category (i.e., its category for breaking purposes, + * not its Unicode category). Code points below + * Character.MIN_SUPPLEMENTARY_CODE_POINT are looked up in charCategoryTable, + * all others in supplementaryCharCategoryTable. + * @param c the code point to classify + * @return the category of c + */ + protected int lookupCategory(int c) { + if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + return charCategoryTable.elementAt((char)c); + } else { + return supplementaryCharCategoryTable.getValue(c); + } + } + + /** + * Given a current state and a character category, looks up the + * next state to transition to in the state table. The table is stored + * flat, with numCategories entries per state. + */ + protected int lookupState(int state, int category) { + return stateTable[state * numCategories + category]; + } + + /** + * Given a current state and a character category, looks up the + * next state to transition to in the backwards state table. The table is + * stored flat, with numCategories entries per state. + */ + protected int lookupBackwardState(int state, int category) { + return backwardsStateTable[state * numCategories + category]; + } + + /* + * This class exists to work around a bug in incorrect implementations + * of CharacterIterator, which incorrectly handle setIndex(endIndex). + * This iterator relies only on base.setIndex(n) where n is less than + * endIndex. + * + * One caveat: if the base iterator's begin and end indices change + * the change will not be reflected by this wrapper. Does that matter? + */ + // TODO: Review this class to see if it's still required.
+ private static final class SafeCharIterator implements CharacterIterator, + Cloneable { + + private CharacterIterator base; + private int rangeStart; + private int rangeLimit; + private int currentIndex; + + /** + * Wraps base, capturing its begin index, end index and current index + * at construction time. + */ + SafeCharIterator(CharacterIterator base) { + this.base = base; + this.rangeStart = base.getBeginIndex(); + this.rangeLimit = base.getEndIndex(); + this.currentIndex = base.getIndex(); + } + + @Override + public char first() { + return setIndex(rangeStart); + } + + /** + * NOTE(review): for an empty range (rangeStart == rangeLimit) this passes + * rangeLimit - 1 to setIndex(), which rejects it with + * IllegalArgumentException - confirm whether that case can occur. + */ + @Override + public char last() { + return setIndex(rangeLimit - 1); + } + + /** + * Returns DONE when the current index is outside the half-open range from + * rangeStart to rangeLimit; otherwise moves base to the current index and + * returns what base.setIndex() reports for it. + */ + @Override + public char current() { + if (currentIndex < rangeStart || currentIndex >= rangeLimit) { + return DONE; + } + else { + return base.setIndex(currentIndex); + } + } + + /** + * Steps forward one position. The index is capped at rangeLimit, where + * DONE is returned without touching base. + */ + @Override + public char next() { + + currentIndex++; + if (currentIndex >= rangeLimit) { + currentIndex = rangeLimit; + return DONE; + } + else { + return base.setIndex(currentIndex); + } + } + + /** + * Steps back one position. The index never goes below rangeStart; DONE is + * returned, without touching base, when it is already there. + */ + @Override + public char previous() { + + currentIndex--; + if (currentIndex < rangeStart) { + currentIndex = rangeStart; + return DONE; + } + else { + return base.setIndex(currentIndex); + } + } + + /** + * Moves to position i, which may be anywhere from rangeStart to rangeLimit + * inclusive, and returns current(). + * @throws IllegalArgumentException if i is outside that range + */ + @Override + public char setIndex(int i) { + + if (i < rangeStart || i > rangeLimit) { + throw new IllegalArgumentException("Invalid position"); + } + currentIndex = i; + return current(); + } + + @Override + public int getBeginIndex() { + return rangeStart; + } + + @Override + public int getEndIndex() { + return rangeLimit; + } + + @Override + public int getIndex() { + return currentIndex; + } + + /** + * Returns a copy of this wrapper that holds its own clone of the base + * iterator. + */ + @Override + public Object clone() { + + SafeCharIterator copy = null; + try { + copy = (SafeCharIterator) super.clone(); + } + catch(CloneNotSupportedException e) { + throw new Error("Clone not supported: " + e); + } + + CharacterIterator copyOfBase = (CharacterIterator) base.clone(); + copy.base = copyOfBase; + return copy; + } + } +} --- old/src/java.base/share/classes/sun/util/locale/provider/DictionaryBasedBreakIterator.java 2016-10-20 12:41:27.393855317 +0900 +++
/dev/null 2016-10-11 12:01:27.741739134 +0900 @@ -1,524 +0,0 @@ -/* - * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - * - * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved - * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved - * - * The original version of this source code and documentation - * is copyrighted and owned by Taligent, Inc., a wholly-owned - * subsidiary of IBM. These materials are provided under terms - * of a License Agreement between Taligent and Sun. This technology - * is protected by multiple US and International patents. - * - * This notice and attribution to Taligent may not be removed. - * Taligent is a registered trademark of Taligent, Inc. 
- */ - -package sun.util.locale.provider; - -import java.io.IOException; -import java.lang.reflect.Module; -import java.text.CharacterIterator; -import java.util.ArrayList; -import java.util.List; -import java.util.Stack; - -/** - * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary - * to further subdivide ranges of text beyond what is possible using just the - * state-table-based algorithm. This is necessary, for example, to handle - * word and line breaking in Thai, which doesn't use spaces between words. The - * state-table-based algorithm used by RuleBasedBreakIterator is used to divide - * up text as far as possible, and then contiguous ranges of letters are - * repeatedly compared against a list of known words (i.e., the dictionary) - * to divide them up into words. - * - * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator, - * but adds one more special substitution name: <dictionary>. This substitution - * name is used to identify characters in words in the dictionary. The idea is that - * if the iterator passes over a chunk of text that includes two or more characters - * in a row that are included in <dictionary>, it goes back through that range and - * derives additional break positions (if possible) using the dictionary. - * - * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary - * file. It follows a prescribed search path to locate the dictionary (right now, - * it looks for it in /com/ibm/text/resources in each directory in the classpath, - * and won't find it in JAR files, but this location is likely to change). The - * dictionary file is in a serialized binary format. We have a very primitive (and - * slow) BuildDictionaryFile utility for creating dictionary files, but aren't - * currently making it public. Contact us for help. 
- */ -class DictionaryBasedBreakIterator extends RuleBasedBreakIterator { - - /** - * a list of known words that is used to divide up contiguous ranges of letters, - * stored in a compressed, indexed, format that offers fast access - */ - private BreakDictionary dictionary; - - /** - * a list of flags indicating which character categories are contained in - * the dictionary file (this is used to determine which ranges of characters - * to apply the dictionary to) - */ - private boolean[] categoryFlags; - - /** - * a temporary hiding place for the number of dictionary characters in the - * last range passed over by next() - */ - private int dictionaryCharCount; - - /** - * when a range of characters is divided up using the dictionary, the break - * positions that are discovered are stored here, preventing us from having - * to use either the dictionary or the state table again until the iterator - * leaves this range of text - */ - private int[] cachedBreakPositions; - - /** - * if cachedBreakPositions is not null, this indicates which item in the - * cache the current iteration position refers to - */ - private int positionInCache; - - /** - * Constructs a DictionaryBasedBreakIterator. - * @param module The module where the dictionary file resides - * @param dictionaryFilename The filename of the dictionary file to use - */ - DictionaryBasedBreakIterator(Module module, String dataFile, String dictionaryFile) - throws IOException { - super(module, dataFile); - byte[] tmp = super.getAdditionalData(); - if (tmp != null) { - prepareCategoryFlags(tmp); - super.setAdditionalData(null); - } - dictionary = new BreakDictionary(module, dictionaryFile); - } - - private void prepareCategoryFlags(byte[] data) { - categoryFlags = new boolean[data.length]; - for (int i = 0; i < data.length; i++) { - categoryFlags[i] = (data[i] == (byte)1) ? 
true : false; - } - } - - @Override - public void setText(CharacterIterator newText) { - super.setText(newText); - cachedBreakPositions = null; - dictionaryCharCount = 0; - positionInCache = 0; - } - - /** - * Sets the current iteration position to the beginning of the text. - * (i.e., the CharacterIterator's starting offset). - * @return The offset of the beginning of the text. - */ - @Override - public int first() { - cachedBreakPositions = null; - dictionaryCharCount = 0; - positionInCache = 0; - return super.first(); - } - - /** - * Sets the current iteration position to the end of the text. - * (i.e., the CharacterIterator's ending offset). - * @return The text's past-the-end offset. - */ - @Override - public int last() { - cachedBreakPositions = null; - dictionaryCharCount = 0; - positionInCache = 0; - return super.last(); - } - - /** - * Advances the iterator one step backwards. - * @return The position of the last boundary position before the - * current iteration position - */ - @Override - public int previous() { - CharacterIterator text = getText(); - - // if we have cached break positions and we're still in the range - // covered by them, just move one step backward in the cache - if (cachedBreakPositions != null && positionInCache > 0) { - --positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return cachedBreakPositions[positionInCache]; - } - - // otherwise, dump the cache and use the inherited previous() method to move - // backward. This may fill up the cache with new break positions, in which - // case we have to mark our position in the cache - else { - cachedBreakPositions = null; - int result = super.previous(); - if (cachedBreakPositions != null) { - positionInCache = cachedBreakPositions.length - 2; - } - return result; - } - } - - /** - * Sets the current iteration position to the last boundary position - * before the specified position. 
- * @param offset The position to begin searching from - * @return The position of the last boundary before "offset" - */ - @Override - public int preceding(int offset) { - CharacterIterator text = getText(); - checkOffset(offset, text); - - // if we have no cached break positions, or "offset" is outside the - // range covered by the cache, we can just call the inherited routine - // (which will eventually call other routines in this class that may - // refresh the cache) - if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] || - offset > cachedBreakPositions[cachedBreakPositions.length - 1]) { - cachedBreakPositions = null; - return super.preceding(offset); - } - - // on the other hand, if "offset" is within the range covered by the cache, - // then all we have to do is search the cache for the last break position - // before "offset" - else { - positionInCache = 0; - while (positionInCache < cachedBreakPositions.length - && offset > cachedBreakPositions[positionInCache]) { - ++positionInCache; - } - --positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return text.getIndex(); - } - } - - /** - * Sets the current iteration position to the first boundary position after - * the specified position. - * @param offset The position to begin searching forward from - * @return The position of the first boundary after "offset" - */ - @Override - public int following(int offset) { - CharacterIterator text = getText(); - checkOffset(offset, text); - - // if we have no cached break positions, or if "offset" is outside the - // range covered by the cache, then dump the cache and call our - // inherited following() method. This will call other methods in this - // class that may refresh the cache. 
- if (cachedBreakPositions == null || offset < cachedBreakPositions[0] || - offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) { - cachedBreakPositions = null; - return super.following(offset); - } - - // on the other hand, if "offset" is within the range covered by the - // cache, then just search the cache for the first break position - // after "offset" - else { - positionInCache = 0; - while (positionInCache < cachedBreakPositions.length - && offset >= cachedBreakPositions[positionInCache]) { - ++positionInCache; - } - text.setIndex(cachedBreakPositions[positionInCache]); - return text.getIndex(); - } - } - - /** - * This is the implementation function for next(). - */ - @Override - protected int handleNext() { - CharacterIterator text = getText(); - - // if there are no cached break positions, or if we've just moved - // off the end of the range covered by the cache, we have to dump - // and possibly regenerate the cache - if (cachedBreakPositions == null || - positionInCache == cachedBreakPositions.length - 1) { - - // start by using the inherited handleNext() to find a tentative return - // value. 
dictionaryCharCount tells us how many dictionary characters - // we passed over on our way to the tentative return value - int startPos = text.getIndex(); - dictionaryCharCount = 0; - int result = super.handleNext(); - - // if we passed over more than one dictionary character, then we use - // divideUpDictionaryRange() to regenerate the cached break positions - // for the new range - if (dictionaryCharCount > 1 && result - startPos > 1) { - divideUpDictionaryRange(startPos, result); - } - - // otherwise, the value we got back from the inherited fuction - // is our return value, and we can dump the cache - else { - cachedBreakPositions = null; - return result; - } - } - - // if the cache of break positions has been regenerated (or existed all - // along), then just advance to the next break position in the cache - // and return it - if (cachedBreakPositions != null) { - ++positionInCache; - text.setIndex(cachedBreakPositions[positionInCache]); - return cachedBreakPositions[positionInCache]; - } - return -9999; // SHOULD NEVER GET HERE! - } - - /** - * Looks up a character category for a character. - */ - @Override - protected int lookupCategory(int c) { - // this override of lookupCategory() exists only to keep track of whether we've - // passed over any dictionary characters. It calls the inherited lookupCategory() - // to do the real work, and then checks whether its return value is one of the - // categories represented in the dictionary. If it is, bump the dictionary- - // character count. - int result = super.lookupCategory(c); - if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) { - ++dictionaryCharCount; - } - return result; - } - - /** - * This is the function that actually implements the dictionary-based - * algorithm. Given the endpoints of a range of text, it uses the - * dictionary to determine the positions of any boundaries in this - * range. 
It stores all the boundary positions it discovers in - * cachedBreakPositions so that we only have to do this work once - * for each time we enter the range. - */ - @SuppressWarnings("unchecked") - private void divideUpDictionaryRange(int startPos, int endPos) { - CharacterIterator text = getText(); - - // the range we're dividing may begin or end with non-dictionary characters - // (i.e., for line breaking, we may have leading or trailing punctuation - // that needs to be kept with the word). Seek from the beginning of the - // range to the first dictionary character - text.setIndex(startPos); - int c = getCurrent(); - int category = lookupCategory(c); - while (category == IGNORE || !categoryFlags[category]) { - c = getNext(); - category = lookupCategory(c); - } - - // initialize. We maintain two stacks: currentBreakPositions contains - // the list of break positions that will be returned if we successfully - // finish traversing the whole range now. possibleBreakPositions lists - // all other possible word ends we've passed along the way. (Whenever - // we reach an error [a sequence of characters that can't begin any word - // in the dictionary], we back up, possibly delete some breaks from - // currentBreakPositions, move a break from possibleBreakPositions - // to currentBreakPositions, and start over from there. This process - // continues in this way until we either successfully make it all the way - // across the range, or exhaust all of our combinations of break - // positions.) - Stack currentBreakPositions = new Stack<>(); - Stack possibleBreakPositions = new Stack<>(); - List wrongBreakPositions = new ArrayList<>(); - - // the dictionary is implemented as a trie, which is treated as a state - // machine. -1 represents the end of a legal word. Every word in the - // dictionary is represented by a path from the root node to -1. A path - // that ends in state 0 is an illegal combination of characters. 
- int state = 0; - - // these two variables are used for error handling. We keep track of the - // farthest we've gotten through the range being divided, and the combination - // of breaks that got us that far. If we use up all possible break - // combinations, the text contains an error or a word that's not in the - // dictionary. In this case, we "bless" the break positions that got us the - // farthest as real break positions, and then start over from scratch with - // the character where the error occurred. - int farthestEndPoint = text.getIndex(); - Stack bestBreakPositions = null; - - // initialize (we always exit the loop with a break statement) - c = getCurrent(); - while (true) { - - // if we can transition to state "-1" from our current state, we're - // on the last character of a legal word. Push that position onto - // the possible-break-positions stack - if (dictionary.getNextState(state, 0) == -1) { - possibleBreakPositions.push(text.getIndex()); - } - - // look up the new state to transition to in the dictionary - state = dictionary.getNextStateFromCharacter(state, c); - - // if the character we're sitting on causes us to transition to - // the "end of word" state, then it was a non-dictionary character - // and we've successfully traversed the whole range. Drop out - // of the loop. - if (state == -1) { - currentBreakPositions.push(text.getIndex()); - break; - } - - // if the character we're sitting on causes us to transition to - // the error state, or if we've gone off the end of the range - // without transitioning to the "end of word" state, we've hit - // an error... 
- else if (state == 0 || text.getIndex() >= endPos) { - - // if this is the farthest we've gotten, take note of it in - // case there's an error in the text - if (text.getIndex() > farthestEndPoint) { - farthestEndPoint = text.getIndex(); - - @SuppressWarnings("unchecked") - Stack currentBreakPositionsCopy = (Stack) currentBreakPositions.clone(); - - bestBreakPositions = currentBreakPositionsCopy; - } - - // wrongBreakPositions is a list of all break positions - // we've tried starting that didn't allow us to traverse - // all the way through the text. Every time we pop a - // break position off of currentBreakPositions, we put it - // into wrongBreakPositions to avoid trying it again later. - // If we make it to this spot, we're either going to back - // up to a break in possibleBreakPositions and try starting - // over from there, or we've exhausted all possible break - // positions and are going to do the fallback procedure. - // This loop prevents us from messing with anything in - // possibleBreakPositions that didn't work as a starting - // point the last time we tried it (this is to prevent a bunch of - // repetitive checks from slowing down some extreme cases) - while (!possibleBreakPositions.isEmpty() - && wrongBreakPositions.contains(possibleBreakPositions.peek())) { - possibleBreakPositions.pop(); - } - - // if we've used up all possible break-position combinations, there's - // an error or an unknown word in the text. 
In this case, we start - // over, treating the farthest character we've reached as the beginning - // of the range, and "blessing" the break positions that got us that - // far as real break positions - if (possibleBreakPositions.isEmpty()) { - if (bestBreakPositions != null) { - currentBreakPositions = bestBreakPositions; - if (farthestEndPoint < endPos) { - text.setIndex(farthestEndPoint + 1); - } - else { - break; - } - } - else { - if ((currentBreakPositions.size() == 0 || - currentBreakPositions.peek().intValue() != text.getIndex()) - && text.getIndex() != startPos) { - currentBreakPositions.push(text.getIndex()); - } - getNext(); - currentBreakPositions.push(text.getIndex()); - } - } - - // if we still have more break positions we can try, then promote the - // last break in possibleBreakPositions into currentBreakPositions, - // and get rid of all entries in currentBreakPositions that come after - // it. Then back up to that position and start over from there (i.e., - // treat that position as the beginning of a new word) - else { - Integer temp = possibleBreakPositions.pop(); - Integer temp2 = null; - while (!currentBreakPositions.isEmpty() && temp.intValue() < - currentBreakPositions.peek().intValue()) { - temp2 = currentBreakPositions.pop(); - wrongBreakPositions.add(temp2); - } - currentBreakPositions.push(temp); - text.setIndex(currentBreakPositions.peek().intValue()); - } - - // re-sync "c" for the next go-round, and drop out of the loop if - // we've made it off the end of the range - c = getCurrent(); - if (text.getIndex() >= endPos) { - break; - } - } - - // if we didn't hit any exceptional conditions on this last iteration, - // just advance to the next character and loop - else { - c = getNext(); - } - } - - // dump the last break position in the list, and replace it with the actual - // end of the range (which may be the same character, or may be further on - // because the range actually ended with non-dictionary characters we want to - // keep 
with the word) - if (!currentBreakPositions.isEmpty()) { - currentBreakPositions.pop(); - } - currentBreakPositions.push(endPos); - - // create a regular array to hold the break positions and copy - // the break positions from the stack to the array (in addition, - // our starting position goes into this array as a break position). - // This array becomes the cache of break positions used by next() - // and previous(), so this is where we actually refresh the cache. - cachedBreakPositions = new int[currentBreakPositions.size() + 1]; - cachedBreakPositions[0] = startPos; - - for (int i = 0; i < currentBreakPositions.size(); i++) { - cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue(); - } - positionInCache = 0; - } -} --- /dev/null 2016-10-11 12:01:27.741739134 +0900 +++ new/src/java.base/share/classes/sun/text/DictionaryBasedBreakIterator.java 2016-10-20 12:41:27.249852091 +0900 @@ -0,0 +1,526 @@ +/* + * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved + * + * The original version of this source code and documentation + * is copyrighted and owned by Taligent, Inc., a wholly-owned + * subsidiary of IBM. These materials are provided under terms + * of a License Agreement between Taligent and Sun. This technology + * is protected by multiple US and International patents. + * + * This notice and attribution to Taligent may not be removed. + * Taligent is a registered trademark of Taligent, Inc. + */ + +package sun.text; + +import java.text.CharacterIterator; +import java.util.ArrayList; +import java.util.List; +import java.util.Stack; + +/** + * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary + * to further subdivide ranges of text beyond what is possible using just the + * state-table-based algorithm. This is necessary, for example, to handle + * word and line breaking in Thai, which doesn't use spaces between words. The + * state-table-based algorithm used by RuleBasedBreakIterator is used to divide + * up text as far as possible, and then contiguous ranges of letters are + * repeatedly compared against a list of known words (i.e., the dictionary) + * to divide them up into words. + * + * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator, + * but adds one more special substitution name: <dictionary>. This substitution + * name is used to identify characters in words in the dictionary. The idea is that + * if the iterator passes over a chunk of text that includes two or more characters + * in a row that are included in <dictionary>, it goes back through that range and + * derives additional break positions (if possible) using the dictionary. 
+ * + * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary + * file. It follows a prescribed search path to locate the dictionary (right now, + * it looks for it in /com/ibm/text/resources in each directory in the classpath, + * and won't find it in JAR files, but this location is likely to change). The + * dictionary file is in a serialized binary format. We have a very primitive (and + * slow) BuildDictionaryFile utility for creating dictionary files, but aren't + * currently making it public. Contact us for help. + */ +public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator { + + /** + * a list of known words that is used to divide up contiguous ranges of letters, + * stored in a compressed, indexed, format that offers fast access + */ + private BreakDictionary dictionary; + + /** + * a list of flags indicating which character categories are contained in + * the dictionary file (this is used to determine which ranges of characters + * to apply the dictionary to) + */ + private boolean[] categoryFlags; + + /** + * a temporary hiding place for the number of dictionary characters in the + * last range passed over by next() + */ + private int dictionaryCharCount; + + /** + * when a range of characters is divided up using the dictionary, the break + * positions that are discovered are stored here, preventing us from having + * to use either the dictionary or the state table again until the iterator + * leaves this range of text + */ + private int[] cachedBreakPositions; + + /** + * if cachedBreakPositions is not null, this indicates which item in the + * cache the current iteration position refers to + */ + private int positionInCache; + + /** + * Constructs a DictionaryBasedBreakIterator. 
+ * + * @param ruleFile the name of the rule data file + * @param ruleData the rule data loaded from the rule data file + * @param dictionaryFile the name of the dictionary file + * @param dictionaryData the dictionary data loaded from the dictionary file + * @throws MissingResourceException if rule data or dictionary initialization failed + */ + public DictionaryBasedBreakIterator(String ruleFile, byte[] ruleData, + String dictionaryFile, byte[] dictionaryData) { + super(ruleFile, ruleData); + byte[] tmp = super.getAdditionalData(); + if (tmp != null) { + prepareCategoryFlags(tmp); + super.setAdditionalData(null); + } + dictionary = new BreakDictionary(dictionaryFile, dictionaryData); + } + + private void prepareCategoryFlags(byte[] data) { + categoryFlags = new boolean[data.length]; + for (int i = 0; i < data.length; i++) { + categoryFlags[i] = (data[i] == (byte)1) ? true : false; + } + } + + @Override + public void setText(CharacterIterator newText) { + super.setText(newText); + cachedBreakPositions = null; + dictionaryCharCount = 0; + positionInCache = 0; + } + + /** + * Sets the current iteration position to the beginning of the text. + * (i.e., the CharacterIterator's starting offset). + * @return The offset of the beginning of the text. + */ + @Override + public int first() { + cachedBreakPositions = null; + dictionaryCharCount = 0; + positionInCache = 0; + return super.first(); + } + + /** + * Sets the current iteration position to the end of the text. + * (i.e., the CharacterIterator's ending offset). + * @return The text's past-the-end offset. + */ + @Override + public int last() { + cachedBreakPositions = null; + dictionaryCharCount = 0; + positionInCache = 0; + return super.last(); + } + + /** + * Advances the iterator one step backwards. 
+ * @return The position of the last boundary position before the + * current iteration position + */ + @Override + public int previous() { + CharacterIterator text = getText(); + + // if we have cached break positions and we're still in the range + // covered by them, just move one step backward in the cache + if (cachedBreakPositions != null && positionInCache > 0) { + --positionInCache; + text.setIndex(cachedBreakPositions[positionInCache]); + return cachedBreakPositions[positionInCache]; + } + + // otherwise, dump the cache and use the inherited previous() method to move + // backward. This may fill up the cache with new break positions, in which + // case we have to mark our position in the cache + else { + cachedBreakPositions = null; + int result = super.previous(); + if (cachedBreakPositions != null) { + positionInCache = cachedBreakPositions.length - 2; + } + return result; + } + } + + /** + * Sets the current iteration position to the last boundary position + * before the specified position. 
+ * @param offset The position to begin searching from + * @return The position of the last boundary before "offset" + */ + @Override + public int preceding(int offset) { + CharacterIterator text = getText(); + checkOffset(offset, text); + + // if we have no cached break positions, or "offset" is outside the + // range covered by the cache, we can just call the inherited routine + // (which will eventually call other routines in this class that may + // refresh the cache) + if (cachedBreakPositions == null || offset <= cachedBreakPositions[0] || + offset > cachedBreakPositions[cachedBreakPositions.length - 1]) { + cachedBreakPositions = null; + return super.preceding(offset); + } + + // on the other hand, if "offset" is within the range covered by the cache, + // then all we have to do is search the cache for the last break position + // before "offset" + else { + positionInCache = 0; + while (positionInCache < cachedBreakPositions.length + && offset > cachedBreakPositions[positionInCache]) { + ++positionInCache; + } + --positionInCache; + text.setIndex(cachedBreakPositions[positionInCache]); + return text.getIndex(); + } + } + + /** + * Sets the current iteration position to the first boundary position after + * the specified position. + * @param offset The position to begin searching forward from + * @return The position of the first boundary after "offset" + */ + @Override + public int following(int offset) { + CharacterIterator text = getText(); + checkOffset(offset, text); + + // if we have no cached break positions, or if "offset" is outside the + // range covered by the cache, then dump the cache and call our + // inherited following() method. This will call other methods in this + // class that may refresh the cache. 
+ if (cachedBreakPositions == null || offset < cachedBreakPositions[0] || + offset >= cachedBreakPositions[cachedBreakPositions.length - 1]) { + cachedBreakPositions = null; + return super.following(offset); + } + + // on the other hand, if "offset" is within the range covered by the + // cache, then just search the cache for the first break position + // after "offset" + else { + positionInCache = 0; + while (positionInCache < cachedBreakPositions.length + && offset >= cachedBreakPositions[positionInCache]) { + ++positionInCache; + } + text.setIndex(cachedBreakPositions[positionInCache]); + return text.getIndex(); + } + } + + /** + * This is the implementation function for next(). + */ + @Override + protected int handleNext() { + CharacterIterator text = getText(); + + // if there are no cached break positions, or if we've just moved + // off the end of the range covered by the cache, we have to dump + // and possibly regenerate the cache + if (cachedBreakPositions == null || + positionInCache == cachedBreakPositions.length - 1) { + + // start by using the inherited handleNext() to find a tentative return + // value. 
dictionaryCharCount tells us how many dictionary characters + // we passed over on our way to the tentative return value + int startPos = text.getIndex(); + dictionaryCharCount = 0; + int result = super.handleNext(); + + // if we passed over more than one dictionary character, then we use + // divideUpDictionaryRange() to regenerate the cached break positions + // for the new range + if (dictionaryCharCount > 1 && result - startPos > 1) { + divideUpDictionaryRange(startPos, result); + } + + // otherwise, the value we got back from the inherited function + // is our return value, and we can dump the cache + else { + cachedBreakPositions = null; + return result; + } + } + + // if the cache of break positions has been regenerated (or existed all + // along), then just advance to the next break position in the cache + // and return it + if (cachedBreakPositions != null) { + ++positionInCache; + text.setIndex(cachedBreakPositions[positionInCache]); + return cachedBreakPositions[positionInCache]; + } + return -9999; // SHOULD NEVER GET HERE! + } + + /** + * Looks up a character category for a character. + */ + @Override + protected int lookupCategory(int c) { + // this override of lookupCategory() exists only to keep track of whether we've + // passed over any dictionary characters. It calls the inherited lookupCategory() + // to do the real work, and then checks whether its return value is one of the + // categories represented in the dictionary. If it is, bump the dictionary- + // character count. + int result = super.lookupCategory(c); + if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) { + ++dictionaryCharCount; + } + return result; + } + + /** + * This is the function that actually implements the dictionary-based + * algorithm. Given the endpoints of a range of text, it uses the + * dictionary to determine the positions of any boundaries in this + * range. 
It stores all the boundary positions it discovers in + * cachedBreakPositions so that we only have to do this work once + * for each time we enter the range. + */ + @SuppressWarnings("unchecked") + private void divideUpDictionaryRange(int startPos, int endPos) { + CharacterIterator text = getText(); + + // the range we're dividing may begin or end with non-dictionary characters + // (i.e., for line breaking, we may have leading or trailing punctuation + // that needs to be kept with the word). Seek from the beginning of the + // range to the first dictionary character + text.setIndex(startPos); + int c = getCurrent(); + int category = lookupCategory(c); + while (category == IGNORE || !categoryFlags[category]) { + c = getNext(); + category = lookupCategory(c); + } + + // initialize. We maintain two stacks: currentBreakPositions contains + // the list of break positions that will be returned if we successfully + // finish traversing the whole range now. possibleBreakPositions lists + // all other possible word ends we've passed along the way. (Whenever + // we reach an error [a sequence of characters that can't begin any word + // in the dictionary], we back up, possibly delete some breaks from + // currentBreakPositions, move a break from possibleBreakPositions + // to currentBreakPositions, and start over from there. This process + // continues in this way until we either successfully make it all the way + // across the range, or exhaust all of our combinations of break + // positions.) + Stack<Integer> currentBreakPositions = new Stack<>(); + Stack<Integer> possibleBreakPositions = new Stack<>(); + List<Integer> wrongBreakPositions = new ArrayList<>(); + + // the dictionary is implemented as a trie, which is treated as a state + // machine. -1 represents the end of a legal word. Every word in the + // dictionary is represented by a path from the root node to -1. A path + // that ends in state 0 is an illegal combination of characters. 
+ int state = 0; + + // these two variables are used for error handling. We keep track of the + // farthest we've gotten through the range being divided, and the combination + // of breaks that got us that far. If we use up all possible break + // combinations, the text contains an error or a word that's not in the + // dictionary. In this case, we "bless" the break positions that got us the + // farthest as real break positions, and then start over from scratch with + // the character where the error occurred. + int farthestEndPoint = text.getIndex(); + Stack<Integer> bestBreakPositions = null; + + // initialize (we always exit the loop with a break statement) + c = getCurrent(); + while (true) { + + // if we can transition to state "-1" from our current state, we're + // on the last character of a legal word. Push that position onto + // the possible-break-positions stack + if (dictionary.getNextState(state, 0) == -1) { + possibleBreakPositions.push(text.getIndex()); + } + + // look up the new state to transition to in the dictionary + state = dictionary.getNextStateFromCharacter(state, c); + + // if the character we're sitting on causes us to transition to + // the "end of word" state, then it was a non-dictionary character + // and we've successfully traversed the whole range. Drop out + // of the loop. + if (state == -1) { + currentBreakPositions.push(text.getIndex()); + break; + } + + // if the character we're sitting on causes us to transition to + // the error state, or if we've gone off the end of the range + // without transitioning to the "end of word" state, we've hit + // an error... 
+ else if (state == 0 || text.getIndex() >= endPos) { + + // if this is the farthest we've gotten, take note of it in + // case there's an error in the text + if (text.getIndex() > farthestEndPoint) { + farthestEndPoint = text.getIndex(); + + @SuppressWarnings("unchecked") + Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone(); + + bestBreakPositions = currentBreakPositionsCopy; + } + + // wrongBreakPositions is a list of all break positions + // we've tried starting that didn't allow us to traverse + // all the way through the text. Every time we pop a + // break position off of currentBreakPositions, we put it + // into wrongBreakPositions to avoid trying it again later. + // If we make it to this spot, we're either going to back + // up to a break in possibleBreakPositions and try starting + // over from there, or we've exhausted all possible break + // positions and are going to do the fallback procedure. + // This loop prevents us from messing with anything in + // possibleBreakPositions that didn't work as a starting + // point the last time we tried it (this is to prevent a bunch of + // repetitive checks from slowing down some extreme cases) + while (!possibleBreakPositions.isEmpty() + && wrongBreakPositions.contains(possibleBreakPositions.peek())) { + possibleBreakPositions.pop(); + } + + // if we've used up all possible break-position combinations, there's + // an error or an unknown word in the text. 
In this case, we start + // over, treating the farthest character we've reached as the beginning + // of the range, and "blessing" the break positions that got us that + // far as real break positions + if (possibleBreakPositions.isEmpty()) { + if (bestBreakPositions != null) { + currentBreakPositions = bestBreakPositions; + if (farthestEndPoint < endPos) { + text.setIndex(farthestEndPoint + 1); + } + else { + break; + } + } + else { + if ((currentBreakPositions.size() == 0 || + currentBreakPositions.peek().intValue() != text.getIndex()) + && text.getIndex() != startPos) { + currentBreakPositions.push(text.getIndex()); + } + getNext(); + currentBreakPositions.push(text.getIndex()); + } + } + + // if we still have more break positions we can try, then promote the + // last break in possibleBreakPositions into currentBreakPositions, + // and get rid of all entries in currentBreakPositions that come after + // it. Then back up to that position and start over from there (i.e., + // treat that position as the beginning of a new word) + else { + Integer temp = possibleBreakPositions.pop(); + Integer temp2 = null; + while (!currentBreakPositions.isEmpty() && temp.intValue() < + currentBreakPositions.peek().intValue()) { + temp2 = currentBreakPositions.pop(); + wrongBreakPositions.add(temp2); + } + currentBreakPositions.push(temp); + text.setIndex(currentBreakPositions.peek().intValue()); + } + + // re-sync "c" for the next go-round, and drop out of the loop if + // we've made it off the end of the range + c = getCurrent(); + if (text.getIndex() >= endPos) { + break; + } + } + + // if we didn't hit any exceptional conditions on this last iteration, + // just advance to the next character and loop + else { + c = getNext(); + } + } + + // dump the last break position in the list, and replace it with the actual + // end of the range (which may be the same character, or may be further on + // because the range actually ended with non-dictionary characters we want to + // keep 
with the word) + if (!currentBreakPositions.isEmpty()) { + currentBreakPositions.pop(); + } + currentBreakPositions.push(endPos); + + // create a regular array to hold the break positions and copy + // the break positions from the stack to the array (in addition, + // our starting position goes into this array as a break position). + // This array becomes the cache of break positions used by next() + // and previous(), so this is where we actually refresh the cache. + cachedBreakPositions = new int[currentBreakPositions.size() + 1]; + cachedBreakPositions[0] = startPos; + + for (int i = 0; i < currentBreakPositions.size(); i++) { + cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue(); + } + positionInCache = 0; + } +} --- old/src/java.base/share/classes/sun/util/locale/provider/BreakDictionary.java 2016-10-20 12:41:27.693862038 +0900 +++ /dev/null 2016-10-11 12:01:27.741739134 +0900 @@ -1,352 +0,0 @@ -/* - * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - * - * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved - * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved - * - * The original version of this source code and documentation - * is copyrighted and owned by Taligent, Inc., a wholly-owned - * subsidiary of IBM. These materials are provided under terms - * of a License Agreement between Taligent and Sun. This technology - * is protected by multiple US and International patents. - * - * This notice and attribution to Taligent may not be removed. - * Taligent is a registered trademark of Taligent, Inc. - */ -package sun.util.locale.provider; - -import java.io.BufferedInputStream; -import java.io.InputStream; -import java.io.IOException; -import java.lang.reflect.Module; -import java.security.AccessController; -import java.security.PrivilegedActionException; -import java.security.PrivilegedExceptionAction; -import java.util.MissingResourceException; -import sun.text.CompactByteArray; -import sun.text.SupplementaryCharacterData; - -/** - * This is the class that represents the list of known words used by - * DictionaryBasedBreakIterator. The conceptual data structure used - * here is a trie: there is a node hanging off the root node for every - * letter that can start a word. Each of these nodes has a node hanging - * off of it for every letter that can be the second letter of a word - * if this node is the first letter, and so on. The trie is represented - * as a two-dimensional array that can be treated as a table of state - * transitions. Indexes are used to compress this array, taking - * advantage of the fact that this array will always be very sparse. 
- */ -class BreakDictionary { - - //========================================================================= - // data members - //========================================================================= - - /** - * The version of the dictionary that was read in. - */ - private static int supportedVersion = 1; - - /** - * Maps from characters to column numbers. The main use of this is to - * avoid making room in the array for empty columns. - */ - private CompactByteArray columnMap = null; - private SupplementaryCharacterData supplementaryCharColumnMap = null; - - /** - * The number of actual columns in the table - */ - private int numCols; - - /** - * Columns are organized into groups of 32. This says how many - * column groups. (We could calculate this, but we store the - * value to avoid having to repeatedly calculate it.) - */ - private int numColGroups; - - /** - * The actual compressed state table. Each conceptual row represents - * a state, and the cells in it contain the row numbers of the states - * to transition to for each possible letter. 0 is used to indicate - * an illegal combination of letters (i.e., the error state). The - * table is compressed by eliminating all the unpopulated (i.e., zero) - * cells. Multiple conceptual rows can then be doubled up in a single - * physical row by sliding them up and possibly shifting them to one - * side or the other so the populated cells don't collide. Indexes - * are used to identify unpopulated cells and to locate populated cells. - */ - private short[] table = null; - - /** - * This index maps logical row numbers to physical row numbers - */ - private short[] rowIndex = null; - - /** - * A bitmap is used to tell which cells in the comceptual table are - * populated. This array contains all the unique bit combinations - * in that bitmap. If the table is more than 32 columns wide, - * successive entries in this array are used for a single row. 
- */ - private int[] rowIndexFlags = null; - - /** - * This index maps from a logical row number into the bitmap table above. - * (This keeps us from storing duplicate bitmap combinations.) Since there - * are a lot of rows with only one populated cell, instead of wasting space - * in the bitmap table, we just store a negative number in this index for - * rows with one populated cell. The absolute value of that number is - * the column number of the populated cell. - */ - private short[] rowIndexFlagsIndex = null; - - /** - * For each logical row, this index contains a constant that is added to - * the logical column number to get the physical column number - */ - private byte[] rowIndexShifts = null; - - //========================================================================= - // deserialization - //========================================================================= - - BreakDictionary(Module module, String dictionaryName) - throws IOException, MissingResourceException { - - readDictionaryFile(module, dictionaryName); - } - - private void readDictionaryFile(final Module module, final String dictionaryName) - throws IOException, MissingResourceException { - - BufferedInputStream in; - try { - PrivilegedExceptionAction pa = () -> { - String pathName = "jdk.localedata".equals(module.getName()) ? - "sun/text/resources/ext/" : - "sun/text/resources/"; - InputStream is = module.getResourceAsStream(pathName + dictionaryName); - if (is == null) { - // Try to load the file with "java.base" module instance. Assumption - // here is that the fall back data files to be read should reside in - // java.base. 
- is = BreakDictionary.class.getModule().getResourceAsStream("sun/text/resources/" + dictionaryName); - } - - return new BufferedInputStream(is); - }; - in = AccessController.doPrivileged(pa); - } - catch (PrivilegedActionException e) { - throw new InternalError(e.toString(), e); - } - - byte[] buf = new byte[8]; - if (in.read(buf) != 8) { - throw new MissingResourceException("Wrong data length", - dictionaryName, ""); - } - - // check version - int version = RuleBasedBreakIterator.getInt(buf, 0); - if (version != supportedVersion) { - throw new MissingResourceException("Dictionary version(" + version + ") is unsupported", - dictionaryName, ""); - } - - // get data size - int len = RuleBasedBreakIterator.getInt(buf, 4); - buf = new byte[len]; - if (in.read(buf) != len) { - throw new MissingResourceException("Wrong data length", - dictionaryName, ""); - } - - // close the stream - in.close(); - - int l; - int offset = 0; - - // read in the column map for BMP characteres (this is serialized in - // its internal form: an index array followed by a data array) - l = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - short[] temp = new short[l]; - for (int i = 0; i < l; i++, offset+=2) { - temp[i] = RuleBasedBreakIterator.getShort(buf, offset); - } - l = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - byte[] temp2 = new byte[l]; - for (int i = 0; i < l; i++, offset++) { - temp2[i] = buf[offset]; - } - columnMap = new CompactByteArray(temp, temp2); - - // read in numCols and numColGroups - numCols = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - numColGroups = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - - // read in the row-number index - l = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - rowIndex = new short[l]; - for (int i = 0; i < l; i++, offset+=2) { - rowIndex[i] = RuleBasedBreakIterator.getShort(buf, offset); - } - - // load in the populated-cells bitmap: index first, then bitmap list - l = 
RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - rowIndexFlagsIndex = new short[l]; - for (int i = 0; i < l; i++, offset+=2) { - rowIndexFlagsIndex[i] = RuleBasedBreakIterator.getShort(buf, offset); - } - l = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - rowIndexFlags = new int[l]; - for (int i = 0; i < l; i++, offset+=4) { - rowIndexFlags[i] = RuleBasedBreakIterator.getInt(buf, offset); - } - - // load in the row-shift index - l = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - rowIndexShifts = new byte[l]; - for (int i = 0; i < l; i++, offset++) { - rowIndexShifts[i] = buf[offset]; - } - - // load in the actual state table - l = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - table = new short[l]; - for (int i = 0; i < l; i++, offset+=2) { - table[i] = RuleBasedBreakIterator.getShort(buf, offset); - } - - // finally, prepare the column map for supplementary characters - l = RuleBasedBreakIterator.getInt(buf, offset); - offset += 4; - int[] temp3 = new int[l]; - for (int i = 0; i < l; i++, offset+=4) { - temp3[i] = RuleBasedBreakIterator.getInt(buf, offset); - } - supplementaryCharColumnMap = new SupplementaryCharacterData(temp3); - } - - //========================================================================= - // access to the words - //========================================================================= - - /** - * Uses the column map to map the character to a column number, then - * passes the row and column number to getNextState() - * @param row The current state - * @param ch The character whose column we're interested in - * @return The new state to transition to - */ - public final short getNextStateFromCharacter(int row, int ch) { - int col; - if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { - col = columnMap.elementAt((char)ch); - } else { - col = supplementaryCharColumnMap.getValue(ch); - } - return getNextState(row, col); - } - - /** - * Returns the value in the cell with the specified 
(logical) row and - * column numbers. In DictionaryBasedBreakIterator, the row number is - * a state number, the column number is an input, and the return value - * is the row number of the new state to transition to. (0 is the - * "error" state, and -1 is the "end of word" state in a dictionary) - * @param row The row number of the current state - * @param col The column number of the input character (0 means "not a - * dictionary character") - * @return The row number of the new state to transition to - */ - public final short getNextState(int row, int col) { - if (cellIsPopulated(row, col)) { - // we map from logical to physical row number by looking up the - // mapping in rowIndex; we map from logical column number to - // physical column number by looking up a shift value for this - // logical row and offsetting the logical column number by - // the shift amount. Then we can use internalAt() to actually - // get the value out of the table. - return internalAt(rowIndex[row], col + rowIndexShifts[row]); - } - else { - return 0; - } - } - - /** - * Given (logical) row and column numbers, returns true if the - * cell in that position is populated - */ - private boolean cellIsPopulated(int row, int col) { - // look up the entry in the bitmap index for the specified row. - // If it's a negative number, it's the column number of the only - // populated cell in the row - if (rowIndexFlagsIndex[row] < 0) { - return col == -rowIndexFlagsIndex[row]; - } - - // if it's a positive number, it's the offset of an entry in the bitmap - // list. If the table is more than 32 columns wide, the bitmap is stored - // successive entries in the bitmap list, so we have to divide the column - // number by 32 and offset the number we got out of the index by the result. - // Once we have the appropriate piece of the bitmap, test the appropriate - // bit and return the result. 
- else { - int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; - return (flags & (1 << (col & 0x1f))) != 0; - } - } - - /** - * Implementation of getNextState() when we know the specified cell is - * populated. - * @param row The PHYSICAL row number of the cell - * @param col The PHYSICAL column number of the cell - * @return The value stored in the cell - */ - private short internalAt(int row, int col) { - // the table is a one-dimensional array, so this just does the math necessary - // to treat it as a two-dimensional array (we don't just use a two-dimensional - // array because two-dimensional arrays are inefficient in Java) - return table[row * numCols + col]; - } -} --- /dev/null 2016-10-11 12:01:27.741739134 +0900 +++ new/src/java.base/share/classes/sun/text/BreakDictionary.java 2016-10-20 12:41:27.557858990 +0900 @@ -0,0 +1,306 @@ +/* + * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved + * + * The original version of this source code and documentation + * is copyrighted and owned by Taligent, Inc., a wholly-owned + * subsidiary of IBM. These materials are provided under terms + * of a License Agreement between Taligent and Sun. This technology + * is protected by multiple US and International patents. + * + * This notice and attribution to Taligent may not be removed. + * Taligent is a registered trademark of Taligent, Inc. + */ +package sun.text; + +import java.nio.BufferUnderflowException; +import java.nio.ByteBuffer; +import java.util.MissingResourceException; +import sun.text.CompactByteArray; +import sun.text.SupplementaryCharacterData; + +/** + * This is the class that represents the list of known words used by + * DictionaryBasedBreakIterator. The conceptual data structure used + * here is a trie: there is a node hanging off the root node for every + * letter that can start a word. Each of these nodes has a node hanging + * off of it for every letter that can be the second letter of a word + * if this node is the first letter, and so on. The trie is represented + * as a two-dimensional array that can be treated as a table of state + * transitions. Indexes are used to compress this array, taking + * advantage of the fact that this array will always be very sparse. + */ +class BreakDictionary { + + //========================================================================= + // data members + //========================================================================= + + /** + * The version of the dictionary that was read in. + */ + private static int supportedVersion = 1; + + /** + * Maps from characters to column numbers. 
The main use of this is to + * avoid making room in the array for empty columns. + */ + private CompactByteArray columnMap = null; + private SupplementaryCharacterData supplementaryCharColumnMap = null; + + /** + * The number of actual columns in the table + */ + private int numCols; + + /** + * Columns are organized into groups of 32. This says how many + * column groups. (We could calculate this, but we store the + * value to avoid having to repeatedly calculate it.) + */ + private int numColGroups; + + /** + * The actual compressed state table. Each conceptual row represents + * a state, and the cells in it contain the row numbers of the states + * to transition to for each possible letter. 0 is used to indicate + * an illegal combination of letters (i.e., the error state). The + * table is compressed by eliminating all the unpopulated (i.e., zero) + * cells. Multiple conceptual rows can then be doubled up in a single + * physical row by sliding them up and possibly shifting them to one + * side or the other so the populated cells don't collide. Indexes + * are used to identify unpopulated cells and to locate populated cells. + */ + private short[] table = null; + + /** + * This index maps logical row numbers to physical row numbers + */ + private short[] rowIndex = null; + + /** + * A bitmap is used to tell which cells in the conceptual table are + * populated. This array contains all the unique bit combinations + * in that bitmap. If the table is more than 32 columns wide, + * successive entries in this array are used for a single row. + */ + private int[] rowIndexFlags = null; + + /** + * This index maps from a logical row number into the bitmap table above. + * (This keeps us from storing duplicate bitmap combinations.) Since there + * are a lot of rows with only one populated cell, instead of wasting space + * in the bitmap table, we just store a negative number in this index for + * rows with one populated cell.
The absolute value of that number is + * the column number of the populated cell. + */ + private short[] rowIndexFlagsIndex = null; + + /** + * For each logical row, this index contains a constant that is added to + * the logical column number to get the physical column number + */ + private byte[] rowIndexShifts = null; + + //========================================================================= + // deserialization + //========================================================================= + + BreakDictionary(String dictionaryName, byte[] dictionaryData) { + try { + setupDictionary(dictionaryName, dictionaryData); + } catch (BufferUnderflowException bue) { + MissingResourceException e; + e = new MissingResourceException("Corrupted dictionary data", + dictionaryName, ""); + e.initCause(bue); + throw e; + } + } + + private void setupDictionary(String dictionaryName, byte[] dictionaryData) { + ByteBuffer bb = ByteBuffer.wrap(dictionaryData); + + // check version + int version = bb.getInt(); + if (version != supportedVersion) { + throw new MissingResourceException("Dictionary version(" + version + ") is unsupported", + dictionaryName, ""); + } + + // Check data size + int len = bb.getInt(); + if (bb.position() + len != bb.limit()) { + throw new MissingResourceException("Dictionary size is wrong: " + bb.limit(), + dictionaryName, ""); + } + + // read in the column map for BMP characters (this is serialized in + // its internal form: an index array followed by a data array) + len = bb.getInt(); + short[] temp = new short[len]; + for (int i = 0; i < len; i++) { + temp[i] = bb.getShort(); + } + len = bb.getInt(); + byte[] temp2 = new byte[len]; + bb.get(temp2); + columnMap = new CompactByteArray(temp, temp2); + + // read in numCols and numColGroups + numCols = bb.getInt(); + numColGroups = bb.getInt(); + + // read in the row-number index + len = bb.getInt(); + rowIndex = new short[len]; + for (int i = 0; i < len; i++) { + rowIndex[i] = bb.getShort(); + } + + //
load in the populated-cells bitmap: index first, then bitmap list + len = bb.getInt(); + rowIndexFlagsIndex = new short[len]; + for (int i = 0; i < len; i++) { + rowIndexFlagsIndex[i] = bb.getShort(); + } + len = bb.getInt(); + rowIndexFlags = new int[len]; + for (int i = 0; i < len; i++) { + rowIndexFlags[i] = bb.getInt(); + } + + // load in the row-shift index + len = bb.getInt(); + rowIndexShifts = new byte[len]; + bb.get(rowIndexShifts); + + // load in the actual state table + len = bb.getInt(); + table = new short[len]; + for (int i = 0; i < len; i++) { + table[i] = bb.getShort(); + } + + // finally, prepare the column map for supplementary characters + len = bb.getInt(); + int[] temp3 = new int[len]; + for (int i = 0; i < len; i++) { + temp3[i] = bb.getInt(); + } + assert bb.position() == bb.limit(); + + supplementaryCharColumnMap = new SupplementaryCharacterData(temp3); + } + + //========================================================================= + // access to the words + //========================================================================= + + /** + * Uses the column map to map the character to a column number, then + * passes the row and column number to getNextState() + * @param row The current state + * @param ch The character whose column we're interested in + * @return The new state to transition to + */ + public final short getNextStateFromCharacter(int row, int ch) { + int col; + if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + col = columnMap.elementAt((char)ch); + } else { + col = supplementaryCharColumnMap.getValue(ch); + } + return getNextState(row, col); + } + + /** + * Returns the value in the cell with the specified (logical) row and + * column numbers. In DictionaryBasedBreakIterator, the row number is + * a state number, the column number is an input, and the return value + * is the row number of the new state to transition to. 
(0 is the + * "error" state, and -1 is the "end of word" state in a dictionary) + * @param row The row number of the current state + * @param col The column number of the input character (0 means "not a + * dictionary character") + * @return The row number of the new state to transition to + */ + public final short getNextState(int row, int col) { + if (cellIsPopulated(row, col)) { + // we map from logical to physical row number by looking up the + // mapping in rowIndex; we map from logical column number to + // physical column number by looking up a shift value for this + // logical row and offsetting the logical column number by + // the shift amount. Then we can use internalAt() to actually + // get the value out of the table. + return internalAt(rowIndex[row], col + rowIndexShifts[row]); + } + else { + return 0; + } + } + + /** + * Given (logical) row and column numbers, returns true if the + * cell in that position is populated + */ + private boolean cellIsPopulated(int row, int col) { + // look up the entry in the bitmap index for the specified row. + // If it's a negative number, it's the column number of the only + // populated cell in the row + if (rowIndexFlagsIndex[row] < 0) { + return col == -rowIndexFlagsIndex[row]; + } + + // if it's a positive number, it's the offset of an entry in the bitmap + // list. If the table is more than 32 columns wide, the bitmap is stored + // in successive entries in the bitmap list, so we have to divide the column + // number by 32 and offset the number we got out of the index by the result. + // Once we have the appropriate piece of the bitmap, test the appropriate + // bit and return the result. + else { + int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)]; + return (flags & (1 << (col & 0x1f))) != 0; + } + } + + /** + * Implementation of getNextState() when we know the specified cell is + * populated.
+ * @param row The PHYSICAL row number of the cell + * @param col The PHYSICAL column number of the cell + * @return The value stored in the cell + */ + private short internalAt(int row, int col) { + // the table is a one-dimensional array, so this just does the math necessary + // to treat it as a two-dimensional array (we don't just use a two-dimensional + // array because two-dimensional arrays are inefficient in Java) + return table[row * numCols + col]; + } +} --- /dev/null 2016-10-11 12:01:27.741739134 +0900 +++ new/src/java.base/share/classes/sun/util/resources/BreakIteratorResourceBundle.java 2016-10-20 12:41:27.857865710 +0900 @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package sun.util.resources; + +import java.io.InputStream; +import java.security.AccessController; +import java.security.PrivilegedActionException; +import java.security.PrivilegedExceptionAction; +import java.util.Collections; +import java.util.Enumeration; +import java.util.ResourceBundle; +import java.util.Set; + +/** + * BreakIteratorResourceBundle is an abstract class for loading BreakIterator + * data (rules or dictionary) from each module. An implementation class must + * implement getBreakIteratorInfo() that returns an instance of the + * corresponding BreakIteratorInfo (basename). The data name is taken from the + * BreakIteratorInfo instance. + * + *
<p>
For example, if the given key is "WordDictionary" and Locale is "th", the + * data name is taken from a BreakIteratorInfo_th and the key's value is + * "thai_dict". Its data thai_dict is loaded from the Module of the + * implementation class of this class. + */ + +public abstract class BreakIteratorResourceBundle extends ResourceBundle { + // If any keys that are not for data names are added to BreakIteratorInfo*, + // those keys must be added to NON_DATA_KEYS. + private static final Set NON_DATA_KEYS = Set.of("BreakIteratorClasses"); + + private volatile Set keys; + + /** + * Returns an instance of the corresponding {@code BreakIteratorInfo} (basename). + * The instance shouldn't have its parent. + */ + protected abstract ResourceBundle getBreakIteratorInfo(); + + @Override + protected Object handleGetObject(String key) { + if (NON_DATA_KEYS.contains(key)) { + return null; + } + ResourceBundle info = getBreakIteratorInfo(); + if (!info.containsKey(key)) { + return null; + } + String path = getClass().getPackage().getName().replace('.', '/') + + '/' + info.getString(key); + byte[] data; + try (InputStream is = getResourceAsStream(path)) { + data = is.readAllBytes(); + } catch (Exception e) { + throw new InternalError("Can't load " + path, e); + } + return data; + } + + private InputStream getResourceAsStream(String path) throws Exception { + PrivilegedExceptionAction pa; + pa = () -> getClass().getModule().getResourceAsStream(path); + InputStream is; + try { + is = AccessController.doPrivileged(pa); + } catch (PrivilegedActionException e) { + throw e.getException(); + } + return is; + } + + @Override + public Enumeration getKeys() { + return Collections.enumeration(keySet()); + } + + @Override + protected Set handleKeySet() { + if (keys == null) { + ResourceBundle info = getBreakIteratorInfo(); + Set k = info.keySet(); + k.removeAll(NON_DATA_KEYS); + synchronized (this) { + if (keys == null) { + keys = k; + } + } + } + return keys; + } +} --- /dev/null 2016-10-11 
12:01:27.741739134 +0900 +++ new/src/java.base/share/classes/sun/text/resources/BreakIteratorResources.java 2016-10-20 12:41:28.129871804 +0900 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package sun.text.resources; + +import java.util.ResourceBundle; +import sun.util.resources.BreakIteratorResourceBundle; + +public class BreakIteratorResources extends BreakIteratorResourceBundle { + @Override + protected ResourceBundle getBreakIteratorInfo() { + return new BreakIteratorInfo(); + } +} --- /dev/null 2016-10-11 12:01:27.741739134 +0900 +++ new/src/jdk.localedata/share/classes/sun/text/resources/ext/BreakIteratorResources_th.java 2016-10-20 12:41:28.389877629 +0900 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package sun.text.resources.ext; + +import java.util.ResourceBundle; +import sun.util.resources.BreakIteratorResourceBundle; + +public class BreakIteratorResources_th extends BreakIteratorResourceBundle { + @Override + protected ResourceBundle getBreakIteratorInfo() { + return new BreakIteratorInfo_th(); + } +} --- old/test/java/util/PluggableLocale/BreakIteratorProviderTest.java 2016-10-20 12:41:28.777886321 +0900 +++ new/test/java/util/PluggableLocale/BreakIteratorProviderTest.java 2016-10-20 12:41:28.665883810 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2016, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -89,7 +89,7 @@ String[] jresResult = new String[4]; if (jreSupportsLocale) { for (int i = 0; i < 4; i++) { - jresResult[i] = "sun.util.locale.provider."+classNames[i]; + jresResult[i] = "sun.text." + classNames[i]; } } --- old/test/java/util/PluggableLocale/BreakIteratorProviderTest.sh 2016-10-20 12:41:29.049892413 +0900 +++ new/test/java/util/PluggableLocale/BreakIteratorProviderTest.sh 2016-10-20 12:41:28.937889905 +0900 @@ -1,6 +1,6 @@ #!/bin/sh # -# Copyright (c) 2007, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2007, 2016, Oracle and/or its affiliates. All rights reserved. # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. # # This code is free software; you can redistribute it and/or modify it @@ -23,6 +23,6 @@ # # # @test -# @bug 4052440 8062588 +# @bug 4052440 8062588 8165804 # @summary BreakIteratorProvider tests # @run shell ExecTest.sh foo BreakIteratorProviderTest --- old/test/tools/jlink/plugins/IncludeLocalesPluginTest.java 2016-10-20 12:41:29.337898864 +0900 +++ new/test/tools/jlink/plugins/IncludeLocalesPluginTest.java 2016-10-20 12:41:29.221896266 +0900 @@ -40,7 +40,7 @@ /* * @test - * @bug 8152143 8152704 8155649 + * @bug 8152143 8152704 8155649 8165804 * @summary IncludeLocalesPlugin tests * @author Naoto Sato * @library ../../lib @@ -236,6 +236,7 @@ "/jdk.localedata/sun/text/resources/ext/thai_dict", "/jdk.localedata/sun/text/resources/ext/WordBreakIteratorData_th", "/jdk.localedata/sun/text/resources/ext/BreakIteratorInfo_th.class", + "/jdk.localedata/sun/text/resources/ext/BreakIteratorResources_th.class", "/jdk.localedata/sun/text/resources/ext/FormatData_en_GB.class", "/jdk.localedata/sun/text/resources/ext/FormatData_ja.class", "/jdk.localedata/sun/text/resources/ext/FormatData_th.class", @@ -261,6 +262,7 @@ "/jdk.localedata/sun/text/resources/ext/thai_dict", "/jdk.localedata/sun/text/resources/ext/WordBreakIteratorData_th", 
"/jdk.localedata/sun/text/resources/ext/BreakIteratorInfo_th.class", + "/jdk.localedata/sun/text/resources/ext/BreakIteratorResources_th.class", "/jdk.localedata/sun/text/resources/ext/FormatData_th.class"), List.of( "/jdk.localedata/sun/text/resources/ext/FormatData_en_GB.class",