1 /* 2 * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 
39 */ 40 41 package sun.util.locale.provider; 42 43 import java.io.IOException; 44 import java.lang.reflect.Module; 45 import java.text.CharacterIterator; 46 import java.util.ArrayList; 47 import java.util.List; 48 import java.util.Stack; 49 50 /** 51 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary 52 * to further subdivide ranges of text beyond what is possible using just the 53 * state-table-based algorithm. This is necessary, for example, to handle 54 * word and line breaking in Thai, which doesn't use spaces between words. The 55 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide 56 * up text as far as possible, and then contiguous ranges of letters are 57 * repeatedly compared against a list of known words (i.e., the dictionary) 58 * to divide them up into words. 59 * 60 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator, 61 * but adds one more special substitution name: <dictionary>. This substitution 62 * name is used to identify characters in words in the dictionary. The idea is that 63 * if the iterator passes over a chunk of text that includes two or more characters 64 * in a row that are included in <dictionary>, it goes back through that range and 65 * derives additional break positions (if possible) using the dictionary. 66 * 67 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary 68 * file. It follows a prescribed search path to locate the dictionary (right now, 69 * it looks for it in /com/ibm/text/resources in each directory in the classpath, 70 * and won't find it in JAR files, but this location is likely to change). The 71 * dictionary file is in a serialized binary format. We have a very primitive (and 72 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't 73 * currently making it public. Contact us for help. 
74 */ 75 class DictionaryBasedBreakIterator extends RuleBasedBreakIterator { 76 77 /** 78 * a list of known words that is used to divide up contiguous ranges of letters, 79 * stored in a compressed, indexed, format that offers fast access 80 */ 81 private BreakDictionary dictionary; 82 83 /** 84 * a list of flags indicating which character categories are contained in 85 * the dictionary file (this is used to determine which ranges of characters 86 * to apply the dictionary to) 87 */ 88 private boolean[] categoryFlags; 89 90 /** 91 * a temporary hiding place for the number of dictionary characters in the 92 * last range passed over by next() 93 */ 94 private int dictionaryCharCount; 95 96 /** 97 * when a range of characters is divided up using the dictionary, the break 98 * positions that are discovered are stored here, preventing us from having 99 * to use either the dictionary or the state table again until the iterator 100 * leaves this range of text 101 */ 102 private int[] cachedBreakPositions; 103 104 /** 105 * if cachedBreakPositions is not null, this indicates which item in the 106 * cache the current iteration position refers to 107 */ 108 private int positionInCache; 109 110 /** 111 * Constructs a DictionaryBasedBreakIterator. If the superclass reports additional data, it is converted into categoryFlags and then cleared on the superclass. 112 * @param module The module where the dictionary file resides (also handed to the superclass constructor together with dataFile) * @param dataFile The name of the data file passed to the RuleBasedBreakIterator constructor 113 * @param dictionaryFile The filename of the dictionary file to use * @throws IOException declared by this constructor; presumably raised when the data file or the dictionary file cannot be loaded (TODO confirm against RuleBasedBreakIterator and BreakDictionary) 114 */ 115 DictionaryBasedBreakIterator(Module module, String dataFile, String dictionaryFile) 116 throws IOException { 117 super(module, dataFile); 118 byte[] tmp = super.getAdditionalData(); 119 if (tmp != null) { 120 prepareCategoryFlags(tmp); 121 super.setAdditionalData(null); 122 } 123 dictionary = new BreakDictionary(module, dictionaryFile); 124 } 125 126 /** Builds categoryFlags from the given bytes: entry i is true exactly when data[i] equals 1. */ private void prepareCategoryFlags(byte[] data) { 127 categoryFlags = new boolean[data.length]; 128 for (int i = 0; i < data.length; i++) { 129 categoryFlags[i] = (data[i] == (byte)1) ? 
true : false; 130 } 131 } 132 133 /** Passes newText to the superclass, then discards the cached break positions and resets dictionaryCharCount and positionInCache to 0. */ @Override 134 public void setText(CharacterIterator newText) { 135 super.setText(newText); 136 cachedBreakPositions = null; 137 dictionaryCharCount = 0; 138 positionInCache = 0; 139 } 140 141 /** 142 * Sets the current iteration position to the beginning of the text. 143 * (i.e., the CharacterIterator's starting offset). | 1 /* 2 * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * 28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 30 * 31 * The original version of this source code and documentation 32 * is copyrighted and owned by Taligent, Inc., a wholly-owned 33 * subsidiary of IBM. 
These materials are provided under terms 34 * of a License Agreement between Taligent and Sun. This technology 35 * is protected by multiple US and International patents. 36 * 37 * This notice and attribution to Taligent may not be removed. 38 * Taligent is a registered trademark of Taligent, Inc. 39 */ 40 41 package sun.text; 42 43 import java.text.CharacterIterator; 44 import java.util.ArrayList; 45 import java.util.List; 46 import java.util.Stack; 47 48 /** 49 * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary 50 * to further subdivide ranges of text beyond what is possible using just the 51 * state-table-based algorithm. This is necessary, for example, to handle 52 * word and line breaking in Thai, which doesn't use spaces between words. The 53 * state-table-based algorithm used by RuleBasedBreakIterator is used to divide 54 * up text as far as possible, and then contiguous ranges of letters are 55 * repeatedly compared against a list of known words (i.e., the dictionary) 56 * to divide them up into words. 57 * 58 * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator, 59 * but adds one more special substitution name: <dictionary>. This substitution 60 * name is used to identify characters in words in the dictionary. The idea is that 61 * if the iterator passes over a chunk of text that includes two or more characters 62 * in a row that are included in <dictionary>, it goes back through that range and 63 * derives additional break positions (if possible) using the dictionary. 64 * 65 * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary 66 * file. It follows a prescribed search path to locate the dictionary (right now, 67 * it looks for it in /com/ibm/text/resources in each directory in the classpath, 68 * and won't find it in JAR files, but this location is likely to change). The 69 * dictionary file is in a serialized binary format. 
We have a very primitive (and 70 * slow) BuildDictionaryFile utility for creating dictionary files, but aren't 71 * currently making it public. Contact us for help. 72 */ 73 public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator { 74 75 /** 76 * a list of known words that is used to divide up contiguous ranges of letters, 77 * stored in a compressed, indexed, format that offers fast access 78 */ 79 private BreakDictionary dictionary; 80 81 /** 82 * a list of flags indicating which character categories are contained in 83 * the dictionary file (this is used to determine which ranges of characters 84 * to apply the dictionary to) 85 */ 86 private boolean[] categoryFlags; 87 88 /** 89 * a temporary hiding place for the number of dictionary characters in the 90 * last range passed over by next() 91 */ 92 private int dictionaryCharCount; 93 94 /** 95 * when a range of characters is divided up using the dictionary, the break 96 * positions that are discovered are stored here, preventing us from having 97 * to use either the dictionary or the state table again until the iterator 98 * leaves this range of text 99 */ 100 private int[] cachedBreakPositions; 101 102 /** 103 * if cachedBreakPositions is not null, this indicates which item in the 104 * cache the current iteration position refers to 105 */ 106 private int positionInCache; 107 108 /** 109 * Constructs a DictionaryBasedBreakIterator. 
110 * If the superclass reports additional data, it is converted into categoryFlags and then cleared on the superclass. 111 * @param ruleFile the name of the rule data file 112 * @param ruleData the rule data loaded from the rule data file 113 * @param dictionaryFile the name of the dictionary file 114 * @param dictionaryData the dictionary data loaded from the dictionary file 115 * @throws java.util.MissingResourceException if rule data or dictionary initialization failed 116 */ 117 public DictionaryBasedBreakIterator(String ruleFile, byte[] ruleData, 118 String dictionaryFile, byte[] dictionaryData) { 119 super(ruleFile, ruleData); 120 byte[] tmp = super.getAdditionalData(); 121 if (tmp != null) { 122 prepareCategoryFlags(tmp); 123 super.setAdditionalData(null); 124 } 125 dictionary = new BreakDictionary(dictionaryFile, dictionaryData); 126 } 127 128 /** Builds categoryFlags from the given bytes: entry i is true exactly when data[i] equals 1. */ private void prepareCategoryFlags(byte[] data) { 129 categoryFlags = new boolean[data.length]; 130 for (int i = 0; i < data.length; i++) { 131 categoryFlags[i] = (data[i] == (byte)1) ? true : false; 132 } 133 } 134 135 /** Passes newText to the superclass, then discards the cached break positions and resets dictionaryCharCount and positionInCache to 0. */ @Override 136 public void setText(CharacterIterator newText) { 137 super.setText(newText); 138 cachedBreakPositions = null; 139 dictionaryCharCount = 0; 140 positionInCache = 0; 141 } 142 143 /** 144 * Sets the current iteration position to the beginning of the text. 145 * (i.e., the CharacterIterator's starting offset). |