< prev index next >

src/java.base/share/classes/sun/text/DictionaryBasedBreakIterator.java

Print this page


   1 /*
   2  * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *
  28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  29  * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  30  *
  31  * The original version of this source code and documentation
  32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  33  * subsidiary of IBM. These materials are provided under terms
  34  * of a License Agreement between Taligent and Sun. This technology
  35  * is protected by multiple US and International patents.
  36  *
  37  * This notice and attribution to Taligent may not be removed.
  38  * Taligent is a registered trademark of Taligent, Inc.
  39  */
  40 
  41 package sun.util.locale.provider;
  42 
  43 import java.io.IOException;
  44 import java.lang.reflect.Module;
  45 import java.text.CharacterIterator;
  46 import java.util.ArrayList;
  47 import java.util.List;
  48 import java.util.Stack;
  49 
  50 /**
  51  * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
  52  * to further subdivide ranges of text beyond what is possible using just the
  53  * state-table-based algorithm.  This is necessary, for example, to handle
  54  * word and line breaking in Thai, which doesn't use spaces between words.  The
  55  * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
  56  * up text as far as possible, and then contiguous ranges of letters are
  57  * repeatedly compared against a list of known words (i.e., the dictionary)
  58  * to divide them up into words.
  59  *
  60  * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
  61  * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution
  62  * name is used to identify characters in words in the dictionary.  The idea is that
  63  * if the iterator passes over a chunk of text that includes two or more characters
  64  * in a row that are included in &lt;dictionary&gt;, it goes back through that range and
  65  * derives additional break positions (if possible) using the dictionary.
  66  *
  67  * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
  68  * file.  It follows a prescribed search path to locate the dictionary (right now,
  69  * it looks for it in /com/ibm/text/resources in each directory in the classpath,
  70  * and won't find it in JAR files, but this location is likely to change).  The
  71  * dictionary file is in a serialized binary format.  We have a very primitive (and
  72  * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
  73  * currently making it public.  Contact us for help.
  74  */
  75 class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
  76 
  77     /**
  78      * a list of known words that is used to divide up contiguous ranges of letters,
   79      * stored in a compressed, indexed format that offers fast access
  80      */
  81     private BreakDictionary dictionary;
  82 
  83     /**
  84      * a list of flags indicating which character categories are contained in
  85      * the dictionary file (this is used to determine which ranges of characters
  86      * to apply the dictionary to)
  87      */
  88     private boolean[] categoryFlags;
  89 
  90     /**
  91      * a temporary hiding place for the number of dictionary characters in the
  92      * last range passed over by next()
  93      */
  94     private int dictionaryCharCount;
  95 
  96     /**
  97      * when a range of characters is divided up using the dictionary, the break
  98      * positions that are discovered are stored here, preventing us from having
  99      * to use either the dictionary or the state table again until the iterator
 100      * leaves this range of text
 101      */
 102     private int[] cachedBreakPositions;
 103 
 104     /**
 105      * if cachedBreakPositions is not null, this indicates which item in the
 106      * cache the current iteration position refers to
 107      */
 108     private int positionInCache;
 109 
 110     /**
 111      * Constructs a DictionaryBasedBreakIterator.
 112      * @param module The module where the dictionary file resides
 113      * @param dictionaryFilename The filename of the dictionary file to use
 114      */
 115     DictionaryBasedBreakIterator(Module module, String dataFile, String dictionaryFile)
 116                                         throws IOException {
 117         super(module, dataFile);




 118         byte[] tmp = super.getAdditionalData();
 119         if (tmp != null) {
 120             prepareCategoryFlags(tmp);
 121             super.setAdditionalData(null);
 122         }
 123         dictionary = new BreakDictionary(module, dictionaryFile);
 124     }
 125 
 126     private void prepareCategoryFlags(byte[] data) {
 127         categoryFlags = new boolean[data.length];
 128         for (int i = 0; i < data.length; i++) {
 129             categoryFlags[i] = (data[i] == (byte)1) ? true : false;
 130         }
 131     }
 132 
 133     @Override
 134     public void setText(CharacterIterator newText) {
 135         super.setText(newText);
 136         cachedBreakPositions = null;
 137         dictionaryCharCount = 0;
 138         positionInCache = 0;
 139     }
 140 
 141     /**
 142      * Sets the current iteration position to the beginning of the text.
 143      * (i.e., the CharacterIterator's starting offset).


   1 /*
   2  * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 /*
  27  *
  28  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  29  * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
  30  *
  31  * The original version of this source code and documentation
  32  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  33  * subsidiary of IBM. These materials are provided under terms
  34  * of a License Agreement between Taligent and Sun. This technology
  35  * is protected by multiple US and International patents.
  36  *
  37  * This notice and attribution to Taligent may not be removed.
  38  * Taligent is a registered trademark of Taligent, Inc.
  39  */
  40 
  41 package sun.text;
  42 


  43 import java.text.CharacterIterator;
  44 import java.util.ArrayList;
  45 import java.util.List;
  46 import java.util.Stack;
  47 
  48 /**
  49  * A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
  50  * to further subdivide ranges of text beyond what is possible using just the
  51  * state-table-based algorithm.  This is necessary, for example, to handle
  52  * word and line breaking in Thai, which doesn't use spaces between words.  The
  53  * state-table-based algorithm used by RuleBasedBreakIterator is used to divide
  54  * up text as far as possible, and then contiguous ranges of letters are
  55  * repeatedly compared against a list of known words (i.e., the dictionary)
  56  * to divide them up into words.
  57  *
  58  * DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
  59  * but adds one more special substitution name: &lt;dictionary&gt;.  This substitution
  60  * name is used to identify characters in words in the dictionary.  The idea is that
  61  * if the iterator passes over a chunk of text that includes two or more characters
  62  * in a row that are included in &lt;dictionary&gt;, it goes back through that range and
  63  * derives additional break positions (if possible) using the dictionary.
  64  *
  65  * DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
  66  * file.  It follows a prescribed search path to locate the dictionary (right now,
  67  * it looks for it in /com/ibm/text/resources in each directory in the classpath,
  68  * and won't find it in JAR files, but this location is likely to change).  The
  69  * dictionary file is in a serialized binary format.  We have a very primitive (and
  70  * slow) BuildDictionaryFile utility for creating dictionary files, but aren't
  71  * currently making it public.  Contact us for help.
  72  */
  73 public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
  74 
  75     /**
  76      * a list of known words that is used to divide up contiguous ranges of letters,
   77      * stored in a compressed, indexed format that offers fast access
  78      */
  79     private BreakDictionary dictionary;
  80 
  81     /**
  82      * a list of flags indicating which character categories are contained in
  83      * the dictionary file (this is used to determine which ranges of characters
  84      * to apply the dictionary to)
  85      */
  86     private boolean[] categoryFlags;
  87 
  88     /**
  89      * a temporary hiding place for the number of dictionary characters in the
  90      * last range passed over by next()
  91      */
  92     private int dictionaryCharCount;
  93 
  94     /**
  95      * when a range of characters is divided up using the dictionary, the break
  96      * positions that are discovered are stored here, preventing us from having
  97      * to use either the dictionary or the state table again until the iterator
  98      * leaves this range of text
  99      */
 100     private int[] cachedBreakPositions;
 101 
 102     /**
 103      * if cachedBreakPositions is not null, this indicates which item in the
 104      * cache the current iteration position refers to
 105      */
 106     private int positionInCache;
 107 
 108     /**
 109      * Constructs a DictionaryBasedBreakIterator.
 110      *
 111      * @param ruleFile       the name of the rule data file
 112      * @param ruleData       the rule data loaded from the rule data file
 113      * @param dictionaryFile the name of the dictionary file
 114      * @param dictionartData the dictionary data loaded from the dictionary file
 115      * @throws MissingResourceException if rule data or dictionary initialization failed
 116      */
 117     public DictionaryBasedBreakIterator(String ruleFile, byte[] ruleData,
 118                                         String dictionaryFile, byte[] dictionaryData) {
 119         super(ruleFile, ruleData);
 120         byte[] tmp = super.getAdditionalData();
 121         if (tmp != null) {
 122             prepareCategoryFlags(tmp);
 123             super.setAdditionalData(null);
 124         }
 125         dictionary = new BreakDictionary(dictionaryFile, dictionaryData);
 126     }
 127 
 128     private void prepareCategoryFlags(byte[] data) {
 129         categoryFlags = new boolean[data.length];
 130         for (int i = 0; i < data.length; i++) {
 131             categoryFlags[i] = (data[i] == (byte)1) ? true : false;
 132         }
 133     }
 134 
 135     @Override
 136     public void setText(CharacterIterator newText) {
 137         super.setText(newText);
 138         cachedBreakPositions = null;
 139         dictionaryCharCount = 0;
 140         positionInCache = 0;
 141     }
 142 
 143     /**
 144      * Sets the current iteration position to the beginning of the text.
 145      * (i.e., the CharacterIterator's starting offset).


< prev index next >