--- old/make/jdk/src/classes/build/tools/generatebreakiteratordata/CharSet.java 2020-03-23 19:56:38.251962690 +0100 +++ /dev/null 2020-02-11 10:29:13.086348146 +0100 @@ -1,819 +0,0 @@ -/* - * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved - * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved - * - * The original version of this source code and documentation - * is copyrighted and owned by Taligent, Inc., a wholly-owned - * subsidiary of IBM. These materials are provided under terms - * of a License Agreement between Taligent and Sun. This technology - * is protected by multiple US and International patents. - * - * This notice and attribution to Taligent may not be removed. - * Taligent is a registered trademark of Taligent, Inc. - */ - -package build.tools.generatebreakiteratordata; - -import java.util.Arrays; -import java.util.Hashtable; - -/** - * An object representing a set of characters. (This is a "set" in the - * mathematical sense: an unduplicated list of characters on which set - * operations such as union and intersection can be performed.) The - * set information is stored in compressed, optimized form: The object - * contains an integer array with an even number of characters. Each - * pair of characters represents a range of characters contained in the set - * (a pair of the same character represents a single character). The - * characters are sorted in increasing order. - */ -class CharSet { - /** - * The structure containing the set information. The characters - * in this array are organized into pairs, each pair representing - * a range of characters contained in the set - */ - private int[] chars; - - //========================================================================== - // parseString() and associated routines - //========================================================================== - /** - * A cache which is used to speed up parseString() whenever it is - * used to parse a description that has been parsed before - */ - private static Hashtable expressionCache = null; - - /** - * Builds a CharSet based on a textual description. For the syntax of - * the description, see the documentation of RuleBasedBreakIterator. - * @see java.text.RuleBasedBreakIterator - */ - public static CharSet parseString(String s) { - CharSet result = null; - - // if "s" is in the expression cache, pull the result out - // of the expresison cache - if (expressionCache != null) { - result = expressionCache.get(s); - } - - // otherwise, use doParseString() to actually parse the string, - // and then add a corresponding entry to the expression cache - if (result == null) { - result = doParseString(s); - if (expressionCache == null) { - expressionCache = new Hashtable<>(); - } - expressionCache.put(s, result); - } - result = (CharSet)(result.clone()); - return result; - } - - /** - * This function is used by parseString() to actually parse the string - */ - private static CharSet doParseString(String s) { - CharSet result = new CharSet(); - int p = 0; - - boolean haveDash = false; - boolean haveTilde = false; - boolean wIsReal = false; - int w = 0x0000; - - // for each character in the description... - while (p < s.length()) { - int c = s.codePointAt(p); - - // if it's an opening bracket... - if (c == '[') { - // flush the single-character cache - if (wIsReal) { - result.internalUnion(new CharSet(w)); - } - - // locate the matching closing bracket - int bracketLevel = 1; - int q = p + 1; - while (bracketLevel != 0) { - // if no matching bracket by end of string then... - if (q >= s.length()) { - throw new IllegalArgumentException("Parse error at position " + p + " in " + s); - } - int ch = s.codePointAt(q); - switch (ch) { - case '\\': // need to step over next character - ch = s.codePointAt(++q); - break; - case '[': - ++bracketLevel; - break; - case ']': - --bracketLevel; - break; - } - q += Character.charCount(ch); - } - --q; - - // call parseString() recursively to parse the text inside - // the brackets, then either add or subtract the result from - // our running result depending on whether or not the [] - // expresison was preceded by a ^ - if (!haveTilde) { - result.internalUnion(CharSet.parseString(s.substring(p + 1, q))); - } - else { - result.internalDifference(CharSet.parseString(s.substring(p + 1, q))); - } - haveTilde = false; - haveDash = false; - wIsReal = false; - p = q + 1; - } - - // if the character is a colon... - else if (c == ':') { - // flush the single-character cache - if (wIsReal) { - result.internalUnion(new CharSet(w)); - } - - // locate the matching colon (and throw an error if there - // isn't one) - int q = s.indexOf(':', p + 1); - if (q == -1) { - throw new IllegalArgumentException("Parse error at position " + p + " in " + s); - } - - // use charSetForCategory() to parse the text in the colons, - // and either add or substract the result from our running - // result depending on whether the :: expression was - // preceded by a ^ - if (!haveTilde) { - result.internalUnion(charSetForCategory(s.substring(p + 1, q))); - } - else { - result.internalDifference(charSetForCategory(s.substring(p + 1, q))); - } - - // reset everything and advance to the next character - haveTilde = false; - haveDash = false; - wIsReal = false; - p = q + 1; - } - - // if the character is a dash, set an appropriate flag - else if (c == '-') { - if (wIsReal) { - haveDash = true; - } - ++p; - } - - // if the character is a caret, flush the single-character - // cache and set an appropriate flag. If the set is empty - // (i.e., if the expression begins with ^), invert the set - // (i.e., set it to include everything). The idea here is - // that a set that includes nothing but ^ expressions - // means "everything but these things". - else if (c == '^') { - if (wIsReal) { - result.internalUnion(new CharSet(w)); - wIsReal = false; - } - haveTilde = true; - ++p; - if (result.empty()) { - result.internalComplement(); - } - } - - // throw an exception on an illegal character - else if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c) - && !Character.isDigit((char)c) && c != '\\') { - throw new IllegalArgumentException("Parse error at position " + p + " in " + s); - } - - // otherwise, we end up here... - else { - // on a backslash, advance to the next character - if (c == '\\') { - ++p; - } - - // if the preceding character was a dash, this character - // defines the end of a range. Add or subtract that range - // from the running result depending on whether or not it - // was preceded by a ^ - if (haveDash) { - if (s.codePointAt(p) < w) { - throw new IllegalArgumentException("U+" + - Integer.toHexString(s.codePointAt(p)) - + " is less than U+" + Integer.toHexString(w) + ". Dash expressions " - + "can't have their endpoints in reverse order."); - } - - int ch = s.codePointAt(p); - if (!haveTilde) { - result.internalUnion(new CharSet(w, ch)); - } - else { - result.internalDifference(new CharSet(w, ch)); - } - p += Character.charCount(ch); - haveDash = false; - haveTilde = false; - wIsReal = false; - } - - // if the preceding character was a caret, remove this character - // from the running result - else if (haveTilde) { - w = s.codePointAt(p); - result.internalDifference(new CharSet(w)); - p += Character.charCount(w); - haveTilde = false; - wIsReal = false; - } - - // otherwise, flush the single-character cache and then - // put this character into the cache - else if (wIsReal) { - result.internalUnion(new CharSet(w)); - w = s.codePointAt(p); - p += Character.charCount(w); - wIsReal = true; - } else { - w = s.codePointAt(p); - p += Character.charCount(w); - wIsReal = true; - } - } - } - - // finally, flush the single-character cache one last time - if (wIsReal) { - result.internalUnion(new CharSet(w)); - } - - return result; - } - - /** - * Creates a CharSet containing all the characters in a particular - * Unicode category. The text is either a two-character code from - * the Unicode database or a single character that begins one or more - * two-character codes. - */ - private static CharSet charSetForCategory(String category) { - // throw an exception if we have anything other than one or two - // characters inside the colons - if (category.length() == 0 || category.length() >= 3) { - throw new IllegalArgumentException("Invalid character category: " + category); - } - - // if we have two characters, search the category map for that code - // and either construct and return a CharSet from the data in the - // category map or throw an exception - if (category.length() == 2) { - for (int i = 0; i < CharacterCategory.categoryNames.length; i++) { - if (CharacterCategory.categoryNames[i].equals(category)) { - return new CharSet(CharacterCategory.getCategoryMap(i)); - } - } - throw new IllegalArgumentException("Invalid character category: " + category); - } - - // if we have one character, search the category map for codes beginning - // with that letter, and union together all of the matching sets that - // we find (or throw an exception if there are no matches) - else if (category.length() == 1) { - CharSet result = new CharSet(); - for (int i = 0; i < CharacterCategory.categoryNames.length; i++) { - if (CharacterCategory.categoryNames[i].startsWith(category)) { - result = result.union(new CharSet(CharacterCategory.getCategoryMap(i))); - } - } - if (result.empty()) { - throw new IllegalArgumentException("Invalid character category: " + category); - } - else { - return result; - } - } - return new CharSet(); // should never get here, but to make the compiler happy... - } - - /** - * Returns a copy of CharSet's expression cache and sets CharSet's - * expression cache to empty. - */ - public static Hashtable releaseExpressionCache() { - Hashtable result = expressionCache; - expressionCache = null; - return result; - } - - //========================================================================== - // CharSet manipulation - //========================================================================== - /** - * Creates an empty CharSet. - */ - public CharSet() { - chars = new int[0]; - } - - /** - * Creates a CharSet containing a single character. - * @param c The character to put into the CharSet - */ - public CharSet(int c) { - chars = new int[2]; - chars[0] = c; - chars[1] = c; - } - - /** - * Creates a CharSet containing a range of characters. - * @param lo The lowest-numbered character to include in the range - * @param hi The highest-numbered character to include in the range - */ - public CharSet(int lo, int hi) { - chars = new int[2]; - if (lo <= hi) { - chars[0] = lo; - chars[1] = hi; - } - else { - chars[0] = hi; - chars[1] = lo; - } - } - - /** - * Creates a CharSet, initializing it from the internal storage - * of another CharSet (this function performs no error checking - * on "chars", so if it's malformed, undefined behavior will result) - */ - private CharSet(int[] chars) { - this.chars = chars; - } - - /** - * Returns a CharSet representing the union of two CharSets. - */ - public CharSet union(CharSet that) { - return new CharSet(doUnion(that.chars)); - } - - /** - * Adds the characters in "that" to this CharSet - */ - private void internalUnion(CharSet that) { - chars = doUnion(that.chars); - } - - /** - * The actual implementation of the union functions - */ - private int[] doUnion(int[] c2) { - int[] result = new int[chars.length+c2.length]; - - int i = 0; - int j = 0; - int index = 0; - - // consider all the characters in both strings - while (i < chars.length && j < c2.length) { - int ub; - - // the first character in the result is the lower of the - // starting characters of the two strings, and "ub" gets - // set to the upper bound of that range - if (chars[i] < c2[j]) { - result[index++] = chars[i]; - ub = chars[++i]; - } - else { - result[index++] = c2[j]; - ub = c2[++j]; - } - - // for as long as one of our two pointers is pointing to a range's - // end point, or i is pointing to a character that is less than - // "ub" plus one (the "plus one" stitches touching ranges together)... - while (i % 2 == 1 || - j % 2 == 1 || - (i < chars.length && chars[i] <= ub + 1)) { - - // advance i to the first character that is greater than - // "ub" plus one - while (i < chars.length && chars[i] <= ub + 1) { - ++i; - } - - // if i points to the endpoint of a range, update "ub" - // to that character, or if i points to the start of - // a range and the endpoint of the preceding range is - // greater than "ub", update "up" to _that_ character - if (i % 2 == 1) { - ub = chars[i]; - } - else if (i > 0 && chars[i - 1] > ub) { - ub = chars[i - 1]; - } - - // now advance j to the first character that is greater - // that "ub" plus one - while (j < c2.length && c2[j] <= ub + 1) { - ++j; - } - - // if j points to the endpoint of a range, update "ub" - // to that character, or if j points to the start of - // a range and the endpoint of the preceding range is - // greater than "ub", update "up" to _that_ character - if (j % 2 == 1) { - ub = c2[j]; - } - else if (j > 0 && c2[j - 1] > ub) { - ub = c2[j - 1]; - } - } - // when we finally fall out of this loop, we will have stitched - // together a series of ranges that overlap or touch, i and j - // will both point to starting points of ranges, and "ub" will - // be the endpoint of the range we're working on. Write "ub" - // to the result - result[index++] = ub; - - // loop back around to create the next range in the result - } - - // we fall out to here when we've exhausted all the characters in - // one of the operands. We can append all of the remaining characters - // in the other operand without doing any extra work. - if (i < chars.length) { - for (int k = i; k < chars.length; k++) { - result[index++] = chars[k]; - } - } - if (j < c2.length) { - for (int k = j; k < c2.length; k++) { - result[index++] = c2[k]; - } - } - - if (result.length > index) { - int[] tmpbuf = new int[index]; - System.arraycopy(result, 0, tmpbuf, 0, index); - return tmpbuf; - } - - return result; - } - - /** - * Returns the intersection of two CharSets. - */ - public CharSet intersection(CharSet that) { - return new CharSet(doIntersection(that.chars)); - } - - /** - * Removes from this CharSet any characters that aren't also in "that" - */ - private void internalIntersection(CharSet that) { - chars = doIntersection(that.chars); - } - - /** - * The internal implementation of the two intersection functions - */ - private int[] doIntersection(int[] c2) { - int[] result = new int[chars.length+c2.length]; - - int i = 0; - int j = 0; - int oldI; - int oldJ; - int index = 0; - - // iterate until we've exhausted one of the operands - while (i < chars.length && j < c2.length) { - - // advance j until it points to a character that is larger than - // the one i points to. If this is the beginning of a one- - // character range, advance j to point to the end - if (i < chars.length && i % 2 == 0) { - while (j < c2.length && c2[j] < chars[i]) { - ++j; - } - if (j < c2.length && j % 2 == 0 && c2[j] == chars[i]) { - ++j; - } - } - - // if j points to the endpoint of a range, save the current - // value of i, then advance i until it reaches a character - // which is larger than the character pointed at - // by j. All of the characters we've advanced over (except - // the one currently pointed to by i) are added to the result - oldI = i; - while (j % 2 == 1 && i < chars.length && chars[i] <= c2[j]) { - ++i; - } - for (int k = oldI; k < i; k++) { - result[index++] = chars[k]; - } - - // if i points to the endpoint of a range, save the current - // value of j, then advance j until it reaches a character - // which is larger than the character pointed at - // by i. All of the characters we've advanced over (except - // the one currently pointed to by i) are added to the result - oldJ = j; - while (i % 2 == 1 && j < c2.length && c2[j] <= chars[i]) { - ++j; - } - for (int k = oldJ; k < j; k++) { - result[index++] = c2[k]; - } - - // advance i until it points to a character larger than j - // If it points at the beginning of a one-character range, - // advance it to the end of that range - if (j < c2.length && j % 2 == 0) { - while (i < chars.length && chars[i] < c2[j]) { - ++i; - } - if (i < chars.length && i % 2 == 0 && c2[j] == chars[i]) { - ++i; - } - } - } - - if (result.length > index) { - int[] tmpbuf = new int[index]; - System.arraycopy(result, 0, tmpbuf, 0, index); - return tmpbuf; - } - - return result; - } - - /** - * Returns a CharSet containing all the characters in "this" that - * aren't also in "that" - */ - public CharSet difference(CharSet that) { - return new CharSet(doIntersection(that.doComplement())); - } - - /** - * Removes from "this" all the characters that are also in "that" - */ - private void internalDifference(CharSet that) { - chars = doIntersection(that.doComplement()); - } - - /** - * Returns a CharSet containing all the characters which are not - * in "this" - */ - public CharSet complement() { - return new CharSet(doComplement()); - } - - /** - * Complements "this". All the characters it contains are removed, - * and all the characters it doesn't contain are added. - */ - private void internalComplement() { - chars = doComplement(); - } - - /** - * The internal implementation function for the complement routines - */ - private int[] doComplement() { - // the complement of an empty CharSet is one containing everything - if (empty()) { - int[] result = new int[2]; - result[0] = 0x0000; - result[1] = 0x10FFFF; - return result; - } - - int[] result = new int[chars.length+2]; - - int i = 0; - int index = 0; - - // the result begins with \u0000 unless the original CharSet does - if (chars[0] != 0x0000) { - result[index++] = 0x0000; - } - - // walk through the characters in this CharSet. Append a pair of - // characters the first of which is one less than the first - // character we see and the second of which is one plus the second - // character we see (don't write the first character if it's \u0000, - // and don't write the second character if it's \uffff. - while (i < chars.length) { - if (chars[i] != 0x0000) { - result[index++] = chars[i] - 1; - } - if (chars[i + 1] != 0x10FFFF) { - result[index++] = chars[i + 1] + 1; - } - i += 2; - } - - // add 0x10ffff to the end of the result, unless it was in - // the original set - if (chars[i-1] != 0x10FFFF) { - result[index++] = 0x10FFFF; - } - - if (result.length > index) { - int[] tmpbuf = new int[index]; - System.arraycopy(result, 0, tmpbuf, 0, index); - return tmpbuf; - } - - return result; - } - - /** - * Returns true if this CharSet contains the specified character - * @param c The character we're testing for set membership - */ - public boolean contains(int c) { - // search for the first range endpoint that is greater than or - // equal to c - int i = 1; - while (i < chars.length && chars[i] < c) { - i += 2; - } - - // if we've walked off the end, we don't contain c - if (i == chars.length) { - return false; - } - - // otherwise, we contain c if the beginning of the range is less - // than or equal to c - return chars[i - 1] <= c; - } - - /** - * Returns true if "that" is another instance of CharSet containing - * the exact same characters as this one - */ - public boolean equals(Object that) { - return (that instanceof CharSet) && Arrays.equals(chars, ((CharSet)that).chars); - } - - /** - * Returns the hash code for this set of characters - */ - public int hashCode() { - return Arrays.hashCode(chars); - } - - /** - * Creates a new CharSet that is equal to this one - */ - public Object clone() { - return new CharSet(chars); - } - - /** - * Returns true if this CharSet contains no characters - */ - public boolean empty() { - return chars.length == 0; - } - - /** - * Returns a textual representation of this CharSet. If the result - * of calling this function is passed to CharSet.parseString(), it - * will produce another CharSet that is equal to this one. - */ - public String toString() { - StringBuffer result = new StringBuffer(); - - // the result begins with an opening bracket - result.append('['); - - // iterate through the ranges in the CharSet - for (int i = 0; i < chars.length; i += 2) { - // for a range with the same beginning and ending point, - // output that character - if (chars[i] == chars[i + 1]) { - result.append("0x"); - result.append(Integer.toHexString(chars[i])); - } - - // otherwise, output the start and end points of the range - // separated by a dash - else { - result.append("0x"); - result.append(Integer.toHexString(chars[i])); - result.append("-0x"); - result.append(Integer.toHexString(chars[i + 1])); - } - } - - // the result ends with a closing bracket - result.append(']'); - return result.toString(); - } - - /** - * Returns an integer array representing the contents of this CharSet - * in the same form in which they're stored internally: as pairs - * of characters representing the start and end points of ranges - */ - public int[] getRanges() { - return chars; - } - - /** - * Returns an Enumeration that will return the ranges of characters - * contained in this CharSet one at a time - */ - public Enumeration getChars() { - return new Enumeration(this); - } - - //========================================================================== - // CharSet.Enumeration - //========================================================================== - - /** - * An Enumeration that can be used to extract the character ranges - * from a CharSet one at a time - */ - public class Enumeration implements java.util.Enumeration { - /** - * Initializes a CharSet.Enumeration - */ - Enumeration(CharSet cs) { - this.chars = cs.chars; - p = 0; - } - - /** - * Returns true if the enumeration hasn't yet returned - * all the ranges in the CharSet - */ - public boolean hasMoreElements() { - return p < chars.length; - } - - /** - * Returns the next range in the CarSet - */ - public int[] nextElement() { - int[] result = new int[2]; - result[0] = chars[p++]; - result[1] = chars[p++]; - return result; - } - - int p; - int[] chars; - } -} --- /dev/null 2020-02-11 10:29:13.086348146 +0100 +++ new/src/java.base/share/tools/org/openjdk/buildtools/generatebreakiteratordata/CharSet.java 2020-03-23 19:56:37.823962693 +0100 @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved + * + * The original version of this source code and documentation + * is copyrighted and owned by Taligent, Inc., a wholly-owned + * subsidiary of IBM. These materials are provided under terms + * of a License Agreement between Taligent and Sun. This technology + * is protected by multiple US and International patents. + * + * This notice and attribution to Taligent may not be removed. + * Taligent is a registered trademark of Taligent, Inc. + */ + +package org.openjdk.buildtools.generatebreakiteratordata; + +import java.util.Arrays; +import java.util.Hashtable; + +/** + * An object representing a set of characters. (This is a "set" in the + * mathematical sense: an unduplicated list of characters on which set + * operations such as union and intersection can be performed.) The + * set information is stored in compressed, optimized form: The object + * contains an integer array with an even number of characters. Each + * pair of characters represents a range of characters contained in the set + * (a pair of the same character represents a single character). The + * characters are sorted in increasing order. + */ +class CharSet { + /** + * The structure containing the set information. The characters + * in this array are organized into pairs, each pair representing + * a range of characters contained in the set + */ + private int[] chars; + + //========================================================================== + // parseString() and associated routines + //========================================================================== + /** + * A cache which is used to speed up parseString() whenever it is + * used to parse a description that has been parsed before + */ + private static Hashtable expressionCache = null; + + /** + * Builds a CharSet based on a textual description. For the syntax of + * the description, see the documentation of RuleBasedBreakIterator. + * @see java.text.RuleBasedBreakIterator + */ + public static CharSet parseString(String s) { + CharSet result = null; + + // if "s" is in the expression cache, pull the result out + // of the expresison cache + if (expressionCache != null) { + result = expressionCache.get(s); + } + + // otherwise, use doParseString() to actually parse the string, + // and then add a corresponding entry to the expression cache + if (result == null) { + result = doParseString(s); + if (expressionCache == null) { + expressionCache = new Hashtable<>(); + } + expressionCache.put(s, result); + } + result = (CharSet)(result.clone()); + return result; + } + + /** + * This function is used by parseString() to actually parse the string + */ + private static CharSet doParseString(String s) { + CharSet result = new CharSet(); + int p = 0; + + boolean haveDash = false; + boolean haveTilde = false; + boolean wIsReal = false; + int w = 0x0000; + + // for each character in the description... + while (p < s.length()) { + int c = s.codePointAt(p); + + // if it's an opening bracket... + if (c == '[') { + // flush the single-character cache + if (wIsReal) { + result.internalUnion(new CharSet(w)); + } + + // locate the matching closing bracket + int bracketLevel = 1; + int q = p + 1; + while (bracketLevel != 0) { + // if no matching bracket by end of string then... + if (q >= s.length()) { + throw new IllegalArgumentException("Parse error at position " + p + " in " + s); + } + int ch = s.codePointAt(q); + switch (ch) { + case '\\': // need to step over next character + ch = s.codePointAt(++q); + break; + case '[': + ++bracketLevel; + break; + case ']': + --bracketLevel; + break; + } + q += Character.charCount(ch); + } + --q; + + // call parseString() recursively to parse the text inside + // the brackets, then either add or subtract the result from + // our running result depending on whether or not the [] + // expresison was preceded by a ^ + if (!haveTilde) { + result.internalUnion(CharSet.parseString(s.substring(p + 1, q))); + } + else { + result.internalDifference(CharSet.parseString(s.substring(p + 1, q))); + } + haveTilde = false; + haveDash = false; + wIsReal = false; + p = q + 1; + } + + // if the character is a colon... + else if (c == ':') { + // flush the single-character cache + if (wIsReal) { + result.internalUnion(new CharSet(w)); + } + + // locate the matching colon (and throw an error if there + // isn't one) + int q = s.indexOf(':', p + 1); + if (q == -1) { + throw new IllegalArgumentException("Parse error at position " + p + " in " + s); + } + + // use charSetForCategory() to parse the text in the colons, + // and either add or substract the result from our running + // result depending on whether the :: expression was + // preceded by a ^ + if (!haveTilde) { + result.internalUnion(charSetForCategory(s.substring(p + 1, q))); + } + else { + result.internalDifference(charSetForCategory(s.substring(p + 1, q))); + } + + // reset everything and advance to the next character + haveTilde = false; + haveDash = false; + wIsReal = false; + p = q + 1; + } + + // if the character is a dash, set an appropriate flag + else if (c == '-') { + if (wIsReal) { + haveDash = true; + } + ++p; + } + + // if the character is a caret, flush the single-character + // cache and set an appropriate flag. If the set is empty + // (i.e., if the expression begins with ^), invert the set + // (i.e., set it to include everything). The idea here is + // that a set that includes nothing but ^ expressions + // means "everything but these things". + else if (c == '^') { + if (wIsReal) { + result.internalUnion(new CharSet(w)); + wIsReal = false; + } + haveTilde = true; + ++p; + if (result.empty()) { + result.internalComplement(); + } + } + + // throw an exception on an illegal character + else if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c) + && !Character.isDigit((char)c) && c != '\\') { + throw new IllegalArgumentException("Parse error at position " + p + " in " + s); + } + + // otherwise, we end up here... + else { + // on a backslash, advance to the next character + if (c == '\\') { + ++p; + } + + // if the preceding character was a dash, this character + // defines the end of a range. Add or subtract that range + // from the running result depending on whether or not it + // was preceded by a ^ + if (haveDash) { + if (s.codePointAt(p) < w) { + throw new IllegalArgumentException("U+" + + Integer.toHexString(s.codePointAt(p)) + + " is less than U+" + Integer.toHexString(w) + ". Dash expressions " + + "can't have their endpoints in reverse order."); + } + + int ch = s.codePointAt(p); + if (!haveTilde) { + result.internalUnion(new CharSet(w, ch)); + } + else { + result.internalDifference(new CharSet(w, ch)); + } + p += Character.charCount(ch); + haveDash = false; + haveTilde = false; + wIsReal = false; + } + + // if the preceding character was a caret, remove this character + // from the running result + else if (haveTilde) { + w = s.codePointAt(p); + result.internalDifference(new CharSet(w)); + p += Character.charCount(w); + haveTilde = false; + wIsReal = false; + } + + // otherwise, flush the single-character cache and then + // put this character into the cache + else if (wIsReal) { + result.internalUnion(new CharSet(w)); + w = s.codePointAt(p); + p += Character.charCount(w); + wIsReal = true; + } else { + w = s.codePointAt(p); + p += Character.charCount(w); + wIsReal = true; + } + } + } + + // finally, flush the single-character cache one last time + if (wIsReal) { + result.internalUnion(new CharSet(w)); + } + + return result; + } + + /** + * Creates a CharSet containing all the characters in a particular + * Unicode category. The text is either a two-character code from + * the Unicode database or a single character that begins one or more + * two-character codes. + */ + private static CharSet charSetForCategory(String category) { + // throw an exception if we have anything other than one or two + // characters inside the colons + if (category.length() == 0 || category.length() >= 3) { + throw new IllegalArgumentException("Invalid character category: " + category); + } + + // if we have two characters, search the category map for that code + // and either construct and return a CharSet from the data in the + // category map or throw an exception + if (category.length() == 2) { + for (int i = 0; i < CharacterCategory.categoryNames.length; i++) { + if (CharacterCategory.categoryNames[i].equals(category)) { + return new CharSet(CharacterCategory.getCategoryMap(i)); + } + } + throw new IllegalArgumentException("Invalid character category: " + category); + } + + // if we have one character, search the category map for codes beginning + // with that letter, and union together all of the matching sets that + // we find (or throw an exception if there are no matches) + else if (category.length() == 1) { + CharSet result = new CharSet(); + for (int i = 0; i < CharacterCategory.categoryNames.length; i++) { + if (CharacterCategory.categoryNames[i].startsWith(category)) { + result = result.union(new CharSet(CharacterCategory.getCategoryMap(i))); + } + } + if (result.empty()) { + throw new IllegalArgumentException("Invalid character category: " + category); + } + else { + return result; + } + } + return new CharSet(); // should never get here, but to make the compiler happy... + } + + /** + * Returns a copy of CharSet's expression cache and sets CharSet's + * expression cache to empty. + */ + public static Hashtable releaseExpressionCache() { + Hashtable result = expressionCache; + expressionCache = null; + return result; + } + + //========================================================================== + // CharSet manipulation + //========================================================================== + /** + * Creates an empty CharSet. + */ + public CharSet() { + chars = new int[0]; + } + + /** + * Creates a CharSet containing a single character. + * @param c The character to put into the CharSet + */ + public CharSet(int c) { + chars = new int[2]; + chars[0] = c; + chars[1] = c; + } + + /** + * Creates a CharSet containing a range of characters. + * @param lo The lowest-numbered character to include in the range + * @param hi The highest-numbered character to include in the range + */ + public CharSet(int lo, int hi) { + chars = new int[2]; + if (lo <= hi) { + chars[0] = lo; + chars[1] = hi; + } + else { + chars[0] = hi; + chars[1] = lo; + } + } + + /** + * Creates a CharSet, initializing it from the internal storage + * of another CharSet (this function performs no error checking + * on "chars", so if it's malformed, undefined behavior will result) + */ + private CharSet(int[] chars) { + this.chars = chars; + } + + /** + * Returns a CharSet representing the union of two CharSets. + */ + public CharSet union(CharSet that) { + return new CharSet(doUnion(that.chars)); + } + + /** + * Adds the characters in "that" to this CharSet + */ + private void internalUnion(CharSet that) { + chars = doUnion(that.chars); + } + + /** + * The actual implementation of the union functions + */ + private int[] doUnion(int[] c2) { + int[] result = new int[chars.length+c2.length]; + + int i = 0; + int j = 0; + int index = 0; + + // consider all the characters in both strings + while (i < chars.length && j < c2.length) { + int ub; + + // the first character in the result is the lower of the + // starting characters of the two strings, and "ub" gets + // set to the upper bound of that range + if (chars[i] < c2[j]) { + result[index++] = chars[i]; + ub = chars[++i]; + } + else { + result[index++] = c2[j]; + ub = c2[++j]; + } + + // for as long as one of our two pointers is pointing to a range's + // end point, or i is pointing to a character that is less than + // "ub" plus one (the "plus one" stitches touching ranges together)... + while (i % 2 == 1 || + j % 2 == 1 || + (i < chars.length && chars[i] <= ub + 1)) { + + // advance i to the first character that is greater than + // "ub" plus one + while (i < chars.length && chars[i] <= ub + 1) { + ++i; + } + + // if i points to the endpoint of a range, update "ub" + // to that character, or if i points to the start of + // a range and the endpoint of the preceding range is + // greater than "ub", update "up" to _that_ character + if (i % 2 == 1) { + ub = chars[i]; + } + else if (i > 0 && chars[i - 1] > ub) { + ub = chars[i - 1]; + } + + // now advance j to the first character that is greater + // that "ub" plus one + while (j < c2.length && c2[j] <= ub + 1) { + ++j; + } + + // if j points to the endpoint of a range, update "ub" + // to that character, or if j points to the start of + // a range and the endpoint of the preceding range is + // greater than "ub", update "up" to _that_ character + if (j % 2 == 1) { + ub = c2[j]; + } + else if (j > 0 && c2[j - 1] > ub) { + ub = c2[j - 1]; + } + } + // when we finally fall out of this loop, we will have stitched + // together a series of ranges that overlap or touch, i and j + // will both point to starting points of ranges, and "ub" will + // be the endpoint of the range we're working on. Write "ub" + // to the result + result[index++] = ub; + + // loop back around to create the next range in the result + } + + // we fall out to here when we've exhausted all the characters in + // one of the operands. We can append all of the remaining characters + // in the other operand without doing any extra work. + if (i < chars.length) { + for (int k = i; k < chars.length; k++) { + result[index++] = chars[k]; + } + } + if (j < c2.length) { + for (int k = j; k < c2.length; k++) { + result[index++] = c2[k]; + } + } + + if (result.length > index) { + int[] tmpbuf = new int[index]; + System.arraycopy(result, 0, tmpbuf, 0, index); + return tmpbuf; + } + + return result; + } + + /** + * Returns the intersection of two CharSets. + */ + public CharSet intersection(CharSet that) { + return new CharSet(doIntersection(that.chars)); + } + + /** + * Removes from this CharSet any characters that aren't also in "that" + */ + private void internalIntersection(CharSet that) { + chars = doIntersection(that.chars); + } + + /** + * The internal implementation of the two intersection functions + */ + private int[] doIntersection(int[] c2) { + int[] result = new int[chars.length+c2.length]; + + int i = 0; + int j = 0; + int oldI; + int oldJ; + int index = 0; + + // iterate until we've exhausted one of the operands + while (i < chars.length && j < c2.length) { + + // advance j until it points to a character that is larger than + // the one i points to. If this is the beginning of a one- + // character range, advance j to point to the end + if (i < chars.length && i % 2 == 0) { + while (j < c2.length && c2[j] < chars[i]) { + ++j; + } + if (j < c2.length && j % 2 == 0 && c2[j] == chars[i]) { + ++j; + } + } + + // if j points to the endpoint of a range, save the current + // value of i, then advance i until it reaches a character + // which is larger than the character pointed at + // by j. All of the characters we've advanced over (except + // the one currently pointed to by i) are added to the result + oldI = i; + while (j % 2 == 1 && i < chars.length && chars[i] <= c2[j]) { + ++i; + } + for (int k = oldI; k < i; k++) { + result[index++] = chars[k]; + } + + // if i points to the endpoint of a range, save the current + // value of j, then advance j until it reaches a character + // which is larger than the character pointed at + // by i. All of the characters we've advanced over (except + // the one currently pointed to by i) are added to the result + oldJ = j; + while (i % 2 == 1 && j < c2.length && c2[j] <= chars[i]) { + ++j; + } + for (int k = oldJ; k < j; k++) { + result[index++] = c2[k]; + } + + // advance i until it points to a character larger than j + // If it points at the beginning of a one-character range, + // advance it to the end of that range + if (j < c2.length && j % 2 == 0) { + while (i < chars.length && chars[i] < c2[j]) { + ++i; + } + if (i < chars.length && i % 2 == 0 && c2[j] == chars[i]) { + ++i; + } + } + } + + if (result.length > index) { + int[] tmpbuf = new int[index]; + System.arraycopy(result, 0, tmpbuf, 0, index); + return tmpbuf; + } + + return result; + } + + /** + * Returns a CharSet containing all the characters in "this" that + * aren't also in "that" + */ + public CharSet difference(CharSet that) { + return new CharSet(doIntersection(that.doComplement())); + } + + /** + * Removes from "this" all the characters that are also in "that" + */ + private void internalDifference(CharSet that) { + chars = doIntersection(that.doComplement()); + } + + /** + * Returns a CharSet containing all the characters which are not + * in "this" + */ + public CharSet complement() { + return new CharSet(doComplement()); + } + + /** + * Complements "this". All the characters it contains are removed, + * and all the characters it doesn't contain are added. + */ + private void internalComplement() { + chars = doComplement(); + } + + /** + * The internal implementation function for the complement routines + */ + private int[] doComplement() { + // the complement of an empty CharSet is one containing everything + if (empty()) { + int[] result = new int[2]; + result[0] = 0x0000; + result[1] = 0x10FFFF; + return result; + } + + int[] result = new int[chars.length+2]; + + int i = 0; + int index = 0; + + // the result begins with \u0000 unless the original CharSet does + if (chars[0] != 0x0000) { + result[index++] = 0x0000; + } + + // walk through the characters in this CharSet. Append a pair of + // characters the first of which is one less than the first + // character we see and the second of which is one plus the second + // character we see (don't write the first character if it's \u0000, + // and don't write the second character if it's \uffff. + while (i < chars.length) { + if (chars[i] != 0x0000) { + result[index++] = chars[i] - 1; + } + if (chars[i + 1] != 0x10FFFF) { + result[index++] = chars[i + 1] + 1; + } + i += 2; + } + + // add 0x10ffff to the end of the result, unless it was in + // the original set + if (chars[i-1] != 0x10FFFF) { + result[index++] = 0x10FFFF; + } + + if (result.length > index) { + int[] tmpbuf = new int[index]; + System.arraycopy(result, 0, tmpbuf, 0, index); + return tmpbuf; + } + + return result; + } + + /** + * Returns true if this CharSet contains the specified character + * @param c The character we're testing for set membership + */ + public boolean contains(int c) { + // search for the first range endpoint that is greater than or + // equal to c + int i = 1; + while (i < chars.length && chars[i] < c) { + i += 2; + } + + // if we've walked off the end, we don't contain c + if (i == chars.length) { + return false; + } + + // otherwise, we contain c if the beginning of the range is less + // than or equal to c + return chars[i - 1] <= c; + } + + /** + * Returns true if "that" is another instance of CharSet containing + * the exact same characters as this one + */ + public boolean equals(Object that) { + return (that instanceof CharSet) && Arrays.equals(chars, ((CharSet)that).chars); + } + + /** + * Returns the hash code for this set of characters + */ + public int hashCode() { + return Arrays.hashCode(chars); + } + + /** + * Creates a new CharSet that is equal to this one + */ + public Object clone() { + return new CharSet(chars); + } + + /** + * Returns true if this CharSet contains no characters + */ + public boolean empty() { + return chars.length == 0; + } + + /** + * Returns a textual representation of this CharSet. If the result + * of calling this function is passed to CharSet.parseString(), it + * will produce another CharSet that is equal to this one. + */ + public String toString() { + StringBuffer result = new StringBuffer(); + + // the result begins with an opening bracket + result.append('['); + + // iterate through the ranges in the CharSet + for (int i = 0; i < chars.length; i += 2) { + // for a range with the same beginning and ending point, + // output that character + if (chars[i] == chars[i + 1]) { + result.append("0x"); + result.append(Integer.toHexString(chars[i])); + } + + // otherwise, output the start and end points of the range + // separated by a dash + else { + result.append("0x"); + result.append(Integer.toHexString(chars[i])); + result.append("-0x"); + result.append(Integer.toHexString(chars[i + 1])); + } + } + + // the result ends with a closing bracket + result.append(']'); + return result.toString(); + } + + /** + * Returns an integer array representing the contents of this CharSet + * in the same form in which they're stored internally: as pairs + * of characters representing the start and end points of ranges + */ + public int[] getRanges() { + return chars; + } + + /** + * Returns an Enumeration that will return the ranges of characters + * contained in this CharSet one at a time + */ + public Enumeration getChars() { + return new Enumeration(this); + } + + //========================================================================== + // CharSet.Enumeration + //========================================================================== + + /** + * An Enumeration that can be used to extract the character ranges + * from a CharSet one at a time + */ + public class Enumeration implements java.util.Enumeration { + /** + * Initializes a CharSet.Enumeration + */ + Enumeration(CharSet cs) { + this.chars = cs.chars; + p = 0; + } + + /** + * Returns true if the enumeration hasn't yet returned + * all the ranges in the CharSet + */ + public boolean hasMoreElements() { + return p < chars.length; + } + + /** + * Returns the next range in the CarSet + */ + public int[] nextElement() { + int[] result = new int[2]; + result[0] = chars[p++]; + result[1] = chars[p++]; + return result; + } + + int p; + int[] chars; + } +}