1 /*
2 * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
56 * markers consisting of an alphabetic name string preceded by "$$".
57 * Such markers are replaced with generated program text. As a special
58 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
59 * alphabetic characters constituting a variable name. The character "_"
60 * is considered alphabetic for these purposes.
61 *
62 * @author Guy Steele
63 * @author Alan Liu
64 * @author John O'Conner
65 */
66
67 public class GenerateCharacter {
68
// Debug switch, fixed to false. Its uses are outside this excerpt.
69 final static boolean DEBUG = false;
70
// Prefix that introduces a substitution marker in the template file.
71 final static String commandMarker = "$$";
// Prefix prepended to every default file name below. The defaults are
// computed once, in declaration order, during class initialization, so
// assigning ROOT afterwards does not change them.
72 static String ROOT = "";
// Default input/output file names. Argument processing falls back to these
// when the corresponding file-name variable is still null.
73 static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";
74 static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
75 static String DefaultPropListFileName = ROOT + "PropList.txt";
// The Java or the C template/output pair is chosen according to Csyntax.
76 static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
77 static String DefaultJavaOutputFileName = ROOT + "Character.java";
78 static String DefaultCTemplateFileName = ROOT + "Character.c.template";
79 static String DefaultCOutputFileName = ROOT + "Character.c";
80
// Plane number: set by -plane, reset to 0 by -latin1. It is passed to the
// spec-file readers and forms the high part of a code point as (plane<<16) | k.
81 static int plane = 0;
82
83 /* The overall idea is that, in the generated Character class source code,
84 most character property data is stored in a special multi-level table whose
85 structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
86 The integers must sum to 16 (the number of bits in a character).
87 The first table is indexed by the k1 high-order bits of the character code.
88 The result is concatenated to the next k2 bits of the character code to index
89 the second table, and so on. Eventually the kn low-order bits of the character
90 code are concatenated and used to index one of two tables A and B; A contains
91 32-bit integer entries and B contains 16-bit short entries. The 48 bits that
92 can be thus obtained encode the properties for the character.
93
94 The default specification is [9, 4, 3, 0]. This particular table format was
95 designed by conducting an exhaustive search of table formats to minimize the
142 4 is Java whitespace
143 2 bits This field indicates whether the character has a numeric property.
144 The four possible values for this field are as follows:
145 0 This character has no numeric property.
146 1 Adding the digit offset to the character code and then
147 masking with 0x1F will produce the desired numeric value.
148 2 This character has a "strange" numeric value.
149 3 A Java supradecimal digit: adding the digit offset to the
150 character code, then masking with 0x1F, then adding 10
151 will produce the desired numeric value.
152 5 bits The digit offset (see description of previous field)
153 5 bits Character type (see below)
154
155 B: the high 16 bits are defined as:
156 1 bit Other_Lowercase property
157 1 bit Other_Uppercase property
158 1 bit Other_Alphabetic property
159 1 bit Other_Math property
160 1 bit Ideographic property
161 1 bit Noncharacter codepoint property
162 */
163
164
165 // bit masks identify each component of a 32-bit property field described
166 // above.
167 // shift* indicates how many shifts right must happen to get the
168 // indicated property value in the lowest bits of the 32-bit space.
// Layout implied by the values below (bit 0 is the least significant bit):
//   bits  0-4   character type          (maskType)
//   bits  5-9   digit offset            (maskDigitOffset)
//   bits 10-11  numeric type            (maskNumericType)
//   bits 12-14  identifier info         (maskIdentifierInfo)
//   bits 15-17  case info               (maskCaseInfo)
//   bits 18-26  case offset, 9 bits     (maskCaseOffset)
//   bits 27-30  bidi field              (maskBidi)
//   bit  31     mirrored field          (shiftMirrored; its mask is the long maskMirrored)
169 private static final int
170 shiftType = 0, maskType = 0x001F,
171 shiftDigitOffset = 5, maskDigitOffset = 0x03E0,
172 shiftNumericType = 10, maskNumericType = 0x0C00,
173 shiftIdentifierInfo = 12, maskIdentifierInfo = 0x7000,
// lowest bit (bit 12) of the identifier-info field
174 maskUnicodePart = 0x1000,
175 shiftCaseInfo = 15, maskCaseInfo = 0x38000,
// the three single bits that make up maskCaseInfo: bit 17, bit 16, bit 15
176 maskLowerCase = 0x20000,
177 maskUpperCase = 0x10000,
178 maskTitleCase = 0x08000,
179 shiftCaseOffset = 18, maskCaseOffset = 0x07FC0000,
// NOTE(review): the use of shiftCaseOffsetSign is not visible in this excerpt;
// here it is only exported to the template by name.
180 shiftCaseOffsetSign = 5,
181 // used only when calculating and
182 // storing digit offsets from char values
183 maskDigit = 0x001F,
184 // case offset are 9 bits
185 maskCase = 0x01FF,
186 shiftBidi = 27, maskBidi = 0x78000000,
187 shiftMirrored = 31, //maskMirrored = 0x80000000,
// NOTE(review): maskPlane (bits 16-23) overlaps the case fields above, so it
// presumably applies to a code point rather than to the property word - confirm.
188 shiftPlane = 16, maskPlane = 0xFF0000;
189
190 // maskMirrored must be a long now that property words are wider than 32 bits:
// the int literal 0x80000000 is negative and would sign-extend into bits 32-63
// when combined with a long property word, whose upper bits carry the extra
// 16-bit property field.
191 private static final long maskMirrored = 0x80000000L;
192
193 // bit masks identify the 16-bit property field described above, in B
194 // table
// Each mask is a single bit in bits 32-37 of the long property word. The
// template substitution code emits them shifted right by 32, i.e. as values
// that apply to a 16-bit entry.
195 private static final long
196 maskOtherLowercase = 0x100000000L,
197 maskOtherUppercase = 0x200000000L,
198 maskOtherAlphabetic = 0x400000000L,
// NOTE(review): the addExProp call for Other_Math is commented out in the
// map-building code, so this bit appears to be reserved but never set.
199 maskOtherMath = 0x800000000L,
200 maskIdeographic = 0x1000000000L,
// NOTE(review): likewise, the addExProp call for Noncharacter_CodePoint is
// commented out.
201 maskNoncharacterCP = 0x2000000000L;
202
203 // Can compare masked values with these to determine
204 // numeric or lexical types.
// NOTE(review): these are declared public and non-final although nothing in
// this excerpt assigns to them - confirm before tightening.
205 public static int
// Values of the numeric-type field (bits 10-11, maskNumericType = 0x0C00):
// field values 0, 1, 2 and 3 as listed in the layout comment above.
206 valueNotNumeric = 0x0000,
207 valueDigit = 0x0400,
208 valueStrangeNumeric = 0x0800,
209 valueJavaSupradecimal = 0x0C00,
// Values of the identifier-info field (bits 12-14, maskIdentifierInfo = 0x7000):
// field values 1 through 7, already shifted into position.
210 valueIgnorable = 0x1000,
211 valueJavaOnlyPart = 0x2000,
212 valueJavaUnicodePart = 0x3000,
213 valueJavaWhitespace = 0x4000,
214 valueJavaStartUnicodePart = 0x5000,
215 valueJavaOnlyStart = 0x6000,
216 valueJavaUnicodeStart = 0x7000,
// NOTE(review): lowJavaStart and nonzeroJavaPart look like comparison
// thresholds within the identifier-info field; their uses are outside this
// excerpt apart from being exported to the template.
217 lowJavaStart = 0x5000,
218 nonzeroJavaPart = 0x3000,
// same numeric value as valueJavaUnicodeStart
219 valueUnicodeStart = 0x7000;
220
221 // these values are used when only identifier properties are generated
350 System.out.println("An error has occured during spec mapping.");
351 System.exit(0);
352 }
353 }
354 // if there are still unprocessed chars, process them
355 // as unassigned/undefined.
356 codePoint = (plane<<16) | k;
357 while (k < result.length) {
358 result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
359 ++k;
360 ++codePoint;
361 }
362 // now add all extra supported properties from PropList, to the
363 // upper 16-bit
364 addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
365 addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
366 addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
367 addExProp(result, propList, "Ideographic", maskIdeographic);
368 //addExProp(result, propList, "Other_Math", maskOtherMath);
369 //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
370
371 return result;
372 }
373
374 // The maximum and minimum offsets found while scanning the database
// Both start at 0; the code that updates them is outside this excerpt.
375 static int maxOffsetSeen = 0;
376 static int minOffsetSeen = 0;
377
378 /**
379 * Some Unicode separator characters are not considered Java whitespace.
380 * @param c character to test
381 * @return true if c is an invalid Java whitespace character, false otherwise.
382 */
383 static boolean isInvalidJavaWhiteSpace(int c) {
384 int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
385 boolean retValue = false;
386 for(int x=0;x<exceptions.length;x++) {
387 if(c == exceptions[x]) {
388 retValue = true;
389 break;
763 return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
764 if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
765 x.substring(x.length()-1).equals(")") )
766 return genAccess("B", x.substring(9, x.length()-1), 16);
767 if (x.equals("shiftType")) return Long.toString(shiftType);
768 if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
769 if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
770 if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
771 if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
772 if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
773 if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
774 if (x.equals("maskCase")) return "0x" + hex8(maskCase);
775 if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
776 if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
777 if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
778 if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
779 if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
780 if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
781 if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
782 if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
783 if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
784 if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
785 if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
786 if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
787 if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
788 if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
789 if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
790 if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
791 if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
792 if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
793 if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
794 if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
795 if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
796 if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
797 if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
798 if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
799 if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
800 if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
801 if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
802 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
1595 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1596 access = extracted;
1597 }
1598 return access;
1599 }
1600
1601 /* The command line arguments are decoded and used to set the following
1602 global variables.
1603 */
1604
// When true, main prints progress messages.
1605 static boolean verbose = false;
// NOTE(review): nobidi and nomirror presumably suppress the bidi and mirrored
// data; the code that reads them is outside this excerpt.
1606 static boolean nobidi = false;
1607 static boolean nomirror = false;
// When true, the "A" table access is generated with 2 instead of 32 as the
// last genAccess argument.
1608 static boolean identifiers = false;
// Selects the C template/output defaults and C-style comment delimiters
// instead of the Java ones.
1609 static boolean Csyntax = false;
// File names; a value still null after option parsing is replaced by the
// corresponding Default* name.
1610 static String TemplateFileName = null;
1611 static String OutputFileName = null;
1612 static String UnicodeSpecFileName = null; // liu
1613 static String SpecialCasingFileName = null;
1614 static String PropListFileName = null;
// Set by the -usecharforbyte option.
1615 static boolean useCharForByte = false;
// Bit widths of the multi-level table; defaults to { 10, 5, 1 } when none are
// given, and is reallocated with length bins by main in search mode.
1616 static int[] sizes;
1617 static int bins = 0; // liu; if > 0, then perform search
// NOTE(review): presumably emits tables as String literals; not used in this excerpt.
1618 static boolean tableAsString = false;
// Set by -latin1 (which also forces plane = 0); cleared by -plane with a
// plane number greater than 0.
1619 static boolean bLatin1 = false;
1620
// Textual description of the effective command line, assembled at the end of
// argument processing.
1621 static String commandLineDescription;
1622
1623 /* Other global variables, equal in length to the "sizes" array. */
1624
// One entry per table level. NOTE(review): these arrays are populated outside
// this excerpt; the meanings below are taken from the names only - confirm.
//   shifts      presumably the right-shift applied at each level
//   zeroextend  presumably a zero-extension mask for each level's entries
//   bytes       presumably the entry width of each level in bytes
//   preshifted  presumably whether a level's entries are stored pre-shifted
//   tables      the table contents for each level, held as longs
1625 static int[] shifts;
1626 static int[] zeroextend;
1627 static int[] bytes;
1628 static boolean[] preshifted;
1629 static long[][] tables;
1630
1631
1632 /* Other global variables */
// Comment delimiters for the generated source, set at the end of argument
// processing: "/*" and " */" when Csyntax is true, "//" and "" otherwise.
1633 static String commentStart;
1634 static String commentEnd;
1722 else {
1723 UnicodeSpecFileName = args[++j];
1724 }
1725 }
1726 else if (args[j].equals("-specialcasing")) {
1727 if (j == args.length -1) {
1728 FAIL("File name missing after -specialcasing");
1729 }
1730 else {
1731 SpecialCasingFileName = args[++j];
1732 }
1733 }
1734 else if (args[j].equals("-proplist")) {
1735 if (j == args.length -1) {
1736 FAIL("File name missing after -proplist");
1737 }
1738 else {
1739 PropListFileName = args[++j];
1740 }
1741 }
1742 else if (args[j].equals("-plane")) {
1743 if (j == args.length -1) {
1744 FAIL("Plane number missing after -plane");
1745 }
1746 else {
1747 plane = Integer.parseInt(args[++j]);
1748 }
1749 if (plane > 0) {
1750 bLatin1 = false;
1751 }
1752 }
1753 else if ("-usecharforbyte".equals(args[j])) {
1754 useCharForByte = true;
1755 }
1756 else if (args[j].equals("-latin1")) {
1757 bLatin1 = true;
1758 plane = 0;
1759 }
1760 else {
1761 try {
1786 sizes = newsizes;
1787 }
1788 else {
1789 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1790 desc.append("10 5 1]");
1791 sizes = newsizes;
1792 }
1793 }
1794 if (UnicodeSpecFileName == null) { // liu
1795 UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1796 desc.append(" [-spec " + UnicodeSpecFileName + ']');
1797 }
1798 if (SpecialCasingFileName == null) {
1799 SpecialCasingFileName = DefaultSpecialCasingFileName;
1800 desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1801 }
1802 if (PropListFileName == null) {
1803 PropListFileName = DefaultPropListFileName;
1804 desc.append(" [-proplist " + PropListFileName + ']');
1805 }
1806 if (TemplateFileName == null) {
1807 TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1808 : DefaultJavaTemplateFileName);
1809 desc.append(" [-template " + TemplateFileName + ']');
1810 }
1811 if (OutputFileName == null) {
1812 OutputFileName = (Csyntax ? DefaultCOutputFileName
1813 : DefaultJavaOutputFileName);
1814 desc.append(" [-o " + OutputFileName + ']');
1815 }
1816 commentStart = (Csyntax ? "/*" : "//");
1817 commentEnd = (Csyntax ? " */" : "");
1818 commandLineDescription = desc.toString();
1819 }
1820
1821 private static void searchBins(long[] map, int binsOccupied) throws Exception {
1822 int bitsFree = 16;
1823 for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1824 if (binsOccupied == (bins-1)) {
1825 sizes[binsOccupied] = bitsFree;
1937 * <li> Generate the source code for the class Character by performing
1938 * macro processing on a template file.
1939 * </ol>
1940 *
1941 * @param args the command line arguments, as an array of String
1942 *
1943 * @see GenerateCharacter#processArgs
1944 * @see UnicodeSpec#readSpecFile
1945 * @see GenerateCharacter#buildMap
1946 * @see GenerateCharacter#buildTable
1947 * @see GenerateCharacter#generateCharacterClass
1948 */
1949
1950 public static void main(String[] args) {
1951 processArgs(args);
1952 try {
1953
1954 UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1955 specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1956 PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1957
1958 if (verbose) {
1959 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1960 }
1961 long[] map = buildMap(data, specialCaseMaps, propList);
1962 if (verbose) {
1963 System.err.println("Completed building of initial map");
1964 }
1965
1966 if (bins == 0) {
1967 generateForSizes(map);
1968 }
1969 else {
1970 while (bins > 0) {
1971 sizes = new int[bins];
1972 searchBins(map, 0);
1973 --bins;
1974 }
1975 }
1976 if (verbose && false) {
|
1 /*
2 * Copyright (c) 2002, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
56 * markers consisting of an alphabetic name string preceded by "$$".
57 * Such markers are replaced with generated program text. As a special
58 * case, the marker "Lookup(xxx)" is recognized, where "xxx" consists of
59 * alphabetic characters constituting a variable name. The character "_"
60 * is considered alphabetic for these purposes.
61 *
62 * @author Guy Steele
63 * @author Alan Liu
64 * @author John O'Conner
65 */
66
67 public class GenerateCharacter {
68
// Debug switch, fixed to false. Its uses are outside this excerpt.
69 final static boolean DEBUG = false;
70
// Prefix that introduces a substitution marker in the template file.
71 final static String commandMarker = "$$";
// Prefix prepended to every default file name below. The defaults are
// computed once, in declaration order, during class initialization, so
// assigning ROOT afterwards does not change them.
72 static String ROOT = "";
// Default input/output file names. Argument processing falls back to these
// when the corresponding file-name variable is still null.
73 static String DefaultUnicodeSpecFileName = ROOT + "UnicodeData.txt";
74 static String DefaultSpecialCasingFileName = ROOT + "SpecialCasing.txt";
75 static String DefaultPropListFileName = ROOT + "PropList.txt";
// Default for -derivedprops; main merges the properties read from this file
// into the property list read from the PropList file.
76 static String DefaultDerivedPropsFileName = ROOT + "DerivedCoreProperties.txt";
// The Java or the C template/output pair is chosen according to Csyntax.
77 static String DefaultJavaTemplateFileName = ROOT + "Character.java.template";
78 static String DefaultJavaOutputFileName = ROOT + "Character.java";
79 static String DefaultCTemplateFileName = ROOT + "Character.c.template";
80 static String DefaultCOutputFileName = ROOT + "Character.c";
81
// Plane number: set by -plane, reset to 0 by -latin1. It is passed to the
// spec-file readers and forms the high part of a code point as (plane<<16) | k.
82 static int plane = 0;
83
84 /* The overall idea is that, in the generated Character class source code,
85 most character property data is stored in a special multi-level table whose
86 structure is defined by a sequence of nonnegative integers [k1, k2, ..., kn].
87 The integers must sum to 16 (the number of bits in a character).
88 The first table is indexed by the k1 high-order bits of the character code.
89 The result is concatenated to the next k2 bits of the character code to index
90 the second table, and so on. Eventually the kn low-order bits of the character
91 code are concatenated and used to index one of two tables A and B; A contains
92 32-bit integer entries and B contains 16-bit short entries. The 48 bits that
93 can be thus obtained encode the properties for the character.
94
95 The default specification is [9, 4, 3, 0]. This particular table format was
96 designed by conducting an exhaustive search of table formats to minimize the
143 4 is Java whitespace
144 2 bits This field indicates whether the character has a numeric property.
145 The four possible values for this field are as follows:
146 0 This character has no numeric property.
147 1 Adding the digit offset to the character code and then
148 masking with 0x1F will produce the desired numeric value.
149 2 This character has a "strange" numeric value.
150 3 A Java supradecimal digit: adding the digit offset to the
151 character code, then masking with 0x1F, then adding 10
152 will produce the desired numeric value.
153 5 bits The digit offset (see description of previous field)
154 5 bits Character type (see below)
155
156 B: the high 16 bits are defined as:
157 1 bit Other_Lowercase property
158 1 bit Other_Uppercase property
159 1 bit Other_Alphabetic property
160 1 bit Other_Math property
161 1 bit Ideographic property
162 1 bit Noncharacter codepoint property
163 1 bit ID_Start property
164 1 bit ID_Continue property
165 */
166
167
168 // bit masks identify each component of a 32-bit property field described
169 // above.
170 // shift* indicates how many shifts right must happen to get the
171 // indicated property value in the lowest bits of the 32-bit space.
// Layout implied by the values below (bit 0 is the least significant bit):
//   bits  0-4   character type          (maskType)
//   bits  5-9   digit offset            (maskDigitOffset)
//   bits 10-11  numeric type            (maskNumericType)
//   bits 12-14  identifier info         (maskIdentifierInfo)
//   bits 15-17  case info               (maskCaseInfo)
//   bits 18-26  case offset, 9 bits     (maskCaseOffset)
//   bits 27-30  bidi field              (maskBidi)
//   bit  31     mirrored field          (shiftMirrored; its mask is the long maskMirrored)
172 private static final int
173 shiftType = 0, maskType = 0x001F,
174 shiftDigitOffset = 5, maskDigitOffset = 0x03E0,
175 shiftNumericType = 10, maskNumericType = 0x0C00,
176 shiftIdentifierInfo = 12, maskIdentifierInfo = 0x7000,
// lowest bit (bit 12) of the identifier-info field
177 maskUnicodePart = 0x1000,
178 shiftCaseInfo = 15, maskCaseInfo = 0x38000,
// the three single bits that make up maskCaseInfo: bit 17, bit 16, bit 15
179 maskLowerCase = 0x20000,
180 maskUpperCase = 0x10000,
181 maskTitleCase = 0x08000,
182 shiftCaseOffset = 18, maskCaseOffset = 0x07FC0000,
// NOTE(review): the use of shiftCaseOffsetSign is not visible in this excerpt;
// here it is only exported to the template by name.
183 shiftCaseOffsetSign = 5,
184 // used only when calculating and
185 // storing digit offsets from char values
186 maskDigit = 0x001F,
187 // case offset are 9 bits
188 maskCase = 0x01FF,
189 shiftBidi = 27, maskBidi = 0x78000000,
190 shiftMirrored = 31, //maskMirrored = 0x80000000,
// NOTE(review): maskPlane (bits 16-23) overlaps the case fields above, so it
// presumably applies to a code point rather than to the property word - confirm.
191 shiftPlane = 16, maskPlane = 0xFF0000;
192
193 // maskMirrored must be a long now that property words are wider than 32 bits:
// the int literal 0x80000000 is negative and would sign-extend into bits 32-63
// when combined with a long property word, whose upper bits carry the extra
// 16-bit property field.
194 private static final long maskMirrored = 0x80000000L;
195
196 // bit masks identify the 16-bit property field described above, in B
197 // table
// Each mask is a single bit in bits 32-39 of the long property word. The
// template substitution code emits them shifted right by 32, i.e. as values
// that apply to a 16-bit entry.
198 private static final long
199 maskOtherLowercase = 0x100000000L,
200 maskOtherUppercase = 0x200000000L,
201 maskOtherAlphabetic = 0x400000000L,
// NOTE(review): the addExProp call for Other_Math is commented out in the
// map-building code, so this bit appears to be reserved but never set.
202 maskOtherMath = 0x800000000L,
203 maskIdeographic = 0x1000000000L,
// NOTE(review): likewise, the addExProp call for Noncharacter_CodePoint is
// commented out.
204 maskNoncharacterCP = 0x2000000000L,
// Set from the property-list entries named "ID_Start" and "ID_Continue".
205 maskIDStart = 0x4000000000L,
206 maskIDContinue = 0x8000000000L;
207
208 // Can compare masked values with these to determine
209 // numeric or lexical types.
// NOTE(review): these are declared public and non-final although nothing in
// this excerpt assigns to them - confirm before tightening.
210 public static int
// Values of the numeric-type field (bits 10-11, maskNumericType = 0x0C00):
// field values 0, 1, 2 and 3 as listed in the layout comment above.
211 valueNotNumeric = 0x0000,
212 valueDigit = 0x0400,
213 valueStrangeNumeric = 0x0800,
214 valueJavaSupradecimal = 0x0C00,
// Values of the identifier-info field (bits 12-14, maskIdentifierInfo = 0x7000):
// field values 1 through 7, already shifted into position.
215 valueIgnorable = 0x1000,
216 valueJavaOnlyPart = 0x2000,
217 valueJavaUnicodePart = 0x3000,
218 valueJavaWhitespace = 0x4000,
219 valueJavaStartUnicodePart = 0x5000,
220 valueJavaOnlyStart = 0x6000,
221 valueJavaUnicodeStart = 0x7000,
// NOTE(review): lowJavaStart and nonzeroJavaPart look like comparison
// thresholds within the identifier-info field; their uses are outside this
// excerpt apart from being exported to the template.
222 lowJavaStart = 0x5000,
223 nonzeroJavaPart = 0x3000,
// same numeric value as valueJavaUnicodeStart
224 valueUnicodeStart = 0x7000;
225
226 // these values are used when only identifier properties are generated
355 System.out.println("An error has occured during spec mapping.");
356 System.exit(0);
357 }
358 }
359 // if there are still unprocessed chars, process them
360 // as unassigned/undefined.
361 codePoint = (plane<<16) | k;
362 while (k < result.length) {
363 result[k] = buildOne(codePoint, nonCharSpec, specialMaps);
364 ++k;
365 ++codePoint;
366 }
367 // now add all extra supported properties from PropList, to the
368 // upper 16-bit
369 addExProp(result, propList, "Other_Lowercase", maskOtherLowercase);
370 addExProp(result, propList, "Other_Uppercase", maskOtherUppercase);
371 addExProp(result, propList, "Other_Alphabetic", maskOtherAlphabetic);
372 addExProp(result, propList, "Ideographic", maskIdeographic);
373 //addExProp(result, propList, "Other_Math", maskOtherMath);
374 //addExProp(result, propList, "Noncharacter_CodePoint", maskNoncharacterCP);
375 addExProp(result, propList, "ID_Start", maskIDStart);
376 addExProp(result, propList, "ID_Continue", maskIDContinue);
377
378 return result;
379 }
380
381 // The maximum and minimum offsets found while scanning the database
// Both start at 0; the code that updates them is outside this excerpt.
382 static int maxOffsetSeen = 0;
383 static int minOffsetSeen = 0;
384
385 /**
386 * Some Unicode separator characters are not considered Java whitespace.
387 * @param c character to test
388 * @return true if c is an invalid Java whitespace character, false otherwise.
389 */
390 static boolean isInvalidJavaWhiteSpace(int c) {
391 int[] exceptions = {0x00A0, 0x2007, 0x202F, 0xFEFF};
392 boolean retValue = false;
393 for(int x=0;x<exceptions.length;x++) {
394 if(c == exceptions[x]) {
395 retValue = true;
396 break;
770 return genAccess("A", x.substring(7, x.length()-1), (identifiers ? 2 : 32));
771 if (x.length() >= 11 && x.substring(0, 9).equals("LookupEx(") &&
772 x.substring(x.length()-1).equals(")") )
773 return genAccess("B", x.substring(9, x.length()-1), 16);
774 if (x.equals("shiftType")) return Long.toString(shiftType);
775 if (x.equals("shiftIdentifierInfo")) return Long.toString(shiftIdentifierInfo);
776 if (x.equals("maskIdentifierInfo")) return "0x" + hex8(maskIdentifierInfo);
777 if (x.equals("maskUnicodePart")) return "0x" + hex8(maskUnicodePart);
778 if (x.equals("shiftCaseOffset")) return Long.toString(shiftCaseOffset);
779 if (x.equals("shiftCaseInfo")) return Long.toString(shiftCaseInfo);
780 if (x.equals("shiftCaseOffsetSign")) return Long.toString(shiftCaseOffsetSign);
781 if (x.equals("maskCase")) return "0x" + hex8(maskCase);
782 if (x.equals("maskCaseOffset")) return "0x" + hex8(maskCaseOffset);
783 if (x.equals("maskLowerCase")) return "0x" + hex8(maskLowerCase);
784 if (x.equals("maskUpperCase")) return "0x" + hex8(maskUpperCase);
785 if (x.equals("maskTitleCase")) return "0x" + hex8(maskTitleCase);
786 if (x.equals("maskOtherLowercase")) return "0x" + hex4(maskOtherLowercase >> 32);
787 if (x.equals("maskOtherUppercase")) return "0x" + hex4(maskOtherUppercase >> 32);
788 if (x.equals("maskOtherAlphabetic")) return "0x" + hex4(maskOtherAlphabetic >> 32);
789 if (x.equals("maskIdeographic")) return "0x" + hex4(maskIdeographic >> 32);
790 if (x.equals("maskIDStart")) return "0x" + hex4(maskIDStart >> 32);
791 if (x.equals("maskIDContinue")) return "0x" + hex4(maskIDContinue >> 32);
792 if (x.equals("valueIgnorable")) return "0x" + hex8(valueIgnorable);
793 if (x.equals("valueJavaUnicodeStart")) return "0x" + hex8(valueJavaUnicodeStart);
794 if (x.equals("valueJavaOnlyStart")) return "0x" + hex8(valueJavaOnlyStart);
795 if (x.equals("valueJavaUnicodePart")) return "0x" + hex8(valueJavaUnicodePart);
796 if (x.equals("valueJavaOnlyPart")) return "0x" + hex8(valueJavaOnlyPart);
797 if (x.equals("valueJavaWhitespace")) return "0x" + hex8(valueJavaWhitespace);
798 if (x.equals("lowJavaStart")) return "0x" + hex8(lowJavaStart);
799 if (x.equals("nonzeroJavaPart")) return "0x" + hex8(nonzeroJavaPart);
800 if (x.equals("bitJavaStart")) return "0x" + hex8(bitJavaStart);
801 if (x.equals("bitJavaPart")) return Long.toString(bitJavaPart);
802 if (x.equals("valueUnicodeStart")) return "0x" + hex8(valueUnicodeStart);
803 if (x.equals("maskIsJavaIdentifierStart")) return "0x" + hex(maskIsJavaIdentifierStart);
804 if (x.equals("maskIsJavaIdentifierPart")) return "0x" + hex(maskIsJavaIdentifierPart);
805 if (x.equals("shiftDigitOffset")) return Long.toString(shiftDigitOffset);
806 if (x.equals("maskDigitOffset")) return "0x" + hex(maskDigitOffset);
807 if (x.equals("maskDigit")) return "0x" + hex(maskDigit);
808 if (x.equals("shiftNumericType")) return Long.toString(shiftNumericType);
809 if (x.equals("maskNumericType")) return "0x" + hex(maskNumericType);
810 if (x.equals("valueNotNumeric")) return "0x" + hex8(valueNotNumeric);
811 if (x.equals("valueDigit")) return "0x" + hex8(valueDigit);
1604 (bits == 4 ? "0xF" : "" + ((1 << bits) - 1)) + ")";
1605 access = extracted;
1606 }
1607 return access;
1608 }
1609
1610 /* The command line arguments are decoded and used to set the following
1611 global variables.
1612 */
1613
// When true, main prints progress messages.
1614 static boolean verbose = false;
// NOTE(review): nobidi and nomirror presumably suppress the bidi and mirrored
// data; the code that reads them is outside this excerpt.
1615 static boolean nobidi = false;
1616 static boolean nomirror = false;
// When true, the "A" table access is generated with 2 instead of 32 as the
// last genAccess argument.
1617 static boolean identifiers = false;
// Selects the C template/output defaults and C-style comment delimiters
// instead of the Java ones.
1618 static boolean Csyntax = false;
// File names; a value still null after option parsing is replaced by the
// corresponding Default* name.
1619 static String TemplateFileName = null;
1620 static String OutputFileName = null;
1621 static String UnicodeSpecFileName = null; // liu
1622 static String SpecialCasingFileName = null;
1623 static String PropListFileName = null;
// Set by -derivedprops; main reads this file and merges it into the property list.
1624 static String DerivedPropsFileName = null;
// Set by the -usecharforbyte option.
1625 static boolean useCharForByte = false;
// Bit widths of the multi-level table; defaults to { 10, 5, 1 } when none are
// given, and is reallocated with length bins by main in search mode.
1626 static int[] sizes;
1627 static int bins = 0; // liu; if > 0, then perform search
// NOTE(review): presumably emits tables as String literals; not used in this excerpt.
1628 static boolean tableAsString = false;
// Set by -latin1 (which also forces plane = 0); cleared by -plane with a
// plane number greater than 0.
1629 static boolean bLatin1 = false;
1630
// Textual description of the effective command line, assembled at the end of
// argument processing.
1631 static String commandLineDescription;
1632
1633 /* Other global variables, equal in length to the "sizes" array. */
1634
// One entry per table level. NOTE(review): these arrays are populated outside
// this excerpt; the meanings below are taken from the names only - confirm.
//   shifts      presumably the right-shift applied at each level
//   zeroextend  presumably a zero-extension mask for each level's entries
//   bytes       presumably the entry width of each level in bytes
//   preshifted  presumably whether a level's entries are stored pre-shifted
//   tables      the table contents for each level, held as longs
1635 static int[] shifts;
1636 static int[] zeroextend;
1637 static int[] bytes;
1638 static boolean[] preshifted;
1639 static long[][] tables;
1640
1641
1642 /* Other global variables */
// Comment delimiters for the generated source, set at the end of argument
// processing: "/*" and " */" when Csyntax is true, "//" and "" otherwise.
1643 static String commentStart;
1644 static String commentEnd;
1732 else {
1733 UnicodeSpecFileName = args[++j];
1734 }
1735 }
1736 else if (args[j].equals("-specialcasing")) {
1737 if (j == args.length -1) {
1738 FAIL("File name missing after -specialcasing");
1739 }
1740 else {
1741 SpecialCasingFileName = args[++j];
1742 }
1743 }
1744 else if (args[j].equals("-proplist")) {
1745 if (j == args.length -1) {
1746 FAIL("File name missing after -proplist");
1747 }
1748 else {
1749 PropListFileName = args[++j];
1750 }
1751 }
1752 else if (args[j].equals("-derivedprops")) {
1753 if (j == args.length -1) {
1754 FAIL("File name missing after -derivedprops");
1755 }
1756 else {
1757 DerivedPropsFileName = args[++j];
1758 }
1759 }
1760 else if (args[j].equals("-plane")) {
1761 if (j == args.length -1) {
1762 FAIL("Plane number missing after -plane");
1763 }
1764 else {
1765 plane = Integer.parseInt(args[++j]);
1766 }
1767 if (plane > 0) {
1768 bLatin1 = false;
1769 }
1770 }
1771 else if ("-usecharforbyte".equals(args[j])) {
1772 useCharForByte = true;
1773 }
1774 else if (args[j].equals("-latin1")) {
1775 bLatin1 = true;
1776 plane = 0;
1777 }
1778 else {
1779 try {
1804 sizes = newsizes;
1805 }
1806 else {
1807 int[] newsizes = { 10, 5, 1 }; // Guy's old defaults for 2.0.14: { 9, 4, 3, 0 }
1808 desc.append("10 5 1]");
1809 sizes = newsizes;
1810 }
1811 }
1812 if (UnicodeSpecFileName == null) { // liu
1813 UnicodeSpecFileName = DefaultUnicodeSpecFileName;
1814 desc.append(" [-spec " + UnicodeSpecFileName + ']');
1815 }
1816 if (SpecialCasingFileName == null) {
1817 SpecialCasingFileName = DefaultSpecialCasingFileName;
1818 desc.append(" [-specialcasing " + SpecialCasingFileName + ']');
1819 }
1820 if (PropListFileName == null) {
1821 PropListFileName = DefaultPropListFileName;
1822 desc.append(" [-proplist " + PropListFileName + ']');
1823 }
1824 if (DerivedPropsFileName == null) {
1825 DerivedPropsFileName = DefaultDerivedPropsFileName;
1826 desc.append(" [-derivedprops " + DerivedPropsFileName + ']');
1827 }
1828 if (TemplateFileName == null) {
1829 TemplateFileName = (Csyntax ? DefaultCTemplateFileName
1830 : DefaultJavaTemplateFileName);
1831 desc.append(" [-template " + TemplateFileName + ']');
1832 }
1833 if (OutputFileName == null) {
1834 OutputFileName = (Csyntax ? DefaultCOutputFileName
1835 : DefaultJavaOutputFileName);
1836 desc.append(" [-o " + OutputFileName + ']');
1837 }
1838 commentStart = (Csyntax ? "/*" : "//");
1839 commentEnd = (Csyntax ? " */" : "");
1840 commandLineDescription = desc.toString();
1841 }
1842
1843 private static void searchBins(long[] map, int binsOccupied) throws Exception {
1844 int bitsFree = 16;
1845 for (int i=0; i<binsOccupied; ++i) bitsFree -= sizes[i];
1846 if (binsOccupied == (bins-1)) {
1847 sizes[binsOccupied] = bitsFree;
1959 * <li> Generate the source code for the class Character by performing
1960 * macro processing on a template file.
1961 * </ol>
1962 *
1963 * @param args the command line arguments, as an array of String
1964 *
1965 * @see GenerateCharacter#processArgs
1966 * @see UnicodeSpec#readSpecFile
1967 * @see GenerateCharacter#buildMap
1968 * @see GenerateCharacter#buildTable
1969 * @see GenerateCharacter#generateCharacterClass
1970 */
1971
1972 public static void main(String[] args) {
1973 processArgs(args);
1974 try {
1975
1976 UnicodeSpec[] data = UnicodeSpec.readSpecFile(new File(UnicodeSpecFileName), plane);
1977 specialCaseMaps = SpecialCaseMap.readSpecFile(new File(SpecialCasingFileName), plane);
1978 PropList propList = PropList.readSpecFile(new File(PropListFileName), plane);
1979 propList.putAll(PropList.readSpecFile(new File(DerivedPropsFileName), plane));
1980
1981 if (verbose) {
1982 System.out.println(data.length + " items read from Unicode spec file " + UnicodeSpecFileName); // liu
1983 }
1984 long[] map = buildMap(data, specialCaseMaps, propList);
1985 if (verbose) {
1986 System.err.println("Completed building of initial map");
1987 }
1988
1989 if (bins == 0) {
1990 generateForSizes(map);
1991 }
1992 else {
1993 while (bins > 0) {
1994 sizes = new int[bins];
1995 searchBins(map, 0);
1996 --bins;
1997 }
1998 }
1999 if (verbose && false) {
|