// Recovered from a patch that adds
// make/tools/src/build/tools/generatecharacter/CharacterScript.java
import java.util.regex.*;
import java.util.*;
import java.io.*;

/**
 * Build-time generator for the script lookup tables of
 * {@code java.lang.Character.UnicodeScript}.
 *
 * <p>The tool reads a script data file whose data lines have the form
 * {@code XXXX[..YYYY] ; ScriptName # comment}, and writes to standard output:
 * <ol>
 *   <li>one line per code point with the upper-cased script name, and</li>
 *   <li>the source of a {@code Scripts} class holding a {@code UnicodeScript}
 *       enum plus two parallel tables, {@code scriptStarts} and
 *       {@code scripts}.</li>
 * </ol>
 *
 * <p>A range is stored as an {@code int[3]}: {@code {first, last, scriptId}}.
 * Script ids are assigned in the order in which scripts are first recorded;
 * {@code -1} stands for "unknown".
 */
public class CharacterScript {

    /** Matches one data line: start, optional "..end", script name, trailing comment. */
    private static final Pattern RANGE_LINE = Pattern.compile(
        "(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*");

    /** Script id used for gap ranges that belong to no script. */
    private static final int UNKNOWN_ID = -1;

    // generate the code needed for j.l.C.UnicodeScript
    // (disabled trace hook; enable the printf to get test output)
    static void fortest(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /** Writes formatted text to standard output. */
    static void print(String fmt, Object... o) {
        System.out.printf(fmt, o);
    }

    // disabled trace hook; enable the printf to get diagnostics
    static void debug(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /**
     * Entry point.
     *
     * @param args exactly one element: the path of the script data file.
     *             Any other argument count prints a usage line and exits
     *             with status 1.
     */
    public static void main(String args[]) {
        try {
            if (args.length != 1) {
                System.out.println("java CharacterScript script.txt out");
                System.exit(1);
            }

            Map<String, Integer> scriptMap = new HashMap<String, Integer>();
            List<int[]> scripts;
            BufferedReader sbfr = new BufferedReader(new FileReader(args[0]));
            try {
                scripts = readRanges(sbfr, scriptMap);
            } finally {
                // The reader used to be leaked; release it on every path.
                sbfr.close();
            }

            if (scripts.isEmpty()) {
                // Nothing to generate. Previously this case ran into a
                // NullPointerException on the null script name.
                System.err.printf("No script ranges found in <%s>%n", args[0]);
                return;
            }

            debug("-----------------%n");
            debug("Total scripts=%s%n", scriptMap.size());
            debug("-----------------%n%n");

            // Invert the name -> id map into an id-indexed array.
            String[] names = new String[scriptMap.size()];
            for (Map.Entry<String, Integer> e : scriptMap.entrySet()) {
                names[e.getValue().intValue()] = e.getKey();
            }

            // Per-code-point dump, in input order (i.e. before sorting).
            for (int[] range : scripts) {
                String name = names[range[2]].toUpperCase(Locale.ENGLISH);
                for (int cp = range[0]; cp <= range[1]; cp++) {
                    System.out.printf("%05X %s%n", cp, name);
                }
            }

            // Order by first code point. The sort is stable, so ranges with
            // the same start keep their input order. An explicit comparison
            // is used instead of subtraction.
            Collections.sort(scripts, new Comparator<int[]>() {
                public int compare(int[] a1, int[] a2) {
                    return a1[0] < a2[0] ? -1 : (a1[0] == a2[0] ? 0 : 1);
                }
            });

            List<int[]> list = consolidate(scripts);

            for (int[] a : list) {
                debug("0x%05x, 0x%05x %s%n", a[0], a[1], tableName(names, a));
            }
            debug("--->total=%d%n", list.size());

            emit(names, list);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the data lines of {@code in} into ranges, merging a line into the
     * previous range when it names the same script and starts right after it.
     * Lines of length 0 or 1, lines starting with '#', and lines that do not
     * match {@link #RANGE_LINE} are skipped.
     *
     * @param in        source of the script data; not closed by this method
     * @param scriptMap receives a name to id entry for every script recorded
     * @return the ranges in input order; empty when no data line matched
     * @throws IOException if reading fails
     */
    private static List<int[]> readRanges(BufferedReader in,
                                          Map<String, Integer> scriptMap)
        throws IOException
    {
        // A list replaces the former fixed int[1024][3] buffer, which
        // overflowed on inputs with more than 1024 distinct ranges.
        List<int[]> ranges = new ArrayList<int[]>();
        Matcher m = RANGE_LINE.matcher("");

        int prevS = -1;
        int prevE = -1;
        String prevN = null;     // null until the first data line is seen

        String line;
        while ((line = in.readLine()) != null) {
            if (line.length() <= 1 || line.charAt(0) == '#') {
                continue;
            }
            m.reset(line);
            if (!m.matches()) {
                debug("Warning: Unrecognized line <%s>%n", line);
                continue;
            }
            int start = Integer.parseInt(m.group(1), 16);
            int end = (m.group(2) == null) ? start
                                           : Integer.parseInt(m.group(2), 16);
            String name = m.group(3);
            if (name.equals(prevN) && start == prevE + 1) {
                prevE = end;     // contiguous, same script: extend
                continue;
            }
            if (prevN != null) {
                ranges.add(new int[] { prevS, prevE, scriptId(scriptMap, prevN) });
            }
            debug("%x-%x\t%s%n", prevS, prevE, prevN);
            prevS = start;
            prevE = end;
            prevN = name;
        }

        // Flush the pending range, if there is one.
        if (prevN != null) {
            ranges.add(new int[] { prevS, prevE, scriptId(scriptMap, prevN) });
            debug("%x-%x\t%s%n", prevS, prevE, prevN);
        }
        return ranges;
    }

    /** Returns the id of {@code name}, assigning the next free id on first use. */
    private static int scriptId(Map<String, Integer> scriptMap, String name) {
        Integer id = scriptMap.get(name);
        if (id == null) {
            id = Integer.valueOf(scriptMap.size());
            scriptMap.put(name, id);
        }
        return id.intValue();
    }

    /**
     * Builds the lookup-table entries from ranges sorted by first code point.
     *
     * <p>Consolidation: there are lots of "reserved" code points embedded in
     * otherwise "sequential" blocks. To make the lookup table smaller, a gap
     * made up only of unassigned code points is absorbed into the preceding
     * entry (and the following entry too, when both have the same script),
     * on the assumption that the lookup implementation checks
     * {@code Character.getType() != Character.UNASSIGNED} first and returns
     * UNKNOWN for unassigned code points. A gap containing at least one
     * assigned code point becomes an explicit unknown entry.
     *
     * <p>The {@code last} field of the input arrays is modified in place.
     * Assignment status comes from {@link Character#getType(int)} of the
     * running JDK.
     *
     * @param sorted non-empty list of ranges ordered by first code point
     * @return the consolidated entries
     */
    private static List<int[]> consolidate(List<int[]> sorted) {
        List<int[]> list = new ArrayList<int[]>();
        int[] last = sorted.get(0);
        list.add(last);

        for (int i = 1; i < sorted.size(); i++) {
            int[] cur = sorted.get(i);
            if (cur[0] != last[1] + 1) {
                if (hasAssigned(last[1] + 1, cur[0])) {
                    // surrogates only?
                    list.add(new int[] { last[1] + 1, cur[0] - 1, UNKNOWN_ID });
                } else if (last[2] == cur[2]) {
                    // combine
                    last[1] = cur[1];
                    continue;
                } else {
                    // expand last
                    last[1] = cur[0] - 1;
                }
            }
            list.add(cur);
            last = cur;
        }
        return list;
    }

    /** Tells whether any code point in {@code [from, to)} is assigned. */
    private static boolean hasAssigned(int from, int to) {
        for (int cp = from; cp < to; cp++) {
            if (Character.getType(cp) != Character.UNASSIGNED) {
                debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
                return true;
            }
        }
        return false;
    }

    /** Returns the enum constant name for a table entry. */
    private static String tableName(String[] names, int[] entry) {
        return (entry[2] == UNKNOWN_ID) ? "UNKNOWN"
                                        : names[entry[2]].toUpperCase(Locale.US);
    }

    /**
     * Prints the generated {@code Scripts} class: the enum constants in id
     * order followed by UNKNOWN, then the {@code scriptStarts} and
     * {@code scripts} tables. When the last entry stops short of
     * {@link Character#MAX_CODE_POINT}, a closing UNKNOWN entry is appended
     * to both tables.
     *
     * @param names script names indexed by script id
     * @param list  non-empty consolidated table entries
     */
    private static void emit(String[] names, List<int[]> list) {
        print("public class Scripts {%n%n");
        print(" public static enum UnicodeScript {%n");
        for (String scriptName : names) {
            print(" /**%n * Unicode script \"%s\".%n */%n", scriptName);
            print(" %s,%n%n", scriptName.toUpperCase(Locale.US));
        }
        print(" /**%n * Unicode script \"Unknown\".%n */%n UNKNOWN;%n%n");

        // lookup table
        print(" private static final int[] scriptStarts = {%n");
        for (int[] a : list) {
            String name = tableName(names, a);
            if (a[0] < 0x10000) {
                print(" 0x%04X, // %04X..%04X; %s%n",
                      a[0], a[0], a[1], name);
            } else {
                print(" 0x%05X, // %05X..%05X; %s%n",
                      a[0], a[0], a[1], name);
            }
        }
        int[] last = list.get(list.size() - 1);
        boolean needsTail = last[1] != Character.MAX_CODE_POINT;
        if (needsTail) {
            print(" 0x%05X // %05X..%06X; %s%n",
                  last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
                  "UNKNOWN");
        }
        print("%n };%n%n");

        print(" private static final UnicodeScript[] scripts = {%n");
        for (int[] a : list) {
            print(" %s,%n", tableName(names, a));
        }
        if (needsTail) {
            print(" UNKNOWN%n");
        }
        print(" };%n");
        print(" }%n");
        print("}%n");
    }
}