import java.util.regex.*;
import java.util.*;
import java.io.*;

/**
 * Reads a script-assignment file (lines of the form
 * {@code XXXX[..YYYY] ; Script_Name # comment}) and prints to standard output:
 * <ol>
 *   <li>one line per code point with its upper-cased script name, and</li>
 *   <li>the source text of a {@code Scripts} class containing a
 *       {@code UnicodeScript} enum plus two parallel lookup tables
 *       ({@code scriptStarts} and {@code scripts}).</li>
 * </ol>
 *
 * Usage: one argument, the path of the input file.
 */
public class CharacterScript {

    /** Matches {@code start[..end] ; name # comment}; groups: 1=start, 2=end (optional), 3=name. */
    private static final Pattern RANGE_LINE = Pattern.compile(
        "(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*");

    /** Script index stored in a range's third slot when the range belongs to no script. */
    private static final int UNKNOWN_SCRIPT = -1;

    // generate the code needed for j.l.C.UnicodeScript
    /** Output hook that is currently disabled; un-comment the body to enable it. */
    static void fortest(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /** Writes formatted text of the generated source to standard output. */
    static void print(String fmt, Object... o) {
        System.out.printf(fmt, o);
    }

    /** Debug trace hook that is currently disabled; un-comment the body to enable it. */
    static void debug(String fmt, Object... o) {
        //System.out.printf(fmt, o);
    }

    /**
     * Entry point.
     *
     * @param args exactly one element: the path of the script-assignment file.
     *             Any other count prints the usage line and exits with status 1.
     *             Any exception raised while processing is printed as a stack
     *             trace and otherwise ignored.
     */
    public static void main(String args[]) {
        try {
            if (args.length != 1) {
                System.out.println("java CharacterScript script.txt out");
                System.exit(1);
            }

            // script name -> index, numbered in order of first appearance
            Map<String, Integer> scriptMap = new HashMap<String, Integer>();
            List<int[]> scripts = readRanges(args[0], scriptMap);
            if (scripts.isEmpty()) {
                // Nothing to generate; previously this path ended in a
                // NullPointerException on a bogus {-1, -1, null} entry.
                System.err.printf("No script ranges found in %s%n", args[0]);
                return;
            }

            debug("-----------------%n");
            debug("Total scripts=%s%n", scriptMap.size());
            debug("-----------------%n%n");

            // Invert the map: names[index] is the script name as spelled in the input.
            String[] names = new String[scriptMap.size()];
            for (Map.Entry<String, Integer> e : scriptMap.entrySet()) {
                names[e.getValue().intValue()] = e.getKey();
            }

            // Per-code-point dump, in input order.
            for (int[] range : scripts) {
                String name = names[range[2]].toUpperCase(Locale.ENGLISH);
                for (int cp = range[0]; cp <= range[1]; cp++) {
                    System.out.printf("%05X %s%n", cp, name);
                }
            }

            // Order ranges by their first code point before consolidating.
            Collections.sort(scripts, new Comparator<int[]>() {
                public int compare(int[] a1, int[] a2) {
                    return a1[0] < a2[0] ? -1 : (a1[0] == a2[0] ? 0 : 1);
                }
            });

            List<int[]> list = consolidate(scripts);

            for (int[] a : list) {
                debug("0x%05x, 0x%05x %s%n", a[0], a[1], scriptName(names, a));
            }
            debug("--->total=%d%n", list.size());

            emit(names, list);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the input file into a list of {@code {start, end, scriptIndex}}
     * triples, merging consecutive lines that name the same script and whose
     * code points are contiguous.
     *
     * @param path      file to read
     * @param scriptMap filled with script name -> index; an index is assigned
     *                  the first time a range of that script is recorded
     * @return the ranges in input order; empty if no line matched
     * @throws IOException if the file cannot be opened or read
     */
    private static List<int[]> readRanges(String path, Map<String, Integer> scriptMap)
            throws IOException {
        List<int[]> ranges = new ArrayList<int[]>();
        Matcher m = RANGE_LINE.matcher("");

        // The range currently being accumulated; prevS == -1 means "none yet".
        int prevS = -1;
        int prevE = -1;
        String prevN = null;

        BufferedReader sbfr = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = sbfr.readLine()) != null) {
                if (line.length() <= 1 || line.charAt(0) == '#') {
                    continue;
                }
                m.reset(line);
                if (!m.matches()) {
                    debug("Warning: Unrecognized line <%s>%n", line);
                    continue;
                }
                int start = Integer.parseInt(m.group(1), 16);
                int end = (m.group(2) == null) ? start
                                               : Integer.parseInt(m.group(2), 16);
                String name = m.group(3);
                if (name.equals(prevN) && start == prevE + 1) {
                    // Same script and directly adjacent: extend the pending range.
                    prevE = end;
                } else {
                    if (prevS != -1) {
                        addRange(ranges, scriptMap, prevS, prevE, prevN);
                    }
                    debug("%x-%x\t%s%n", prevS, prevE, prevN);
                    prevS = start;
                    prevE = end;
                    prevN = name;
                }
            }
        } finally {
            sbfr.close();
        }

        // Flush the last pending range, if any line matched at all.
        if (prevS != -1) {
            addRange(ranges, scriptMap, prevS, prevE, prevN);
        }
        debug("%x-%x\t%s%n", prevS, prevE, prevN);
        return ranges;
    }

    /** Appends {@code {start, end, index(name)}} to {@code ranges}, registering {@code name} if new. */
    private static void addRange(List<int[]> ranges, Map<String, Integer> scriptMap,
                                 int start, int end, String name) {
        Integer index = scriptMap.get(name);
        if (index == null) {
            index = Integer.valueOf(scriptMap.size());
            scriptMap.put(name, index);
        }
        ranges.add(new int[] { start, end, index.intValue() });
    }

    /**
     * Builds the lookup-table ranges from the sorted script ranges.
     *
     * Consolidation: there are lots of "reserved" code points embedded in
     * otherwise "sequential" blocks. To make the lookup table smaller, those
     * separated segments are combined, on the assumption that the lookup
     * implementation checks
     * {@code Character.getType() != Character.UNASSIGNED} first (returning
     * UNKNOWN for unassigned code points).
     *
     * For each gap between two ranges:
     * <ul>
     *   <li>if the gap contains a code point the running JDK reports as
     *       assigned, an explicit unknown-script range is inserted;</li>
     *   <li>otherwise, if both neighbours have the same script they are
     *       merged into one range;</li>
     *   <li>otherwise the earlier range is stretched to end just before the
     *       later one.</li>
     * </ul>
     * The arrays in {@code scripts} are modified in place.
     *
     * @param scripts non-empty list of ranges sorted by start code point
     * @return the consolidated ranges, in ascending order
     */
    private static List<int[]> consolidate(List<int[]> scripts) {
        List<int[]> list = new ArrayList<int[]>();
        int[] last = scripts.get(0);
        list.add(last);

        for (int i = 1; i < scripts.size(); i++) {
            int[] cur = scripts.get(i);
            if (cur[0] != last[1] + 1) {
                if (hasAssignedCodePoint(last[1] + 1, cur[0])) {
                    // surrogates only?
                    list.add(new int[] { last[1] + 1, cur[0] - 1, UNKNOWN_SCRIPT });
                } else if (last[2] == cur[2]) {
                    // combine
                    last[1] = cur[1];
                    continue;
                } else {
                    // expand last
                    last[1] = cur[0] - 1;
                }
            }
            list.add(cur);
            last = cur;
        }
        return list;
    }

    /** Returns true if any code point in {@code [from, to)} is not {@code Character.UNASSIGNED}. */
    private static boolean hasAssignedCodePoint(int from, int to) {
        for (int cp = from; cp < to; cp++) {
            if (Character.getType(cp) != Character.UNASSIGNED) {
                debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
                return true;
            }
        }
        return false;
    }

    /** Returns the upper-cased script name for a range, or "UNKNOWN" for the unknown-script marker. */
    private static String scriptName(String[] names, int[] range) {
        if (range[2] == UNKNOWN_SCRIPT) {
            return "UNKNOWN";
        }
        return names[range[2]].toUpperCase(Locale.US);
    }

    /**
     * Prints the generated {@code Scripts} class: the enum constants (one per
     * script plus UNKNOWN) followed by the {@code scriptStarts} and
     * {@code scripts} tables, which have one entry per consolidated range.
     * A trailing UNKNOWN entry is added when the last range stops short of
     * {@code Character.MAX_CODE_POINT}.
     *
     * @param names script names indexed by script index
     * @param list  non-empty consolidated ranges in ascending order
     */
    private static void emit(String[] names, List<int[]> list) {
        //////////////////OUTPUT//////////////////////////////////
        print("public class Scripts {%n%n");
        print(" public static enum UnicodeScript {%n");
        for (int i = 0; i < names.length; i++) {
            print(" /**%n * Unicode script \"%s\".%n */%n", names[i]);
            print(" %s,%n%n", names[i].toUpperCase(Locale.US));
        }
        print(" /**%n * Unicode script \"Unknown\".%n */%n UNKNOWN;%n%n");

        int[] last = list.get(list.size() - 1);
        boolean needsTail = last[1] != Character.MAX_CODE_POINT;

        // lookup table
        print(" private static final int[] scriptStarts = {%n");
        for (int[] a : list) {
            String name = scriptName(names, a);
            if (a[0] < 0x10000) {
                print(" 0x%04X, // %04X..%04X; %s%n",
                      a[0], a[0], a[1], name);
            } else {
                print(" 0x%05X, // %05X..%05X; %s%n",
                      a[0], a[0], a[1], name);
            }
        }
        if (needsTail) {
            print(" 0x%05X // %05X..%06X; %s%n",
                  last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
                  "UNKNOWN");
        }
        print("%n };%n%n");

        print(" private static final UnicodeScript[] scripts = {%n");
        for (int[] a : list) {
            print(" %s,%n", scriptName(names, a));
        }
        if (needsTail) {
            print(" UNKNOWN%n");
        }
        print(" };%n");
        print(" }%n");
        print("}%n");
    }
}