1 import java.util.regex.*;
   2 import java.util.*;
   3 import java.io.*;
   4 
   5 public class CharacterScript {
   6 
   7     // generate the code needed for j.l.C.UnicodeScript
   8     static void fortest(String fmt, Object... o) {
   9         //System.out.printf(fmt, o);
  10     }
  11 
  12     static void print(String fmt, Object... o) {
  13         System.out.printf(fmt, o);
  14     }
  15 
  16     static void debug(String fmt, Object... o) {
  17         //System.out.printf(fmt, o);
  18     }
  19 
  20     public static void main(String args[]){
  21         try {
  22             if (args.length != 1) {
  23                 System.out.println("java CharacterScript script.txt out");
  24                 System.exit(1);
  25             }
  26 
  27             int i, j;
  28             BufferedReader sbfr = new BufferedReader(new FileReader(args[0]));
  29             HashMap<String,Integer> scriptMap = new HashMap<String,Integer>();
  30             String line = null;
  31 
  32             Matcher m = Pattern.compile("(\\p{XDigit}+)(?:\\.{2}(\\p{XDigit}+))?\\s+;\\s+(\\w+)\\s+#.*").matcher("");
  33 
  34             int prevS = -1;
  35             int prevE = -1;
  36             String prevN = null;
  37             int[][] scripts = new int[1024][3];
  38             int scriptSize = 0;
  39 
  40             while ((line = sbfr.readLine()) != null) {
  41                 if (line.length() <= 1 || line.charAt(0) == '#') {
  42                     continue;
  43                 }
  44                 m.reset(line);
  45                 if (m.matches()) {
  46                     int start = Integer.parseInt(m.group(1), 16);
  47                     int end = (m.group(2)==null)?start
  48                               :Integer.parseInt(m.group(2), 16);
  49                     String name = m.group(3);
  50                     if (name.equals(prevN) && start == prevE + 1) {
  51                         prevE = end;
  52                     } else {
  53                         if (prevS != -1) {
  54                             if (scriptMap.get(prevN) == null) {
  55                                 scriptMap.put(prevN, scriptMap.size());
  56                             }
  57                             scripts[scriptSize][0] = prevS;
  58                             scripts[scriptSize][1] = prevE;
  59                             scripts[scriptSize][2] = scriptMap.get(prevN);
  60                             scriptSize++;
  61                         }
  62                         debug("%x-%x\t%s%n", prevS, prevE, prevN);
  63                         prevS = start; prevE = end; prevN = name;
  64                     }
  65                 } else {
  66                     debug("Warning: Unrecognized line <%s>%n", line);
  67                 }
  68             }
  69 
  70             //last one.
  71             if (scriptMap.get(prevN) == null) {
  72                 scriptMap.put(prevN, scriptMap.size());
  73             }
  74             scripts[scriptSize][0] = prevS;
  75             scripts[scriptSize][1] = prevE;
  76             scripts[scriptSize][2] = scriptMap.get(prevN);
  77             scriptSize++;
  78 
  79             debug("%x-%x\t%s%n", prevS, prevE, prevN);
  80             debug("-----------------%n");
  81             debug("Total scripts=%s%n", scriptMap.size());
  82             debug("-----------------%n%n");
  83 
  84             String[] names = new String[scriptMap.size()];
  85             for (String name: scriptMap.keySet()) {
  86                 names[scriptMap.get(name).intValue()] = name;
  87             }
  88 
  89             for (j = 0; j < scriptSize; j++) {
  90                 for (int cp = scripts[j][0]; cp <= scripts[j][1]; cp++) {
  91                     String name = names[scripts[j][2]].toUpperCase(Locale.ENGLISH);;
  92                     if (cp > 0xffff)
  93                         System.out.printf("%05X    %s%n", cp, name);
  94                     else
  95                         System.out.printf("%05X    %s%n", cp, name);
  96                 }
  97             }
  98 
  99             Arrays.sort(scripts, 0, scriptSize,
 100                         new Comparator<int[]>() {
 101                             public int compare(int[] a1, int[] a2) {
 102                                 return a1[0] - a2[0];
 103                             }
 104                             public boolean compare(Object obj) {
 105                                 return obj == this;
 106                             }
 107                          });
 108 
 109 
 110 
 111             // Consolidation: there are lots of "reserved" code points
 112             // embedded in those otherwise "sequential" blocks.
 113             // To make the lookup table smaller, we combine those
 114             // separated segments with the assumption that the lookup
 115             // implementation checks
 116             //    Character.getType() !=  Character.UNASSIGNED
 117             // first (return UNKNOWN for unassigned)
 118 
 119             ArrayList<int[]> list = new ArrayList();
 120             list.add(scripts[0]);
 121 
 122             int[] last = scripts[0];
 123             for (i = 1; i < scriptSize; i++) {
 124                 if (scripts[i][0] != (last[1] + 1)) {
 125 
 126                     boolean isNotUnassigned = false;
 127                     for (int cp = last[1] + 1; cp < scripts[i][0]; cp++) {
 128                         if (Character.getType(cp) != Character.UNASSIGNED) {
 129                             isNotUnassigned = true;
 130                             debug("Warning: [%x] is ASSIGNED but in NON script%n", cp);
 131                             break;
 132                         }
 133                     }
 134                     if (isNotUnassigned) {
 135                         // surrogates only?
 136                         int[] a = new int[3];
 137                         a[0] = last[1] + 1;
 138                         a[1] = scripts[i][0] - 1;
 139                         a[2] = -1;  // unknown
 140                         list.add(a);
 141                     } else {
 142                         if (last[2] == scripts[i][2]) {
 143                             //combine
 144                             last[1] = scripts[i][1];
 145                             continue;
 146                         } else {
 147                             // expand last
 148                             last[1] = scripts[i][0] - 1;
 149                         }
 150                     }
 151                 }
 152                 list.add(scripts[i]);
 153                 last = scripts[i];
 154             }
 155 
 156             for (i = 0; i < list.size(); i++) {
 157                 int[] a = (int[])list.get(i);
 158                 String name = "UNKNOWN";
 159                 if (a[2] != -1)
 160                     name = names[a[2]].toUpperCase(Locale.US);
 161                 debug("0x%05x, 0x%05x  %s%n", a[0], a[1], name);
 162             }
 163             debug("--->total=%d%n", list.size());
 164 
 165 
 166             //////////////////OUTPUT//////////////////////////////////
 167             print("public class Scripts {%n%n");
 168             print("    public static enum UnicodeScript {%n");
 169             for (i = 0; i < names.length; i++) {
 170                 print("        /**%n         * Unicode script \"%s\".%n         */%n", names[i]);
 171                 print("        %s,%n%n",  names[i].toUpperCase(Locale.US));
 172             }
 173             print("        /**%n         * Unicode script \"Unknown\".%n         */%n        UNKNOWN;%n%n");
 174 
 175 
 176             // lookup table
 177             print("        private static final int[] scriptStarts = {%n");
 178             for (int[] a : list) {
 179                 String name = "UNKNOWN";
 180                 if (a[2] != -1)
 181                     name = names[a[2]].toUpperCase(Locale.US);
 182                 if (a[0] < 0x10000)
 183                     print("            0x%04X,   // %04X..%04X; %s%n",
 184                           a[0], a[0], a[1], name);
 185                 else
 186                     print("            0x%05X,  // %05X..%05X; %s%n",
 187                           a[0], a[0], a[1], name);
 188             }
 189             last = list.get(list.size() -1);
 190             if (last[1] != Character.MAX_CODE_POINT)
 191                 print("            0x%05X   // %05X..%06X; %s%n",
 192                       last[1] + 1, last[1] + 1, Character.MAX_CODE_POINT,
 193                       "UNKNOWN");
 194             print("%n        };%n%n");
 195 
 196             print("        private static final UnicodeScript[] scripts = {%n");
 197             for (int[] a : list) {
 198                 String name = "UNKNOWN";
 199                 if (a[2] != -1)
 200                     name = names[a[2]].toUpperCase(Locale.US);
 201                 print("            %s,%n", name);
 202             }
 203 
 204             if (last[1] != Character.MAX_CODE_POINT)
 205                 print("            UNKNOWN%n");
 206             print("        };%n");
 207             print("    }%n");
 208             print("}%n");
 209 
 210         } catch (Exception e) {
 211             e.printStackTrace();
 212         }
 213     }
 214 }