New src/share/classes/sun/nio/cs/ext/EUC

   1 /*
   2  * Copyright 2009 Sun Microsystems, Inc.  All Rights Reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Sun designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Sun in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  22  * CA 95054 USA or visit www.sun.com if you need additional information or
  23  * have any questions.
  24  */
  25 
  26 package sun.nio.cs.ext;
  27 
  28 import java.io.*;
  29 import java.nio.CharBuffer;
  30 import java.nio.ByteBuffer;
  31 import java.nio.charset.Charset;
  32 import java.nio.charset.CharsetDecoder;
  33 import java.nio.charset.CharsetEncoder;
  34 import java.nio.charset.CoderResult;
  35 import java.util.Arrays;
  36 import sun.nio.cs.HistoricallyNamedCharset;
  37 import sun.nio.cs.Surrogate;
  38 import static sun.nio.cs.CharsetMapping.*;
  39 
  40 public class EUC_TW extends Charset implements HistoricallyNamedCharset
  41 {
  42     private static final int SS2 = 0x8E;
  43 
  44     /*
  45        (1) EUC_TW
  46        Second byte of EUC_TW for cs2 is in range of
  47        0xA1-0xB0 for plane 1-16. According to CJKV /163,
  48        plane1 is coded in both cs1 and cs2. This impl
  49        however does not decode the codepoints of plane1
  50        in cs2, so only p2-p7 and p15 are supported in cs2.
  51 
  52        Plane2  0xA2;
  53        Plane3  0xA3;
  54        Plane4  0xA4;
  55        Plane5  0xA5;
  56        Plane6  0xA6;
  57        Plane7  0xA7;
  58        Plane15 0xAF;
  59 
  60        (2) Mapping
  61        The fact that all supplementary characters encoded in EUC_TW are
  62        in 0x2xxxx range gives us the room to optimize the data tables.
  63 
  64        Decoding:
  65        (1) save the lower 16-bit value of all codepoints of b->c mapping
  66            in a String array table  String[plane] b2c.
  67        (2) save "codepoint is supplementary" info (one bit) in a
  68            byte[] b2cIsSupp, so 8 codepoints (same codepoint value, different
  69            plane No) share one byte.
  70 
  71        Encoding:
  72        (1)c->b mappings are stored in
  73           char[]c2b/char[]c2bIndex
  74           char[]c2bSupp/char[]c2bIndexsupp  (indexed by lower 16-bit
  75        (2)byte[] c2bPlane stores the "plane info" of each euc-tw codepoints,
  76           BMP and Supp share the low/high 4 bits of one byte.
  77 
  78        Mapping tables are stored separated in EUC_TWMapping, which
  79        is generated by tool.
  80      */
  81 
  82     public EUC_TW() {
  83         super("x-EUC-TW", ExtendedCharsets.aliasesFor("x-EUC-TW"));
  84     }
  85 
  86     public String historicalName() {
  87         return "EUC_TW";
  88     }
  89 
  90     public boolean contains(Charset cs) {
  91         return ((cs.name().equals("US-ASCII"))
  92                 || (cs instanceof EUC_TW));
  93     }
  94 
  95     public CharsetDecoder newDecoder() {
  96         return new Decoder(this);
  97     }
  98 
  99     public CharsetEncoder newEncoder() {
 100         return new Encoder(this);
 101     }
 102 
 103     public static class Decoder extends CharsetDecoder {
 104         public Decoder(Charset cs) {
 105             super(cs, 2.0f, 2.0f);
 106         }
 107 
 108         char[] c1 = new char[1];
 109         char[] c2 = new char[2];
 110         public char[] toUnicode(int b1, int b2, int p) {
 111             return decode(b1, b2, p, c1, c2);
 112         }
 113 
 114         static final String[] b2c =  EUC_TWMapping.b2c;
 115         static final int b1Min    =  EUC_TWMapping.b1Min;
 116         static final int b1Max    =  EUC_TWMapping.b1Max;
 117         static final int b2Min    =  EUC_TWMapping.b2Min;
 118         static final int b2Max    =  EUC_TWMapping.b2Max;
 119         static final int dbSegSize = b2Max - b2Min + 1;
 120         static final byte[] b2cIsSupp;
 121 
 122         // adjust from cns planeNo to the plane index of b2c
 123         static final byte[] cnspToIndex = new byte[0x100];
 124         static {
 125             Arrays.fill(cnspToIndex, (byte)-1);
 126             cnspToIndex[0xa2] = 1; cnspToIndex[0xa3] = 2; cnspToIndex[0xa4] = 3;
 127             cnspToIndex[0xa5] = 4; cnspToIndex[0xa6] = 5; cnspToIndex[0xa7] = 6;
 128             cnspToIndex[0xaf] = 7;
 129         } 
 130 
 131         //static final BitSet b2cIsSupp;
 132         static {
 133             String b2cIsSuppStr = EUC_TWMapping.b2cIsSuppStr;
 134             // work on a local copy is much faster than operate
 135             // directly on b2cIsSupp
 136             byte[] flag = new byte[b2cIsSuppStr.length() << 1];
 137             int off = 0;
 138             for (int i = 0; i < b2cIsSuppStr.length(); i++) {
 139                 char c = b2cIsSuppStr.charAt(i);
 140                 flag[off++] = (byte)(c >> 8);
 141                 flag[off++] = (byte)(c & 0xff);
 142             }
 143             b2cIsSupp = flag;
 144         }
 145 
 146         static boolean isLegalDB(int b) {
 147            return b >= b1Min && b <= b1Max;
 148         }
 149 
 150         static char[] decode(int b1, int b2, int p, char[] c1, char[] c2)
 151         {
 152             if (b1 < b1Min || b1 > b1Max || b2 < b2Min || b2 > b2Max)
 153                 return null;
 154             int index = (b1 - b1Min) * dbSegSize + b2 - b2Min;
 155             char c = b2c[p].charAt(index); 
 156             if (c == UNMAPPABLE_DECODING)
 157                 return null;
 158             if ((b2cIsSupp[index] & (1 << p)) == 0) {
 159                 c1[0] = c;
 160                 return c1;
 161             } else {
 162                 c2[0] = Surrogate.high(0x20000 + c);
 163                 c2[1] = Surrogate.low(0x20000 + c);
 164                 return c2;
 165             }
 166         }
 167 
 168         private CoderResult decodeArrayLoop(ByteBuffer src,
 169                                             CharBuffer dst)
 170         {
 171             byte[] sa = src.array();
 172             int sp = src.arrayOffset() + src.position();
 173             int sl = src.arrayOffset() + src.limit();
 174 
 175             char[] da = dst.array();
 176             int dp = dst.arrayOffset() + dst.position();
 177             int dl = dst.arrayOffset() + dst.limit();
 178             try {
 179                 while (sp < sl) {
 180                     int byte1 = sa[sp] & 0xff;
 181                     if (byte1 == SS2) { // Codeset 2  G2
 182                         if ( sl - sp < 4)
 183                             return CoderResult.UNDERFLOW;
 184                         int cnsPlane = cnspToIndex[sa[sp + 1] & 0xff];
 185                         if (cnsPlane < 0)
 186                             return CoderResult.malformedForLength(2);
 187                         byte1 = sa[sp + 2] & 0xff;
 188                         int byte2 = sa[sp + 3] & 0xff;
 189                         char[] cc = toUnicode(byte1, byte2, cnsPlane);
 190                         if (cc == null) {
 191                             if (!isLegalDB(byte1) || !isLegalDB(byte2))
 192                                 return CoderResult.malformedForLength(4);
 193                             return CoderResult.unmappableForLength(4);
 194                         }
 195                         if (dl - dp < cc.length)
 196                             return CoderResult.OVERFLOW;
 197                         if (cc.length == 1) {
 198                             da[dp++] = cc[0];
 199                         } else {
 200                             da[dp++] = cc[0];
 201                             da[dp++] = cc[1];
 202                         }
 203                         sp += 4;
 204                     } else if (byte1 < 0x80) {       // ASCII      G0
 205                         if (dl - dp < 1)
 206                            return CoderResult.OVERFLOW;
 207                         da[dp++] = (char) byte1;
 208                         sp++;
 209                     } else {                    // Codeset 1  G1
 210                         if ( sl - sp < 2) 
 211                             return CoderResult.UNDERFLOW;
 212                         int byte2 = sa[sp + 1] & 0xff;
 213                         char[] cc = toUnicode(byte1, byte2, 0);
 214                         if (cc == null) {
 215                             if (!isLegalDB(byte1) || !isLegalDB(byte2))
 216                                 return CoderResult.malformedForLength(1);
 217                             return CoderResult.unmappableForLength(2);
 218                         }
 219                         if (dl - dp < 1)
 220                             return CoderResult.OVERFLOW;
 221                         da[dp++] = cc[0];
 222                         sp += 2;
 223                     }
 224                 }
 225                 return CoderResult.UNDERFLOW;
 226             } finally {
 227                 src.position(sp - src.arrayOffset());
 228                 dst.position(dp - dst.arrayOffset());
 229             }
 230         }
 231 
 232         private CoderResult decodeBufferLoop(ByteBuffer src,
 233                                              CharBuffer dst)
 234         {
 235             int mark = src.position();
 236             try {
 237                 while (src.hasRemaining()) {
 238                     int byte1 = src.get() & 0xff;
 239                     if (byte1 == SS2) {            // Codeset 2  G2
 240                         if ( src.remaining() < 3)
 241                             return CoderResult.UNDERFLOW;
 242                         int cnsPlane = cnspToIndex[src.get() & 0xff];
 243                         if (cnsPlane < 0)
 244                             return CoderResult.malformedForLength(2);
 245                         byte1 = src.get() & 0xff;
 246                         int byte2 = src.get() & 0xff;
 247                         char[] cc = toUnicode(byte1, byte2, cnsPlane);
 248                         if (cc == null) {
 249                             if (!isLegalDB(byte1) || !isLegalDB(byte2))
 250                                 return CoderResult.malformedForLength(4);
 251                             return CoderResult.unmappableForLength(4);
 252                         }
 253                         if (dst.remaining() < cc.length)
 254                             return CoderResult.OVERFLOW;
 255                         if (cc.length == 1) {
 256                             dst.put(cc[0]);
 257                         } else {
 258                             dst.put(cc[0]);
 259                             dst.put(cc[1]);
 260                         }
 261                         mark += 4;
 262                     } else if (byte1 < 0x80) {             // ASCII      G0
 263                         if (!dst.hasRemaining())
 264                            return CoderResult.OVERFLOW;
 265                         dst.put((char) byte1);
 266                         mark++;
 267                     } else {                          // Codeset 1  G1
 268                         if (!src.hasRemaining())
 269                             return CoderResult.UNDERFLOW;
 270                         int byte2 = src.get() & 0xff;
 271                         char[] cc = toUnicode(byte1, byte2, 0);
 272                         if (cc == null) {
 273                             if (!isLegalDB(byte1) || !isLegalDB(byte2))
 274                                 return CoderResult.malformedForLength(1);
 275                             return CoderResult.unmappableForLength(2);
 276                         }
 277                         if (!dst.hasRemaining())
 278                             return CoderResult.OVERFLOW;
 279                         dst.put(cc[0]);
 280                         mark +=2;
 281                     }
 282                }
 283                return CoderResult.UNDERFLOW;
 284             } finally {
 285                 src.position(mark);
 286             }
 287         }
 288 
 289         protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst)
 290         {
 291             if (src.hasArray() && dst.hasArray())
 292                 return decodeArrayLoop(src, dst);
 293             else
 294                 return decodeBufferLoop(src, dst);
 295         }
 296     }
 297 
 298     public static class Encoder extends CharsetEncoder {
 299         private byte[] bb = new byte[4];
 300 
 301         public Encoder(Charset cs) {
 302             super(cs, 4.0f, 4.0f);
 303         }
 304 
 305         public boolean canEncode(char c) {
 306             return (c <= '\u007f' || toEUC(c, bb) != -1);
 307         }
 308 
 309         public boolean canEncode(CharSequence cs) {
 310             int i = 0;
 311             while (i < cs.length()) {
 312                 char c = cs.charAt(i++);
 313                 if (Surrogate.isHigh(c)) {
 314                     if (i == cs.length())
 315                         return false;
 316                     char low = cs.charAt(i++);
 317                     if (!Surrogate.isLow(low) || toEUC(c, low, bb) == -1)
 318                         return false;     
 319                 } else if (!canEncode(c)) {
 320                     return false;
 321                 }
 322             }
 323             return true;
 324         }
 325 
 326         public int toEUC(char hi, char low, byte[] bb) {
 327             return encode(hi, low, bb);
 328         }
 329 
 330         public int toEUC(char c, byte[] bb) {
 331             return encode(c, bb);
 332         }
 333 
 334         private CoderResult encodeArrayLoop(CharBuffer src,
 335                                             ByteBuffer dst)
 336         {
 337             char[] sa = src.array();
 338             int sp = src.arrayOffset() + src.position();
 339             int sl = src.arrayOffset() + src.limit();
 340 
 341             byte[] da = dst.array();
 342             int dp = dst.arrayOffset() + dst.position();
 343             int dl = dst.arrayOffset() + dst.limit();
 344 
 345             int inSize;
 346             int outSize;
 347 
 348             try {
 349                 while (sp < sl) {
 350                     char c = sa[sp];
 351                     inSize = 1;
 352                     if (c < 0x80) {  // ASCII
 353                         bb[0] = (byte)c;
 354                         outSize = 1;
 355                     } else {
 356                         outSize = toEUC(c, bb);
 357                         if (outSize == -1) {
 358                             // to check surrogates only after BMP failed
 359                             // has the benefit of improving the BMP encoding
 360                             // 10% faster, with the price of the slowdown of
 361                             // supplementary character encoding. given the use
 362                             // of supplementary characters is really rare, this
 363                             // is something worth doing.
 364                             if (Surrogate.isHigh(c)) {
 365                                 if ((sp + 1) == sl)
 366                                     return CoderResult.UNDERFLOW;
 367                                 if (!Surrogate.isLow(sa[sp + 1]))
 368                                     return CoderResult.malformedForLength(1);
 369                                 outSize = toEUC(c, sa[sp+1], bb);
 370                                     inSize = 2;
 371                             } else if (Surrogate.isLow(c)) {
 372                                 return CoderResult.malformedForLength(1);
 373                             }
 374                         }
 375                     }
 376                     if (outSize == -1)
 377                         return CoderResult.unmappableForLength(inSize);
 378                     if ( dl - dp < outSize)
 379                         return CoderResult.OVERFLOW;
 380                     for (int i = 0; i < outSize; i++)
 381                         da[dp++] = bb[i];
 382                     sp  += inSize;
 383                 }
 384                 return CoderResult.UNDERFLOW;
 385             } finally {
 386                 src.position(sp - src.arrayOffset());
 387                 dst.position(dp - dst.arrayOffset());
 388             }
 389         }
 390 
 391         private CoderResult encodeBufferLoop(CharBuffer src,
 392                                              ByteBuffer dst)
 393         {
 394             int outSize;
 395             int inSize;
 396             int mark = src.position();
 397 
 398             try {
 399                 while (src.hasRemaining()) {
 400                     inSize = 1;
 401                     char c = src.get();
 402                     if (c < 0x80) {   // ASCII
 403                         outSize = 1;
 404                         bb[0] = (byte)c;
 405                     } else {
 406                         outSize = toEUC(c, bb);
 407                         if (outSize == -1) {
 408                             if (Surrogate.isHigh(c)) {
 409                                 if (!src.hasRemaining())
 410                                     return CoderResult.UNDERFLOW;
 411                                 char c2 = src.get();
 412                                 if (!Surrogate.isLow(c2))
 413                                     return CoderResult.malformedForLength(1);
 414                                 outSize = toEUC(c, c2, bb);
 415                                 inSize = 2;
 416                             } else if (Surrogate.isLow(c)) {
 417                                 return CoderResult.malformedForLength(1);
 418                             }
 419                         }
 420                     }
 421                     if (outSize == -1)
 422                         return CoderResult.unmappableForLength(inSize);
 423                     if (dst.remaining() < outSize)
 424                         return CoderResult.OVERFLOW;
 425                     for (int i = 0; i < outSize; i++)
 426                         dst.put((byte)bb[i]);
 427                     mark += inSize;
 428                 }
 429                 return CoderResult.UNDERFLOW;
 430             } finally {
 431                 src.position(mark);
 432             }
 433         }
 434 
 435         protected CoderResult encodeLoop(CharBuffer src, ByteBuffer dst)
 436         {
 437             if (src.hasArray() && dst.hasArray())
 438                 return encodeArrayLoop(src, dst);
 439             else
 440                 return encodeBufferLoop(src, dst);
 441         }
 442 
 443         static int encode(char hi, char low, byte[] bb) {
 444             int c = Surrogate.toUCS4(hi, low);
 445             if ((c & 0xf0000) != 0x20000)
 446                 return -1;
 447             c -= 0x20000;
 448             int index = c2bSuppIndex[c >> 8];
 449             if (index  == UNMAPPABLE_ENCODING)
 450                 return -1;
 451             index = index + (c & 0xff);
 452             int db = c2bSupp[index];
 453             if (db == UNMAPPABLE_ENCODING)
 454                 return -1;
 455             int p = (c2bPlane[index] >> 4) & 0xf;
 456             bb[0] = (byte)SS2;
 457             bb[1] = (byte)(0xa0 | p);
 458             bb[2] = (byte)(db >> 8);
 459             bb[3] = (byte)db;
 460             return 4;
 461         }
 462     
 463         static int encode(char c, byte[] bb) {
 464             int index = c2bIndex[c >> 8];
 465             if (index  == UNMAPPABLE_ENCODING)
 466                 return -1;
 467             index = index + (c & 0xff);
 468             int db = c2b[index];
 469             if (db == UNMAPPABLE_ENCODING)
 470                 return -1;
 471             int p = c2bPlane[index] & 0xf;
 472             if (p == 0) {
 473                 bb[0] = (byte)(db >> 8);
 474                 bb[1] = (byte)db;
 475                 return 2;
 476             } else {
 477                 bb[0] = (byte)SS2;
 478                 bb[1] = (byte)(0xa0 | p);
 479                 bb[2] = (byte)(db >> 8);
 480                 bb[3] = (byte)db;
 481                 return 4;
 482             }
 483         }
 484 
 485         static final char[] c2b;
 486         static final char[] c2bIndex;
 487         static final char[] c2bSupp;
 488         static final char[] c2bSuppIndex;
 489         static final byte[] c2bPlane;
 490         static {
 491             int b1Min    =  Decoder.b1Min;
 492             int b1Max    =  Decoder.b1Max;
 493             int b2Min    =  Decoder.b2Min;
 494             int b2Max    =  Decoder.b2Max;
 495             int dbSegSize = Decoder.dbSegSize;
 496             String[] b2c = Decoder.b2c;
 497             byte[] b2cIsSupp = Decoder.b2cIsSupp;
 498 
 499             c2bIndex = EUC_TWMapping.c2bIndex;
 500             c2bSuppIndex = EUC_TWMapping.c2bSuppIndex;
 501             char[] c2b0 = new char[EUC_TWMapping.C2BSIZE];
 502             char[] c2bSupp0 = new char[EUC_TWMapping.C2BSUPPSIZE];
 503             byte[] c2bPlane0 = new byte[Math.max(EUC_TWMapping.C2BSIZE,
 504                                                  EUC_TWMapping.C2BSUPPSIZE)];
 505                         
 506             Arrays.fill(c2b0, (char)UNMAPPABLE_ENCODING);
 507             Arrays.fill(c2bSupp0, (char)UNMAPPABLE_ENCODING);
 508 
 509             for (int p = 0; p < b2c.length; p++) {
 510                 String db = b2c[p];
 511                 /*
 512                    adjust the "plane" from 0..7 to 0, 2, 3, 4, 5, 6, 7, 0xf,
 513                    which helps balance between footprint (to save the plane
 514                    info in 4 bits) and runtime performance (to require only
 515                    one operation "0xa0 | plane" to encode the plane byte)
 516                 */
 517                 int plane = p;
 518                 if (plane == 7)
 519                     plane = 0xf;
 520                 else if (plane != 0)
 521                     plane = p + 1;
 522 
 523                 int off = 0;
 524                 for (int b1 = b1Min; b1 <= b1Max; b1++) {
 525                     for (int b2 = b2Min; b2 <= b2Max; b2++) {
 526                         char c = db.charAt(off);
 527                         if (c != UNMAPPABLE_DECODING) {
 528                             if ((b2cIsSupp[off] & (1 << p)) != 0) {
 529                                 int index = c2bSuppIndex[c >> 8] + (c&0xff);
 530                                 c2bSupp0[index] = (char)((b1 << 8) + b2);
 531                                 c2bPlane0[index] |= (byte)(plane << 4); 
 532                             } else {
 533                                 int index = c2bIndex[c >> 8] + (c&0xff);
 534                                 c2b0[index] = (char)((b1 << 8) + b2);
 535                                 c2bPlane0[index] |= (byte)plane;
 536                             }
 537                         }
 538                         off++;
 539                     }
 540                 }
 541             }
 542             c2b = c2b0;
 543             c2bSupp = c2bSupp0;
 544             c2bPlane = c2bPlane0;
 545         }
 546     }
 547 }
 548