1 /* 2 * Copyright 2009 Sun Microsystems, Inc. All Rights Reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Sun designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Sun in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, 22 * CA 95054 USA or visit www.sun.com if you need additional information or 23 * have any questions. 24 */ 25 26 package sun.nio.cs.ext; 27 28 import java.io.*; 29 import java.nio.CharBuffer; 30 import java.nio.ByteBuffer; 31 import java.nio.charset.Charset; 32 import java.nio.charset.CharsetDecoder; 33 import java.nio.charset.CharsetEncoder; 34 import java.nio.charset.CoderResult; 35 import java.util.Arrays; 36 import sun.nio.cs.HistoricallyNamedCharset; 37 import sun.nio.cs.Surrogate; 38 import static sun.nio.cs.CharsetMapping.*; 39 40 public class EUC_TW extends Charset implements HistoricallyNamedCharset 41 { 42 private static final int SS2 = 0x8E; 43 44 /* 45 (1) EUC_TW 46 Second byte of EUC_TW for cs2 is in range of 47 0xA1-0xB0 for plane 1-16. According to CJKV /163, 48 plane1 is coded in both cs1 and cs2. This impl 49 however does not decode the codepoints of plane1 50 in cs2, so only p2-p7 and p15 are supported in cs2. 51 52 Plane2 0xA2; 53 Plane3 0xA3; 54 Plane4 0xA4; 55 Plane5 0xA5; 56 Plane6 0xA6; 57 Plane7 0xA7; 58 Plane15 0xAF; 59 60 (2) Mapping 61 The fact that all supplementary characters encoded in EUC_TW are 62 in 0x2xxxx range gives us the room to optimize the data tables. 63 64 Decoding: 65 (1) save the lower 16-bit value of all codepoints of b->c mapping 66 in a String array table String[plane] b2c. 67 (2) save "codepoint is supplementary" info (one bit) in a 68 byte[] b2cIsSupp, so 8 codepoints (same codepoint value, different 69 plane No) share one byte. 70 71 Encoding: 72 (1)c->b mappings are stored in 73 char[]c2b/char[]c2bIndex 74 char[]c2bSupp/char[]c2bIndexsupp (indexed by lower 16-bit 75 (2)byte[] c2bPlane stores the "plane info" of each euc-tw codepoints, 76 BMP and Supp share the low/high 4 bits of one byte. 77 78 Mapping tables are stored separated in EUC_TWMapping, which 79 is generated by tool. 80 */ 81 82 public EUC_TW() { 83 super("x-EUC-TW", ExtendedCharsets.aliasesFor("x-EUC-TW")); 84 } 85 86 public String historicalName() { 87 return "EUC_TW"; 88 } 89 90 public boolean contains(Charset cs) { 91 return ((cs.name().equals("US-ASCII")) 92 || (cs instanceof EUC_TW)); 93 } 94 95 public CharsetDecoder newDecoder() { 96 return new Decoder(this); 97 } 98 99 public CharsetEncoder newEncoder() { 100 return new Encoder(this); 101 } 102 103 public static class Decoder extends CharsetDecoder { 104 public Decoder(Charset cs) { 105 super(cs, 2.0f, 2.0f); 106 } 107 108 char[] c1 = new char[1]; 109 char[] c2 = new char[2]; 110 public char[] toUnicode(int b1, int b2, int p) { 111 return decode(b1, b2, p, c1, c2); 112 } 113 114 static final String[] b2c = EUC_TWMapping.b2c; 115 static final int b1Min = EUC_TWMapping.b1Min; 116 static final int b1Max = EUC_TWMapping.b1Max; 117 static final int b2Min = EUC_TWMapping.b2Min; 118 static final int b2Max = EUC_TWMapping.b2Max; 119 static final int dbSegSize = b2Max - b2Min + 1; 120 static final byte[] b2cIsSupp; 121 122 // adjust from cns planeNo to the plane index of b2c 123 static final byte[] cnspToIndex = new byte[0x100]; 124 static { 125 Arrays.fill(cnspToIndex, (byte)-1); 126 cnspToIndex[0xa2] = 1; cnspToIndex[0xa3] = 2; cnspToIndex[0xa4] = 3; 127 cnspToIndex[0xa5] = 4; cnspToIndex[0xa6] = 5; cnspToIndex[0xa7] = 6; 128 cnspToIndex[0xaf] = 7; 129 } 130 131 //static final BitSet b2cIsSupp; 132 static { 133 String b2cIsSuppStr = EUC_TWMapping.b2cIsSuppStr; 134 // work on a local copy is much faster than operate 135 // directly on b2cIsSupp 136 byte[] flag = new byte[b2cIsSuppStr.length() << 1]; 137 int off = 0; 138 for (int i = 0; i < b2cIsSuppStr.length(); i++) { 139 char c = b2cIsSuppStr.charAt(i); 140 flag[off++] = (byte)(c >> 8); 141 flag[off++] = (byte)(c & 0xff); 142 } 143 b2cIsSupp = flag; 144 } 145 146 static boolean isLegalDB(int b) { 147 return b >= b1Min && b <= b1Max; 148 } 149 150 static char[] decode(int b1, int b2, int p, char[] c1, char[] c2) 151 { 152 if (b1 < b1Min || b1 > b1Max || b2 < b2Min || b2 > b2Max) 153 return null; 154 int index = (b1 - b1Min) * dbSegSize + b2 - b2Min; 155 char c = b2c[p].charAt(index); 156 if (c == UNMAPPABLE_DECODING) 157 return null; 158 if ((b2cIsSupp[index] & (1 << p)) == 0) { 159 c1[0] = c; 160 return c1; 161 } else { 162 c2[0] = Surrogate.high(0x20000 + c); 163 c2[1] = Surrogate.low(0x20000 + c); 164 return c2; 165 } 166 } 167 168 private CoderResult decodeArrayLoop(ByteBuffer src, 169 CharBuffer dst) 170 { 171 byte[] sa = src.array(); 172 int sp = src.arrayOffset() + src.position(); 173 int sl = src.arrayOffset() + src.limit(); 174 175 char[] da = dst.array(); 176 int dp = dst.arrayOffset() + dst.position(); 177 int dl = dst.arrayOffset() + dst.limit(); 178 try { 179 while (sp < sl) { 180 int byte1 = sa[sp] & 0xff; 181 if (byte1 == SS2) { // Codeset 2 G2 182 if ( sl - sp < 4) 183 return CoderResult.UNDERFLOW; 184 int cnsPlane = cnspToIndex[sa[sp + 1] & 0xff]; 185 if (cnsPlane < 0) 186 return CoderResult.malformedForLength(2); 187 byte1 = sa[sp + 2] & 0xff; 188 int byte2 = sa[sp + 3] & 0xff; 189 char[] cc = toUnicode(byte1, byte2, cnsPlane); 190 if (cc == null) { 191 if (!isLegalDB(byte1) || !isLegalDB(byte2)) 192 return CoderResult.malformedForLength(4); 193 return CoderResult.unmappableForLength(4); 194 } 195 if (dl - dp < cc.length) 196 return CoderResult.OVERFLOW; 197 if (cc.length == 1) { 198 da[dp++] = cc[0]; 199 } else { 200 da[dp++] = cc[0]; 201 da[dp++] = cc[1]; 202 } 203 sp += 4; 204 } else if (byte1 < 0x80) { // ASCII G0 205 if (dl - dp < 1) 206 return CoderResult.OVERFLOW; 207 da[dp++] = (char) byte1; 208 sp++; 209 } else { // Codeset 1 G1 210 if ( sl - sp < 2) 211 return CoderResult.UNDERFLOW; 212 int byte2 = sa[sp + 1] & 0xff; 213 char[] cc = toUnicode(byte1, byte2, 0); 214 if (cc == null) { 215 if (!isLegalDB(byte1) || !isLegalDB(byte2)) 216 return CoderResult.malformedForLength(1); 217 return CoderResult.unmappableForLength(2); 218 } 219 if (dl - dp < 1) 220 return CoderResult.OVERFLOW; 221 da[dp++] = cc[0]; 222 sp += 2; 223 } 224 } 225 return CoderResult.UNDERFLOW; 226 } finally { 227 src.position(sp - src.arrayOffset()); 228 dst.position(dp - dst.arrayOffset()); 229 } 230 } 231 232 private CoderResult decodeBufferLoop(ByteBuffer src, 233 CharBuffer dst) 234 { 235 int mark = src.position(); 236 try { 237 while (src.hasRemaining()) { 238 int byte1 = src.get() & 0xff; 239 if (byte1 == SS2) { // Codeset 2 G2 240 if ( src.remaining() < 3) 241 return CoderResult.UNDERFLOW; 242 int cnsPlane = cnspToIndex[src.get() & 0xff]; 243 if (cnsPlane < 0) 244 return CoderResult.malformedForLength(2); 245 byte1 = src.get() & 0xff; 246 int byte2 = src.get() & 0xff; 247 char[] cc = toUnicode(byte1, byte2, cnsPlane); 248 if (cc == null) { 249 if (!isLegalDB(byte1) || !isLegalDB(byte2)) 250 return CoderResult.malformedForLength(4); 251 return CoderResult.unmappableForLength(4); 252 } 253 if (dst.remaining() < cc.length) 254 return CoderResult.OVERFLOW; 255 if (cc.length == 1) { 256 dst.put(cc[0]); 257 } else { 258 dst.put(cc[0]); 259 dst.put(cc[1]); 260 } 261 mark += 4; 262 } else if (byte1 < 0x80) { // ASCII G0 263 if (!dst.hasRemaining()) 264 return CoderResult.OVERFLOW; 265 dst.put((char) byte1); 266 mark++; 267 } else { // Codeset 1 G1 268 if (!src.hasRemaining()) 269 return CoderResult.UNDERFLOW; 270 int byte2 = src.get() & 0xff; 271 char[] cc = toUnicode(byte1, byte2, 0); 272 if (cc == null) { 273 if (!isLegalDB(byte1) || !isLegalDB(byte2)) 274 return CoderResult.malformedForLength(1); 275 return CoderResult.unmappableForLength(2); 276 } 277 if (!dst.hasRemaining()) 278 return CoderResult.OVERFLOW; 279 dst.put(cc[0]); 280 mark +=2; 281 } 282 } 283 return CoderResult.UNDERFLOW; 284 } finally { 285 src.position(mark); 286 } 287 } 288 289 protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) 290 { 291 if (src.hasArray() && dst.hasArray()) 292 return decodeArrayLoop(src, dst); 293 else 294 return decodeBufferLoop(src, dst); 295 } 296 } 297 298 public static class Encoder extends CharsetEncoder { 299 private byte[] bb = new byte[4]; 300 301 public Encoder(Charset cs) { 302 super(cs, 4.0f, 4.0f); 303 } 304 305 public boolean canEncode(char c) { 306 return (c <= '\u007f' || toEUC(c, bb) != -1); 307 } 308 309 public boolean canEncode(CharSequence cs) { 310 int i = 0; 311 while (i < cs.length()) { 312 char c = cs.charAt(i++); 313 if (Surrogate.isHigh(c)) { 314 if (i == cs.length()) 315 return false; 316 char low = cs.charAt(i++); 317 if (!Surrogate.isLow(low) || toEUC(c, low, bb) == -1) 318 return false; 319 } else if (!canEncode(c)) { 320 return false; 321 } 322 } 323 return true; 324 } 325 326 public int toEUC(char hi, char low, byte[] bb) { 327 return encode(hi, low, bb); 328 } 329 330 public int toEUC(char c, byte[] bb) { 331 return encode(c, bb); 332 } 333 334 private CoderResult encodeArrayLoop(CharBuffer src, 335 ByteBuffer dst) 336 { 337 char[] sa = src.array(); 338 int sp = src.arrayOffset() + src.position(); 339 int sl = src.arrayOffset() + src.limit(); 340 341 byte[] da = dst.array(); 342 int dp = dst.arrayOffset() + dst.position(); 343 int dl = dst.arrayOffset() + dst.limit(); 344 345 int inSize; 346 int outSize; 347 348 try { 349 while (sp < sl) { 350 char c = sa[sp]; 351 inSize = 1; 352 if (c < 0x80) { // ASCII 353 bb[0] = (byte)c; 354 outSize = 1; 355 } else { 356 outSize = toEUC(c, bb); 357 if (outSize == -1) { 358 // to check surrogates only after BMP failed 359 // has the benefit of improving the BMP encoding 360 // 10% faster, with the price of the slowdown of 361 // supplementary character encoding. given the use 362 // of supplementary characters is really rare, this 363 // is something worth doing. 364 if (Surrogate.isHigh(c)) { 365 if ((sp + 1) == sl) 366 return CoderResult.UNDERFLOW; 367 if (!Surrogate.isLow(sa[sp + 1])) 368 return CoderResult.malformedForLength(1); 369 outSize = toEUC(c, sa[sp+1], bb); 370 inSize = 2; 371 } else if (Surrogate.isLow(c)) { 372 return CoderResult.malformedForLength(1); 373 } 374 } 375 } 376 if (outSize == -1) 377 return CoderResult.unmappableForLength(inSize); 378 if ( dl - dp < outSize) 379 return CoderResult.OVERFLOW; 380 for (int i = 0; i < outSize; i++) 381 da[dp++] = bb[i]; 382 sp += inSize; 383 } 384 return CoderResult.UNDERFLOW; 385 } finally { 386 src.position(sp - src.arrayOffset()); 387 dst.position(dp - dst.arrayOffset()); 388 } 389 } 390 391 private CoderResult encodeBufferLoop(CharBuffer src, 392 ByteBuffer dst) 393 { 394 int outSize; 395 int inSize; 396 int mark = src.position(); 397 398 try { 399 while (src.hasRemaining()) { 400 inSize = 1; 401 char c = src.get(); 402 if (c < 0x80) { // ASCII 403 outSize = 1; 404 bb[0] = (byte)c; 405 } else { 406 outSize = toEUC(c, bb); 407 if (outSize == -1) { 408 if (Surrogate.isHigh(c)) { 409 if (!src.hasRemaining()) 410 return CoderResult.UNDERFLOW; 411 char c2 = src.get(); 412 if (!Surrogate.isLow(c2)) 413 return CoderResult.malformedForLength(1); 414 outSize = toEUC(c, c2, bb); 415 inSize = 2; 416 } else if (Surrogate.isLow(c)) { 417 return CoderResult.malformedForLength(1); 418 } 419 } 420 } 421 if (outSize == -1) 422 return CoderResult.unmappableForLength(inSize); 423 if (dst.remaining() < outSize) 424 return CoderResult.OVERFLOW; 425 for (int i = 0; i < outSize; i++) 426 dst.put((byte)bb[i]); 427 mark += inSize; 428 } 429 return CoderResult.UNDERFLOW; 430 } finally { 431 src.position(mark); 432 } 433 } 434 435 protected CoderResult encodeLoop(CharBuffer src, ByteBuffer dst) 436 { 437 if (src.hasArray() && dst.hasArray()) 438 return encodeArrayLoop(src, dst); 439 else 440 return encodeBufferLoop(src, dst); 441 } 442 443 static int encode(char hi, char low, byte[] bb) { 444 int c = Surrogate.toUCS4(hi, low); 445 if ((c & 0xf0000) != 0x20000) 446 return -1; 447 c -= 0x20000; 448 int index = c2bSuppIndex[c >> 8]; 449 if (index == UNMAPPABLE_ENCODING) 450 return -1; 451 index = index + (c & 0xff); 452 int db = c2bSupp[index]; 453 if (db == UNMAPPABLE_ENCODING) 454 return -1; 455 int p = (c2bPlane[index] >> 4) & 0xf; 456 bb[0] = (byte)SS2; 457 bb[1] = (byte)(0xa0 | p); 458 bb[2] = (byte)(db >> 8); 459 bb[3] = (byte)db; 460 return 4; 461 } 462 463 static int encode(char c, byte[] bb) { 464 int index = c2bIndex[c >> 8]; 465 if (index == UNMAPPABLE_ENCODING) 466 return -1; 467 index = index + (c & 0xff); 468 int db = c2b[index]; 469 if (db == UNMAPPABLE_ENCODING) 470 return -1; 471 int p = c2bPlane[index] & 0xf; 472 if (p == 0) { 473 bb[0] = (byte)(db >> 8); 474 bb[1] = (byte)db; 475 return 2; 476 } else { 477 bb[0] = (byte)SS2; 478 bb[1] = (byte)(0xa0 | p); 479 bb[2] = (byte)(db >> 8); 480 bb[3] = (byte)db; 481 return 4; 482 } 483 } 484 485 static final char[] c2b; 486 static final char[] c2bIndex; 487 static final char[] c2bSupp; 488 static final char[] c2bSuppIndex; 489 static final byte[] c2bPlane; 490 static { 491 int b1Min = Decoder.b1Min; 492 int b1Max = Decoder.b1Max; 493 int b2Min = Decoder.b2Min; 494 int b2Max = Decoder.b2Max; 495 int dbSegSize = Decoder.dbSegSize; 496 String[] b2c = Decoder.b2c; 497 byte[] b2cIsSupp = Decoder.b2cIsSupp; 498 499 c2bIndex = EUC_TWMapping.c2bIndex; 500 c2bSuppIndex = EUC_TWMapping.c2bSuppIndex; 501 char[] c2b0 = new char[EUC_TWMapping.C2BSIZE]; 502 char[] c2bSupp0 = new char[EUC_TWMapping.C2BSUPPSIZE]; 503 byte[] c2bPlane0 = new byte[Math.max(EUC_TWMapping.C2BSIZE, 504 EUC_TWMapping.C2BSUPPSIZE)]; 505 506 Arrays.fill(c2b0, (char)UNMAPPABLE_ENCODING); 507 Arrays.fill(c2bSupp0, (char)UNMAPPABLE_ENCODING); 508 509 for (int p = 0; p < b2c.length; p++) { 510 String db = b2c[p]; 511 /* 512 adjust the "plane" from 0..7 to 0, 2, 3, 4, 5, 6, 7, 0xf, 513 which helps balance between footprint (to save the plane 514 info in 4 bits) and runtime performance (to require only 515 one operation "0xa0 | plane" to encode the plane byte) 516 */ 517 int plane = p; 518 if (plane == 7) 519 plane = 0xf; 520 else if (plane != 0) 521 plane = p + 1; 522 523 int off = 0; 524 for (int b1 = b1Min; b1 <= b1Max; b1++) { 525 for (int b2 = b2Min; b2 <= b2Max; b2++) { 526 char c = db.charAt(off); 527 if (c != UNMAPPABLE_DECODING) { 528 if ((b2cIsSupp[off] & (1 << p)) != 0) { 529 int index = c2bSuppIndex[c >> 8] + (c&0xff); 530 c2bSupp0[index] = (char)((b1 << 8) + b2); 531 c2bPlane0[index] |= (byte)(plane << 4); 532 } else { 533 int index = c2bIndex[c >> 8] + (c&0xff); 534 c2b0[index] = (char)((b1 << 8) + b2); 535 c2bPlane0[index] |= (byte)plane; 536 } 537 } 538 off++; 539 } 540 } 541 } 542 c2b = c2b0; 543 c2bSupp = c2bSupp0; 544 c2bPlane = c2bPlane0; 545 } 546 } 547 } 548