1 /* 2 * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package sun.nio.cs; 27 28 import java.nio.Buffer; 29 import java.nio.ByteBuffer; 30 import java.nio.CharBuffer; 31 import java.nio.charset.Charset; 32 import java.nio.charset.CharsetDecoder; 33 import java.nio.charset.CharsetEncoder; 34 import java.nio.charset.CoderResult; 35 import java.nio.charset.CodingErrorAction; 36 37 /* Legal CESU-8 Byte Sequences 38 * 39 * # Code Points Bits Bit/Byte pattern 40 * 1 7 0xxxxxxx 41 * U+0000..U+007F 00..7F 42 * 43 * 2 11 110xxxxx 10xxxxxx 44 * U+0080..U+07FF C2..DF 80..BF 45 * 46 * 3 16 1110xxxx 10xxxxxx 10xxxxxx 47 * U+0800..U+0FFF E0 A0..BF 80..BF 48 * U+1000..U+FFFF E1..EF 80..BF 80..BF 49 * 50 */ 51 52 class CESU_8 extends Unicode 53 { 54 public CESU_8() { 55 super("CESU-8", StandardCharsets.aliases_CESU_8()); 56 } 57 58 public String historicalName() { 59 return "CESU8"; 60 } 61 62 public CharsetDecoder newDecoder() { 63 return new Decoder(this); 64 } 65 66 public CharsetEncoder newEncoder() { 67 return new Encoder(this); 68 } 69 70 private static final void updatePositions(Buffer src, int sp, 71 Buffer dst, int dp) { 72 src.position(sp - src.arrayOffset()); 73 dst.position(dp - dst.arrayOffset()); 74 } 75 76 private static class Decoder extends CharsetDecoder 77 implements ArrayDecoder { 78 private Decoder(Charset cs) { 79 super(cs, 1.0f, 1.0f); 80 } 81 82 private static boolean isNotContinuation(int b) { 83 return (b & 0xc0) != 0x80; 84 } 85 86 // [E0] [A0..BF] [80..BF] 87 // [E1..EF] [80..BF] [80..BF] 88 private static boolean isMalformed3(int b1, int b2, int b3) { 89 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 90 (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; 91 } 92 93 // only used when there is only one byte left in src buffer 94 private static boolean isMalformed3_2(int b1, int b2) { 95 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 96 (b2 & 0xc0) != 0x80; 97 } 98 99 100 // [F0] [90..BF] [80..BF] [80..BF] 101 // [F1..F3] [80..BF] [80..BF] [80..BF] 102 // [F4] [80..8F] [80..BF] [80..BF] 103 // only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...] 104 // will be checked by Character.isSupplementaryCodePoint(uc) 105 private static boolean isMalformed4(int b2, int b3, int b4) { 106 return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || 107 (b4 & 0xc0) != 0x80; 108 } 109 110 // only used when there is less than 4 bytes left in src buffer 111 private static boolean isMalformed4_2(int b1, int b2) { 112 return (b1 == 0xf0 && b2 == 0x90) || 113 (b2 & 0xc0) != 0x80; 114 } 115 116 private static boolean isMalformed4_3(int b3) { 117 return (b3 & 0xc0) != 0x80; 118 } 119 120 private static CoderResult malformedN(ByteBuffer src, int nb) { 121 switch (nb) { 122 case 1: 123 case 2: // always 1 124 return CoderResult.malformedForLength(1); 125 case 3: 126 int b1 = src.get(); 127 int b2 = src.get(); // no need to lookup b3 128 return CoderResult.malformedForLength( 129 ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 130 isNotContinuation(b2)) ? 1 : 2); 131 case 4: // we don't care the speed here 132 b1 = src.get() & 0xff; 133 b2 = src.get() & 0xff; 134 if (b1 > 0xf4 || 135 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 136 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 137 isNotContinuation(b2)) 138 return CoderResult.malformedForLength(1); 139 if (isNotContinuation(src.get())) 140 return CoderResult.malformedForLength(2); 141 return CoderResult.malformedForLength(3); 142 default: 143 assert false; 144 return null; 145 } 146 } 147 148 private static CoderResult malformed(ByteBuffer src, int sp, 149 CharBuffer dst, int dp, 150 int nb) 151 { 152 src.position(sp - src.arrayOffset()); 153 CoderResult cr = malformedN(src, nb); 154 updatePositions(src, sp, dst, dp); 155 return cr; 156 } 157 158 159 private static CoderResult malformed(ByteBuffer src, 160 int mark, int nb) 161 { 162 src.position(mark); 163 CoderResult cr = malformedN(src, nb); 164 src.position(mark); 165 return cr; 166 } 167 168 private static CoderResult malformedForLength(ByteBuffer src, 169 int sp, 170 CharBuffer dst, 171 int dp, 172 int malformedNB) 173 { 174 updatePositions(src, sp, dst, dp); 175 return CoderResult.malformedForLength(malformedNB); 176 } 177 178 private static CoderResult malformedForLength(ByteBuffer src, 179 int mark, 180 int malformedNB) 181 { 182 src.position(mark); 183 return CoderResult.malformedForLength(malformedNB); 184 } 185 186 187 private static CoderResult xflow(Buffer src, int sp, int sl, 188 Buffer dst, int dp, int nb) { 189 updatePositions(src, sp, dst, dp); 190 return (nb == 0 || sl - sp < nb) 191 ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; 192 } 193 194 private static CoderResult xflow(Buffer src, int mark, int nb) { 195 src.position(mark); 196 return (nb == 0 || src.remaining() < nb) 197 ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW; 198 } 199 200 private CoderResult decodeArrayLoop(ByteBuffer src, 201 CharBuffer dst) 202 { 203 // This method is optimized for ASCII input. 204 byte[] sa = src.array(); 205 int sp = src.arrayOffset() + src.position(); 206 int sl = src.arrayOffset() + src.limit(); 207 208 char[] da = dst.array(); 209 int dp = dst.arrayOffset() + dst.position(); 210 int dl = dst.arrayOffset() + dst.limit(); 211 int dlASCII = dp + Math.min(sl - sp, dl - dp); 212 213 // ASCII only loop 214 while (dp < dlASCII && sa[sp] >= 0) 215 da[dp++] = (char) sa[sp++]; 216 while (sp < sl) { 217 int b1 = sa[sp]; 218 if (b1 >= 0) { 219 // 1 byte, 7 bits: 0xxxxxxx 220 if (dp >= dl) 221 return xflow(src, sp, sl, dst, dp, 1); 222 da[dp++] = (char) b1; 223 sp++; 224 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 225 // 2 bytes, 11 bits: 110xxxxx 10xxxxxx 226 if (sl - sp < 2 || dp >= dl) 227 return xflow(src, sp, sl, dst, dp, 2); 228 int b2 = sa[sp + 1]; 229 if (isNotContinuation(b2)) 230 return malformedForLength(src, sp, dst, dp, 1); 231 da[dp++] = (char) (((b1 << 6) ^ b2) 232 ^ 233 (((byte) 0xC0 << 6) ^ 234 ((byte) 0x80 << 0))); 235 sp += 2; 236 } else if ((b1 >> 4) == -2) { 237 // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx 238 int srcRemaining = sl - sp; 239 if (srcRemaining < 3 || dp >= dl) { 240 if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1])) 241 return malformedForLength(src, sp, dst, dp, 1); 242 return xflow(src, sp, sl, dst, dp, 3); 243 } 244 int b2 = sa[sp + 1]; 245 int b3 = sa[sp + 2]; 246 if (isMalformed3(b1, b2, b3)) 247 return malformed(src, sp, dst, dp, 3); 248 da[dp++] = (char) 249 ((b1 << 12) ^ 250 (b2 << 6) ^ 251 (b3 ^ 252 (((byte) 0xE0 << 12) ^ 253 ((byte) 0x80 << 6) ^ 254 ((byte) 0x80 << 0)))); 255 sp += 3; 256 } else { 257 return malformed(src, sp, dst, dp, 1); 258 } 259 } 260 return xflow(src, sp, sl, dst, dp, 0); 261 } 262 263 private CoderResult decodeBufferLoop(ByteBuffer src, 264 CharBuffer dst) 265 { 266 int mark = src.position(); 267 int limit = src.limit(); 268 while (mark < limit) { 269 int b1 = src.get(); 270 if (b1 >= 0) { 271 // 1 byte, 7 bits: 0xxxxxxx 272 if (dst.remaining() < 1) 273 return xflow(src, mark, 1); // overflow 274 dst.put((char) b1); 275 mark++; 276 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 277 // 2 bytes, 11 bits: 110xxxxx 10xxxxxx 278 if (limit - mark < 2|| dst.remaining() < 1) 279 return xflow(src, mark, 2); 280 int b2 = src.get(); 281 if (isNotContinuation(b2)) 282 return malformedForLength(src, mark, 1); 283 dst.put((char) (((b1 << 6) ^ b2) 284 ^ 285 (((byte) 0xC0 << 6) ^ 286 ((byte) 0x80 << 0)))); 287 mark += 2; 288 } else if ((b1 >> 4) == -2) { 289 // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx 290 int srcRemaining = limit - mark; 291 if (srcRemaining < 3 || dst.remaining() < 1) { 292 if (srcRemaining > 1 && isMalformed3_2(b1, src.get())) 293 return malformedForLength(src, mark, 1); 294 return xflow(src, mark, 3); 295 } 296 int b2 = src.get(); 297 int b3 = src.get(); 298 if (isMalformed3(b1, b2, b3)) 299 return malformed(src, mark, 3); 300 dst.put((char) 301 ((b1 << 12) ^ 302 (b2 << 6) ^ 303 (b3 ^ 304 (((byte) 0xE0 << 12) ^ 305 ((byte) 0x80 << 6) ^ 306 ((byte) 0x80 << 0))))); 307 mark += 3; 308 } else { 309 return malformed(src, mark, 1); 310 } 311 } 312 return xflow(src, mark, 0); 313 } 314 315 protected CoderResult decodeLoop(ByteBuffer src, 316 CharBuffer dst) 317 { 318 if (src.hasArray() && dst.hasArray()) 319 return decodeArrayLoop(src, dst); 320 else 321 return decodeBufferLoop(src, dst); 322 } 323 324 private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp) 325 { 326 if (bb == null) 327 bb = ByteBuffer.wrap(ba); 328 bb.position(sp); 329 return bb; 330 } 331 332 // returns -1 if there is/are malformed byte(s) and the 333 // "action" for malformed input is not REPLACE. 334 public int decode(byte[] sa, int sp, int len, char[] da) { 335 final int sl = sp + len; 336 int dp = 0; 337 int dlASCII = Math.min(len, da.length); 338 ByteBuffer bb = null; // only necessary if malformed 339 340 // ASCII only optimized loop 341 while (dp < dlASCII && sa[sp] >= 0) 342 da[dp++] = (char) sa[sp++]; 343 344 while (sp < sl) { 345 int b1 = sa[sp++]; 346 if (b1 >= 0) { 347 // 1 byte, 7 bits: 0xxxxxxx 348 da[dp++] = (char) b1; 349 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 350 // 2 bytes, 11 bits: 110xxxxx 10xxxxxx 351 if (sp < sl) { 352 int b2 = sa[sp++]; 353 if (isNotContinuation(b2)) { 354 if (malformedInputAction() != CodingErrorAction.REPLACE) 355 return -1; 356 da[dp++] = replacement().charAt(0); 357 sp--; // malformedN(bb, 2) always returns 1 358 } else { 359 da[dp++] = (char) (((b1 << 6) ^ b2)^ 360 (((byte) 0xC0 << 6) ^ 361 ((byte) 0x80 << 0))); 362 } 363 continue; 364 } 365 if (malformedInputAction() != CodingErrorAction.REPLACE) 366 return -1; 367 da[dp++] = replacement().charAt(0); 368 return dp; 369 } else if ((b1 >> 4) == -2) { 370 // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx 371 if (sp + 1 < sl) { 372 int b2 = sa[sp++]; 373 int b3 = sa[sp++]; 374 if (isMalformed3(b1, b2, b3)) { 375 if (malformedInputAction() != CodingErrorAction.REPLACE) 376 return -1; 377 da[dp++] = replacement().charAt(0); 378 sp -=3; 379 bb = getByteBuffer(bb, sa, sp); 380 sp += malformedN(bb, 3).length(); 381 } else { 382 da[dp++] = (char)((b1 << 12) ^ 383 (b2 << 6) ^ 384 (b3 ^ 385 (((byte) 0xE0 << 12) ^ 386 ((byte) 0x80 << 6) ^ 387 ((byte) 0x80 << 0)))); 388 } 389 continue; 390 } 391 if (malformedInputAction() != CodingErrorAction.REPLACE) 392 return -1; 393 if (sp < sl && isMalformed3_2(b1, sa[sp])) { 394 da[dp++] = replacement().charAt(0); 395 continue; 396 397 } 398 da[dp++] = replacement().charAt(0); 399 return dp; 400 } else { 401 if (malformedInputAction() != CodingErrorAction.REPLACE) 402 return -1; 403 da[dp++] = replacement().charAt(0); 404 } 405 } 406 return dp; 407 } 408 } 409 410 private static class Encoder extends CharsetEncoder 411 implements ArrayEncoder { 412 413 private Encoder(Charset cs) { 414 super(cs, 1.1f, 3.0f); 415 } 416 417 public boolean canEncode(char c) { 418 return !Character.isSurrogate(c); 419 } 420 421 public boolean isLegalReplacement(byte[] repl) { 422 return ((repl.length == 1 && repl[0] >= 0) || 423 super.isLegalReplacement(repl)); 424 } 425 426 private static CoderResult overflow(CharBuffer src, int sp, 427 ByteBuffer dst, int dp) { 428 updatePositions(src, sp, dst, dp); 429 return CoderResult.OVERFLOW; 430 } 431 432 private static CoderResult overflow(CharBuffer src, int mark) { 433 src.position(mark); 434 return CoderResult.OVERFLOW; 435 } 436 437 private static void to3Bytes(byte[] da, int dp, char c) { 438 da[dp] = (byte)(0xe0 | ((c >> 12))); 439 da[dp + 1] = (byte)(0x80 | ((c >> 6) & 0x3f)); 440 da[dp + 2] = (byte)(0x80 | (c & 0x3f)); 441 } 442 443 private static void to3Bytes(ByteBuffer dst, char c) { 444 dst.put((byte)(0xe0 | ((c >> 12)))); 445 dst.put((byte)(0x80 | ((c >> 6) & 0x3f))); 446 dst.put((byte)(0x80 | (c & 0x3f))); 447 } 448 449 private Surrogate.Parser sgp; 450 private char[] c2; 451 private CoderResult encodeArrayLoop(CharBuffer src, 452 ByteBuffer dst) 453 { 454 char[] sa = src.array(); 455 int sp = src.arrayOffset() + src.position(); 456 int sl = src.arrayOffset() + src.limit(); 457 458 byte[] da = dst.array(); 459 int dp = dst.arrayOffset() + dst.position(); 460 int dl = dst.arrayOffset() + dst.limit(); 461 int dlASCII = dp + Math.min(sl - sp, dl - dp); 462 463 // ASCII only loop 464 while (dp < dlASCII && sa[sp] < '\u0080') 465 da[dp++] = (byte) sa[sp++]; 466 while (sp < sl) { 467 char c = sa[sp]; 468 if (c < 0x80) { 469 // Have at most seven bits 470 if (dp >= dl) 471 return overflow(src, sp, dst, dp); 472 da[dp++] = (byte)c; 473 } else if (c < 0x800) { 474 // 2 bytes, 11 bits 475 if (dl - dp < 2) 476 return overflow(src, sp, dst, dp); 477 da[dp++] = (byte)(0xc0 | (c >> 6)); 478 da[dp++] = (byte)(0x80 | (c & 0x3f)); 479 } else if (Character.isSurrogate(c)) { 480 // Have a surrogate pair 481 if (sgp == null) 482 sgp = new Surrogate.Parser(); 483 int uc = sgp.parse(c, sa, sp, sl); 484 if (uc < 0) { 485 updatePositions(src, sp, dst, dp); 486 return sgp.error(); 487 } 488 if (dl - dp < 6) 489 return overflow(src, sp, dst, dp); 490 to3Bytes(da, dp, Character.highSurrogate(uc)); 491 dp += 3; 492 to3Bytes(da, dp, Character.lowSurrogate(uc)); 493 dp += 3; 494 sp++; // 2 chars 495 } else { 496 // 3 bytes, 16 bits 497 if (dl - dp < 3) 498 return overflow(src, sp, dst, dp); 499 to3Bytes(da, dp, c); 500 dp += 3; 501 } 502 sp++; 503 } 504 updatePositions(src, sp, dst, dp); 505 return CoderResult.UNDERFLOW; 506 } 507 508 private CoderResult encodeBufferLoop(CharBuffer src, 509 ByteBuffer dst) 510 { 511 int mark = src.position(); 512 while (src.hasRemaining()) { 513 char c = src.get(); 514 if (c < 0x80) { 515 // Have at most seven bits 516 if (!dst.hasRemaining()) 517 return overflow(src, mark); 518 dst.put((byte)c); 519 } else if (c < 0x800) { 520 // 2 bytes, 11 bits 521 if (dst.remaining() < 2) 522 return overflow(src, mark); 523 dst.put((byte)(0xc0 | (c >> 6))); 524 dst.put((byte)(0x80 | (c & 0x3f))); 525 } else if (Character.isSurrogate(c)) { 526 // Have a surrogate pair 527 if (sgp == null) 528 sgp = new Surrogate.Parser(); 529 int uc = sgp.parse(c, src); 530 if (uc < 0) { 531 src.position(mark); 532 return sgp.error(); 533 } 534 if (dst.remaining() < 6) 535 return overflow(src, mark); 536 to3Bytes(dst, Character.highSurrogate(uc)); 537 to3Bytes(dst, Character.lowSurrogate(uc)); 538 mark++; // 2 chars 539 } else { 540 // 3 bytes, 16 bits 541 if (dst.remaining() < 3) 542 return overflow(src, mark); 543 to3Bytes(dst, c); 544 } 545 mark++; 546 } 547 src.position(mark); 548 return CoderResult.UNDERFLOW; 549 } 550 551 protected final CoderResult encodeLoop(CharBuffer src, 552 ByteBuffer dst) 553 { 554 if (src.hasArray() && dst.hasArray()) 555 return encodeArrayLoop(src, dst); 556 else 557 return encodeBufferLoop(src, dst); 558 } 559 560 // returns -1 if there is malformed char(s) and the 561 // "action" for malformed input is not REPLACE. 562 public int encode(char[] sa, int sp, int len, byte[] da) { 563 int sl = sp + len; 564 int dp = 0; 565 int dlASCII = dp + Math.min(len, da.length); 566 567 // ASCII only optimized loop 568 while (dp < dlASCII && sa[sp] < '\u0080') 569 da[dp++] = (byte) sa[sp++]; 570 571 while (sp < sl) { 572 char c = sa[sp++]; 573 if (c < 0x80) { 574 // Have at most seven bits 575 da[dp++] = (byte)c; 576 } else if (c < 0x800) { 577 // 2 bytes, 11 bits 578 da[dp++] = (byte)(0xc0 | (c >> 6)); 579 da[dp++] = (byte)(0x80 | (c & 0x3f)); 580 } else if (Character.isSurrogate(c)) { 581 if (sgp == null) 582 sgp = new Surrogate.Parser(); 583 int uc = sgp.parse(c, sa, sp - 1, sl); 584 if (uc < 0) { 585 if (malformedInputAction() != CodingErrorAction.REPLACE) 586 return -1; 587 da[dp++] = replacement()[0]; 588 } else { 589 to3Bytes(da, dp, Character.highSurrogate(uc)); 590 dp += 3; 591 to3Bytes(da, dp, Character.lowSurrogate(uc)); 592 dp += 3; 593 sp++; // 2 chars 594 } 595 } else { 596 // 3 bytes, 16 bits 597 to3Bytes(da, dp, c); 598 dp += 3; 599 } 600 } 601 return dp; 602 } 603 } 604 }