/*
 * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.nio.cs;

import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

/* Legal UTF-8 Byte Sequences
 *
 * #    Code Points      Bits   Bit/Byte pattern
 * 1                     7      0xxxxxxx
 *      U+0000..U+007F          00..7F
 *
 * 2                     11     110xxxxx    10xxxxxx
 *      U+0080..U+07FF          C2..DF      80..BF
 *
 * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
 *      U+0800..U+0FFF          E0          A0..BF      80..BF
 *      U+1000..U+FFFF          E1..EF      80..BF      80..BF
 *
 * 4                     21     11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
 *     U+10000..U+3FFFF         F0          90..BF      80..BF      80..BF
 *     U+40000..U+FFFFF         F1..F3      80..BF      80..BF      80..BF
 *    U+100000..U+10FFFF        F4          80..8F      80..BF      80..BF
 *
 */

/**
 * The UTF-8 charset: supplies a strict decoder and an encoder that follow
 * the table of legal byte sequences above.
 */
public final class UTF_8 extends Unicode {

    /** Creates the charset under the canonical name {@code "UTF-8"}. */
    public UTF_8() {
        super("UTF-8", StandardCharsets.aliases_UTF_8());
    }

    /**
     * Returns the historical name of this charset.
     *
     * @return the string {@code "UTF8"}
     */
    public String historicalName() {
        return "UTF8";
    }

    /**
     * Creates a new decoder for this charset.
     *
     * @return a fresh {@code Decoder} bound to this charset
     */
    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    /**
     * Creates a new encoder for this charset.
     *
     * @return a fresh {@code Encoder} bound to this charset
     */
    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * Writes array-based cursors back into the buffers' positions.
     *
     * @param src source buffer; must be array-backed because
     *            {@code arrayOffset()} is called on it
     * @param sp  absolute index into the source backing array
     * @param dst destination buffer; must be array-backed
     * @param dp  absolute index into the destination backing array
     */
    static final void updatePositions(Buffer src, int sp,
                                      Buffer dst, int dp) {
        // sp/dp include the array offset, buffer positions do not.
        src.position(sp - src.arrayOffset());
        dst.position(dp - dst.arrayOffset());
    }

    /**
     * Decoder that turns UTF-8 bytes into UTF-16 chars, rejecting overlong
     * forms, encoded surrogates and lead bytes above F4.
     */
    private static final class Decoder extends CharsetDecoder {

        /**
         * @param cs the charset that created this decoder
         */
        private Decoder(Charset cs) {
            // averageCharsPerByte = 1.0, maxCharsPerByte = 1.0
            super(cs, 1.0f, 1.0f);
        }

        /**
         * Tests whether {@code b} lacks the {@code 10xxxxxx} bit pattern of
         * a continuation byte.
         *
         * @param b byte value (sign-extended or masked, both work)
         * @return {@code true} if {@code b} is not in 80..BF
         */
        private static boolean isNotContinuation(int b) {
            return (b & 0xc0) != 0x80;
        }

        /**
         * Tests a complete 3-byte sequence for malformation.
         *
         * <pre>
         *  [E0]     [A0..BF] [80..BF]
         *  [E1..EF] [80..BF] [80..BF]
         * </pre>
         *
         * @param b1 lead byte, sign-extended (compared against
         *           {@code (byte)0xe0})
         * @param b2 second byte
         * @param b3 third byte
         * @return {@code true} if the sequence is overlong (E0 followed by
         *         80..9F) or b2/b3 is not a continuation byte
         */
        private static boolean isMalformed3(int b1, int b2, int b3) {
            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                   (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
        }

        /**
         * Two-byte prefix variant of {@link #isMalformed3}; only used when
         * the complete 3-byte sequence cannot be processed yet.
         *
         * @param b1 lead byte, sign-extended
         * @param b2 second byte
         * @return {@code true} if the first two bytes already rule out a
         *         legal 3-byte sequence
         */
        private static boolean isMalformed3_2(int b1, int b2) {
            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                   (b2 & 0xc0) != 0x80;
        }

        /**
         * Tests the three trailing bytes of a 4-byte sequence.
         *
         * <pre>
         *  [F0]     [90..BF] [80..BF] [80..BF]
         *  [F1..F3] [80..BF] [80..BF] [80..BF]
         *  [F4]     [80..8F] [80..BF] [80..BF]
         * </pre>
         *
         * Only the 80..BF range is checked here; the [F0, 80..8F] and
         * [F4, 90..BF] cases are caught by the callers' additional
         * {@code Character.isSupplementaryCodePoint(uc)} check.
         *
         * @param b2 second byte
         * @param b3 third byte
         * @param b4 fourth byte
         * @return {@code true} if any of the bytes is not a continuation
         *         byte
         */
        private static boolean isMalformed4(int b2, int b3, int b4) {
            return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
                   (b4 & 0xc0) != 0x80;
        }

        /**
         * Tests whether {@code b1} and {@code b2} are malformed as the
         * first two bytes of a 4-byte sequence. Only used when the complete
         * sequence cannot be processed yet.
         *
         * @param b1 lead byte, already masked with {@code & 0xff}
         * @param b2 second byte, already masked with {@code & 0xff}
         * @return {@code true} if the pair cannot start a legal 4-byte
         *         sequence
         */
        private static boolean isMalformed4_2(int b1, int b2) {
            return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
                   (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
                   (b2 & 0xc0) != 0x80;
        }

        /**
         * Tests whether {@code b3} is malformed as the third byte of a
         * 4-byte sequence. Only used when the complete sequence cannot be
         * processed yet, after {@link #isMalformed4_2} has accepted the
         * first two bytes.
         *
         * @param b3 third byte
         * @return {@code true} if {@code b3} is not a continuation byte
         */
        private static boolean isMalformed4_3(int b3) {
            return (b3 & 0xc0) != 0x80;
        }

        /**
         * Reads up to {@code n - 1} bytes from {@code src} and reports the
         * index of the first one that is not a continuation byte.
         * NOTE(review): not referenced anywhere in this file.
         *
         * @param src source buffer; its position is advanced by the reads
         * @param n   sequence length being probed
         * @return a malformed-input result of length {@code i} for the
         *         first non-continuation byte read, else of length
         *         {@code n}
         */
        private static CoderResult lookupN(ByteBuffer src, int n)
        {
            for (int i = 1; i < n; i++) {
                if (isNotContinuation(src.get()))
                    return CoderResult.malformedForLength(i);
            }
            return CoderResult.malformedForLength(n);
        }

        /**
         * Computes the malformed-input length for a sequence already known
         * to be malformed, re-reading its bytes from the current position
         * of {@code src}.
         *
         * @param src source buffer positioned at the lead byte; the reads
         *            advance its position, callers reset it afterwards
         * @param nb  nominal sequence length implied by the lead byte (1-4)
         * @return a malformed-input result covering the invalid prefix
         */
        private static CoderResult malformedN(ByteBuffer src, int nb) {
            switch (nb) {
            case 1:
            case 2:                    // always 1
                return CoderResult.malformedForLength(1);
            case 3:
                int b1 = src.get();
                int b2 = src.get();    // no need to lookup b3
                return CoderResult.malformedForLength(
                    ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                     isNotContinuation(b2)) ? 1 : 2);
            case 4:  // we don't care the speed here
                b1 = src.get() & 0xff;
                b2 = src.get() & 0xff;
                if (b1 > 0xf4 ||
                    (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
                    (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
                    isNotContinuation(b2))
                    return CoderResult.malformedForLength(1);
                if (isNotContinuation(src.get()))
                    return CoderResult.malformedForLength(2);
                return CoderResult.malformedForLength(3);
            default:
                assert false;
                return null;
            }
        }

        /**
         * Array-loop variant: computes the malformed length for the
         * sequence at {@code sp}, then leaves both buffers positioned at
         * {@code sp}/{@code dp}.
         *
         * @param src source buffer (array-backed)
         * @param sp  array index of the offending lead byte
         * @param dst destination buffer (array-backed)
         * @param dp  array index of the next char to write
         * @param nb  nominal sequence length
         * @return the malformed-input result
         */
        private static CoderResult malformed(ByteBuffer src, int sp,
                                             CharBuffer dst, int dp,
                                             int nb)
        {
            // Position src on the lead byte so malformedN can re-read it.
            src.position(sp - src.arrayOffset());
            CoderResult cr = malformedN(src, nb);
            // malformedN advanced src; restore both positions.
            updatePositions(src, sp, dst, dp);
            return cr;
        }

        /**
         * Buffer-loop variant: computes the malformed length for the
         * sequence starting at {@code mark}, leaving {@code src}
         * positioned at {@code mark}.
         *
         * @param src  source buffer
         * @param mark buffer position of the offending lead byte
         * @param nb   nominal sequence length
         * @return the malformed-input result
         */
        private static CoderResult malformed(ByteBuffer src,
                                             int mark, int nb)
        {
            src.position(mark);
            CoderResult cr = malformedN(src, nb);
            src.position(mark);
            return cr;
        }

        /**
         * Array-loop variant: updates both buffer positions and returns a
         * malformed-input result of an already known length.
         *
         * @param src         source buffer (array-backed)
         * @param sp          array index of the offending lead byte
         * @param dst         destination buffer (array-backed)
         * @param dp          array index of the next char to write
         * @param malformedNB number of malformed bytes to report
         * @return the malformed-input result
         */
        private static CoderResult malformedForLength(ByteBuffer src,
                                                      int sp,
                                                      CharBuffer dst,
                                                      int dp,
                                                      int malformedNB)
        {
            updatePositions(src, sp, dst, dp);
            return CoderResult.malformedForLength(malformedNB);
        }

        /**
         * Buffer-loop variant: resets {@code src} to {@code mark} and
         * returns a malformed-input result of an already known length.
         *
         * @param src         source buffer
         * @param mark        buffer position of the offending lead byte
         * @param malformedNB number of malformed bytes to report
         * @return the malformed-input result
         */
        private static CoderResult malformedForLength(ByteBuffer src,
                                                      int mark,
                                                      int malformedNB)
        {
            src.position(mark);
            return CoderResult.malformedForLength(malformedNB);
        }

        /**
         * Array-loop variant: updates both buffer positions and decides
         * between underflow and overflow.
         *
         * @param src source buffer (array-backed)
         * @param sp  array index of the next unread byte
         * @param sl  array index of the source limit
         * @param dst destination buffer (array-backed)
         * @param dp  array index of the next char to write
         * @param nb  bytes needed for the pending sequence, or 0 when the
         *            input was fully consumed
         * @return {@code UNDERFLOW} if {@code nb == 0} or fewer than
         *         {@code nb} source bytes remain, otherwise
         *         {@code OVERFLOW}
         */
        private static CoderResult xflow(Buffer src, int sp, int sl,
                                         Buffer dst, int dp, int nb) {
            updatePositions(src, sp, dst, dp);
            return (nb == 0 || sl - sp < nb)
                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
        }

        /**
         * Buffer-loop variant: resets {@code src} to {@code mark} and
         * decides between underflow and overflow.
         *
         * @param src  source buffer
         * @param mark buffer position of the next unconsumed byte
         * @param nb   bytes needed for the pending sequence, or 0 when the
         *             input was fully consumed
         * @return {@code UNDERFLOW} if {@code nb == 0} or fewer than
         *         {@code nb} bytes remain after {@code mark}, otherwise
         *         {@code OVERFLOW}
         */
        private static CoderResult xflow(Buffer src, int mark, int nb) {
            src.position(mark);
            return (nb == 0 || src.remaining() < nb)
                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
        }

        /**
         * Decodes directly on the backing arrays of both buffers.
         *
         * @param src array-backed source buffer
         * @param dst array-backed destination buffer
         * @return underflow, overflow or a malformed-input result; on
         *         return both buffer positions reflect what was consumed
         *         and produced
         */
        private CoderResult decodeArrayLoop(ByteBuffer src,
                                            CharBuffer dst)
        {
            // This method is optimized for ASCII input.
            byte[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            char[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            // Bounded by both remaining input and remaining output, so the
            // fast loop below needs only one index check.
            int dlASCII = dp + Math.min(sl - sp, dl - dp);

            // ASCII only loop
            while (dp < dlASCII && sa[sp] >= 0)
                da[dp++] = (char) sa[sp++];
            while (sp < sl) {
                // b1 is sign-extended: non-ASCII lead bytes are negative.
                int b1 = sa[sp];
                if (b1 >= 0) {
                    // 1 byte, 7 bits: 0xxxxxxx
                    if (dp >= dl)
                        return xflow(src, sp, sl, dst, dp, 1);
                    da[dp++] = (char) b1;
                    sp++;
                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                    //                   [C2..DF] [80..BF]
                    if (sl - sp < 2 || dp >= dl)
                        return xflow(src, sp, sl, dst, dp, 2);
                    int b2 = sa[sp + 1];
                    // The branch condition already excludes the overlong
                    // lead bytes C0 and C1 via (b1 & 0x1e) != 0, so only
                    // the second byte b2 needs checking here.
                    if (isNotContinuation(b2))
                        return malformedForLength(src, sp, dst, dp, 1);
                    // XOR with the shifted marker bytes cancels the marker
                    // bits together with their sign extension.
                    da[dp++] = (char) (((b1 << 6) ^ b2)
                                       ^
                                       (((byte) 0xC0 << 6) ^
                                        ((byte) 0x80 << 0)));
                    sp += 2;
                } else if ((b1 >> 4) == -2) {
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = sl - sp;
                    if (srcRemaining < 3 || dp >= dl) {
                        // Report a bad prefix now rather than waiting for
                        // more input or output space.
                        if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1]))
                            return malformedForLength(src, sp, dst, dp, 1);
                        return xflow(src, sp, sl, dst, dp, 3);
                    }
                    int b2 = sa[sp + 1];
                    int b3 = sa[sp + 2];
                    if (isMalformed3(b1, b2, b3))
                        return malformed(src, sp, dst, dp, 3);
                    char c = (char)
                        ((b1 << 12) ^
                         (b2 <<  6) ^
                         (b3 ^
                          (((byte) 0xE0 << 12) ^
                           ((byte) 0x80 <<  6) ^
                           ((byte) 0x80 <<  0))));
                    // An encoded surrogate is rejected as 3 malformed bytes.
                    if (Character.isSurrogate(c))
                        return malformedForLength(src, sp, dst, dp, 3);
                    da[dp++] = c;
                    sp += 3;
                } else if ((b1 >> 3) == -2) {
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = sl - sp;
                    // A supplementary code point needs two output chars.
                    if (srcRemaining < 4 || dl - dp < 2) {
                        b1 &= 0xff;
                        if (b1 > 0xf4 ||
                            srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff))
                            return malformedForLength(src, sp, dst, dp, 1);
                        if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2]))
                            return malformedForLength(src, sp, dst, dp, 2);
                        return xflow(src, sp, sl, dst, dp, 4);
                    }
                    int b2 = sa[sp + 1];
                    int b3 = sa[sp + 2];
                    int b4 = sa[sp + 3];
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                                ((byte) 0x80 << 12) ^
                                ((byte) 0x80 <<  6) ^
                                ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        // shortest form check
                        !Character.isSupplementaryCodePoint(uc)) {
                        return malformed(src, sp, dst, dp, 4);
                    }
                    da[dp++] = Character.highSurrogate(uc);
                    da[dp++] = Character.lowSurrogate(uc);
                    sp += 4;
                } else
                    // Not a valid lead byte for any sequence length.
                    return malformed(src, sp, dst, dp, 1);
            }
            return xflow(src, sp, sl, dst, dp, 0);
        }

        /**
         * Decodes through the buffers' relative get/put methods; used when
         * at least one buffer is not array-backed.
         *
         * @param src source buffer
         * @param dst destination buffer
         * @return underflow, overflow or a malformed-input result; on
         *         return {@code src} is positioned at the first
         *         unconsumed byte
         */
        private CoderResult decodeBufferLoop(ByteBuffer src,
                                             CharBuffer dst)
        {
            // mark tracks the start of the sequence being decoded; src is
            // rewound to it whenever the sequence cannot be completed.
            int mark = src.position();
            int limit = src.limit();
            while (mark < limit) {
                int b1 = src.get();
                if (b1 >= 0) {
                    // 1 byte, 7 bits: 0xxxxxxx
                    if (dst.remaining() < 1)
                        return xflow(src, mark, 1); // overflow
                    dst.put((char) b1);
                    mark++;
                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                    if (limit - mark < 2 || dst.remaining() < 1)
                        return xflow(src, mark, 2);
                    int b2 = src.get();
                    if (isNotContinuation(b2))
                        return malformedForLength(src, mark, 1);
                    dst.put((char) (((b1 << 6) ^ b2)
                                    ^
                                    (((byte) 0xC0 << 6) ^
                                     ((byte) 0x80 << 0))));
                    mark += 2;
                } else if ((b1 >> 4) == -2) {
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = limit - mark;
                    if (srcRemaining < 3 || dst.remaining() < 1) {
                        if (srcRemaining > 1 && isMalformed3_2(b1, src.get()))
                            return malformedForLength(src, mark, 1);
                        return xflow(src, mark, 3);
                    }
                    int b2 = src.get();
                    int b3 = src.get();
                    if (isMalformed3(b1, b2, b3))
                        return malformed(src, mark, 3);
                    char c = (char)
                        ((b1 << 12) ^
                         (b2 <<  6) ^
                         (b3 ^
                          (((byte) 0xE0 << 12) ^
                           ((byte) 0x80 <<  6) ^
                           ((byte) 0x80 <<  0))));
                    if (Character.isSurrogate(c))
                        return malformedForLength(src, mark, 3);
                    dst.put(c);
                    mark += 3;
                } else if ((b1 >> 3) == -2) {
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = limit - mark;
                    if (srcRemaining < 4 || dst.remaining() < 2) {
                        b1 &= 0xff;
                        if (b1 > 0xf4 ||
                            srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff))
                            return malformedForLength(src, mark, 1);
                        if (srcRemaining > 2 && isMalformed4_3(src.get()))
                            return malformedForLength(src, mark, 2);
                        return xflow(src, mark, 4);
                    }
                    int b2 = src.get();
                    int b3 = src.get();
                    int b4 = src.get();
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                                ((byte) 0x80 << 12) ^
                                ((byte) 0x80 <<  6) ^
                                ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        // shortest form check
                        !Character.isSupplementaryCodePoint(uc)) {
                        return malformed(src, mark, 4);
                    }
                    dst.put(Character.highSurrogate(uc));
                    dst.put(Character.lowSurrogate(uc));
                    mark += 4;
                } else {
                    return malformed(src, mark, 1);
                }
            }
            return xflow(src, mark, 0);
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, otherwise to the generic buffer loop.
         *
         * @param src source bytes
         * @param dst destination chars
         * @return the result of the selected loop
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer src,
                                         CharBuffer dst)
        {
            if (src.hasArray() && dst.hasArray())
                return decodeArrayLoop(src, dst);
            else
                return decodeBufferLoop(src, dst);
        }

        /**
         * Returns {@code bb}, or a new buffer wrapping {@code ba} when
         * {@code bb} is {@code null}, positioned at {@code sp}.
         * NOTE(review): not referenced anywhere in this file.
         *
         * @param bb existing buffer to reuse, or {@code null}
         * @param ba array to wrap when {@code bb} is {@code null}
         * @param sp position to set on the returned buffer
         * @return the positioned buffer
         */
        private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp)
        {
            if (bb == null)
                bb = ByteBuffer.wrap(ba);
            bb.position(sp);
            return bb;
        }
    }

    /**
     * Encoder that turns UTF-16 chars into UTF-8 bytes; surrogate pairs
     * are combined into one 4-byte sequence.
     */
    private static final class Encoder extends CharsetEncoder {

        /**
         * @param cs the charset that created this encoder
         */
        private Encoder(Charset cs) {
            // averageBytesPerChar = 1.1, maxBytesPerChar = 3.0
            super(cs, 1.1f, 3.0f);
        }

        /**
         * Tells whether a single char can be encoded on its own.
         *
         * @param c the char to test
         * @return {@code true} unless {@code c} is a surrogate
         */
        @Override
        public boolean canEncode(char c) {
            return !Character.isSurrogate(c);
        }

        /**
         * Accepts any single ASCII byte as a replacement without further
         * checks and defers every other case to the superclass.
         *
         * @param repl candidate replacement bytes
         * @return {@code true} if the replacement is acceptable
         */
        @Override
        public boolean isLegalReplacement(byte[] repl) {
            return ((repl.length == 1 && repl[0] >= 0) ||
                    super.isLegalReplacement(repl));
        }

        /**
         * Array-loop variant: updates both buffer positions and reports
         * overflow.
         *
         * @param src source buffer (array-backed)
         * @param sp  array index of the next unread char
         * @param dst destination buffer (array-backed)
         * @param dp  array index of the next byte to write
         * @return {@code CoderResult.OVERFLOW}
         */
        private static CoderResult overflow(CharBuffer src, int sp,
                                            ByteBuffer dst, int dp) {
            updatePositions(src, sp, dst, dp);
            return CoderResult.OVERFLOW;
        }

        /**
         * Buffer-loop variant: rewinds {@code src} to {@code mark} and
         * reports overflow.
         *
         * @param src  source buffer
         * @param mark position of the first unconsumed char
         * @return {@code CoderResult.OVERFLOW}
         */
        private static CoderResult overflow(CharBuffer src, int mark) {
            src.position(mark);
            return CoderResult.OVERFLOW;
        }

        // Surrogate parser, created on first use by either encode loop.
        private Surrogate.Parser sgp;

        /**
         * Encodes directly on the backing arrays of both buffers.
         *
         * @param src array-backed source buffer
         * @param dst array-backed destination buffer
         * @return underflow when all input was consumed, overflow when
         *         {@code dst} is too small for the next sequence, or the
         *         surrogate parser's error result
         */
        private CoderResult encodeArrayLoop(CharBuffer src,
                                            ByteBuffer dst)
        {
            char[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            byte[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            // Bounded by both remaining input and remaining output.
            int dlASCII = dp + Math.min(sl - sp, dl - dp);

            // ASCII only loop
            while (dp < dlASCII && sa[sp] < '\u0080')
                da[dp++] = (byte) sa[sp++];
            while (sp < sl) {
                char c = sa[sp];
                if (c < 0x80) {
                    // Have at most seven bits
                    if (dp >= dl)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)c;
                } else if (c < 0x800) {
                    // 2 bytes, 11 bits
                    if (dl - dp < 2)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)(0xc0 | (c >> 6));
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
                } else if (Character.isSurrogate(c)) {
                    // Have a surrogate pair
                    if (sgp == null)
                        sgp = new Surrogate.Parser();
                    int uc = sgp.parse(c, sa, sp, sl);
                    // A negative value signals a parse failure; the parser
                    // supplies the matching CoderResult.
                    if (uc < 0) {
                        updatePositions(src, sp, dst, dp);
                        return sgp.error();
                    }
                    if (dl - dp < 4)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)(0xf0 | ((uc >> 18)));
                    da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
                    da[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
                    da[dp++] = (byte)(0x80 | (uc & 0x3f));
                    sp++;  // 2 chars
                } else {
                    // 3 bytes, 16 bits
                    if (dl - dp < 3)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)(0xe0 | ((c >> 12)));
                    da[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
                }
                sp++;
            }
            updatePositions(src, sp, dst, dp);
            return CoderResult.UNDERFLOW;
        }

        /**
         * Encodes through the buffers' relative get/put methods; used when
         * at least one buffer is not array-backed.
         *
         * @param src source buffer
         * @param dst destination buffer
         * @return underflow when all input was consumed, overflow when
         *         {@code dst} is too small for the next sequence, or the
         *         surrogate parser's error result; on return {@code src}
         *         is positioned at the first unconsumed char
         */
        private CoderResult encodeBufferLoop(CharBuffer src,
                                             ByteBuffer dst)
        {
            // mark tracks the first char not yet fully encoded.
            int mark = src.position();
            while (src.hasRemaining()) {
                char c = src.get();
                if (c < 0x80) {
                    // Have at most seven bits
                    if (!dst.hasRemaining())
                        return overflow(src, mark);
                    dst.put((byte)c);
                } else if (c < 0x800) {
                    // 2 bytes, 11 bits
                    if (dst.remaining() < 2)
                        return overflow(src, mark);
                    dst.put((byte)(0xc0 | (c >> 6)));
                    dst.put((byte)(0x80 | (c & 0x3f)));
                } else if (Character.isSurrogate(c)) {
                    // Have a surrogate pair
                    if (sgp == null)
                        sgp = new Surrogate.Parser();
                    // NOTE(review): presumably parse() reads the second
                    // surrogate from src itself -- confirm in Surrogate.
                    int uc = sgp.parse(c, src);
                    if (uc < 0) {
                        src.position(mark);
                        return sgp.error();
                    }
                    if (dst.remaining() < 4)
                        return overflow(src, mark);
                    dst.put((byte)(0xf0 | ((uc >> 18))));
                    dst.put((byte)(0x80 | ((uc >> 12) & 0x3f)));
                    dst.put((byte)(0x80 | ((uc >>  6) & 0x3f)));
                    dst.put((byte)(0x80 | (uc & 0x3f)));
                    mark++;  // 2 chars
                } else {
                    // 3 bytes, 16 bits
                    if (dst.remaining() < 3)
                        return overflow(src, mark);
                    dst.put((byte)(0xe0 | ((c >> 12))));
                    dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
                    dst.put((byte)(0x80 | (c & 0x3f)));
                }
                mark++;
            }
            src.position(mark);
            return CoderResult.UNDERFLOW;
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, otherwise to the generic buffer loop.
         *
         * @param src source chars
         * @param dst destination bytes
         * @return the result of the selected loop
         */
        @Override
        protected final CoderResult encodeLoop(CharBuffer src,
                                               ByteBuffer dst)
        {
            if (src.hasArray() && dst.hasArray())
                return encodeArrayLoop(src, dst);
            else
                return encodeBufferLoop(src, dst);
        }

    }
}