/*
 * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.nio.cs;

import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

/* Legal UTF-8 Byte Sequences
 *
 * #    Code Points      Bits   Bit/Byte pattern
 * 1                     7      0xxxxxxx
 *      U+0000..U+007F          00..7F
 *
 * 2                     11     110xxxxx    10xxxxxx
 *      U+0080..U+07FF          C2..DF      80..BF
 *
 * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
 *      U+0800..U+0FFF          E0          A0..BF      80..BF
 *      U+1000..U+FFFF          E1..EF      80..BF      80..BF
 *
 * 4                     21     11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
 *     U+10000..U+3FFFF         F0          90..BF      80..BF      80..BF
 *     U+40000..U+FFFFF         F1..F3      80..BF      80..BF      80..BF
 *    U+100000..U+10FFFF        F4          80..8F      80..BF      80..BF
 *
 */

/**
 * The UTF-8 charset. Supplies a {@link CharsetDecoder} and a
 * {@link CharsetEncoder} that follow the byte-sequence table above.
 */
public final class UTF_8 extends Unicode {

    /** Shared instance of this charset. */
    public static final UTF_8 INSTANCE = new UTF_8();

    public UTF_8() {
        super("UTF-8", StandardCharsets.aliases_UTF_8());
    }

    /**
     * Returns the historical name of this charset.
     *
     * @return the string {@code "UTF8"}
     */
    public String historicalName() {
        return "UTF8";
    }

    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * Writes array-relative indices back into a pair of array-backed buffers.
     *
     * @param src source buffer; its position becomes {@code sp - src.arrayOffset()}
     * @param sp  index into the source backing array
     * @param dst destination buffer; its position becomes {@code dp - dst.arrayOffset()}
     * @param dp  index into the destination backing array
     */
    static final void updatePositions(Buffer src, int sp,
                                      Buffer dst, int dp) {
        src.position(sp - src.arrayOffset());
        dst.position(dp - dst.arrayOffset());
    }

    /**
     * Decoder from UTF-8 bytes to UTF-16 chars. Byte values are handled as
     * sign-extended ints unless a helper states otherwise, which is why lead
     * bytes are matched with arithmetic shifts against {@code -2}.
     */
    private static class Decoder extends CharsetDecoder {

        private Decoder(Charset cs) {
            // averageCharsPerByte = 1.0, maxCharsPerByte = 1.0
            super(cs, 1.0f, 1.0f);
        }

        /** Returns true if {@code b} does not have the form 10xxxxxx. */
        private static boolean isNotContinuation(int b) {
            return (b & 0xc0) != 0x80;
        }

        //  [E0]     [A0..BF] [80..BF]
        //  [E1..EF] [80..BF] [80..BF]
        /**
         * Checks the trailing bytes of a 3-byte sequence. {@code b1} is
         * expected as a sign-extended byte (it is compared to
         * {@code (byte)0xe0}).
         */
        private static boolean isMalformed3(int b1, int b2, int b3) {
            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                   (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
        }

        // only used when there is only one byte left in src buffer
        /**
         * Checks the first two bytes of a 3-byte sequence when the third
         * byte is not available.
         */
        private static boolean isMalformed3_2(int b1, int b2) {
            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                   (b2 & 0xc0) != 0x80;
        }

        //  [F0]     [90..BF] [80..BF] [80..BF]
        //  [F1..F3] [80..BF] [80..BF] [80..BF]
        //  [F4]     [80..8F] [80..BF] [80..BF]
        //  only check 80-bf range here, the [0xf0,0x80...] and [0xf4,0x90-...]
        //  will be checked by Character.isSupplementaryCodePoint(uc)
        private static boolean isMalformed4(int b2, int b3, int b4) {
            return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
                   (b4 & 0xc0) != 0x80;
        }

        // only used when there is less than 4 bytes left in src buffer.
        // both b1 and b2 should be "& 0xff" before passed in.
        /**
         * Checks the first two bytes of a 4-byte sequence, including the
         * restricted second-byte ranges after F0 and F4.
         */
        private static boolean isMalformed4_2(int b1, int b2) {
            return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
                   (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
                   (b2 & 0xc0) != 0x80;
        }

        // tests if b3 is malformed as the third byte of a legal 4-byte
        // utf-8 byte sequence.
        // only used when there is less than 4 bytes left in src buffer,
        // after isMalformed4_2 has been invoked.
        private static boolean isMalformed4_3(int b3) {
            return (b3 & 0xc0) != 0x80;
        }

        /**
         * Computes the malformed-input length for a sequence whose lead byte
         * announced {@code nb} bytes. Reads from the current position of
         * {@code src}; callers position the buffer before the call and
         * restore it afterwards.
         *
         * @param src source buffer positioned at the lead byte
         * @param nb  announced sequence length (1 to 4)
         * @return a malformed-input result whose length covers the bytes
         *         that were individually acceptable before the first bad one
         */
        private static CoderResult malformedN(ByteBuffer src, int nb) {
            switch (nb) {
            case 1:
            case 2:                    // always 1
                return CoderResult.malformedForLength(1);
            case 3: {
                int b1 = src.get();
                int b2 = src.get();    // no need to lookup b3
                return CoderResult.malformedForLength(
                    isMalformed3_2(b1, b2) ? 1 : 2);
            }
            case 4: {                  // we don't care the speed here
                int b1 = src.get() & 0xff;
                int b2 = src.get() & 0xff;
                if (b1 > 0xf4 || isMalformed4_2(b1, b2)) {
                    return CoderResult.malformedForLength(1);
                }
                if (isMalformed4_3(src.get())) {
                    return CoderResult.malformedForLength(2);
                }
                return CoderResult.malformedForLength(3);
            }
            default:
                // no caller in this class passes another value
                assert false;
                return null;
            }
        }

        /**
         * Array-loop variant: computes the malformed length starting at
         * array index {@code sp}, then stores {@code sp}/{@code dp} back
         * into the buffers.
         */
        private static CoderResult malformed(ByteBuffer src, int sp,
                                             CharBuffer dst, int dp,
                                             int nb)
        {
            src.position(sp - src.arrayOffset());
            CoderResult cr = malformedN(src, nb);
            updatePositions(src, sp, dst, dp);
            return cr;
        }

        /**
         * Buffer-loop variant: computes the malformed length starting at
         * position {@code mark} and leaves {@code src} positioned at
         * {@code mark}.
         */
        private static CoderResult malformed(ByteBuffer src,
                                             int mark, int nb)
        {
            src.position(mark);
            CoderResult cr = malformedN(src, nb);
            src.position(mark);
            return cr;
        }

        /**
         * Array-loop variant: stores the positions and reports a malformed
         * input of the given, already known, length.
         */
        private static CoderResult malformedForLength(ByteBuffer src,
                                                      int sp,
                                                      CharBuffer dst,
                                                      int dp,
                                                      int malformedNB)
        {
            updatePositions(src, sp, dst, dp);
            return CoderResult.malformedForLength(malformedNB);
        }

        /**
         * Buffer-loop variant: resets {@code src} to {@code mark} and reports
         * a malformed input of the given, already known, length.
         */
        private static CoderResult malformedForLength(ByteBuffer src,
                                                      int mark,
                                                      int malformedNB)
        {
            src.position(mark);
            return CoderResult.malformedForLength(malformedNB);
        }

        /**
         * Array-loop variant: stores the positions and picks the flow result.
         *
         * @param nb number of source bytes the current sequence needs, or 0
         *           when the input was fully consumed
         * @return {@code UNDERFLOW} if {@code nb} is 0 or fewer than
         *         {@code nb} source bytes remain, otherwise {@code OVERFLOW}
         */
        private static CoderResult xflow(Buffer src, int sp, int sl,
                                         Buffer dst, int dp, int nb) {
            updatePositions(src, sp, dst, dp);
            return (nb == 0 || sl - sp < nb)
                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
        }

        /**
         * Buffer-loop variant: resets {@code src} to {@code mark} and picks
         * the flow result by the same rule as the array variant.
         */
        private static CoderResult xflow(Buffer src, int mark, int nb) {
            src.position(mark);
            return (nb == 0 || src.remaining() < nb)
                   ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
        }

        /**
         * Decodes using the backing arrays of both buffers. Buffer positions
         * are written back only when the method returns.
         */
        private CoderResult decodeArrayLoop(ByteBuffer src,
                                            CharBuffer dst)
        {
            // This method is optimized for ASCII input.
            byte[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            char[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            // bound that keeps both sp and dp in range during the ASCII loop
            int dlASCII = dp + Math.min(sl - sp, dl - dp);

            // ASCII only loop
            while (dp < dlASCII && sa[sp] >= 0) {
                da[dp++] = (char) sa[sp++];
            }
            while (sp < sl) {
                int b1 = sa[sp];
                if (b1 >= 0) {
                    // 1 byte, 7 bits: 0xxxxxxx
                    if (dp >= dl) {
                        return xflow(src, sp, sl, dst, dp, 1);
                    }
                    da[dp++] = (char) b1;
                    sp++;
                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                    //                   [C2..DF] [80..BF]
                    if (sl - sp < 2 || dp >= dl) {
                        return xflow(src, sp, sl, dst, dp, 2);
                    }
                    int b2 = sa[sp + 1];
                    // Now we check the first byte of 2-byte sequence as
                    //     if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0)
                    // no longer need to check b1 against c1 & c0 for
                    // malformed as we did in previous version
                    //   (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;
                    // only need to check the second byte b2.
                    if (isNotContinuation(b2)) {
                        return malformedForLength(src, sp, dst, dp, 1);
                    }
                    // the XOR constant cancels the tag bits of both bytes
                    da[dp++] = (char) (((b1 << 6) ^ b2)
                                       ^
                                       (((byte) 0xC0 << 6) ^
                                        ((byte) 0x80 << 0)));
                    sp += 2;
                } else if ((b1 >> 4) == -2) {
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = sl - sp;
                    if (srcRemaining < 3 || dp >= dl) {
                        if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1])) {
                            return malformedForLength(src, sp, dst, dp, 1);
                        }
                        return xflow(src, sp, sl, dst, dp, 3);
                    }
                    int b2 = sa[sp + 1];
                    int b3 = sa[sp + 2];
                    if (isMalformed3(b1, b2, b3)) {
                        return malformed(src, sp, dst, dp, 3);
                    }
                    char c = (char)
                        ((b1 << 12) ^
                         (b2 <<  6) ^
                         (b3 ^
                          (((byte) 0xE0 << 12) ^
                           ((byte) 0x80 <<  6) ^
                           ((byte) 0x80 <<  0))));
                    if (Character.isSurrogate(c)) {
                        return malformedForLength(src, sp, dst, dp, 3);
                    }
                    da[dp++] = c;
                    sp += 3;
                } else if ((b1 >> 3) == -2) {
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = sl - sp;
                    if (srcRemaining < 4 || dl - dp < 2) {
                        b1 &= 0xff;
                        if (b1 > 0xf4 ||
                            srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff)) {
                            return malformedForLength(src, sp, dst, dp, 1);
                        }
                        if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2])) {
                            return malformedForLength(src, sp, dst, dp, 2);
                        }
                        return xflow(src, sp, sl, dst, dp, 4);
                    }
                    int b2 = sa[sp + 1];
                    int b3 = sa[sp + 2];
                    int b4 = sa[sp + 3];
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                                ((byte) 0x80 << 12) ^
                                ((byte) 0x80 <<  6) ^
                                ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        // shortest form check
                        !Character.isSupplementaryCodePoint(uc)) {
                        return malformed(src, sp, dst, dp, 4);
                    }
                    da[dp++] = Character.highSurrogate(uc);
                    da[dp++] = Character.lowSurrogate(uc);
                    sp += 4;
                } else {
                    return malformed(src, sp, dst, dp, 1);
                }
            }
            return xflow(src, sp, sl, dst, dp, 0);
        }

        /**
         * Decodes through the buffers' relative get/put methods.
         * {@code mark} is the source position of the sequence being decoded;
         * every early return resets {@code src} to it.
         */
        private CoderResult decodeBufferLoop(ByteBuffer src,
                                             CharBuffer dst)
        {
            int mark = src.position();
            int limit = src.limit();
            while (mark < limit) {
                int b1 = src.get();
                if (b1 >= 0) {
                    // 1 byte, 7 bits: 0xxxxxxx
                    if (dst.remaining() < 1) {
                        return xflow(src, mark, 1); // overflow
                    }
                    dst.put((char) b1);
                    mark++;
                } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                    if (limit - mark < 2 || dst.remaining() < 1) {
                        return xflow(src, mark, 2);
                    }
                    int b2 = src.get();
                    if (isNotContinuation(b2)) {
                        return malformedForLength(src, mark, 1);
                    }
                    dst.put((char) (((b1 << 6) ^ b2)
                                    ^
                                    (((byte) 0xC0 << 6) ^
                                     ((byte) 0x80 << 0))));
                    mark += 2;
                } else if ((b1 >> 4) == -2) {
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = limit - mark;
                    if (srcRemaining < 3 || dst.remaining() < 1) {
                        if (srcRemaining > 1 && isMalformed3_2(b1, src.get())) {
                            return malformedForLength(src, mark, 1);
                        }
                        return xflow(src, mark, 3);
                    }
                    int b2 = src.get();
                    int b3 = src.get();
                    if (isMalformed3(b1, b2, b3)) {
                        return malformed(src, mark, 3);
                    }
                    char c = (char)
                        ((b1 << 12) ^
                         (b2 <<  6) ^
                         (b3 ^
                          (((byte) 0xE0 << 12) ^
                           ((byte) 0x80 <<  6) ^
                           ((byte) 0x80 <<  0))));
                    if (Character.isSurrogate(c)) {
                        return malformedForLength(src, mark, 3);
                    }
                    dst.put(c);
                    mark += 3;
                } else if ((b1 >> 3) == -2) {
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    int srcRemaining = limit - mark;
                    if (srcRemaining < 4 || dst.remaining() < 2) {
                        b1 &= 0xff;
                        if (b1 > 0xf4 ||
                            srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff)) {
                            return malformedForLength(src, mark, 1);
                        }
                        if (srcRemaining > 2 && isMalformed4_3(src.get())) {
                            return malformedForLength(src, mark, 2);
                        }
                        return xflow(src, mark, 4);
                    }
                    int b2 = src.get();
                    int b3 = src.get();
                    int b4 = src.get();
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                                ((byte) 0x80 << 12) ^
                                ((byte) 0x80 <<  6) ^
                                ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        // shortest form check
                        !Character.isSupplementaryCodePoint(uc)) {
                        return malformed(src, mark, 4);
                    }
                    dst.put(Character.highSurrogate(uc));
                    dst.put(Character.lowSurrogate(uc));
                    mark += 4;
                } else {
                    return malformed(src, mark, 1);
                }
            }
            return xflow(src, mark, 0);
        }

        /**
         * Dispatches to the array loop when both buffers expose a backing
         * array, otherwise to the buffer loop.
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer src,
                                         CharBuffer dst)
        {
            if (src.hasArray() && dst.hasArray()) {
                return decodeArrayLoop(src, dst);
            } else {
                return decodeBufferLoop(src, dst);
            }
        }
    }

    /**
     * Encoder from UTF-16 chars to UTF-8 bytes. Surrogate chars are handed
     * to a {@code Surrogate.Parser}; a negative parse result is returned to
     * the caller as the parser's error.
     */
    private static final class Encoder extends CharsetEncoder {

        private Encoder(Charset cs) {
            // averageBytesPerChar = 1.1, maxBytesPerChar = 3.0
            super(cs, 1.1f, 3.0f);
        }

        /** Every char except a surrogate can be encoded on its own. */
        @Override
        public boolean canEncode(char c) {
            return !Character.isSurrogate(c);
        }

        /**
         * Accepts any single non-negative (ASCII) byte directly and defers
         * to the superclass check for everything else.
         */
        @Override
        public boolean isLegalReplacement(byte[] repl) {
            return ((repl.length == 1 && repl[0] >= 0) ||
                    super.isLegalReplacement(repl));
        }

        /** Array-loop variant: stores the positions and reports overflow. */
        private static CoderResult overflow(CharBuffer src, int sp,
                                            ByteBuffer dst, int dp) {
            updatePositions(src, sp, dst, dp);
            return CoderResult.OVERFLOW;
        }

        /** Buffer-loop variant: resets {@code src} to {@code mark} and reports overflow. */
        private static CoderResult overflow(CharBuffer src, int mark) {
            src.position(mark);
            return CoderResult.OVERFLOW;
        }

        /** Surrogate parser, created on first use by either encode loop. */
        private Surrogate.Parser sgp;

        /**
         * Encodes using the backing arrays of both buffers. Buffer positions
         * are written back only when the method returns.
         */
        private CoderResult encodeArrayLoop(CharBuffer src,
                                            ByteBuffer dst)
        {
            char[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            byte[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            // bound that keeps both sp and dp in range during the ASCII loop
            int dlASCII = dp + Math.min(sl - sp, dl - dp);

            // ASCII only loop
            while (dp < dlASCII && sa[sp] < '\u0080') {
                da[dp++] = (byte) sa[sp++];
            }
            while (sp < sl) {
                char c = sa[sp];
                if (c < 0x80) {
                    // Have at most seven bits
                    if (dp >= dl) {
                        return overflow(src, sp, dst, dp);
                    }
                    da[dp++] = (byte)c;
                } else if (c < 0x800) {
                    // 2 bytes, 11 bits
                    if (dl - dp < 2) {
                        return overflow(src, sp, dst, dp);
                    }
                    da[dp++] = (byte)(0xc0 | (c >> 6));
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
                } else if (Character.isSurrogate(c)) {
                    // Have a surrogate pair
                    if (sgp == null) {
                        sgp = new Surrogate.Parser();
                    }
                    int uc = sgp.parse(c, sa, sp, sl);
                    if (uc < 0) {
                        updatePositions(src, sp, dst, dp);
                        return sgp.error();
                    }
                    if (dl - dp < 4) {
                        return overflow(src, sp, dst, dp);
                    }
                    da[dp++] = (byte)(0xf0 | ((uc >> 18)));
                    da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
                    da[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
                    da[dp++] = (byte)(0x80 | (uc & 0x3f));
                    sp++;  // 2 chars
                } else {
                    // 3 bytes, 16 bits
                    if (dl - dp < 3) {
                        return overflow(src, sp, dst, dp);
                    }
                    da[dp++] = (byte)(0xe0 | ((c >> 12)));
                    da[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
                }
                sp++;
            }
            updatePositions(src, sp, dst, dp);
            return CoderResult.UNDERFLOW;
        }

        /**
         * Encodes through the buffers' relative get/put methods.
         * {@code mark} is the source position of the char being encoded;
         * every early return resets {@code src} to it.
         */
        private CoderResult encodeBufferLoop(CharBuffer src,
                                             ByteBuffer dst)
        {
            int mark = src.position();
            while (src.hasRemaining()) {
                char c = src.get();
                if (c < 0x80) {
                    // Have at most seven bits
                    if (!dst.hasRemaining()) {
                        return overflow(src, mark);
                    }
                    dst.put((byte)c);
                } else if (c < 0x800) {
                    // 2 bytes, 11 bits
                    if (dst.remaining() < 2) {
                        return overflow(src, mark);
                    }
                    dst.put((byte)(0xc0 | (c >> 6)));
                    dst.put((byte)(0x80 | (c & 0x3f)));
                } else if (Character.isSurrogate(c)) {
                    // Have a surrogate pair
                    if (sgp == null) {
                        sgp = new Surrogate.Parser();
                    }
                    int uc = sgp.parse(c, src);
                    if (uc < 0) {
                        src.position(mark);
                        return sgp.error();
                    }
                    if (dst.remaining() < 4) {
                        return overflow(src, mark);
                    }
                    dst.put((byte)(0xf0 | ((uc >> 18))));
                    dst.put((byte)(0x80 | ((uc >> 12) & 0x3f)));
                    dst.put((byte)(0x80 | ((uc >>  6) & 0x3f)));
                    dst.put((byte)(0x80 | (uc & 0x3f)));
                    mark++;  // 2 chars
                } else {
                    // 3 bytes, 16 bits
                    if (dst.remaining() < 3) {
                        return overflow(src, mark);
                    }
                    dst.put((byte)(0xe0 | ((c >> 12))));
                    dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
                    dst.put((byte)(0x80 | (c & 0x3f)));
                }
                mark++;
            }
            src.position(mark);
            return CoderResult.UNDERFLOW;
        }

        /**
         * Dispatches to the array loop when both buffers expose a backing
         * array, otherwise to the buffer loop.
         */
        @Override
        protected final CoderResult encodeLoop(CharBuffer src,
                                               ByteBuffer dst)
        {
            if (src.hasArray() && dst.hasArray()) {
                return encodeArrayLoop(src, dst);
            } else {
                return encodeBufferLoop(src, dst);
            }
        }

    }
}