/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package sun.nio.cs;

import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

/* Legal UTF-8 Byte Sequences
 *
 * #    Code Points      Bits   Bit/Byte pattern
 * 1                     7      0xxxxxxx
 *      U+0000..U+007F          00..7F
 *
 * 2                     11     110xxxxx    10xxxxxx
 *      U+0080..U+07FF          C2..DF      80..BF
 *
 * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
 *      U+0800..U+0FFF          E0          A0..BF      80..BF
 *      U+1000..U+CFFF          E1..EC      80..BF      80..BF
 *      U+D000..U+D7FF          ED          80..9F      80..BF
 *      U+E000..U+FFFF          EE..EF      80..BF      80..BF
 *      (U+D800..U+DFFF are surrogates and have no legal UTF-8 form)
 *
 * 4                     21     11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
 *     U+10000..U+3FFFF         F0          90..BF      80..BF      80..BF
 *     U+40000..U+FFFFF         F1..F3      80..BF      80..BF      80..BF
 *    U+100000..U+10FFFF        F4          80..8F      80..BF      80..BF
 *
 */

/**
 * The UTF-8 charset: supplies a {@link CharsetDecoder} and a
 * {@link CharsetEncoder} that convert between UTF-8 byte sequences and
 * UTF-16 chars, following the byte-sequence table above.
 */
class UTF_8 extends Unicode
{
    public UTF_8() {
        super("UTF-8", StandardCharsets.aliases_UTF_8);
    }

    /** Returns the historical name of this charset, {@code "UTF8"}. */
    public String historicalName() {
        return "UTF8";
    }

    @Override
    public CharsetDecoder newDecoder() {
        return new Decoder(this);
    }

    @Override
    public CharsetEncoder newEncoder() {
        return new Encoder(this);
    }

    /**
     * Writes array-relative indices back into the buffers as positions.
     *
     * @param src source buffer (must be array-backed)
     * @param sp  index into {@code src}'s backing array, including its
     *            array offset
     * @param dst destination buffer (must be array-backed)
     * @param dp  index into {@code dst}'s backing array, including its
     *            array offset
     */
    static final void updatePositions(Buffer src, int sp,
                                      Buffer dst, int dp) {
        src.position(sp - src.arrayOffset());
        dst.position(dp - dst.arrayOffset());
    }

    /**
     * UTF-8 decoder. Bytes are handled as sign-extended {@code int}s, so the
     * lead-byte class is detected with an arithmetic shift compared against
     * {@code -2} (e.g. {@code (b1 >> 5) == -2} matches {@code 110xxxxx}).
     */
    private static class Decoder extends CharsetDecoder {
        private Decoder(Charset cs) {
            // averageCharsPerByte = 1.0, maxCharsPerByte = 1.0
            super(cs, 1.0f, 1.0f);
        }

        /** True if {@code b} is not of the form {@code 10xxxxxx}. */
        private static boolean isNotContinuation(int b) {
            return (b & 0xc0) != 0x80;
        }

        //  [C2..DF] [80..BF]
        /**
         * True if a 2-byte sequence is ill-formed: the lead byte is C0/C1
         * (overlong form) or the second byte is not a continuation byte.
         */
        private static boolean isMalformed2(int b1, int b2) {
            return (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;
        }

        //  [E0]     [A0..BF] [80..BF]
        //  [E1..EF] [80..BF] [80..BF]
        /**
         * True if a 3-byte sequence is an overlong form (E0 followed by
         * 80..9F) or has a non-continuation second or third byte. Encoded
         * surrogates (ED A0..BF xx) are NOT detected here; the decode loops
         * reject them after assembling the char.
         */
        private static boolean isMalformed3(int b1, int b2, int b3) {
            return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                   (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
        }

        //  [F0]     [90..BF] [80..BF] [80..BF]
        //  [F1..F3] [80..BF] [80..BF] [80..BF]
        //  [F4]     [80..8F] [80..BF] [80..BF]
        //  only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...]
        //  will be checked by Character.isSupplementaryCodePoint(uc)
        /** True if any trailing byte of a 4-byte sequence is not a continuation byte. */
        private static boolean isMalformed4(int b2, int b3, int b4) {
            return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
                   (b4 & 0xc0) != 0x80;
        }

        /**
         * Reads up to {@code n - 1} bytes from {@code src} and returns a
         * malformed-input result whose length is the 1-based index of the
         * first byte that is not a continuation byte, or {@code n} if all of
         * them are continuation bytes. Advances {@code src}'s position.
         */
        private static CoderResult lookupN(ByteBuffer src, int n)
        {
            for (int i = 1; i < n; i++) {
               if (isNotContinuation(src.get()))
                   return CoderResult.malformedForLength(i);
            }
            return CoderResult.malformedForLength(n);
        }

        /**
         * Computes the malformed-input result for an ill-formed sequence
         * starting at {@code src}'s current position.
         *
         * @param src source buffer positioned at the lead byte; its position
         *            is advanced by the bytes examined (callers restore it)
         * @param nb  the sequence length implied by the lead byte (1..4);
         *            1 is also used for bytes that are not a valid lead byte
         * @return a malformed-input result, or {@code UNDERFLOW} when an
         *         obsolete 5- or 6-byte lead byte is seen without enough
         *         following bytes to measure the sequence
         */
        private static CoderResult malformedN(ByteBuffer src, int nb) {
            switch (nb) {
            case 1:
                int b1 = src.get();
                if ((b1 >> 2) == -2) {
                    // 5 bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    if (src.remaining() < 4)
                        return CoderResult.UNDERFLOW;
                    return lookupN(src, 5);
                }
                if ((b1 >> 1) == -2) {
                    // 6 bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    if (src.remaining() < 5)
                        return CoderResult.UNDERFLOW;
                    return lookupN(src, 6);
                }
                return CoderResult.malformedForLength(1);
            case 2:                    // always 1
                return CoderResult.malformedForLength(1);
            case 3:
                b1 = src.get();
                int b2 = src.get();    // no need to lookup b3
                return CoderResult.malformedForLength(
                    ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                     isNotContinuation(b2))?1:2);
            case 4:  // we don't care the speed here
                b1 = src.get() & 0xff;
                b2 = src.get() & 0xff;
                if (b1 > 0xf4 ||
                    (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
                    (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
                    isNotContinuation(b2))
                    return CoderResult.malformedForLength(1);
                if (isNotContinuation(src.get()))
                    return CoderResult.malformedForLength(2);
                return CoderResult.malformedForLength(3);
            default:
                assert false;
                return null;
            }
        }

        /**
         * Array-loop variant: classifies the ill-formed sequence starting at
         * array index {@code sp}, then sets both buffer positions from
         * {@code sp}/{@code dp} so the bad bytes remain unconsumed.
         */
        private static CoderResult malformed(ByteBuffer src, int sp,
                                             CharBuffer dst, int dp,
                                             int nb)
        {
            src.position(sp - src.arrayOffset());
            CoderResult cr = malformedN(src, nb);
            updatePositions(src, sp, dst, dp);
            return cr;
        }

        /**
         * Buffer-loop variant: classifies the ill-formed sequence starting at
         * position {@code mark} and leaves {@code src} positioned at
         * {@code mark}.
         */
        private static CoderResult malformed(ByteBuffer src,
                                             int mark, int nb)
        {
            src.position(mark);
            CoderResult cr = malformedN(src, nb);
            src.position(mark);
            return cr;
        }

        /**
         * Array-loop variant: sets both buffer positions from
         * {@code sp}/{@code dp} and reports a malformed input of exactly
         * {@code malformedNB} bytes.
         */
        private static CoderResult malformedForLength(ByteBuffer src, int sp,
                                                      CharBuffer dst, int dp,
                                                      int malformedNB)
        {
            updatePositions(src, sp, dst, dp);
            return CoderResult.malformedForLength(malformedNB);
        }

        /**
         * Buffer-loop variant: repositions {@code src} at {@code mark} and
         * reports a malformed input of exactly {@code malformedNB} bytes.
         */
        private static CoderResult malformedForLength(ByteBuffer src,
                                                      int mark,
                                                      int malformedNB)
        {
            src.position(mark);
            return CoderResult.malformedForLength(malformedNB);
        }

        /**
         * Array-loop variant: stores the positions and returns
         * {@code UNDERFLOW} when {@code nb} is 0 or fewer than {@code nb}
         * source bytes remain, otherwise {@code OVERFLOW}.
         */
        private static CoderResult xflow(Buffer src, int sp, int sl,
                                         Buffer dst, int dp, int nb) {
            updatePositions(src, sp, dst, dp);
            return (nb == 0 || sl - sp < nb)
                   ?CoderResult.UNDERFLOW:CoderResult.OVERFLOW;
        }

        /**
         * Buffer-loop variant. The caller has already consumed the lead byte,
         * hence the comparison against {@code nb - 1}. Repositions
         * {@code src} at {@code mark} before returning.
         */
        private static CoderResult xflow(Buffer src, int mark, int nb) {
            CoderResult cr = (nb == 0 || src.remaining() < (nb - 1))
                             ?CoderResult.UNDERFLOW:CoderResult.OVERFLOW;
            src.position(mark);
            return cr;
        }

        /**
         * Decodes directly on the backing arrays of {@code src} and
         * {@code dst}. On return both buffer positions reflect exactly the
         * bytes consumed and chars produced; an ill-formed or incomplete
         * sequence is left unconsumed.
         *
         * The multi-byte branches XOR the shifted, sign-extended bytes with
         * the equally shifted marker bytes (0xC0/0xE0/0xF0 and 0x80), which
         * cancels the marker and sign-extension bits in one step.
         */
        private CoderResult decodeArrayLoop(ByteBuffer src,
                                            CharBuffer dst)
        {
            // This method is optimized for ASCII input.
            byte[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            char[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            int dlASCII = dp + Math.min(sl - sp, dl - dp);

            // ASCII only loop
            while (dp < dlASCII && sa[sp] >= 0)
                da[dp++] = (char) sa[sp++];

            while (sp < sl) {
                int b1 = sa[sp];
                if (b1 >= 0) {
                    // 1 byte, 7 bits: 0xxxxxxx
                    if (dp >= dl)
                        return xflow(src, sp, sl, dst, dp, 1);
                    da[dp++] = (char) b1;
                    sp++;
                } else if ((b1 >> 5) == -2) {
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                    if (sl - sp < 2 || dp >= dl)
                        return xflow(src, sp, sl, dst, dp, 2);
                    int b2 = sa[sp + 1];
                    if (isMalformed2(b1, b2))
                        return malformed(src, sp, dst, dp, 2);
                    da[dp++] = (char) (((b1 << 6) ^ b2)
                                       ^
                                       (((byte) 0xC0 << 6) ^
                                        ((byte) 0x80 << 0)));
                    sp += 2;
                } else if ((b1 >> 4) == -2) {
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                    if (sl - sp < 3 || dp >= dl)
                        return xflow(src, sp, sl, dst, dp, 3);
                    int b2 = sa[sp + 1];
                    int b3 = sa[sp + 2];
                    if (isMalformed3(b1, b2, b3))
                        return malformed(src, sp, dst, dp, 3);
                    char c = (char)
                        ((b1 << 12) ^
                         (b2 <<  6) ^
                         (b3 ^
                          (((byte) 0xE0 << 12) ^
                          ((byte) 0x80 <<  6) ^
                          ((byte) 0x80 <<  0))));
                    // ED A0..BF xx encodes a surrogate (U+D800..U+DFFF),
                    // which is not a legal UTF-8 sequence.
                    if (Character.isSurrogate(c))
                        return malformedForLength(src, sp, dst, dp, 3);
                    da[dp++] = c;
                    sp += 3;
                } else if ((b1 >> 3) == -2) {
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    if (sl - sp < 4 || dl - dp < 2)
                        return xflow(src, sp, sl, dst, dp, 4);
                    int b2 = sa[sp + 1];
                    int b3 = sa[sp + 2];
                    int b4 = sa[sp + 3];
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                               ((byte) 0x80 << 12) ^
                               ((byte) 0x80 <<  6) ^
                               ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        // shortest form check
                        !Character.isSupplementaryCodePoint(uc)) {
                        return malformed(src, sp, dst, dp, 4);
                    }
                    da[dp++] = Character.highSurrogate(uc);
                    da[dp++] = Character.lowSurrogate(uc);
                    sp += 4;
                } else
                    return malformed(src, sp, dst, dp, 1);
            }
            return xflow(src, sp, sl, dst, dp, 0);
        }

        /**
         * Decodes using relative buffer {@code get}/{@code put} operations.
         * {@code mark} tracks the position of the first byte of the sequence
         * being decoded; every early return repositions {@code src} at
         * {@code mark} so an incomplete or ill-formed sequence is left
         * unconsumed.
         */
        private CoderResult decodeBufferLoop(ByteBuffer src,
                                             CharBuffer dst)
        {
            int mark = src.position();
            int limit = src.limit();
            while (mark < limit) {
                int b1 = src.get();
                if (b1 >= 0) {
                    // 1 byte, 7 bits: 0xxxxxxx
                    if (dst.remaining() < 1)
                        return xflow(src, mark, 1); // overflow
                    dst.put((char) b1);
                    mark++;
                } else if ((b1 >> 5) == -2) {
                    // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                    if (limit - mark < 2|| dst.remaining() < 1)
                        return xflow(src, mark, 2);
                    int b2 = src.get();
                    if (isMalformed2(b1, b2))
                        return malformed(src, mark, 2);
                    dst.put((char) (((b1 << 6) ^ b2)
                                    ^
                                    (((byte) 0xC0 << 6) ^
                                     ((byte) 0x80 << 0))));
                    mark += 2;
                } else if ((b1 >> 4) == -2) {
                    // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                    if (limit - mark < 3 || dst.remaining() < 1)
                        return xflow(src, mark, 3);
                    int b2 = src.get();
                    int b3 = src.get();
                    if (isMalformed3(b1, b2, b3))
                        return malformed(src, mark, 3);
                    char c = (char)
                        ((b1 << 12) ^
                         (b2 <<  6) ^
                         (b3 ^
                          (((byte) 0xE0 << 12) ^
                          ((byte) 0x80 <<  6) ^
                          ((byte) 0x80 <<  0))));
                    // ED A0..BF xx encodes a surrogate (U+D800..U+DFFF),
                    // which is not a legal UTF-8 sequence.
                    if (Character.isSurrogate(c))
                        return malformedForLength(src, mark, 3);
                    dst.put(c);
                    mark += 3;
                } else if ((b1 >> 3) == -2) {
                    // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    if (limit - mark < 4 || dst.remaining() < 2)
                        return xflow(src, mark, 4);
                    int b2 = src.get();
                    int b3 = src.get();
                    int b4 = src.get();
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                               ((byte) 0x80 << 12) ^
                               ((byte) 0x80 <<  6) ^
                               ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        // shortest form check
                        !Character.isSupplementaryCodePoint(uc)) {
                        return malformed(src, mark, 4);
                    }
                    dst.put(Character.highSurrogate(uc));
                    dst.put(Character.lowSurrogate(uc));
                    mark += 4;
                } else {
                    return malformed(src, mark, 1);
                }
            }
            return xflow(src, mark, 0);
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, otherwise to the generic buffer loop.
         */
        @Override
        protected CoderResult decodeLoop(ByteBuffer src,
                                         CharBuffer dst)
        {
            if (src.hasArray() && dst.hasArray())
                return decodeArrayLoop(src, dst);
            else
                return decodeBufferLoop(src, dst);
        }
    }

    /**
     * UTF-8 encoder. BMP chars become 1 to 3 bytes; a surrogate pair is
     * combined into one code point by {@code Surrogate.Parser} and written
     * as 4 bytes.
     */
    private static class Encoder extends CharsetEncoder {

        private Encoder(Charset cs) {
            // averageBytesPerChar = 1.1, maxBytesPerChar = 3.0
            super(cs, 1.1f, 3.0f);
        }

        /** A lone char is encodable unless it is a surrogate. */
        @Override
        public boolean canEncode(char c) {
            return !Character.isSurrogate(c);
        }

        /**
         * Accepts a single non-negative (ASCII) byte without further
         * checking; anything else is delegated to the superclass check.
         */
        @Override
        public boolean isLegalReplacement(byte[] repl) {
            return ((repl.length == 1 && repl[0] >= 0) ||
                    super.isLegalReplacement(repl));
        }

        /** Array-loop variant: stores the positions and returns {@code OVERFLOW}. */
        private static CoderResult overflow(CharBuffer src, int sp,
                                            ByteBuffer dst, int dp) {
            updatePositions(src, sp, dst, dp);
            return CoderResult.OVERFLOW;
        }

        /** Buffer-loop variant: repositions {@code src} at {@code mark} and returns {@code OVERFLOW}. */
        private static CoderResult overflow(CharBuffer src, int mark) {
            src.position(mark);
            return CoderResult.OVERFLOW;
        }

        // Created on first use, when the input contains a surrogate.
        private Surrogate.Parser sgp;

        /**
         * Encodes directly on the backing arrays of {@code src} and
         * {@code dst}. On return both buffer positions reflect exactly the
         * chars consumed and bytes produced; a char whose encoding does not
         * fit, or a surrogate that fails to parse, is left unconsumed.
         */
        private CoderResult encodeArrayLoop(CharBuffer src,
                                            ByteBuffer dst)
        {
            char[] sa = src.array();
            int sp = src.arrayOffset() + src.position();
            int sl = src.arrayOffset() + src.limit();

            byte[] da = dst.array();
            int dp = dst.arrayOffset() + dst.position();
            int dl = dst.arrayOffset() + dst.limit();
            int dlASCII = dp + Math.min(sl - sp, dl - dp);

            // ASCII only loop
            while (dp < dlASCII && sa[sp] < '\u0080')
                da[dp++] = (byte) sa[sp++];
            while (sp < sl) {
                char c = sa[sp];
                if (c < 0x80) {
                    // Have at most seven bits
                    if (dp >= dl)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)c;
                } else if (c < 0x800) {
                    // 2 bytes, 11 bits
                    if (dl - dp < 2)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)(0xc0 | (c >> 6));
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
                } else if (Character.isSurrogate(c)) {
                    // Have a surrogate pair
                    if (sgp == null)
                        sgp = new Surrogate.Parser();
                    int uc = sgp.parse(c, sa, sp, sl);
                    if (uc < 0) {
                        updatePositions(src, sp, dst, dp);
                        return sgp.error();
                    }
                    if (dl - dp < 4)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)(0xf0 | ((uc >> 18)));
                    da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
                    da[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
                    da[dp++] = (byte)(0x80 | (uc & 0x3f));
                    sp++;  // 2 chars
                } else {
                    // 3 bytes, 16 bits
                    if (dl - dp < 3)
                        return overflow(src, sp, dst, dp);
                    da[dp++] = (byte)(0xe0 | ((c >> 12)));
                    da[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
                    da[dp++] = (byte)(0x80 | (c & 0x3f));
                }
                sp++;
            }
            updatePositions(src, sp, dst, dp);
            return CoderResult.UNDERFLOW;
        }

        /**
         * Encodes using relative buffer {@code get}/{@code put} operations.
         * {@code mark} counts the chars fully encoded so far; every early
         * return repositions {@code src} at {@code mark}.
         */
        private CoderResult encodeBufferLoop(CharBuffer src,
                                             ByteBuffer dst)
        {
            int mark = src.position();
            while (src.hasRemaining()) {
                char c = src.get();
                if (c < 0x80) {
                    // Have at most seven bits
                    if (!dst.hasRemaining())
                        return overflow(src, mark);
                    dst.put((byte)c);
                } else if (c < 0x800) {
                    // 2 bytes, 11 bits
                    if (dst.remaining() < 2)
                        return overflow(src, mark);
                    dst.put((byte)(0xc0 | (c >> 6)));
                    dst.put((byte)(0x80 | (c & 0x3f)));
                } else if (Character.isSurrogate(c)) {
                    // Have a surrogate pair
                    if (sgp == null)
                        sgp = new Surrogate.Parser();
                    int uc = sgp.parse(c, src);
                    if (uc < 0) {
                        src.position(mark);
                        return sgp.error();
                    }
                    if (dst.remaining() < 4)
                        return overflow(src, mark);
                    dst.put((byte)(0xf0 | ((uc >> 18))));
                    dst.put((byte)(0x80 | ((uc >> 12) & 0x3f)));
                    dst.put((byte)(0x80 | ((uc >>  6) & 0x3f)));
                    dst.put((byte)(0x80 | (uc & 0x3f)));
                    mark++;  // 2 chars
                } else {
                    // 3 bytes, 16 bits
                    if (dst.remaining() < 3)
                        return overflow(src, mark);
                    dst.put((byte)(0xe0 | ((c >> 12))));
                    dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
                    dst.put((byte)(0x80 | (c & 0x3f)));
                }
                mark++;
            }
            src.position(mark);
            return CoderResult.UNDERFLOW;
        }

        /**
         * Dispatches to the array-based loop when both buffers expose a
         * backing array, otherwise to the generic buffer loop.
         */
        @Override
        protected final CoderResult encodeLoop(CharBuffer src,
                                               ByteBuffer dst)
        {
            if (src.hasArray() && dst.hasArray())
                return encodeArrayLoop(src, dst);
            else
                return encodeBufferLoop(src, dst);
        }
    }
}