1 /* 2 * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.lang; 27 28 import java.io.UnsupportedEncodingException; 29 import java.lang.ref.SoftReference; 30 import java.nio.ByteBuffer; 31 import java.nio.CharBuffer; 32 import java.nio.charset.Charset; 33 import java.nio.charset.CharsetDecoder; 34 import java.nio.charset.CharsetEncoder; 35 import java.nio.charset.CharacterCodingException; 36 import java.nio.charset.CoderResult; 37 import java.nio.charset.CodingErrorAction; 38 import java.nio.charset.IllegalCharsetNameException; 39 import java.nio.charset.UnsupportedCharsetException; 40 import java.util.Arrays; 41 import jdk.internal.HotSpotIntrinsicCandidate; 42 import sun.nio.cs.HistoricallyNamedCharset; 43 import sun.nio.cs.ArrayDecoder; 44 import sun.nio.cs.ArrayEncoder; 45 import sun.nio.cs.StandardCharsets; 46 47 import static java.lang.String.LATIN1; 48 import static java.lang.String.UTF16; 49 import static java.lang.String.COMPACT_STRINGS; 50 import static java.lang.Character.isSurrogate; 51 import static java.lang.Character.highSurrogate; 52 import static java.lang.Character.lowSurrogate; 53 import static java.lang.Character.isSupplementaryCodePoint; 54 import static java.lang.StringUTF16.putChar; 55 56 /** 57 * Utility class for string encoding and decoding. 58 */ 59 60 class StringCoding { 61 62 private StringCoding() { } 63 64 /** The cached coders for each thread */ 65 private static final ThreadLocal<SoftReference<StringDecoder>> decoder = 66 new ThreadLocal<>(); 67 private static final ThreadLocal<SoftReference<StringEncoder>> encoder = 68 new ThreadLocal<>(); 69 70 private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; 71 private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; 72 private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; 73 74 private static <T> T deref(ThreadLocal<SoftReference<T>> tl) { 75 SoftReference<T> sr = tl.get(); 76 if (sr == null) 77 return null; 78 return sr.get(); 79 } 80 81 private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) { 82 tl.set(new SoftReference<>(ob)); 83 } 84 85 // Trim the given byte array to the given length 86 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { 87 if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) 88 return ba; 89 else 90 return Arrays.copyOf(ba, len); 91 } 92 93 private static int scale(int len, float expansionFactor) { 94 // We need to perform double, not float, arithmetic; otherwise 95 // we lose low order bits when len is larger than 2**24. 96 return (int)(len * (double)expansionFactor); 97 } 98 99 private static Charset lookupCharset(String csn) { 100 if (Charset.isSupported(csn)) { 101 try { 102 return Charset.forName(csn); 103 } catch (UnsupportedCharsetException x) { 104 throw new Error(x); 105 } 106 } 107 return null; 108 } 109 110 static class Result { 111 byte[] value; 112 byte coder; 113 114 Result with() { 115 coder = COMPACT_STRINGS ? LATIN1 : UTF16; 116 value = new byte[0]; 117 return this; 118 } 119 120 Result with(char[] val, int off, int len) { 121 if (String.COMPACT_STRINGS) { 122 byte[] bs = StringUTF16.compress(val, off, len); 123 if (bs != null) { 124 value = bs; 125 coder = LATIN1; 126 return this; 127 } 128 } 129 coder = UTF16; 130 value = StringUTF16.toBytes(val, off, len); 131 return this; 132 } 133 134 Result with(byte[] val, byte coder) { 135 this.coder = coder; 136 value = val; 137 return this; 138 } 139 } 140 141 @HotSpotIntrinsicCandidate 142 public static boolean hasNegatives(byte[] ba, int off, int len) { 143 for (int i = off; i < off + len; i++) { 144 if (ba[i] < 0) { 145 return true; 146 } 147 } 148 return false; 149 } 150 151 // -- Decoding -- 152 static class StringDecoder { 153 private final String requestedCharsetName; 154 private final Charset cs; 155 private final boolean isASCIICompatible; 156 private final CharsetDecoder cd; 157 protected final Result result; 158 159 StringDecoder(Charset cs, String rcn) { 160 this.requestedCharsetName = rcn; 161 this.cs = cs; 162 this.cd = cs.newDecoder() 163 .onMalformedInput(CodingErrorAction.REPLACE) 164 .onUnmappableCharacter(CodingErrorAction.REPLACE); 165 this.result = new Result(); 166 this.isASCIICompatible = (cd instanceof ArrayDecoder) && 167 ((ArrayDecoder)cd).isASCIICompatible(); 168 } 169 170 String charsetName() { 171 if (cs instanceof HistoricallyNamedCharset) 172 return ((HistoricallyNamedCharset)cs).historicalName(); 173 return cs.name(); 174 } 175 176 final String requestedCharsetName() { 177 return requestedCharsetName; 178 } 179 180 Result decode(byte[] ba, int off, int len) { 181 if (len == 0) { 182 return result.with(); 183 } 184 // fastpath for ascii compatible 185 if (isASCIICompatible && !hasNegatives(ba, off, len)) { 186 if (COMPACT_STRINGS) { 187 return result.with(Arrays.copyOfRange(ba, off, off + len), 188 LATIN1); 189 } else { 190 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 191 } 192 } 193 int en = scale(len, cd.maxCharsPerByte()); 194 char[] ca = new char[en]; 195 if (cd instanceof ArrayDecoder) { 196 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 197 return result.with(ca, 0, clen); 198 } 199 cd.reset(); 200 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 201 CharBuffer cb = CharBuffer.wrap(ca); 202 try { 203 CoderResult cr = cd.decode(bb, cb, true); 204 if (!cr.isUnderflow()) 205 cr.throwException(); 206 cr = cd.flush(cb); 207 if (!cr.isUnderflow()) 208 cr.throwException(); 209 } catch (CharacterCodingException x) { 210 // Substitution is always enabled, 211 // so this shouldn't happen 212 throw new Error(x); 213 } 214 return result.with(ca, 0, cb.position()); 215 } 216 } 217 218 static Result decode(String charsetName, byte[] ba, int off, int len) 219 throws UnsupportedEncodingException 220 { 221 StringDecoder sd = deref(decoder); 222 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 223 if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) 224 || csn.equals(sd.charsetName()))) { 225 sd = null; 226 try { 227 Charset cs = lookupCharset(csn); 228 if (cs != null) { 229 if (cs == UTF_8) { 230 return decodeUTF8(ba, off, len, true); 231 } 232 if (cs == ISO_8859_1) { 233 return decodeLatin1(ba, off, len); 234 } 235 if (cs == US_ASCII) { 236 return decodeASCII(ba, off, len); 237 } 238 sd = new StringDecoder(cs, csn); 239 } 240 } catch (IllegalCharsetNameException x) {} 241 if (sd == null) 242 throw new UnsupportedEncodingException(csn); 243 set(decoder, sd); 244 } 245 return sd.decode(ba, off, len); 246 } 247 248 static Result decode(Charset cs, byte[] ba, int off, int len) { 249 if (cs == UTF_8) { 250 return decodeUTF8(ba, off, len, true); 251 } 252 if (cs == ISO_8859_1) { 253 return decodeLatin1(ba, off, len); 254 } 255 if (cs == US_ASCII) { 256 return decodeASCII(ba, off, len); 257 } 258 259 // (1)We never cache the "external" cs, the only benefit of creating 260 // an additional StringDe/Encoder object to wrap it is to share the 261 // de/encode() method. These SD/E objects are short-lived, the young-gen 262 // gc should be able to take care of them well. But the best approach 263 // is still not to generate them if not really necessary. 264 // (2)The defensive copy of the input byte/char[] has a big performance 265 // impact, as well as the outgoing result byte/char[]. Need to do the 266 // optimization check of (sm==null && classLoader0==null) for both. 267 // (3)There might be a timing gap in isTrusted setting. getClassLoader0() 268 // is only checked (and then isTrusted gets set) when (SM==null). It is 269 // possible that the SM==null for now but then SM is NOT null later 270 // when safeTrim() is invoked...the "safe" way to do is to redundant 271 // check (... && (isTrusted || SM == null || getClassLoader0())) in trim 272 // but it then can be argued that the SM is null when the operation 273 // is started... 274 CharsetDecoder cd = cs.newDecoder(); 275 // ascii fastpath 276 if ((cd instanceof ArrayDecoder) && 277 ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) { 278 return decodeLatin1(ba, off, len); 279 } 280 int en = scale(len, cd.maxCharsPerByte()); 281 if (len == 0) { 282 return new Result().with(); 283 } 284 cd.onMalformedInput(CodingErrorAction.REPLACE) 285 .onUnmappableCharacter(CodingErrorAction.REPLACE) 286 .reset(); 287 char[] ca = new char[en]; 288 if (cd instanceof ArrayDecoder) { 289 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 290 return new Result().with(ca, 0, clen); 291 } 292 if (cs.getClass().getClassLoader0() != null && 293 System.getSecurityManager() != null) { 294 ba = Arrays.copyOfRange(ba, off, off + len); 295 off = 0; 296 } 297 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 298 CharBuffer cb = CharBuffer.wrap(ca); 299 try { 300 CoderResult cr = cd.decode(bb, cb, true); 301 if (!cr.isUnderflow()) 302 cr.throwException(); 303 cr = cd.flush(cb); 304 if (!cr.isUnderflow()) 305 cr.throwException(); 306 } catch (CharacterCodingException x) { 307 // Substitution is always enabled, 308 // so this shouldn't happen 309 throw new Error(x); 310 } 311 return new Result().with(ca, 0, cb.position()); 312 } 313 314 static Result decode(byte[] ba, int off, int len) { 315 Charset cs = Charset.defaultCharset(); 316 if (cs == UTF_8) { 317 return decodeUTF8(ba, off, len, true); 318 } 319 if (cs == ISO_8859_1) { 320 return decodeLatin1(ba, off, len); 321 } 322 if (cs == US_ASCII) { 323 return decodeASCII(ba, off, len); 324 } 325 StringDecoder sd = deref(decoder); 326 if (sd == null || !cs.name().equals(sd.cs.name())) { 327 sd = new StringDecoder(cs, cs.name()); 328 set(decoder, sd); 329 } 330 return sd.decode(ba, off, len); 331 } 332 333 // -- Encoding -- 334 private static class StringEncoder { 335 private Charset cs; 336 private CharsetEncoder ce; 337 private final boolean isASCIICompatible; 338 private final String requestedCharsetName; 339 private final boolean isTrusted; 340 341 private StringEncoder(Charset cs, String rcn) { 342 this.requestedCharsetName = rcn; 343 this.cs = cs; 344 this.ce = cs.newEncoder() 345 .onMalformedInput(CodingErrorAction.REPLACE) 346 .onUnmappableCharacter(CodingErrorAction.REPLACE); 347 this.isTrusted = (cs.getClass().getClassLoader0() == null); 348 this.isASCIICompatible = (ce instanceof ArrayEncoder) && 349 ((ArrayEncoder)ce).isASCIICompatible(); 350 } 351 352 String charsetName() { 353 if (cs instanceof HistoricallyNamedCharset) 354 return ((HistoricallyNamedCharset)cs).historicalName(); 355 return cs.name(); 356 } 357 358 final String requestedCharsetName() { 359 return requestedCharsetName; 360 } 361 362 byte[] encode(byte coder, byte[] val) { 363 // fastpath for ascii compatible 364 if (coder == LATIN1 && isASCIICompatible && 365 !hasNegatives(val, 0, val.length)) { 366 return Arrays.copyOf(val, val.length); 367 } 368 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 369 int en = scale(len, ce.maxBytesPerChar()); 370 byte[] ba = new byte[en]; 371 if (len == 0) { 372 return ba; 373 } 374 if (ce instanceof ArrayEncoder) { 375 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 376 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 377 if (blen != -1) { 378 return safeTrim(ba, blen, isTrusted); 379 } 380 } 381 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 382 : StringUTF16.toChars(val); 383 ce.reset(); 384 ByteBuffer bb = ByteBuffer.wrap(ba); 385 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 386 try { 387 CoderResult cr = ce.encode(cb, bb, true); 388 if (!cr.isUnderflow()) 389 cr.throwException(); 390 cr = ce.flush(bb); 391 if (!cr.isUnderflow()) 392 cr.throwException(); 393 } catch (CharacterCodingException x) { 394 // Substitution is always enabled, 395 // so this shouldn't happen 396 throw new Error(x); 397 } 398 return safeTrim(ba, bb.position(), isTrusted); 399 } 400 } 401 402 static byte[] encode(String charsetName, byte coder, byte[] val) 403 throws UnsupportedEncodingException 404 { 405 StringEncoder se = deref(encoder); 406 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 407 if ((se == null) || !(csn.equals(se.requestedCharsetName()) 408 || csn.equals(se.charsetName()))) { 409 se = null; 410 try { 411 Charset cs = lookupCharset(csn); 412 if (cs != null) { 413 if (cs == UTF_8) { 414 return encodeUTF8(coder, val, true); 415 } 416 if (cs == ISO_8859_1) { 417 return encode8859_1(coder, val); 418 } 419 if (cs == US_ASCII) { 420 return encodeASCII(coder, val); 421 } 422 se = new StringEncoder(cs, csn); 423 } 424 } catch (IllegalCharsetNameException x) {} 425 if (se == null) { 426 throw new UnsupportedEncodingException (csn); 427 } 428 set(encoder, se); 429 } 430 return se.encode(coder, val); 431 } 432 433 static byte[] encode(Charset cs, byte coder, byte[] val) { 434 if (cs == UTF_8) { 435 return encodeUTF8(coder, val, true); 436 } 437 if (cs == ISO_8859_1) { 438 return encode8859_1(coder, val); 439 } 440 if (cs == US_ASCII) { 441 return encodeASCII(coder, val); 442 } 443 CharsetEncoder ce = cs.newEncoder(); 444 // fastpath for ascii compatible 445 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && 446 ((ArrayEncoder)ce).isASCIICompatible() && 447 !hasNegatives(val, 0, val.length)))) { 448 return Arrays.copyOf(val, val.length); 449 } 450 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 451 int en = scale(len, ce.maxBytesPerChar()); 452 byte[] ba = new byte[en]; 453 if (len == 0) { 454 return ba; 455 } 456 ce.onMalformedInput(CodingErrorAction.REPLACE) 457 .onUnmappableCharacter(CodingErrorAction.REPLACE) 458 .reset(); 459 if (ce instanceof ArrayEncoder) { 460 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 461 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 462 if (blen != -1) { 463 return safeTrim(ba, blen, true); 464 } 465 } 466 boolean isTrusted = cs.getClass().getClassLoader0() == null || 467 System.getSecurityManager() == null; 468 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 469 : StringUTF16.toChars(val); 470 ByteBuffer bb = ByteBuffer.wrap(ba); 471 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 472 try { 473 CoderResult cr = ce.encode(cb, bb, true); 474 if (!cr.isUnderflow()) 475 cr.throwException(); 476 cr = ce.flush(bb); 477 if (!cr.isUnderflow()) 478 cr.throwException(); 479 } catch (CharacterCodingException x) { 480 throw new Error(x); 481 } 482 return safeTrim(ba, bb.position(), isTrusted); 483 } 484 485 static byte[] encode(byte coder, byte[] val) { 486 Charset cs = Charset.defaultCharset(); 487 if (cs == UTF_8) { 488 return encodeUTF8(coder, val, true); 489 } 490 if (cs == ISO_8859_1) { 491 return encode8859_1(coder, val); 492 } 493 if (cs == US_ASCII) { 494 return encodeASCII(coder, val); 495 } 496 StringEncoder se = deref(encoder); 497 if (se == null || !cs.name().equals(se.cs.name())) { 498 se = new StringEncoder(cs, cs.name()); 499 set(encoder, se); 500 } 501 return se.encode(coder, val); 502 } 503 504 /** 505 * Print a message directly to stderr, bypassing all character conversion 506 * methods. 507 * @param msg message to print 508 */ 509 private static native void err(String msg); 510 511 /* The cached Result for each thread */ 512 private static final ThreadLocal<StringCoding.Result> 513 resultCached = new ThreadLocal<>() { 514 protected StringCoding.Result initialValue() { 515 return new StringCoding.Result(); 516 }}; 517 518 ////////////////////////// ascii ////////////////////////////// 519 520 private static Result decodeASCII(byte[] ba, int off, int len) { 521 Result result = resultCached.get(); 522 if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) { 523 return result.with(Arrays.copyOfRange(ba, off, off + len), 524 LATIN1); 525 } 526 byte[] dst = new byte[len<<1]; 527 int dp = 0; 528 while (dp < len) { 529 int b = ba[off++]; 530 putChar(dst, dp++, (b >= 0) ? (char)b : repl); 531 } 532 return result.with(dst, UTF16); 533 } 534 535 private static byte[] encodeASCII(byte coder, byte[] val) { 536 if (coder == LATIN1) { 537 byte[] dst = new byte[val.length]; 538 for (int i = 0; i < val.length; i++) { 539 if (val[i] < 0) { 540 dst[i] = '?'; 541 } else { 542 dst[i] = val[i]; 543 } 544 } 545 return dst; 546 } 547 int len = val.length >> 1; 548 byte[] dst = new byte[len]; 549 int dp = 0; 550 for (int i = 0; i < len; i++) { 551 char c = StringUTF16.getChar(val, i); 552 if (c < 0x80) { 553 dst[dp++] = (byte)c; 554 continue; 555 } 556 if (Character.isHighSurrogate(c) && i + 1 < len && 557 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { 558 i++; 559 } 560 dst[dp++] = '?'; 561 } 562 if (len == dp) { 563 return dst; 564 } 565 return Arrays.copyOf(dst, dp); 566 } 567 568 ////////////////////////// latin1/8859_1 /////////////////////////// 569 570 private static Result decodeLatin1(byte[] ba, int off, int len) { 571 Result result = resultCached.get(); 572 if (COMPACT_STRINGS) { 573 return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); 574 } else { 575 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 576 } 577 } 578 579 @HotSpotIntrinsicCandidate 580 private static int implEncodeISOArray(byte[] sa, int sp, 581 byte[] da, int dp, int len) { 582 int i = 0; 583 for (; i < len; i++) { 584 char c = StringUTF16.getChar(sa, sp++); 585 if (c > '\u00FF') 586 break; 587 da[dp++] = (byte)c; 588 } 589 return i; 590 } 591 592 private static byte[] encode8859_1(byte coder, byte[] val) { 593 if (coder == LATIN1) { 594 return Arrays.copyOf(val, val.length); 595 } 596 int len = val.length >> 1; 597 byte[] dst = new byte[len]; 598 int dp = 0; 599 int sp = 0; 600 int sl = len; 601 while (sp < sl) { 602 int ret = implEncodeISOArray(val, sp, dst, dp, len); 603 sp = sp + ret; 604 dp = dp + ret; 605 if (ret != len) { 606 char c = StringUTF16.getChar(val, sp++); 607 if (Character.isHighSurrogate(c) && sp < sl && 608 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { 609 sp++; 610 } 611 dst[dp++] = '?'; 612 len = sl - sp; 613 } 614 } 615 if (dp == dst.length) { 616 return dst; 617 } 618 return Arrays.copyOf(dst, dp); 619 } 620 621 //////////////////////////////// utf8 //////////////////////////////////// 622 623 private static boolean isNotContinuation(int b) { 624 return (b & 0xc0) != 0x80; 625 } 626 627 private static boolean isMalformed3(int b1, int b2, int b3) { 628 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 629 (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; 630 } 631 632 private static boolean isMalformed3_2(int b1, int b2) { 633 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 634 (b2 & 0xc0) != 0x80; 635 } 636 637 private static boolean isMalformed4(int b2, int b3, int b4) { 638 return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || 639 (b4 & 0xc0) != 0x80; 640 } 641 642 private static boolean isMalformed4_2(int b1, int b2) { 643 return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 644 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 645 (b2 & 0xc0) != 0x80; 646 } 647 648 private static boolean isMalformed4_3(int b3) { 649 return (b3 & 0xc0) != 0x80; 650 } 651 652 // for nb == 3/4 653 private static int malformedN(byte[] src, int sp, int nb) { 654 if (nb == 3) { 655 int b1 = src[sp++]; 656 int b2 = src[sp++]; // no need to lookup b3 657 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 658 isNotContinuation(b2)) ? 1 : 2; 659 } else if (nb == 4) { // we don't care the speed here 660 int b1 = src[sp++] & 0xff; 661 int b2 = src[sp++] & 0xff; 662 if (b1 > 0xf4 || 663 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 664 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 665 isNotContinuation(b2)) 666 return 1; 667 if (isNotContinuation(src[sp++])) 668 return 2; 669 return 3; 670 } 671 assert false; 672 return -1; 673 } 674 675 private static void throwMalformed(int off, int nb) { 676 throw new IllegalArgumentException("malformed input off : " + off + 677 ", length : " + nb); 678 } 679 680 private static char repl = '\ufffd'; 681 682 private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) { 683 // ascii-bais, which has a relative impact to the non-ascii-only bytes 684 if (COMPACT_STRINGS && !hasNegatives(src, sp, len)) 685 return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len), 686 LATIN1); 687 return decodeUTF8_0(src, sp, len, doReplace); 688 } 689 690 private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { 691 Result ret = resultCached.get(); 692 693 int sl = sp + len; 694 int dp = 0; 695 byte[] dst = new byte[len]; 696 697 if (COMPACT_STRINGS) { 698 while (sp < sl) { 699 int b1 = src[sp]; 700 if (b1 >= 0) { 701 dst[dp++] = (byte)b1; 702 sp++; 703 continue; 704 } 705 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && 706 sp + 1 < sl) { 707 int b2 = src[sp + 1]; 708 if (!isNotContinuation(b2)) { 709 dst[dp++] = (byte)(((b1 << 6) ^ b2)^ 710 (((byte) 0xC0 << 6) ^ 711 ((byte) 0x80 << 0))); 712 sp += 2; 713 continue; 714 } 715 } 716 // anything not a latin1, including the repl 717 // we have to go with the utf16 718 break; 719 } 720 if (sp == sl) { 721 if (dp != dst.length) { 722 dst = Arrays.copyOf(dst, dp); 723 } 724 return ret.with(dst, LATIN1); 725 } 726 } 727 if (dp == 0) { 728 dst = new byte[len << 1]; 729 } else { 730 byte[] buf = new byte[len << 1]; 731 StringLatin1.inflate(dst, 0, buf, 0, dp); 732 dst = buf; 733 } 734 while (sp < sl) { 735 int b1 = src[sp++]; 736 if (b1 >= 0) { 737 putChar(dst, dp++, (char) b1); 738 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 739 if (sp < sl) { 740 int b2 = src[sp++]; 741 if (isNotContinuation(b2)) { 742 if (!doReplace) { 743 throwMalformed(sp - 1, 1); 744 } 745 putChar(dst, dp++, repl); 746 sp--; 747 } else { 748 putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ 749 (((byte) 0xC0 << 6) ^ 750 ((byte) 0x80 << 0)))); 751 } 752 continue; 753 } 754 if (!doReplace) { 755 throwMalformed(sp, 1); // underflow() 756 } 757 putChar(dst, dp++, repl); 758 break; 759 } else if ((b1 >> 4) == -2) { 760 if (sp + 1 < sl) { 761 int b2 = src[sp++]; 762 int b3 = src[sp++]; 763 if (isMalformed3(b1, b2, b3)) { 764 if (!doReplace) { 765 throwMalformed(sp - 3, 3); 766 } 767 putChar(dst, dp++, repl); 768 sp -= 3; 769 sp += malformedN(src, sp, 3); 770 } else { 771 char c = (char)((b1 << 12) ^ 772 (b2 << 6) ^ 773 (b3 ^ 774 (((byte) 0xE0 << 12) ^ 775 ((byte) 0x80 << 6) ^ 776 ((byte) 0x80 << 0)))); 777 if (isSurrogate(c)) { 778 if (!doReplace) { 779 throwMalformed(sp - 3, 3); 780 } 781 putChar(dst, dp++, repl); 782 } else { 783 putChar(dst, dp++, c); 784 } 785 } 786 continue; 787 } 788 if (sp < sl && isMalformed3_2(b1, src[sp])) { 789 if (!doReplace) { 790 throwMalformed(sp - 1, 2); 791 } 792 putChar(dst, dp++, repl); 793 continue; 794 } 795 if (!doReplace){ 796 throwMalformed(sp, 1); 797 } 798 putChar(dst, dp++, repl); 799 break; 800 } else if ((b1 >> 3) == -2) { 801 if (sp + 2 < sl) { 802 int b2 = src[sp++]; 803 int b3 = src[sp++]; 804 int b4 = src[sp++]; 805 int uc = ((b1 << 18) ^ 806 (b2 << 12) ^ 807 (b3 << 6) ^ 808 (b4 ^ 809 (((byte) 0xF0 << 18) ^ 810 ((byte) 0x80 << 12) ^ 811 ((byte) 0x80 << 6) ^ 812 ((byte) 0x80 << 0)))); 813 if (isMalformed4(b2, b3, b4) || 814 !isSupplementaryCodePoint(uc)) { // shortest form check 815 if (!doReplace) { 816 throwMalformed(sp - 4, 4); 817 } 818 putChar(dst, dp++, repl); 819 sp -= 4; 820 sp += malformedN(src, sp, 4); 821 } else { 822 putChar(dst, dp++, highSurrogate(uc)); 823 putChar(dst, dp++, lowSurrogate(uc)); 824 } 825 continue; 826 } 827 b1 &= 0xff; 828 if (b1 > 0xf4 || 829 sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { 830 if (!doReplace) { 831 throwMalformed(sp - 1, 1); // or 2 832 } 833 putChar(dst, dp++, repl); 834 continue; 835 } 836 if (!doReplace) { 837 throwMalformed(sp - 1, 1); 838 } 839 sp++; 840 putChar(dst, dp++, repl); 841 if (sp < sl && isMalformed4_3(src[sp])) { 842 continue; 843 } 844 break; 845 } else { 846 if (!doReplace) { 847 throwMalformed(sp - 1, 1); 848 } 849 putChar(dst, dp++, repl); 850 } 851 } 852 if (dp != len) { 853 dst = Arrays.copyOf(dst, dp << 1); 854 } 855 return ret.with(dst, UTF16); 856 } 857 858 private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { 859 if (coder == UTF16) 860 return encodeUTF8_UTF16(val, doReplace); 861 862 if (!hasNegatives(val, 0, val.length)) 863 return Arrays.copyOf(val, val.length); 864 865 int dp = 0; 866 byte[] dst = new byte[val.length << 1]; 867 for (int sp = 0; sp < val.length; sp++) { 868 byte c = val[sp]; 869 if (c < 0) { 870 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); 871 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 872 } else { 873 dst[dp++] = c; 874 } 875 } 876 if (dp == dst.length) 877 return dst; 878 return Arrays.copyOf(dst, dp); 879 } 880 881 private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { 882 int dp = 0; 883 int sp = 0; 884 int sl = val.length >> 1; 885 byte[] dst = new byte[sl * 3]; 886 char c; 887 while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { 888 // ascii fast loop; 889 dst[dp++] = (byte)c; 890 sp++; 891 } 892 while (sp < sl) { 893 c = StringUTF16.getChar(val, sp++); 894 if (c < 0x80) { 895 dst[dp++] = (byte)c; 896 } else if (c < 0x800) { 897 dst[dp++] = (byte)(0xc0 | (c >> 6)); 898 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 899 } else if (Character.isSurrogate(c)) { 900 int uc = -1; 901 char c2; 902 if (Character.isHighSurrogate(c) && sp < sl && 903 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { 904 uc = Character.toCodePoint(c, c2); 905 } 906 if (uc < 0) { 907 if (doReplace) { 908 dst[dp++] = '?'; 909 } else { 910 throwMalformed(sp - 1, 1); // or 2, does not matter here 911 } 912 } else { 913 dst[dp++] = (byte)(0xf0 | ((uc >> 18))); 914 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); 915 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); 916 dst[dp++] = (byte)(0x80 | (uc & 0x3f)); 917 sp++; // 2 chars 918 } 919 } else { 920 // 3 bytes, 16 bits 921 dst[dp++] = (byte)(0xe0 | ((c >> 12))); 922 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); 923 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 924 } 925 } 926 if (dp == dst.length) { 927 return dst; 928 } 929 return Arrays.copyOf(dst, dp); 930 } 931 932 ////////////////////// for j.u.z.ZipCoder ////////////////////////// 933 934 /* 935 * Throws iae, instead of replacing, if malformed or unmappble. 936 */ 937 static String newStringUTF8NoRepl(byte[] src, int off, int len) { 938 if (COMPACT_STRINGS && !hasNegatives(src, off, len)) 939 return new String(Arrays.copyOfRange(src, off, off + len), LATIN1); 940 Result ret = decodeUTF8_0(src, off, len, false); 941 return new String(ret.value, ret.coder); 942 } 943 944 /* 945 * Throws iae, instead of replacing, if unmappble. 946 */ 947 static byte[] getBytesUTF8NoRepl(String s) { 948 return encodeUTF8(s.coder(), s.value(), false); 949 } 950 }