1 /* 2 * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.lang; 27 28 import java.io.UnsupportedEncodingException; 29 import java.lang.ref.SoftReference; 30 import java.nio.ByteBuffer; 31 import java.nio.CharBuffer; 32 import java.nio.charset.Charset; 33 import java.nio.charset.CharsetDecoder; 34 import java.nio.charset.CharsetEncoder; 35 import java.nio.charset.CharacterCodingException; 36 import java.nio.charset.CoderResult; 37 import java.nio.charset.CodingErrorAction; 38 import java.nio.charset.IllegalCharsetNameException; 39 import java.nio.charset.MalformedInputException; 40 import java.nio.charset.UnmappableCharacterException; 41 import java.nio.charset.UnsupportedCharsetException; 42 import java.util.Arrays; 43 import jdk.internal.HotSpotIntrinsicCandidate; 44 import sun.nio.cs.HistoricallyNamedCharset; 45 import sun.nio.cs.ArrayDecoder; 46 import sun.nio.cs.ArrayEncoder; 47 48 import static java.lang.String.LATIN1; 49 import static java.lang.String.UTF16; 50 import static java.lang.String.COMPACT_STRINGS; 51 import static java.lang.Character.isSurrogate; 52 import static java.lang.Character.highSurrogate; 53 import static java.lang.Character.lowSurrogate; 54 import static java.lang.Character.isSupplementaryCodePoint; 55 import static java.lang.StringUTF16.putChar; 56 57 /** 58 * Utility class for string encoding and decoding. 59 */ 60 61 class StringCoding { 62 63 private StringCoding() { } 64 65 /** The cached coders for each thread */ 66 private static final ThreadLocal<SoftReference<StringDecoder>> decoder = 67 new ThreadLocal<>(); 68 private static final ThreadLocal<SoftReference<StringEncoder>> encoder = 69 new ThreadLocal<>(); 70 71 private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; 72 private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; 73 private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; 74 75 private static <T> T deref(ThreadLocal<SoftReference<T>> tl) { 76 SoftReference<T> sr = tl.get(); 77 if (sr == null) 78 return null; 79 return sr.get(); 80 } 81 82 private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) { 83 tl.set(new SoftReference<>(ob)); 84 } 85 86 // Trim the given byte array to the given length 87 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { 88 if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) 89 return ba; 90 else 91 return Arrays.copyOf(ba, len); 92 } 93 94 private static int scale(int len, float expansionFactor) { 95 // We need to perform double, not float, arithmetic; otherwise 96 // we lose low order bits when len is larger than 2**24. 97 return (int)(len * (double)expansionFactor); 98 } 99 100 private static Charset lookupCharset(String csn) { 101 if (Charset.isSupported(csn)) { 102 try { 103 return Charset.forName(csn); 104 } catch (UnsupportedCharsetException x) { 105 throw new Error(x); 106 } 107 } 108 return null; 109 } 110 111 static class Result { 112 byte[] value; 113 byte coder; 114 115 Result with() { 116 coder = COMPACT_STRINGS ? LATIN1 : UTF16; 117 value = new byte[0]; 118 return this; 119 } 120 121 Result with(char[] val, int off, int len) { 122 if (String.COMPACT_STRINGS) { 123 byte[] bs = StringUTF16.compress(val, off, len); 124 if (bs != null) { 125 value = bs; 126 coder = LATIN1; 127 return this; 128 } 129 } 130 coder = UTF16; 131 value = StringUTF16.toBytes(val, off, len); 132 return this; 133 } 134 135 Result with(byte[] val, byte coder) { 136 this.coder = coder; 137 value = val; 138 return this; 139 } 140 } 141 142 @HotSpotIntrinsicCandidate 143 public static boolean hasNegatives(byte[] ba, int off, int len) { 144 for (int i = off; i < off + len; i++) { 145 if (ba[i] < 0) { 146 return true; 147 } 148 } 149 return false; 150 } 151 152 // -- Decoding -- 153 static class StringDecoder { 154 private final String requestedCharsetName; 155 private final Charset cs; 156 private final boolean isASCIICompatible; 157 private final CharsetDecoder cd; 158 protected final Result result; 159 160 StringDecoder(Charset cs, String rcn) { 161 this.requestedCharsetName = rcn; 162 this.cs = cs; 163 this.cd = cs.newDecoder() 164 .onMalformedInput(CodingErrorAction.REPLACE) 165 .onUnmappableCharacter(CodingErrorAction.REPLACE); 166 this.result = new Result(); 167 this.isASCIICompatible = (cd instanceof ArrayDecoder) && 168 ((ArrayDecoder)cd).isASCIICompatible(); 169 } 170 171 String charsetName() { 172 if (cs instanceof HistoricallyNamedCharset) 173 return ((HistoricallyNamedCharset)cs).historicalName(); 174 return cs.name(); 175 } 176 177 final String requestedCharsetName() { 178 return requestedCharsetName; 179 } 180 181 Result decode(byte[] ba, int off, int len) { 182 if (len == 0) { 183 return result.with(); 184 } 185 // fastpath for ascii compatible 186 if (isASCIICompatible && !hasNegatives(ba, off, len)) { 187 if (COMPACT_STRINGS) { 188 return result.with(Arrays.copyOfRange(ba, off, off + len), 189 LATIN1); 190 } else { 191 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 192 } 193 } 194 int en = scale(len, cd.maxCharsPerByte()); 195 char[] ca = new char[en]; 196 if (cd instanceof ArrayDecoder) { 197 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 198 return result.with(ca, 0, clen); 199 } 200 cd.reset(); 201 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 202 CharBuffer cb = CharBuffer.wrap(ca); 203 try { 204 CoderResult cr = cd.decode(bb, cb, true); 205 if (!cr.isUnderflow()) 206 cr.throwException(); 207 cr = cd.flush(cb); 208 if (!cr.isUnderflow()) 209 cr.throwException(); 210 } catch (CharacterCodingException x) { 211 // Substitution is always enabled, 212 // so this shouldn't happen 213 throw new Error(x); 214 } 215 return result.with(ca, 0, cb.position()); 216 } 217 } 218 219 static Result decode(String charsetName, byte[] ba, int off, int len) 220 throws UnsupportedEncodingException 221 { 222 StringDecoder sd = deref(decoder); 223 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 224 if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) 225 || csn.equals(sd.charsetName()))) { 226 sd = null; 227 try { 228 Charset cs = lookupCharset(csn); 229 if (cs != null) { 230 if (cs == UTF_8) { 231 return decodeUTF8(ba, off, len, true); 232 } 233 if (cs == ISO_8859_1) { 234 return decodeLatin1(ba, off, len); 235 } 236 if (cs == US_ASCII) { 237 return decodeASCII(ba, off, len); 238 } 239 sd = new StringDecoder(cs, csn); 240 } 241 } catch (IllegalCharsetNameException x) {} 242 if (sd == null) 243 throw new UnsupportedEncodingException(csn); 244 set(decoder, sd); 245 } 246 return sd.decode(ba, off, len); 247 } 248 249 static Result decode(Charset cs, byte[] ba, int off, int len) { 250 if (cs == UTF_8) { 251 return decodeUTF8(ba, off, len, true); 252 } 253 if (cs == ISO_8859_1) { 254 return decodeLatin1(ba, off, len); 255 } 256 if (cs == US_ASCII) { 257 return decodeASCII(ba, off, len); 258 } 259 260 // (1)We never cache the "external" cs, the only benefit of creating 261 // an additional StringDe/Encoder object to wrap it is to share the 262 // de/encode() method. These SD/E objects are short-lived, the young-gen 263 // gc should be able to take care of them well. But the best approach 264 // is still not to generate them if not really necessary. 265 // (2)The defensive copy of the input byte/char[] has a big performance 266 // impact, as well as the outgoing result byte/char[]. Need to do the 267 // optimization check of (sm==null && classLoader0==null) for both. 268 // (3)There might be a timing gap in isTrusted setting. getClassLoader0() 269 // is only checked (and then isTrusted gets set) when (SM==null). It is 270 // possible that the SM==null for now but then SM is NOT null later 271 // when safeTrim() is invoked...the "safe" way to do is to redundant 272 // check (... && (isTrusted || SM == null || getClassLoader0())) in trim 273 // but it then can be argued that the SM is null when the operation 274 // is started... 275 CharsetDecoder cd = cs.newDecoder(); 276 // ascii fastpath 277 if ((cd instanceof ArrayDecoder) && 278 ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) { 279 return decodeLatin1(ba, off, len); 280 } 281 int en = scale(len, cd.maxCharsPerByte()); 282 if (len == 0) { 283 return new Result().with(); 284 } 285 cd.onMalformedInput(CodingErrorAction.REPLACE) 286 .onUnmappableCharacter(CodingErrorAction.REPLACE) 287 .reset(); 288 char[] ca = new char[en]; 289 if (cd instanceof ArrayDecoder) { 290 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 291 return new Result().with(ca, 0, clen); 292 } 293 if (cs.getClass().getClassLoader0() != null && 294 System.getSecurityManager() != null) { 295 ba = Arrays.copyOfRange(ba, off, off + len); 296 off = 0; 297 } 298 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 299 CharBuffer cb = CharBuffer.wrap(ca); 300 try { 301 CoderResult cr = cd.decode(bb, cb, true); 302 if (!cr.isUnderflow()) 303 cr.throwException(); 304 cr = cd.flush(cb); 305 if (!cr.isUnderflow()) 306 cr.throwException(); 307 } catch (CharacterCodingException x) { 308 // Substitution is always enabled, 309 // so this shouldn't happen 310 throw new Error(x); 311 } 312 return new Result().with(ca, 0, cb.position()); 313 } 314 315 static Result decode(byte[] ba, int off, int len) { 316 Charset cs = Charset.defaultCharset(); 317 if (cs == UTF_8) { 318 return decodeUTF8(ba, off, len, true); 319 } 320 if (cs == ISO_8859_1) { 321 return decodeLatin1(ba, off, len); 322 } 323 if (cs == US_ASCII) { 324 return decodeASCII(ba, off, len); 325 } 326 StringDecoder sd = deref(decoder); 327 if (sd == null || !cs.name().equals(sd.cs.name())) { 328 sd = new StringDecoder(cs, cs.name()); 329 set(decoder, sd); 330 } 331 return sd.decode(ba, off, len); 332 } 333 334 // -- Encoding -- 335 private static class StringEncoder { 336 private Charset cs; 337 private CharsetEncoder ce; 338 private final boolean isASCIICompatible; 339 private final String requestedCharsetName; 340 private final boolean isTrusted; 341 342 private StringEncoder(Charset cs, String rcn) { 343 this.requestedCharsetName = rcn; 344 this.cs = cs; 345 this.ce = cs.newEncoder() 346 .onMalformedInput(CodingErrorAction.REPLACE) 347 .onUnmappableCharacter(CodingErrorAction.REPLACE); 348 this.isTrusted = (cs.getClass().getClassLoader0() == null); 349 this.isASCIICompatible = (ce instanceof ArrayEncoder) && 350 ((ArrayEncoder)ce).isASCIICompatible(); 351 } 352 353 String charsetName() { 354 if (cs instanceof HistoricallyNamedCharset) 355 return ((HistoricallyNamedCharset)cs).historicalName(); 356 return cs.name(); 357 } 358 359 final String requestedCharsetName() { 360 return requestedCharsetName; 361 } 362 363 byte[] encode(byte coder, byte[] val) { 364 // fastpath for ascii compatible 365 if (coder == LATIN1 && isASCIICompatible && 366 !hasNegatives(val, 0, val.length)) { 367 return Arrays.copyOf(val, val.length); 368 } 369 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 370 int en = scale(len, ce.maxBytesPerChar()); 371 byte[] ba = new byte[en]; 372 if (len == 0) { 373 return ba; 374 } 375 if (ce instanceof ArrayEncoder) { 376 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 377 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 378 if (blen != -1) { 379 return safeTrim(ba, blen, isTrusted); 380 } 381 } 382 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 383 : StringUTF16.toChars(val); 384 ce.reset(); 385 ByteBuffer bb = ByteBuffer.wrap(ba); 386 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 387 try { 388 CoderResult cr = ce.encode(cb, bb, true); 389 if (!cr.isUnderflow()) 390 cr.throwException(); 391 cr = ce.flush(bb); 392 if (!cr.isUnderflow()) 393 cr.throwException(); 394 } catch (CharacterCodingException x) { 395 // Substitution is always enabled, 396 // so this shouldn't happen 397 throw new Error(x); 398 } 399 return safeTrim(ba, bb.position(), isTrusted); 400 } 401 } 402 403 static byte[] encode(String charsetName, byte coder, byte[] val) 404 throws UnsupportedEncodingException 405 { 406 StringEncoder se = deref(encoder); 407 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 408 if ((se == null) || !(csn.equals(se.requestedCharsetName()) 409 || csn.equals(se.charsetName()))) { 410 se = null; 411 try { 412 Charset cs = lookupCharset(csn); 413 if (cs != null) { 414 if (cs == UTF_8) { 415 return encodeUTF8(coder, val, true); 416 } 417 if (cs == ISO_8859_1) { 418 return encode8859_1(coder, val); 419 } 420 if (cs == US_ASCII) { 421 return encodeASCII(coder, val); 422 } 423 se = new StringEncoder(cs, csn); 424 } 425 } catch (IllegalCharsetNameException x) {} 426 if (se == null) { 427 throw new UnsupportedEncodingException (csn); 428 } 429 set(encoder, se); 430 } 431 return se.encode(coder, val); 432 } 433 434 static byte[] encode(Charset cs, byte coder, byte[] val) { 435 if (cs == UTF_8) { 436 return encodeUTF8(coder, val, true); 437 } 438 if (cs == ISO_8859_1) { 439 return encode8859_1(coder, val); 440 } 441 if (cs == US_ASCII) { 442 return encodeASCII(coder, val); 443 } 444 CharsetEncoder ce = cs.newEncoder(); 445 // fastpath for ascii compatible 446 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && 447 ((ArrayEncoder)ce).isASCIICompatible() && 448 !hasNegatives(val, 0, val.length)))) { 449 return Arrays.copyOf(val, val.length); 450 } 451 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 452 int en = scale(len, ce.maxBytesPerChar()); 453 byte[] ba = new byte[en]; 454 if (len == 0) { 455 return ba; 456 } 457 ce.onMalformedInput(CodingErrorAction.REPLACE) 458 .onUnmappableCharacter(CodingErrorAction.REPLACE) 459 .reset(); 460 if (ce instanceof ArrayEncoder) { 461 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 462 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 463 if (blen != -1) { 464 return safeTrim(ba, blen, true); 465 } 466 } 467 boolean isTrusted = cs.getClass().getClassLoader0() == null || 468 System.getSecurityManager() == null; 469 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 470 : StringUTF16.toChars(val); 471 ByteBuffer bb = ByteBuffer.wrap(ba); 472 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 473 try { 474 CoderResult cr = ce.encode(cb, bb, true); 475 if (!cr.isUnderflow()) 476 cr.throwException(); 477 cr = ce.flush(bb); 478 if (!cr.isUnderflow()) 479 cr.throwException(); 480 } catch (CharacterCodingException x) { 481 throw new Error(x); 482 } 483 return safeTrim(ba, bb.position(), isTrusted); 484 } 485 486 static byte[] encode(byte coder, byte[] val) { 487 Charset cs = Charset.defaultCharset(); 488 if (cs == UTF_8) { 489 return encodeUTF8(coder, val, true); 490 } 491 if (cs == ISO_8859_1) { 492 return encode8859_1(coder, val); 493 } 494 if (cs == US_ASCII) { 495 return encodeASCII(coder, val); 496 } 497 StringEncoder se = deref(encoder); 498 if (se == null || !cs.name().equals(se.cs.name())) { 499 se = new StringEncoder(cs, cs.name()); 500 set(encoder, se); 501 } 502 return se.encode(coder, val); 503 } 504 505 /** 506 * Print a message directly to stderr, bypassing all character conversion 507 * methods. 508 * @param msg message to print 509 */ 510 private static native void err(String msg); 511 512 /* The cached Result for each thread */ 513 private static final ThreadLocal<StringCoding.Result> 514 resultCached = new ThreadLocal<>() { 515 protected StringCoding.Result initialValue() { 516 return new StringCoding.Result(); 517 }}; 518 519 ////////////////////////// ascii ////////////////////////////// 520 521 private static Result decodeASCII(byte[] ba, int off, int len) { 522 Result result = resultCached.get(); 523 if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) { 524 return result.with(Arrays.copyOfRange(ba, off, off + len), 525 LATIN1); 526 } 527 byte[] dst = new byte[len<<1]; 528 int dp = 0; 529 while (dp < len) { 530 int b = ba[off++]; 531 putChar(dst, dp++, (b >= 0) ? (char)b : repl); 532 } 533 return result.with(dst, UTF16); 534 } 535 536 private static byte[] encodeASCII(byte coder, byte[] val) { 537 if (coder == LATIN1) { 538 byte[] dst = new byte[val.length]; 539 for (int i = 0; i < val.length; i++) { 540 if (val[i] < 0) { 541 dst[i] = '?'; 542 } else { 543 dst[i] = val[i]; 544 } 545 } 546 return dst; 547 } 548 int len = val.length >> 1; 549 byte[] dst = new byte[len]; 550 int dp = 0; 551 for (int i = 0; i < len; i++) { 552 char c = StringUTF16.getChar(val, i); 553 if (c < 0x80) { 554 dst[dp++] = (byte)c; 555 continue; 556 } 557 if (Character.isHighSurrogate(c) && i + 1 < len && 558 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { 559 i++; 560 } 561 dst[dp++] = '?'; 562 } 563 if (len == dp) { 564 return dst; 565 } 566 return Arrays.copyOf(dst, dp); 567 } 568 569 ////////////////////////// latin1/8859_1 /////////////////////////// 570 571 private static Result decodeLatin1(byte[] ba, int off, int len) { 572 Result result = resultCached.get(); 573 if (COMPACT_STRINGS) { 574 return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); 575 } else { 576 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 577 } 578 } 579 580 @HotSpotIntrinsicCandidate 581 private static int implEncodeISOArray(byte[] sa, int sp, 582 byte[] da, int dp, int len) { 583 int i = 0; 584 for (; i < len; i++) { 585 char c = StringUTF16.getChar(sa, sp++); 586 if (c > '\u00FF') 587 break; 588 da[dp++] = (byte)c; 589 } 590 return i; 591 } 592 593 private static byte[] encode8859_1(byte coder, byte[] val) { 594 return encode8859_1(coder, val, true); 595 } 596 597 private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { 598 if (coder == LATIN1) { 599 return Arrays.copyOf(val, val.length); 600 } 601 int len = val.length >> 1; 602 byte[] dst = new byte[len]; 603 int dp = 0; 604 int sp = 0; 605 int sl = len; 606 while (sp < sl) { 607 int ret = implEncodeISOArray(val, sp, dst, dp, len); 608 sp = sp + ret; 609 dp = dp + ret; 610 if (ret != len) { 611 if (!doReplace) { 612 throwUnmappable(sp, 1); 613 } 614 char c = StringUTF16.getChar(val, sp++); 615 if (Character.isHighSurrogate(c) && sp < sl && 616 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { 617 sp++; 618 } 619 dst[dp++] = '?'; 620 len = sl - sp; 621 } 622 } 623 if (dp == dst.length) { 624 return dst; 625 } 626 return Arrays.copyOf(dst, dp); 627 } 628 629 //////////////////////////////// utf8 //////////////////////////////////// 630 631 private static boolean isNotContinuation(int b) { 632 return (b & 0xc0) != 0x80; 633 } 634 635 private static boolean isMalformed3(int b1, int b2, int b3) { 636 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 637 (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; 638 } 639 640 private static boolean isMalformed3_2(int b1, int b2) { 641 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 642 (b2 & 0xc0) != 0x80; 643 } 644 645 private static boolean isMalformed4(int b2, int b3, int b4) { 646 return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || 647 (b4 & 0xc0) != 0x80; 648 } 649 650 private static boolean isMalformed4_2(int b1, int b2) { 651 return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 652 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 653 (b2 & 0xc0) != 0x80; 654 } 655 656 private static boolean isMalformed4_3(int b3) { 657 return (b3 & 0xc0) != 0x80; 658 } 659 660 // for nb == 3/4 661 private static int malformedN(byte[] src, int sp, int nb) { 662 if (nb == 3) { 663 int b1 = src[sp++]; 664 int b2 = src[sp++]; // no need to lookup b3 665 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 666 isNotContinuation(b2)) ? 1 : 2; 667 } else if (nb == 4) { // we don't care the speed here 668 int b1 = src[sp++] & 0xff; 669 int b2 = src[sp++] & 0xff; 670 if (b1 > 0xf4 || 671 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 672 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 673 isNotContinuation(b2)) 674 return 1; 675 if (isNotContinuation(src[sp++])) 676 return 2; 677 return 3; 678 } 679 assert false; 680 return -1; 681 } 682 683 private static void throwMalformed(int off, int nb) { 684 String msg = "malformed input off : " + off + ", length : " + nb; 685 throw new IllegalArgumentException(msg, new MalformedInputException(nb)); 686 } 687 688 private static void throwMalformed(byte[] val) { 689 int dp = 0; 690 while (dp < val.length && val[dp] >=0) { dp++; } 691 throwMalformed(dp, 1); 692 } 693 694 private static void throwUnmappable(int off, int nb) { 695 String msg = "malformed input off : " + off + ", length : " + nb; 696 throw new IllegalArgumentException(msg, new UnmappableCharacterException(nb)); 697 } 698 699 private static void throwUnmappable(byte[] val) { 700 int dp = 0; 701 while (dp < val.length && val[dp] >=0) { dp++; } 702 throwUnmappable(dp, 1); 703 } 704 705 private static char repl = '\ufffd'; 706 707 private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) { 708 // ascii-bais, which has a relative impact to the non-ascii-only bytes 709 if (COMPACT_STRINGS && !hasNegatives(src, sp, len)) 710 return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len), 711 LATIN1); 712 return decodeUTF8_0(src, sp, len, doReplace); 713 } 714 715 private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { 716 Result ret = resultCached.get(); 717 718 int sl = sp + len; 719 int dp = 0; 720 byte[] dst = new byte[len]; 721 722 if (COMPACT_STRINGS) { 723 while (sp < sl) { 724 int b1 = src[sp]; 725 if (b1 >= 0) { 726 dst[dp++] = (byte)b1; 727 sp++; 728 continue; 729 } 730 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && 731 sp + 1 < sl) { 732 int b2 = src[sp + 1]; 733 if (!isNotContinuation(b2)) { 734 dst[dp++] = (byte)(((b1 << 6) ^ b2)^ 735 (((byte) 0xC0 << 6) ^ 736 ((byte) 0x80 << 0))); 737 sp += 2; 738 continue; 739 } 740 } 741 // anything not a latin1, including the repl 742 // we have to go with the utf16 743 break; 744 } 745 if (sp == sl) { 746 if (dp != dst.length) { 747 dst = Arrays.copyOf(dst, dp); 748 } 749 return ret.with(dst, LATIN1); 750 } 751 } 752 if (dp == 0) { 753 dst = new byte[len << 1]; 754 } else { 755 byte[] buf = new byte[len << 1]; 756 StringLatin1.inflate(dst, 0, buf, 0, dp); 757 dst = buf; 758 } 759 while (sp < sl) { 760 int b1 = src[sp++]; 761 if (b1 >= 0) { 762 putChar(dst, dp++, (char) b1); 763 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 764 if (sp < sl) { 765 int b2 = src[sp++]; 766 if (isNotContinuation(b2)) { 767 if (!doReplace) { 768 throwMalformed(sp - 1, 1); 769 } 770 putChar(dst, dp++, repl); 771 sp--; 772 } else { 773 putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ 774 (((byte) 0xC0 << 6) ^ 775 ((byte) 0x80 << 0)))); 776 } 777 continue; 778 } 779 if (!doReplace) { 780 throwMalformed(sp, 1); // underflow() 781 } 782 putChar(dst, dp++, repl); 783 break; 784 } else if ((b1 >> 4) == -2) { 785 if (sp + 1 < sl) { 786 int b2 = src[sp++]; 787 int b3 = src[sp++]; 788 if (isMalformed3(b1, b2, b3)) { 789 if (!doReplace) { 790 throwMalformed(sp - 3, 3); 791 } 792 putChar(dst, dp++, repl); 793 sp -= 3; 794 sp += malformedN(src, sp, 3); 795 } else { 796 char c = (char)((b1 << 12) ^ 797 (b2 << 6) ^ 798 (b3 ^ 799 (((byte) 0xE0 << 12) ^ 800 ((byte) 0x80 << 6) ^ 801 ((byte) 0x80 << 0)))); 802 if (isSurrogate(c)) { 803 if (!doReplace) { 804 throwMalformed(sp - 3, 3); 805 } 806 putChar(dst, dp++, repl); 807 } else { 808 putChar(dst, dp++, c); 809 } 810 } 811 continue; 812 } 813 if (sp < sl && isMalformed3_2(b1, src[sp])) { 814 if (!doReplace) { 815 throwMalformed(sp - 1, 2); 816 } 817 putChar(dst, dp++, repl); 818 continue; 819 } 820 if (!doReplace){ 821 throwMalformed(sp, 1); 822 } 823 putChar(dst, dp++, repl); 824 break; 825 } else if ((b1 >> 3) == -2) { 826 if (sp + 2 < sl) { 827 int b2 = src[sp++]; 828 int b3 = src[sp++]; 829 int b4 = src[sp++]; 830 int uc = ((b1 << 18) ^ 831 (b2 << 12) ^ 832 (b3 << 6) ^ 833 (b4 ^ 834 (((byte) 0xF0 << 18) ^ 835 ((byte) 0x80 << 12) ^ 836 ((byte) 0x80 << 6) ^ 837 ((byte) 0x80 << 0)))); 838 if (isMalformed4(b2, b3, b4) || 839 !isSupplementaryCodePoint(uc)) { // shortest form check 840 if (!doReplace) { 841 throwMalformed(sp - 4, 4); 842 } 843 putChar(dst, dp++, repl); 844 sp -= 4; 845 sp += malformedN(src, sp, 4); 846 } else { 847 putChar(dst, dp++, highSurrogate(uc)); 848 putChar(dst, dp++, lowSurrogate(uc)); 849 } 850 continue; 851 } 852 b1 &= 0xff; 853 if (b1 > 0xf4 || 854 sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { 855 if (!doReplace) { 856 throwMalformed(sp - 1, 1); // or 2 857 } 858 putChar(dst, dp++, repl); 859 continue; 860 } 861 if (!doReplace) { 862 throwMalformed(sp - 1, 1); 863 } 864 sp++; 865 putChar(dst, dp++, repl); 866 if (sp < sl && isMalformed4_3(src[sp])) { 867 continue; 868 } 869 break; 870 } else { 871 if (!doReplace) { 872 throwMalformed(sp - 1, 1); 873 } 874 putChar(dst, dp++, repl); 875 } 876 } 877 if (dp != len) { 878 dst = Arrays.copyOf(dst, dp << 1); 879 } 880 return ret.with(dst, UTF16); 881 } 882 883 private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { 884 if (coder == UTF16) 885 return encodeUTF8_UTF16(val, doReplace); 886 887 if (!hasNegatives(val, 0, val.length)) 888 return Arrays.copyOf(val, val.length); 889 890 int dp = 0; 891 byte[] dst = new byte[val.length << 1]; 892 for (int sp = 0; sp < val.length; sp++) { 893 byte c = val[sp]; 894 if (c < 0) { 895 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); 896 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 897 } else { 898 dst[dp++] = c; 899 } 900 } 901 if (dp == dst.length) 902 return dst; 903 return Arrays.copyOf(dst, dp); 904 } 905 906 private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { 907 int dp = 0; 908 int sp = 0; 909 int sl = val.length >> 1; 910 byte[] dst = new byte[sl * 3]; 911 char c; 912 while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { 913 // ascii fast loop; 914 dst[dp++] = (byte)c; 915 sp++; 916 } 917 while (sp < sl) { 918 c = StringUTF16.getChar(val, sp++); 919 if (c < 0x80) { 920 dst[dp++] = (byte)c; 921 } else if (c < 0x800) { 922 dst[dp++] = (byte)(0xc0 | (c >> 6)); 923 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 924 } else if (Character.isSurrogate(c)) { 925 int uc = -1; 926 char c2; 927 if (Character.isHighSurrogate(c) && sp < sl && 928 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { 929 uc = Character.toCodePoint(c, c2); 930 } 931 if (uc < 0) { 932 if (doReplace) { 933 dst[dp++] = '?'; 934 } else { 935 throwUnmappable(sp - 1, 1); // or 2, does not matter here 936 } 937 } else { 938 dst[dp++] = (byte)(0xf0 | ((uc >> 18))); 939 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); 940 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); 941 dst[dp++] = (byte)(0x80 | (uc & 0x3f)); 942 sp++; // 2 chars 943 } 944 } else { 945 // 3 bytes, 16 bits 946 dst[dp++] = (byte)(0xe0 | ((c >> 12))); 947 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); 948 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 949 } 950 } 951 if (dp == dst.length) { 952 return dst; 953 } 954 return Arrays.copyOf(dst, dp); 955 } 956 957 ////////////////////// for j.u.z.ZipCoder ////////////////////////// 958 959 /* 960 * Throws iae, instead of replacing, if malformed or unmappable. 961 */ 962 static String newStringUTF8NoRepl(byte[] src, int off, int len) { 963 if (COMPACT_STRINGS && !hasNegatives(src, off, len)) 964 return new String(Arrays.copyOfRange(src, off, off + len), LATIN1); 965 Result ret = decodeUTF8_0(src, off, len, false); 966 return new String(ret.value, ret.coder); 967 } 968 969 /* 970 * Throws iae, instead of replacing, if unmappable. 971 */ 972 static byte[] getBytesUTF8NoRepl(String s) { 973 return encodeUTF8(s.coder(), s.value(), false); 974 } 975 976 ////////////////////// for j.n.f.Files ////////////////////////// 977 978 private static boolean isASCII(byte[] src) { 979 return !hasNegatives(src, 0, src.length); 980 } 981 982 private static String newStringLatin1(byte[] src) { 983 if (COMPACT_STRINGS) 984 return new String(src, LATIN1); 985 return new String(StringLatin1.inflate(src, 0, src.length), UTF16); 986 } 987 988 static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { 989 try { 990 return newStringNoRepl1(src, cs); 991 } catch (IllegalArgumentException e) { 992 Throwable cause = e.getCause(); 993 if (cause != null && cause instanceof MalformedInputException) { 994 throw (MalformedInputException)cause; 995 } 996 throw new UnmappableCharacterException(1); 997 } 998 } 999 1000 static String newStringNoRepl1(byte[] src, Charset cs) { 1001 if (cs == UTF_8) { 1002 if (COMPACT_STRINGS && isASCII(src)) 1003 return new String(src, LATIN1); 1004 Result ret = decodeUTF8_0(src, 0, src.length, false); 1005 return new String(ret.value, ret.coder); 1006 } 1007 if (cs == ISO_8859_1) { 1008 return newStringLatin1(src); 1009 } 1010 if (cs == US_ASCII) { 1011 if (isASCII(src)) { 1012 return newStringLatin1(src); 1013 } else { 1014 throwMalformed(src); 1015 } 1016 } 1017 1018 CharsetDecoder cd = cs.newDecoder(); 1019 // ascii fastpath 1020 if ((cd instanceof ArrayDecoder) && 1021 ((ArrayDecoder)cd).isASCIICompatible() && isASCII(src)) { 1022 return newStringLatin1(src); 1023 } 1024 int len = src.length; 1025 if (len == 0) { 1026 return ""; 1027 } 1028 int en = scale(len, cd.maxCharsPerByte()); 1029 char[] ca = new char[en]; 1030 if (cs.getClass().getClassLoader0() != null && 1031 System.getSecurityManager() != null) { 1032 src = Arrays.copyOf(src, len); 1033 } 1034 ByteBuffer bb = ByteBuffer.wrap(src); 1035 CharBuffer cb = CharBuffer.wrap(ca); 1036 try { 1037 CoderResult cr = cd.decode(bb, cb, true); 1038 if (!cr.isUnderflow()) 1039 cr.throwException(); 1040 cr = cd.flush(cb); 1041 if (!cr.isUnderflow()) 1042 cr.throwException(); 1043 } catch (CharacterCodingException x) { 1044 throw new IllegalArgumentException(x); // todo 1045 } 1046 Result ret = resultCached.get().with(ca, 0, cb.position()); 1047 return new String(ret.value, ret.coder); 1048 } 1049 1050 /* 1051 * Throws iae, instead of replacing, if unmappable. 1052 */ 1053 static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { 1054 try { 1055 return getBytesNoRepl1(s, cs); 1056 } catch (IllegalArgumentException e) { 1057 Throwable cause = e.getCause(); 1058 if (cause != null && cause instanceof UnmappableCharacterException) { 1059 throw (UnmappableCharacterException)cause; 1060 } 1061 throw new UnmappableCharacterException(1); 1062 } 1063 } 1064 1065 static byte[] getBytesNoRepl1(String s, Charset cs) { 1066 byte[] val = s.value(); 1067 byte coder = s.coder(); 1068 if (cs == UTF_8) { 1069 if (isASCII(val)) { 1070 return val; 1071 } 1072 return encodeUTF8(coder, val, false); 1073 } 1074 if (cs == ISO_8859_1) { 1075 if (coder == LATIN1) { 1076 return val; 1077 } 1078 return encode8859_1(coder, val, false); 1079 } 1080 if (cs == US_ASCII) { 1081 if (coder == LATIN1) { 1082 if (isASCII(val)) { 1083 return val; 1084 } else { 1085 throwUnmappable(val); 1086 } 1087 } 1088 } 1089 CharsetEncoder ce = cs.newEncoder(); 1090 // fastpath for ascii compatible 1091 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && 1092 ((ArrayEncoder)ce).isASCIICompatible() && 1093 isASCII(val)))) { 1094 return val; 1095 } 1096 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 1097 int en = scale(len, ce.maxBytesPerChar()); 1098 byte[] ba = new byte[en]; 1099 if (len == 0) { 1100 return ba; 1101 } 1102 if (ce instanceof ArrayEncoder) { 1103 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 1104 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 1105 if (blen != -1) { 1106 return safeTrim(ba, blen, true); 1107 } 1108 } 1109 boolean isTrusted = cs.getClass().getClassLoader0() == null || 1110 System.getSecurityManager() == null; 1111 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 1112 : StringUTF16.toChars(val); 1113 ByteBuffer bb = ByteBuffer.wrap(ba); 1114 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 1115 try { 1116 CoderResult cr = ce.encode(cb, bb, true); 1117 if (!cr.isUnderflow()) 1118 cr.throwException(); 1119 cr = ce.flush(bb); 1120 if (!cr.isUnderflow()) 1121 cr.throwException(); 1122 } catch (CharacterCodingException x) { 1123 throw new IllegalArgumentException(x); 1124 } 1125 return safeTrim(ba, bb.position(), isTrusted); 1126 } 1127 }