/*
 * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.lang;

import java.io.UnsupportedEncodingException;
import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import jdk.internal.HotSpotIntrinsicCandidate;
import sun.misc.MessageUtils;
import sun.nio.cs.HistoricallyNamedCharset;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;

import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
import static java.lang.String.COMPACT_STRINGS;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_8;

/**
 * Utility class for string encoding and decoding.
 *
 * <p>Decoding turns a range of a {@code byte[]} into a {@link Result}
 * (a {@code byte[] value} plus a {@code coder} of {@code LATIN1} or
 * {@code UTF16}); encoding turns such a (coder, value) pair back into a
 * {@code byte[]} in the requested charset. The variants that take a charset
 * <em>name</em> cache one decoder and one encoder per thread; the variants
 * that take a {@link Charset} object create a fresh coder on every call.
 */

class StringCoding {

    // Static utility holder; never instantiated.
    private StringCoding() { }

    /** The cached coders for each thread */
    private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
        new ThreadLocal<>();
    private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
        new ThreadLocal<>();

    // True until the "default charset not supported" warning has been
    // printed once. Plain (non-volatile, unsynchronized) static field.
    // NOTE(review): concurrent first use could presumably print the warning
    // more than once -- confirm that this is acceptable.
    private static boolean warnUnsupportedCharset = true;

    /**
     * Returns the object referenced through the given thread-local soft
     * reference, or {@code null} if the thread-local has no value for the
     * current thread or the referent has been cleared.
     */
    private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
        SoftReference<T> sr = tl.get();
        if (sr == null)
            return null;
        return sr.get();
    }

    /**
     * Stores {@code ob} in the given thread-local, wrapped in a new
     * {@link SoftReference}.
     */
    private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
        tl.set(new SoftReference<>(ob));
    }

    // Trim the given byte array to the given length
    //
    // The array itself is returned only when it already has exactly the
    // requested length AND either the caller marked the coder as trusted or
    // no security manager is installed. In every other case a fresh copy of
    // length len is returned.
    private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
        if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
            return ba;
        else
            return Arrays.copyOf(ba, len);
    }

    /**
     * Returns {@code len} multiplied by {@code expansionFactor}, truncated
     * to an {@code int}. Used to size the output buffer from a coder's
     * maxCharsPerByte / maxBytesPerChar.
     */
    private static int scale(int len, float expansionFactor) {
        // We need to perform double, not float, arithmetic; otherwise
        // we lose low order bits when len is larger than 2**24.
        return (int)(len * (double)expansionFactor);
    }

    /**
     * Returns the charset with the given name, or {@code null} if
     * {@link Charset#isSupported} reports it as unsupported.
     *
     * <p>An {@link UnsupportedCharsetException} from {@code forName} after
     * {@code isSupported} returned true is rethrown wrapped in an
     * {@link Error}. The callers in this class catch
     * {@link IllegalCharsetNameException} around this call.
     */
    private static Charset lookupCharset(String csn) {
        if (Charset.isSupported(csn)) {
            try {
                return Charset.forName(csn);
            } catch (UnsupportedCharsetException x) {
                throw new Error(x);
            }
        }
        return null;
    }

    /**
     * Prints a warning that the default charset {@code csn} is unsupported
     * and ISO-8859-1 is used instead. The warning is printed only while the
     * {@code warnUnsupportedCharset} flag is set; the flag is cleared after
     * printing.
     */
    private static void warnUnsupportedCharset(String csn) {
        if (warnUnsupportedCharset) {
            // Use sun.misc.MessageUtils rather than the Logging API or
            // System.err since this method may be called during VM
            // initialization before either is available.
            MessageUtils.err("WARNING: Default charset " + csn +
                             " not supported, using ISO-8859-1 instead");
            warnUnsupportedCharset = false;
        }
    }

    /**
     * Mutable holder for the outcome of a decode: the string's backing
     * bytes and the coder ({@code LATIN1} or {@code UTF16}) describing them.
     * Every {@code with(...)} method overwrites both fields and returns
     * {@code this}.
     */
    static class Result {
        byte[] value;
        byte coder;

        /**
         * Sets this result to the empty string: a zero-length array, with
         * coder LATIN1 when COMPACT_STRINGS is set and UTF16 otherwise.
         */
        Result with() {
            coder = COMPACT_STRINGS ? LATIN1 : UTF16;
            value = new byte[0];
            return this;
        }

        /**
         * Sets this result from {@code len} chars of {@code val} starting
         * at {@code off}.
         *
         * <p>When String.COMPACT_STRINGS is set, StringUTF16.compress is
         * tried first; a non-null return is stored with coder LATIN1.
         * Otherwise (or when compress returned null) the chars are stored
         * via StringUTF16.toBytes with coder UTF16.
         */
        Result with(char[] val, int off, int len) {
            if (String.COMPACT_STRINGS) {
                byte[] bs = StringUTF16.compress(val, off, len);
                if (bs != null) {
                    value = bs;
                    coder = LATIN1;
                    return this;
                }
            }
            coder = UTF16;
            value = StringUTF16.toBytes(val, off, len);
            return this;
        }

        /**
         * Sets this result to the given array and coder. The array is
         * stored as-is, without copying.
         */
        Result with(byte[] val, byte coder) {
            this.coder = coder;
            value = val;
            return this;
        }
    }

    /**
     * Returns {@code true} if any byte in {@code ba[off .. off+len)} is
     * negative, i.e. has its high bit set and is therefore not 7-bit ASCII.
     *
     * <p>NOTE(review): annotated {@code @HotSpotIntrinsicCandidate};
     * presumably the VM may substitute its own implementation, so the Java
     * body and signature should stay equivalent to it -- confirm before
     * changing.
     */
    @HotSpotIntrinsicCandidate
    private static boolean hasNegatives(byte[] ba, int off, int len) {
        for (int i = off; i < off + len; i++) {
            if (ba[i] < 0) {
                return true;
            }
        }
        return false;
    }

    // -- Decoding --

    /**
     * A decoder bound to one charset, cached per thread by
     * {@link #decode(String, byte[], int, int)}.
     *
     * <p>The decoder is configured to REPLACE malformed input and unmappable
     * characters. Each instance owns a single {@link Result} that every
     * {@code decode} call overwrites and returns, so the Result from one
     * call is only valid until the next call on the same instance.
     */
    static class StringDecoder {
        // Name the decoder was requested under (may be an alias).
        private final String requestedCharsetName;
        private final Charset cs;
        // True when cd is an ArrayDecoder that reports itself ASCII compatible.
        private final boolean isASCIICompatible;
        private final CharsetDecoder cd;
        // Reused for every decode() call on this instance.
        protected final Result result;

        StringDecoder(Charset cs, String rcn) {
            this.requestedCharsetName = rcn;
            this.cs = cs;
            this.cd = cs.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
            this.result = new Result();
            this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
                    ((ArrayDecoder)cd).isASCIICompatible();
        }

        /**
         * Returns the charset's historical name when the charset is a
         * HistoricallyNamedCharset, otherwise {@code cs.name()}.
         */
        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset)cs).historicalName();
            return cs.name();
        }

        /** Returns the name this decoder was created for. */
        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        /**
         * Decodes {@code ba[off .. off+len)} into this decoder's shared
         * {@link Result} and returns it.
         */
        Result decode(byte[] ba, int off, int len) {
            if (len == 0) {
                return result.with();
            }
            // fastpath for ascii compatible
            // (input has no high-bit bytes, so the bytes can be used directly
            // as LATIN1, or inflated to UTF16 when strings are not compact)
            if (isASCIICompatible && !hasNegatives(ba, off, len)) {
                if (COMPACT_STRINGS) {
                    return result.with(Arrays.copyOfRange(ba, off, off + len),
                                      LATIN1);
                } else {
                    return result.with(StringLatin1.inflate(ba, off, len), UTF16);
                }
            }
            // Size the char buffer for the worst case reported by the decoder.
            int en = scale(len, cd.maxCharsPerByte());
            char[] ca = new char[en];
            if (cd instanceof ArrayDecoder) {
                int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
                return result.with(ca, 0, clen);
            }
            // Generic path: the cached decoder is reused, so reset it first.
            cd.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
            CharBuffer cb = CharBuffer.wrap(ca);
            try {
                CoderResult cr = cd.decode(bb, cb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = cd.flush(cb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return result.with(ca, 0, cb.position());
        }
    }

    /**
     * Decoder used when the looked-up charset is ISO_8859_1: the input
     * bytes are taken as LATIN1 directly (copied), or inflated to UTF16
     * when COMPACT_STRINGS is not set. The CharsetDecoder created by the
     * superclass constructor is not used by this override.
     */
    private static class StringDecoder8859_1 extends StringDecoder {
        StringDecoder8859_1(Charset cs, String rcn) {
            super(cs, rcn);
        }
        Result decode(byte[] ba, int off, int len) {
            if (COMPACT_STRINGS) {
                return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
            } else {
                return result.with(StringLatin1.inflate(ba, off, len), UTF16);
            }
        }
    }

    /**
     * Decodes {@code ba[off .. off+len)} using the charset named
     * {@code charsetName} ("ISO-8859-1" when {@code charsetName} is null).
     *
     * <p>The current thread's cached {@link StringDecoder} is reused when
     * the name equals either the name it was requested under or its
     * {@code charsetName()}. Otherwise the charset is looked up, a new
     * decoder is created (specialised for UTF_8 and ISO_8859_1) and stored
     * in the thread-local cache.
     *
     * @throws UnsupportedEncodingException if the charset is not supported
     *         or the name is rejected with IllegalCharsetNameException
     */
    static Result decode(String charsetName, byte[] ba, int off, int len)
        throws UnsupportedEncodingException
    {
        StringDecoder sd = deref(decoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
                              || csn.equals(sd.charsetName()))) {
            sd = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null) {
                    if (cs == UTF_8) {
                        sd = new StringDecoderUTF8(cs, csn);
                    } else if (cs == ISO_8859_1) {
                        sd = new StringDecoder8859_1(cs, csn);
                    } else {
                        sd = new StringDecoder(cs, csn);
                    }
                }
            } catch (IllegalCharsetNameException x) {}
            // An illegal name leaves sd == null and is reported below as
            // an unsupported encoding.
            if (sd == null)
                throw new UnsupportedEncodingException(csn);
            set(decoder, sd);
        }
        return sd.decode(ba, off, len);
    }

    /**
     * Decodes {@code ba[off .. off+len)} using the given charset object.
     * No decoder is cached; a new {@link Result} is returned on every call.
     */
    static Result decode(Charset cs, byte[] ba, int off, int len) {
        // (1)We never cache the "external" cs, the only benefit of creating
        // an additional StringDe/Encoder object to wrap it is to share the
        // de/encode() method. These SD/E objects are short-lived, the young-gen
        // gc should be able to take care of them well. But the best approach
        // is still not to generate them if not really necessary.
        // (2)The defensive copy of the input byte/char[] has a big performance
        // impact, as well as the outgoing result byte/char[]. Need to do the
        // optimization check of (sm==null && classLoader0==null) for both.
        // (3)getClass().getClassLoader0() is expensive
        // (4)There might be a timing gap in isTrusted setting. getClassLoader0()
        // is only checked (and then isTrusted gets set) when (SM==null). It is
        // possible that the SM==null for now but then SM is NOT null later
        // when safeTrim() is invoked...the "safe" way to do is to redundant
        // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
        // but it then can be argued that the SM is null when the operation
        // is started...
        if (cs == UTF_8) {
            return StringDecoderUTF8.decode(ba, off, len, new Result());
        }
        CharsetDecoder cd = cs.newDecoder();
        // ascii fastpath
        // (also taken unconditionally for ISO_8859_1, whose bytes map
        // one-to-one onto LATIN1)
        if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) &&
                                 ((ArrayDecoder)cd).isASCIICompatible() &&
                                 !hasNegatives(ba, off, len))) {
             if (COMPACT_STRINGS) {
                 return new Result().with(Arrays.copyOfRange(ba, off, off + len),
                                          LATIN1);
             } else {
                 return new Result().with(StringLatin1.inflate(ba, off, len), UTF16);
             }
        }
        int en = scale(len, cd.maxCharsPerByte());
        if (len == 0) {
            return new Result().with();
        }
        // Defensive copy: with a security manager installed and a charset
        // whose class has a non-null class loader, the decoder is handed a
        // private copy of the input range instead of the caller's array.
        if (System.getSecurityManager() != null &&
            cs.getClass().getClassLoader0() != null) {
            ba = Arrays.copyOfRange(ba, off, off + len);
            off = 0;
        }
        cd.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();

        char[] ca = new char[en];
        if (cd instanceof ArrayDecoder) {
            int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
            return new Result().with(ca, 0, clen);
        }
        ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
        CharBuffer cb = CharBuffer.wrap(ca);
        try {
            CoderResult cr = cd.decode(bb, cb, true);
            if (!cr.isUnderflow())
                cr.throwException();
            cr = cd.flush(cb);
            if (!cr.isUnderflow())
                cr.throwException();
        } catch (CharacterCodingException x) {
            // Substitution is always enabled,
            // so this shouldn't happen
            throw new Error(x);
        }
        return new Result().with(ca, 0, cb.position());
    }

    /**
     * Decodes {@code ba[off .. off+len)} using the default charset.
     *
     * <p>If the default charset's name is reported as unsupported, a
     * warning is emitted and ISO-8859-1 is used instead. If ISO-8859-1 is
     * unavailable as well, an error message is printed and the VM exits
     * with status 1.
     */
    static Result decode(byte[] ba, int off, int len) {
        String csn = Charset.defaultCharset().name();
        try {
            // use charset name decode() variant which provides caching.
            return decode(csn, ba, off, len);
        } catch (UnsupportedEncodingException x) {
            warnUnsupportedCharset(csn);
        }
        try {
            return decode("ISO-8859-1", ba, off, len);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                             + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }

    // -- Encoding --

    /**
     * An encoder bound to one charset, cached per thread by
     * {@link #encode(String, byte, byte[])}. The encoder is configured to
     * REPLACE malformed input and unmappable characters.
     */
    private static class StringEncoder {
        private Charset cs;
        private CharsetEncoder ce;
        // True when ce is an ArrayEncoder that reports itself ASCII compatible.
        private final boolean isASCIICompatible;
        // Name the encoder was requested under (may be an alias).
        private final String requestedCharsetName;
        // True when the charset's class has a null class loader.
        // NOTE(review): presumably this means a JDK-provided (bootstrap
        // loaded) charset -- confirm.
        private final boolean isTrusted;

        private StringEncoder(Charset cs, String rcn) {
            this.requestedCharsetName = rcn;
            this.cs = cs;
            this.ce = cs.newEncoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
            this.isTrusted = (cs.getClass().getClassLoader0() == null);
            this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
                    ((ArrayEncoder)ce).isASCIICompatible();
        }

        /**
         * Returns the charset's historical name when the charset is a
         * HistoricallyNamedCharset, otherwise {@code cs.name()}.
         */
        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset)cs).historicalName();
            return cs.name();
        }

        /** Returns the name this encoder was created for. */
        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        /**
         * Encodes the string bytes {@code val} (interpreted according to
         * {@code coder}) and returns the encoded bytes.
         */
        byte[] encode(byte coder, byte[] val) {
            // fastpath for ascii compatible
            // (LATIN1 bytes without the high bit set are returned as a copy)
            if (coder == LATIN1 && isASCIICompatible &&
                !hasNegatives(val, 0, val.length)) {
                return Arrays.copyOf(val, val.length);
            }
            int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
            int en = scale(len, ce.maxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0) {
                return ba;
            }
            if (ce instanceof ArrayEncoder) {
                // An untrusted encoder gets a private copy of the input.
                if (!isTrusted) {
                    val = Arrays.copyOf(val, val.length);
                }
                int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
                                              : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
                // A return of -1 falls through to the generic
                // CharsetEncoder path below.
                if (blen != -1) {
                    return safeTrim(ba, blen, isTrusted);
                }
            }
            char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
                                           : StringUTF16.toChars(val);
            // The cached encoder is reused across calls, so reset it first.
            ce.reset();
            ByteBuffer bb = ByteBuffer.wrap(ba);
            CharBuffer cb = CharBuffer.wrap(ca, 0, len);
            try {
                CoderResult cr = ce.encode(cb, bb, true);
                if (!cr.isUnderflow())
                    cr.throwException();
                cr = ce.flush(bb);
                if (!cr.isUnderflow())
                    cr.throwException();
            } catch (CharacterCodingException x) {
                // Substitution is always enabled,
                // so this shouldn't happen
                throw new Error(x);
            }
            return safeTrim(ba, bb.position(), isTrusted);
        }
    }

    /**
     * Copies up to {@code len} chars read from {@code sa} via
     * StringUTF16.getChar, starting at index {@code sp}, into {@code da}
     * starting at {@code dp}, one byte per char. Stops at the first char
     * greater than U+00FF.
     *
     * @return the number of chars copied (equal to {@code len} when no
     *         char above U+00FF was encountered)
     *
     * <p>NOTE(review): annotated {@code @HotSpotIntrinsicCandidate};
     * presumably the VM may substitute its own implementation, so the Java
     * body and signature should stay equivalent to it -- confirm before
     * changing.
     */
    @HotSpotIntrinsicCandidate
    private static int implEncodeISOArray(byte[] sa, int sp,
                                          byte[] da, int dp, int len) {
        int i = 0;
        for (; i < len; i++) {
            char c = StringUTF16.getChar(sa, sp++);
            if (c > '\u00FF')
                break;
            da[dp++] = (byte)c;
        }
        return i;
    }

    /**
     * Encodes the string bytes {@code val} as ISO-8859-1.
     *
     * <p>A LATIN1 value is returned as a copy. For a UTF16 value, every
     * char above U+00FF is replaced by {@code '?'}; a high surrogate that
     * is immediately followed by a low surrogate is replaced, together with
     * that low surrogate, by a single {@code '?'}.
     */
    static byte[] encode8859_1(byte coder, byte[] val) {
        if (coder == LATIN1) {
            return Arrays.copyOf(val, val.length);
        }
        int len = val.length >> 1;
        byte[] dst = new byte[len];
        int dp = 0;
        int sp = 0;
        int sl = len;
        while (sp < sl) {
            // Bulk-copy the longest run of chars <= U+00FF.
            int ret = implEncodeISOArray(val, sp, dst, dp, len);
            sp = sp + ret;
            dp = dp + ret;
            if (ret != len) {
                // The run stopped early: val[sp] is unencodable.
                char c = StringUTF16.getChar(val, sp++);
                if (Character.isHighSurrogate(c) && sp < sl &&
                    Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
                    sp++;
                }
                dst[dp++] = '?';
                // Remaining chars to process on the next iteration.
                len = sl - sp;
            }
        }
        // Surrogate pairs shrink the output; trim only when that happened.
        if (dp == dst.length) {
            return dst;
        }
        return Arrays.copyOf(dst, dp);
    }

    /**
     * Encodes the string bytes {@code val} as US-ASCII.
     *
     * <p>For a LATIN1 value, every negative byte (high bit set) becomes
     * {@code '?'}. For a UTF16 value, every char of 0x80 or above becomes
     * {@code '?'}, with a high surrogate and its immediately following low
     * surrogate collapsing into a single {@code '?'}.
     */
    static byte[] encodeASCII(byte coder, byte[] val) {
        if (coder == LATIN1) {
            byte[] dst = new byte[val.length];
            for (int i = 0; i < val.length; i++) {
                if (val[i] < 0) {
                    dst[i] = '?';
                } else {
                    dst[i] = val[i];
                }
            }
            return dst;
        }
        int len = val.length >> 1;
        byte[] dst = new byte[len];
        int dp = 0;
        for (int i = 0; i < len; i++) {
            char c = StringUTF16.getChar(val, i);
            if (c < 0x80) {
                dst[dp++] = (byte)c;
                continue;
            }
            // Skip the low half of a well-formed surrogate pair so the pair
            // yields one replacement byte.
            if (Character.isHighSurrogate(c) && i + 1 < len &&
                Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
                i++;
            }
            dst[dp++] = '?';
        }
        if (len == dp) {
            return dst;
        }
        return Arrays.copyOf(dst, dp);
    }

    /**
     * Encodes the string bytes {@code val} as UTF-8.
     *
     * <p>LATIN1 input: non-negative bytes are copied, negative bytes become
     * a two-byte sequence. UTF16 input: chars below 0x80 take one byte,
     * below 0x800 two bytes, other non-surrogate chars three bytes, and a
     * well-formed surrogate pair four bytes; a surrogate that is not part
     * of a well-formed pair becomes {@code '?'}. The output buffer is
     * allocated at worst-case size and trimmed before returning.
     *
     * <p>NOTE(review): the buffer sizes {@code val.length << 1} and
     * {@code sl * 3} are computed in int arithmetic and wrap once
     * {@code val.length} reaches 2^30 (LATIN1) or {@code sl} exceeds
     * Integer.MAX_VALUE / 3 (UTF16) -- confirm whether inputs that large
     * can reach this method.
     */
    static byte[] encodeUTF8(byte coder, byte[] val) {
        int dp = 0;
        byte[] dst;
        if (coder == LATIN1) {
            dst = new byte[val.length << 1];
            for (int sp = 0; sp < val.length; sp++) {
                byte c = val[sp];
                if (c < 0) {
                    dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
                } else {
                    dst[dp++] = c;
                }
            }
        } else {
            int sp = 0;
            int sl = val.length >> 1;
            dst = new byte[sl * 3];
            char c;
            while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
                // ascii fast loop;
                dst[dp++] = (byte)c;
                sp++;
            }
            while (sp < sl) {
                c = StringUTF16.getChar(val, sp++);
                if (c < 0x80) {
                    dst[dp++] = (byte)c;
                } else if (c < 0x800) {
                    dst[dp++] = (byte)(0xc0 | (c >> 6));
                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
                } else if (Character.isSurrogate(c)) {
                    // uc stays -1 unless c starts a well-formed pair.
                    int uc = -1;
                    char c2;
                    if (Character.isHighSurrogate(c) && sp < sl &&
                        Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
                        uc = Character.toCodePoint(c, c2);
                    }
                    if (uc < 0) {
                        dst[dp++] = '?';
                    } else {
                        dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
                        dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
                        dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
                        dst[dp++] = (byte)(0x80 | (uc & 0x3f));
                        sp++;  // 2 chars
                    }
                } else {
                    // 3 bytes, 16 bits
                    dst[dp++] = (byte)(0xe0 | ((c >> 12)));
                    dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));
                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
                }
            }
        }
        if (dp == dst.length) {
            return dst;
        }
        return Arrays.copyOf(dst, dp);
    }

    /**
     * Encodes the string bytes {@code val} using the charset named
     * {@code charsetName} ("ISO-8859-1" when {@code charsetName} is null).
     *
     * <p>The current thread's cached {@link StringEncoder} is reused when
     * the name equals either the name it was requested under or its
     * {@code charsetName()}. Otherwise the charset is looked up: UTF_8,
     * ISO_8859_1 and US_ASCII are handled by the dedicated static encoders
     * and return directly without touching the cache; any other charset
     * gets a new StringEncoder that is stored in the thread-local cache.
     *
     * @throws UnsupportedEncodingException if the charset is not supported
     *         or the name is rejected with IllegalCharsetNameException
     */
    static byte[] encode(String charsetName, byte coder, byte[] val)
        throws UnsupportedEncodingException
    {
        StringEncoder se = deref(encoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((se == null) || !(csn.equals(se.requestedCharsetName())
                              || csn.equals(se.charsetName()))) {
            se = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null) {
                    if (cs == UTF_8) {
                        return encodeUTF8(coder, val);
                    } else if (cs == ISO_8859_1) {
                        return encode8859_1(coder, val);
                    } else if (cs == US_ASCII) {
                        return encodeASCII(coder, val);
                    }
                    se = new StringEncoder(cs, csn);
                }
            } catch (IllegalCharsetNameException x) {}
            // An illegal name leaves se == null and is reported below as
            // an unsupported encoding.
            if (se == null) {
                throw new UnsupportedEncodingException (csn);
            }
            set(encoder, se);
        }
        return se.encode(coder, val);
    }

    /**
     * Encodes the string bytes {@code val} using the given charset object.
     * UTF_8, ISO_8859_1 and US_ASCII are dispatched to the dedicated static
     * encoders; for any other charset a new CharsetEncoder is created on
     * every call and nothing is cached.
     */
    static byte[] encode(Charset cs, byte coder, byte[] val) {
        if (cs == UTF_8) {
            return encodeUTF8(coder, val);
        } else if (cs == ISO_8859_1) {
            return encode8859_1(coder, val);
        } else if (cs == US_ASCII) {
            return encodeASCII(coder, val);
        }
        CharsetEncoder ce = cs.newEncoder();
        // fastpath for ascii compatible
        if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
                                 ((ArrayEncoder)ce).isASCIICompatible() &&
                                 !hasNegatives(val, 0, val.length)))) {
            return Arrays.copyOf(val, val.length);
        }
        int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
        int en = scale(len, ce.maxBytesPerChar());
        byte[] ba = new byte[en];
        if (len == 0) {
            return ba;
        }
        // Trusted when no security manager is installed or the charset's
        // class has a null class loader; controls the defensive copies of
        // the input (below) and of the result (in safeTrim).
        boolean isTrusted = System.getSecurityManager() == null ||
                            cs.getClass().getClassLoader0() == null;
        ce.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();
        if (ce instanceof ArrayEncoder) {
            if (!isTrusted) {
                val = Arrays.copyOf(val, val.length);
            }
            int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
                                          : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
            // A return of -1 falls through to the generic CharsetEncoder
            // path below.
            if (blen != -1) {
                return safeTrim(ba, blen, isTrusted);
            }
        }
        char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
                                       : StringUTF16.toChars(val);
        ByteBuffer bb = ByteBuffer.wrap(ba);
        CharBuffer cb = CharBuffer.wrap(ca, 0, len);
        try {
            CoderResult cr = ce.encode(cb, bb, true);
            if (!cr.isUnderflow())
                cr.throwException();
            cr = ce.flush(bb);
            if (!cr.isUnderflow())
                cr.throwException();
        } catch (CharacterCodingException x) {
            throw new Error(x);
        }
        return safeTrim(ba, bb.position(), isTrusted);
    }

    /**
     * Encodes the string bytes {@code val} using the default charset.
     *
     * <p>If the default charset's name is reported as unsupported, a
     * warning is emitted and ISO-8859-1 is used instead. If ISO-8859-1 is
     * unavailable as well, an error message is printed and the VM exits
     * with status 1.
     */
    static byte[] encode(byte coder, byte[] val) {
        String csn = Charset.defaultCharset().name();
        try {
            // use charset name encode() variant which provides caching.
            return encode(csn, coder, val);
        } catch (UnsupportedEncodingException x) {
            warnUnsupportedCharset(csn);
        }
        try {
            return encode("ISO-8859-1", coder, val);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, MessageUtils is
            // the only way we will be able to get any kind of error message.
            MessageUtils.err("ISO-8859-1 charset not available: "
                             + x.toString());
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }
}