/*
 * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.lang;

import java.io.UnsupportedEncodingException;
import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import jdk.internal.HotSpotIntrinsicCandidate;
import sun.nio.cs.HistoricallyNamedCharset;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;

import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
import static java.lang.String.COMPACT_STRINGS;

/**
 * Utility class for string encoding and decoding.
 *
 * <p>Decoding turns a range of a {@code byte[]} into a {@link Result}
 * (a {@code byte[]} value plus a coder tag, {@code LATIN1} or {@code UTF16});
 * encoding turns such a value/coder pair back into a {@code byte[]} in the
 * requested charset. Lookups by charset <em>name</em> cache one decoder and
 * one encoder per thread; lookups by {@link Charset} instance never cache.
 */

class StringCoding {

    private StringCoding() { }

    /** The cached coders for each thread */
    private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
        new ThreadLocal<>();
    private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
        new ThreadLocal<>();

    // Charsets that get dedicated fast paths; compared by identity below.
    private static final Charset ISO_8859_1 = Charset.forName("iso-8859-1");
    private static final Charset US_ASCII = Charset.forName("us-ascii");
    private static final Charset UTF_8 = Charset.forName("utf-8");

    /** Cleared after the first "default charset not supported" warning. */
    private static boolean warnUnsupportedCharset = true;

    /**
     * Returns the object held by the thread-local soft reference, or
     * {@code null} if nothing was stored for this thread or the referent
     * has been cleared.
     */
    private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
        SoftReference<T> sr = tl.get();
        if (sr == null)
            return null;
        return sr.get();
    }

    /** Stores {@code ob} for the current thread behind a new soft reference. */
    private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
        tl.set(new SoftReference<>(ob));
    }

    /**
     * Trims the given byte array to the given length.
     *
     * <p>The array itself is returned, without copying, only when it already
     * has exactly {@code len} elements and either {@code isTrusted} is set or
     * no security manager is installed; otherwise a copy of length
     * {@code len} is returned.
     */
    private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
        if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
            return ba;
        else
            return Arrays.copyOf(ba, len);
    }

    /** Scales {@code len} by {@code expansionFactor}, truncating to int. */
    private static int scale(int len, float expansionFactor) {
        // We need to perform double, not float, arithmetic; otherwise
        // we lose low order bits when len is larger than 2**24.
        return (int)(len * (double)expansionFactor);
    }

    /**
     * Returns the charset named {@code csn}, or {@code null} if
     * {@link Charset#isSupported} reports it as unsupported.
     *
     * @throws IllegalCharsetNameException propagated from the
     *         {@code Charset} lookup methods; callers in this file catch it
     */
    private static Charset lookupCharset(String csn) {
        if (Charset.isSupported(csn)) {
            try {
                return Charset.forName(csn);
            } catch (UnsupportedCharsetException x) {
                // isSupported() just returned true for this name.
                throw new Error(x);
            }
        }
        return null;
    }

    /** Prints the unsupported-default-charset warning at most once. */
    private static void warnUnsupportedCharset(String csn) {
        if (warnUnsupportedCharset) {
            // Use err(String) rather than the Logging API or System.err
            // since this method may be called during VM initialization
            // before either is available.
            err("WARNING: Default charset " + csn +
                " not supported, using ISO-8859-1 instead\n");
            warnUnsupportedCharset = false;
        }
    }

    /**
     * Mutable holder for the outcome of a decode: the string's byte
     * representation and the coder ({@code LATIN1} or {@code UTF16}) that
     * describes it. Each {@code with} method overwrites both fields and
     * returns {@code this}.
     */
    static class Result {
        byte[] value;
        byte coder;

        /** Sets this result to the empty string. */
        Result with() {
            coder = COMPACT_STRINGS ? LATIN1 : UTF16;
            value = new byte[0];
            return this;
        }

        /**
         * Sets this result from {@code len} chars of {@code val} starting at
         * {@code off}. With compact strings enabled the chars are stored as
         * {@code LATIN1} when {@code StringUTF16.compress} yields a non-null
         * array; in every other case they are stored as {@code UTF16}.
         */
        Result with(char[] val, int off, int len) {
            if (COMPACT_STRINGS) {
                byte[] bs = StringUTF16.compress(val, off, len);
                if (bs != null) {
                    value = bs;
                    coder = LATIN1;
                    return this;
                }
            }
            coder = UTF16;
            value = StringUTF16.toBytes(val, off, len);
            return this;
        }

        /** Sets this result to the given array (not copied) and coder. */
        Result with(byte[] val, byte coder) {
            this.coder = coder;
            value = val;
            return this;
        }
    }

    /**
     * Returns {@code true} if any of the {@code len} bytes of {@code ba}
     * starting at {@code off} is negative (has its high bit set).
     */
    @HotSpotIntrinsicCandidate
    public static boolean hasNegatives(byte[] ba, int off, int len) {
        for (int i = off; i < off + len; i++) {
            if (ba[i] < 0) {
                return true;
            }
        }
        return false;
    }

    // -- Decoding --

    /**
     * Fills {@code result} from bytes that map one-to-one onto chars
     * (ISO-8859-1 input, or input already verified to contain no negative
     * bytes): a plain copy tagged {@code LATIN1} when compact strings are
     * enabled, otherwise the output of {@code StringLatin1.inflate} tagged
     * {@code UTF16}.
     */
    private static Result decodeLatin1(byte[] ba, int off, int len, Result result) {
        if (COMPACT_STRINGS) {
            return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
        } else {
            return result.with(StringLatin1.inflate(ba, off, len), UTF16);
        }
    }

    /**
     * Runs the generic buffer-based decode (decode, then flush) of
     * {@code ba[off, off+len)} into {@code ca} and returns the number of
     * chars written. The caller is responsible for resetting {@code cd} and
     * for having configured it with the REPLACE error actions.
     */
    private static int decodeLoop(CharsetDecoder cd, byte[] ba, int off, int len,
                                  char[] ca) {
        ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
        CharBuffer cb = CharBuffer.wrap(ca);
        try {
            CoderResult cr = cd.decode(bb, cb, true);
            if (!cr.isUnderflow())
                cr.throwException();
            cr = cd.flush(cb);
            if (!cr.isUnderflow())
                cr.throwException();
        } catch (CharacterCodingException x) {
            // Substitution is always enabled,
            // so this shouldn't happen
            throw new Error(x);
        }
        return cb.position();
    }

    /**
     * A reusable decoder bound to one charset. Instances are cached per
     * thread by {@link #decode(String, byte[], int, int)}; the embedded
     * {@link Result} is reused across calls, so each call to
     * {@link #decode(byte[], int, int)} overwrites the previous outcome.
     */
    static class StringDecoder {
        private final String requestedCharsetName;
        private final Charset cs;
        private final boolean isASCIICompatible;
        private final CharsetDecoder cd;
        protected final Result result;

        StringDecoder(Charset cs, String rcn) {
            this.requestedCharsetName = rcn;
            this.cs = cs;
            this.cd = cs.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
            this.result = new Result();
            this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
                    ((ArrayDecoder)cd).isASCIICompatible();
        }

        /** Returns the historical name of the charset if it has one, else its name. */
        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset)cs).historicalName();
            return cs.name();
        }

        /** Returns the name this decoder was originally requested under. */
        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        /** Decodes {@code ba[off, off+len)} into this decoder's shared result. */
        Result decode(byte[] ba, int off, int len) {
            if (len == 0) {
                return result.with();
            }
            // fastpath for ascii compatible
            if (isASCIICompatible && !hasNegatives(ba, off, len)) {
                return decodeLatin1(ba, off, len, result);
            }
            int en = scale(len, cd.maxCharsPerByte());
            char[] ca = new char[en];
            if (cd instanceof ArrayDecoder) {
                int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
                return result.with(ca, 0, clen);
            }
            cd.reset();
            return result.with(ca, 0, decodeLoop(cd, ba, off, len, ca));
        }
    }

    /** Decoder for ISO-8859-1: every byte maps directly to one char. */
    private static class StringDecoder8859_1 extends StringDecoder {
        StringDecoder8859_1(Charset cs, String rcn) {
            super(cs, rcn);
        }

        @Override
        Result decode(byte[] ba, int off, int len) {
            return decodeLatin1(ba, off, len, result);
        }
    }

    /**
     * Decodes using the charset named {@code charsetName}
     * ({@code "ISO-8859-1"} when {@code null}). The decoder is cached for
     * the current thread and reused while subsequent calls ask for the same
     * requested name or charset name.
     *
     * @throws UnsupportedEncodingException if the name is illegal or the
     *         charset is not supported
     */
    static Result decode(String charsetName, byte[] ba, int off, int len)
        throws UnsupportedEncodingException
    {
        StringDecoder sd = deref(decoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
                              || csn.equals(sd.charsetName()))) {
            sd = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null) {
                    if (cs == UTF_8) {
                        sd = new StringDecoderUTF8(cs, csn);
                    } else if (cs == ISO_8859_1) {
                        sd = new StringDecoder8859_1(cs, csn);
                    } else {
                        sd = new StringDecoder(cs, csn);
                    }
                }
            } catch (IllegalCharsetNameException ignored) {
                // Deliberately ignored: sd stays null, so an illegal name is
                // reported as an unsupported encoding just below.
            }
            if (sd == null)
                throw new UnsupportedEncodingException(csn);
            set(decoder, sd);
        }
        return sd.decode(ba, off, len);
    }

    /**
     * Decodes using the given charset instance. Nothing is cached; a fresh
     * {@link Result} is returned on every call.
     */
    static Result decode(Charset cs, byte[] ba, int off, int len) {
        // (1)We never cache the "external" cs, the only benefit of creating
        // an additional StringDe/Encoder object to wrap it is to share the
        // de/encode() method. These SD/E objects are short-lived, the young-gen
        // gc should be able to take care of them well. But the best approach
        // is still not to generate them if not really necessary.
        // (2)The defensive copy of the input byte/char[] has a big performance
        // impact, as well as the outgoing result byte/char[]. Need to do the
        // optimization check of (sm==null && classLoader0==null) for both.
        // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
        // is only checked (and then isTrusted gets set) when (SM==null). It is
        // possible that the SM==null for now but then SM is NOT null later
        // when safeTrim() is invoked...the "safe" way to do is to redundant
        // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
        // but it then can be argued that the SM is null when the operation
        // is started...
        if (cs == UTF_8) {
            return StringDecoderUTF8.decode(ba, off, len, new Result());
        }
        if (cs == ISO_8859_1) {
            // Needs no CharsetDecoder at all, so don't create one.
            return decodeLatin1(ba, off, len, new Result());
        }
        CharsetDecoder cd = cs.newDecoder();
        // ascii fastpath
        if ((cd instanceof ArrayDecoder) &&
            ((ArrayDecoder)cd).isASCIICompatible() &&
            !hasNegatives(ba, off, len)) {
            return decodeLatin1(ba, off, len, new Result());
        }
        if (len == 0) {
            return new Result().with();
        }
        // Hand a private copy of the input to a charset that was not loaded
        // by the boot loader when a security manager is installed.
        if (cs.getClass().getClassLoader0() != null &&
            System.getSecurityManager() != null) {
            ba = Arrays.copyOfRange(ba, off, off + len);
            off = 0;
        }
        cd.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();

        int en = scale(len, cd.maxCharsPerByte());
        char[] ca = new char[en];
        if (cd instanceof ArrayDecoder) {
            int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
            return new Result().with(ca, 0, clen);
        }
        return new Result().with(ca, 0, decodeLoop(cd, ba, off, len, ca));
    }

    /**
     * Decodes using the default charset, falling back to ISO-8859-1 (with a
     * one-time warning) if the default charset's name is not supported.
     * Exits the VM if even ISO-8859-1 is unavailable.
     */
    static Result decode(byte[] ba, int off, int len) {
        String csn = Charset.defaultCharset().name();
        try {
            // use charset name decode() variant which provides caching.
            return decode(csn, ba, off, len);
        } catch (UnsupportedEncodingException x) {
            warnUnsupportedCharset(csn);
        }
        try {
            return decode("ISO-8859-1", ba, off, len);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, err(String) is
            // the only way we will be able to get any kind of error message.
            err("ISO-8859-1 charset not available: " + x.toString() + "\n");
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }

    // -- Encoding --

    /**
     * Runs the generic buffer-based encode (encode, then flush) of the first
     * {@code len} chars of {@code ca} into {@code ba} and returns the number
     * of bytes written. The caller is responsible for resetting {@code ce}
     * and for having configured it with the REPLACE error actions.
     */
    private static int encodeLoop(CharsetEncoder ce, char[] ca, int len, byte[] ba) {
        ByteBuffer bb = ByteBuffer.wrap(ba);
        CharBuffer cb = CharBuffer.wrap(ca, 0, len);
        try {
            CoderResult cr = ce.encode(cb, bb, true);
            if (!cr.isUnderflow())
                cr.throwException();
            cr = ce.flush(bb);
            if (!cr.isUnderflow())
                cr.throwException();
        } catch (CharacterCodingException x) {
            // Substitution is always enabled,
            // so this shouldn't happen
            throw new Error(x);
        }
        return bb.position();
    }

    /**
     * A reusable encoder bound to one charset. Instances are cached per
     * thread by {@link #encode(String, byte, byte[])}.
     */
    private static class StringEncoder {
        private final Charset cs;
        private final CharsetEncoder ce;
        private final boolean isASCIICompatible;
        private final String requestedCharsetName;
        // True when the charset's class was loaded by the boot loader
        // (getClassLoader0() == null).
        private final boolean isTrusted;

        private StringEncoder(Charset cs, String rcn) {
            this.requestedCharsetName = rcn;
            this.cs = cs;
            this.ce = cs.newEncoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
            this.isTrusted = (cs.getClass().getClassLoader0() == null);
            this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
                    ((ArrayEncoder)ce).isASCIICompatible();
        }

        /** Returns the historical name of the charset if it has one, else its name. */
        String charsetName() {
            if (cs instanceof HistoricallyNamedCharset)
                return ((HistoricallyNamedCharset)cs).historicalName();
            return cs.name();
        }

        /** Returns the name this encoder was originally requested under. */
        final String requestedCharsetName() {
            return requestedCharsetName;
        }

        /**
         * Encodes the string bytes {@code val}, interpreted according to
         * {@code coder}, into a new byte array in this encoder's charset.
         */
        byte[] encode(byte coder, byte[] val) {
            // fastpath for ascii compatible
            if (coder == LATIN1 && isASCIICompatible &&
                !hasNegatives(val, 0, val.length)) {
                return Arrays.copyOf(val, val.length);
            }
            int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
            int en = scale(len, ce.maxBytesPerChar());
            byte[] ba = new byte[en];
            if (len == 0) {
                return ba;
            }
            if (ce instanceof ArrayEncoder) {
                if (!isTrusted) {
                    // Don't expose the string's own array to an untrusted encoder.
                    val = Arrays.copyOf(val, val.length);
                }
                int blen = (coder == LATIN1)
                        ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
                        : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
                // -1 means the array encoder declined; fall through to the
                // generic buffer-based path.
                if (blen != -1) {
                    return safeTrim(ba, blen, isTrusted);
                }
            }
            char[] ca = (coder == LATIN1) ? StringLatin1.toChars(val)
                                          : StringUTF16.toChars(val);
            ce.reset();
            return safeTrim(ba, encodeLoop(ce, ca, len, ba), isTrusted);
        }
    }

    /**
     * Copies chars from the UTF16 byte array {@code sa} (char index
     * {@code sp}) into {@code da} (byte index {@code dp}) until {@code len}
     * chars are copied or a char above {@code '\u00FF'} is met.
     *
     * @return the number of chars copied
     */
    @HotSpotIntrinsicCandidate
    private static int implEncodeISOArray(byte[] sa, int sp,
                                          byte[] da, int dp, int len) {
        int i = 0;
        for (; i < len; i++) {
            char c = StringUTF16.getChar(sa, sp++);
            if (c > '\u00FF')
                break;
            da[dp++] = (byte)c;
        }
        return i;
    }

    /**
     * Encodes to ISO-8859-1. LATIN1 input is copied as is. In UTF16 input
     * every char above {@code '\u00FF'} becomes {@code '?'}; a high
     * surrogate immediately followed by a low surrogate yields a single
     * {@code '?'} for the pair.
     */
    static byte[] encode8859_1(byte coder, byte[] val) {
        if (coder == LATIN1) {
            return Arrays.copyOf(val, val.length);
        }
        int len = val.length >> 1;
        byte[] dst = new byte[len];
        int dp = 0;
        int sp = 0;
        int sl = len;
        while (sp < sl) {
            int ret = implEncodeISOArray(val, sp, dst, dp, len);
            sp = sp + ret;
            dp = dp + ret;
            if (ret != len) {
                // Stopped early on an unencodable char: substitute and resume.
                char c = StringUTF16.getChar(val, sp++);
                if (Character.isHighSurrogate(c) && sp < sl &&
                    Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
                    sp++;
                }
                dst[dp++] = '?';
                len = sl - sp;
            }
        }
        if (dp == dst.length) {
            return dst;
        }
        return Arrays.copyOf(dst, dp);
    }

    /**
     * Encodes to US-ASCII. Anything outside the 7-bit range becomes
     * {@code '?'}; in UTF16 input a high surrogate immediately followed by a
     * low surrogate yields a single {@code '?'} for the pair.
     */
    static byte[] encodeASCII(byte coder, byte[] val) {
        if (coder == LATIN1) {
            byte[] dst = new byte[val.length];
            for (int i = 0; i < val.length; i++) {
                if (val[i] < 0) {
                    dst[i] = '?';
                } else {
                    dst[i] = val[i];
                }
            }
            return dst;
        }
        int len = val.length >> 1;
        byte[] dst = new byte[len];
        int dp = 0;
        for (int i = 0; i < len; i++) {
            char c = StringUTF16.getChar(val, i);
            if (c < 0x80) {
                dst[dp++] = (byte)c;
                continue;
            }
            if (Character.isHighSurrogate(c) && i + 1 < len &&
                Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
                i++;
            }
            dst[dp++] = '?';
        }
        if (len == dp) {
            return dst;
        }
        return Arrays.copyOf(dst, dp);
    }

    /**
     * Encodes to UTF-8. A valid surrogate pair is written as one four-byte
     * sequence; a surrogate that is not part of such a pair is written as
     * {@code '?'}.
     */
    static byte[] encodeUTF8(byte coder, byte[] val) {
        int dp = 0;
        byte[] dst;
        if (coder == LATIN1) {
            // At most two bytes per Latin-1 char.
            dst = new byte[val.length << 1];
            for (int sp = 0; sp < val.length; sp++) {
                byte c = val[sp];
                if (c < 0) {
                    dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
                } else {
                    dst[dp++] = c;
                }
            }
        } else {
            int sp = 0;
            int sl = val.length >> 1;
            // At most three bytes per UTF-16 char (a pair takes four for two).
            dst = new byte[sl * 3];
            char c;
            while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
                // ascii fast loop;
                dst[dp++] = (byte)c;
                sp++;
            }
            while (sp < sl) {
                c = StringUTF16.getChar(val, sp++);
                if (c < 0x80) {
                    dst[dp++] = (byte)c;
                } else if (c < 0x800) {
                    dst[dp++] = (byte)(0xc0 | (c >> 6));
                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
                } else if (Character.isSurrogate(c)) {
                    int uc = -1;
                    char c2;
                    if (Character.isHighSurrogate(c) && sp < sl &&
                        Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
                        uc = Character.toCodePoint(c, c2);
                    }
                    if (uc < 0) {
                        dst[dp++] = '?';
                    } else {
                        dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
                        dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
                        dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
                        dst[dp++] = (byte)(0x80 | (uc & 0x3f));
                        sp++;  // 2 chars
                    }
                } else {
                    // 3 bytes, 16 bits
                    dst[dp++] = (byte)(0xe0 | ((c >> 12)));
                    dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
                }
            }
        }
        if (dp == dst.length) {
            return dst;
        }
        return Arrays.copyOf(dst, dp);
    }

    /**
     * Encodes using the charset named {@code charsetName}
     * ({@code "ISO-8859-1"} when {@code null}). UTF-8, ISO-8859-1 and
     * US-ASCII are handled by the dedicated static encoders without caching
     * anything; for any other charset the encoder is cached for the current
     * thread.
     *
     * @throws UnsupportedEncodingException if the name is illegal or the
     *         charset is not supported
     */
    static byte[] encode(String charsetName, byte coder, byte[] val)
        throws UnsupportedEncodingException
    {
        StringEncoder se = deref(encoder);
        String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
        if ((se == null) || !(csn.equals(se.requestedCharsetName())
                              || csn.equals(se.charsetName()))) {
            se = null;
            try {
                Charset cs = lookupCharset(csn);
                if (cs != null) {
                    if (cs == UTF_8) {
                        return encodeUTF8(coder, val);
                    } else if (cs == ISO_8859_1) {
                        return encode8859_1(coder, val);
                    } else if (cs == US_ASCII) {
                        return encodeASCII(coder, val);
                    }
                    se = new StringEncoder(cs, csn);
                }
            } catch (IllegalCharsetNameException ignored) {
                // Deliberately ignored: se stays null, so an illegal name is
                // reported as an unsupported encoding just below.
            }
            if (se == null) {
                throw new UnsupportedEncodingException (csn);
            }
            set(encoder, se);
        }
        return se.encode(coder, val);
    }

    /**
     * Encodes using the given charset instance. Nothing is cached.
     */
    static byte[] encode(Charset cs, byte coder, byte[] val) {
        if (cs == UTF_8) {
            return encodeUTF8(coder, val);
        } else if (cs == ISO_8859_1) {
            return encode8859_1(coder, val);
        } else if (cs == US_ASCII) {
            return encodeASCII(coder, val);
        }
        CharsetEncoder ce = cs.newEncoder();
        // fastpath for ascii compatible
        if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
                                 ((ArrayEncoder)ce).isASCIICompatible() &&
                                 !hasNegatives(val, 0, val.length)))) {
            return Arrays.copyOf(val, val.length);
        }
        int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
        int en = scale(len, ce.maxBytesPerChar());
        byte[] ba = new byte[en];
        if (len == 0) {
            return ba;
        }
        boolean isTrusted = cs.getClass().getClassLoader0() == null ||
                            System.getSecurityManager() == null;
        ce.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();
        if (ce instanceof ArrayEncoder) {
            if (!isTrusted) {
                // Don't expose the string's own array to an untrusted encoder.
                val = Arrays.copyOf(val, val.length);
            }
            int blen = (coder == LATIN1)
                    ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
                    : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
            // -1 means the array encoder declined; fall through to the
            // generic buffer-based path.
            if (blen != -1) {
                return safeTrim(ba, blen, isTrusted);
            }
        }
        char[] ca = (coder == LATIN1) ? StringLatin1.toChars(val)
                                      : StringUTF16.toChars(val);
        return safeTrim(ba, encodeLoop(ce, ca, len, ba), isTrusted);
    }

    /**
     * Encodes using the default charset, falling back to ISO-8859-1 (with a
     * one-time warning) if the default charset's name is not supported.
     * Exits the VM if even ISO-8859-1 is unavailable.
     */
    static byte[] encode(byte coder, byte[] val) {
        String csn = Charset.defaultCharset().name();
        try {
            // use charset name encode() variant which provides caching.
            return encode(csn, coder, val);
        } catch (UnsupportedEncodingException x) {
            warnUnsupportedCharset(csn);
        }
        try {
            return encode("ISO-8859-1", coder, val);
        } catch (UnsupportedEncodingException x) {
            // If this code is hit during VM initialization, err(String) is
            // the only way we will be able to get any kind of error message.
            err("ISO-8859-1 charset not available: " + x.toString() + "\n");
            // If we can not find ISO-8859-1 (a required encoding) then things
            // are seriously wrong with the installation.
            System.exit(1);
            return null;
        }
    }

    /**
     *  Print a message directly to stderr, bypassing all character conversion
     *  methods.
     *  @param msg  message to print
     */
    private static native void err(String msg);
}