--- old/src/java.base/share/classes/java/lang/StringCoding.java 2015-10-30 14:03:41.874341566 -0700 +++ new/src/java.base/share/classes/java/lang/StringCoding.java 2015-10-30 14:03:41.633342476 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -38,11 +38,19 @@ import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; import java.util.Arrays; +import jdk.internal.HotSpotIntrinsicCandidate; import sun.misc.MessageUtils; import sun.nio.cs.HistoricallyNamedCharset; import sun.nio.cs.ArrayDecoder; import sun.nio.cs.ArrayEncoder; +import static java.lang.String.LATIN1; +import static java.lang.String.UTF16; +import static java.lang.String.COMPACT_STRINGS; +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.nio.charset.StandardCharsets.UTF_8; + /** * Utility class for string encoding and decoding. */ @@ -72,23 +80,13 @@ // Trim the given byte array to the given length // - private static byte[] safeTrim(byte[] ba, int len, Charset cs, boolean isTrusted) { + private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) return ba; else return Arrays.copyOf(ba, len); } - // Trim the given char array to the given length - // - private static char[] safeTrim(char[] ca, int len, - Charset cs, boolean isTrusted) { - if (len == ca.length && (isTrusted || System.getSecurityManager() == null)) - return ca; - else - return Arrays.copyOf(ca, len); - } - private static int scale(int len, float expansionFactor) { // We need to perform double, not float, arithmetic; otherwise // we lose low order bits when len is larger than 2**24. @@ -117,21 +115,64 @@ } } + static class Result { + byte[] value; + byte coder; + + Result with() { + coder = COMPACT_STRINGS ? LATIN1 : UTF16; + value = new byte[0]; + return this; + } + + Result with(char[] val, int off, int len) { + if (String.COMPACT_STRINGS) { + byte[] bs = StringUTF16.compress(val, off, len); + if (bs != null) { + value = bs; + coder = LATIN1; + return this; + } + } + coder = UTF16; + value = StringUTF16.toBytes(val, off, len); + return this; + } + + Result with(byte[] val, byte coder) { + this.coder = coder; + value = val; + return this; + } + } + + @HotSpotIntrinsicCandidate + private static boolean hasNegatives(byte[] ba, int off, int len) { + for (int i = off; i < off + len; i++) { + if (ba[i] < 0) { + return true; + } + } + return false; + } // -- Decoding -- - private static class StringDecoder { + static class StringDecoder { private final String requestedCharsetName; private final Charset cs; + private final boolean isASCIICompatible; private final CharsetDecoder cd; - private final boolean isTrusted; + protected final Result result; - private StringDecoder(Charset cs, String rcn) { + StringDecoder(Charset cs, String rcn) { this.requestedCharsetName = rcn; this.cs = cs; this.cd = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); - this.isTrusted = (cs.getClass().getClassLoader0() == null); + this.result = new Result(); + this.isASCIICompatible = (cd instanceof ArrayDecoder) && + ((ArrayDecoder)cd).isASCIICompatible(); } String charsetName() { @@ -144,36 +185,58 @@ return requestedCharsetName; } - char[] decode(byte[] ba, int off, int len) { + Result decode(byte[] ba, int off, int len) { + if (len == 0) { + return result.with(); + } + // fastpath for ascii compatible + if (isASCIICompatible && !hasNegatives(ba, off, len)) { + if (COMPACT_STRINGS) { + return result.with(Arrays.copyOfRange(ba, off, off + len), + LATIN1); + } else { + return result.with(StringLatin1.inflate(ba, off, len), UTF16); + } + } int en = scale(len, cd.maxCharsPerByte()); char[] ca = new char[en]; - if (len == 0) - return ca; if (cd instanceof ArrayDecoder) { int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); - return safeTrim(ca, clen, cs, isTrusted); + return result.with(ca, 0, clen); + } + cd.reset(); + ByteBuffer bb = ByteBuffer.wrap(ba, off, len); + CharBuffer cb = CharBuffer.wrap(ca); + try { + CoderResult cr = cd.decode(bb, cb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = cd.flush(cb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + // Substitution is always enabled, + // so this shouldn't happen + throw new Error(x); + } + return result.with(ca, 0, cb.position()); + } + } + + private static class StringDecoder8859_1 extends StringDecoder { + StringDecoder8859_1(Charset cs, String rcn) { + super(cs, rcn); + } + Result decode(byte[] ba, int off, int len) { + if (COMPACT_STRINGS) { + return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); } else { - cd.reset(); - ByteBuffer bb = ByteBuffer.wrap(ba, off, len); - CharBuffer cb = CharBuffer.wrap(ca); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } - return safeTrim(ca, cb.position(), cs, isTrusted); + return result.with(StringLatin1.inflate(ba, off, len), UTF16); } } } - static char[] decode(String charsetName, byte[] ba, int off, int len) + static Result decode(String charsetName, byte[] ba, int off, int len) throws UnsupportedEncodingException { StringDecoder sd = deref(decoder); @@ -183,8 +246,15 @@ sd = null; try { Charset cs = lookupCharset(csn); - if (cs != null) - sd = new StringDecoder(cs, csn); + if (cs != null) { + if (cs == UTF_8) { + sd = new StringDecoderUTF8(cs, csn); + } else if (cs == ISO_8859_1) { + sd = new StringDecoder8859_1(cs, csn); + } else { + sd = new StringDecoder(cs, csn); + } + } } catch (IllegalCharsetNameException x) {} if (sd == null) throw new UnsupportedEncodingException(csn); @@ -193,7 +263,7 @@ return sd.decode(ba, off, len); } - static char[] decode(Charset cs, byte[] ba, int off, int len) { + static Result decode(Charset cs, byte[] ba, int off, int len) { // (1)We never cache the "external" cs, the only benefit of creating // an additional StringDe/Encoder object to wrap it is to share the // de/encode() method. These SD/E objects are short-lived, the young-gen @@ -210,44 +280,57 @@ // check (... && (isTrusted || SM == null || getClassLoader0())) in trim // but it then can be argued that the SM is null when the operation // is started... + if (cs == UTF_8) { + return StringDecoderUTF8.decode(ba, off, len, new Result()); + } CharsetDecoder cd = cs.newDecoder(); + // ascii fastpath + if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) && + ((ArrayDecoder)cd).isASCIICompatible() && + !hasNegatives(ba, off, len))) { + if (COMPACT_STRINGS) { + return new Result().with(Arrays.copyOfRange(ba, off, off + len), + LATIN1); + } else { + return new Result().with(StringLatin1.inflate(ba, off, len), UTF16); + } + } int en = scale(len, cd.maxCharsPerByte()); - char[] ca = new char[en]; - if (len == 0) - return ca; - boolean isTrusted = false; - if (System.getSecurityManager() != null) { - if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) { - ba = Arrays.copyOfRange(ba, off, off + len); - off = 0; - } + if (len == 0) { + return new Result().with(); + } + if (System.getSecurityManager() != null && + cs.getClass().getClassLoader0() != null) { + ba = Arrays.copyOfRange(ba, off, off + len); + off = 0; } cd.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); + + char[] ca = new char[en]; if (cd instanceof ArrayDecoder) { int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); - return safeTrim(ca, clen, cs, isTrusted); - } else { - ByteBuffer bb = ByteBuffer.wrap(ba, off, len); - CharBuffer cb = CharBuffer.wrap(ca); - try { - CoderResult cr = cd.decode(bb, cb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = cd.flush(cb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); - } - return safeTrim(ca, cb.position(), cs, isTrusted); + return new Result().with(ca, 0, clen); } + ByteBuffer bb = ByteBuffer.wrap(ba, off, len); + CharBuffer cb = CharBuffer.wrap(ca); + try { + CoderResult cr = cd.decode(bb, cb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = cd.flush(cb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + // Substitution is always enabled, + // so this shouldn't happen + throw new Error(x); + } + return new Result().with(ca, 0, cb.position()); } - static char[] decode(byte[] ba, int off, int len) { + static Result decode(byte[] ba, int off, int len) { String csn = Charset.defaultCharset().name(); try { // use charset name decode() variant which provides caching. @@ -273,6 +356,7 @@ private static class StringEncoder { private Charset cs; private CharsetEncoder ce; + private final boolean isASCIICompatible; private final String requestedCharsetName; private final boolean isTrusted; @@ -283,6 +367,8 @@ .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); this.isTrusted = (cs.getClass().getClassLoader0() == null); + this.isASCIICompatible = (ce instanceof ArrayEncoder) && + ((ArrayEncoder)ce).isASCIICompatible(); } String charsetName() { @@ -295,36 +381,186 @@ return requestedCharsetName; } - byte[] encode(char[] ca, int off, int len) { + byte[] encode(byte coder, byte[] val) { + // fastpath for ascii compatible + if (coder == LATIN1 && isASCIICompatible && + !hasNegatives(val, 0, val.length)) { + return Arrays.copyOf(val, val.length); + } + int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); byte[] ba = new byte[en]; - if (len == 0) + if (len == 0) { return ba; + } if (ce instanceof ArrayEncoder) { - int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba); - return safeTrim(ba, blen, cs, isTrusted); - } else { - ce.reset(); - ByteBuffer bb = ByteBuffer.wrap(ba); - CharBuffer cb = CharBuffer.wrap(ca, off, len); - try { - CoderResult cr = ce.encode(cb, bb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = ce.flush(bb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - // Substitution is always enabled, - // so this shouldn't happen - throw new Error(x); + if (!isTrusted) { + val = Arrays.copyOf(val, val.length); + } + int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) + : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); + if (blen != -1) { + return safeTrim(ba, blen, isTrusted); + } + } + char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) + : StringUTF16.toChars(val); + ce.reset(); + ByteBuffer bb = ByteBuffer.wrap(ba); + CharBuffer cb = CharBuffer.wrap(ca, 0, len); + try { + CoderResult cr = ce.encode(cb, bb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = ce.flush(bb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + // Substitution is always enabled, + // so this shouldn't happen + throw new Error(x); + } + return safeTrim(ba, bb.position(), isTrusted); + } + } + + @HotSpotIntrinsicCandidate + private static int implEncodeISOArray(byte[] sa, int sp, + byte[] da, int dp, int len) { + int i = 0; + for (; i < len; i++) { + char c = StringUTF16.getChar(sa, sp++); + if (c > '\u00FF') + break; + da[dp++] = (byte)c; + } + return i; + } + + static byte[] encode8859_1(byte coder, byte[] val) { + if (coder == LATIN1) { + return Arrays.copyOf(val, val.length); + } + int len = val.length >> 1; + byte[] dst = new byte[len]; + int dp = 0; + int sp = 0; + int sl = len; + while (sp < sl) { + int ret = implEncodeISOArray(val, sp, dst, dp, len); + sp = sp + ret; + dp = dp + ret; + if (ret != len) { + char c = StringUTF16.getChar(val, sp++); + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { + sp++; } - return safeTrim(ba, bb.position(), cs, isTrusted); + dst[dp++] = '?'; + len = sl - sp; } } + if (dp == dst.length) { + return dst; + } + return Arrays.copyOf(dst, dp); } - static byte[] encode(String charsetName, char[] ca, int off, int len) + static byte[] encodeASCII(byte coder, byte[] val) { + if (coder == LATIN1) { + byte[] dst = new byte[val.length]; + for (int i = 0; i < val.length; i++) { + if (val[i] < 0) { + dst[i] = '?'; + } else { + dst[i] = val[i]; + } + } + return dst; + } + int len = val.length >> 1; + byte[] dst = new byte[len]; + int dp = 0; + for (int i = 0; i < len; i++) { + char c = StringUTF16.getChar(val, i); + if (c < 0x80) { + dst[dp++] = (byte)c; + continue; + } + if (Character.isHighSurrogate(c) && i + 1 < len && + Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { + i++; + } + dst[dp++] = '?'; + } + if (len == dp) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + + static byte[] encodeUTF8(byte coder, byte[] val) { + int dp = 0; + byte[] dst; + if (coder == LATIN1) { + dst = new byte[val.length << 1]; + for (int sp = 0; sp < val.length; sp++) { + byte c = val[sp]; + if (c < 0) { + dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); + dst[dp++] = (byte)(0x80 | (c & 0x3f)); + } else { + dst[dp++] = c; + } + } + } else { + int sp = 0; + int sl = val.length >> 1; + dst = new byte[sl * 3]; + char c; + while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { + // ascii fast loop; + dst[dp++] = (byte)c; + sp++; + } + while (sp < sl) { + c = StringUTF16.getChar(val, sp++); + if (c < 0x80) { + dst[dp++] = (byte)c; + } else if (c < 0x800) { + dst[dp++] = (byte)(0xc0 | (c >> 6)); + dst[dp++] = (byte)(0x80 | (c & 0x3f)); + } else if (Character.isSurrogate(c)) { + int uc = -1; + char c2; + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { + uc = Character.toCodePoint(c, c2); + } + if (uc < 0) { + dst[dp++] = '?'; + } else { + dst[dp++] = (byte)(0xf0 | ((uc >> 18))); + dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); + dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); + dst[dp++] = (byte)(0x80 | (uc & 0x3f)); + sp++; // 2 chars + } + } else { + // 3 bytes, 16 bits + dst[dp++] = (byte)(0xe0 | ((c >> 12))); + dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); + dst[dp++] = (byte)(0x80 | (c & 0x3f)); + } + } + } + if (dp == dst.length) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + + static byte[] encode(String charsetName, byte coder, byte[] val) throws UnsupportedEncodingException { StringEncoder se = deref(encoder); @@ -334,62 +570,88 @@ se = null; try { Charset cs = lookupCharset(csn); - if (cs != null) + if (cs != null) { + if (cs == UTF_8) { + return encodeUTF8(coder, val); + } else if (cs == ISO_8859_1) { + return encode8859_1(coder, val); + } else if (cs == US_ASCII) { + return encodeASCII(coder, val); + } se = new StringEncoder(cs, csn); + } } catch (IllegalCharsetNameException x) {} - if (se == null) + if (se == null) { throw new UnsupportedEncodingException (csn); + } set(encoder, se); } - return se.encode(ca, off, len); + return se.encode(coder, val); } - static byte[] encode(Charset cs, char[] ca, int off, int len) { + static byte[] encode(Charset cs, byte coder, byte[] val) { + if (cs == UTF_8) { + return encodeUTF8(coder, val); + } else if (cs == ISO_8859_1) { + return encode8859_1(coder, val); + } else if (cs == US_ASCII) { + return encodeASCII(coder, val); + } CharsetEncoder ce = cs.newEncoder(); + // fastpath for ascii compatible + if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && + ((ArrayEncoder)ce).isASCIICompatible() && + !hasNegatives(val, 0, val.length)))) { + return Arrays.copyOf(val, val.length); + } + int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); byte[] ba = new byte[en]; - if (len == 0) + if (len == 0) { return ba; - boolean isTrusted = false; - if (System.getSecurityManager() != null) { - if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) { - ca = Arrays.copyOfRange(ca, off, off + len); - off = 0; - } } + boolean isTrusted = System.getSecurityManager() == null || + cs.getClass().getClassLoader0() == null; ce.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); if (ce instanceof ArrayEncoder) { - int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba); - return safeTrim(ba, blen, cs, isTrusted); - } else { - ByteBuffer bb = ByteBuffer.wrap(ba); - CharBuffer cb = CharBuffer.wrap(ca, off, len); - try { - CoderResult cr = ce.encode(cb, bb, true); - if (!cr.isUnderflow()) - cr.throwException(); - cr = ce.flush(bb); - if (!cr.isUnderflow()) - cr.throwException(); - } catch (CharacterCodingException x) { - throw new Error(x); + if (!isTrusted) { + val = Arrays.copyOf(val, val.length); } - return safeTrim(ba, bb.position(), cs, isTrusted); + int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) + : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); + if (blen != -1) { + return safeTrim(ba, blen, isTrusted); + } + } + char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) + : StringUTF16.toChars(val); + ByteBuffer bb = ByteBuffer.wrap(ba); + CharBuffer cb = CharBuffer.wrap(ca, 0, len); + try { + CoderResult cr = ce.encode(cb, bb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = ce.flush(bb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + throw new Error(x); } + return safeTrim(ba, bb.position(), isTrusted); } - static byte[] encode(char[] ca, int off, int len) { + static byte[] encode(byte coder, byte[] val) { String csn = Charset.defaultCharset().name(); try { // use charset name encode() variant which provides caching. - return encode(csn, ca, off, len); + return encode(csn, coder, val); } catch (UnsupportedEncodingException x) { warnUnsupportedCharset(csn); } try { - return encode("ISO-8859-1", ca, off, len); + return encode("ISO-8859-1", coder, val); } catch (UnsupportedEncodingException x) { // If this code is hit during VM initialization, MessageUtils is // the only way we will be able to get any kind of error message.