src/java.base/share/classes/java/lang/StringCoding.java

Print this page

        

*** 1,7 **** /* ! * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this --- 1,7 ---- /* ! * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this
*** 45,54 **** --- 45,59 ---- import sun.nio.cs.StandardCharsets; import static java.lang.String.LATIN1; import static java.lang.String.UTF16; import static java.lang.String.COMPACT_STRINGS; + import static java.lang.Character.isSurrogate; + import static java.lang.Character.highSurrogate; + import static java.lang.Character.lowSurrogate; + import static java.lang.Character.isSupplementaryCodePoint; + import static java.lang.StringUTF16.putChar; /** * Utility class for string encoding and decoding. */
*** 64,75 **** private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; - private static boolean warnUnsupportedCharset = true; - private static <T> T deref(ThreadLocal<SoftReference<T>> tl) { SoftReference<T> sr = tl.get(); if (sr == null) return null; return sr.get(); --- 69,78 ----
*** 78,88 **** private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) { tl.set(new SoftReference<>(ob)); } // Trim the given byte array to the given length - // private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) return ba; else return Arrays.copyOf(ba, len); --- 81,90 ----
*** 103,123 **** } } return null; } - private static void warnUnsupportedCharset(String csn) { - if (warnUnsupportedCharset) { - // Use err(String) rather than the Logging API or System.err - // since this method may be called during VM initialization - // before either is available. - err("WARNING: Default charset " + csn + - " not supported, using ISO-8859-1 instead\n"); - warnUnsupportedCharset = false; - } - } - static class Result { byte[] value; byte coder; Result with() { --- 105,114 ----
*** 222,244 **** } return result.with(ca, 0, cb.position()); } } - private static class StringDecoder8859_1 extends StringDecoder { - StringDecoder8859_1(Charset cs, String rcn) { - super(cs, rcn); - } - Result decode(byte[] ba, int off, int len) { - if (COMPACT_STRINGS) { - return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); - } else { - return result.with(StringLatin1.inflate(ba, off, len), UTF16); - } - } - } - static Result decode(String charsetName, byte[] ba, int off, int len) throws UnsupportedEncodingException { StringDecoder sd = deref(decoder); String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; --- 213,222 ----
*** 247,272 **** sd = null; try { Charset cs = lookupCharset(csn); if (cs != null) { if (cs == UTF_8) { ! sd = new StringDecoderUTF8(cs, csn); ! } else if (cs == ISO_8859_1) { ! sd = new StringDecoder8859_1(cs, csn); ! } else { ! sd = new StringDecoder(cs, csn); } } } catch (IllegalCharsetNameException x) {} if (sd == null) throw new UnsupportedEncodingException(csn); set(decoder, sd); } return sd.decode(ba, off, len); } static Result decode(Charset cs, byte[] ba, int off, int len) { // (1)We never cache the "external" cs, the only benefit of creating // an additional StringDe/Encoder object to wrap it is to share the // de/encode() method. These SD/E objects are short-lived, the young-gen // gc should be able to take care of them well. But the best approach // is still not to generate them if not really necessary. --- 225,263 ---- sd = null; try { Charset cs = lookupCharset(csn); if (cs != null) { if (cs == UTF_8) { ! return decodeUTF8(ba, off, len, true); ! } ! if (cs == ISO_8859_1) { ! return decodeLatin1(ba, off, len); ! } ! if (cs == US_ASCII) { ! return decodeASCII(ba, off, len); } + sd = new StringDecoder(cs, csn); } } catch (IllegalCharsetNameException x) {} if (sd == null) throw new UnsupportedEncodingException(csn); set(decoder, sd); } return sd.decode(ba, off, len); } static Result decode(Charset cs, byte[] ba, int off, int len) { + if (cs == UTF_8) { + return decodeUTF8(ba, off, len, true); + } + if (cs == ISO_8859_1) { + return decodeLatin1(ba, off, len); + } + if (cs == US_ASCII) { + return decodeASCII(ba, off, len); + } + // (1)We never cache the "external" cs, the only benefit of creating // an additional StringDe/Encoder object to wrap it is to share the // de/encode() method. These SD/E objects are short-lived, the young-gen // gc should be able to take care of them well. But the best approach // is still not to generate them if not really necessary.
*** 278,320 **** // possible that the SM==null for now but then SM is NOT null later // when safeTrim() is invoked...the "safe" way to do is to redundant // check (... && (isTrusted || SM == null || getClassLoader0())) in trim // but it then can be argued that the SM is null when the operation // is started... - if (cs == UTF_8) { - return StringDecoderUTF8.decode(ba, off, len, new Result()); - } CharsetDecoder cd = cs.newDecoder(); // ascii fastpath ! if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) && ! ((ArrayDecoder)cd).isASCIICompatible() && ! !hasNegatives(ba, off, len))) { ! if (COMPACT_STRINGS) { ! return new Result().with(Arrays.copyOfRange(ba, off, off + len), ! LATIN1); ! } else { ! return new Result().with(StringLatin1.inflate(ba, off, len), UTF16); ! } } int en = scale(len, cd.maxCharsPerByte()); if (len == 0) { return new Result().with(); } - if (cs.getClass().getClassLoader0() != null && - System.getSecurityManager() != null) { - ba = Arrays.copyOfRange(ba, off, off + len); - off = 0; - } cd.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); - char[] ca = new char[en]; if (cd instanceof ArrayDecoder) { int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); return new Result().with(ca, 0, clen); } ByteBuffer bb = ByteBuffer.wrap(ba, off, len); CharBuffer cb = CharBuffer.wrap(ca); try { CoderResult cr = cd.decode(bb, cb, true); if (!cr.isUnderflow()) --- 269,301 ---- // possible that the SM==null for now but then SM is NOT null later // when safeTrim() is invoked...the "safe" way to do is to redundant // check (... && (isTrusted || SM == null || getClassLoader0())) in trim // but it then can be argued that the SM is null when the operation // is started... CharsetDecoder cd = cs.newDecoder(); // ascii fastpath ! if ((cd instanceof ArrayDecoder) && ! ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) { ! 
return decodeLatin1(ba, off, len); } int en = scale(len, cd.maxCharsPerByte()); if (len == 0) { return new Result().with(); } cd.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); char[] ca = new char[en]; if (cd instanceof ArrayDecoder) { int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); return new Result().with(ca, 0, clen); } + if (cs.getClass().getClassLoader0() != null && + System.getSecurityManager() != null) { + ba = Arrays.copyOfRange(ba, off, off + len); + off = 0; + } ByteBuffer bb = ByteBuffer.wrap(ba, off, len); CharBuffer cb = CharBuffer.wrap(ca); try { CoderResult cr = cd.decode(bb, cb, true); if (!cr.isUnderflow())
*** 329,356 **** } return new Result().with(ca, 0, cb.position()); } static Result decode(byte[] ba, int off, int len) { ! String csn = Charset.defaultCharset().name(); ! try { ! // use charset name decode() variant which provides caching. ! return decode(csn, ba, off, len); ! } catch (UnsupportedEncodingException x) { ! warnUnsupportedCharset(csn); } ! try { ! return decode("ISO-8859-1", ba, off, len); ! } catch (UnsupportedEncodingException x) { ! // If this code is hit during VM initialization, err(String) is ! // the only way we will be able to get any kind of error message. ! err("ISO-8859-1 charset not available: " + x.toString() + "\n"); ! // If we can not find ISO-8859-1 (a required encoding) then things ! // are seriously wrong with the installation. ! System.exit(1); ! return null; } } // -- Encoding -- private static class StringEncoder { private Charset cs; --- 310,335 ---- } return new Result().with(ca, 0, cb.position()); } static Result decode(byte[] ba, int off, int len) { ! Charset cs = Charset.defaultCharset(); ! if (cs == UTF_8) { ! return decodeUTF8(ba, off, len, true); } ! if (cs == ISO_8859_1) { ! return decodeLatin1(ba, off, len); ! } ! if (cs == US_ASCII) { ! return decodeASCII(ba, off, len); ! } ! StringDecoder sd = deref(decoder); ! if (sd == null || !cs.name().equals(sd.cs.name())) { ! sd = new StringDecoder(cs, cs.name()); ! set(decoder, sd); } + return sd.decode(ba, off, len); } // -- Encoding -- private static class StringEncoder { private Charset cs;
*** 391,403 **** byte[] ba = new byte[en]; if (len == 0) { return ba; } if (ce instanceof ArrayEncoder) { - if (!isTrusted) { - val = Arrays.copyOf(val, val.length); - } int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); if (blen != -1) { return safeTrim(ba, blen, isTrusted); } --- 370,379 ----
*** 421,430 **** --- 397,583 ---- } return safeTrim(ba, bb.position(), isTrusted); } } + static byte[] encode(String charsetName, byte coder, byte[] val) + throws UnsupportedEncodingException + { + StringEncoder se = deref(encoder); + String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; + if ((se == null) || !(csn.equals(se.requestedCharsetName()) + || csn.equals(se.charsetName()))) { + se = null; + try { + Charset cs = lookupCharset(csn); + if (cs != null) { + if (cs == UTF_8) { + return encodeUTF8(coder, val, true); + } + if (cs == ISO_8859_1) { + return encode8859_1(coder, val); + } + if (cs == US_ASCII) { + return encodeASCII(coder, val); + } + se = new StringEncoder(cs, csn); + } + } catch (IllegalCharsetNameException x) {} + if (se == null) { + throw new UnsupportedEncodingException (csn); + } + set(encoder, se); + } + return se.encode(coder, val); + } + + static byte[] encode(Charset cs, byte coder, byte[] val) { + if (cs == UTF_8) { + return encodeUTF8(coder, val, true); + } + if (cs == ISO_8859_1) { + return encode8859_1(coder, val); + } + if (cs == US_ASCII) { + return encodeASCII(coder, val); + } + CharsetEncoder ce = cs.newEncoder(); + // fastpath for ascii compatible + if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && + ((ArrayEncoder)ce).isASCIICompatible() && + !hasNegatives(val, 0, val.length)))) { + return Arrays.copyOf(val, val.length); + } + int len = val.length >> coder; // assume LATIN1=0/UTF16=1; + int en = scale(len, ce.maxBytesPerChar()); + byte[] ba = new byte[en]; + if (len == 0) { + return ba; + } + ce.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE) + .reset(); + if (ce instanceof ArrayEncoder) { + int blen = (coder == LATIN1 ) ? 
((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) + : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); + if (blen != -1) { + return safeTrim(ba, blen, true); + } + } + boolean isTrusted = cs.getClass().getClassLoader0() == null || + System.getSecurityManager() == null; + char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) + : StringUTF16.toChars(val); + ByteBuffer bb = ByteBuffer.wrap(ba); + CharBuffer cb = CharBuffer.wrap(ca, 0, len); + try { + CoderResult cr = ce.encode(cb, bb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = ce.flush(bb); + if (!cr.isUnderflow()) + cr.throwException(); + } catch (CharacterCodingException x) { + throw new Error(x); + } + return safeTrim(ba, bb.position(), isTrusted); + } + + static byte[] encode(byte coder, byte[] val) { + Charset cs = Charset.defaultCharset(); + if (cs == UTF_8) { + return encodeUTF8(coder, val, true); + } + if (cs == ISO_8859_1) { + return encode8859_1(coder, val); + } + if (cs == US_ASCII) { + return encodeASCII(coder, val); + } + StringEncoder se = deref(encoder); + if (se == null || !cs.name().equals(se.cs.name())) { + se = new StringEncoder(cs, cs.name()); + set(encoder, se); + } + return se.encode(coder, val); + } + + /** + * Print a message directly to stderr, bypassing all character conversion + * methods. 
+ * @param msg message to print + */ + private static native void err(String msg); + + /* The cached Result for each thread */ + private static final ThreadLocal<StringCoding.Result> + resultCached = new ThreadLocal<>() { + protected StringCoding.Result initialValue() { + return new StringCoding.Result(); + }}; + + ////////////////////////// ascii ////////////////////////////// + + private static Result decodeASCII(byte[] ba, int off, int len) { + Result result = resultCached.get(); + if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) { + return result.with(Arrays.copyOfRange(ba, off, off + len), + LATIN1); + } + byte[] dst = new byte[len<<1]; + int dp = 0; + while (dp < len) { + int b = ba[off++]; + putChar(dst, dp++, (b >= 0) ? (char)b : repl); + } + return result.with(dst, UTF16); + } + + private static byte[] encodeASCII(byte coder, byte[] val) { + if (coder == LATIN1) { + byte[] dst = new byte[val.length]; + for (int i = 0; i < val.length; i++) { + if (val[i] < 0) { + dst[i] = '?'; + } else { + dst[i] = val[i]; + } + } + return dst; + } + int len = val.length >> 1; + byte[] dst = new byte[len]; + int dp = 0; + for (int i = 0; i < len; i++) { + char c = StringUTF16.getChar(val, i); + if (c < 0x80) { + dst[dp++] = (byte)c; + continue; + } + if (Character.isHighSurrogate(c) && i + 1 < len && + Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { + i++; + } + dst[dp++] = '?'; + } + if (len == dp) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + + ////////////////////////// latin1/8859_1 /////////////////////////// + + private static Result decodeLatin1(byte[] ba, int off, int len) { + Result result = resultCached.get(); + if (COMPACT_STRINGS) { + return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); + } else { + return result.with(StringLatin1.inflate(ba, off, len), UTF16); + } + } + @HotSpotIntrinsicCandidate private static int implEncodeISOArray(byte[] sa, int sp, byte[] da, int dp, int len) { int i = 0; for (; i < len; i++) {
*** 434,444 **** da[dp++] = (byte)c; } return i; } ! static byte[] encode8859_1(byte coder, byte[] val) { if (coder == LATIN1) { return Arrays.copyOf(val, val.length); } int len = val.length >> 1; byte[] dst = new byte[len]; --- 587,597 ---- da[dp++] = (byte)c; } return i; } ! private static byte[] encode8859_1(byte coder, byte[] val) { if (coder == LATIN1) { return Arrays.copyOf(val, val.length); } int len = val.length >> 1; byte[] dst = new byte[len];
*** 463,523 **** return dst; } return Arrays.copyOf(dst, dp); } ! static byte[] encodeASCII(byte coder, byte[] val) { ! if (coder == LATIN1) { ! byte[] dst = new byte[val.length]; ! for (int i = 0; i < val.length; i++) { ! if (val[i] < 0) { ! dst[i] = '?'; ! } else { ! dst[i] = val[i]; } } ! return dst; } ! int len = val.length >> 1; ! byte[] dst = new byte[len]; int dp = 0; ! for (int i = 0; i < len; i++) { ! char c = StringUTF16.getChar(val, i); ! if (c < 0x80) { ! dst[dp++] = (byte)c; continue; } ! if (Character.isHighSurrogate(c) && i + 1 < len && ! Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { ! i++; } - dst[dp++] = '?'; } ! if (len == dp) { ! return dst; } ! return Arrays.copyOf(dst, dp); } - static byte[] encodeUTF8(byte coder, byte[] val) { int dp = 0; ! byte[] dst; ! if (coder == LATIN1) { ! dst = new byte[val.length << 1]; for (int sp = 0; sp < val.length; sp++) { byte c = val[sp]; if (c < 0) { dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } else { dst[dp++] = c; } } ! } else { int sp = 0; int sl = val.length >> 1; ! dst = new byte[sl * 3]; char c; while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { // ascii fast loop; dst[dp++] = (byte)c; sp++; --- 616,890 ---- return dst; } return Arrays.copyOf(dst, dp); } ! //////////////////////////////// utf8 //////////////////////////////////// ! ! private static boolean isNotContinuation(int b) { ! return (b & 0xc0) != 0x80; } + + private static boolean isMalformed3(int b1, int b2, int b3) { + return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || + (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; } ! ! private static boolean isMalformed3_2(int b1, int b2) { ! return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || ! (b2 & 0xc0) != 0x80; } ! ! private static boolean isMalformed4(int b2, int b3, int b4) { ! return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || ! (b4 & 0xc0) != 0x80; ! } ! ! private static boolean isMalformed4_2(int b1, int b2) { ! 
return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || ! (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || ! (b2 & 0xc0) != 0x80; ! } ! ! private static boolean isMalformed4_3(int b3) { ! return (b3 & 0xc0) != 0x80; ! } ! ! // for nb == 3/4 ! private static int malformedN(byte[] src, int sp, int nb) { ! if (nb == 3) { ! int b1 = src[sp++]; ! int b2 = src[sp++]; // no need to lookup b3 ! return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || ! isNotContinuation(b2)) ? 1 : 2; ! } else if (nb == 4) { // we don't care the speed here ! int b1 = src[sp++] & 0xff; ! int b2 = src[sp++] & 0xff; ! if (b1 > 0xf4 || ! (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || ! (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || ! isNotContinuation(b2)) ! return 1; ! if (isNotContinuation(src[sp++])) ! return 2; ! return 3; ! } ! assert false; ! return -1; ! } ! ! private static void throwMalformed(int off, int nb) { ! throw new IllegalArgumentException("malformed input off : " + off + ! ", length : " + nb); ! } ! ! private static char repl = '\ufffd'; ! ! private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) { ! // ascii-bais, which has a relative impact to the non-ascii-only bytes ! if (COMPACT_STRINGS && !hasNegatives(src, sp, len)) ! return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len), ! LATIN1); ! return decodeUTF8_0(src, sp, len, doReplace); ! } ! ! private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { ! Result ret = resultCached.get(); ! ! int sl = sp + len; int dp = 0; ! byte[] dst = new byte[len]; ! ! if (COMPACT_STRINGS) { ! while (sp < sl) { ! int b1 = src[sp]; ! if (b1 >= 0) { ! dst[dp++] = (byte)b1; ! sp++; continue; } ! if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && ! sp + 1 < sl) { ! int b2 = src[sp + 1]; ! if (!isNotContinuation(b2)) { ! dst[dp++] = (byte)(((b1 << 6) ^ b2)^ ! (((byte) 0xC0 << 6) ^ ! ((byte) 0x80 << 0))); ! sp += 2; ! continue; } } ! // anything not a latin1, including the repl ! // we have to go with the utf16 ! 
break; } ! if (sp == sl) { ! if (dp != dst.length) { ! dst = Arrays.copyOf(dst, dp); ! } ! return ret.with(dst, LATIN1); ! } ! } ! if (dp == 0) { ! dst = new byte[len << 1]; ! } else { ! byte[] buf = new byte[len << 1]; ! StringLatin1.inflate(dst, 0, buf, 0, dp); ! dst = buf; ! } ! while (sp < sl) { ! int b1 = src[sp++]; ! if (b1 >= 0) { ! putChar(dst, dp++, (char) b1); ! } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { ! if (sp < sl) { ! int b2 = src[sp++]; ! if (isNotContinuation(b2)) { ! if (!doReplace) { ! throwMalformed(sp - 1, 1); ! } ! putChar(dst, dp++, repl); ! sp--; ! } else { ! putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ ! (((byte) 0xC0 << 6) ^ ! ((byte) 0x80 << 0)))); ! } ! continue; ! } ! if (!doReplace) { ! throwMalformed(sp, 1); // underflow() ! } ! putChar(dst, dp++, repl); ! break; ! } else if ((b1 >> 4) == -2) { ! if (sp + 1 < sl) { ! int b2 = src[sp++]; ! int b3 = src[sp++]; ! if (isMalformed3(b1, b2, b3)) { ! if (!doReplace) { ! throwMalformed(sp - 3, 3); ! } ! putChar(dst, dp++, repl); ! sp -= 3; ! sp += malformedN(src, sp, 3); ! } else { ! char c = (char)((b1 << 12) ^ ! (b2 << 6) ^ ! (b3 ^ ! (((byte) 0xE0 << 12) ^ ! ((byte) 0x80 << 6) ^ ! ((byte) 0x80 << 0)))); ! if (isSurrogate(c)) { ! if (!doReplace) { ! throwMalformed(sp - 3, 3); ! } ! putChar(dst, dp++, repl); ! } else { ! putChar(dst, dp++, c); ! } ! } ! continue; ! } ! if (sp < sl && isMalformed3_2(b1, src[sp])) { ! if (!doReplace) { ! throwMalformed(sp - 1, 2); ! } ! putChar(dst, dp++, repl); ! 
continue; } + if (!doReplace){ + throwMalformed(sp, 1); + } + putChar(dst, dp++, repl); + break; + } else if ((b1 >> 3) == -2) { + if (sp + 2 < sl) { + int b2 = src[sp++]; + int b3 = src[sp++]; + int b4 = src[sp++]; + int uc = ((b1 << 18) ^ + (b2 << 12) ^ + (b3 << 6) ^ + (b4 ^ + (((byte) 0xF0 << 18) ^ + ((byte) 0x80 << 12) ^ + ((byte) 0x80 << 6) ^ + ((byte) 0x80 << 0)))); + if (isMalformed4(b2, b3, b4) || + !isSupplementaryCodePoint(uc)) { // shortest form check + if (!doReplace) { + throwMalformed(sp - 4, 4); + } + putChar(dst, dp++, repl); + sp -= 4; + sp += malformedN(src, sp, 4); + } else { + putChar(dst, dp++, highSurrogate(uc)); + putChar(dst, dp++, lowSurrogate(uc)); + } + continue; + } + b1 &= 0xff; + if (b1 > 0xf4 || + sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { + if (!doReplace) { + throwMalformed(sp - 1, 1); // or 2 + } + putChar(dst, dp++, repl); + continue; + } + if (!doReplace) { + throwMalformed(sp - 1, 1); + } + sp++; + putChar(dst, dp++, repl); + if (sp < sl && isMalformed4_3(src[sp])) { + continue; + } + break; + } else { + if (!doReplace) { + throwMalformed(sp - 1, 1); + } + putChar(dst, dp++, repl); + } + } + if (dp != len) { + dst = Arrays.copyOf(dst, dp << 1); + } + return ret.with(dst, UTF16); + } + + private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { + if (coder == UTF16) + return encodeUTF8_UTF16(val, doReplace); + + if (!hasNegatives(val, 0, val.length)) + return Arrays.copyOf(val, val.length); int dp = 0; ! byte[] dst = new byte[val.length << 1]; for (int sp = 0; sp < val.length; sp++) { byte c = val[sp]; if (c < 0) { dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } else { dst[dp++] = c; } } ! if (dp == dst.length) ! return dst; ! return Arrays.copyOf(dst, dp); ! } ! ! private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { ! int dp = 0; int sp = 0; int sl = val.length >> 1; ! 
byte[] dst = new byte[sl * 3]; char c; while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { // ascii fast loop; dst[dp++] = (byte)c; sp++;
*** 535,546 **** --- 902,917 ---- if (Character.isHighSurrogate(c) && sp < sl && Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { uc = Character.toCodePoint(c, c2); } if (uc < 0) { + if (doReplace) { dst[dp++] = '?'; } else { + throwMalformed(sp - 1, 1); // or 2, does not matter here + } + } else { dst[dp++] = (byte)(0xf0 | ((uc >> 18))); dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); dst[dp++] = (byte)(0x80 | (uc & 0x3f)); sp++; // 2 chars
*** 550,671 **** dst[dp++] = (byte)(0xe0 | ((c >> 12))); dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } } - } if (dp == dst.length) { return dst; } return Arrays.copyOf(dst, dp); } ! static byte[] encode(String charsetName, byte coder, byte[] val) ! throws UnsupportedEncodingException ! { ! StringEncoder se = deref(encoder); ! String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; ! if ((se == null) || !(csn.equals(se.requestedCharsetName()) ! || csn.equals(se.charsetName()))) { ! se = null; ! try { ! Charset cs = lookupCharset(csn); ! if (cs != null) { ! if (cs == UTF_8) { ! return encodeUTF8(coder, val); ! } else if (cs == ISO_8859_1) { ! return encode8859_1(coder, val); ! } else if (cs == US_ASCII) { ! return encodeASCII(coder, val); ! } ! se = new StringEncoder(cs, csn); ! } ! } catch (IllegalCharsetNameException x) {} ! if (se == null) { ! throw new UnsupportedEncodingException (csn); ! } ! set(encoder, se); ! } ! return se.encode(coder, val); ! } ! static byte[] encode(Charset cs, byte coder, byte[] val) { ! if (cs == UTF_8) { ! return encodeUTF8(coder, val); ! } else if (cs == ISO_8859_1) { ! return encode8859_1(coder, val); ! } else if (cs == US_ASCII) { ! return encodeASCII(coder, val); ! } ! CharsetEncoder ce = cs.newEncoder(); ! // fastpath for ascii compatible ! if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && ! ((ArrayEncoder)ce).isASCIICompatible() && ! !hasNegatives(val, 0, val.length)))) { ! return Arrays.copyOf(val, val.length); ! } ! int len = val.length >> coder; // assume LATIN1=0/UTF16=1; ! int en = scale(len, ce.maxBytesPerChar()); ! byte[] ba = new byte[en]; ! if (len == 0) { ! return ba; ! } ! boolean isTrusted = cs.getClass().getClassLoader0() == null || ! System.getSecurityManager() == null; ! ce.onMalformedInput(CodingErrorAction.REPLACE) ! .onUnmappableCharacter(CodingErrorAction.REPLACE) ! .reset(); ! if (ce instanceof ArrayEncoder) { ! if (!isTrusted) { ! 
val = Arrays.copyOf(val, val.length); ! } ! int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) ! : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); ! if (blen != -1) { ! return safeTrim(ba, blen, isTrusted); ! } ! } ! char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) ! : StringUTF16.toChars(val); ! ByteBuffer bb = ByteBuffer.wrap(ba); ! CharBuffer cb = CharBuffer.wrap(ca, 0, len); ! try { ! CoderResult cr = ce.encode(cb, bb, true); ! if (!cr.isUnderflow()) ! cr.throwException(); ! cr = ce.flush(bb); ! if (!cr.isUnderflow()) ! cr.throwException(); ! } catch (CharacterCodingException x) { ! throw new Error(x); ! } ! return safeTrim(ba, bb.position(), isTrusted); ! } ! ! static byte[] encode(byte coder, byte[] val) { ! String csn = Charset.defaultCharset().name(); ! try { ! // use charset name encode() variant which provides caching. ! return encode(csn, coder, val); ! } catch (UnsupportedEncodingException x) { ! warnUnsupportedCharset(csn); ! } ! try { ! return encode("ISO-8859-1", coder, val); ! } catch (UnsupportedEncodingException x) { ! // If this code is hit during VM initialization, err(String) is ! // the only way we will be able to get any kind of error message. ! err("ISO-8859-1 charset not available: " + x.toString() + "\n"); ! // If we can not find ISO-8859-1 (a required encoding) then things ! // are seriously wrong with the installation. ! System.exit(1); ! return null; ! } } ! /** ! * Print a message directly to stderr, bypassing all character conversion ! * methods. ! * @param msg message to print */ ! private static native void err(String msg); } --- 921,950 ---- dst[dp++] = (byte)(0xe0 | ((c >> 12))); dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } } if (dp == dst.length) { return dst; } return Arrays.copyOf(dst, dp); } ! ////////////////////// for j.u.z.ZipCoder ////////////////////////// ! /* ! * Throws iae, instead of replacing, if malformed or unmappble. ! */ ! 
static String newStringUTF8NoRepl(byte[] src, int off, int len) { ! if (COMPACT_STRINGS && !hasNegatives(src, off, len)) ! return new String(Arrays.copyOfRange(src, off, off + len), LATIN1); ! Result ret = decodeUTF8_0(src, off, len, false); ! return new String(ret.value, ret.coder); } ! /* ! * Throws IllegalArgumentException, instead of replacing, if unmappable. */ ! static byte[] getBytesUTF8NoRepl(String s) { ! return encodeUTF8(s.coder(), s.value(), false); ! } }