/* * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package java.lang; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.lang.ref.SoftReference; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CharacterCodingException; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnmappableCharacterException; import java.nio.charset.UnsupportedCharsetException; import java.util.Arrays; import jdk.internal.HotSpotIntrinsicCandidate; import sun.nio.cs.HistoricallyNamedCharset; import sun.nio.cs.ArrayDecoder; import sun.nio.cs.ArrayEncoder; import static java.lang.String.LATIN1; import static java.lang.String.UTF16; import static java.lang.String.COMPACT_STRINGS; import static java.lang.Character.isSurrogate; import static java.lang.Character.highSurrogate; import static java.lang.Character.lowSurrogate; import static java.lang.Character.isSupplementaryCodePoint; import static java.lang.StringUTF16.putChar; /** * Utility class for string encoding and decoding. */ class StringCoding { private StringCoding() { } /** The cached coders for each thread */ private static final ThreadLocal> decoder = new ThreadLocal<>(); private static final ThreadLocal> encoder = new ThreadLocal<>(); private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; private static T deref(ThreadLocal> tl) { SoftReference sr = tl.get(); if (sr == null) return null; return sr.get(); } private static void set(ThreadLocal> tl, T ob) { tl.set(new SoftReference<>(ob)); } // Trim the given byte array to the given length private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) return ba; else return Arrays.copyOf(ba, len); } private static int scale(int len, float expansionFactor) { // We need to perform double, not float, arithmetic; otherwise // we lose low order bits when len is larger than 2**24. return (int)(len * (double)expansionFactor); } private static Charset lookupCharset(String csn) { if (Charset.isSupported(csn)) { try { return Charset.forName(csn); } catch (UnsupportedCharsetException x) { throw new Error(x); } } return null; } static class Result { byte[] value; byte coder; Result with() { coder = COMPACT_STRINGS ? LATIN1 : UTF16; value = new byte[0]; return this; } Result with(char[] val, int off, int len) { if (String.COMPACT_STRINGS) { byte[] bs = StringUTF16.compress(val, off, len); if (bs != null) { value = bs; coder = LATIN1; return this; } } coder = UTF16; value = StringUTF16.toBytes(val, off, len); return this; } Result with(byte[] val, byte coder) { this.coder = coder; value = val; return this; } } @HotSpotIntrinsicCandidate public static boolean hasNegatives(byte[] ba, int off, int len) { for (int i = off; i < off + len; i++) { if (ba[i] < 0) { return true; } } return false; } // -- Decoding -- static class StringDecoder { private final String requestedCharsetName; private final Charset cs; private final boolean isASCIICompatible; private final CharsetDecoder cd; protected final Result result; StringDecoder(Charset cs, String rcn) { this.requestedCharsetName = rcn; this.cs = cs; this.cd = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); this.result = new Result(); this.isASCIICompatible = (cd instanceof ArrayDecoder) && ((ArrayDecoder)cd).isASCIICompatible(); } String charsetName() { if (cs instanceof HistoricallyNamedCharset) return ((HistoricallyNamedCharset)cs).historicalName(); return cs.name(); } final String requestedCharsetName() { return requestedCharsetName; } Result decode(byte[] ba, int off, int len) { if (len == 0) { return result.with(); } // fastpath for ascii compatible if (isASCIICompatible && !hasNegatives(ba, off, len)) { if (COMPACT_STRINGS) { return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); } else { return result.with(StringLatin1.inflate(ba, off, len), UTF16); } } int en = scale(len, cd.maxCharsPerByte()); char[] ca = new char[en]; if (cd instanceof ArrayDecoder) { int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); return result.with(ca, 0, clen); } cd.reset(); ByteBuffer bb = ByteBuffer.wrap(ba, off, len); CharBuffer cb = CharBuffer.wrap(ca); try { CoderResult cr = cd.decode(bb, cb, true); if (!cr.isUnderflow()) cr.throwException(); cr = cd.flush(cb); if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { // Substitution is always enabled, // so this shouldn't happen throw new Error(x); } return result.with(ca, 0, cb.position()); } } static Result decode(String charsetName, byte[] ba, int off, int len) throws UnsupportedEncodingException { StringDecoder sd = deref(decoder); String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) || csn.equals(sd.charsetName()))) { sd = null; try { Charset cs = lookupCharset(csn); if (cs != null) { if (cs == UTF_8) { return decodeUTF8(ba, off, len, true); } if (cs == ISO_8859_1) { return decodeLatin1(ba, off, len); } if (cs == US_ASCII) { return decodeASCII(ba, off, len); } sd = new StringDecoder(cs, csn); } } catch (IllegalCharsetNameException x) {} if (sd == null) throw new UnsupportedEncodingException(csn); set(decoder, sd); } return sd.decode(ba, off, len); } static Result decode(Charset cs, byte[] ba, int off, int len) { if (cs == UTF_8) { return decodeUTF8(ba, off, len, true); } if (cs == ISO_8859_1) { return decodeLatin1(ba, off, len); } if (cs == US_ASCII) { return decodeASCII(ba, off, len); } // (1)We never cache the "external" cs, the only benefit of creating // an additional StringDe/Encoder object to wrap it is to share the // de/encode() method. These SD/E objects are short-lived, the young-gen // gc should be able to take care of them well. But the best approach // is still not to generate them if not really necessary. // (2)The defensive copy of the input byte/char[] has a big performance // impact, as well as the outgoing result byte/char[]. Need to do the // optimization check of (sm==null && classLoader0==null) for both. // (3)There might be a timing gap in isTrusted setting. getClassLoader0() // is only checked (and then isTrusted gets set) when (SM==null). It is // possible that the SM==null for now but then SM is NOT null later // when safeTrim() is invoked...the "safe" way to do is to redundant // check (... && (isTrusted || SM == null || getClassLoader0())) in trim // but it then can be argued that the SM is null when the operation // is started... CharsetDecoder cd = cs.newDecoder(); // ascii fastpath if ((cd instanceof ArrayDecoder) && ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) { return decodeLatin1(ba, off, len); } int en = scale(len, cd.maxCharsPerByte()); if (len == 0) { return new Result().with(); } cd.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); char[] ca = new char[en]; if (cd instanceof ArrayDecoder) { int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); return new Result().with(ca, 0, clen); } if (cs.getClass().getClassLoader0() != null && System.getSecurityManager() != null) { ba = Arrays.copyOfRange(ba, off, off + len); off = 0; } ByteBuffer bb = ByteBuffer.wrap(ba, off, len); CharBuffer cb = CharBuffer.wrap(ca); try { CoderResult cr = cd.decode(bb, cb, true); if (!cr.isUnderflow()) cr.throwException(); cr = cd.flush(cb); if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { // Substitution is always enabled, // so this shouldn't happen throw new Error(x); } return new Result().with(ca, 0, cb.position()); } static Result decode(byte[] ba, int off, int len) { Charset cs = Charset.defaultCharset(); if (cs == UTF_8) { return decodeUTF8(ba, off, len, true); } if (cs == ISO_8859_1) { return decodeLatin1(ba, off, len); } if (cs == US_ASCII) { return decodeASCII(ba, off, len); } StringDecoder sd = deref(decoder); if (sd == null || !cs.name().equals(sd.cs.name())) { sd = new StringDecoder(cs, cs.name()); set(decoder, sd); } return sd.decode(ba, off, len); } // -- Encoding -- private static class StringEncoder { private Charset cs; private CharsetEncoder ce; private final boolean isASCIICompatible; private final String requestedCharsetName; private final boolean isTrusted; private StringEncoder(Charset cs, String rcn) { this.requestedCharsetName = rcn; this.cs = cs; this.ce = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); this.isTrusted = (cs.getClass().getClassLoader0() == null); this.isASCIICompatible = (ce instanceof ArrayEncoder) && ((ArrayEncoder)ce).isASCIICompatible(); } String charsetName() { if (cs instanceof HistoricallyNamedCharset) return ((HistoricallyNamedCharset)cs).historicalName(); return cs.name(); } final String requestedCharsetName() { return requestedCharsetName; } byte[] encode(byte coder, byte[] val) { // fastpath for ascii compatible if (coder == LATIN1 && isASCIICompatible && !hasNegatives(val, 0, val.length)) { return Arrays.copyOf(val, val.length); } int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); byte[] ba = new byte[en]; if (len == 0) { return ba; } if (ce instanceof ArrayEncoder) { int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); if (blen != -1) { return safeTrim(ba, blen, isTrusted); } } char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) : StringUTF16.toChars(val); ce.reset(); ByteBuffer bb = ByteBuffer.wrap(ba); CharBuffer cb = CharBuffer.wrap(ca, 0, len); try { CoderResult cr = ce.encode(cb, bb, true); if (!cr.isUnderflow()) cr.throwException(); cr = ce.flush(bb); if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { // Substitution is always enabled, // so this shouldn't happen throw new Error(x); } return safeTrim(ba, bb.position(), isTrusted); } } static byte[] encode(String charsetName, byte coder, byte[] val) throws UnsupportedEncodingException { StringEncoder se = deref(encoder); String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; if ((se == null) || !(csn.equals(se.requestedCharsetName()) || csn.equals(se.charsetName()))) { se = null; try { Charset cs = lookupCharset(csn); if (cs != null) { if (cs == UTF_8) { return encodeUTF8(coder, val, true); } if (cs == ISO_8859_1) { return encode8859_1(coder, val); } if (cs == US_ASCII) { return encodeASCII(coder, val); } se = new StringEncoder(cs, csn); } } catch (IllegalCharsetNameException x) {} if (se == null) { throw new UnsupportedEncodingException (csn); } set(encoder, se); } return se.encode(coder, val); } static byte[] encode(Charset cs, byte coder, byte[] val) { if (cs == UTF_8) { return encodeUTF8(coder, val, true); } if (cs == ISO_8859_1) { return encode8859_1(coder, val); } if (cs == US_ASCII) { return encodeASCII(coder, val); } CharsetEncoder ce = cs.newEncoder(); // fastpath for ascii compatible if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && ((ArrayEncoder)ce).isASCIICompatible() && !hasNegatives(val, 0, val.length)))) { return Arrays.copyOf(val, val.length); } int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); byte[] ba = new byte[en]; if (len == 0) { return ba; } ce.onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE) .reset(); if (ce instanceof ArrayEncoder) { int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); if (blen != -1) { return safeTrim(ba, blen, true); } } boolean isTrusted = cs.getClass().getClassLoader0() == null || System.getSecurityManager() == null; char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) : StringUTF16.toChars(val); ByteBuffer bb = ByteBuffer.wrap(ba); CharBuffer cb = CharBuffer.wrap(ca, 0, len); try { CoderResult cr = ce.encode(cb, bb, true); if (!cr.isUnderflow()) cr.throwException(); cr = ce.flush(bb); if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { throw new Error(x); } return safeTrim(ba, bb.position(), isTrusted); } static byte[] encode(byte coder, byte[] val) { Charset cs = Charset.defaultCharset(); if (cs == UTF_8) { return encodeUTF8(coder, val, true); } if (cs == ISO_8859_1) { return encode8859_1(coder, val); } if (cs == US_ASCII) { return encodeASCII(coder, val); } StringEncoder se = deref(encoder); if (se == null || !cs.name().equals(se.cs.name())) { se = new StringEncoder(cs, cs.name()); set(encoder, se); } return se.encode(coder, val); } /** * Print a message directly to stderr, bypassing all character conversion * methods. * @param msg message to print */ private static native void err(String msg); /* The cached Result for each thread */ private static final ThreadLocal resultCached = new ThreadLocal<>() { protected StringCoding.Result initialValue() { return new StringCoding.Result(); }}; ////////////////////////// ascii ////////////////////////////// private static Result decodeASCII(byte[] ba, int off, int len) { Result result = resultCached.get(); if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) { return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); } byte[] dst = new byte[len<<1]; int dp = 0; while (dp < len) { int b = ba[off++]; putChar(dst, dp++, (b >= 0) ? (char)b : repl); } return result.with(dst, UTF16); } private static byte[] encodeASCII(byte coder, byte[] val) { if (coder == LATIN1) { byte[] dst = new byte[val.length]; for (int i = 0; i < val.length; i++) { if (val[i] < 0) { dst[i] = '?'; } else { dst[i] = val[i]; } } return dst; } int len = val.length >> 1; byte[] dst = new byte[len]; int dp = 0; for (int i = 0; i < len; i++) { char c = StringUTF16.getChar(val, i); if (c < 0x80) { dst[dp++] = (byte)c; continue; } if (Character.isHighSurrogate(c) && i + 1 < len && Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { i++; } dst[dp++] = '?'; } if (len == dp) { return dst; } return Arrays.copyOf(dst, dp); } ////////////////////////// latin1/8859_1 /////////////////////////// private static Result decodeLatin1(byte[] ba, int off, int len) { Result result = resultCached.get(); if (COMPACT_STRINGS) { return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); } else { return result.with(StringLatin1.inflate(ba, off, len), UTF16); } } @HotSpotIntrinsicCandidate private static int implEncodeISOArray(byte[] sa, int sp, byte[] da, int dp, int len) { int i = 0; for (; i < len; i++) { char c = StringUTF16.getChar(sa, sp++); if (c > '\u00FF') break; da[dp++] = (byte)c; } return i; } private static byte[] encode8859_1(byte coder, byte[] val) { if (coder == LATIN1) { return Arrays.copyOf(val, val.length); } int len = val.length >> 1; byte[] dst = new byte[len]; int dp = 0; int sp = 0; int sl = len; while (sp < sl) { int ret = implEncodeISOArray(val, sp, dst, dp, len); sp = sp + ret; dp = dp + ret; if (ret != len) { char c = StringUTF16.getChar(val, sp++); if (Character.isHighSurrogate(c) && sp < sl && Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { sp++; } dst[dp++] = '?'; len = sl - sp; } } if (dp == dst.length) { return dst; } return Arrays.copyOf(dst, dp); } //////////////////////////////// utf8 //////////////////////////////////// private static boolean isNotContinuation(int b) { return (b & 0xc0) != 0x80; } private static boolean isMalformed3(int b1, int b2, int b3) { return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; } private static boolean isMalformed3_2(int b1, int b2) { return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80; } private static boolean isMalformed4(int b2, int b3, int b4) { return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80; } private static boolean isMalformed4_2(int b1, int b2) { return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || (b2 & 0xc0) != 0x80; } private static boolean isMalformed4_3(int b3) { return (b3 & 0xc0) != 0x80; } // for nb == 3/4 private static int malformedN(byte[] src, int sp, int nb) { if (nb == 3) { int b1 = src[sp++]; int b2 = src[sp++]; // no need to lookup b3 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || isNotContinuation(b2)) ? 1 : 2; } else if (nb == 4) { // we don't care the speed here int b1 = src[sp++] & 0xff; int b2 = src[sp++] & 0xff; if (b1 > 0xf4 || (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || isNotContinuation(b2)) return 1; if (isNotContinuation(src[sp++])) return 2; return 3; } assert false; return -1; } private static void throwMalformed(int off, int nb) { throw new IllegalArgumentException("malformed input off : " + off + ", length : " + nb); } private static char repl = '\ufffd'; private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) { // ascii-bais, which has a relative impact to the non-ascii-only bytes if (COMPACT_STRINGS && !hasNegatives(src, sp, len)) return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len), LATIN1); return decodeUTF8_0(src, sp, len, doReplace); } private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { Result ret = resultCached.get(); int sl = sp + len; int dp = 0; byte[] dst = new byte[len]; if (COMPACT_STRINGS) { while (sp < sl) { int b1 = src[sp]; if (b1 >= 0) { dst[dp++] = (byte)b1; sp++; continue; } if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && sp + 1 < sl) { int b2 = src[sp + 1]; if (!isNotContinuation(b2)) { dst[dp++] = (byte)(((b1 << 6) ^ b2)^ (((byte) 0xC0 << 6) ^ ((byte) 0x80 << 0))); sp += 2; continue; } } // anything not a latin1, including the repl // we have to go with the utf16 break; } if (sp == sl) { if (dp != dst.length) { dst = Arrays.copyOf(dst, dp); } return ret.with(dst, LATIN1); } } if (dp == 0) { dst = new byte[len << 1]; } else { byte[] buf = new byte[len << 1]; StringLatin1.inflate(dst, 0, buf, 0, dp); dst = buf; } while (sp < sl) { int b1 = src[sp++]; if (b1 >= 0) { putChar(dst, dp++, (char) b1); } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { if (sp < sl) { int b2 = src[sp++]; if (isNotContinuation(b2)) { if (!doReplace) { throwMalformed(sp - 1, 1); } putChar(dst, dp++, repl); sp--; } else { putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ (((byte) 0xC0 << 6) ^ ((byte) 0x80 << 0)))); } continue; } if (!doReplace) { throwMalformed(sp, 1); // underflow() } putChar(dst, dp++, repl); break; } else if ((b1 >> 4) == -2) { if (sp + 1 < sl) { int b2 = src[sp++]; int b3 = src[sp++]; if (isMalformed3(b1, b2, b3)) { if (!doReplace) { throwMalformed(sp - 3, 3); } putChar(dst, dp++, repl); sp -= 3; sp += malformedN(src, sp, 3); } else { char c = (char)((b1 << 12) ^ (b2 << 6) ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); if (isSurrogate(c)) { if (!doReplace) { throwMalformed(sp - 3, 3); } putChar(dst, dp++, repl); } else { putChar(dst, dp++, c); } } continue; } if (sp < sl && isMalformed3_2(b1, src[sp])) { if (!doReplace) { throwMalformed(sp - 1, 2); } putChar(dst, dp++, repl); continue; } if (!doReplace){ throwMalformed(sp, 1); } putChar(dst, dp++, repl); break; } else if ((b1 >> 3) == -2) { if (sp + 2 < sl) { int b2 = src[sp++]; int b3 = src[sp++]; int b4 = src[sp++]; int uc = ((b1 << 18) ^ (b2 << 12) ^ (b3 << 6) ^ (b4 ^ (((byte) 0xF0 << 18) ^ ((byte) 0x80 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); if (isMalformed4(b2, b3, b4) || !isSupplementaryCodePoint(uc)) { // shortest form check if (!doReplace) { throwMalformed(sp - 4, 4); } putChar(dst, dp++, repl); sp -= 4; sp += malformedN(src, sp, 4); } else { putChar(dst, dp++, highSurrogate(uc)); putChar(dst, dp++, lowSurrogate(uc)); } continue; } b1 &= 0xff; if (b1 > 0xf4 || sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { if (!doReplace) { throwMalformed(sp - 1, 1); // or 2 } putChar(dst, dp++, repl); continue; } if (!doReplace) { throwMalformed(sp - 1, 1); } sp++; putChar(dst, dp++, repl); if (sp < sl && isMalformed4_3(src[sp])) { continue; } break; } else { if (!doReplace) { throwMalformed(sp - 1, 1); } putChar(dst, dp++, repl); } } if (dp != len) { dst = Arrays.copyOf(dst, dp << 1); } return ret.with(dst, UTF16); } private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { if (coder == UTF16) return encodeUTF8_UTF16(val, doReplace); if (!hasNegatives(val, 0, val.length)) return Arrays.copyOf(val, val.length); int dp = 0; byte[] dst = new byte[val.length << 1]; for (int sp = 0; sp < val.length; sp++) { byte c = val[sp]; if (c < 0) { dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } else { dst[dp++] = c; } } if (dp == dst.length) return dst; return Arrays.copyOf(dst, dp); } private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { int dp = 0; int sp = 0; int sl = val.length >> 1; byte[] dst = new byte[sl * 3]; char c; while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { // ascii fast loop; dst[dp++] = (byte)c; sp++; } while (sp < sl) { c = StringUTF16.getChar(val, sp++); if (c < 0x80) { dst[dp++] = (byte)c; } else if (c < 0x800) { dst[dp++] = (byte)(0xc0 | (c >> 6)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } else if (Character.isSurrogate(c)) { int uc = -1; char c2; if (Character.isHighSurrogate(c) && sp < sl && Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { uc = Character.toCodePoint(c, c2); } if (uc < 0) { if (doReplace) { dst[dp++] = '?'; } else { throwMalformed(sp - 1, 1); // or 2, does not matter here } } else { dst[dp++] = (byte)(0xf0 | ((uc >> 18))); dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); dst[dp++] = (byte)(0x80 | (uc & 0x3f)); sp++; // 2 chars } } else { // 3 bytes, 16 bits dst[dp++] = (byte)(0xe0 | ((c >> 12))); dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } } if (dp == dst.length) { return dst; } return Arrays.copyOf(dst, dp); } ////////////////////// for j.u.z.ZipCoder ////////////////////////// /* * Throws iae, instead of replacing, if malformed or unmappable. */ static String newStringUTF8NoRepl(byte[] src, int off, int len) { if (COMPACT_STRINGS && !hasNegatives(src, off, len)) return new String(Arrays.copyOfRange(src, off, off + len), LATIN1); Result ret = decodeUTF8_0(src, off, len, false); return new String(ret.value, ret.coder); } /* * Throws iae, instead of replacing, if unmappable. */ static byte[] getBytesUTF8NoRepl(String s) { return encodeUTF8(s.coder(), s.value(), false); } ////////////////////// for j.n.f.Files ///////////////////////////////////////////// // the following methods are copies of the original ones with following changes: // // 1. throws Exception instead of replacement upon malformed or unmappable // // 2. throws CCE instead IAE or Error, that is, the methods declare CCE // // 3. avoid copying the internal byte arrays where appropriate // ////////////////////// for j.n.f.Files ///////////////////////////////////////////// private static boolean isASCII(byte[] src) { return !hasNegatives(src, 0, src.length); } private static String newStringLatin1(byte[] src) { if (COMPACT_STRINGS) return new String(src, LATIN1); return new String(StringLatin1.inflate(src, 0, src.length), UTF16); } static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { if (cs == UTF_8) { if (COMPACT_STRINGS && isASCII(src)) return new String(src, LATIN1); Result ret = decodeUTF8_0_CCE(src, 0, src.length); return new String(ret.value, ret.coder); } if (cs == ISO_8859_1) { return newStringLatin1(src); } if (cs == US_ASCII) { if (isASCII(src)) { return newStringLatin1(src); } else { throw new UnmappableCharacterException(1); } } CharsetDecoder cd = cs.newDecoder(); // ascii fastpath if ((cd instanceof ArrayDecoder) && ((ArrayDecoder)cd).isASCIICompatible() && isASCII(src)) { return newStringLatin1(src); } int len = src.length; if (len == 0) { return ""; } int en = scale(len, cd.maxCharsPerByte()); char[] ca = new char[en]; if (cs.getClass().getClassLoader0() != null && System.getSecurityManager() != null) { src = Arrays.copyOf(src, len); } ByteBuffer bb = ByteBuffer.wrap(src); CharBuffer cb = CharBuffer.wrap(ca); CoderResult cr = cd.decode(bb, cb, true); if (!cr.isUnderflow()) cr.throwException(); cr = cd.flush(cb); if (!cr.isUnderflow()) cr.throwException(); Result ret = resultCached.get().with(ca, 0, cb.position()); return new String(ret.value, ret.coder); } /* * Throws CCE, instead of replacing, if unmappable. */ static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { byte[] val = s.value(); byte coder = s.coder(); if (cs == UTF_8) { if (isASCII(val)) { return val; } return encodeUTF8_CCE(coder, val); } if (cs == ISO_8859_1) { if (coder == LATIN1) { return val; } return encode8859_1_CCE(coder, val); } if (cs == US_ASCII) { if (coder == LATIN1) { if (isASCII(val)) { return val; } else { throw new UnmappableCharacterException(1); } } } CharsetEncoder ce = cs.newEncoder(); // fastpath for ascii compatible if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && ((ArrayEncoder)ce).isASCIICompatible() && isASCII(val)))) { return val; } int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); byte[] ba = new byte[en]; if (len == 0) { return ba; } if (ce instanceof ArrayEncoder) { int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); if (blen != -1) { return safeTrim(ba, blen, true); } } boolean isTrusted = cs.getClass().getClassLoader0() == null || System.getSecurityManager() == null; char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) : StringUTF16.toChars(val); ByteBuffer bb = ByteBuffer.wrap(ba); CharBuffer cb = CharBuffer.wrap(ca, 0, len); CoderResult cr = ce.encode(cb, bb, true); if (!cr.isUnderflow()) cr.throwException(); cr = ce.flush(bb); if (!cr.isUnderflow()) cr.throwException(); return safeTrim(ba, bb.position(), isTrusted); } private static Result decodeUTF8_0_CCE(byte[] src, int sp, int len) throws CharacterCodingException { Result ret = resultCached.get(); int sl = sp + len; int dp = 0; byte[] dst = new byte[len]; if (COMPACT_STRINGS) { while (sp < sl) { int b1 = src[sp]; if (b1 >= 0) { dst[dp++] = (byte)b1; sp++; continue; } if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && sp + 1 < sl) { int b2 = src[sp + 1]; if (!isNotContinuation(b2)) { dst[dp++] = (byte)(((b1 << 6) ^ b2)^ (((byte) 0xC0 << 6) ^ ((byte) 0x80 << 0))); sp += 2; continue; } } // anything not a latin1, including the repl // we have to go with the utf16 break; } if (sp == sl) { if (dp != dst.length) { dst = Arrays.copyOf(dst, dp); } return ret.with(dst, LATIN1); } } if (dp == 0) { dst = new byte[len << 1]; } else { byte[] buf = new byte[len << 1]; StringLatin1.inflate(dst, 0, buf, 0, dp); dst = buf; } while (sp < sl) { int b1 = src[sp++]; if (b1 >= 0) { putChar(dst, dp++, (char) b1); continue; } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { if (sp < sl) { int b2 = src[sp++]; if (isNotContinuation(b2)) { throw new UnmappableCharacterException(1); } else { putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ (((byte) 0xC0 << 6) ^ ((byte) 0x80 << 0)))); } continue; } } else if ((b1 >> 4) == -2) { if (sp + 1 < sl) { int b2 = src[sp++]; int b3 = src[sp++]; if (isMalformed3(b1, b2, b3)) { throw new UnmappableCharacterException(3); } else { char c = (char)((b1 << 12) ^ (b2 << 6) ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); if (isSurrogate(c)) { throw new UnmappableCharacterException(3); } else { putChar(dst, dp++, c); } } continue; } if (sp < sl && isMalformed3_2(b1, src[sp])) { throw new UnmappableCharacterException(2); } } else if ((b1 >> 3) == -2) { if (sp + 2 < sl) { int b2 = src[sp++]; int b3 = src[sp++]; int b4 = src[sp++]; int uc = ((b1 << 18) ^ (b2 << 12) ^ (b3 << 6) ^ (b4 ^ (((byte) 0xF0 << 18) ^ ((byte) 0x80 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0)))); if (isMalformed4(b2, b3, b4) || !isSupplementaryCodePoint(uc)) { // shortest form check throw new UnmappableCharacterException(4); } else { putChar(dst, dp++, highSurrogate(uc)); putChar(dst, dp++, lowSurrogate(uc)); } continue; } } throw new UnmappableCharacterException(1); } if (dp != len) { dst = Arrays.copyOf(dst, dp << 1); } return ret.with(dst, UTF16); } private static byte[] encodeUTF8_CCE(byte coder, byte[] val) throws CharacterCodingException { if (coder == UTF16) return encodeUTF8_UTF16_CCE(val); if (!hasNegatives(val, 0, val.length)) return Arrays.copyOf(val, val.length); int dp = 0; byte[] dst = new byte[val.length << 1]; for (int sp = 0; sp < val.length; sp++) { byte c = val[sp]; if (c < 0) { dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } else { dst[dp++] = c; } } if (dp == dst.length) return dst; return Arrays.copyOf(dst, dp); } private static byte[] encodeUTF8_UTF16_CCE(byte[] val) throws CharacterCodingException { int dp = 0; int sp = 0; int sl = val.length >> 1; byte[] dst = new byte[sl * 3]; char c; while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { // ascii fast loop; dst[dp++] = (byte)c; sp++; } while (sp < sl) { c = StringUTF16.getChar(val, sp++); if (c < 0x80) { dst[dp++] = (byte)c; } else if (c < 0x800) { dst[dp++] = (byte)(0xc0 | (c >> 6)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } else if (Character.isSurrogate(c)) { int uc = -1; char c2; if (Character.isHighSurrogate(c) && sp < sl && Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { uc = Character.toCodePoint(c, c2); } if (uc < 0) { throw new UnmappableCharacterException(1); } else { dst[dp++] = (byte)(0xf0 | ((uc >> 18))); dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); dst[dp++] = (byte)(0x80 | (uc & 0x3f)); sp++; // 2 chars } } else { // 3 bytes, 16 bits dst[dp++] = (byte)(0xe0 | ((c >> 12))); dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); dst[dp++] = (byte)(0x80 | (c & 0x3f)); } } if (dp == dst.length) { return dst; } return Arrays.copyOf(dst, dp); } private static byte[] encode8859_1_CCE(byte coder, byte[] val) throws CharacterCodingException { if (coder == LATIN1) { return Arrays.copyOf(val, val.length); } int len = val.length >> 1; byte[] dst = new byte[len]; int dp = 0; int sp = 0; int sl = len; while (sp < sl) { int ret = implEncodeISOArray(val, sp, dst, dp, len); sp = sp + ret; dp = dp + ret; if (ret != len) { throw new UnmappableCharacterException(1); } } if (dp == dst.length) { return dst; } return Arrays.copyOf(dst, dp); } }