src/java.base/share/classes/java/lang/StringCoding.java

Print this page

        

@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.  Oracle designates this

@@ -36,15 +36,23 @@
 import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.IllegalCharsetNameException;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.Arrays;
+import jdk.internal.HotSpotIntrinsicCandidate;
 import sun.misc.MessageUtils;
 import sun.nio.cs.HistoricallyNamedCharset;
 import sun.nio.cs.ArrayDecoder;
 import sun.nio.cs.ArrayEncoder;
 
+import static java.lang.String.LATIN1;
+import static java.lang.String.UTF16;
+import static java.lang.String.COMPACT_STRINGS;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * Utility class for string encoding and decoding.
  */
 
 class StringCoding {

@@ -70,27 +78,17 @@
         tl.set(new SoftReference<>(ob));
     }
 
     // Trim the given byte array to the given length
     //
-    private static byte[] safeTrim(byte[] ba, int len, Charset cs, boolean isTrusted) {
+    private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
         if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
             return ba;
         else
             return Arrays.copyOf(ba, len);
     }
 
-    // Trim the given char array to the given length
-    //
-    private static char[] safeTrim(char[] ca, int len,
-                                   Charset cs, boolean isTrusted) {
-        if (len == ca.length && (isTrusted || System.getSecurityManager() == null))
-            return ca;
-        else
-            return Arrays.copyOf(ca, len);
-    }
-
     private static int scale(int len, float expansionFactor) {
         // We need to perform double, not float, arithmetic; otherwise
         // we lose low order bits when len is larger than 2**24.
         return (int)(len * (double)expansionFactor);
     }

@@ -115,25 +113,68 @@
                              " not supported, using ISO-8859-1 instead");
             warnUnsupportedCharset = false;
         }
     }
 
+    static class Result {
+        byte[] value;
+        byte coder;
+
+        Result with() {
+            coder = COMPACT_STRINGS ? LATIN1 : UTF16;
+            value = new byte[0];
+            return this;
+        }
+
+        Result with(char[] val, int off, int len) {
+            if (String.COMPACT_STRINGS) {
+                byte[] bs = StringUTF16.compress(val, off, len);
+                if (bs != null) {
+                    value = bs;
+                    coder = LATIN1;
+                    return this;
+                }
+            }
+            coder = UTF16;
+            value = StringUTF16.toBytes(val, off, len);
+            return this;
+        }
+
+        Result with(byte[] val, byte coder) {
+            this.coder = coder;
+            value = val;
+            return this;
+        }
+    }
+
+    @HotSpotIntrinsicCandidate
+    private static boolean hasNegatives(byte[] ba, int off, int len) {
+        for (int i = off; i < off + len; i++) {
+            if (ba[i] < 0) {
+                return true;
+            }
+        }
+        return false;
+    }
 
     // -- Decoding --
-    private static class StringDecoder {
+    static class StringDecoder {
         private final String requestedCharsetName;
         private final Charset cs;
+        private final boolean isASCIICompatible;
         private final CharsetDecoder cd;
-        private final boolean isTrusted;
+        protected final Result result;
 
-        private StringDecoder(Charset cs, String rcn) {
+        StringDecoder(Charset cs, String rcn) {
             this.requestedCharsetName = rcn;
             this.cs = cs;
             this.cd = cs.newDecoder()
                 .onMalformedInput(CodingErrorAction.REPLACE)
                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
-            this.isTrusted = (cs.getClass().getClassLoader0() == null);
+            this.result = new Result();
+            this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
+                    ((ArrayDecoder)cd).isASCIICompatible();
         }
 
         String charsetName() {
             if (cs instanceof HistoricallyNamedCharset)
                 return ((HistoricallyNamedCharset)cs).historicalName();

@@ -142,19 +183,29 @@
 
         final String requestedCharsetName() {
             return requestedCharsetName;
         }
 
-        char[] decode(byte[] ba, int off, int len) {
+        Result decode(byte[] ba, int off, int len) {
+            if (len == 0) {
+                return result.with();
+            }
+            // fastpath for ascii compatible
+            if (isASCIICompatible && !hasNegatives(ba, off, len)) {
+                if (COMPACT_STRINGS) {
+                    return result.with(Arrays.copyOfRange(ba, off, off + len),
+                                      LATIN1);
+                } else {
+                    return result.with(StringLatin1.inflate(ba, off, len), UTF16);
+                }
+            }
             int en = scale(len, cd.maxCharsPerByte());
             char[] ca = new char[en];
-            if (len == 0)
-                return ca;
             if (cd instanceof ArrayDecoder) {
                 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
-                return safeTrim(ca, clen, cs, isTrusted);
-            } else {
+                return result.with(ca, 0, clen);
+            }
                 cd.reset();
                 ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
                 CharBuffer cb = CharBuffer.wrap(ca);
                 try {
                     CoderResult cr = cd.decode(bb, cb, true);

@@ -166,36 +217,55 @@
                 } catch (CharacterCodingException x) {
                     // Substitution is always enabled,
                     // so this shouldn't happen
                     throw new Error(x);
                 }
-                return safeTrim(ca, cb.position(), cs, isTrusted);
+            return result.with(ca, 0, cb.position());
+        }
+    }
+
+    private static class StringDecoder8859_1 extends StringDecoder {
+        StringDecoder8859_1(Charset cs, String rcn) {
+            super(cs, rcn);
+        }
+        Result decode(byte[] ba, int off, int len) {
+            if (COMPACT_STRINGS) {
+                return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
+            } else {
+                return result.with(StringLatin1.inflate(ba, off, len), UTF16);
             }
         }
     }
 
-    static char[] decode(String charsetName, byte[] ba, int off, int len)
+    static Result decode(String charsetName, byte[] ba, int off, int len)
         throws UnsupportedEncodingException
     {
         StringDecoder sd = deref(decoder);
         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
                               || csn.equals(sd.charsetName()))) {
             sd = null;
             try {
                 Charset cs = lookupCharset(csn);
-                if (cs != null)
+                if (cs != null) {
+                    if (cs == UTF_8) {
+                        sd = new StringDecoderUTF8(cs, csn);
+                    } else if (cs == ISO_8859_1) {
+                        sd = new StringDecoder8859_1(cs, csn);
+                    } else {
                     sd = new StringDecoder(cs, csn);
+                    }
+                }
             } catch (IllegalCharsetNameException x) {}
             if (sd == null)
                 throw new UnsupportedEncodingException(csn);
             set(decoder, sd);
         }
         return sd.decode(ba, off, len);
     }
 
-    static char[] decode(Charset cs, byte[] ba, int off, int len) {
+    static Result decode(Charset cs, byte[] ba, int off, int len) {
         // (1)We never cache the "external" cs, the only benefit of creating
         // an additional StringDe/Encoder object to wrap it is to share the
         // de/encode() method. These SD/E objects are short-lived, the young-gen
         // gc should be able to take care of them well. But the best approach
         // is still not to generate them if not really necessary.

@@ -208,29 +278,43 @@
         // possible that the SM==null for now but then SM is NOT null later
         // when safeTrim() is invoked...the "safe" way to do is to redundant
         // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
         // but it then can be argued that the SM is null when the operation
         // is started...
+        if (cs == UTF_8) {
+            return StringDecoderUTF8.decode(ba, off, len, new Result());
+        }
         CharsetDecoder cd = cs.newDecoder();
+        // ascii fastpath
+        if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) &&
+                                 ((ArrayDecoder)cd).isASCIICompatible() &&
+                                 !hasNegatives(ba, off, len))) {
+             if (COMPACT_STRINGS) {
+                 return new Result().with(Arrays.copyOfRange(ba, off, off + len),
+                                          LATIN1);
+             } else {
+                 return new Result().with(StringLatin1.inflate(ba, off, len), UTF16);
+             }
+        }
         int en = scale(len, cd.maxCharsPerByte());
-        char[] ca = new char[en];
-        if (len == 0)
-            return ca;
-        boolean isTrusted = false;
-        if (System.getSecurityManager() != null) {
-            if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) {
+        if (len == 0) {
+            return new Result().with();
+        }
+        if (System.getSecurityManager() != null &&
+            cs.getClass().getClassLoader0() != null) {
                 ba =  Arrays.copyOfRange(ba, off, off + len);
                 off = 0;
             }
-        }
         cd.onMalformedInput(CodingErrorAction.REPLACE)
           .onUnmappableCharacter(CodingErrorAction.REPLACE)
           .reset();
+
+        char[] ca = new char[en];
         if (cd instanceof ArrayDecoder) {
             int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
-            return safeTrim(ca, clen, cs, isTrusted);
-        } else {
+            return new Result().with(ca, 0, clen);
+        }
             ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
             CharBuffer cb = CharBuffer.wrap(ca);
             try {
                 CoderResult cr = cd.decode(bb, cb, true);
                 if (!cr.isUnderflow())

@@ -241,15 +325,14 @@
             } catch (CharacterCodingException x) {
                 // Substitution is always enabled,
                 // so this shouldn't happen
                 throw new Error(x);
             }
-            return safeTrim(ca, cb.position(), cs, isTrusted);
-        }
+        return new Result().with(ca, 0, cb.position());
     }
 
-    static char[] decode(byte[] ba, int off, int len) {
+    static Result decode(byte[] ba, int off, int len) {
         String csn = Charset.defaultCharset().name();
         try {
             // use charset name decode() variant which provides caching.
             return decode(csn, ba, off, len);
         } catch (UnsupportedEncodingException x) {

@@ -271,20 +354,23 @@
 
     // -- Encoding --
     private static class StringEncoder {
         private Charset cs;
         private CharsetEncoder ce;
+        private final boolean isASCIICompatible;
         private final String requestedCharsetName;
         private final boolean isTrusted;
 
         private StringEncoder(Charset cs, String rcn) {
             this.requestedCharsetName = rcn;
             this.cs = cs;
             this.ce = cs.newEncoder()
                 .onMalformedInput(CodingErrorAction.REPLACE)
                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
             this.isTrusted = (cs.getClass().getClassLoader0() == null);
+            this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
+                    ((ArrayEncoder)ce).isASCIICompatible();
         }
 
         String charsetName() {
             if (cs instanceof HistoricallyNamedCharset)
                 return ((HistoricallyNamedCharset)cs).historicalName();

@@ -293,22 +379,37 @@
 
         final String requestedCharsetName() {
             return requestedCharsetName;
         }
 
-        byte[] encode(char[] ca, int off, int len) {
+        byte[] encode(byte coder, byte[] val) {
+            // fastpath for ascii compatible
+            if (coder == LATIN1 && isASCIICompatible &&
+                !hasNegatives(val, 0, val.length)) {
+                return Arrays.copyOf(val, val.length);
+            }
+            int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
             int en = scale(len, ce.maxBytesPerChar());
             byte[] ba = new byte[en];
-            if (len == 0)
+            if (len == 0) {
                 return ba;
+            }
             if (ce instanceof ArrayEncoder) {
-                int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
-                return safeTrim(ba, blen, cs, isTrusted);
-            } else {
+                if (!isTrusted) {
+                    val = Arrays.copyOf(val, val.length);
+                }
+                int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
+                                              : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
+                if (blen != -1) {
+                    return safeTrim(ba, blen, isTrusted);
+                }
+            }
+            char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
+                                           : StringUTF16.toChars(val);
                 ce.reset();
                 ByteBuffer bb = ByteBuffer.wrap(ba);
-                CharBuffer cb = CharBuffer.wrap(ca, off, len);
+            CharBuffer cb = CharBuffer.wrap(ca, 0, len);
                 try {
                     CoderResult cr = ce.encode(cb, bb, true);
                     if (!cr.isUnderflow())
                         cr.throwException();
                     cr = ce.flush(bb);

@@ -317,81 +418,242 @@
                 } catch (CharacterCodingException x) {
                     // Substitution is always enabled,
                     // so this shouldn't happen
                     throw new Error(x);
                 }
-                return safeTrim(ba, bb.position(), cs, isTrusted);
+            return safeTrim(ba, bb.position(), isTrusted);
+        }
+    }
+
+    @HotSpotIntrinsicCandidate
+    private static int implEncodeISOArray(byte[] sa, int sp,
+                                          byte[] da, int dp, int len) {
+        int i = 0;
+        for (; i < len; i++) {
+            char c = StringUTF16.getChar(sa, sp++);
+            if (c > '\u00FF')
+                break;
+            da[dp++] = (byte)c;
+        }
+        return i;
+    }
+
+    static byte[] encode8859_1(byte coder, byte[] val) {
+        if (coder == LATIN1) {
+            return Arrays.copyOf(val, val.length);
+        }
+        int len = val.length >> 1;
+        byte[] dst = new byte[len];
+        int dp = 0;
+        int sp = 0;
+        int sl = len;
+        while (sp < sl) {
+            int ret = implEncodeISOArray(val, sp, dst, dp, len);
+            sp = sp + ret;
+            dp = dp + ret;
+            if (ret != len) {
+                char c = StringUTF16.getChar(val, sp++);
+                if (Character.isHighSurrogate(c) && sp < sl &&
+                    Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
+                    sp++;
+                }
+                dst[dp++] = '?';
+                len = sl - sp;
+            }
+        }
+        if (dp == dst.length) {
+            return dst;
+        }
+        return Arrays.copyOf(dst, dp);
+    }
+
+    static byte[] encodeASCII(byte coder, byte[] val) {
+        if (coder == LATIN1) {
+            byte[] dst = new byte[val.length];
+            for (int i = 0; i < val.length; i++) {
+                if (val[i] < 0) {
+                    dst[i] = '?';
+                } else {
+                    dst[i] = val[i];
+                }
+            }
+            return dst;
+        }
+        int len = val.length >> 1;
+        byte[] dst = new byte[len];
+        int dp = 0;
+        for (int i = 0; i < len; i++) {
+            char c = StringUTF16.getChar(val, i);
+            if (c < 0x80) {
+                dst[dp++] = (byte)c;
+                continue;
+            }
+            if (Character.isHighSurrogate(c) && i + 1 < len &&
+                Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
+                i++;
+            }
+            dst[dp++] = '?';
+        }
+        if (len == dp) {
+            return dst;
+        }
+        return Arrays.copyOf(dst, dp);
+    }
+
+   static byte[] encodeUTF8(byte coder, byte[] val) {
+        int dp = 0;
+        byte[] dst;
+        if (coder == LATIN1) {
+            dst = new byte[val.length << 1];
+            for (int sp = 0; sp < val.length; sp++) {
+                byte c = val[sp];
+                if (c < 0) {
+                    dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
+                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
+                } else {
+                    dst[dp++] = c;
+                }
             }
+        } else {
+            int sp = 0;
+            int sl = val.length >> 1;
+            dst = new byte[sl * 3];
+            char c;
+            while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
+                // ascii fast loop;
+                dst[dp++] = (byte)c;
+                sp++;
+            }
+            while (sp < sl) {
+                c = StringUTF16.getChar(val, sp++);
+                if (c < 0x80) {
+                    dst[dp++] = (byte)c;
+                } else if (c < 0x800) {
+                    dst[dp++] = (byte)(0xc0 | (c >> 6));
+                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
+                } else if (Character.isSurrogate(c)) {
+                    int uc = -1;
+                    char c2;
+                    if (Character.isHighSurrogate(c) && sp < sl &&
+                        Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
+                        uc = Character.toCodePoint(c, c2);
+                    }
+                    if (uc < 0) {
+                        dst[dp++] = '?';
+                    } else {
+                        dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
+                        dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
+                        dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
+                        dst[dp++] = (byte)(0x80 | (uc & 0x3f));
+                        sp++;  // 2 chars
+                    }
+                } else {
+                    // 3 bytes, 16 bits
+                    dst[dp++] = (byte)(0xe0 | ((c >> 12)));
+                    dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
+                    dst[dp++] = (byte)(0x80 | (c & 0x3f));
         }
     }
+        }
+        if (dp == dst.length) {
+            return dst;
+        }
+        return Arrays.copyOf(dst, dp);
+    }
 
-    static byte[] encode(String charsetName, char[] ca, int off, int len)
+    static byte[] encode(String charsetName, byte coder, byte[] val)
         throws UnsupportedEncodingException
     {
         StringEncoder se = deref(encoder);
         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
         if ((se == null) || !(csn.equals(se.requestedCharsetName())
                               || csn.equals(se.charsetName()))) {
             se = null;
             try {
                 Charset cs = lookupCharset(csn);
-                if (cs != null)
+                if (cs != null) {
+                    if (cs == UTF_8) {
+                        return encodeUTF8(coder, val);
+                    } else if (cs == ISO_8859_1) {
+                        return encode8859_1(coder, val);
+                    } else if (cs == US_ASCII) {
+                        return encodeASCII(coder, val);
+                    }
                     se = new StringEncoder(cs, csn);
+                }
             } catch (IllegalCharsetNameException x) {}
-            if (se == null)
+            if (se == null) {
                 throw new UnsupportedEncodingException (csn);
+            }
             set(encoder, se);
         }
-        return se.encode(ca, off, len);
+        return se.encode(coder, val);
     }
 
-    static byte[] encode(Charset cs, char[] ca, int off, int len) {
+    static byte[] encode(Charset cs, byte coder, byte[] val) {
+        if (cs == UTF_8) {
+            return encodeUTF8(coder, val);
+        } else if (cs == ISO_8859_1) {
+            return encode8859_1(coder, val);
+        } else if (cs == US_ASCII) {
+            return encodeASCII(coder, val);
+        }
         CharsetEncoder ce = cs.newEncoder();
+        // fastpath for ascii compatible
+        if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
+                                 ((ArrayEncoder)ce).isASCIICompatible() &&
+                                 !hasNegatives(val, 0, val.length)))) {
+            return Arrays.copyOf(val, val.length);
+        }
+        int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
         int en = scale(len, ce.maxBytesPerChar());
         byte[] ba = new byte[en];
-        if (len == 0)
+        if (len == 0) {
             return ba;
-        boolean isTrusted = false;
-        if (System.getSecurityManager() != null) {
-            if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) {
-                ca =  Arrays.copyOfRange(ca, off, off + len);
-                off = 0;
-            }
         }
+        boolean isTrusted = System.getSecurityManager() == null ||
+                            cs.getClass().getClassLoader0() == null;
         ce.onMalformedInput(CodingErrorAction.REPLACE)
           .onUnmappableCharacter(CodingErrorAction.REPLACE)
           .reset();
         if (ce instanceof ArrayEncoder) {
-            int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
-            return safeTrim(ba, blen, cs, isTrusted);
-        } else {
+            if (!isTrusted) {
+                val = Arrays.copyOf(val, val.length);
+            }
+            int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
+                                          : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
+            if (blen != -1) {
+                return safeTrim(ba, blen, isTrusted);
+            }
+        }
+        char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
+                                       : StringUTF16.toChars(val);
             ByteBuffer bb = ByteBuffer.wrap(ba);
-            CharBuffer cb = CharBuffer.wrap(ca, off, len);
+        CharBuffer cb = CharBuffer.wrap(ca, 0, len);
             try {
                 CoderResult cr = ce.encode(cb, bb, true);
                 if (!cr.isUnderflow())
                     cr.throwException();
                 cr = ce.flush(bb);
                 if (!cr.isUnderflow())
                     cr.throwException();
             } catch (CharacterCodingException x) {
                 throw new Error(x);
             }
-            return safeTrim(ba, bb.position(), cs, isTrusted);
-        }
+        return safeTrim(ba, bb.position(), isTrusted);
     }
 
-    static byte[] encode(char[] ca, int off, int len) {
+    static byte[] encode(byte coder, byte[] val) {
         String csn = Charset.defaultCharset().name();
         try {
             // use charset name encode() variant which provides caching.
-            return encode(csn, ca, off, len);
+            return encode(csn, coder, val);
         } catch (UnsupportedEncodingException x) {
             warnUnsupportedCharset(csn);
         }
         try {
-            return encode("ISO-8859-1", ca, off, len);
+            return encode("ISO-8859-1", coder, val);
         } catch (UnsupportedEncodingException x) {
             // If this code is hit during VM initialization, MessageUtils is
             // the only way we will be able to get any kind of error message.
             MessageUtils.err("ISO-8859-1 charset not available: "
                              + x.toString());