Old src/java.base/share/classes/java/lang/StringCoding.java

   1 /*
   2  * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.lang;
  27 
  28 import java.io.UnsupportedEncodingException;
  29 import java.lang.ref.SoftReference;
  30 import java.nio.ByteBuffer;
  31 import java.nio.CharBuffer;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.CharsetDecoder;
  34 import java.nio.charset.CharsetEncoder;
  35 import java.nio.charset.CharacterCodingException;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.IllegalCharsetNameException;
  39 import java.nio.charset.UnsupportedCharsetException;
  40 import java.util.Arrays;
  41 import jdk.internal.HotSpotIntrinsicCandidate;
  42 import sun.nio.cs.HistoricallyNamedCharset;
  43 import sun.nio.cs.ArrayDecoder;
  44 import sun.nio.cs.ArrayEncoder;
  45 import sun.nio.cs.StandardCharsets;
  46 
  47 import static java.lang.String.LATIN1;
  48 import static java.lang.String.UTF16;
  49 import static java.lang.String.COMPACT_STRINGS;
  50 import static java.lang.Character.isSurrogate;
  51 import static java.lang.Character.highSurrogate;
  52 import static java.lang.Character.lowSurrogate;
  53 import static java.lang.Character.isSupplementaryCodePoint;
  54 import static java.lang.StringUTF16.putChar;
  55 
  56 /**
  57  * Utility class for string encoding and decoding.
  58  */
  59 
  60 class StringCoding {
  61 
   62     private StringCoding() { }  // the only constructor is private, so code outside this class cannot instantiate it
  63 
   64     /** Per-thread caches of the last StringDecoder/StringEncoder used, each held through a SoftReference */
   65     private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
   66         new ThreadLocal<>();
   67     private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
   68         new ThreadLocal<>();
  69 
   70     private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;  // the decode/encode entry points compare against these three by identity (==)
   71     private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;      // to select the dedicated ASCII code paths
   72     private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;            // to select the dedicated UTF-8 code paths
  73 
  74     private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
  75         SoftReference<T> sr = tl.get();
  76         if (sr == null)
  77             return null;
  78         return sr.get();
  79     }
  80 
  81     private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
  82         tl.set(new SoftReference<>(ob));
  83     }
  84 
  85     // Trim the given byte array to the given length
  86     private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
  87         if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
  88             return ba;
  89         else
  90             return Arrays.copyOf(ba, len);
  91     }
  92 
  93     private static int scale(int len, float expansionFactor) {
  94         // We need to perform double, not float, arithmetic; otherwise
  95         // we lose low order bits when len is larger than 2**24.
  96         return (int)(len * (double)expansionFactor);
  97     }
  98 
  99     private static Charset lookupCharset(String csn) {
 100         if (Charset.isSupported(csn)) {
 101             try {
 102                 return Charset.forName(csn);
 103             } catch (UnsupportedCharsetException x) {
 104                 throw new Error(x);
 105             }
 106         }
 107         return null;
 108     }
 109 
 110     static class Result {
 111         byte[] value;
 112         byte coder;
 113 
 114         Result with() {
 115             coder = COMPACT_STRINGS ? LATIN1 : UTF16;
 116             value = new byte[0];
 117             return this;
 118         }
 119 
 120         Result with(char[] val, int off, int len) {
 121             if (String.COMPACT_STRINGS) {
 122                 byte[] bs = StringUTF16.compress(val, off, len);
 123                 if (bs != null) {
 124                     value = bs;
 125                     coder = LATIN1;
 126                     return this;
 127                 }
 128             }
 129             coder = UTF16;
 130             value = StringUTF16.toBytes(val, off, len);
 131             return this;
 132         }
 133 
 134         Result with(byte[] val, byte coder) {
 135             this.coder = coder;
 136             value = val;
 137             return this;
 138         }
 139     }
 140 
 141     @HotSpotIntrinsicCandidate
 142     public static boolean hasNegatives(byte[] ba, int off, int len) {
 143         for (int i = off; i < off + len; i++) {
 144             if (ba[i] < 0) {
 145                 return true;
 146             }
 147         }
 148         return false;
 149     }
 150 
 151     // -- Decoding --
 152     static class StringDecoder {
 153         private final String requestedCharsetName;
 154         private final Charset cs;
 155         private final boolean isASCIICompatible;
 156         private final CharsetDecoder cd;
 157         protected final Result result;
 158 
 159         StringDecoder(Charset cs, String rcn) {
 160             this.requestedCharsetName = rcn;
 161             this.cs = cs;
 162             this.cd = cs.newDecoder()
 163                 .onMalformedInput(CodingErrorAction.REPLACE)
 164                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 165             this.result = new Result();
 166             this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
 167                     ((ArrayDecoder)cd).isASCIICompatible();
 168         }
 169 
 170         String charsetName() {
 171             if (cs instanceof HistoricallyNamedCharset)
 172                 return ((HistoricallyNamedCharset)cs).historicalName();
 173             return cs.name();
 174         }
 175 
 176         final String requestedCharsetName() {
 177             return requestedCharsetName;
 178         }
 179 
 180         Result decode(byte[] ba, int off, int len) {
 181             if (len == 0) {
 182                 return result.with();
 183             }
 184             // fastpath for ascii compatible
 185             if (isASCIICompatible && !hasNegatives(ba, off, len)) {
 186                 if (COMPACT_STRINGS) {
 187                     return result.with(Arrays.copyOfRange(ba, off, off + len),
 188                                       LATIN1);
 189                 } else {
 190                     return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 191                 }
 192             }
 193             int en = scale(len, cd.maxCharsPerByte());
 194             char[] ca = new char[en];
 195             if (cd instanceof ArrayDecoder) {
 196                 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 197                 return result.with(ca, 0, clen);
 198             }
 199             cd.reset();
 200             ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 201             CharBuffer cb = CharBuffer.wrap(ca);
 202             try {
 203                 CoderResult cr = cd.decode(bb, cb, true);
 204                 if (!cr.isUnderflow())
 205                     cr.throwException();
 206                 cr = cd.flush(cb);
 207                 if (!cr.isUnderflow())
 208                     cr.throwException();
 209             } catch (CharacterCodingException x) {
 210                 // Substitution is always enabled,
 211                 // so this shouldn't happen
 212                 throw new Error(x);
 213             }
 214             return result.with(ca, 0, cb.position());
 215         }
 216     }
 217 
 218     static Result decode(String charsetName, byte[] ba, int off, int len)
 219         throws UnsupportedEncodingException
 220     {
 221         StringDecoder sd = deref(decoder);
 222         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 223         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
 224                               || csn.equals(sd.charsetName()))) {
 225             sd = null;
 226             try {
 227                 Charset cs = lookupCharset(csn);
 228                 if (cs != null) {
 229                     if (cs == UTF_8) {
 230                         return decodeUTF8(ba, off, len, true);
 231                     }
 232                     if (cs == ISO_8859_1) {
 233                         return decodeLatin1(ba, off, len);
 234                     }
 235                     if (cs == US_ASCII) {
 236                         return decodeASCII(ba, off, len);
 237                     }
 238                     sd = new StringDecoder(cs, csn);
 239                 }
 240             } catch (IllegalCharsetNameException x) {}
 241             if (sd == null)
 242                 throw new UnsupportedEncodingException(csn);
 243             set(decoder, sd);
 244         }
 245         return sd.decode(ba, off, len);
 246     }
 247 
 248     static Result decode(Charset cs, byte[] ba, int off, int len) {
 249         if (cs == UTF_8) {
 250             return decodeUTF8(ba, off, len, true);
 251         }
 252         if (cs == ISO_8859_1) {
 253             return decodeLatin1(ba, off, len);
 254         }
 255         if (cs == US_ASCII) {
 256             return decodeASCII(ba, off, len);
 257         }
 258 
 259         // (1)We never cache the "external" cs, the only benefit of creating
 260         // an additional StringDe/Encoder object to wrap it is to share the
 261         // de/encode() method. These SD/E objects are short-lived, the young-gen
 262         // gc should be able to take care of them well. But the best approach
 263         // is still not to generate them if not really necessary.
 264         // (2)The defensive copy of the input byte/char[] has a big performance
 265         // impact, as well as the outgoing result byte/char[]. Need to do the
 266         // optimization check of (sm==null && classLoader0==null) for both.
 267         // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
 268         // is only checked (and then isTrusted gets set) when (SM==null). It is
 269         // possible that the SM==null for now but then SM is NOT null later
 270         // when safeTrim() is invoked...the "safe" way to do is to redundant
 271         // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
 272         // but it then can be argued that the SM is null when the operation
 273         // is started...
 274         CharsetDecoder cd = cs.newDecoder();
 275         // ascii fastpath
 276         if ((cd instanceof ArrayDecoder) &&
 277             ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
 278             return decodeLatin1(ba, off, len);
 279         }
 280         int en = scale(len, cd.maxCharsPerByte());
 281         if (len == 0) {
 282             return new Result().with();
 283         }
 284         cd.onMalformedInput(CodingErrorAction.REPLACE)
 285           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 286           .reset();
 287         char[] ca = new char[en];
 288         if (cd instanceof ArrayDecoder) {
 289             int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 290             return new Result().with(ca, 0, clen);
 291         }
 292         if (cs.getClass().getClassLoader0() != null &&
 293             System.getSecurityManager() != null) {
 294             ba = Arrays.copyOfRange(ba, off, off + len);
 295             off = 0;
 296         }
 297         ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 298         CharBuffer cb = CharBuffer.wrap(ca);
 299         try {
 300             CoderResult cr = cd.decode(bb, cb, true);
 301             if (!cr.isUnderflow())
 302                 cr.throwException();
 303             cr = cd.flush(cb);
 304             if (!cr.isUnderflow())
 305                 cr.throwException();
 306         } catch (CharacterCodingException x) {
 307             // Substitution is always enabled,
 308             // so this shouldn't happen
 309             throw new Error(x);
 310         }
 311         return new Result().with(ca, 0, cb.position());
 312     }
 313 
 314     static Result decode(byte[] ba, int off, int len) {
 315         Charset cs = Charset.defaultCharset();
 316         if (cs == UTF_8) {
 317             return decodeUTF8(ba, off, len, true);
 318         }
 319         if (cs == ISO_8859_1) {
 320             return decodeLatin1(ba, off, len);
 321         }
 322         if (cs == US_ASCII) {
 323             return decodeASCII(ba, off, len);
 324         }
 325         StringDecoder sd = deref(decoder);
 326         if (sd == null || !cs.name().equals(sd.cs.name())) {
 327             sd = new StringDecoder(cs, cs.name());
 328             set(decoder, sd);
 329         }
 330         return sd.decode(ba, off, len);
 331     }
 332 
 333     // -- Encoding --
 334     private static class StringEncoder {
 335         private Charset cs;
 336         private CharsetEncoder ce;
 337         private final boolean isASCIICompatible;
 338         private final String requestedCharsetName;
 339         private final boolean isTrusted;
 340 
 341         private StringEncoder(Charset cs, String rcn) {
 342             this.requestedCharsetName = rcn;
 343             this.cs = cs;
 344             this.ce = cs.newEncoder()
 345                 .onMalformedInput(CodingErrorAction.REPLACE)
 346                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 347             this.isTrusted = (cs.getClass().getClassLoader0() == null);
 348             this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
 349                     ((ArrayEncoder)ce).isASCIICompatible();
 350         }
 351 
 352         String charsetName() {
 353             if (cs instanceof HistoricallyNamedCharset)
 354                 return ((HistoricallyNamedCharset)cs).historicalName();
 355             return cs.name();
 356         }
 357 
 358         final String requestedCharsetName() {
 359             return requestedCharsetName;
 360         }
 361 
 362         byte[] encode(byte coder, byte[] val) {
 363             // fastpath for ascii compatible
 364             if (coder == LATIN1 && isASCIICompatible &&
 365                 !hasNegatives(val, 0, val.length)) {
 366                 return Arrays.copyOf(val, val.length);
 367             }
 368             int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 369             int en = scale(len, ce.maxBytesPerChar());
 370             byte[] ba = new byte[en];
 371             if (len == 0) {
 372                 return ba;
 373             }
 374             if (ce instanceof ArrayEncoder) {
 375                 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 376                                               : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 377                 if (blen != -1) {
 378                     return safeTrim(ba, blen, isTrusted);
 379                 }
 380             }
 381             char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 382                                            : StringUTF16.toChars(val);
 383             ce.reset();
 384             ByteBuffer bb = ByteBuffer.wrap(ba);
 385             CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 386             try {
 387                 CoderResult cr = ce.encode(cb, bb, true);
 388                 if (!cr.isUnderflow())
 389                     cr.throwException();
 390                 cr = ce.flush(bb);
 391                 if (!cr.isUnderflow())
 392                     cr.throwException();
 393             } catch (CharacterCodingException x) {
 394                 // Substitution is always enabled,
 395                 // so this shouldn't happen
 396                 throw new Error(x);
 397             }
 398             return safeTrim(ba, bb.position(), isTrusted);
 399         }
 400     }
 401 
 402     static byte[] encode(String charsetName, byte coder, byte[] val)
 403         throws UnsupportedEncodingException
 404     {
 405         StringEncoder se = deref(encoder);
 406         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 407         if ((se == null) || !(csn.equals(se.requestedCharsetName())
 408                               || csn.equals(se.charsetName()))) {
 409             se = null;
 410             try {
 411                 Charset cs = lookupCharset(csn);
 412                 if (cs != null) {
 413                     if (cs == UTF_8) {
 414                         return encodeUTF8(coder, val, true);
 415                     }
 416                     if (cs == ISO_8859_1) {
 417                         return encode8859_1(coder, val);
 418                     }
 419                     if (cs == US_ASCII) {
 420                         return encodeASCII(coder, val);
 421                     }
 422                     se = new StringEncoder(cs, csn);
 423                 }
 424             } catch (IllegalCharsetNameException x) {}
 425             if (se == null) {
 426                 throw new UnsupportedEncodingException (csn);
 427             }
 428             set(encoder, se);
 429         }
 430         return se.encode(coder, val);
 431     }
 432 
 433     static byte[] encode(Charset cs, byte coder, byte[] val) {
 434         if (cs == UTF_8) {
 435             return encodeUTF8(coder, val, true);
 436         }
 437         if (cs == ISO_8859_1) {
 438             return encode8859_1(coder, val);
 439         }
 440         if (cs == US_ASCII) {
 441             return encodeASCII(coder, val);
 442         }
 443         CharsetEncoder ce = cs.newEncoder();
 444         // fastpath for ascii compatible
 445         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
 446                                  ((ArrayEncoder)ce).isASCIICompatible() &&
 447                                  !hasNegatives(val, 0, val.length)))) {
 448             return Arrays.copyOf(val, val.length);
 449         }
 450         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 451         int en = scale(len, ce.maxBytesPerChar());
 452         byte[] ba = new byte[en];
 453         if (len == 0) {
 454             return ba;
 455         }
 456         ce.onMalformedInput(CodingErrorAction.REPLACE)
 457           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 458           .reset();
 459         if (ce instanceof ArrayEncoder) {
 460             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 461                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 462             if (blen != -1) {
 463                 return safeTrim(ba, blen, true);
 464             }
 465         }
 466         boolean isTrusted = cs.getClass().getClassLoader0() == null ||
 467                             System.getSecurityManager() == null;
 468         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 469                                        : StringUTF16.toChars(val);
 470         ByteBuffer bb = ByteBuffer.wrap(ba);
 471         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 472         try {
 473             CoderResult cr = ce.encode(cb, bb, true);
 474             if (!cr.isUnderflow())
 475                 cr.throwException();
 476             cr = ce.flush(bb);
 477             if (!cr.isUnderflow())
 478                 cr.throwException();
 479         } catch (CharacterCodingException x) {
 480             throw new Error(x);
 481         }
 482         return safeTrim(ba, bb.position(), isTrusted);
 483     }
 484 
 485     static byte[] encode(byte coder, byte[] val) {
 486         Charset cs = Charset.defaultCharset();
 487         if (cs == UTF_8) {
 488             return encodeUTF8(coder, val, true);
 489         }
 490         if (cs == ISO_8859_1) {
 491             return encode8859_1(coder, val);
 492         }
 493         if (cs == US_ASCII) {
 494             return encodeASCII(coder, val);
 495         }
 496         StringEncoder se = deref(encoder);
 497         if (se == null || !cs.name().equals(se.cs.name())) {
 498             se = new StringEncoder(cs, cs.name());
 499             set(encoder, se);
 500         }
 501         return se.encode(coder, val);
 502     }
 503 
  504     /**
  505      *  Prints a message directly to stderr, bypassing all character conversion
  506      *  methods. Declared native, so the implementation lives outside this file.
  507      *  @param msg  message to print
  508      */
  509     private static native void err(String msg);
 510 
  511      /* Per-thread reusable Result, created on first use by initialValue(); the ASCII, Latin-1 and UTF-8 decoders fill it in and return it */
  512     private static final ThreadLocal<StringCoding.Result>
  513         resultCached = new ThreadLocal<>() {
  514             protected StringCoding.Result initialValue() {
  515                 return new StringCoding.Result();
  516             }};
 517 
 518     ////////////////////////// ascii //////////////////////////////
 519 
 520     private static Result decodeASCII(byte[] ba, int off, int len) {
 521         Result result = resultCached.get();
 522         if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
 523             return result.with(Arrays.copyOfRange(ba, off, off + len),
 524                                LATIN1);
 525         }
 526         byte[] dst = new byte[len<<1];
 527         int dp = 0;
 528         while (dp < len) {
 529             int b = ba[off++];
 530             putChar(dst, dp++, (b >= 0) ? (char)b : repl);
 531         }
 532         return result.with(dst, UTF16);
 533     }
 534 
 535     private static byte[] encodeASCII(byte coder, byte[] val) {
 536         if (coder == LATIN1) {
 537             byte[] dst = new byte[val.length];
 538             for (int i = 0; i < val.length; i++) {
 539                 if (val[i] < 0) {
 540                     dst[i] = '?';
 541                 } else {
 542                     dst[i] = val[i];
 543                 }
 544             }
 545             return dst;
 546         }
 547         int len = val.length >> 1;
 548         byte[] dst = new byte[len];
 549         int dp = 0;
 550         for (int i = 0; i < len; i++) {
 551             char c = StringUTF16.getChar(val, i);
 552             if (c < 0x80) {
 553                 dst[dp++] = (byte)c;
 554                 continue;
 555             }
 556             if (Character.isHighSurrogate(c) && i + 1 < len &&
 557                 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
 558                 i++;
 559             }
 560             dst[dp++] = '?';
 561         }
 562         if (len == dp) {
 563             return dst;
 564         }
 565         return Arrays.copyOf(dst, dp);
 566     }
 567 
 568     ////////////////////////// latin1/8859_1 ///////////////////////////
 569 
 570     private static Result decodeLatin1(byte[] ba, int off, int len) {
 571        Result result = resultCached.get();
 572        if (COMPACT_STRINGS) {
 573            return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
 574        } else {
 575            return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 576        }
 577     }
 578 
 579     @HotSpotIntrinsicCandidate
 580     private static int implEncodeISOArray(byte[] sa, int sp,
 581                                           byte[] da, int dp, int len) {
 582         int i = 0;
 583         for (; i < len; i++) {
 584             char c = StringUTF16.getChar(sa, sp++);
 585             if (c > '\u00FF')
 586                 break;
 587             da[dp++] = (byte)c;
 588         }
 589         return i;
 590     }
 591 
 592     private static byte[] encode8859_1(byte coder, byte[] val) {
 593         if (coder == LATIN1) {
 594             return Arrays.copyOf(val, val.length);
 595         }
 596         int len = val.length >> 1;
 597         byte[] dst = new byte[len];
 598         int dp = 0;
 599         int sp = 0;
 600         int sl = len;
 601         while (sp < sl) {
 602             int ret = implEncodeISOArray(val, sp, dst, dp, len);
 603             sp = sp + ret;
 604             dp = dp + ret;
 605             if (ret != len) {
 606                 char c = StringUTF16.getChar(val, sp++);
 607                 if (Character.isHighSurrogate(c) && sp < sl &&
 608                     Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
 609                     sp++;
 610                 }
 611                 dst[dp++] = '?';
 612                 len = sl - sp;
 613             }
 614         }
 615         if (dp == dst.length) {
 616             return dst;
 617         }
 618         return Arrays.copyOf(dst, dp);
 619     }
 620 
 621     //////////////////////////////// utf8 ////////////////////////////////////
 622 
 623     private static boolean isNotContinuation(int b) {
 624         return (b & 0xc0) != 0x80;
 625     }
 626 
 627     private static boolean isMalformed3(int b1, int b2, int b3) {
 628         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 629                (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
 630     }
 631 
 632     private static boolean isMalformed3_2(int b1, int b2) {
 633         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 634                (b2 & 0xc0) != 0x80;
 635     }
 636 
 637     private static boolean isMalformed4(int b2, int b3, int b4) {
 638         return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
 639                (b4 & 0xc0) != 0x80;
 640     }
 641 
 642     private static boolean isMalformed4_2(int b1, int b2) {
 643         return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
 644                (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 645                (b2 & 0xc0) != 0x80;
 646     }
 647 
 648     private static boolean isMalformed4_3(int b3) {
 649         return (b3 & 0xc0) != 0x80;
 650     }
 651 
 652     // for nb == 3/4
 653     private static int malformedN(byte[] src, int sp, int nb) {
 654         if (nb == 3) {
 655             int b1 = src[sp++];
 656             int b2 = src[sp++];    // no need to lookup b3
 657             return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 658                     isNotContinuation(b2)) ? 1 : 2;
 659         } else if (nb == 4) { // we don't care the speed here
 660             int b1 = src[sp++] & 0xff;
 661             int b2 = src[sp++] & 0xff;
 662             if (b1 > 0xf4 ||
 663                 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
 664                 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 665                 isNotContinuation(b2))
 666                 return 1;
 667             if (isNotContinuation(src[sp++]))
 668                 return 2;
 669             return 3;
 670         }
 671         assert false;
 672         return -1;
 673     }
 674 
 675     private static void throwMalformed(int off, int nb) {
 676         throw new IllegalArgumentException("malformed input off : " + off +
 677                                            ", length : " + nb);
 678     }
 679 
 680     private static char repl = '\ufffd';
 681 
 682     private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
 683         // ascii-bais, which has a relative impact to the non-ascii-only bytes
 684         if (COMPACT_STRINGS && !hasNegatives(src, sp, len))
 685             return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len),
 686                                            LATIN1);
 687         return decodeUTF8_0(src, sp, len, doReplace);
 688     }
 689 
 690     private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
 691         Result ret = resultCached.get();
 692 
 693         int sl = sp + len;
 694         int dp = 0;
 695         byte[] dst = new byte[len];
 696 
 697         if (COMPACT_STRINGS) {
 698             while (sp < sl) {
 699                 int b1 = src[sp];
 700                 if (b1 >= 0) {
 701                     dst[dp++] = (byte)b1;
 702                     sp++;
 703                     continue;
 704                 }
 705                 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
 706                     sp + 1 < sl) {
 707                     int b2 = src[sp + 1];
 708                     if (!isNotContinuation(b2)) {
 709                         dst[dp++] = (byte)(((b1 << 6) ^ b2)^
 710                                            (((byte) 0xC0 << 6) ^
 711                                            ((byte) 0x80 << 0)));
 712                         sp += 2;
 713                         continue;
 714                     }
 715                 }
 716                 // anything not a latin1, including the repl
 717                 // we have to go with the utf16
 718                 break;
 719             }
 720             if (sp == sl) {
 721                 if (dp != dst.length) {
 722                     dst = Arrays.copyOf(dst, dp);
 723                 }
 724                 return ret.with(dst, LATIN1);
 725             }
 726         }
 727         if (dp == 0) {
 728             dst = new byte[len << 1];
 729         } else {
 730             byte[] buf = new byte[len << 1];
 731             StringLatin1.inflate(dst, 0, buf, 0, dp);
 732             dst = buf;
 733         }
 734         while (sp < sl) {
 735             int b1 = src[sp++];
 736             if (b1 >= 0) {
 737                 putChar(dst, dp++, (char) b1);
 738             } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
 739                 if (sp < sl) {
 740                     int b2 = src[sp++];
 741                     if (isNotContinuation(b2)) {
 742                         if (!doReplace) {
 743                             throwMalformed(sp - 1, 1);
 744                         }
 745                         putChar(dst, dp++, repl);
 746                         sp--;
 747                     } else {
 748                         putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
 749                                                   (((byte) 0xC0 << 6) ^
 750                                                   ((byte) 0x80 << 0))));
 751                     }
 752                     continue;
 753                 }
 754                 if (!doReplace) {
 755                     throwMalformed(sp, 1);  // underflow()
 756                 }
 757                 putChar(dst, dp++, repl);
 758                 break;
 759             } else if ((b1 >> 4) == -2) {
 760                 if (sp + 1 < sl) {
 761                     int b2 = src[sp++];
 762                     int b3 = src[sp++];
 763                     if (isMalformed3(b1, b2, b3)) {
 764                         if (!doReplace) {
 765                             throwMalformed(sp - 3, 3);
 766                         }
 767                         putChar(dst, dp++, repl);
 768                         sp -= 3;
 769                         sp += malformedN(src, sp, 3);
 770                     } else {
 771                         char c = (char)((b1 << 12) ^
 772                                         (b2 <<  6) ^
 773                                         (b3 ^
 774                                          (((byte) 0xE0 << 12) ^
 775                                          ((byte) 0x80 <<  6) ^
 776                                          ((byte) 0x80 <<  0))));
 777                         if (isSurrogate(c)) {
 778                             if (!doReplace) {
 779                                 throwMalformed(sp - 3, 3);
 780                             }
 781                             putChar(dst, dp++, repl);
 782                         } else {
 783                             putChar(dst, dp++, c);
 784                         }
 785                     }
 786                     continue;
 787                 }
 788                 if (sp  < sl && isMalformed3_2(b1, src[sp])) {
 789                     if (!doReplace) {
 790                         throwMalformed(sp - 1, 2);
 791                     }
 792                     putChar(dst, dp++, repl);
 793                     continue;
 794                 }
 795                 if (!doReplace){
 796                     throwMalformed(sp, 1);
 797                 }
 798                 putChar(dst, dp++, repl);
 799                 break;
 800             } else if ((b1 >> 3) == -2) {
 801                 if (sp + 2 < sl) {
 802                     int b2 = src[sp++];
 803                     int b3 = src[sp++];
 804                     int b4 = src[sp++];
 805                     int uc = ((b1 << 18) ^
 806                               (b2 << 12) ^
 807                               (b3 <<  6) ^
 808                               (b4 ^
 809                                (((byte) 0xF0 << 18) ^
 810                                ((byte) 0x80 << 12) ^
 811                                ((byte) 0x80 <<  6) ^
 812                                ((byte) 0x80 <<  0))));
 813                     if (isMalformed4(b2, b3, b4) ||
 814                         !isSupplementaryCodePoint(uc)) { // shortest form check
 815                         if (!doReplace) {
 816                             throwMalformed(sp - 4, 4);
 817                         }
 818                         putChar(dst, dp++, repl);
 819                         sp -= 4;
 820                         sp += malformedN(src, sp, 4);
 821                     } else {
 822                         putChar(dst, dp++, highSurrogate(uc));
 823                         putChar(dst, dp++, lowSurrogate(uc));
 824                     }
 825                     continue;
 826                 }
 827                 b1 &= 0xff;
 828                 if (b1 > 0xf4 ||
 829                     sp  < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
 830                     if (!doReplace) {
 831                         throwMalformed(sp - 1, 1);  // or 2
 832                     }
 833                     putChar(dst, dp++, repl);
 834                     continue;
 835                 }
 836                 if (!doReplace) {
 837                     throwMalformed(sp - 1, 1);
 838                 }
 839                 sp++;
 840                 putChar(dst, dp++, repl);
 841                 if (sp  < sl && isMalformed4_3(src[sp])) {
 842                     continue;
 843                 }
 844                 break;
 845             } else {
 846                 if (!doReplace) {
 847                     throwMalformed(sp - 1, 1);
 848                 }
 849                 putChar(dst, dp++, repl);
 850             }
 851         }
 852         if (dp != len) {
 853             dst = Arrays.copyOf(dst, dp << 1);
 854         }
 855         return ret.with(dst, UTF16);
 856     }
 857 
 858     private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
 859         if (coder == UTF16)
 860             return encodeUTF8_UTF16(val, doReplace);
 861 
 862         if (!hasNegatives(val, 0, val.length))
 863             return Arrays.copyOf(val, val.length);
 864 
 865         int dp = 0;
 866         byte[] dst = new byte[val.length << 1];
 867         for (int sp = 0; sp < val.length; sp++) {
 868             byte c = val[sp];
 869             if (c < 0) {
 870                 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
 871                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 872             } else {
 873                 dst[dp++] = c;
 874             }
 875         }
 876         if (dp == dst.length)
 877             return dst;
 878         return Arrays.copyOf(dst, dp);
 879     }
 880 
 881     private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
 882         int dp = 0;
 883         int sp = 0;
 884         int sl = val.length >> 1;
 885         byte[] dst = new byte[sl * 3];
 886         char c;
 887         while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
 888             // ascii fast loop;
 889             dst[dp++] = (byte)c;
 890             sp++;
 891         }
 892         while (sp < sl) {
 893             c = StringUTF16.getChar(val, sp++);
 894             if (c < 0x80) {
 895                 dst[dp++] = (byte)c;
 896             } else if (c < 0x800) {
 897                 dst[dp++] = (byte)(0xc0 | (c >> 6));
 898                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 899             } else if (Character.isSurrogate(c)) {
 900                 int uc = -1;
 901                 char c2;
 902                 if (Character.isHighSurrogate(c) && sp < sl &&
 903                     Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
 904                     uc = Character.toCodePoint(c, c2);
 905                 }
 906                 if (uc < 0) {
 907                     if (doReplace) {
 908                         dst[dp++] = '?';
 909                     } else {
 910                         throwMalformed(sp - 1, 1); // or 2, does not matter here
 911                     }
 912                 } else {
 913                     dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
 914                     dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 915                     dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 916                     dst[dp++] = (byte)(0x80 | (uc & 0x3f));
 917                     sp++;  // 2 chars
 918                 }
 919             } else {
 920                 // 3 bytes, 16 bits
 921                 dst[dp++] = (byte)(0xe0 | ((c >> 12)));
 922                 dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 923                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 924             }
 925         }
 926         if (dp == dst.length) {
 927             return dst;
 928         }
 929         return Arrays.copyOf(dst, dp);
 930     }
 931 
 932     ////////////////////// for j.u.z.ZipCoder //////////////////////////
 933 
 934     /*
 935      * Throws iae, instead of replacing, if malformed or unmappable.
 936      */
 937     static String newStringUTF8NoRepl(byte[] src, int off, int len) {
 938         if (COMPACT_STRINGS && !hasNegatives(src, off, len))
 939             return new String(Arrays.copyOfRange(src, off, off + len), LATIN1);
 940         Result ret = decodeUTF8_0(src, off, len, false);
 941         return new String(ret.value, ret.coder);
 942     }
 943 
 944     /*
 945      * Throws iae, instead of replacing, if unmappable.
 946      */
 947     static byte[] getBytesUTF8NoRepl(String s) {
 948         return encodeUTF8(s.coder(), s.value(), false);
 949     }
 950 }