Old src/java.base/share/classes/java/lang/StringCoding.java

   1 /*
   2  * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.lang;
  27 
  28 import java.io.UnsupportedEncodingException;
  29 import java.lang.ref.SoftReference;
  30 import java.nio.ByteBuffer;
  31 import java.nio.CharBuffer;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.CharsetDecoder;
  34 import java.nio.charset.CharsetEncoder;
  35 import java.nio.charset.CharacterCodingException;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.IllegalCharsetNameException;
  39 import java.nio.charset.UnsupportedCharsetException;
  40 import java.util.Arrays;
  41 import jdk.internal.HotSpotIntrinsicCandidate;
  42 import sun.nio.cs.HistoricallyNamedCharset;
  43 import sun.nio.cs.ArrayDecoder;
  44 import sun.nio.cs.ArrayEncoder;
  45 import sun.nio.cs.StandardCharsets;
  46 
  47 import static java.lang.String.LATIN1;
  48 import static java.lang.String.UTF16;
  49 import static java.lang.String.COMPACT_STRINGS;
  50 import static java.lang.Character.isSurrogate;
  51 import static java.lang.Character.highSurrogate;
  52 import static java.lang.Character.lowSurrogate;
  53 import static java.lang.Character.isSupplementaryCodePoint;
  54 import static java.lang.StringUTF16.putChar;
  55 import static java.nio.charset.StandardCharsets.ISO_8859_1;
  56 import static java.nio.charset.StandardCharsets.US_ASCII;
  57 import static java.nio.charset.StandardCharsets.UTF_8;
  58 
  59 /**
  60  * Utility class for string encoding and decoding.
  61  */
  62 
  63 class StringCoding {
  64 
    // Static-only utility holder: the private constructor keeps it from being instantiated.
    private StringCoding() { }
  66 
    /**
     * The cached coders for each thread. Each slot holds a SoftReference to
     * the coder most recently stored through set(...); deref(...) yields
     * null when nothing has been stored for the current thread or the
     * reference no longer has a referent.
     */
    private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
        new ThreadLocal<>();
    private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
        new ThreadLocal<>();
  72 
  73     private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
  74         SoftReference<T> sr = tl.get();
  75         if (sr == null)
  76             return null;
  77         return sr.get();
  78     }
  79 
  80     private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
  81         tl.set(new SoftReference<>(ob));
  82     }
  83 
  84     // Trim the given byte array to the given length
  85     private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
  86         if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
  87             return ba;
  88         else
  89             return Arrays.copyOf(ba, len);
  90     }
  91 
  92     private static int scale(int len, float expansionFactor) {
  93         // We need to perform double, not float, arithmetic; otherwise
  94         // we lose low order bits when len is larger than 2**24.
  95         return (int)(len * (double)expansionFactor);
  96     }
  97 
  98     private static Charset lookupCharset(String csn) {
  99         if (Charset.isSupported(csn)) {
 100             try {
 101                 return Charset.forName(csn);
 102             } catch (UnsupportedCharsetException x) {
 103                 throw new Error(x);
 104             }
 105         }
 106         return null;
 107     }
 108 
 109     static class Result {
 110         byte[] value;
 111         byte coder;
 112 
 113         Result with() {
 114             coder = COMPACT_STRINGS ? LATIN1 : UTF16;
 115             value = new byte[0];
 116             return this;
 117         }
 118 
 119         Result with(char[] val, int off, int len) {
 120             if (String.COMPACT_STRINGS) {
 121                 byte[] bs = StringUTF16.compress(val, off, len);
 122                 if (bs != null) {
 123                     value = bs;
 124                     coder = LATIN1;
 125                     return this;
 126                 }
 127             }
 128             coder = UTF16;
 129             value = StringUTF16.toBytes(val, off, len);
 130             return this;
 131         }
 132 
 133         Result with(byte[] val, byte coder) {
 134             this.coder = coder;
 135             value = val;
 136             return this;
 137         }
 138     }
 139 
 140     @HotSpotIntrinsicCandidate
 141     public static boolean hasNegatives(byte[] ba, int off, int len) {
 142         for (int i = off; i < off + len; i++) {
 143             if (ba[i] < 0) {
 144                 return true;
 145             }
 146         }
 147         return false;
 148     }
 149 
 150     // -- Decoding --
 151     static class StringDecoder {
 152         private final String requestedCharsetName;
 153         private final Charset cs;
 154         private final boolean isASCIICompatible;
 155         private final CharsetDecoder cd;
 156         protected final Result result;
 157 
 158         StringDecoder(Charset cs, String rcn) {
 159             this.requestedCharsetName = rcn;
 160             this.cs = cs;
 161             this.cd = cs.newDecoder()
 162                 .onMalformedInput(CodingErrorAction.REPLACE)
 163                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 164             this.result = new Result();
 165             this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
 166                     ((ArrayDecoder)cd).isASCIICompatible();
 167         }
 168 
 169         String charsetName() {
 170             if (cs instanceof HistoricallyNamedCharset)
 171                 return ((HistoricallyNamedCharset)cs).historicalName();
 172             return cs.name();
 173         }
 174 
 175         final String requestedCharsetName() {
 176             return requestedCharsetName;
 177         }
 178 
 179         Result decode(byte[] ba, int off, int len) {
 180             if (len == 0) {
 181                 return result.with();
 182             }
 183             // fastpath for ascii compatible
 184             if (isASCIICompatible && !hasNegatives(ba, off, len)) {
 185                 if (COMPACT_STRINGS) {
 186                     return result.with(Arrays.copyOfRange(ba, off, off + len),
 187                                       LATIN1);
 188                 } else {
 189                     return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 190                 }
 191             }
 192             int en = scale(len, cd.maxCharsPerByte());
 193             char[] ca = new char[en];
 194             if (cd instanceof ArrayDecoder) {
 195                 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 196                 return result.with(ca, 0, clen);
 197             }
 198             cd.reset();
 199             ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 200             CharBuffer cb = CharBuffer.wrap(ca);
 201             try {
 202                 CoderResult cr = cd.decode(bb, cb, true);
 203                 if (!cr.isUnderflow())
 204                     cr.throwException();
 205                 cr = cd.flush(cb);
 206                 if (!cr.isUnderflow())
 207                     cr.throwException();
 208             } catch (CharacterCodingException x) {
 209                 // Substitution is always enabled,
 210                 // so this shouldn't happen
 211                 throw new Error(x);
 212             }
 213             return result.with(ca, 0, cb.position());
 214         }
 215     }
 216 
 217     static Result decode(String charsetName, byte[] ba, int off, int len)
 218         throws UnsupportedEncodingException
 219     {
 220         StringDecoder sd = deref(decoder);
 221         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 222         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
 223                               || csn.equals(sd.charsetName()))) {
 224             sd = null;
 225             try {
 226                 Charset cs = lookupCharset(csn);
 227                 if (cs != null) {
 228                     if (cs == UTF_8) {
 229                         return decodeUTF8(ba, off, len, true);
 230                     }
 231                     if (cs == ISO_8859_1) {
 232                         return decodeLatin1(ba, off, len);
 233                     }
 234                     if (cs == US_ASCII) {
 235                         return decodeASCII(ba, off, len);
 236                     }
 237                     sd = new StringDecoder(cs, csn);
 238                 }
 239             } catch (IllegalCharsetNameException x) {}
 240             if (sd == null)
 241                 throw new UnsupportedEncodingException(csn);
 242             set(decoder, sd);
 243         }
 244         return sd.decode(ba, off, len);
 245     }
 246 
    /**
     * Decodes ba[off, off + len) using the given charset, without consulting
     * or updating the per-thread decoder cache.
     *
     * UTF-8, ISO-8859-1 and US-ASCII are dispatched to their dedicated
     * decoders. For any other charset a new CharsetDecoder is created for
     * this call and configured to replace malformed and unmappable input.
     *
     * @param cs  the charset to decode with
     * @param ba  the source bytes
     * @param off index of the first byte to decode
     * @param len number of bytes to decode
     * @return a Result holding the decoded string bytes and coder
     * @throws Error if the decoder reports a CharacterCodingException even
     *         though replacement is configured
     */
    static Result decode(Charset cs, byte[] ba, int off, int len) {
        if (cs == UTF_8) {
            return decodeUTF8(ba, off, len, true);
        }
        if (cs == ISO_8859_1) {
            return decodeLatin1(ba, off, len);
        }
        if (cs == US_ASCII) {
            return decodeASCII(ba, off, len);
        }

        // (1) We never cache the "external" cs. The only benefit of creating
        // an additional StringDe/Encoder object to wrap it would be to share
        // the de/encode() method. Such SD/E objects are short-lived and the
        // young-gen GC should handle them well, but the best approach is
        // still not to create them unless really necessary.
        // (2) The defensive copy of the input byte/char[] has a big
        // performance impact, as does copying the outgoing result
        // byte/char[]. Both need the optimization check of
        // (sm == null && classLoader0 == null).
        // (3) There might be a timing gap in the isTrusted setting.
        // getClassLoader0() is only checked (and isTrusted then set) when
        // SM == null. It is possible that SM == null now but SM is NOT null
        // later when safeTrim() is invoked. The "safe" way would be a
        // redundant check (... && (isTrusted || SM == null ||
        // getClassLoader0())) in the trim, but it can be argued that the SM
        // was null when the operation started.
        CharsetDecoder cd = cs.newDecoder();
        // ascii fastpath: ASCII-compatible decoder and no high-bit bytes
        if ((cd instanceof ArrayDecoder) &&
            ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
            return decodeLatin1(ba, off, len);
        }
        // upper bound on the number of chars the decoder may produce
        int en = scale(len, cd.maxCharsPerByte());
        if (len == 0) {
            return new Result().with();
        }
        cd.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();
        char[] ca = new char[en];
        if (cd instanceof ArrayDecoder) {
            int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
            return new Result().with(ca, 0, clen);
        }
        // Hand the decoder a private copy of the input when the charset
        // class has a non-null class loader and a security manager is set.
        if (cs.getClass().getClassLoader0() != null &&
            System.getSecurityManager() != null) {
            ba = Arrays.copyOfRange(ba, off, off + len);
            off = 0;
        }
        ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
        CharBuffer cb = CharBuffer.wrap(ca);
        try {
            CoderResult cr = cd.decode(bb, cb, true);
            if (!cr.isUnderflow())
                cr.throwException();
            cr = cd.flush(cb);
            if (!cr.isUnderflow())
                cr.throwException();
        } catch (CharacterCodingException x) {
            // Substitution is always enabled,
            // so this shouldn't happen
            throw new Error(x);
        }
        return new Result().with(ca, 0, cb.position());
    }
 312 
 313     static Result decode(byte[] ba, int off, int len) {
 314         Charset cs = Charset.defaultCharset();
 315         if (cs == UTF_8) {
 316             return decodeUTF8(ba, off, len, true);
 317         }
 318         if (cs == ISO_8859_1) {
 319             return decodeLatin1(ba, off, len);
 320         }
 321         if (cs == US_ASCII) {
 322             return decodeASCII(ba, off, len);
 323         }
 324         StringDecoder sd = deref(decoder);
 325         if (sd == null || !cs.name().equals(sd.cs.name())) {
 326             sd = new StringDecoder(cs, cs.name());
 327             set(decoder, sd);
 328         }
 329         return sd.decode(ba, off, len);
 330     }
 331 
 332     // -- Encoding --
 333     private static class StringEncoder {
 334         private Charset cs;
 335         private CharsetEncoder ce;
 336         private final boolean isASCIICompatible;
 337         private final String requestedCharsetName;
 338         private final boolean isTrusted;
 339 
 340         private StringEncoder(Charset cs, String rcn) {
 341             this.requestedCharsetName = rcn;
 342             this.cs = cs;
 343             this.ce = cs.newEncoder()
 344                 .onMalformedInput(CodingErrorAction.REPLACE)
 345                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 346             this.isTrusted = (cs.getClass().getClassLoader0() == null);
 347             this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
 348                     ((ArrayEncoder)ce).isASCIICompatible();
 349         }
 350 
 351         String charsetName() {
 352             if (cs instanceof HistoricallyNamedCharset)
 353                 return ((HistoricallyNamedCharset)cs).historicalName();
 354             return cs.name();
 355         }
 356 
 357         final String requestedCharsetName() {
 358             return requestedCharsetName;
 359         }
 360 
 361         byte[] encode(byte coder, byte[] val) {
 362             // fastpath for ascii compatible
 363             if (coder == LATIN1 && isASCIICompatible &&
 364                 !hasNegatives(val, 0, val.length)) {
 365                 return Arrays.copyOf(val, val.length);
 366             }
 367             int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 368             int en = scale(len, ce.maxBytesPerChar());
 369             byte[] ba = new byte[en];
 370             if (len == 0) {
 371                 return ba;
 372             }
 373             if (ce instanceof ArrayEncoder) {
 374                 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 375                                               : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 376                 if (blen != -1) {
 377                     return safeTrim(ba, blen, isTrusted);
 378                 }
 379             }
 380             char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 381                                            : StringUTF16.toChars(val);
 382             ce.reset();
 383             ByteBuffer bb = ByteBuffer.wrap(ba);
 384             CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 385             try {
 386                 CoderResult cr = ce.encode(cb, bb, true);
 387                 if (!cr.isUnderflow())
 388                     cr.throwException();
 389                 cr = ce.flush(bb);
 390                 if (!cr.isUnderflow())
 391                     cr.throwException();
 392             } catch (CharacterCodingException x) {
 393                 // Substitution is always enabled,
 394                 // so this shouldn't happen
 395                 throw new Error(x);
 396             }
 397             return safeTrim(ba, bb.position(), isTrusted);
 398         }
 399     }
 400 
 401     static byte[] encode(String charsetName, byte coder, byte[] val)
 402         throws UnsupportedEncodingException
 403     {
 404         StringEncoder se = deref(encoder);
 405         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 406         if ((se == null) || !(csn.equals(se.requestedCharsetName())
 407                               || csn.equals(se.charsetName()))) {
 408             se = null;
 409             try {
 410                 Charset cs = lookupCharset(csn);
 411                 if (cs != null) {
 412                     if (cs == UTF_8) {
 413                         return encodeUTF8(coder, val, true);
 414                     }
 415                     if (cs == ISO_8859_1) {
 416                         return encode8859_1(coder, val);
 417                     }
 418                     if (cs == US_ASCII) {
 419                         return encodeASCII(coder, val);
 420                     }
 421                     se = new StringEncoder(cs, csn);
 422                 }
 423             } catch (IllegalCharsetNameException x) {}
 424             if (se == null) {
 425                 throw new UnsupportedEncodingException (csn);
 426             }
 427             set(encoder, se);
 428         }
 429         return se.encode(coder, val);
 430     }
 431 
    /**
     * Encodes a string's backing bytes using the given charset, without
     * consulting or updating the per-thread encoder cache.
     *
     * UTF-8, ISO-8859-1 and US-ASCII are dispatched to their dedicated
     * encoders. For any other charset a new CharsetEncoder is created for
     * this call and configured to replace malformed and unmappable input.
     *
     * @param cs    the charset to encode with
     * @param coder LATIN1 or UTF16, describing the layout of val
     * @param val   the string's backing bytes
     * @return the encoded bytes
     * @throws Error if the encoder reports a CharacterCodingException
     */
    static byte[] encode(Charset cs, byte coder, byte[] val) {
        if (cs == UTF_8) {
            return encodeUTF8(coder, val, true);
        }
        if (cs == ISO_8859_1) {
            return encode8859_1(coder, val);
        }
        if (cs == US_ASCII) {
            return encodeASCII(coder, val);
        }
        CharsetEncoder ce = cs.newEncoder();
        // fastpath for ascii compatible: LATIN1 bytes without the high bit
        // set are already the encoded form, so a copy is returned
        if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
                                 ((ArrayEncoder)ce).isASCIICompatible() &&
                                 !hasNegatives(val, 0, val.length)))) {
            return Arrays.copyOf(val, val.length);
        }
        int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
        // upper bound on the number of bytes the encoder may produce
        int en = scale(len, ce.maxBytesPerChar());
        byte[] ba = new byte[en];
        if (len == 0) {
            return ba;
        }
        ce.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();
        if (ce instanceof ArrayEncoder) {
            int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
                                          : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
            // -1 means the array encoder declined; fall through to the
            // general buffer-based path below
            if (blen != -1) {
                return safeTrim(ba, blen, true);
            }
        }
        // The output buffer may be returned uncopied when the charset class
        // reports no class loader or no security manager is installed.
        boolean isTrusted = cs.getClass().getClassLoader0() == null ||
                            System.getSecurityManager() == null;
        char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
                                       : StringUTF16.toChars(val);
        ByteBuffer bb = ByteBuffer.wrap(ba);
        CharBuffer cb = CharBuffer.wrap(ca, 0, len);
        try {
            CoderResult cr = ce.encode(cb, bb, true);
            if (!cr.isUnderflow())
                cr.throwException();
            cr = ce.flush(bb);
            if (!cr.isUnderflow())
                cr.throwException();
        } catch (CharacterCodingException x) {
            // replacement is configured above, so this is not expected
            throw new Error(x);
        }
        return safeTrim(ba, bb.position(), isTrusted);
    }
 483 
 484     static byte[] encode(byte coder, byte[] val) {
 485         Charset cs = Charset.defaultCharset();
 486         if (cs == UTF_8) {
 487             return encodeUTF8(coder, val, true);
 488         }
 489         if (cs == ISO_8859_1) {
 490             return encode8859_1(coder, val);
 491         }
 492         if (cs == US_ASCII) {
 493             return encodeASCII(coder, val);
 494         }
 495         StringEncoder se = deref(encoder);
 496         if (se == null || !cs.name().equals(se.cs.name())) {
 497             se = new StringEncoder(cs, cs.name());
 498             set(encoder, se);
 499         }
 500         return se.encode(coder, val);
 501     }
 502 
    /**
     * Prints a message directly to stderr, bypassing all character
     * conversion methods. Implemented in native code; the implementation
     * is not part of this file.
     *
     * @param msg  message to print
     */
    private static native void err(String msg);
 509 
 510      /* The cached Result for each thread */
 511     private static final ThreadLocal<StringCoding.Result>
 512         resultCached = new ThreadLocal<>() {
 513             protected StringCoding.Result initialValue() {
 514                 return new StringCoding.Result();
 515             }};
 516 
 517     ////////////////////////// ascii //////////////////////////////
 518 
 519     private static Result decodeASCII(byte[] ba, int off, int len) {
 520         Result result = resultCached.get();
 521         if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
 522             return result.with(Arrays.copyOfRange(ba, off, off + len),
 523                                LATIN1);
 524         }
 525         byte[] dst = new byte[len<<1];
 526         int dp = 0;
 527         while (dp < len) {
 528             int b = ba[off++];
 529             putChar(dst, dp++, (b >= 0) ? (char)b : repl);
 530         }
 531         return result.with(dst, UTF16);
 532     }
 533 
 534     private static byte[] encodeASCII(byte coder, byte[] val) {
 535         if (coder == LATIN1) {
 536             byte[] dst = new byte[val.length];
 537             for (int i = 0; i < val.length; i++) {
 538                 if (val[i] < 0) {
 539                     dst[i] = '?';
 540                 } else {
 541                     dst[i] = val[i];
 542                 }
 543             }
 544             return dst;
 545         }
 546         int len = val.length >> 1;
 547         byte[] dst = new byte[len];
 548         int dp = 0;
 549         for (int i = 0; i < len; i++) {
 550             char c = StringUTF16.getChar(val, i);
 551             if (c < 0x80) {
 552                 dst[dp++] = (byte)c;
 553                 continue;
 554             }
 555             if (Character.isHighSurrogate(c) && i + 1 < len &&
 556                 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
 557                 i++;
 558             }
 559             dst[dp++] = '?';
 560         }
 561         if (len == dp) {
 562             return dst;
 563         }
 564         return Arrays.copyOf(dst, dp);
 565     }
 566 
 567     ////////////////////////// latin1/8859_1 ///////////////////////////
 568 
 569     private static Result decodeLatin1(byte[] ba, int off, int len) {
 570        Result result = resultCached.get();
 571        if (COMPACT_STRINGS) {
 572            return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
 573        } else {
 574            return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 575        }
 576     }
 577 
 578     @HotSpotIntrinsicCandidate
 579     private static int implEncodeISOArray(byte[] sa, int sp,
 580                                           byte[] da, int dp, int len) {
 581         int i = 0;
 582         for (; i < len; i++) {
 583             char c = StringUTF16.getChar(sa, sp++);
 584             if (c > '\u00FF')
 585                 break;
 586             da[dp++] = (byte)c;
 587         }
 588         return i;
 589     }
 590 
    /**
     * Encodes a string's backing bytes as ISO-8859-1, writing '?' for every
     * character above U+00FF. A high surrogate immediately followed by a
     * low surrogate yields a single '?'.
     *
     * @param coder LATIN1 or UTF16, describing the layout of val
     * @param val   the string's backing bytes
     * @return a newly allocated array holding the encoded bytes
     */
    private static byte[] encode8859_1(byte coder, byte[] val) {
        if (coder == LATIN1) {
            // LATIN1 bytes are already the encoded form; return a copy
            return Arrays.copyOf(val, val.length);
        }
        int len = val.length >> 1;      // chars still to encode
        byte[] dst = new byte[len];
        int dp = 0;                     // next write index in dst
        int sp = 0;                     // next char index in val
        int sl = len;                   // total number of chars
        while (sp < sl) {
            // bulk-copy the run of chars <= U+00FF starting at sp
            int ret = implEncodeISOArray(val, sp, dst, dp, len);
            sp = sp + ret;
            dp = dp + ret;
            if (ret != len) {
                // the run stopped early: the char at sp is not encodable
                char c = StringUTF16.getChar(val, sp++);
                if (Character.isHighSurrogate(c) && sp < sl &&
                    Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
                    // swallow the low surrogate so the pair maps to one '?'
                    sp++;
                }
                dst[dp++] = '?';
                len = sl - sp;          // remaining chars for the next pass
            }
        }
        if (dp == dst.length) {
            return dst;
        }
        // surrogate pairs collapsed to one byte, so trim the unused tail
        return Arrays.copyOf(dst, dp);
    }
 619 
 620     //////////////////////////////// utf8 ////////////////////////////////////
 621 
 622     private static boolean isNotContinuation(int b) {
 623         return (b & 0xc0) != 0x80;
 624     }
 625 
 626     private static boolean isMalformed3(int b1, int b2, int b3) {
 627         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 628                (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
 629     }
 630 
 631     private static boolean isMalformed3_2(int b1, int b2) {
 632         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 633                (b2 & 0xc0) != 0x80;
 634     }
 635 
 636     private static boolean isMalformed4(int b2, int b3, int b4) {
 637         return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
 638                (b4 & 0xc0) != 0x80;
 639     }
 640 
 641     private static boolean isMalformed4_2(int b1, int b2) {
 642         return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
 643                (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 644                (b2 & 0xc0) != 0x80;
 645     }
 646 
 647     private static boolean isMalformed4_3(int b3) {
 648         return (b3 & 0xc0) != 0x80;
 649     }
 650 
 651     // for nb == 3/4
 652     private static int malformedN(byte[] src, int sp, int nb) {
 653         if (nb == 3) {
 654             int b1 = src[sp++];
 655             int b2 = src[sp++];    // no need to lookup b3
 656             return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 657                     isNotContinuation(b2)) ? 1 : 2;
 658         } else if (nb == 4) { // we don't care the speed here
 659             int b1 = src[sp++] & 0xff;
 660             int b2 = src[sp++] & 0xff;
 661             if (b1 > 0xf4 ||
 662                 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
 663                 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 664                 isNotContinuation(b2))
 665                 return 1;
 666             if (isNotContinuation(src[sp++]))
 667                 return 2;
 668             return 3;
 669         }
 670         assert false;
 671         return -1;
 672     }
 673 
 674     private static void throwMalformed(int off, int nb) {
 675         throw new IllegalArgumentException("malformed input off : " + off +
 676                                            ", length : " + nb);
 677     }
 678 
 679     private static char repl = '\ufffd';
 680 
 681     private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
 682         // ascii-bais, which has a relative impact to the non-ascii-only bytes
 683         if (COMPACT_STRINGS && !hasNegatives(src, sp, len))
 684             return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len),
 685                                            LATIN1);
 686         return decodeUTF8_0(src, sp, len, doReplace);
 687     }
 688 
 689     private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
 690         Result ret = resultCached.get();
 691 
 692         int sl = sp + len;
 693         int dp = 0;
 694         byte[] dst = new byte[len];
 695 
 696         if (COMPACT_STRINGS) {
 697             while (sp < sl) {
 698                 int b1 = src[sp];
 699                 if (b1 >= 0) {
 700                     dst[dp++] = (byte)b1;
 701                     sp++;
 702                     continue;
 703                 }
 704                 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
 705                     sp + 1 < sl) {
 706                     int b2 = src[sp + 1];
 707                     if (!isNotContinuation(b2)) {
 708                         dst[dp++] = (byte)(((b1 << 6) ^ b2)^
 709                                            (((byte) 0xC0 << 6) ^
 710                                            ((byte) 0x80 << 0)));
 711                         sp += 2;
 712                         continue;
 713                     }
 714                 }
 715                 // anything not a latin1, including the repl
 716                 // we have to go with the utf16
 717                 break;
 718             }
 719             if (sp == sl) {
 720                 if (dp != dst.length) {
 721                     dst = Arrays.copyOf(dst, dp);
 722                 }
 723                 return ret.with(dst, LATIN1);
 724             }
 725         }
 726         if (dp == 0) {
 727             dst = new byte[len << 1];
 728         } else {
 729             byte[] buf = new byte[len << 1];
 730             StringLatin1.inflate(dst, 0, buf, 0, dp);
 731             dst = buf;
 732         }
 733         while (sp < sl) {
 734             int b1 = src[sp++];
 735             if (b1 >= 0) {
 736                 putChar(dst, dp++, (char) b1);
 737             } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
 738                 if (sp < sl) {
 739                     int b2 = src[sp++];
 740                     if (isNotContinuation(b2)) {
 741                         if (!doReplace) {
 742                             throwMalformed(sp - 1, 1);
 743                         }
 744                         putChar(dst, dp++, repl);
 745                         sp--;
 746                     } else {
 747                         putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
 748                                                   (((byte) 0xC0 << 6) ^
 749                                                   ((byte) 0x80 << 0))));
 750                     }
 751                     continue;
 752                 }
 753                 if (!doReplace) {
 754                     throwMalformed(sp, 1);  // underflow()
 755                 }
 756                 putChar(dst, dp++, repl);
 757                 break;
 758             } else if ((b1 >> 4) == -2) {
 759                 if (sp + 1 < sl) {
 760                     int b2 = src[sp++];
 761                     int b3 = src[sp++];
 762                     if (isMalformed3(b1, b2, b3)) {
 763                         if (!doReplace) {
 764                             throwMalformed(sp - 3, 3);
 765                         }
 766                         putChar(dst, dp++, repl);
 767                         sp -= 3;
 768                         sp += malformedN(src, sp, 3);
 769                     } else {
 770                         char c = (char)((b1 << 12) ^
 771                                         (b2 <<  6) ^
 772                                         (b3 ^
 773                                          (((byte) 0xE0 << 12) ^
 774                                          ((byte) 0x80 <<  6) ^
 775                                          ((byte) 0x80 <<  0))));
 776                         if (isSurrogate(c)) {
 777                             if (!doReplace) {
 778                                 throwMalformed(sp - 3, 3);
 779                             }
 780                             putChar(dst, dp++, repl);
 781                         } else {
 782                             putChar(dst, dp++, c);
 783                         }
 784                     }
 785                     continue;
 786                 }
 787                 if (sp  < sl && isMalformed3_2(b1, src[sp])) {
 788                     if (!doReplace) {
 789                         throwMalformed(sp - 1, 2);
 790                     }
 791                     putChar(dst, dp++, repl);
 792                     continue;
 793                 }
 794                 if (!doReplace){
 795                     throwMalformed(sp, 1);
 796                 }
 797                 putChar(dst, dp++, repl);
 798                 break;
 799             } else if ((b1 >> 3) == -2) {
 800                 if (sp + 2 < sl) {
 801                     int b2 = src[sp++];
 802                     int b3 = src[sp++];
 803                     int b4 = src[sp++];
 804                     int uc = ((b1 << 18) ^
 805                               (b2 << 12) ^
 806                               (b3 <<  6) ^
 807                               (b4 ^
 808                                (((byte) 0xF0 << 18) ^
 809                                ((byte) 0x80 << 12) ^
 810                                ((byte) 0x80 <<  6) ^
 811                                ((byte) 0x80 <<  0))));
 812                     if (isMalformed4(b2, b3, b4) ||
 813                         !isSupplementaryCodePoint(uc)) { // shortest form check
 814                         if (!doReplace) {
 815                             throwMalformed(sp - 4, 4);
 816                         }
 817                         putChar(dst, dp++, repl);
 818                         sp -= 4;
 819                         sp += malformedN(src, sp, 4);
 820                     } else {
 821                         putChar(dst, dp++, highSurrogate(uc));
 822                         putChar(dst, dp++, lowSurrogate(uc));
 823                     }
 824                     continue;
 825                 }
 826                 b1 &= 0xff;
 827                 if (b1 > 0xf4 ||
 828                     sp  < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
 829                     if (!doReplace) {
 830                         throwMalformed(sp - 1, 1);  // or 2
 831                     }
 832                     putChar(dst, dp++, repl);
 833                     continue;
 834                 }
 835                 if (!doReplace) {
 836                     throwMalformed(sp - 1, 1);
 837                 }
 838                 sp++;
 839                 putChar(dst, dp++, repl);
 840                 if (sp  < sl && isMalformed4_3(src[sp])) {
 841                     continue;
 842                 }
 843                 break;
 844             } else {
 845                 if (!doReplace) {
 846                     throwMalformed(sp - 1, 1);
 847                 }
 848                 putChar(dst, dp++, repl);
 849             }
 850         }
 851         if (dp != len) {
 852             dst = Arrays.copyOf(dst, dp << 1);
 853         }
 854         return ret.with(dst, UTF16);
 855     }
 856 
 857     private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
 858         if (coder == UTF16)
 859             return encodeUTF8_UTF16(val, doReplace);
 860 
 861         if (!hasNegatives(val, 0, val.length))
 862             return Arrays.copyOf(val, val.length);
 863 
 864         int dp = 0;
 865         byte[] dst = new byte[val.length << 1];
 866         for (int sp = 0; sp < val.length; sp++) {
 867             byte c = val[sp];
 868             if (c < 0) {
 869                 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
 870                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 871             } else {
 872                 dst[dp++] = c;
 873             }
 874         }
 875         if (dp == dst.length)
 876             return dst;
 877         return Arrays.copyOf(dst, dp);
 878     }
 879 
 880     private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
 881         int dp = 0;
 882         int sp = 0;
 883         int sl = val.length >> 1;
 884         byte[] dst = new byte[sl * 3];
 885         char c;
 886         while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
 887             // ascii fast loop;
 888             dst[dp++] = (byte)c;
 889             sp++;
 890         }
 891         while (sp < sl) {
 892             c = StringUTF16.getChar(val, sp++);
 893             if (c < 0x80) {
 894                 dst[dp++] = (byte)c;
 895             } else if (c < 0x800) {
 896                 dst[dp++] = (byte)(0xc0 | (c >> 6));
 897                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 898             } else if (Character.isSurrogate(c)) {
 899                 int uc = -1;
 900                 char c2;
 901                 if (Character.isHighSurrogate(c) && sp < sl &&
 902                     Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
 903                     uc = Character.toCodePoint(c, c2);
 904                 }
 905                 if (uc < 0) {
 906                     if (doReplace) {
 907                         dst[dp++] = '?';
 908                     } else {
 909                         throwMalformed(sp - 1, 1); // or 2, does not matter here
 910                     }
 911                 } else {
 912                     dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
 913                     dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 914                     dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 915                     dst[dp++] = (byte)(0x80 | (uc & 0x3f));
 916                     sp++;  // 2 chars
 917                 }
 918             } else {
 919                 // 3 bytes, 16 bits
 920                 dst[dp++] = (byte)(0xe0 | ((c >> 12)));
 921                 dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 922                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 923             }
 924         }
 925         if (dp == dst.length) {
 926             return dst;
 927         }
 928         return Arrays.copyOf(dst, dp);
 929     }
 930 
 931     ////////////////////// for j.u.z.ZipCoder //////////////////////////
 932 
 933     /*
 934      * Throws iae, instead of replacing, if malformed or unmappble.
 935      */
 936     static String newStringUTF8NoRepl(byte[] src, int off, int len) {
 937         if (COMPACT_STRINGS && !hasNegatives(src, off, len))
 938             return new String(Arrays.copyOfRange(src, off, off + len), LATIN1);
 939         Result ret = decodeUTF8_0(src, off, len, false);
 940         return new String(ret.value, ret.coder);
 941     }
 942 
 943     /*
 944      * Throws iae, instead of replacing, if unmappble.
 945      */
 946     static byte[] getBytesUTF8NoRepl(String s) {
 947         return encodeUTF8(s.coder(), s.value(), false);
 948     }
 949 }