src/java.base/share/classes/java/lang/StringCoding.java

Print this page


   1 /*
   2  * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


  30 import java.nio.ByteBuffer;
  31 import java.nio.CharBuffer;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.CharsetDecoder;
  34 import java.nio.charset.CharsetEncoder;
  35 import java.nio.charset.CharacterCodingException;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.IllegalCharsetNameException;
  39 import java.nio.charset.UnsupportedCharsetException;
  40 import java.util.Arrays;
  41 import jdk.internal.HotSpotIntrinsicCandidate;
  42 import sun.nio.cs.HistoricallyNamedCharset;
  43 import sun.nio.cs.ArrayDecoder;
  44 import sun.nio.cs.ArrayEncoder;
  45 import sun.nio.cs.StandardCharsets;
  46 
  47 import static java.lang.String.LATIN1;
  48 import static java.lang.String.UTF16;
  49 import static java.lang.String.COMPACT_STRINGS;





  50 
  51 /**
  52  * Utility class for string encoding and decoding.
  53  */
  54 
  55 class StringCoding {
  56 
  57     private StringCoding() { }
  58 
  59     /** The cached coders for each thread */
  60     private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
  61         new ThreadLocal<>();
  62     private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
  63         new ThreadLocal<>();
  64 
  65     private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;
  66     private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
  67     private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
  68 
  69     private static boolean warnUnsupportedCharset = true;
  70 
  71     private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
  72         SoftReference<T> sr = tl.get();
  73         if (sr == null)
  74             return null;
  75         return sr.get();
  76     }
  77 
  78     private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
  79         tl.set(new SoftReference<>(ob));
  80     }
  81 
  82     // Trim the given byte array to the given length
  83     //
  84     private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
  85         if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
  86             return ba;
  87         else
  88             return Arrays.copyOf(ba, len);
  89     }
  90 
  91     private static int scale(int len, float expansionFactor) {
  92         // We need to perform double, not float, arithmetic; otherwise
  93         // we lose low order bits when len is larger than 2**24.
  94         return (int)(len * (double)expansionFactor);
  95     }
  96 
  97     private static Charset lookupCharset(String csn) {
  98         if (Charset.isSupported(csn)) {
  99             try {
 100                 return Charset.forName(csn);
 101             } catch (UnsupportedCharsetException x) {
 102                 throw new Error(x);
 103             }
 104         }
 105         return null;
 106     }
 107 
 108     private static void warnUnsupportedCharset(String csn) {
 109         if (warnUnsupportedCharset) {
 110             // Use err(String) rather than the Logging API or System.err
 111             // since this method may be called during VM initialization
 112             // before either is available.
 113             err("WARNING: Default charset " + csn +
 114                 " not supported, using ISO-8859-1 instead\n");
 115             warnUnsupportedCharset = false;
 116         }
 117     }
 118 
 119     static class Result {
 120         byte[] value;
 121         byte coder;
 122 
 123         Result with() {
 124             coder = COMPACT_STRINGS ? LATIN1 : UTF16;
 125             value = new byte[0];
 126             return this;
 127         }
 128 
 129         Result with(char[] val, int off, int len) {
 130             if (String.COMPACT_STRINGS) {
 131                 byte[] bs = StringUTF16.compress(val, off, len);
 132                 if (bs != null) {
 133                     value = bs;
 134                     coder = LATIN1;
 135                     return this;
 136                 }
 137             }
 138             coder = UTF16;


 207             }
 208             cd.reset();
 209             ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 210             CharBuffer cb = CharBuffer.wrap(ca);
 211             try {
 212                 CoderResult cr = cd.decode(bb, cb, true);
 213                 if (!cr.isUnderflow())
 214                     cr.throwException();
 215                 cr = cd.flush(cb);
 216                 if (!cr.isUnderflow())
 217                     cr.throwException();
 218             } catch (CharacterCodingException x) {
 219                 // Substitution is always enabled,
 220                 // so this shouldn't happen
 221                 throw new Error(x);
 222             }
 223             return result.with(ca, 0, cb.position());
 224         }
 225     }
 226 
 227     private static class StringDecoder8859_1 extends StringDecoder {
 228         StringDecoder8859_1(Charset cs, String rcn) {
 229             super(cs, rcn);
 230         }
 231         Result decode(byte[] ba, int off, int len) {
 232             if (COMPACT_STRINGS) {
 233                 return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
 234             } else {
 235                 return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 236             }
 237         }
 238     }
 239 
 240     static Result decode(String charsetName, byte[] ba, int off, int len)
 241         throws UnsupportedEncodingException
 242     {
 243         StringDecoder sd = deref(decoder);
 244         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 245         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
 246                               || csn.equals(sd.charsetName()))) {
 247             sd = null;
 248             try {
 249                 Charset cs = lookupCharset(csn);
 250                 if (cs != null) {
 251                     if (cs == UTF_8) {
 252                         sd = new StringDecoderUTF8(cs, csn);
 253                     } else if (cs == ISO_8859_1) {
 254                         sd = new StringDecoder8859_1(cs, csn);
 255                     } else {
 256                         sd = new StringDecoder(cs, csn);


 257                     }

 258                 }
 259             } catch (IllegalCharsetNameException x) {}
 260             if (sd == null)
 261                 throw new UnsupportedEncodingException(csn);
 262             set(decoder, sd);
 263         }
 264         return sd.decode(ba, off, len);
 265     }
 266 
 267     static Result decode(Charset cs, byte[] ba, int off, int len) {










 268         // (1)We never cache the "external" cs, the only benefit of creating
 269         // an additional StringDe/Encoder object to wrap it is to share the
 270         // de/encode() method. These SD/E objects are short-lived, the young-gen
 271         // gc should be able to take care of them well. But the best approach
 272         // is still not to generate them if not really necessary.
 273         // (2)The defensive copy of the input byte/char[] has a big performance
 274         // impact, as well as the outgoing result byte/char[]. Need to do the
 275         // optimization check of (sm==null && classLoader0==null) for both.
 276         // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
 277         // is only checked (and then isTrusted gets set) when (SM==null). It is
 278         // possible that the SM==null for now but then SM is NOT null later
 279         // when safeTrim() is invoked...the "safe" way to do is to redundant
 280         // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
 281         // but it then can be argued that the SM is null when the operation
 282         // is started...
 283         if (cs == UTF_8) {
 284             return StringDecoderUTF8.decode(ba, off, len, new Result());
 285         }
 286         CharsetDecoder cd = cs.newDecoder();
 287         // ascii fastpath
 288         if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) &&
 289                                  ((ArrayDecoder)cd).isASCIICompatible() &&
 290                                  !hasNegatives(ba, off, len))) {
 291              if (COMPACT_STRINGS) {
 292                  return new Result().with(Arrays.copyOfRange(ba, off, off + len),
 293                                           LATIN1);
 294              } else {
 295                  return new Result().with(StringLatin1.inflate(ba, off, len), UTF16);
 296              }
 297         }
 298         int en = scale(len, cd.maxCharsPerByte());
 299         if (len == 0) {
 300             return new Result().with();
 301         }
 302         if (cs.getClass().getClassLoader0() != null &&
 303             System.getSecurityManager() != null) {
 304             ba =  Arrays.copyOfRange(ba, off, off + len);
 305             off = 0;
 306         }
 307         cd.onMalformedInput(CodingErrorAction.REPLACE)
 308           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 309           .reset();
 310 
 311         char[] ca = new char[en];
 312         if (cd instanceof ArrayDecoder) {
 313             int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 314             return new Result().with(ca, 0, clen);
 315         }





 316         ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 317         CharBuffer cb = CharBuffer.wrap(ca);
 318         try {
 319             CoderResult cr = cd.decode(bb, cb, true);
 320             if (!cr.isUnderflow())
 321                 cr.throwException();
 322             cr = cd.flush(cb);
 323             if (!cr.isUnderflow())
 324                 cr.throwException();
 325         } catch (CharacterCodingException x) {
 326             // Substitution is always enabled,
 327             // so this shouldn't happen
 328             throw new Error(x);
 329         }
 330         return new Result().with(ca, 0, cb.position());
 331     }
 332 
 333     static Result decode(byte[] ba, int off, int len) {
 334         String csn = Charset.defaultCharset().name();
 335         try {
 336             // use charset name decode() variant which provides caching.
 337             return decode(csn, ba, off, len);
 338         } catch (UnsupportedEncodingException x) {
 339             warnUnsupportedCharset(csn);
 340         }
 341         try {
 342             return decode("ISO-8859-1", ba, off, len);
 343         } catch (UnsupportedEncodingException x) {
 344             // If this code is hit during VM initialization, err(String) is
 345             // the only way we will be able to get any kind of error message.
 346             err("ISO-8859-1 charset not available: " + x.toString() + "\n");
 347             // If we can not find ISO-8859-1 (a required encoding) then things
 348             // are seriously wrong with the installation.
 349             System.exit(1);
 350             return null;
 351         }

 352     }
 353 
 354     // -- Encoding --
 355     private static class StringEncoder {
 356         private Charset cs;
 357         private CharsetEncoder ce;
 358         private final boolean isASCIICompatible;
 359         private final String requestedCharsetName;
 360         private final boolean isTrusted;
 361 
 362         private StringEncoder(Charset cs, String rcn) {
 363             this.requestedCharsetName = rcn;
 364             this.cs = cs;
 365             this.ce = cs.newEncoder()
 366                 .onMalformedInput(CodingErrorAction.REPLACE)
 367                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 368             this.isTrusted = (cs.getClass().getClassLoader0() == null);
 369             this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
 370                     ((ArrayEncoder)ce).isASCIICompatible();
 371         }


 376             return cs.name();
 377         }
 378 
 379         final String requestedCharsetName() {
 380             return requestedCharsetName;
 381         }
 382 
 383         byte[] encode(byte coder, byte[] val) {
 384             // fastpath for ascii compatible
 385             if (coder == LATIN1 && isASCIICompatible &&
 386                 !hasNegatives(val, 0, val.length)) {
 387                 return Arrays.copyOf(val, val.length);
 388             }
 389             int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 390             int en = scale(len, ce.maxBytesPerChar());
 391             byte[] ba = new byte[en];
 392             if (len == 0) {
 393                 return ba;
 394             }
 395             if (ce instanceof ArrayEncoder) {
 396                 if (!isTrusted) {
 397                     val = Arrays.copyOf(val, val.length);
 398                 }
 399                 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 400                                               : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 401                 if (blen != -1) {
 402                     return safeTrim(ba, blen, isTrusted);
 403                 }
 404             }
 405             char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 406                                            : StringUTF16.toChars(val);
 407             ce.reset();
 408             ByteBuffer bb = ByteBuffer.wrap(ba);
 409             CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 410             try {
 411                 CoderResult cr = ce.encode(cb, bb, true);
 412                 if (!cr.isUnderflow())
 413                     cr.throwException();
 414                 cr = ce.flush(bb);
 415                 if (!cr.isUnderflow())
 416                     cr.throwException();
 417             } catch (CharacterCodingException x) {
 418                 // Substitution is always enabled,
 419                 // so this shouldn't happen
 420                 throw new Error(x);
 421             }
 422             return safeTrim(ba, bb.position(), isTrusted);
 423         }
 424     }
 425 

















































































































































































 426     @HotSpotIntrinsicCandidate
 427     private static int implEncodeISOArray(byte[] sa, int sp,
 428                                           byte[] da, int dp, int len) {
 429         int i = 0;
 430         for (; i < len; i++) {
 431             char c = StringUTF16.getChar(sa, sp++);
 432             if (c > '\u00FF')
 433                 break;
 434             da[dp++] = (byte)c;
 435         }
 436         return i;
 437     }
 438 
 439     static byte[] encode8859_1(byte coder, byte[] val) {
 440         if (coder == LATIN1) {
 441             return Arrays.copyOf(val, val.length);
 442         }
 443         int len = val.length >> 1;
 444         byte[] dst = new byte[len];
 445         int dp = 0;
 446         int sp = 0;
 447         int sl = len;
 448         while (sp < sl) {
 449             int ret = implEncodeISOArray(val, sp, dst, dp, len);
 450             sp = sp + ret;
 451             dp = dp + ret;
 452             if (ret != len) {
 453                 char c = StringUTF16.getChar(val, sp++);
 454                 if (Character.isHighSurrogate(c) && sp < sl &&
 455                     Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
 456                     sp++;
 457                 }
 458                 dst[dp++] = '?';
 459                 len = sl - sp;
 460             }
 461         }
 462         if (dp == dst.length) {
 463             return dst;
 464         }
 465         return Arrays.copyOf(dst, dp);
 466     }
 467 
 468     static byte[] encodeASCII(byte coder, byte[] val) {
 469         if (coder == LATIN1) {
 470             byte[] dst = new byte[val.length];
 471             for (int i = 0; i < val.length; i++) {
 472                 if (val[i] < 0) {
 473                     dst[i] = '?';
 474                 } else {
 475                     dst[i] = val[i];
 476                 }




 477             }
 478             return dst;



 479         }
 480         int len = val.length >> 1;
 481         byte[] dst = new byte[len];
























































 482         int dp = 0;
 483         for (int i = 0; i < len; i++) {
 484             char c = StringUTF16.getChar(val, i);
 485             if (c < 0x80) {
 486                 dst[dp++] = (byte)c;




 487                 continue;
 488             }
 489             if (Character.isHighSurrogate(c) && i + 1 < len &&
 490                 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
 491                 i++;






 492             }
 493             dst[dp++] = '?';
 494         }
 495         if (len == dp) {
 496             return dst;

 497         }
 498         return Arrays.copyOf(dst, dp);









































































 499     }





































































 500 
 501    static byte[] encodeUTF8(byte coder, byte[] val) {
             // Encodes a string's internal bytes as UTF-8.
             //   coder - LATIN1 when val holds one byte per char; otherwise
             //           val is read two bytes per char via StringUTF16.getChar
             //   val   - internal string bytes
             // Returns an array holding exactly the encoded bytes. A surrogate
             // that is not part of a high/low pair is written as '?'.
 502         int dp = 0;
 503         byte[] dst;
 504         if (coder == LATIN1) {
                 // Each input byte produces at most two output bytes.
 505             dst = new byte[val.length << 1];
 506             for (int sp = 0; sp < val.length; sp++) {
 507                 byte c = val[sp];
 508                 if (c < 0) {
                         // High bit set: emit a two-byte sequence.
 509                     dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
 510                     dst[dp++] = (byte)(0x80 | (c & 0x3f));
 511                 } else {
 512                     dst[dp++] = c;
 513                 }
 514             }
 515         } else {






 516             int sp = 0;
 517             int sl = val.length >> 1;
                 // Three bytes are reserved per char; a surrogate pair
                 // consumes two chars and writes four bytes.
 518             dst = new byte[sl * 3];
 519             char c;
 520             while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
 521                 // ascii fast loop;
 522                 dst[dp++] = (byte)c;
 523                 sp++;
 524             }
                 // General loop, entered at the first char that is not below 0x80.
 525             while (sp < sl) {
 526                 c = StringUTF16.getChar(val, sp++);
 527                 if (c < 0x80) {
 528                     dst[dp++] = (byte)c;
 529                 } else if (c < 0x800) {
                         // 2 bytes, 11 bits
 530                     dst[dp++] = (byte)(0xc0 | (c >> 6));
 531                     dst[dp++] = (byte)(0x80 | (c & 0x3f));
 532                 } else if (Character.isSurrogate(c)) {
                         // uc stays -1 unless c is a high surrogate that is
                         // immediately followed by a low surrogate.
 533                     int uc = -1;
 534                     char c2;
 535                     if (Character.isHighSurrogate(c) && sp < sl &&
 536                         Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
 537                         uc = Character.toCodePoint(c, c2);
 538                     }
 539                     if (uc < 0) {

 540                         dst[dp++] = '?';
 541                     } else {



                             // 4 bytes for the combined code point
 542                         dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
 543                         dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 544                         dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 545                         dst[dp++] = (byte)(0x80 | (uc & 0x3f));
 546                         sp++;  // 2 chars
 547                     }
 548                 } else {
 549                     // 3 bytes, 16 bits
 550                     dst[dp++] = (byte)(0xe0 | ((c >> 12)));
 551                     dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 552                     dst[dp++] = (byte)(0x80 | (c & 0x3f));
 553                 }
 554             }
 555         }
             // Return dst itself when it was filled completely; otherwise
             // return a copy trimmed to the bytes actually written.
 556         if (dp == dst.length) {
 557             return dst;
 558         }
 559         return Arrays.copyOf(dst, dp);
 560     }
 561 
 562     static byte[] encode(String charsetName, byte coder, byte[] val)
 563         throws UnsupportedEncodingException
 564     {
 565         StringEncoder se = deref(encoder);
 566         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 567         if ((se == null) || !(csn.equals(se.requestedCharsetName())
 568                               || csn.equals(se.charsetName()))) {
 569             se = null;
 570             try {
 571                 Charset cs = lookupCharset(csn);
 572                 if (cs != null) {
 573                     if (cs == UTF_8) {
 574                         return encodeUTF8(coder, val);
 575                     } else if (cs == ISO_8859_1) {
 576                         return encode8859_1(coder, val);
 577                     } else if (cs == US_ASCII) {
 578                         return encodeASCII(coder, val);
 579                     }
 580                     se = new StringEncoder(cs, csn);
 581                 }
 582             } catch (IllegalCharsetNameException x) {}
 583             if (se == null) {
 584                 throw new UnsupportedEncodingException (csn);
 585             }
 586             set(encoder, se);
 587         }
 588         return se.encode(coder, val);
 589     }
 590 
 591     static byte[] encode(Charset cs, byte coder, byte[] val) {
 592         if (cs == UTF_8) {
 593             return encodeUTF8(coder, val);
 594         } else if (cs == ISO_8859_1) {
 595             return encode8859_1(coder, val);
 596         } else if (cs == US_ASCII) {
 597             return encodeASCII(coder, val);
 598         }
 599         CharsetEncoder ce = cs.newEncoder();
 600         // fastpath for ascii compatible
 601         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
 602                                  ((ArrayEncoder)ce).isASCIICompatible() &&
 603                                  !hasNegatives(val, 0, val.length)))) {
 604             return Arrays.copyOf(val, val.length);
 605         }
 606         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 607         int en = scale(len, ce.maxBytesPerChar());
 608         byte[] ba = new byte[en];
 609         if (len == 0) {
 610             return ba;
 611         }
 612         boolean isTrusted = cs.getClass().getClassLoader0() == null ||
 613                             System.getSecurityManager() == null;
 614         ce.onMalformedInput(CodingErrorAction.REPLACE)
 615           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 616           .reset();
 617         if (ce instanceof ArrayEncoder) {
 618             if (!isTrusted) {
 619                 val = Arrays.copyOf(val, val.length);
 620             }
 621             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 622                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 623             if (blen != -1) {
 624                 return safeTrim(ba, blen, isTrusted);
 625             }
 626         }
 627         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 628                                        : StringUTF16.toChars(val);
 629         ByteBuffer bb = ByteBuffer.wrap(ba);
 630         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 631         try {
 632             CoderResult cr = ce.encode(cb, bb, true);
 633             if (!cr.isUnderflow())
 634                 cr.throwException();
 635             cr = ce.flush(bb);
 636             if (!cr.isUnderflow())
 637                 cr.throwException();
 638         } catch (CharacterCodingException x) {
 639             throw new Error(x);
 640         }
 641         return safeTrim(ba, bb.position(), isTrusted);
 642     }
 643 
 644     static byte[] encode(byte coder, byte[] val) {
 645         String csn = Charset.defaultCharset().name();
 646         try {
 647             // use charset name encode() variant which provides caching.
 648             return encode(csn, coder, val);
 649         } catch (UnsupportedEncodingException x) {
 650             warnUnsupportedCharset(csn);
 651         }
 652         try {
 653             return encode("ISO-8859-1", coder, val);
 654         } catch (UnsupportedEncodingException x) {
 655             // If this code is hit during VM initialization, err(String) is
 656             // the only way we will be able to get any kind of error message.
 657             err("ISO-8859-1 charset not available: " + x.toString() + "\n");
 658             // If we can not find ISO-8859-1 (a required encoding) then things
 659             // are seriously wrong with the installation.
 660             System.exit(1);
 661             return null;
 662         }
 663     }
 664 
 665     /**
 666      *  Prints a message directly to stderr, bypassing all character conversion
 667      *  methods.
              *  <p>Declared native. Code in this class calls it where the Logging
              *  API and System.err may not be available yet, such as during VM
              *  initialization.
 668      *  @param msg  message to print
 669      */
 670     private static native void err(String msg);


 671 }
   1 /*
   2  * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any


  30 import java.nio.ByteBuffer;
  31 import java.nio.CharBuffer;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.CharsetDecoder;
  34 import java.nio.charset.CharsetEncoder;
  35 import java.nio.charset.CharacterCodingException;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.IllegalCharsetNameException;
  39 import java.nio.charset.UnsupportedCharsetException;
  40 import java.util.Arrays;
  41 import jdk.internal.HotSpotIntrinsicCandidate;
  42 import sun.nio.cs.HistoricallyNamedCharset;
  43 import sun.nio.cs.ArrayDecoder;
  44 import sun.nio.cs.ArrayEncoder;
  45 import sun.nio.cs.StandardCharsets;
  46 
  47 import static java.lang.String.LATIN1;
  48 import static java.lang.String.UTF16;
  49 import static java.lang.String.COMPACT_STRINGS;
  50 import static java.lang.Character.isSurrogate;
  51 import static java.lang.Character.highSurrogate;
  52 import static java.lang.Character.lowSurrogate;
  53 import static java.lang.Character.isSupplementaryCodePoint;
  54 import static java.lang.StringUTF16.putChar;
  55 
  56 /**
  57  * Utility class for string encoding and decoding.
  58  */
  59 
  60 class StringCoding {
  61 
  62     private StringCoding() { }
  63 
  64     /** The cached coders for each thread */
  65     private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
  66         new ThreadLocal<>();
  67     private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
  68         new ThreadLocal<>();
  69 
  70     private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;
  71     private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
  72     private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
  73 


  74     private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
  75         SoftReference<T> sr = tl.get();
  76         if (sr == null)
  77             return null;
  78         return sr.get();
  79     }
  80 
  81     private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
  82         tl.set(new SoftReference<>(ob));
  83     }
  84 
  85     // Trim the given byte array to the given length

  86     private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
  87         if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
  88             return ba;
  89         else
  90             return Arrays.copyOf(ba, len);
  91     }
  92 
  93     private static int scale(int len, float expansionFactor) {
  94         // We need to perform double, not float, arithmetic; otherwise
  95         // we lose low order bits when len is larger than 2**24.
  96         return (int)(len * (double)expansionFactor);
  97     }
  98 
  99     private static Charset lookupCharset(String csn) {
 100         if (Charset.isSupported(csn)) {
 101             try {
 102                 return Charset.forName(csn);
 103             } catch (UnsupportedCharsetException x) {
 104                 throw new Error(x);
 105             }
 106         }
 107         return null;
 108     }
 109 











 110     static class Result {
 111         byte[] value;
 112         byte coder;
 113 
 114         Result with() {
 115             coder = COMPACT_STRINGS ? LATIN1 : UTF16;
 116             value = new byte[0];
 117             return this;
 118         }
 119 
 120         Result with(char[] val, int off, int len) {
 121             if (String.COMPACT_STRINGS) {
 122                 byte[] bs = StringUTF16.compress(val, off, len);
 123                 if (bs != null) {
 124                     value = bs;
 125                     coder = LATIN1;
 126                     return this;
 127                 }
 128             }
 129             coder = UTF16;


 198             }
 199             cd.reset();
 200             ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 201             CharBuffer cb = CharBuffer.wrap(ca);
 202             try {
 203                 CoderResult cr = cd.decode(bb, cb, true);
 204                 if (!cr.isUnderflow())
 205                     cr.throwException();
 206                 cr = cd.flush(cb);
 207                 if (!cr.isUnderflow())
 208                     cr.throwException();
 209             } catch (CharacterCodingException x) {
 210                 // Substitution is always enabled,
 211                 // so this shouldn't happen
 212                 throw new Error(x);
 213             }
 214             return result.with(ca, 0, cb.position());
 215         }
 216     }
 217 













 218     static Result decode(String charsetName, byte[] ba, int off, int len)
 219         throws UnsupportedEncodingException
 220     {
 221         StringDecoder sd = deref(decoder);
 222         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 223         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
 224                               || csn.equals(sd.charsetName()))) {
 225             sd = null;
 226             try {
 227                 Charset cs = lookupCharset(csn);
 228                 if (cs != null) {
 229                     if (cs == UTF_8) {
 230                         return decodeUTF8(ba, off, len, true);
 231                     }
 232                     if (cs == ISO_8859_1) {
 233                         return decodeLatin1(ba, off, len);
 234                     }
 235                     if (cs == US_ASCII) {
 236                         return decodeASCII(ba, off, len);
 237                     }
 238                     sd = new StringDecoder(cs, csn);
 239                 }
 240             } catch (IllegalCharsetNameException x) {}
 241             if (sd == null)
 242                 throw new UnsupportedEncodingException(csn);
 243             set(decoder, sd);
 244         }
 245         return sd.decode(ba, off, len);
 246     }
 247 
 248     static Result decode(Charset cs, byte[] ba, int off, int len) {
 249         if (cs == UTF_8) {
 250             return decodeUTF8(ba, off, len, true);
 251         }
 252         if (cs == ISO_8859_1) {
 253             return decodeLatin1(ba, off, len);
 254         }
 255         if (cs == US_ASCII) {
 256             return decodeASCII(ba, off, len);
 257         }
 258 
 259         // (1)We never cache the "external" cs, the only benefit of creating
 260         // an additional StringDe/Encoder object to wrap it is to share the
 261         // de/encode() method. These SD/E objects are short-lived, the young-gen
 262         // gc should be able to take care of them well. But the best approach
 263         // is still not to generate them if not really necessary.
 264         // (2)The defensive copy of the input byte/char[] has a big performance
 265         // impact, as well as the outgoing result byte/char[]. Need to do the
 266         // optimization check of (sm==null && classLoader0==null) for both.
 267         // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
 268         // is only checked (and then isTrusted gets set) when (SM==null). It is
 269         // possible that the SM==null for now but then SM is NOT null later
 270         // when safeTrim() is invoked...the "safe" way to do is to redundant
 271         // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
 272         // but it then can be argued that the SM is null when the operation
 273         // is started...



 274         CharsetDecoder cd = cs.newDecoder();
 275         // ascii fastpath
 276         if ((cd instanceof ArrayDecoder) &&
 277             ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
 278             return decodeLatin1(ba, off, len);






 279         }
 280         int en = scale(len, cd.maxCharsPerByte());
 281         if (len == 0) {
 282             return new Result().with();
 283         }





 284         cd.onMalformedInput(CodingErrorAction.REPLACE)
 285           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 286           .reset();

 287         char[] ca = new char[en];
 288         if (cd instanceof ArrayDecoder) {
 289             int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 290             return new Result().with(ca, 0, clen);
 291         }
 292         if (cs.getClass().getClassLoader0() != null &&
 293             System.getSecurityManager() != null) {
 294             ba = Arrays.copyOfRange(ba, off, off + len);
 295             off = 0;
 296         }
 297         ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 298         CharBuffer cb = CharBuffer.wrap(ca);
 299         try {
 300             CoderResult cr = cd.decode(bb, cb, true);
 301             if (!cr.isUnderflow())
 302                 cr.throwException();
 303             cr = cd.flush(cb);
 304             if (!cr.isUnderflow())
 305                 cr.throwException();
 306         } catch (CharacterCodingException x) {
 307             // Substitution is always enabled,
 308             // so this shouldn't happen
 309             throw new Error(x);
 310         }
 311         return new Result().with(ca, 0, cb.position());
 312     }
 313 
 314     static Result decode(byte[] ba, int off, int len) {
 315         Charset cs = Charset.defaultCharset();
 316         if (cs == UTF_8) {
 317             return decodeUTF8(ba, off, len, true);



 318         }
 319         if (cs == ISO_8859_1) {
 320             return decodeLatin1(ba, off, len);
 321         }
 322         if (cs == US_ASCII) {
 323             return decodeASCII(ba, off, len);
 324         }
 325         StringDecoder sd = deref(decoder);
 326         if (sd == null || !cs.name().equals(sd.cs.name())) {
 327             sd = new StringDecoder(cs, cs.name());
 328             set(decoder, sd);
 329         }
 330         return sd.decode(ba, off, len);
 331     }
 332 
 333     // -- Encoding --
 334     private static class StringEncoder {
 335         private Charset cs;
 336         private CharsetEncoder ce;
 337         private final boolean isASCIICompatible;
 338         private final String requestedCharsetName;
 339         private final boolean isTrusted;
 340 
 341         private StringEncoder(Charset cs, String rcn) {
 342             this.requestedCharsetName = rcn;
 343             this.cs = cs;
 344             this.ce = cs.newEncoder()
 345                 .onMalformedInput(CodingErrorAction.REPLACE)
 346                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 347             this.isTrusted = (cs.getClass().getClassLoader0() == null);
 348             this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
 349                     ((ArrayEncoder)ce).isASCIICompatible();
 350         }


 355             return cs.name();
 356         }
 357 
 358         final String requestedCharsetName() {
 359             return requestedCharsetName;
 360         }
 361 
 362         byte[] encode(byte coder, byte[] val) {
 363             // fastpath for ascii compatible
 364             if (coder == LATIN1 && isASCIICompatible &&
 365                 !hasNegatives(val, 0, val.length)) {
 366                 return Arrays.copyOf(val, val.length);
 367             }
 368             int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 369             int en = scale(len, ce.maxBytesPerChar());
 370             byte[] ba = new byte[en];
 371             if (len == 0) {
 372                 return ba;
 373             }
 374             if (ce instanceof ArrayEncoder) {



 375                 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 376                                               : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 377                 if (blen != -1) {
 378                     return safeTrim(ba, blen, isTrusted);
 379                 }
 380             }
 381             char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 382                                            : StringUTF16.toChars(val);
 383             ce.reset();
 384             ByteBuffer bb = ByteBuffer.wrap(ba);
 385             CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 386             try {
 387                 CoderResult cr = ce.encode(cb, bb, true);
 388                 if (!cr.isUnderflow())
 389                     cr.throwException();
 390                 cr = ce.flush(bb);
 391                 if (!cr.isUnderflow())
 392                     cr.throwException();
 393             } catch (CharacterCodingException x) {
 394                 // Substitution is always enabled,
 395                 // so this shouldn't happen
 396                 throw new Error(x);
 397             }
 398             return safeTrim(ba, bb.position(), isTrusted);
 399         }
 400     }
 401 
 402     static byte[] encode(String charsetName, byte coder, byte[] val)
 403         throws UnsupportedEncodingException
 404     {
 405         StringEncoder se = deref(encoder);
 406         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 407         if ((se == null) || !(csn.equals(se.requestedCharsetName())
 408                               || csn.equals(se.charsetName()))) {
 409             se = null;
 410             try {
 411                 Charset cs = lookupCharset(csn);
 412                 if (cs != null) {
 413                     if (cs == UTF_8) {
 414                         return encodeUTF8(coder, val, true);
 415                     }
 416                     if (cs == ISO_8859_1) {
 417                         return encode8859_1(coder, val);
 418                     }
 419                     if (cs == US_ASCII) {
 420                         return encodeASCII(coder, val);
 421                     }
 422                     se = new StringEncoder(cs, csn);
 423                 }
 424             } catch (IllegalCharsetNameException x) {}
 425             if (se == null) {
 426                 throw new UnsupportedEncodingException (csn);
 427             }
 428             set(encoder, se);
 429         }
 430         return se.encode(coder, val);
 431     }
 432 
 433     static byte[] encode(Charset cs, byte coder, byte[] val) {
 434         if (cs == UTF_8) {
 435             return encodeUTF8(coder, val, true);
 436         }
 437         if (cs == ISO_8859_1) {
 438             return encode8859_1(coder, val);
 439         }
 440         if (cs == US_ASCII) {
 441             return encodeASCII(coder, val);
 442         }
 443         CharsetEncoder ce = cs.newEncoder();
 444         // fastpath for ascii compatible
 445         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
 446                                  ((ArrayEncoder)ce).isASCIICompatible() &&
 447                                  !hasNegatives(val, 0, val.length)))) {
 448             return Arrays.copyOf(val, val.length);
 449         }
 450         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 451         int en = scale(len, ce.maxBytesPerChar());
 452         byte[] ba = new byte[en];
 453         if (len == 0) {
 454             return ba;
 455         }
 456         ce.onMalformedInput(CodingErrorAction.REPLACE)
 457           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 458           .reset();
 459         if (ce instanceof ArrayEncoder) {
 460             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 461                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 462             if (blen != -1) {
 463                 return safeTrim(ba, blen, true);
 464             }
 465         }
 466         boolean isTrusted = cs.getClass().getClassLoader0() == null ||
 467                             System.getSecurityManager() == null;
 468         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 469                                        : StringUTF16.toChars(val);
 470         ByteBuffer bb = ByteBuffer.wrap(ba);
 471         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 472         try {
 473             CoderResult cr = ce.encode(cb, bb, true);
 474             if (!cr.isUnderflow())
 475                 cr.throwException();
 476             cr = ce.flush(bb);
 477             if (!cr.isUnderflow())
 478                 cr.throwException();
 479         } catch (CharacterCodingException x) {
 480             throw new Error(x);
 481         }
 482         return safeTrim(ba, bb.position(), isTrusted);
 483     }
 484 
 485     static byte[] encode(byte coder, byte[] val) {
 486         Charset cs = Charset.defaultCharset();
 487         if (cs == UTF_8) {
 488             return encodeUTF8(coder, val, true);
 489         }
 490         if (cs == ISO_8859_1) {
 491             return encode8859_1(coder, val);
 492         }
 493         if (cs == US_ASCII) {
 494             return encodeASCII(coder, val);
 495         }
 496         StringEncoder se = deref(encoder);
 497         if (se == null || !cs.name().equals(se.cs.name())) {
 498             se = new StringEncoder(cs, cs.name());
 499             set(encoder, se);
 500         }
 501         return se.encode(coder, val);
 502     }
 503 
    /**
     * Prints a message directly to stderr, bypassing all character
     * conversion methods.
     *
     * @param msg  message to print
     */
    private static native void err(String msg);
 510 
    /*
     * The cached Result for each thread; it starts out as a new, empty
     * Result. The decodeXxx helpers in this class fetch it and fill it in
     * through Result.with(...), so the same instance is handed out again on
     * the next decode by that thread -- presumably callers are expected to
     * read value/coder before decoding again (verify against callers).
     */
    private static final ThreadLocal<StringCoding.Result>
        resultCached = new ThreadLocal<>() {
            protected StringCoding.Result initialValue() {
                return new StringCoding.Result();
            }};
 517 
 518     ////////////////////////// ascii //////////////////////////////
 519 
 520     private static Result decodeASCII(byte[] ba, int off, int len) {
 521         Result result = resultCached.get();
 522         if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
 523             return result.with(Arrays.copyOfRange(ba, off, off + len),
 524                                LATIN1);
 525         }
 526         byte[] dst = new byte[len<<1];
 527         int dp = 0;
 528         while (dp < len) {
 529             int b = ba[off++];
 530             putChar(dst, dp++, (b >= 0) ? (char)b : repl);
 531         }
 532         return result.with(dst, UTF16);
 533     }
 534 
 535     private static byte[] encodeASCII(byte coder, byte[] val) {
 536         if (coder == LATIN1) {
 537             byte[] dst = new byte[val.length];
 538             for (int i = 0; i < val.length; i++) {
 539                 if (val[i] < 0) {
 540                     dst[i] = '?';
 541                 } else {
 542                     dst[i] = val[i];
 543                 }
 544             }
 545             return dst;
 546         }
 547         int len = val.length >> 1;
 548         byte[] dst = new byte[len];
 549         int dp = 0;
 550         for (int i = 0; i < len; i++) {
 551             char c = StringUTF16.getChar(val, i);
 552             if (c < 0x80) {
 553                 dst[dp++] = (byte)c;
 554                 continue;
 555             }
 556             if (Character.isHighSurrogate(c) && i + 1 < len &&
 557                 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
 558                 i++;
 559             }
 560             dst[dp++] = '?';
 561         }
 562         if (len == dp) {
 563             return dst;
 564         }
 565         return Arrays.copyOf(dst, dp);
 566     }
 567 
 568     ////////////////////////// latin1/8859_1 ///////////////////////////
 569 
 570     private static Result decodeLatin1(byte[] ba, int off, int len) {
 571        Result result = resultCached.get();
 572        if (COMPACT_STRINGS) {
 573            return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
 574        } else {
 575            return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 576        }
 577     }
 578 
    /**
     * Copies chars read from {@code sa} via StringUTF16.getChar into
     * {@code da} as single bytes, stopping early at the first char greater
     * than 0xFF.
     *
     * @param sa  source array, read through StringUTF16.getChar
     * @param sp  index of the first char to read from {@code sa}
     * @param da  destination byte array
     * @param dp  index of the first byte to write in {@code da}
     * @param len maximum number of chars to copy
     * @return the number of chars copied; less than {@code len} only when a
     *         char above 0xFF stopped the copy
     */
    @HotSpotIntrinsicCandidate
    private static int implEncodeISOArray(byte[] sa, int sp,
                                          byte[] da, int dp, int len) {
        int i = 0;
        for (; i < len; i++) {
            char c = StringUTF16.getChar(sa, sp++);
            // not representable in one byte: stop and report the count so far
            if (c > '\u00FF')
                break;
            da[dp++] = (byte)c;
        }
        return i;
    }
 591 
 592     private static byte[] encode8859_1(byte coder, byte[] val) {
 593         if (coder == LATIN1) {
 594             return Arrays.copyOf(val, val.length);
 595         }
 596         int len = val.length >> 1;
 597         byte[] dst = new byte[len];
 598         int dp = 0;
 599         int sp = 0;
 600         int sl = len;
 601         while (sp < sl) {
 602             int ret = implEncodeISOArray(val, sp, dst, dp, len);
 603             sp = sp + ret;
 604             dp = dp + ret;
 605             if (ret != len) {
 606                 char c = StringUTF16.getChar(val, sp++);
 607                 if (Character.isHighSurrogate(c) && sp < sl &&
 608                     Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
 609                     sp++;
 610                 }
 611                 dst[dp++] = '?';
 612                 len = sl - sp;
 613             }
 614         }
 615         if (dp == dst.length) {
 616             return dst;
 617         }
 618         return Arrays.copyOf(dst, dp);
 619     }
 620 
 621     //////////////////////////////// utf8 ////////////////////////////////////
 622 
 623     private static boolean isNotContinuation(int b) {
 624         return (b & 0xc0) != 0x80;




 625     }
 626 
 627     private static boolean isMalformed3(int b1, int b2, int b3) {
 628         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 629                (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
 630     }
 631 
 632     private static boolean isMalformed3_2(int b1, int b2) {
 633         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 634                (b2 & 0xc0) != 0x80;
 635     }
 636 
 637     private static boolean isMalformed4(int b2, int b3, int b4) {
 638         return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
 639                (b4 & 0xc0) != 0x80;
 640     }
 641 
 642     private static boolean isMalformed4_2(int b1, int b2) {
 643         return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
 644                (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 645                (b2 & 0xc0) != 0x80;
 646     }
 647 
 648     private static boolean isMalformed4_3(int b3) {
 649         return (b3 & 0xc0) != 0x80;
 650     }
 651 
 652     // for nb == 3/4
 653     private static int malformedN(byte[] src, int sp, int nb) {
 654         if (nb == 3) {
 655             int b1 = src[sp++];
 656             int b2 = src[sp++];    // no need to lookup b3
 657             return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 658                     isNotContinuation(b2)) ? 1 : 2;
 659         } else if (nb == 4) { // we don't care the speed here
 660             int b1 = src[sp++] & 0xff;
 661             int b2 = src[sp++] & 0xff;
 662             if (b1 > 0xf4 ||
 663                 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
 664                 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 665                 isNotContinuation(b2))
 666                 return 1;
 667             if (isNotContinuation(src[sp++]))
 668                 return 2;
 669             return 3;
 670         }
 671         assert false;
 672         return -1;
 673     }
 674 
 675     private static void throwMalformed(int off, int nb) {
 676         throw new IllegalArgumentException("malformed input off : " + off +
 677                                            ", length : " + nb);
 678     }
 679 
 680     private static char repl = '\ufffd';
 681 
 682     private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
 683         // ascii-bais, which has a relative impact to the non-ascii-only bytes
 684         if (COMPACT_STRINGS && !hasNegatives(src, sp, len))
 685             return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len),
 686                                            LATIN1);
 687         return decodeUTF8_0(src, sp, len, doReplace);
 688     }
 689 
    /**
     * Decodes {@code len} bytes of {@code src}, starting at {@code sp}, as
     * UTF-8 into the calling thread's cached Result.
     *
     * When compact strings are enabled, a first pass tries to produce a
     * LATIN1 value (ASCII bytes plus two-byte sequences led by C2 or C3).
     * If the whole input is consumed that way the LATIN1 value is returned;
     * otherwise what was decoded so far is inflated and a second pass
     * continues from the same position, producing a UTF16 value.
     *
     * @param src       the input bytes
     * @param sp        index of the first byte to decode
     * @param len       number of bytes to decode
     * @param doReplace if true, malformed input is replaced with the
     *                  replacement character; if false, throwMalformed is
     *                  called instead, which throws IllegalArgumentException
     * @return the thread's cached Result, filled in with the decoded value
     */
    private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
        Result ret = resultCached.get();

        int sl = sp + len;              // exclusive end of the input range
        int dp = 0;                     // output index: bytes in the first pass, chars in the second
        byte[] dst = new byte[len];

        if (COMPACT_STRINGS) {
            // first pass: keep going for as long as the output fits in LATIN1
            while (sp < sl) {
                int b1 = src[sp];
                if (b1 >= 0) {
                    // ascii
                    dst[dp++] = (byte)b1;
                    sp++;
                    continue;
                }
                if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
                    sp + 1 < sl) {
                    // two-byte sequence whose result fits in a single byte
                    int b2 = src[sp + 1];
                    if (!isNotContinuation(b2)) {
                        dst[dp++] = (byte)(((b1 << 6) ^ b2)^
                                           (((byte) 0xC0 << 6) ^
                                           ((byte) 0x80 << 0)));
                        sp += 2;
                        continue;
                    }
                }
                // anything not a latin1, including the repl
                // we have to go with the utf16
                break;
            }
            if (sp == sl) {
                // all input consumed: trim to the bytes actually written
                if (dp != dst.length) {
                    dst = Arrays.copyOf(dst, dp);
                }
                return ret.with(dst, LATIN1);
            }
        }
        // switch to a buffer of two bytes per input byte, carrying over
        // whatever the first pass already produced
        if (dp == 0) {
            dst = new byte[len << 1];
        } else {
            byte[] buf = new byte[len << 1];
            StringLatin1.inflate(dst, 0, buf, 0, dp);
            dst = buf;
        }
        // second pass: b1 is a sign-extended byte, so the shifted lead-byte
        // patterns below compare against -2
        while (sp < sl) {
            int b1 = src[sp++];
            if (b1 >= 0) {
                // ascii
                putChar(dst, dp++, (char) b1);
            } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                // lead byte 110xxxxx with a non-zero bit among 0x1e: two bytes
                if (sp < sl) {
                    int b2 = src[sp++];
                    if (isNotContinuation(b2)) {
                        if (!doReplace) {
                            throwMalformed(sp - 1, 1);
                        }
                        putChar(dst, dp++, repl);
                        sp--;           // b2 is decoded again as a lead byte
                    } else {
                        putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
                                                  (((byte) 0xC0 << 6) ^
                                                  ((byte) 0x80 << 0))));
                    }
                    continue;
                }
                // input ends right after the lead byte
                if (!doReplace) {
                    throwMalformed(sp, 1);  // underflow()
                }
                putChar(dst, dp++, repl);
                break;
            } else if ((b1 >> 4) == -2) {
                // lead byte 1110xxxx: three bytes
                if (sp + 1 < sl) {
                    int b2 = src[sp++];
                    int b3 = src[sp++];
                    if (isMalformed3(b1, b2, b3)) {
                        if (!doReplace) {
                            throwMalformed(sp - 3, 3);
                        }
                        putChar(dst, dp++, repl);
                        // rewind to the lead byte, then skip only the
                        // malformed part
                        sp -= 3;
                        sp += malformedN(src, sp, 3);
                    } else {
                        char c = (char)((b1 << 12) ^
                                        (b2 <<  6) ^
                                        (b3 ^
                                         (((byte) 0xE0 << 12) ^
                                         ((byte) 0x80 <<  6) ^
                                         ((byte) 0x80 <<  0))));
                        if (isSurrogate(c)) {
                            // a decoded surrogate is treated as malformed
                            if (!doReplace) {
                                throwMalformed(sp - 3, 3);
                            }
                            putChar(dst, dp++, repl);
                        } else {
                            putChar(dst, dp++, c);
                        }
                    }
                    continue;
                }
                // fewer than two bytes remain after the lead byte
                if (sp  < sl && isMalformed3_2(b1, src[sp])) {
                    if (!doReplace) {
                        throwMalformed(sp - 1, 2);
                    }
                    putChar(dst, dp++, repl);
                    continue;
                }
                if (!doReplace){
                    throwMalformed(sp, 1);
                }
                putChar(dst, dp++, repl);
                break;
            } else if ((b1 >> 3) == -2) {
                // lead byte 11110xxx: four bytes
                if (sp + 2 < sl) {
                    int b2 = src[sp++];
                    int b3 = src[sp++];
                    int b4 = src[sp++];
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                               ((byte) 0x80 << 12) ^
                               ((byte) 0x80 <<  6) ^
                               ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        !isSupplementaryCodePoint(uc)) { // shortest form check
                        if (!doReplace) {
                            throwMalformed(sp - 4, 4);
                        }
                        putChar(dst, dp++, repl);
                        // rewind to the lead byte, then skip only the
                        // malformed part
                        sp -= 4;
                        sp += malformedN(src, sp, 4);
                    } else {
                        putChar(dst, dp++, highSurrogate(uc));
                        putChar(dst, dp++, lowSurrogate(uc));
                    }
                    continue;
                }
                // fewer than three bytes remain after the lead byte;
                // from here on b1 is compared as an unsigned value
                b1 &= 0xff;
                if (b1 > 0xf4 ||
                    sp  < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
                    if (!doReplace) {
                        throwMalformed(sp - 1, 1);  // or 2
                    }
                    putChar(dst, dp++, repl);
                    continue;
                }
                if (!doReplace) {
                    throwMalformed(sp - 1, 1);
                }
                // step over the second byte and emit one replacement char
                sp++;
                putChar(dst, dp++, repl);
                // keep decoding only if the byte now at sp is not a
                // continuation byte; otherwise decoding stops here
                if (sp  < sl && isMalformed4_3(src[sp])) {
                    continue;
                }
                break;
            } else {
                // any other lead byte is malformed
                if (!doReplace) {
                    throwMalformed(sp - 1, 1);
                }
                putChar(dst, dp++, repl);
            }
        }
        // dst holds room for len chars; trim it to the dp chars written
        if (dp != len) {
            dst = Arrays.copyOf(dst, dp << 1);
        }
        return ret.with(dst, UTF16);
    }
 857 
 858     private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
 859         if (coder == UTF16)
 860             return encodeUTF8_UTF16(val, doReplace);
 861 
 862         if (!hasNegatives(val, 0, val.length))
 863             return Arrays.copyOf(val, val.length);
 864 

 865         int dp = 0;
 866         byte[] dst = new byte[val.length << 1];


 867         for (int sp = 0; sp < val.length; sp++) {
 868             byte c = val[sp];
 869             if (c < 0) {
 870                 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
 871                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 872             } else {
 873                 dst[dp++] = c;
 874             }
 875         }
 876         if (dp == dst.length)
 877             return dst;
 878         return Arrays.copyOf(dst, dp);
 879     }
 880 
 881     private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
 882         int dp = 0;
 883         int sp = 0;
 884         int sl = val.length >> 1;
 885         byte[] dst = new byte[sl * 3];
 886         char c;
 887         while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
 888             // ascii fast loop;
 889             dst[dp++] = (byte)c;
 890             sp++;
 891         }
 892         while (sp < sl) {
 893             c = StringUTF16.getChar(val, sp++);
 894             if (c < 0x80) {
 895                 dst[dp++] = (byte)c;
 896             } else if (c < 0x800) {
 897                 dst[dp++] = (byte)(0xc0 | (c >> 6));
 898                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 899             } else if (Character.isSurrogate(c)) {
 900                 int uc = -1;
 901                 char c2;
 902                 if (Character.isHighSurrogate(c) && sp < sl &&
 903                     Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
 904                     uc = Character.toCodePoint(c, c2);
 905                 }
 906                 if (uc < 0) {
 907                     if (doReplace) {
 908                         dst[dp++] = '?';
 909                     } else {
 910                         throwMalformed(sp - 1, 1); // or 2, does not matter here
 911                     }
 912                 } else {
 913                     dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
 914                     dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 915                     dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 916                     dst[dp++] = (byte)(0x80 | (uc & 0x3f));
 917                     sp++;  // 2 chars
 918                 }
 919             } else {
 920                 // 3 bytes, 16 bits
 921                 dst[dp++] = (byte)(0xe0 | ((c >> 12)));
 922                 dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 923                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 924             }
 925         }

 926         if (dp == dst.length) {
 927             return dst;
 928         }
 929         return Arrays.copyOf(dst, dp);
 930     }
 931 
 932     ////////////////////// for j.u.z.ZipCoder //////////////////////////



























 933 
 934     /*
 935      * Throws iae, instead of replacing, if malformed or unmappble.
 936      */
 937     static String newStringUTF8NoRepl(byte[] src, int off, int len) {
 938         if (COMPACT_STRINGS && !hasNegatives(src, off, len))
 939             return new String(Arrays.copyOfRange(src, off, off + len), LATIN1);
 940         Result ret = decodeUTF8_0(src, off, len, false);
 941         return new String(ret.value, ret.coder);
































































 942     }
 943 
 944     /*
 945      * Throws iae, instead of replacing, if unmappble.


 946      */
 947     static byte[] getBytesUTF8NoRepl(String s) {
 948         return encodeUTF8(s.coder(), s.value(), false);
 949     }
 950 }