New src/java.base/share/classes/java/lang/StringCoding.java

   1 /*
   2  * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.lang;
  27 
  28 import java.io.UnsupportedEncodingException;
  29 import java.lang.ref.SoftReference;
  30 import java.nio.ByteBuffer;
  31 import java.nio.CharBuffer;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.CharsetDecoder;
  34 import java.nio.charset.CharsetEncoder;
  35 import java.nio.charset.CharacterCodingException;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.IllegalCharsetNameException;
  39 import java.nio.charset.MalformedInputException;
  40 import java.nio.charset.UnmappableCharacterException;
  41 import java.nio.charset.UnsupportedCharsetException;
  42 import java.util.Arrays;
  43 import jdk.internal.HotSpotIntrinsicCandidate;
  44 import sun.nio.cs.HistoricallyNamedCharset;
  45 import sun.nio.cs.ArrayDecoder;
  46 import sun.nio.cs.ArrayEncoder;
  47 
  48 import static java.lang.String.LATIN1;
  49 import static java.lang.String.UTF16;
  50 import static java.lang.String.COMPACT_STRINGS;
  51 import static java.lang.Character.isSurrogate;
  52 import static java.lang.Character.highSurrogate;
  53 import static java.lang.Character.lowSurrogate;
  54 import static java.lang.Character.isSupplementaryCodePoint;
  55 import static java.lang.StringUTF16.putChar;
  56 
  57 /**
  58  * Utility class for string encoding and decoding.
  59  */
  60 
  61 class StringCoding {
  62 
  63     private StringCoding() { }
  64 
  65     /** The cached coders for each thread */
  66     private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
  67         new ThreadLocal<>();
  68     private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
  69         new ThreadLocal<>();
  70 
  71     private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;
  72     private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
  73     private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
  74 
  75     private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
  76         SoftReference<T> sr = tl.get();
  77         if (sr == null)
  78             return null;
  79         return sr.get();
  80     }
  81 
  82     private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
  83         tl.set(new SoftReference<>(ob));
  84     }
  85 
  86     // Trim the given byte array to the given length
  87     private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
  88         if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
  89             return ba;
  90         else
  91             return Arrays.copyOf(ba, len);
  92     }
  93 
  94     private static int scale(int len, float expansionFactor) {
  95         // We need to perform double, not float, arithmetic; otherwise
  96         // we lose low order bits when len is larger than 2**24.
  97         return (int)(len * (double)expansionFactor);
  98     }
  99 
 100     private static Charset lookupCharset(String csn) {
 101         if (Charset.isSupported(csn)) {
 102             try {
 103                 return Charset.forName(csn);
 104             } catch (UnsupportedCharsetException x) {
 105                 throw new Error(x);
 106             }
 107         }
 108         return null;
 109     }
 110 
 111     static class Result {
 112         byte[] value;
 113         byte coder;
 114 
 115         Result with() {
 116             coder = COMPACT_STRINGS ? LATIN1 : UTF16;
 117             value = new byte[0];
 118             return this;
 119         }
 120 
 121         Result with(char[] val, int off, int len) {
 122             if (String.COMPACT_STRINGS) {
 123                 byte[] bs = StringUTF16.compress(val, off, len);
 124                 if (bs != null) {
 125                     value = bs;
 126                     coder = LATIN1;
 127                     return this;
 128                 }
 129             }
 130             coder = UTF16;
 131             value = StringUTF16.toBytes(val, off, len);
 132             return this;
 133         }
 134 
 135         Result with(byte[] val, byte coder) {
 136             this.coder = coder;
 137             value = val;
 138             return this;
 139         }
 140     }
 141 
 142     @HotSpotIntrinsicCandidate
 143     public static boolean hasNegatives(byte[] ba, int off, int len) {
 144         for (int i = off; i < off + len; i++) {
 145             if (ba[i] < 0) {
 146                 return true;
 147             }
 148         }
 149         return false;
 150     }
 151 
 152     // -- Decoding --
 153     static class StringDecoder {
 154         private final String requestedCharsetName;
 155         private final Charset cs;
 156         private final boolean isASCIICompatible;
 157         private final CharsetDecoder cd;
 158         protected final Result result;
 159 
 160         StringDecoder(Charset cs, String rcn) {
 161             this.requestedCharsetName = rcn;
 162             this.cs = cs;
 163             this.cd = cs.newDecoder()
 164                 .onMalformedInput(CodingErrorAction.REPLACE)
 165                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 166             this.result = new Result();
 167             this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
 168                     ((ArrayDecoder)cd).isASCIICompatible();
 169         }
 170 
 171         String charsetName() {
 172             if (cs instanceof HistoricallyNamedCharset)
 173                 return ((HistoricallyNamedCharset)cs).historicalName();
 174             return cs.name();
 175         }
 176 
 177         final String requestedCharsetName() {
 178             return requestedCharsetName;
 179         }
 180 
 181         Result decode(byte[] ba, int off, int len) {
 182             if (len == 0) {
 183                 return result.with();
 184             }
 185             // fastpath for ascii compatible
 186             if (isASCIICompatible && !hasNegatives(ba, off, len)) {
 187                 if (COMPACT_STRINGS) {
 188                     return result.with(Arrays.copyOfRange(ba, off, off + len),
 189                                       LATIN1);
 190                 } else {
 191                     return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 192                 }
 193             }
 194             int en = scale(len, cd.maxCharsPerByte());
 195             char[] ca = new char[en];
 196             if (cd instanceof ArrayDecoder) {
 197                 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 198                 return result.with(ca, 0, clen);
 199             }
 200             cd.reset();
 201             ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 202             CharBuffer cb = CharBuffer.wrap(ca);
 203             try {
 204                 CoderResult cr = cd.decode(bb, cb, true);
 205                 if (!cr.isUnderflow())
 206                     cr.throwException();
 207                 cr = cd.flush(cb);
 208                 if (!cr.isUnderflow())
 209                     cr.throwException();
 210             } catch (CharacterCodingException x) {
 211                 // Substitution is always enabled,
 212                 // so this shouldn't happen
 213                 throw new Error(x);
 214             }
 215             return result.with(ca, 0, cb.position());
 216         }
 217     }
 218 
 219     static Result decode(String charsetName, byte[] ba, int off, int len)
 220         throws UnsupportedEncodingException
 221     {
 222         StringDecoder sd = deref(decoder);
 223         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 224         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
 225                               || csn.equals(sd.charsetName()))) {
 226             sd = null;
 227             try {
 228                 Charset cs = lookupCharset(csn);
 229                 if (cs != null) {
 230                     if (cs == UTF_8) {
 231                         return decodeUTF8(ba, off, len, true);
 232                     }
 233                     if (cs == ISO_8859_1) {
 234                         return decodeLatin1(ba, off, len);
 235                     }
 236                     if (cs == US_ASCII) {
 237                         return decodeASCII(ba, off, len);
 238                     }
 239                     sd = new StringDecoder(cs, csn);
 240                 }
 241             } catch (IllegalCharsetNameException x) {}
 242             if (sd == null)
 243                 throw new UnsupportedEncodingException(csn);
 244             set(decoder, sd);
 245         }
 246         return sd.decode(ba, off, len);
 247     }
 248 
 249     static Result decode(Charset cs, byte[] ba, int off, int len) {
 250         if (cs == UTF_8) {
 251             return decodeUTF8(ba, off, len, true);
 252         }
 253         if (cs == ISO_8859_1) {
 254             return decodeLatin1(ba, off, len);
 255         }
 256         if (cs == US_ASCII) {
 257             return decodeASCII(ba, off, len);
 258         }
 259 
 260         // (1)We never cache the "external" cs, the only benefit of creating
 261         // an additional StringDe/Encoder object to wrap it is to share the
 262         // de/encode() method. These SD/E objects are short-lived, the young-gen
 263         // gc should be able to take care of them well. But the best approach
 264         // is still not to generate them if not really necessary.
 265         // (2)The defensive copy of the input byte/char[] has a big performance
 266         // impact, as well as the outgoing result byte/char[]. Need to do the
 267         // optimization check of (sm==null && classLoader0==null) for both.
 268         // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
 269         // is only checked (and then isTrusted gets set) when (SM==null). It is
 270         // possible that the SM==null for now but then SM is NOT null later
 271         // when safeTrim() is invoked...the "safe" way to do is to redundant
 272         // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
 273         // but it then can be argued that the SM is null when the operation
 274         // is started...
 275         CharsetDecoder cd = cs.newDecoder();
 276         // ascii fastpath
 277         if ((cd instanceof ArrayDecoder) &&
 278             ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
 279             return decodeLatin1(ba, off, len);
 280         }
 281         int en = scale(len, cd.maxCharsPerByte());
 282         if (len == 0) {
 283             return new Result().with();
 284         }
 285         cd.onMalformedInput(CodingErrorAction.REPLACE)
 286           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 287           .reset();
 288         char[] ca = new char[en];
 289         if (cd instanceof ArrayDecoder) {
 290             int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 291             return new Result().with(ca, 0, clen);
 292         }
 293         if (cs.getClass().getClassLoader0() != null &&
 294             System.getSecurityManager() != null) {
 295             ba = Arrays.copyOfRange(ba, off, off + len);
 296             off = 0;
 297         }
 298         ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 299         CharBuffer cb = CharBuffer.wrap(ca);
 300         try {
 301             CoderResult cr = cd.decode(bb, cb, true);
 302             if (!cr.isUnderflow())
 303                 cr.throwException();
 304             cr = cd.flush(cb);
 305             if (!cr.isUnderflow())
 306                 cr.throwException();
 307         } catch (CharacterCodingException x) {
 308             // Substitution is always enabled,
 309             // so this shouldn't happen
 310             throw new Error(x);
 311         }
 312         return new Result().with(ca, 0, cb.position());
 313     }
 314 
 315     static Result decode(byte[] ba, int off, int len) {
 316         Charset cs = Charset.defaultCharset();
 317         if (cs == UTF_8) {
 318             return decodeUTF8(ba, off, len, true);
 319         }
 320         if (cs == ISO_8859_1) {
 321             return decodeLatin1(ba, off, len);
 322         }
 323         if (cs == US_ASCII) {
 324             return decodeASCII(ba, off, len);
 325         }
 326         StringDecoder sd = deref(decoder);
 327         if (sd == null || !cs.name().equals(sd.cs.name())) {
 328             sd = new StringDecoder(cs, cs.name());
 329             set(decoder, sd);
 330         }
 331         return sd.decode(ba, off, len);
 332     }
 333 
 334     // -- Encoding --
 335     private static class StringEncoder {
 336         private Charset cs;
 337         private CharsetEncoder ce;
 338         private final boolean isASCIICompatible;
 339         private final String requestedCharsetName;
 340         private final boolean isTrusted;
 341 
 342         private StringEncoder(Charset cs, String rcn) {
 343             this.requestedCharsetName = rcn;
 344             this.cs = cs;
 345             this.ce = cs.newEncoder()
 346                 .onMalformedInput(CodingErrorAction.REPLACE)
 347                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 348             this.isTrusted = (cs.getClass().getClassLoader0() == null);
 349             this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
 350                     ((ArrayEncoder)ce).isASCIICompatible();
 351         }
 352 
 353         String charsetName() {
 354             if (cs instanceof HistoricallyNamedCharset)
 355                 return ((HistoricallyNamedCharset)cs).historicalName();
 356             return cs.name();
 357         }
 358 
 359         final String requestedCharsetName() {
 360             return requestedCharsetName;
 361         }
 362 
 363         byte[] encode(byte coder, byte[] val) {
 364             // fastpath for ascii compatible
 365             if (coder == LATIN1 && isASCIICompatible &&
 366                 !hasNegatives(val, 0, val.length)) {
 367                 return Arrays.copyOf(val, val.length);
 368             }
 369             int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 370             int en = scale(len, ce.maxBytesPerChar());
 371             byte[] ba = new byte[en];
 372             if (len == 0) {
 373                 return ba;
 374             }
 375             if (ce instanceof ArrayEncoder) {
 376                 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 377                                               : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 378                 if (blen != -1) {
 379                     return safeTrim(ba, blen, isTrusted);
 380                 }
 381             }
 382             char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 383                                            : StringUTF16.toChars(val);
 384             ce.reset();
 385             ByteBuffer bb = ByteBuffer.wrap(ba);
 386             CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 387             try {
 388                 CoderResult cr = ce.encode(cb, bb, true);
 389                 if (!cr.isUnderflow())
 390                     cr.throwException();
 391                 cr = ce.flush(bb);
 392                 if (!cr.isUnderflow())
 393                     cr.throwException();
 394             } catch (CharacterCodingException x) {
 395                 // Substitution is always enabled,
 396                 // so this shouldn't happen
 397                 throw new Error(x);
 398             }
 399             return safeTrim(ba, bb.position(), isTrusted);
 400         }
 401     }
 402 
 403     static byte[] encode(String charsetName, byte coder, byte[] val)
 404         throws UnsupportedEncodingException
 405     {
 406         StringEncoder se = deref(encoder);
 407         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 408         if ((se == null) || !(csn.equals(se.requestedCharsetName())
 409                               || csn.equals(se.charsetName()))) {
 410             se = null;
 411             try {
 412                 Charset cs = lookupCharset(csn);
 413                 if (cs != null) {
 414                     if (cs == UTF_8) {
 415                         return encodeUTF8(coder, val, true);
 416                     }
 417                     if (cs == ISO_8859_1) {
 418                         return encode8859_1(coder, val);
 419                     }
 420                     if (cs == US_ASCII) {
 421                         return encodeASCII(coder, val);
 422                     }
 423                     se = new StringEncoder(cs, csn);
 424                 }
 425             } catch (IllegalCharsetNameException x) {}
 426             if (se == null) {
 427                 throw new UnsupportedEncodingException (csn);
 428             }
 429             set(encoder, se);
 430         }
 431         return se.encode(coder, val);
 432     }
 433 
 434     static byte[] encode(Charset cs, byte coder, byte[] val) {
 435         if (cs == UTF_8) {
 436             return encodeUTF8(coder, val, true);
 437         }
 438         if (cs == ISO_8859_1) {
 439             return encode8859_1(coder, val);
 440         }
 441         if (cs == US_ASCII) {
 442             return encodeASCII(coder, val);
 443         }
 444         CharsetEncoder ce = cs.newEncoder();
 445         // fastpath for ascii compatible
 446         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
 447                                  ((ArrayEncoder)ce).isASCIICompatible() &&
 448                                  !hasNegatives(val, 0, val.length)))) {
 449             return Arrays.copyOf(val, val.length);
 450         }
 451         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 452         int en = scale(len, ce.maxBytesPerChar());
 453         byte[] ba = new byte[en];
 454         if (len == 0) {
 455             return ba;
 456         }
 457         ce.onMalformedInput(CodingErrorAction.REPLACE)
 458           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 459           .reset();
 460         if (ce instanceof ArrayEncoder) {
 461             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 462                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 463             if (blen != -1) {
 464                 return safeTrim(ba, blen, true);
 465             }
 466         }
 467         boolean isTrusted = cs.getClass().getClassLoader0() == null ||
 468                             System.getSecurityManager() == null;
 469         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 470                                        : StringUTF16.toChars(val);
 471         ByteBuffer bb = ByteBuffer.wrap(ba);
 472         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 473         try {
 474             CoderResult cr = ce.encode(cb, bb, true);
 475             if (!cr.isUnderflow())
 476                 cr.throwException();
 477             cr = ce.flush(bb);
 478             if (!cr.isUnderflow())
 479                 cr.throwException();
 480         } catch (CharacterCodingException x) {
 481             throw new Error(x);
 482         }
 483         return safeTrim(ba, bb.position(), isTrusted);
 484     }
 485 
 486     static byte[] encode(byte coder, byte[] val) {
 487         Charset cs = Charset.defaultCharset();
 488         if (cs == UTF_8) {
 489             return encodeUTF8(coder, val, true);
 490         }
 491         if (cs == ISO_8859_1) {
 492             return encode8859_1(coder, val);
 493         }
 494         if (cs == US_ASCII) {
 495             return encodeASCII(coder, val);
 496         }
 497         StringEncoder se = deref(encoder);
 498         if (se == null || !cs.name().equals(se.cs.name())) {
 499             se = new StringEncoder(cs, cs.name());
 500             set(encoder, se);
 501         }
 502         return se.encode(coder, val);
 503     }
 504 
 505     /**
 506      *  Print a message directly to stderr, bypassing all character conversion
 507      *  methods.
 508      *  @param msg  message to print
 509      */
 510     private static native void err(String msg);
 511 
 512      /* The cached Result for each thread */
 513     private static final ThreadLocal<StringCoding.Result>
 514         resultCached = new ThreadLocal<>() {
 515             protected StringCoding.Result initialValue() {
 516                 return new StringCoding.Result();
 517             }};
 518 
 519     ////////////////////////// ascii //////////////////////////////
 520 
 521     private static Result decodeASCII(byte[] ba, int off, int len) {
 522         Result result = resultCached.get();
 523         if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
 524             return result.with(Arrays.copyOfRange(ba, off, off + len),
 525                                LATIN1);
 526         }
 527         byte[] dst = new byte[len<<1];
 528         int dp = 0;
 529         while (dp < len) {
 530             int b = ba[off++];
 531             putChar(dst, dp++, (b >= 0) ? (char)b : repl);
 532         }
 533         return result.with(dst, UTF16);
 534     }
 535 
 536     private static byte[] encodeASCII(byte coder, byte[] val) {
 537         if (coder == LATIN1) {
 538             byte[] dst = new byte[val.length];
 539             for (int i = 0; i < val.length; i++) {
 540                 if (val[i] < 0) {
 541                     dst[i] = '?';
 542                 } else {
 543                     dst[i] = val[i];
 544                 }
 545             }
 546             return dst;
 547         }
 548         int len = val.length >> 1;
 549         byte[] dst = new byte[len];
 550         int dp = 0;
 551         for (int i = 0; i < len; i++) {
 552             char c = StringUTF16.getChar(val, i);
 553             if (c < 0x80) {
 554                 dst[dp++] = (byte)c;
 555                 continue;
 556             }
 557             if (Character.isHighSurrogate(c) && i + 1 < len &&
 558                 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
 559                 i++;
 560             }
 561             dst[dp++] = '?';
 562         }
 563         if (len == dp) {
 564             return dst;
 565         }
 566         return Arrays.copyOf(dst, dp);
 567     }
 568 
 569     ////////////////////////// latin1/8859_1 ///////////////////////////
 570 
 571     private static Result decodeLatin1(byte[] ba, int off, int len) {
 572        Result result = resultCached.get();
 573        if (COMPACT_STRINGS) {
 574            return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
 575        } else {
 576            return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 577        }
 578     }
 579 
 580     @HotSpotIntrinsicCandidate
 581     private static int implEncodeISOArray(byte[] sa, int sp,
 582                                           byte[] da, int dp, int len) {
 583         int i = 0;
 584         for (; i < len; i++) {
 585             char c = StringUTF16.getChar(sa, sp++);
 586             if (c > '\u00FF')
 587                 break;
 588             da[dp++] = (byte)c;
 589         }
 590         return i;
 591     }
 592 
 593     private static byte[] encode8859_1(byte coder, byte[] val) {
 594         return encode8859_1(coder, val, true);
 595     }
 596 
 597     private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) {
 598         if (coder == LATIN1) {
 599             return Arrays.copyOf(val, val.length);
 600         }
 601         int len = val.length >> 1;
 602         byte[] dst = new byte[len];
 603         int dp = 0;
 604         int sp = 0;
 605         int sl = len;
 606         while (sp < sl) {
 607             int ret = implEncodeISOArray(val, sp, dst, dp, len);
 608             sp = sp + ret;
 609             dp = dp + ret;
 610             if (ret != len) {
 611                 if (!doReplace) {
 612                     throwUnmappable(sp, 1);
 613                 }
 614                 char c = StringUTF16.getChar(val, sp++);
 615                 if (Character.isHighSurrogate(c) && sp < sl &&
 616                     Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
 617                     sp++;
 618                 }
 619                 dst[dp++] = '?';
 620                 len = sl - sp;
 621             }
 622         }
 623         if (dp == dst.length) {
 624             return dst;
 625         }
 626         return Arrays.copyOf(dst, dp);
 627     }
 628 
 629     //////////////////////////////// utf8 ////////////////////////////////////
 630 
 631     private static boolean isNotContinuation(int b) {
 632         return (b & 0xc0) != 0x80;
 633     }
 634 
 635     private static boolean isMalformed3(int b1, int b2, int b3) {
 636         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 637                (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
 638     }
 639 
 640     private static boolean isMalformed3_2(int b1, int b2) {
 641         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 642                (b2 & 0xc0) != 0x80;
 643     }
 644 
 645     private static boolean isMalformed4(int b2, int b3, int b4) {
 646         return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
 647                (b4 & 0xc0) != 0x80;
 648     }
 649 
 650     private static boolean isMalformed4_2(int b1, int b2) {
 651         return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
 652                (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 653                (b2 & 0xc0) != 0x80;
 654     }
 655 
 656     private static boolean isMalformed4_3(int b3) {
 657         return (b3 & 0xc0) != 0x80;
 658     }
 659 
 660     // for nb == 3/4
 661     private static int malformedN(byte[] src, int sp, int nb) {
 662         if (nb == 3) {
 663             int b1 = src[sp++];
 664             int b2 = src[sp++];    // no need to lookup b3
 665             return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 666                     isNotContinuation(b2)) ? 1 : 2;
 667         } else if (nb == 4) { // we don't care the speed here
 668             int b1 = src[sp++] & 0xff;
 669             int b2 = src[sp++] & 0xff;
 670             if (b1 > 0xf4 ||
 671                 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
 672                 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 673                 isNotContinuation(b2))
 674                 return 1;
 675             if (isNotContinuation(src[sp++]))
 676                 return 2;
 677             return 3;
 678         }
 679         assert false;
 680         return -1;
 681     }
 682 
 683     private static void throwMalformed(int off, int nb) {
 684         String msg = "malformed input off : " + off + ", length : " + nb;
 685         throw new IllegalArgumentException(msg, new MalformedInputException(nb));
 686     }
 687 
 688     private static void throwMalformed(byte[] val) {
 689         int dp = 0;
 690         while (dp < val.length && val[dp] >=0) { dp++; }
 691         throwMalformed(dp, 1);
 692     }
 693 
 694     private static void throwUnmappable(int off, int nb) {
 695         String msg = "malformed input off : " + off + ", length : " + nb;
 696         throw new IllegalArgumentException(msg, new UnmappableCharacterException(nb));
 697     }
 698 
 699     private static void throwUnmappable(byte[] val) {
 700         int dp = 0;
 701         while (dp < val.length && val[dp] >=0) { dp++; }
 702         throwUnmappable(dp, 1);
 703     }
 704 
 705     private static char repl = '\ufffd';
 706 
 707     private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
 708         // ascii-bais, which has a relative impact to the non-ascii-only bytes
 709         if (COMPACT_STRINGS && !hasNegatives(src, sp, len))
 710             return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len),
 711                                            LATIN1);
 712         return decodeUTF8_0(src, sp, len, doReplace);
 713     }
 714 
 715     private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
 716         Result ret = resultCached.get();
 717 
 718         int sl = sp + len;
 719         int dp = 0;
 720         byte[] dst = new byte[len];
 721 
 722         if (COMPACT_STRINGS) {
 723             while (sp < sl) {
 724                 int b1 = src[sp];
 725                 if (b1 >= 0) {
 726                     dst[dp++] = (byte)b1;
 727                     sp++;
 728                     continue;
 729                 }
 730                 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
 731                     sp + 1 < sl) {
 732                     int b2 = src[sp + 1];
 733                     if (!isNotContinuation(b2)) {
 734                         dst[dp++] = (byte)(((b1 << 6) ^ b2)^
 735                                            (((byte) 0xC0 << 6) ^
 736                                            ((byte) 0x80 << 0)));
 737                         sp += 2;
 738                         continue;
 739                     }
 740                 }
 741                 // anything not a latin1, including the repl
 742                 // we have to go with the utf16
 743                 break;
 744             }
 745             if (sp == sl) {
 746                 if (dp != dst.length) {
 747                     dst = Arrays.copyOf(dst, dp);
 748                 }
 749                 return ret.with(dst, LATIN1);
 750             }
 751         }
 752         if (dp == 0) {
 753             dst = new byte[len << 1];
 754         } else {
 755             byte[] buf = new byte[len << 1];
 756             StringLatin1.inflate(dst, 0, buf, 0, dp);
 757             dst = buf;
 758         }
 759         while (sp < sl) {
 760             int b1 = src[sp++];
 761             if (b1 >= 0) {
 762                 putChar(dst, dp++, (char) b1);
 763             } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
 764                 if (sp < sl) {
 765                     int b2 = src[sp++];
 766                     if (isNotContinuation(b2)) {
 767                         if (!doReplace) {
 768                             throwMalformed(sp - 1, 1);
 769                         }
 770                         putChar(dst, dp++, repl);
 771                         sp--;
 772                     } else {
 773                         putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
 774                                                   (((byte) 0xC0 << 6) ^
 775                                                   ((byte) 0x80 << 0))));
 776                     }
 777                     continue;
 778                 }
 779                 if (!doReplace) {
 780                     throwMalformed(sp, 1);  // underflow()
 781                 }
 782                 putChar(dst, dp++, repl);
 783                 break;
 784             } else if ((b1 >> 4) == -2) {
 785                 if (sp + 1 < sl) {
 786                     int b2 = src[sp++];
 787                     int b3 = src[sp++];
 788                     if (isMalformed3(b1, b2, b3)) {
 789                         if (!doReplace) {
 790                             throwMalformed(sp - 3, 3);
 791                         }
 792                         putChar(dst, dp++, repl);
 793                         sp -= 3;
 794                         sp += malformedN(src, sp, 3);
 795                     } else {
 796                         char c = (char)((b1 << 12) ^
 797                                         (b2 <<  6) ^
 798                                         (b3 ^
 799                                          (((byte) 0xE0 << 12) ^
 800                                          ((byte) 0x80 <<  6) ^
 801                                          ((byte) 0x80 <<  0))));
 802                         if (isSurrogate(c)) {
 803                             if (!doReplace) {
 804                                 throwMalformed(sp - 3, 3);
 805                             }
 806                             putChar(dst, dp++, repl);
 807                         } else {
 808                             putChar(dst, dp++, c);
 809                         }
 810                     }
 811                     continue;
 812                 }
 813                 if (sp  < sl && isMalformed3_2(b1, src[sp])) {
 814                     if (!doReplace) {
 815                         throwMalformed(sp - 1, 2);
 816                     }
 817                     putChar(dst, dp++, repl);
 818                     continue;
 819                 }
 820                 if (!doReplace){
 821                     throwMalformed(sp, 1);
 822                 }
 823                 putChar(dst, dp++, repl);
 824                 break;
 825             } else if ((b1 >> 3) == -2) {
 826                 if (sp + 2 < sl) {
 827                     int b2 = src[sp++];
 828                     int b3 = src[sp++];
 829                     int b4 = src[sp++];
 830                     int uc = ((b1 << 18) ^
 831                               (b2 << 12) ^
 832                               (b3 <<  6) ^
 833                               (b4 ^
 834                                (((byte) 0xF0 << 18) ^
 835                                ((byte) 0x80 << 12) ^
 836                                ((byte) 0x80 <<  6) ^
 837                                ((byte) 0x80 <<  0))));
 838                     if (isMalformed4(b2, b3, b4) ||
 839                         !isSupplementaryCodePoint(uc)) { // shortest form check
 840                         if (!doReplace) {
 841                             throwMalformed(sp - 4, 4);
 842                         }
 843                         putChar(dst, dp++, repl);
 844                         sp -= 4;
 845                         sp += malformedN(src, sp, 4);
 846                     } else {
 847                         putChar(dst, dp++, highSurrogate(uc));
 848                         putChar(dst, dp++, lowSurrogate(uc));
 849                     }
 850                     continue;
 851                 }
 852                 b1 &= 0xff;
 853                 if (b1 > 0xf4 ||
 854                     sp  < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
 855                     if (!doReplace) {
 856                         throwMalformed(sp - 1, 1);  // or 2
 857                     }
 858                     putChar(dst, dp++, repl);
 859                     continue;
 860                 }
 861                 if (!doReplace) {
 862                     throwMalformed(sp - 1, 1);
 863                 }
 864                 sp++;
 865                 putChar(dst, dp++, repl);
 866                 if (sp  < sl && isMalformed4_3(src[sp])) {
 867                     continue;
 868                 }
 869                 break;
 870             } else {
 871                 if (!doReplace) {
 872                     throwMalformed(sp - 1, 1);
 873                 }
 874                 putChar(dst, dp++, repl);
 875             }
 876         }
 877         if (dp != len) {
 878             dst = Arrays.copyOf(dst, dp << 1);
 879         }
 880         return ret.with(dst, UTF16);
 881     }
 882 
 883     private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
 884         if (coder == UTF16)
 885             return encodeUTF8_UTF16(val, doReplace);
 886 
 887         if (!hasNegatives(val, 0, val.length))
 888             return Arrays.copyOf(val, val.length);
 889 
 890         int dp = 0;
 891         byte[] dst = new byte[val.length << 1];
 892         for (int sp = 0; sp < val.length; sp++) {
 893             byte c = val[sp];
 894             if (c < 0) {
 895                 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
 896                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 897             } else {
 898                 dst[dp++] = c;
 899             }
 900         }
 901         if (dp == dst.length)
 902             return dst;
 903         return Arrays.copyOf(dst, dp);
 904     }
 905 
 906     private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
 907         int dp = 0;
 908         int sp = 0;
 909         int sl = val.length >> 1;
 910         byte[] dst = new byte[sl * 3];
 911         char c;
 912         while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
 913             // ascii fast loop;
 914             dst[dp++] = (byte)c;
 915             sp++;
 916         }
 917         while (sp < sl) {
 918             c = StringUTF16.getChar(val, sp++);
 919             if (c < 0x80) {
 920                 dst[dp++] = (byte)c;
 921             } else if (c < 0x800) {
 922                 dst[dp++] = (byte)(0xc0 | (c >> 6));
 923                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 924             } else if (Character.isSurrogate(c)) {
 925                 int uc = -1;
 926                 char c2;
 927                 if (Character.isHighSurrogate(c) && sp < sl &&
 928                     Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
 929                     uc = Character.toCodePoint(c, c2);
 930                 }
 931                 if (uc < 0) {
 932                     if (doReplace) {
 933                         dst[dp++] = '?';
 934                     } else {
 935                         throwUnmappable(sp - 1, 1); // or 2, does not matter here
 936                     }
 937                 } else {
 938                     dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
 939                     dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 940                     dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 941                     dst[dp++] = (byte)(0x80 | (uc & 0x3f));
 942                     sp++;  // 2 chars
 943                 }
 944             } else {
 945                 // 3 bytes, 16 bits
 946                 dst[dp++] = (byte)(0xe0 | ((c >> 12)));
 947                 dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 948                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 949             }
 950         }
 951         if (dp == dst.length) {
 952             return dst;
 953         }
 954         return Arrays.copyOf(dst, dp);
 955     }
 956 
 957     ////////////////////// for j.u.z.ZipCoder //////////////////////////
 958 
 959     /*
 960      * Throws iae, instead of replacing, if malformed or unmappable.
 961      */
 962     static String newStringUTF8NoRepl(byte[] src, int off, int len) {
 963         if (COMPACT_STRINGS && !hasNegatives(src, off, len))
 964             return new String(Arrays.copyOfRange(src, off, off + len), LATIN1);
 965         Result ret = decodeUTF8_0(src, off, len, false);
 966         return new String(ret.value, ret.coder);
 967     }
 968 
 969     /*
 970      * Throws iae, instead of replacing, if unmappable.
 971      */
 972     static byte[] getBytesUTF8NoRepl(String s) {
 973         return encodeUTF8(s.coder(), s.value(), false);
 974     }
 975 
 976     ////////////////////// for j.n.f.Files //////////////////////////
 977 
 978     private static boolean isASCII(byte[] src) {
 979         return !hasNegatives(src, 0, src.length);
 980     }
 981 
 982     private static String newStringLatin1(byte[] src) {
 983         if (COMPACT_STRINGS)
 984            return new String(src, LATIN1);
 985         return new String(StringLatin1.inflate(src, 0, src.length), UTF16);
 986     }
 987 
 988     static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException {
 989         try {
 990             return newStringNoRepl1(src, cs);
 991         } catch (IllegalArgumentException e) {
 992             Throwable cause = e.getCause();
 993             if (cause != null && cause instanceof MalformedInputException) {
 994                 throw (MalformedInputException)cause;
 995             }
 996             throw new UnmappableCharacterException(1);
 997         }
 998     }
 999 
1000     static String newStringNoRepl1(byte[] src, Charset cs) {
1001         if (cs == UTF_8) {
1002             if (COMPACT_STRINGS && isASCII(src))
1003                 return new String(src, LATIN1);
1004             Result ret = decodeUTF8_0(src, 0, src.length, false);
1005             return new String(ret.value, ret.coder);
1006         }
1007         if (cs == ISO_8859_1) {
1008             return newStringLatin1(src);
1009         }
1010         if (cs == US_ASCII) {
1011             if (isASCII(src)) {
1012                 return newStringLatin1(src);
1013             } else {
1014                 throwMalformed(src);
1015             }
1016         }
1017 
1018         CharsetDecoder cd = cs.newDecoder();
1019         // ascii fastpath
1020         if ((cd instanceof ArrayDecoder) &&
1021             ((ArrayDecoder)cd).isASCIICompatible() && isASCII(src)) {
1022             return newStringLatin1(src);
1023         }
1024         int len = src.length;
1025         if (len == 0) {
1026             return "";
1027         }
1028         int en = scale(len, cd.maxCharsPerByte());
1029         char[] ca = new char[en];
1030         if (cs.getClass().getClassLoader0() != null &&
1031             System.getSecurityManager() != null) {
1032             src = Arrays.copyOf(src, len);
1033         }
1034         ByteBuffer bb = ByteBuffer.wrap(src);
1035         CharBuffer cb = CharBuffer.wrap(ca);
1036         try {
1037             CoderResult cr = cd.decode(bb, cb, true);
1038             if (!cr.isUnderflow())
1039                 cr.throwException();
1040             cr = cd.flush(cb);
1041             if (!cr.isUnderflow())
1042                 cr.throwException();
1043         } catch (CharacterCodingException x) {
1044             throw new IllegalArgumentException(x);  // todo
1045         }
1046         Result ret = resultCached.get().with(ca, 0, cb.position());
1047         return new String(ret.value, ret.coder);
1048     }
1049 
1050     /*
1051      * Throws iae, instead of replacing, if unmappable.
1052      */
1053     static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException {
1054         try {
1055             return getBytesNoRepl1(s, cs);
1056         } catch (IllegalArgumentException e) {
1057             Throwable cause = e.getCause();
1058             if (cause != null && cause instanceof UnmappableCharacterException) {
1059                 throw (UnmappableCharacterException)cause;
1060             }
1061             throw new UnmappableCharacterException(1);
1062         }
1063     }
1064 
1065     static byte[] getBytesNoRepl1(String s, Charset cs) {
1066         byte[] val = s.value();
1067         byte coder = s.coder();
1068         if (cs == UTF_8) {
1069             if (isASCII(val)) {
1070                 return val;
1071             }
1072             return encodeUTF8(coder, val, false);
1073         }
1074         if (cs == ISO_8859_1) {
1075             if (coder == LATIN1) {
1076                 return val;
1077             }
1078             return encode8859_1(coder, val, false);
1079         }
1080         if (cs == US_ASCII) {
1081             if (coder == LATIN1) {
1082                 if (isASCII(val)) {
1083                     return val;
1084                 } else {
1085                     throwUnmappable(val);
1086                 }
1087             }
1088         }
1089         CharsetEncoder ce = cs.newEncoder();
1090         // fastpath for ascii compatible
1091         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
1092                                  ((ArrayEncoder)ce).isASCIICompatible() &&
1093                                  isASCII(val)))) {
1094             return val;
1095         }
1096         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
1097         int en = scale(len, ce.maxBytesPerChar());
1098         byte[] ba = new byte[en];
1099         if (len == 0) {
1100             return ba;
1101         }
1102         if (ce instanceof ArrayEncoder) {
1103             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
1104                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
1105             if (blen != -1) {
1106                 return safeTrim(ba, blen, true);
1107             }
1108         }
1109         boolean isTrusted = cs.getClass().getClassLoader0() == null ||
1110                             System.getSecurityManager() == null;
1111         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
1112                                        : StringUTF16.toChars(val);
1113         ByteBuffer bb = ByteBuffer.wrap(ba);
1114         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
1115         try {
1116             CoderResult cr = ce.encode(cb, bb, true);
1117             if (!cr.isUnderflow())
1118                 cr.throwException();
1119             cr = ce.flush(bb);
1120             if (!cr.isUnderflow())
1121                 cr.throwException();
1122         } catch (CharacterCodingException x) {
1123             throw new IllegalArgumentException(x);
1124         }
1125         return safeTrim(ba, bb.position(), isTrusted);
1126     }
1127 }