1 /*
2 * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
30 import java.nio.ByteBuffer;
31 import java.nio.CharBuffer;
32 import java.nio.charset.Charset;
33 import java.nio.charset.CharsetDecoder;
34 import java.nio.charset.CharsetEncoder;
35 import java.nio.charset.CharacterCodingException;
36 import java.nio.charset.CoderResult;
37 import java.nio.charset.CodingErrorAction;
38 import java.nio.charset.IllegalCharsetNameException;
39 import java.nio.charset.UnsupportedCharsetException;
40 import java.util.Arrays;
41 import jdk.internal.HotSpotIntrinsicCandidate;
42 import sun.nio.cs.HistoricallyNamedCharset;
43 import sun.nio.cs.ArrayDecoder;
44 import sun.nio.cs.ArrayEncoder;
45 import sun.nio.cs.StandardCharsets;
46
47 import static java.lang.String.LATIN1;
48 import static java.lang.String.UTF16;
49 import static java.lang.String.COMPACT_STRINGS;
50
51 /**
52 * Utility class for string encoding and decoding.
53 */
54
55 class StringCoding {
56
/** Private constructor: instances of this class cannot be created from outside. */
private StringCoding() {
}
58
59 /** The cached coders for each thread */
60 private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
61 new ThreadLocal<>();
62 private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
63 new ThreadLocal<>();
64
65 private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;
66 private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
67 private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
68
69 private static boolean warnUnsupportedCharset = true;
70
71 private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
72 SoftReference<T> sr = tl.get();
73 if (sr == null)
74 return null;
75 return sr.get();
76 }
77
78 private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
79 tl.set(new SoftReference<>(ob));
80 }
81
82 // Trim the given byte array to the given length
83 //
84 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
85 if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
86 return ba;
87 else
88 return Arrays.copyOf(ba, len);
89 }
90
91 private static int scale(int len, float expansionFactor) {
92 // We need to perform double, not float, arithmetic; otherwise
93 // we lose low order bits when len is larger than 2**24.
94 return (int)(len * (double)expansionFactor);
95 }
96
97 private static Charset lookupCharset(String csn) {
98 if (Charset.isSupported(csn)) {
99 try {
100 return Charset.forName(csn);
101 } catch (UnsupportedCharsetException x) {
102 throw new Error(x);
103 }
104 }
105 return null;
106 }
107
108 private static void warnUnsupportedCharset(String csn) {
109 if (warnUnsupportedCharset) {
110 // Use err(String) rather than the Logging API or System.err
111 // since this method may be called during VM initialization
112 // before either is available.
113 err("WARNING: Default charset " + csn +
114 " not supported, using ISO-8859-1 instead\n");
115 warnUnsupportedCharset = false;
116 }
117 }
118
119 static class Result {
120 byte[] value;
121 byte coder;
122
123 Result with() {
124 coder = COMPACT_STRINGS ? LATIN1 : UTF16;
125 value = new byte[0];
126 return this;
127 }
128
129 Result with(char[] val, int off, int len) {
130 if (String.COMPACT_STRINGS) {
131 byte[] bs = StringUTF16.compress(val, off, len);
132 if (bs != null) {
133 value = bs;
134 coder = LATIN1;
135 return this;
136 }
137 }
138 coder = UTF16;
207 }
208 cd.reset();
209 ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
210 CharBuffer cb = CharBuffer.wrap(ca);
211 try {
212 CoderResult cr = cd.decode(bb, cb, true);
213 if (!cr.isUnderflow())
214 cr.throwException();
215 cr = cd.flush(cb);
216 if (!cr.isUnderflow())
217 cr.throwException();
218 } catch (CharacterCodingException x) {
219 // Substitution is always enabled,
220 // so this shouldn't happen
221 throw new Error(x);
222 }
223 return result.with(ca, 0, cb.position());
224 }
225 }
226
227 private static class StringDecoder8859_1 extends StringDecoder {
228 StringDecoder8859_1(Charset cs, String rcn) {
229 super(cs, rcn);
230 }
231 Result decode(byte[] ba, int off, int len) {
232 if (COMPACT_STRINGS) {
233 return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
234 } else {
235 return result.with(StringLatin1.inflate(ba, off, len), UTF16);
236 }
237 }
238 }
239
240 static Result decode(String charsetName, byte[] ba, int off, int len)
241 throws UnsupportedEncodingException
242 {
243 StringDecoder sd = deref(decoder);
244 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
245 if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
246 || csn.equals(sd.charsetName()))) {
247 sd = null;
248 try {
249 Charset cs = lookupCharset(csn);
250 if (cs != null) {
251 if (cs == UTF_8) {
252 sd = new StringDecoderUTF8(cs, csn);
253 } else if (cs == ISO_8859_1) {
254 sd = new StringDecoder8859_1(cs, csn);
255 } else {
256 sd = new StringDecoder(cs, csn);
257 }
258 }
259 } catch (IllegalCharsetNameException x) {}
260 if (sd == null)
261 throw new UnsupportedEncodingException(csn);
262 set(decoder, sd);
263 }
264 return sd.decode(ba, off, len);
265 }
266
267 static Result decode(Charset cs, byte[] ba, int off, int len) {
268 // (1)We never cache the "external" cs, the only benefit of creating
269 // an additional StringDe/Encoder object to wrap it is to share the
270 // de/encode() method. These SD/E objects are short-lived, the young-gen
271 // gc should be able to take care of them well. But the best approach
272 // is still not to generate them if not really necessary.
273 // (2)The defensive copy of the input byte/char[] has a big performance
274 // impact, as well as the outgoing result byte/char[]. Need to do the
275 // optimization check of (sm==null && classLoader0==null) for both.
276 // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
277 // is only checked (and then isTrusted gets set) when (SM==null). It is
278 // possible that the SM==null for now but then SM is NOT null later
279 // when safeTrim() is invoked...the "safe" way to do is to redundant
280 // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
281 // but it then can be argued that the SM is null when the operation
282 // is started...
283 if (cs == UTF_8) {
284 return StringDecoderUTF8.decode(ba, off, len, new Result());
285 }
286 CharsetDecoder cd = cs.newDecoder();
287 // ascii fastpath
288 if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) &&
289 ((ArrayDecoder)cd).isASCIICompatible() &&
290 !hasNegatives(ba, off, len))) {
291 if (COMPACT_STRINGS) {
292 return new Result().with(Arrays.copyOfRange(ba, off, off + len),
293 LATIN1);
294 } else {
295 return new Result().with(StringLatin1.inflate(ba, off, len), UTF16);
296 }
297 }
298 int en = scale(len, cd.maxCharsPerByte());
299 if (len == 0) {
300 return new Result().with();
301 }
302 if (cs.getClass().getClassLoader0() != null &&
303 System.getSecurityManager() != null) {
304 ba = Arrays.copyOfRange(ba, off, off + len);
305 off = 0;
306 }
307 cd.onMalformedInput(CodingErrorAction.REPLACE)
308 .onUnmappableCharacter(CodingErrorAction.REPLACE)
309 .reset();
310
311 char[] ca = new char[en];
312 if (cd instanceof ArrayDecoder) {
313 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
314 return new Result().with(ca, 0, clen);
315 }
316 ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
317 CharBuffer cb = CharBuffer.wrap(ca);
318 try {
319 CoderResult cr = cd.decode(bb, cb, true);
320 if (!cr.isUnderflow())
321 cr.throwException();
322 cr = cd.flush(cb);
323 if (!cr.isUnderflow())
324 cr.throwException();
325 } catch (CharacterCodingException x) {
326 // Substitution is always enabled,
327 // so this shouldn't happen
328 throw new Error(x);
329 }
330 return new Result().with(ca, 0, cb.position());
331 }
332
333 static Result decode(byte[] ba, int off, int len) {
334 String csn = Charset.defaultCharset().name();
335 try {
336 // use charset name decode() variant which provides caching.
337 return decode(csn, ba, off, len);
338 } catch (UnsupportedEncodingException x) {
339 warnUnsupportedCharset(csn);
340 }
341 try {
342 return decode("ISO-8859-1", ba, off, len);
343 } catch (UnsupportedEncodingException x) {
344 // If this code is hit during VM initialization, err(String) is
345 // the only way we will be able to get any kind of error message.
346 err("ISO-8859-1 charset not available: " + x.toString() + "\n");
347 // If we can not find ISO-8859-1 (a required encoding) then things
348 // are seriously wrong with the installation.
349 System.exit(1);
350 return null;
351 }
352 }
353
354 // -- Encoding --
355 private static class StringEncoder {
356 private Charset cs;
357 private CharsetEncoder ce;
358 private final boolean isASCIICompatible;
359 private final String requestedCharsetName;
360 private final boolean isTrusted;
361
362 private StringEncoder(Charset cs, String rcn) {
363 this.requestedCharsetName = rcn;
364 this.cs = cs;
365 this.ce = cs.newEncoder()
366 .onMalformedInput(CodingErrorAction.REPLACE)
367 .onUnmappableCharacter(CodingErrorAction.REPLACE);
368 this.isTrusted = (cs.getClass().getClassLoader0() == null);
369 this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
370 ((ArrayEncoder)ce).isASCIICompatible();
371 }
376 return cs.name();
377 }
378
379 final String requestedCharsetName() {
380 return requestedCharsetName;
381 }
382
383 byte[] encode(byte coder, byte[] val) {
384 // fastpath for ascii compatible
385 if (coder == LATIN1 && isASCIICompatible &&
386 !hasNegatives(val, 0, val.length)) {
387 return Arrays.copyOf(val, val.length);
388 }
389 int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
390 int en = scale(len, ce.maxBytesPerChar());
391 byte[] ba = new byte[en];
392 if (len == 0) {
393 return ba;
394 }
395 if (ce instanceof ArrayEncoder) {
396 if (!isTrusted) {
397 val = Arrays.copyOf(val, val.length);
398 }
399 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
400 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
401 if (blen != -1) {
402 return safeTrim(ba, blen, isTrusted);
403 }
404 }
405 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
406 : StringUTF16.toChars(val);
407 ce.reset();
408 ByteBuffer bb = ByteBuffer.wrap(ba);
409 CharBuffer cb = CharBuffer.wrap(ca, 0, len);
410 try {
411 CoderResult cr = ce.encode(cb, bb, true);
412 if (!cr.isUnderflow())
413 cr.throwException();
414 cr = ce.flush(bb);
415 if (!cr.isUnderflow())
416 cr.throwException();
417 } catch (CharacterCodingException x) {
418 // Substitution is always enabled,
419 // so this shouldn't happen
420 throw new Error(x);
421 }
422 return safeTrim(ba, bb.position(), isTrusted);
423 }
424 }
425
426 @HotSpotIntrinsicCandidate
427 private static int implEncodeISOArray(byte[] sa, int sp,
428 byte[] da, int dp, int len) {
429 int i = 0;
430 for (; i < len; i++) {
431 char c = StringUTF16.getChar(sa, sp++);
432 if (c > '\u00FF')
433 break;
434 da[dp++] = (byte)c;
435 }
436 return i;
437 }
438
439 static byte[] encode8859_1(byte coder, byte[] val) {
440 if (coder == LATIN1) {
441 return Arrays.copyOf(val, val.length);
442 }
443 int len = val.length >> 1;
444 byte[] dst = new byte[len];
445 int dp = 0;
446 int sp = 0;
447 int sl = len;
448 while (sp < sl) {
449 int ret = implEncodeISOArray(val, sp, dst, dp, len);
450 sp = sp + ret;
451 dp = dp + ret;
452 if (ret != len) {
453 char c = StringUTF16.getChar(val, sp++);
454 if (Character.isHighSurrogate(c) && sp < sl &&
455 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
456 sp++;
457 }
458 dst[dp++] = '?';
459 len = sl - sp;
460 }
461 }
462 if (dp == dst.length) {
463 return dst;
464 }
465 return Arrays.copyOf(dst, dp);
466 }
467
468 static byte[] encodeASCII(byte coder, byte[] val) {
469 if (coder == LATIN1) {
470 byte[] dst = new byte[val.length];
471 for (int i = 0; i < val.length; i++) {
472 if (val[i] < 0) {
473 dst[i] = '?';
474 } else {
475 dst[i] = val[i];
476 }
477 }
478 return dst;
479 }
480 int len = val.length >> 1;
481 byte[] dst = new byte[len];
482 int dp = 0;
483 for (int i = 0; i < len; i++) {
484 char c = StringUTF16.getChar(val, i);
485 if (c < 0x80) {
486 dst[dp++] = (byte)c;
487 continue;
488 }
489 if (Character.isHighSurrogate(c) && i + 1 < len &&
490 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
491 i++;
492 }
493 dst[dp++] = '?';
494 }
495 if (len == dp) {
496 return dst;
497 }
498 return Arrays.copyOf(dst, dp);
499 }
500
501 static byte[] encodeUTF8(byte coder, byte[] val) {
502 int dp = 0;
503 byte[] dst;
504 if (coder == LATIN1) {
505 dst = new byte[val.length << 1];
506 for (int sp = 0; sp < val.length; sp++) {
507 byte c = val[sp];
508 if (c < 0) {
509 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
510 dst[dp++] = (byte)(0x80 | (c & 0x3f));
511 } else {
512 dst[dp++] = c;
513 }
514 }
515 } else {
516 int sp = 0;
517 int sl = val.length >> 1;
518 dst = new byte[sl * 3];
519 char c;
520 while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
521 // ascii fast loop;
522 dst[dp++] = (byte)c;
523 sp++;
524 }
525 while (sp < sl) {
526 c = StringUTF16.getChar(val, sp++);
527 if (c < 0x80) {
528 dst[dp++] = (byte)c;
529 } else if (c < 0x800) {
530 dst[dp++] = (byte)(0xc0 | (c >> 6));
531 dst[dp++] = (byte)(0x80 | (c & 0x3f));
532 } else if (Character.isSurrogate(c)) {
533 int uc = -1;
534 char c2;
535 if (Character.isHighSurrogate(c) && sp < sl &&
536 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
537 uc = Character.toCodePoint(c, c2);
538 }
539 if (uc < 0) {
540 dst[dp++] = '?';
541 } else {
542 dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
543 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
544 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
545 dst[dp++] = (byte)(0x80 | (uc & 0x3f));
546 sp++; // 2 chars
547 }
548 } else {
549 // 3 bytes, 16 bits
550 dst[dp++] = (byte)(0xe0 | ((c >> 12)));
551 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));
552 dst[dp++] = (byte)(0x80 | (c & 0x3f));
553 }
554 }
555 }
556 if (dp == dst.length) {
557 return dst;
558 }
559 return Arrays.copyOf(dst, dp);
560 }
561
562 static byte[] encode(String charsetName, byte coder, byte[] val)
563 throws UnsupportedEncodingException
564 {
565 StringEncoder se = deref(encoder);
566 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
567 if ((se == null) || !(csn.equals(se.requestedCharsetName())
568 || csn.equals(se.charsetName()))) {
569 se = null;
570 try {
571 Charset cs = lookupCharset(csn);
572 if (cs != null) {
573 if (cs == UTF_8) {
574 return encodeUTF8(coder, val);
575 } else if (cs == ISO_8859_1) {
576 return encode8859_1(coder, val);
577 } else if (cs == US_ASCII) {
578 return encodeASCII(coder, val);
579 }
580 se = new StringEncoder(cs, csn);
581 }
582 } catch (IllegalCharsetNameException x) {}
583 if (se == null) {
584 throw new UnsupportedEncodingException (csn);
585 }
586 set(encoder, se);
587 }
588 return se.encode(coder, val);
589 }
590
591 static byte[] encode(Charset cs, byte coder, byte[] val) {
592 if (cs == UTF_8) {
593 return encodeUTF8(coder, val);
594 } else if (cs == ISO_8859_1) {
595 return encode8859_1(coder, val);
596 } else if (cs == US_ASCII) {
597 return encodeASCII(coder, val);
598 }
599 CharsetEncoder ce = cs.newEncoder();
600 // fastpath for ascii compatible
601 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
602 ((ArrayEncoder)ce).isASCIICompatible() &&
603 !hasNegatives(val, 0, val.length)))) {
604 return Arrays.copyOf(val, val.length);
605 }
606 int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
607 int en = scale(len, ce.maxBytesPerChar());
608 byte[] ba = new byte[en];
609 if (len == 0) {
610 return ba;
611 }
612 boolean isTrusted = cs.getClass().getClassLoader0() == null ||
613 System.getSecurityManager() == null;
614 ce.onMalformedInput(CodingErrorAction.REPLACE)
615 .onUnmappableCharacter(CodingErrorAction.REPLACE)
616 .reset();
617 if (ce instanceof ArrayEncoder) {
618 if (!isTrusted) {
619 val = Arrays.copyOf(val, val.length);
620 }
621 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
622 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
623 if (blen != -1) {
624 return safeTrim(ba, blen, isTrusted);
625 }
626 }
627 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
628 : StringUTF16.toChars(val);
629 ByteBuffer bb = ByteBuffer.wrap(ba);
630 CharBuffer cb = CharBuffer.wrap(ca, 0, len);
631 try {
632 CoderResult cr = ce.encode(cb, bb, true);
633 if (!cr.isUnderflow())
634 cr.throwException();
635 cr = ce.flush(bb);
636 if (!cr.isUnderflow())
637 cr.throwException();
638 } catch (CharacterCodingException x) {
639 throw new Error(x);
640 }
641 return safeTrim(ba, bb.position(), isTrusted);
642 }
643
644 static byte[] encode(byte coder, byte[] val) {
645 String csn = Charset.defaultCharset().name();
646 try {
647 // use charset name encode() variant which provides caching.
648 return encode(csn, coder, val);
649 } catch (UnsupportedEncodingException x) {
650 warnUnsupportedCharset(csn);
651 }
652 try {
653 return encode("ISO-8859-1", coder, val);
654 } catch (UnsupportedEncodingException x) {
655 // If this code is hit during VM initialization, err(String) is
656 // the only way we will be able to get any kind of error message.
657 err("ISO-8859-1 charset not available: " + x.toString() + "\n");
658 // If we can not find ISO-8859-1 (a required encoding) then things
659 // are seriously wrong with the installation.
660 System.exit(1);
661 return null;
662 }
663 }
664
665 /**
666 * Print a message directly to stderr, bypassing all character conversion
667 * methods.
668 * @param msg message to print
669 */
670 private static native void err(String msg);
671 }
|
1 /*
2 * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
30 import java.nio.ByteBuffer;
31 import java.nio.CharBuffer;
32 import java.nio.charset.Charset;
33 import java.nio.charset.CharsetDecoder;
34 import java.nio.charset.CharsetEncoder;
35 import java.nio.charset.CharacterCodingException;
36 import java.nio.charset.CoderResult;
37 import java.nio.charset.CodingErrorAction;
38 import java.nio.charset.IllegalCharsetNameException;
39 import java.nio.charset.UnsupportedCharsetException;
40 import java.util.Arrays;
41 import jdk.internal.HotSpotIntrinsicCandidate;
42 import sun.nio.cs.HistoricallyNamedCharset;
43 import sun.nio.cs.ArrayDecoder;
44 import sun.nio.cs.ArrayEncoder;
45 import sun.nio.cs.StandardCharsets;
46
47 import static java.lang.String.LATIN1;
48 import static java.lang.String.UTF16;
49 import static java.lang.String.COMPACT_STRINGS;
50 import static java.lang.Character.isSurrogate;
51 import static java.lang.Character.highSurrogate;
52 import static java.lang.Character.lowSurrogate;
53 import static java.lang.Character.isSupplementaryCodePoint;
54 import static java.lang.StringUTF16.putChar;
55
56 /**
57 * Utility class for string encoding and decoding.
58 */
59
60 class StringCoding {
61
/** Private constructor: instances of this class cannot be created from outside. */
private StringCoding() {
}
63
64 /** The cached coders for each thread */
65 private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
66 new ThreadLocal<>();
67 private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
68 new ThreadLocal<>();
69
70 private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;
71 private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
72 private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
73
74 private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
75 SoftReference<T> sr = tl.get();
76 if (sr == null)
77 return null;
78 return sr.get();
79 }
80
81 private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
82 tl.set(new SoftReference<>(ob));
83 }
84
85 // Trim the given byte array to the given length
86 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
87 if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
88 return ba;
89 else
90 return Arrays.copyOf(ba, len);
91 }
92
93 private static int scale(int len, float expansionFactor) {
94 // We need to perform double, not float, arithmetic; otherwise
95 // we lose low order bits when len is larger than 2**24.
96 return (int)(len * (double)expansionFactor);
97 }
98
99 private static Charset lookupCharset(String csn) {
100 if (Charset.isSupported(csn)) {
101 try {
102 return Charset.forName(csn);
103 } catch (UnsupportedCharsetException x) {
104 throw new Error(x);
105 }
106 }
107 return null;
108 }
109
110 static class Result {
111 byte[] value;
112 byte coder;
113
114 Result with() {
115 coder = COMPACT_STRINGS ? LATIN1 : UTF16;
116 value = new byte[0];
117 return this;
118 }
119
120 Result with(char[] val, int off, int len) {
121 if (String.COMPACT_STRINGS) {
122 byte[] bs = StringUTF16.compress(val, off, len);
123 if (bs != null) {
124 value = bs;
125 coder = LATIN1;
126 return this;
127 }
128 }
129 coder = UTF16;
198 }
199 cd.reset();
200 ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
201 CharBuffer cb = CharBuffer.wrap(ca);
202 try {
203 CoderResult cr = cd.decode(bb, cb, true);
204 if (!cr.isUnderflow())
205 cr.throwException();
206 cr = cd.flush(cb);
207 if (!cr.isUnderflow())
208 cr.throwException();
209 } catch (CharacterCodingException x) {
210 // Substitution is always enabled,
211 // so this shouldn't happen
212 throw new Error(x);
213 }
214 return result.with(ca, 0, cb.position());
215 }
216 }
217
218 static Result decode(String charsetName, byte[] ba, int off, int len)
219 throws UnsupportedEncodingException
220 {
221 StringDecoder sd = deref(decoder);
222 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
223 if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
224 || csn.equals(sd.charsetName()))) {
225 sd = null;
226 try {
227 Charset cs = lookupCharset(csn);
228 if (cs != null) {
229 if (cs == UTF_8) {
230 return decodeUTF8(ba, off, len, true);
231 }
232 if (cs == ISO_8859_1) {
233 return decodeLatin1(ba, off, len);
234 }
235 if (cs == US_ASCII) {
236 return decodeASCII(ba, off, len);
237 }
238 sd = new StringDecoder(cs, csn);
239 }
240 } catch (IllegalCharsetNameException x) {}
241 if (sd == null)
242 throw new UnsupportedEncodingException(csn);
243 set(decoder, sd);
244 }
245 return sd.decode(ba, off, len);
246 }
247
248 static Result decode(Charset cs, byte[] ba, int off, int len) {
249 if (cs == UTF_8) {
250 return decodeUTF8(ba, off, len, true);
251 }
252 if (cs == ISO_8859_1) {
253 return decodeLatin1(ba, off, len);
254 }
255 if (cs == US_ASCII) {
256 return decodeASCII(ba, off, len);
257 }
258
259 // (1)We never cache the "external" cs, the only benefit of creating
260 // an additional StringDe/Encoder object to wrap it is to share the
261 // de/encode() method. These SD/E objects are short-lived, the young-gen
262 // gc should be able to take care of them well. But the best approach
263 // is still not to generate them if not really necessary.
264 // (2)The defensive copy of the input byte/char[] has a big performance
265 // impact, as well as the outgoing result byte/char[]. Need to do the
266 // optimization check of (sm==null && classLoader0==null) for both.
267 // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
268 // is only checked (and then isTrusted gets set) when (SM==null). It is
269 // possible that the SM==null for now but then SM is NOT null later
270 // when safeTrim() is invoked...the "safe" way to do is to redundant
271 // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
272 // but it then can be argued that the SM is null when the operation
273 // is started...
274 CharsetDecoder cd = cs.newDecoder();
275 // ascii fastpath
276 if ((cd instanceof ArrayDecoder) &&
277 ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
278 return decodeLatin1(ba, off, len);
279 }
280 int en = scale(len, cd.maxCharsPerByte());
281 if (len == 0) {
282 return new Result().with();
283 }
284 cd.onMalformedInput(CodingErrorAction.REPLACE)
285 .onUnmappableCharacter(CodingErrorAction.REPLACE)
286 .reset();
287 char[] ca = new char[en];
288 if (cd instanceof ArrayDecoder) {
289 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
290 return new Result().with(ca, 0, clen);
291 }
292 if (cs.getClass().getClassLoader0() != null &&
293 System.getSecurityManager() != null) {
294 ba = Arrays.copyOfRange(ba, off, off + len);
295 off = 0;
296 }
297 ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
298 CharBuffer cb = CharBuffer.wrap(ca);
299 try {
300 CoderResult cr = cd.decode(bb, cb, true);
301 if (!cr.isUnderflow())
302 cr.throwException();
303 cr = cd.flush(cb);
304 if (!cr.isUnderflow())
305 cr.throwException();
306 } catch (CharacterCodingException x) {
307 // Substitution is always enabled,
308 // so this shouldn't happen
309 throw new Error(x);
310 }
311 return new Result().with(ca, 0, cb.position());
312 }
313
314 static Result decode(byte[] ba, int off, int len) {
315 Charset cs = Charset.defaultCharset();
316 if (cs == UTF_8) {
317 return decodeUTF8(ba, off, len, true);
318 }
319 if (cs == ISO_8859_1) {
320 return decodeLatin1(ba, off, len);
321 }
322 if (cs == US_ASCII) {
323 return decodeASCII(ba, off, len);
324 }
325 StringDecoder sd = deref(decoder);
326 if (sd == null || !cs.name().equals(sd.cs.name())) {
327 sd = new StringDecoder(cs, cs.name());
328 set(decoder, sd);
329 }
330 return sd.decode(ba, off, len);
331 }
332
333 // -- Encoding --
334 private static class StringEncoder {
335 private Charset cs;
336 private CharsetEncoder ce;
337 private final boolean isASCIICompatible;
338 private final String requestedCharsetName;
339 private final boolean isTrusted;
340
341 private StringEncoder(Charset cs, String rcn) {
342 this.requestedCharsetName = rcn;
343 this.cs = cs;
344 this.ce = cs.newEncoder()
345 .onMalformedInput(CodingErrorAction.REPLACE)
346 .onUnmappableCharacter(CodingErrorAction.REPLACE);
347 this.isTrusted = (cs.getClass().getClassLoader0() == null);
348 this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
349 ((ArrayEncoder)ce).isASCIICompatible();
350 }
355 return cs.name();
356 }
357
358 final String requestedCharsetName() {
359 return requestedCharsetName;
360 }
361
362 byte[] encode(byte coder, byte[] val) {
363 // fastpath for ascii compatible
364 if (coder == LATIN1 && isASCIICompatible &&
365 !hasNegatives(val, 0, val.length)) {
366 return Arrays.copyOf(val, val.length);
367 }
368 int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
369 int en = scale(len, ce.maxBytesPerChar());
370 byte[] ba = new byte[en];
371 if (len == 0) {
372 return ba;
373 }
374 if (ce instanceof ArrayEncoder) {
375 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
376 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
377 if (blen != -1) {
378 return safeTrim(ba, blen, isTrusted);
379 }
380 }
381 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
382 : StringUTF16.toChars(val);
383 ce.reset();
384 ByteBuffer bb = ByteBuffer.wrap(ba);
385 CharBuffer cb = CharBuffer.wrap(ca, 0, len);
386 try {
387 CoderResult cr = ce.encode(cb, bb, true);
388 if (!cr.isUnderflow())
389 cr.throwException();
390 cr = ce.flush(bb);
391 if (!cr.isUnderflow())
392 cr.throwException();
393 } catch (CharacterCodingException x) {
394 // Substitution is always enabled,
395 // so this shouldn't happen
396 throw new Error(x);
397 }
398 return safeTrim(ba, bb.position(), isTrusted);
399 }
400 }
401
402 static byte[] encode(String charsetName, byte coder, byte[] val)
403 throws UnsupportedEncodingException
404 {
405 StringEncoder se = deref(encoder);
406 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
407 if ((se == null) || !(csn.equals(se.requestedCharsetName())
408 || csn.equals(se.charsetName()))) {
409 se = null;
410 try {
411 Charset cs = lookupCharset(csn);
412 if (cs != null) {
413 if (cs == UTF_8) {
414 return encodeUTF8(coder, val, true);
415 }
416 if (cs == ISO_8859_1) {
417 return encode8859_1(coder, val);
418 }
419 if (cs == US_ASCII) {
420 return encodeASCII(coder, val);
421 }
422 se = new StringEncoder(cs, csn);
423 }
424 } catch (IllegalCharsetNameException x) {}
425 if (se == null) {
426 throw new UnsupportedEncodingException (csn);
427 }
428 set(encoder, se);
429 }
430 return se.encode(coder, val);
431 }
432
433 static byte[] encode(Charset cs, byte coder, byte[] val) {
434 if (cs == UTF_8) {
435 return encodeUTF8(coder, val, true);
436 }
437 if (cs == ISO_8859_1) {
438 return encode8859_1(coder, val);
439 }
440 if (cs == US_ASCII) {
441 return encodeASCII(coder, val);
442 }
443 CharsetEncoder ce = cs.newEncoder();
444 // fastpath for ascii compatible
445 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
446 ((ArrayEncoder)ce).isASCIICompatible() &&
447 !hasNegatives(val, 0, val.length)))) {
448 return Arrays.copyOf(val, val.length);
449 }
450 int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
451 int en = scale(len, ce.maxBytesPerChar());
452 byte[] ba = new byte[en];
453 if (len == 0) {
454 return ba;
455 }
456 ce.onMalformedInput(CodingErrorAction.REPLACE)
457 .onUnmappableCharacter(CodingErrorAction.REPLACE)
458 .reset();
459 if (ce instanceof ArrayEncoder) {
460 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
461 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
462 if (blen != -1) {
463 return safeTrim(ba, blen, true);
464 }
465 }
466 boolean isTrusted = cs.getClass().getClassLoader0() == null ||
467 System.getSecurityManager() == null;
468 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
469 : StringUTF16.toChars(val);
470 ByteBuffer bb = ByteBuffer.wrap(ba);
471 CharBuffer cb = CharBuffer.wrap(ca, 0, len);
472 try {
473 CoderResult cr = ce.encode(cb, bb, true);
474 if (!cr.isUnderflow())
475 cr.throwException();
476 cr = ce.flush(bb);
477 if (!cr.isUnderflow())
478 cr.throwException();
479 } catch (CharacterCodingException x) {
480 throw new Error(x);
481 }
482 return safeTrim(ba, bb.position(), isTrusted);
483 }
484
485 static byte[] encode(byte coder, byte[] val) {
486 Charset cs = Charset.defaultCharset();
487 if (cs == UTF_8) {
488 return encodeUTF8(coder, val, true);
489 }
490 if (cs == ISO_8859_1) {
491 return encode8859_1(coder, val);
492 }
493 if (cs == US_ASCII) {
494 return encodeASCII(coder, val);
495 }
496 StringEncoder se = deref(encoder);
497 if (se == null || !cs.name().equals(se.cs.name())) {
498 se = new StringEncoder(cs, cs.name());
499 set(encoder, se);
500 }
501 return se.encode(coder, val);
502 }
503
504 /**
505 * Prints a message directly to stderr, bypassing all character conversion
506 * methods. This is a native method; its implementation is not in this file.
507 * @param msg the message to print
508 */
509 private static native void err(String msg);
510
511 /* The cached Result for each thread */
512 private static final ThreadLocal<StringCoding.Result>
513 resultCached = new ThreadLocal<>() {
514 protected StringCoding.Result initialValue() {
515 return new StringCoding.Result();
516 }};
517
518 ////////////////////////// ascii //////////////////////////////
519
520 private static Result decodeASCII(byte[] ba, int off, int len) {
521 Result result = resultCached.get();
522 if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
523 return result.with(Arrays.copyOfRange(ba, off, off + len),
524 LATIN1);
525 }
526 byte[] dst = new byte[len<<1];
527 int dp = 0;
528 while (dp < len) {
529 int b = ba[off++];
530 putChar(dst, dp++, (b >= 0) ? (char)b : repl);
531 }
532 return result.with(dst, UTF16);
533 }
534
535 private static byte[] encodeASCII(byte coder, byte[] val) {
536 if (coder == LATIN1) {
537 byte[] dst = new byte[val.length];
538 for (int i = 0; i < val.length; i++) {
539 if (val[i] < 0) {
540 dst[i] = '?';
541 } else {
542 dst[i] = val[i];
543 }
544 }
545 return dst;
546 }
547 int len = val.length >> 1;
548 byte[] dst = new byte[len];
549 int dp = 0;
550 for (int i = 0; i < len; i++) {
551 char c = StringUTF16.getChar(val, i);
552 if (c < 0x80) {
553 dst[dp++] = (byte)c;
554 continue;
555 }
556 if (Character.isHighSurrogate(c) && i + 1 < len &&
557 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
558 i++;
559 }
560 dst[dp++] = '?';
561 }
562 if (len == dp) {
563 return dst;
564 }
565 return Arrays.copyOf(dst, dp);
566 }
567
568 ////////////////////////// latin1/8859_1 ///////////////////////////
569
570 private static Result decodeLatin1(byte[] ba, int off, int len) {
571 Result result = resultCached.get();
572 if (COMPACT_STRINGS) {
573 return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
574 } else {
575 return result.with(StringLatin1.inflate(ba, off, len), UTF16);
576 }
577 }
578
579 @HotSpotIntrinsicCandidate
580 private static int implEncodeISOArray(byte[] sa, int sp,
581 byte[] da, int dp, int len) {
582 int i = 0;
583 for (; i < len; i++) {
584 char c = StringUTF16.getChar(sa, sp++);
585 if (c > '\u00FF')
586 break;
587 da[dp++] = (byte)c;
588 }
589 return i;
590 }
591
592 private static byte[] encode8859_1(byte coder, byte[] val) {
593 if (coder == LATIN1) {
594 return Arrays.copyOf(val, val.length);
595 }
596 int len = val.length >> 1;
597 byte[] dst = new byte[len];
598 int dp = 0;
599 int sp = 0;
600 int sl = len;
601 while (sp < sl) {
602 int ret = implEncodeISOArray(val, sp, dst, dp, len);
603 sp = sp + ret;
604 dp = dp + ret;
605 if (ret != len) {
606 char c = StringUTF16.getChar(val, sp++);
607 if (Character.isHighSurrogate(c) && sp < sl &&
608 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
609 sp++;
610 }
611 dst[dp++] = '?';
612 len = sl - sp;
613 }
614 }
615 if (dp == dst.length) {
616 return dst;
617 }
618 return Arrays.copyOf(dst, dp);
619 }
620
621 //////////////////////////////// utf8 ////////////////////////////////////
622
623 private static boolean isNotContinuation(int b) {
624 return (b & 0xc0) != 0x80;
625 }
626
627 private static boolean isMalformed3(int b1, int b2, int b3) {
628 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
629 (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
630 }
631
632 private static boolean isMalformed3_2(int b1, int b2) {
633 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
634 (b2 & 0xc0) != 0x80;
635 }
636
637 private static boolean isMalformed4(int b2, int b3, int b4) {
638 return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
639 (b4 & 0xc0) != 0x80;
640 }
641
642 private static boolean isMalformed4_2(int b1, int b2) {
643 return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
644 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
645 (b2 & 0xc0) != 0x80;
646 }
647
648 private static boolean isMalformed4_3(int b3) {
649 return (b3 & 0xc0) != 0x80;
650 }
651
652 // for nb == 3/4
653 private static int malformedN(byte[] src, int sp, int nb) {
654 if (nb == 3) {
655 int b1 = src[sp++];
656 int b2 = src[sp++]; // no need to lookup b3
657 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
658 isNotContinuation(b2)) ? 1 : 2;
659 } else if (nb == 4) { // we don't care the speed here
660 int b1 = src[sp++] & 0xff;
661 int b2 = src[sp++] & 0xff;
662 if (b1 > 0xf4 ||
663 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
664 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
665 isNotContinuation(b2))
666 return 1;
667 if (isNotContinuation(src[sp++]))
668 return 2;
669 return 3;
670 }
671 assert false;
672 return -1;
673 }
674
675 private static void throwMalformed(int off, int nb) {
676 throw new IllegalArgumentException("malformed input off : " + off +
677 ", length : " + nb);
678 }
679
680 private static char repl = '\ufffd';
681
682 private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
683 // ascii-bais, which has a relative impact to the non-ascii-only bytes
684 if (COMPACT_STRINGS && !hasNegatives(src, sp, len))
685 return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len),
686 LATIN1);
687 return decodeUTF8_0(src, sp, len, doReplace);
688 }
689
/**
 * Decodes {@code len} bytes of {@code src}, starting at index {@code sp},
 * as UTF-8 and stores the outcome in this thread's cached {@code Result}.
 * <p>
 * When {@code COMPACT_STRINGS} is set, a first pass tries to decode into
 * one byte per character: ASCII bytes, and two-byte sequences led by 0xC2
 * or 0xC3. If that pass consumes all input the result is tagged
 * {@code LATIN1}. Otherwise whatever was decoded so far is inflated and
 * decoding resumes in a second loop that writes chars with
 * {@code putChar}; that result is tagged {@code UTF16}.
 *
 * @param src       bytes to decode
 * @param sp        index of the first byte to decode
 * @param len       number of bytes to decode
 * @param doReplace if {@code true}, each malformed sequence is decoded as
 *                  {@code repl}; if {@code false}, {@code throwMalformed}
 *                  is called instead
 * @return the cached {@code Result} holding the decoded value and coder
 */
690 private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
691 Result ret = resultCached.get();
692
// sl: exclusive end index in src; dp: next write position in dst
693 int sl = sp + len;
694 int dp = 0;
// sized for the one-byte-per-char attempt: at most len chars are produced
695 byte[] dst = new byte[len];
696
697 if (COMPACT_STRINGS) {
698 while (sp < sl) {
699 int b1 = src[sp];
700 if (b1 >= 0) {
// ASCII byte: copied unchanged
701 dst[dp++] = (byte)b1;
702 sp++;
703 continue;
704 }
// a 0xC2/0xC3 lead followed by a continuation byte decodes to
// U+0080..U+00FF, which still fits in a single byte
705 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
706 sp + 1 < sl) {
707 int b2 = src[sp + 1];
708 if (!isNotContinuation(b2)) {
709 dst[dp++] = (byte)(((b1 << 6) ^ b2)^
710 (((byte) 0xC0 << 6) ^
711 ((byte) 0x80 << 0)));
712 sp += 2;
713 continue;
714 }
715 }
716 // anything not a latin1, including the repl
717 // we have to go with the utf16
718 break;
719 }
// all input consumed by the first pass: trim to dp bytes and return
720 if (sp == sl) {
721 if (dp != dst.length) {
722 dst = Arrays.copyOf(dst, dp);
723 }
724 return ret.with(dst, LATIN1);
725 }
726 }
// switch to two bytes per char, carrying over the dp chars (if any)
// that the first pass already produced
727 if (dp == 0) {
728 dst = new byte[len << 1];
729 } else {
730 byte[] buf = new byte[len << 1];
731 StringLatin1.inflate(dst, 0, buf, 0, dp);
732 dst = buf;
733 }
734 while (sp < sl) {
735 int b1 = src[sp++];
736 if (b1 >= 0) {
// one byte: ASCII
737 putChar(dst, dp++, (char) b1);
// two bytes: lead byte 110xxxxx; the (b1 & 0x1e) test rejects 0xC0/0xC1
738 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
739 if (sp < sl) {
740 int b2 = src[sp++];
741 if (isNotContinuation(b2)) {
742 if (!doReplace) {
743 throwMalformed(sp - 1, 1);
744 }
745 putChar(dst, dp++, repl);
// step back so b2 is examined again as the start of the next sequence
746 sp--;
747 } else {
748 putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
749 (((byte) 0xC0 << 6) ^
750 ((byte) 0x80 << 0))));
751 }
752 continue;
753 }
// the lead byte was the last byte of the input
754 if (!doReplace) {
755 throwMalformed(sp, 1); // underflow()
756 }
757 putChar(dst, dp++, repl);
758 break;
// three bytes: lead byte 1110xxxx
759 } else if ((b1 >> 4) == -2) {
760 if (sp + 1 < sl) {
761 int b2 = src[sp++];
762 int b3 = src[sp++];
763 if (isMalformed3(b1, b2, b3)) {
764 if (!doReplace) {
765 throwMalformed(sp - 3, 3);
766 }
767 putChar(dst, dp++, repl);
// rewind to the lead byte, then skip only as many bytes as
// malformedN reports
768 sp -= 3;
769 sp += malformedN(src, sp, 3);
770 } else {
771 char c = (char)((b1 << 12) ^
772 (b2 << 6) ^
773 (b3 ^
774 (((byte) 0xE0 << 12) ^
775 ((byte) 0x80 << 6) ^
776 ((byte) 0x80 << 0))));
// a three-byte sequence that decodes to a surrogate is treated as malformed
777 if (isSurrogate(c)) {
778 if (!doReplace) {
779 throwMalformed(sp - 3, 3);
780 }
781 putChar(dst, dp++, repl);
782 } else {
783 putChar(dst, dp++, c);
784 }
785 }
786 continue;
787 }
// fewer than two bytes follow the lead byte
788 if (sp < sl && isMalformed3_2(b1, src[sp])) {
789 if (!doReplace) {
790 throwMalformed(sp - 1, 2);
791 }
792 putChar(dst, dp++, repl);
793 continue;
794 }
795 if (!doReplace){
796 throwMalformed(sp, 1);
797 }
798 putChar(dst, dp++, repl);
799 break;
// four bytes: lead byte 11110xxx
800 } else if ((b1 >> 3) == -2) {
801 if (sp + 2 < sl) {
802 int b2 = src[sp++];
803 int b3 = src[sp++];
804 int b4 = src[sp++];
805 int uc = ((b1 << 18) ^
806 (b2 << 12) ^
807 (b3 << 6) ^
808 (b4 ^
809 (((byte) 0xF0 << 18) ^
810 ((byte) 0x80 << 12) ^
811 ((byte) 0x80 << 6) ^
812 ((byte) 0x80 << 0))));
813 if (isMalformed4(b2, b3, b4) ||
814 !isSupplementaryCodePoint(uc)) { // shortest form check
815 if (!doReplace) {
816 throwMalformed(sp - 4, 4);
817 }
818 putChar(dst, dp++, repl);
// rewind to the lead byte, then skip only as many bytes as
// malformedN reports
819 sp -= 4;
820 sp += malformedN(src, sp, 4);
821 } else {
// a supplementary code point is stored as a surrogate pair (two chars)
822 putChar(dst, dp++, highSurrogate(uc));
823 putChar(dst, dp++, lowSurrogate(uc));
824 }
825 continue;
826 }
// fewer than three bytes follow the lead byte; from here on b1 is
// compared as an unsigned value
827 b1 &= 0xff;
828 if (b1 > 0xf4 ||
829 sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
830 if (!doReplace) {
831 throwMalformed(sp - 1, 1); // or 2
832 }
833 putChar(dst, dp++, repl);
834 continue;
835 }
836 if (!doReplace) {
837 throwMalformed(sp - 1, 1);
838 }
839 sp++;
840 putChar(dst, dp++, repl);
841 if (sp < sl && isMalformed4_3(src[sp])) {
842 continue;
843 }
844 break;
845 } else {
// any other lead byte is treated as malformed
846 if (!doReplace) {
847 throwMalformed(sp - 1, 1);
848 }
849 putChar(dst, dp++, repl);
850 }
851 }
// dp chars were written; dst holds len << 1 bytes, so trim it to
// dp << 1 bytes unless every slot was used
852 if (dp != len) {
853 dst = Arrays.copyOf(dst, dp << 1);
854 }
855 return ret.with(dst, UTF16);
856 }
857
858 private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
859 if (coder == UTF16)
860 return encodeUTF8_UTF16(val, doReplace);
861
862 if (!hasNegatives(val, 0, val.length))
863 return Arrays.copyOf(val, val.length);
864
865 int dp = 0;
866 byte[] dst = new byte[val.length << 1];
867 for (int sp = 0; sp < val.length; sp++) {
868 byte c = val[sp];
869 if (c < 0) {
870 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
871 dst[dp++] = (byte)(0x80 | (c & 0x3f));
872 } else {
873 dst[dp++] = c;
874 }
875 }
876 if (dp == dst.length)
877 return dst;
878 return Arrays.copyOf(dst, dp);
879 }
880
881 private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
882 int dp = 0;
883 int sp = 0;
884 int sl = val.length >> 1;
885 byte[] dst = new byte[sl * 3];
886 char c;
887 while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
888 // ascii fast loop;
889 dst[dp++] = (byte)c;
890 sp++;
891 }
892 while (sp < sl) {
893 c = StringUTF16.getChar(val, sp++);
894 if (c < 0x80) {
895 dst[dp++] = (byte)c;
896 } else if (c < 0x800) {
897 dst[dp++] = (byte)(0xc0 | (c >> 6));
898 dst[dp++] = (byte)(0x80 | (c & 0x3f));
899 } else if (Character.isSurrogate(c)) {
900 int uc = -1;
901 char c2;
902 if (Character.isHighSurrogate(c) && sp < sl &&
903 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
904 uc = Character.toCodePoint(c, c2);
905 }
906 if (uc < 0) {
907 if (doReplace) {
908 dst[dp++] = '?';
909 } else {
910 throwMalformed(sp - 1, 1); // or 2, does not matter here
911 }
912 } else {
913 dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
914 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
915 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
916 dst[dp++] = (byte)(0x80 | (uc & 0x3f));
917 sp++; // 2 chars
918 }
919 } else {
920 // 3 bytes, 16 bits
921 dst[dp++] = (byte)(0xe0 | ((c >> 12)));
922 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));
923 dst[dp++] = (byte)(0x80 | (c & 0x3f));
924 }
925 }
926 if (dp == dst.length) {
927 return dst;
928 }
929 return Arrays.copyOf(dst, dp);
930 }
931
932 ////////////////////// for j.u.z.ZipCoder //////////////////////////
933
934 /*
935 * Throws iae, instead of replacing, if malformed or unmappble.
936 */
937 static String newStringUTF8NoRepl(byte[] src, int off, int len) {
938 if (COMPACT_STRINGS && !hasNegatives(src, off, len))
939 return new String(Arrays.copyOfRange(src, off, off + len), LATIN1);
940 Result ret = decodeUTF8_0(src, off, len, false);
941 return new String(ret.value, ret.coder);
942 }
943
944 /*
945 * Throws iae, instead of replacing, if unmappble.
946 */
947 static byte[] getBytesUTF8NoRepl(String s) {
948 return encodeUTF8(s.coder(), s.value(), false);
949 }
950 }
|