/*
 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.lang;

import java.nio.charset.Charset;
import java.util.Arrays;

import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
import static java.lang.String.COMPACT_STRINGS;
import static java.lang.Character.isSurrogate;
import static java.lang.Character.highSurrogate;
import static java.lang.Character.lowSurrogate;
import static java.lang.Character.isSupplementaryCodePoint;
import static java.lang.StringUTF16.putChar;

/**
 * UTF-8 decoder that produces the {@code byte[]} / coder pair handed to
 * {@code StringCoding.Result.with}. The output is tagged {@code LATIN1}
 * when {@code COMPACT_STRINGS} is enabled and every decoded character fits
 * the Latin-1 fast path, and {@code UTF16} otherwise. Malformed input never
 * raises an exception here; each malformed subsequence is replaced by
 * U+FFFD.
 */
class StringDecoderUTF8 extends StringCoding.StringDecoder {

    /** Replacement character written for every malformed subsequence. */
    private static final char REPL = '\ufffd';

    StringDecoderUTF8(Charset cs, String rcn) {
        super(cs, rcn);
    }

    /**
     * Returns {@code true} if {@code b} is not of the form
     * {@code 10xxxxxx}, i.e. is not a UTF-8 continuation byte. Works for
     * both sign-extended and zero-extended byte values because only the
     * low eight bits are inspected.
     */
    private static boolean isNotContinuation(int b) {
        return (b & 0xc0) != 0x80;
    }

    /**
     * Checks a complete three-byte sequence.
     *
     * @param b1 lead byte, sign-extended (compared against {@code (byte)0xe0})
     * @param b2 second byte
     * @param b3 third byte
     * @return {@code true} if the first two bytes are rejected by
     *         {@link #isMalformed3_2} or {@code b3} is not a continuation
     *         byte
     */
    private static boolean isMalformed3(int b1, int b2, int b3) {
        return isMalformed3_2(b1, b2) || isNotContinuation(b3);
    }

    /**
     * Checks the first two bytes of a three-byte sequence.
     *
     * @param b1 lead byte, sign-extended (compared against {@code (byte)0xe0})
     * @param b2 second byte
     * @return {@code true} if {@code b1} is 0xE0 and {@code b2} lies in
     *         0x80..0x9F (a non-shortest-form encoding), or if {@code b2}
     *         is not a continuation byte
     */
    private static boolean isMalformed3_2(int b1, int b2) {
        return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
               isNotContinuation(b2);
    }

    /**
     * Returns {@code true} if any of the three trailing bytes of a
     * four-byte sequence is not a continuation byte. Range checks on the
     * resulting code point are done by the caller.
     */
    private static boolean isMalformed4(int b2, int b3, int b4) {
        return isNotContinuation(b2) || isNotContinuation(b3) ||
               isNotContinuation(b4);
    }

    /**
     * Checks the first two bytes of a four-byte sequence.
     *
     * @param b1 lead byte as an unsigned value (0..255)
     * @param b2 second byte as an unsigned value (0..255)
     * @return {@code true} if {@code b1} is 0xF0 and {@code b2} is outside
     *         0x90..0xBF, or {@code b1} is 0xF4 and {@code b2} is outside
     *         0x80..0x8F, or {@code b2} is not a continuation byte
     */
    private static boolean isMalformed4_2(int b1, int b2) {
        return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
               (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
               isNotContinuation(b2);
    }

    /**
     * Returns {@code true} if the third byte of a four-byte sequence is
     * not a continuation byte.
     */
    private static boolean isMalformed4_3(int b3) {
        return isNotContinuation(b3);
    }

    /**
     * Computes how many bytes of a sequence already known to be malformed
     * should be consumed for the single replacement character emitted by
     * the caller. Only {@code nb == 3} and {@code nb == 4} are supported.
     *
     * @param src input bytes
     * @param sp  index of the lead byte of the malformed sequence
     * @param nb  nominal length of the sequence (3 or 4)
     * @return 1 or 2 for {@code nb == 3}; 1, 2 or 3 for {@code nb == 4};
     *         -1 (after a failed {@code assert}) for any other {@code nb}
     */
    private static int malformedN(byte[] src, int sp, int nb) {
        if (nb == 3) {
            int b1 = src[sp++];
            int b2 = src[sp++];    // no need to look at b3
            // Signed values: isMalformed3_2 compares b1 with (byte)0xe0.
            return isMalformed3_2(b1, b2) ? 1 : 2;
        } else if (nb == 4) {      // speed does not matter on this path
            // Unsigned values: isMalformed4_2 compares against 0xf0/0xf4.
            int b1 = src[sp++] & 0xff;
            int b2 = src[sp++] & 0xff;
            if (b1 > 0xf4 || isMalformed4_2(b1, b2))
                return 1;
            if (isNotContinuation(src[sp++]))
                return 2;
            return 3;
        }
        assert false;
        return -1;
    }

    /**
     * Decodes {@code len} bytes of {@code src} starting at {@code sp},
     * delegating to {@link #decode(byte[], int, int, StringCoding.Result)}
     * with this decoder's {@code result} object.
     */
    // NOTE(review): `result` is not declared in this file; presumably it is
    // a field inherited from StringCoding.StringDecoder - confirm.
    StringCoding.Result decode(byte[] src, int sp, int len) {
        return decode(src, sp, len, result);
    }

    /**
     * Decodes UTF-8 bytes into the supplied result holder.
     *
     * <p>When {@code COMPACT_STRINGS} is set, a first loop copies ASCII
     * bytes and two-byte sequences led by 0xC2/0xC3 into a one-byte-per-char
     * buffer. If that loop consumes the whole input the buffer is returned
     * with coder {@code LATIN1}. Otherwise whatever was decoded so far is
     * inflated into a two-bytes-per-char buffer and decoding continues in
     * the general loop, which handles one- to four-byte sequences and writes
     * U+FFFD for malformed input. The final buffer is returned with coder
     * {@code UTF16}.
     *
     * @param src input bytes
     * @param sp  index of the first byte to decode
     * @param len number of bytes to decode
     * @param ret result holder; its {@code with(byte[], coder)} return value
     *            is what this method returns
     * @return the value of {@code ret.with(...)} for the decoded bytes
     */
    static StringCoding.Result decode(byte[] src, int sp, int len,
                                      StringCoding.Result ret) {
        int sl = sp + len;
        byte[] dst = new byte[len];
        int dp = 0;
        if (COMPACT_STRINGS) {   // Latin1 only loop
            while (sp < sl) {
                int b1 = src[sp];
                if (b1 >= 0) {
                    dst[dp++] = (byte)b1;
                    sp++;
                    continue;
                }
                if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
                    sp + 1 < sl) {
                    int b2 = src[sp + 1];
                    if (!isNotContinuation(b2)) {
                        dst[dp++] = (byte)(((b1 << 6) ^ b2)^
                                           (((byte) 0xC0 << 6) ^
                                            ((byte) 0x80 << 0)));
                        sp += 2;
                        continue;
                    }
                }
                // Anything that is not Latin1, including the replacement
                // character, has to go through the UTF16 path below.
                break;
            }
            if (sp == sl) {
                if (dp != dst.length) {
                    dst = Arrays.copyOf(dst, dp);
                }
                return ret.with(dst, LATIN1);
            }
        }
        // NOTE(review): len << 1 overflows int for len > Integer.MAX_VALUE / 2;
        // nothing in this method guards against that.
        if (dp == 0) {
            dst = new byte[len << 1];
        } else {
            // Carry over the chars already decoded by the Latin1 loop.
            byte[] buf = new byte[len << 1];
            StringLatin1.inflate(dst, 0, buf, 0, dp);
            dst = buf;
        }
        while (sp < sl) {
            int b1 = src[sp++];
            if (b1 >= 0) {
                // 1 byte, 7 bits: 0xxxxxxx
                putChar(dst, dp++, (char) b1);
            } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                // 2 bytes: 110xxxxx 10xxxxxx; the (b1 & 0x1e) test rejects
                // lead bytes 0xC0 and 0xC1.
                if (sp < sl) {
                    int b2 = src[sp++];
                    if (isNotContinuation(b2)) {
                        putChar(dst, dp++, REPL);
                        sp--;            // re-examine b2 as a lead byte
                    } else {
                        putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
                                                  (((byte) 0xC0 << 6) ^
                                                   ((byte) 0x80 << 0))));
                    }
                    continue;
                }
                // Input ends after the lead byte.
                putChar(dst, dp++, REPL);
                break;
            } else if ((b1 >> 4) == -2) {
                // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
                if (sp + 1 < sl) {
                    int b2 = src[sp++];
                    int b3 = src[sp++];
                    if (isMalformed3(b1, b2, b3)) {
                        putChar(dst, dp++, REPL);
                        // Rewind to the lead byte, then skip only the
                        // bytes malformedN attributes to this error.
                        sp -= 3;
                        sp += malformedN(src, sp, 3);
                    } else {
                        char c = (char)((b1 << 12) ^
                                        (b2 <<  6) ^
                                        (b3 ^
                                         (((byte) 0xE0 << 12) ^
                                          ((byte) 0x80 <<  6) ^
                                          ((byte) 0x80 <<  0))));
                        // Surrogate code units must not be encoded in UTF-8.
                        putChar(dst, dp++, isSurrogate(c) ? REPL : c);
                    }
                    continue;
                }
                // Fewer than two bytes follow the lead byte.
                if (sp < sl && isMalformed3_2(b1, src[sp])) {
                    // Bad second byte: replace the lead byte only and
                    // re-examine the second byte on the next iteration.
                    putChar(dst, dp++, REPL);
                    continue;
                }
                putChar(dst, dp++, REPL);
                break;
            } else if ((b1 >> 3) == -2) {
                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (sp + 2 < sl) {
                    int b2 = src[sp++];
                    int b3 = src[sp++];
                    int b4 = src[sp++];
                    int uc = ((b1 << 18) ^
                              (b2 << 12) ^
                              (b3 <<  6) ^
                              (b4 ^
                               (((byte) 0xF0 << 18) ^
                                ((byte) 0x80 << 12) ^
                                ((byte) 0x80 <<  6) ^
                                ((byte) 0x80 <<  0))));
                    if (isMalformed4(b2, b3, b4) ||
                        !isSupplementaryCodePoint(uc)) { // shortest form check
                        putChar(dst, dp++, REPL);
                        // Rewind to the lead byte, then skip only the
                        // bytes malformedN attributes to this error.
                        sp -= 4;
                        sp += malformedN(src, sp, 4);
                    } else {
                        putChar(dst, dp++, highSurrogate(uc));
                        putChar(dst, dp++, lowSurrogate(uc));
                    }
                    continue;
                }
                // Fewer than three bytes follow the lead byte.
                b1 &= 0xff;
                if (b1 > 0xf4 ||
                    sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
                    // Bad lead byte or bad second byte: replace the lead
                    // byte only.
                    putChar(dst, dp++, REPL);
                    continue;
                }
                sp++;                    // step over the second byte
                putChar(dst, dp++, REPL);
                if (sp < sl && isMalformed4_3(src[sp])) {
                    // Third byte is not a continuation byte: decode it on
                    // its own in the next iteration.
                    continue;
                }
                break;
            } else {
                // Not a valid lead byte.
                putChar(dst, dp++, REPL);
            }
        }
        if (dp != len) {
            // dst holds two bytes per char; trim to the chars written.
            dst = Arrays.copyOf(dst, dp << 1);
        }
        return ret.with(dst, UTF16);
    }
}