/*
 * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * @test
 * @bug 4486841 7040220 7096080
 * @summary Test UTF-8 charset
 */

import java.nio.charset.*;
import java.nio.*;
import java.util.*;

/**
 * Exercises the UTF-8 and CESU-8 charset coders: round-tripping, supplementary
 * characters, rejection of malformed input and under/overflow reporting.
 *
 * <p>Every check is run against both heap and direct buffers. A check that
 * detects a mismatch prints a diagnostic to {@code System.out} and then throws
 * a {@link RuntimeException}, so a mismatch makes the test fail.
 */
public class TestUTF8 {

    // ------------------------------------------------------------------
    // Buffer helpers shared by the decode/encode entry points
    // ------------------------------------------------------------------

    /** Returns a buffer positioned over {@code bb}: a direct copy, or a heap wrapper. */
    private static ByteBuffer wrapInput(byte[] bb, boolean direct) {
        if (!direct) {
            return ByteBuffer.wrap(bb);
        }
        ByteBuffer buf = ByteBuffer.allocateDirect(bb.length);
        buf.put(bb).flip();
        return buf;
    }

    /** Returns a buffer positioned over {@code cc}: a direct copy, or a heap wrapper. */
    private static CharBuffer wrapInput(char[] cc, boolean direct) {
        if (!direct) {
            return CharBuffer.wrap(cc);
        }
        CharBuffer buf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
        buf.put(cc).flip();
        return buf;
    }

    /** Allocates an empty char buffer holding {@code capacity} chars. */
    private static CharBuffer newCharOutput(int capacity, boolean direct) {
        return direct
            ? ByteBuffer.allocateDirect(capacity * 2).asCharBuffer()
            : CharBuffer.allocate(capacity);
    }

    /** Allocates an empty byte buffer holding {@code capacity} bytes. */
    private static ByteBuffer newByteOutput(int capacity, boolean direct) {
        return direct
            ? ByteBuffer.allocateDirect(capacity)
            : ByteBuffer.allocate(capacity);
    }

    /** Formats {@code bytes} as space-separated, two-digit lower-case hex. */
    private static String toHex(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 3);
        for (int i = 0; i < bytes.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(String.format("%02x", bytes[i] & 0xff));
        }
        return sb.toString();
    }

    // ------------------------------------------------------------------
    // Decoding / encoding entry points
    // ------------------------------------------------------------------

    /**
     * Decodes {@code bb} with a new decoder for charset {@code csn}.
     *
     * @param bb         the bytes to decode
     * @param csn        the charset name
     * @param testDirect {@code true} to use direct buffers, {@code false} for heap buffers
     * @return the decoded chars
     * @throws RuntimeException if the decoder reports anything but underflow
     */
    static char[] decode(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer in = wrapInput(bb, testDirect);
        CharBuffer out = newCharOutput(bb.length, testDirect);
        CoderResult cr = dec.decode(in, out, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Decoding err: " + csn);
        }
        out.flip();
        char[] cc = new char[out.remaining()];
        out.get(cc);
        return cc;
    }

    /**
     * Decodes {@code bb} like {@link #decode(byte[], String, boolean)} but
     * returns the raw {@link CoderResult} instead of the decoded chars.
     */
    static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer in = wrapInput(bb, testDirect);
        CharBuffer out = newCharOutput(bb.length, testDirect);
        return dec.decode(in, out, true);
    }

    /**
     * Decodes {@code len} bytes of {@code ba} starting at {@code off}, replacing
     * malformed and unmappable input instead of reporting it.
     *
     * <p>This is a copy/paste of {@code StringCoding.decode()} and is used as
     * the reference for what {@code new String(byte[], String)} should produce.
     *
     * @throws Error wrapping a {@link CharacterCodingException} if the decoder
     *         reports a result other than underflow
     */
    static char[] decode(Charset cs, byte[] ba, int off, int len) {
        CharsetDecoder cd = cs.newDecoder();
        int en = (int)(len * cd.maxCharsPerByte());
        char[] ca = new char[en];
        if (len == 0) {
            return ca;
        }
        cd.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();

        ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
        CharBuffer cb = CharBuffer.wrap(ca);
        try {
            CoderResult cr = cd.decode(bb, cb, true);
            if (!cr.isUnderflow()) {
                cr.throwException();
            }
            cr = cd.flush(cb);
            if (!cr.isUnderflow()) {
                cr.throwException();
            }
        } catch (CharacterCodingException x) {
            throw new Error(x);
        }
        return Arrays.copyOf(ca, cb.position());
    }

    /**
     * Encodes {@code cc} with a new encoder for charset {@code csn}.
     *
     * @param cc         the chars to encode
     * @param csn        the charset name
     * @param testDirect {@code true} to use direct buffers, {@code false} for heap buffers
     * @return the encoded bytes
     * @throws RuntimeException if the encoder reports anything but underflow
     */
    static byte[] encode(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        // 4 output bytes per input char, as in the buffers this test always used
        ByteBuffer out = newByteOutput(cc.length * 4, testDirect);
        CharBuffer in = wrapInput(cc, testDirect);

        CoderResult cr = enc.encode(in, out, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Encoding err: " + csn);
        }
        out.flip();
        byte[] bb = new byte[out.remaining()];
        out.get(bb);
        return bb;
    }

    /**
     * Encodes {@code cc} like {@link #encode(char[], String, boolean)} but
     * returns the raw {@link CoderResult} instead of the encoded bytes.
     */
    static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer out = newByteOutput(cc.length * 4, testDirect);
        CharBuffer in = wrapInput(cc, testDirect);
        return enc.encode(in, out, true);
    }

    // ------------------------------------------------------------------
    // Test data builders
    // ------------------------------------------------------------------

    /**
     * Returns every code point from U+0000 to U+10FFFF except the surrogate
     * range U+D800..U+DFFF, in ascending order, as UTF-16 chars.
     */
    static char[] getUTFChars() {
        char[] cc = new char[0x10000 - 0xe000 + 0xd800 +  // bmp
                             (0x110000 - 0x10000) * 2];   // supp
        int pos = 0;
        for (int i = 0; i < 0xd800; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0xe000; i < 0x10000; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0x10000; i < 0x110000; i++) {
            pos += Character.toChars(i, cc, pos);
        }
        return cc;
    }

    /**
     * Writes {@code c} into {@code bb} at {@code pos} using the 3-byte UTF-8
     * bit layout, without any range check on {@code c}.
     *
     * @return the number of bytes written (always 3)
     */
    static int to3ByteUTF8(char c, byte[] bb, int pos) {
        bb[pos++] = (byte)(0xe0 | (c >> 12));
        bb[pos++] = (byte)(0x80 | ((c >> 6) & 0x3f));
        bb[pos++] = (byte)(0x80 | (c & 0x3f));
        return 3;
    }

    /**
     * Writes {@code uc} into {@code bb} at {@code pos} using the 4-byte UTF-8
     * bit layout, without any range check on {@code uc}.
     *
     * @return the number of bytes written (always 4)
     */
    static int to4ByteUTF8(int uc, byte[] bb, int pos) {
        bb[pos++] = (byte)(0xf0 | (uc >> 18));
        bb[pos++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
        bb[pos++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
        bb[pos++] = (byte)(0x80 | (uc & 0x3f));
        return 4;
    }

    // ------------------------------------------------------------------
    // Checks
    // ------------------------------------------------------------------

    /**
     * Encodes and decodes all chars from {@link #getUTFChars()} through the
     * coder (heap and direct) and through {@code String}, and verifies that
     * the results agree.
     *
     * @throws RuntimeException if any comparison fails
     */
    static void checkRoundtrip(String csn) throws Exception {
        System.out.printf(" Check roundtrip <%s>...", csn);
        boolean failed = false;
        char[] cc = getUTFChars();
        byte[] bb = encode(cc, csn, false);
        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" non-direct failed");
            failed = true;
        }

        bb = encode(cc, csn, true);
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.print(" (direct) failed");
            failed = true;
        }

        // String.getBytes()/toCharArray() goes to ArrayDe/Encoder path
        if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
            System.out.printf(" String.getBytes() failed");
            failed = true;
        }
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
            System.out.printf(" String.toCharArray() failed");
            failed = true;
        }
        System.out.println();
        if (failed) {
            throw new RuntimeException("Check roundtrip failed " + csn);
        }
    }

    /**
     * Checks all supplementary code points encoded as one 4-byte sequence
     * each against their surrogate-pair form.
     */
    static void check4ByteSurrs(String csn) throws Exception {
        System.out.printf(" Check 4-byte Surrogates <%s>...%n", csn);
        byte[] bb = new byte[(0x110000 - 0x10000) * 4];
        char[] cc = new char[(0x110000 - 0x10000) * 2];
        int bpos = 0;
        int cpos = 0;
        for (int i = 0x10000; i < 0x110000; i++) {
            Character.toChars(i, cc, cpos);
            bpos += to4ByteUTF8(i, bb, bpos);
            cpos += 2;
        }
        checkSurrs(csn, bb, cc);
    }

    /**
     * Verifies that {@code bb} decodes to {@code cc} (heap, direct and via
     * {@code String}) and that {@code cc} encodes to {@code bb} via
     * {@code String.getBytes}.
     *
     * @throws RuntimeException if any comparison fails
     */
    static void checkSurrs(String csn, byte[] bb, char[] cc)
        throws Exception {
        boolean failed = false;
        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding(direct) failed%n");
            failed = true;
        }
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
            System.out.printf(" String.toCharArray() failed");
            failed = true;
        }
        if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
            System.out.printf(" String.getBytes() failed");
            failed = true;
        }
        if (failed) {
            throw new RuntimeException("Check surrogates failed " + csn);
        }
    }

    /**
     * Checks all supplementary code points encoded as two 3-byte sequences
     * (one per surrogate) against their surrogate-pair form.
     */
    static void check6ByteSurrs(String csn) throws Exception {
        System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn);
        byte[] bb = new byte[(0x110000 - 0x10000) * 6];
        char[] cc = new char[(0x110000 - 0x10000) * 2];
        int bpos = 0;
        int cpos = 0;
        for (int i = 0x10000; i < 0x110000; i++) {
            Character.toChars(i, cc, cpos);
            bpos += to3ByteUTF8(cc[cpos], bb, bpos);
            bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
            cpos += 2;
        }
        checkSurrs(csn, bb, cc);
    }

    /**
     * Verifies that charsets {@code csn1} and {@code csn2} encode the chars
     * from {@link #getUTFChars()} identically, and decode the bytes produced
     * by {@code csn1} identically, for both heap and direct buffers.
     *
     * @throws RuntimeException if any comparison fails
     */
    static void compare(String csn1, String csn2) throws Exception {
        System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2);
        boolean failed = false;
        char[] cc = getUTFChars();

        byte[] bb1 = encode(cc, csn1, false);
        byte[] bb2 = encode(cc, csn2, false);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding failed%n");
            failed = true;
        }
        char[] cc1 = decode(bb1, csn1, false);
        char[] cc2 = decode(bb1, csn2, false);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }

        bb1 = encode(cc, csn1, true);
        bb2 = encode(cc, csn2, true);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding (direct) failed%n");
            failed = true;
        }
        cc1 = decode(bb1, csn1, true);
        cc2 = decode(bb1, csn2, true);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding (direct) failed%n");
            failed = true;
        }
        if (failed) {
            throw new RuntimeException("Diff failed " + csn1 + " " + csn2);
        }
    }

    /**
     * Malformed UTF-8 inputs. The first byte of each row is the expected
     * length of the malformed-input result; the remaining bytes are the input.
     */
    static final byte[][] malformed = {
        // One-byte sequences:
        {1, (byte)0xFF },
        {1, (byte)0xC0 },
        {1, (byte)0x80 },

        {1, (byte)0xFF, (byte)0xFF}, // all ones
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble

        // Two-byte sequences:
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte

        // Three-byte sequences
        {1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded

        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
        {1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes
        {3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate
        {3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate

        // Four-byte sequences
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {1, (byte)0xF0, (byte)0x41 }, // invalid second byte
                                      // & only 2 bytes

        {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {2, (byte)0xF0, (byte)0x90, (byte)0x41 }, // invalid third byte
                                                  // & 3 bytes input

        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte

        // Five-byte sequences
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },

        // Six-byte sequences
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
    };

    /**
     * Malformed CESU-8 inputs. The first byte of each row is the expected
     * length of the malformed-input result; the remaining bytes are the input.
     */
    static final byte[][] malformed_cesu8 = {
        // One-byte sequences:
        {1, (byte)0xFF },
        {1, (byte)0xC0 },
        {1, (byte)0x80 },

        {1, (byte)0xFF, (byte)0xFF}, // all ones
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble

        // Two-byte sequences:
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte

        // Three-byte sequences
        {1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded

        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
        {1, (byte)0xE0, (byte)0x41,}, // invalid second byte & 2 bytes

        // CESU-8 does not have 4, 5, 6 bytes sequences
        // Four-byte sequences
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {1, (byte)0xF0, (byte)0x41 }, // invalid second byte
                                      // & only 2 bytes
        {1, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {1, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF0, (byte)0x90, (byte)0x41 }, // invalid third byte
                                                  // & 3 bytes input

        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {1, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {1, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte

        // Five-byte sequences
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },

        // Six-byte sequences
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
        {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
    };

    /**
     * Decodes every row of {@code malformed} with charset {@code csn}, using
     * heap and direct buffers, and verifies that the decoder reports malformed
     * input of the expected length. Also verifies that
     * {@code new String(bytes, csn)} matches the replacing reference decoder
     * {@link #decode(Charset, byte[], int, int)}.
     *
     * @param csn       the charset name
     * @param malformed rows of {@code {expectedLength, input bytes...}}
     * @throws RuntimeException if any row fails a check
     */
    static void checkMalformed(String csn, byte[][] malformed) throws Exception {
        boolean failed = false;
        System.out.printf(" Check malformed <%s>...%n", csn);
        Charset cs = Charset.forName(csn);
        for (boolean direct : new boolean[] {false, true}) {
            for (byte[] bins : malformed) {
                int mlen = bins[0];
                byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
                CoderResult cr = decodeCR(bin, csn, direct);
                // hex dump of the input, used in the failure messages
                String ashex = toHex(bin);
                if (!cr.isMalformed()) {
                    System.out.printf(" FAIL(direct=%b): [%s] not malformed.%n", direct, ashex);
                    failed = true;
                } else if (cr.length() != mlen) {
                    System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].%n", direct, ashex, cr.length());
                    failed = true;
                }
                if (!Arrays.equals(decode(cs, bin, 0, bin.length),
                                   new String(bin, csn).toCharArray())) {
                    System.out.printf(" FAIL(new String(bb, %s)) failed%n", csn);
                    failed = true;
                }
            }
        }
        if (failed) {
            throw new RuntimeException("Check malformed failed " + csn);
        }
    }

    /**
     * Runs one non-final {@code decode} call described by {@code flow} and
     * compares the outcome with the expectation stored in the same array.
     *
     * <p>Layout of {@code flow}: {@code [0]} input position, {@code [1]} input
     * length, {@code [2]} output position, {@code [3]} output length,
     * {@code [4]} expected input position after decoding, {@code [5]} expected
     * output position after decoding, {@code [6]} expected result
     * (0 = underflow, otherwise overflow).
     *
     * @return {@code true} if result and both positions match the expectation
     */
    static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
        int inPos = flow[0];
        int inLen = flow[1];
        int outPos = flow[2];
        int outLen = flow[3];
        int expedInPos = flow[4];
        int expedOutPos = flow[5];
        CoderResult expedCR = (flow[6] == 0) ? CoderResult.UNDERFLOW
                                             : CoderResult.OVERFLOW;
        ByteBuffer bbf;
        CharBuffer cbf;
        if (direct) {
            bbf = ByteBuffer.allocateDirect(inPos + utf8s.length);
            cbf = ByteBuffer.allocateDirect((outPos + outLen) * 2).asCharBuffer();
        } else {
            bbf = ByteBuffer.allocate(inPos + utf8s.length);
            cbf = CharBuffer.allocate(outPos + outLen);
        }
        // place the input at inPos and expose only its first inLen bytes
        bbf.position(inPos);
        bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
        cbf.position(outPos);
        dec.reset();
        CoderResult cr = dec.decode(bbf, cbf, false);
        if (cr != expedCR ||
            bbf.position() != expedInPos ||
            cbf.position() != expedOutPos) {
            System.out.printf("Expected(direct=%5b): [", direct);
            for (int i : flow) {
                System.out.print(" " + i);
            }
            System.out.println("] CR=" + cr +
                               ", inPos=" + bbf.position() +
                               ", outPos=" + cbf.position());
            return false;
        }
        return true;
    }

    /**
     * Decodes a fixed 10-byte UTF-8 input (one 1-, 2-, 3- and 4-byte sequence)
     * with varying input lengths, output capacities and buffer offsets, and
     * verifies the reported underflow/overflow and the buffer positions.
     *
     * @throws RuntimeException if any combination fails
     */
    static void checkUnderOverflow(String csn) throws Exception {
        System.out.printf(" Check under/overflow <%s>...%n", csn);
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        boolean failed = false;
        byte[] utf8s = "\u007f\u07ff\ue000\ud800\udc00".getBytes("UTF-8");
        int inlen = utf8s.length;

        for (int inoff = 0; inoff < 20; inoff++) {
            for (int outoff = 0; outoff < 20; outoff++) {
                int[][] flows = {
                    //inpos, inLen, outPos, outLen, inPosEP, outposEP, under(0)/over(1)
                    {inoff, inlen, outoff, 1, inoff + 1, outoff + 1, 1},
                    {inoff, inlen, outoff, 2, inoff + 3, outoff + 2, 1},
                    {inoff, inlen, outoff, 3, inoff + 6, outoff + 3, 1},
                    {inoff, inlen, outoff, 4, inoff + 6, outoff + 3, 1},
                    {inoff, inlen, outoff, 5, inoff + 10, outoff + 5, 0},
                    // underflow
                    {inoff, 1, outoff, 5, inoff + 1, outoff + 1, 0},
                    {inoff, 2, outoff, 5, inoff + 1, outoff + 1, 0},
                    {inoff, 3, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 4, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 5, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 6, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 7, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 8, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 9, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 10, outoff, 5, inoff + 10, outoff + 5, 0},
                    // 2-byte underflow/overflow
                    {inoff, 2, outoff, 1, inoff + 1, outoff + 1, 0},
                    {inoff, 3, outoff, 1, inoff + 1, outoff + 1, 1},
                    // 3-byte underflow/overflow
                    {inoff, 4, outoff, 2, inoff + 3, outoff + 2, 0},
                    {inoff, 5, outoff, 2, inoff + 3, outoff + 2, 0},
                    {inoff, 6, outoff, 2, inoff + 3, outoff + 2, 1},
                    // 4-byte underflow/overflow
                    {inoff, 7, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 8, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 9, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 10, outoff, 4, inoff + 6, outoff + 3, 1},
                };
                for (boolean direct : new boolean[] {false, true}) {
                    for (int[] flow : flows) {
                        if (!check(dec, utf8s, direct, flow)) {
                            failed = true;
                        }
                    }
                }
            }
        }
        if (failed) {
            throw new RuntimeException("Check under/overflow failed " + csn);
        }
    }

    public static void main(String[] args) throws Exception {
        checkRoundtrip("UTF-8");
        check4ByteSurrs("UTF-8");
        checkMalformed("UTF-8", malformed);
        checkUnderOverflow("UTF-8");

        checkRoundtrip("CESU-8");
        check6ByteSurrs("CESU-8");
        checkMalformed("CESU-8", malformed_cesu8);
    }
}