/*
 * Copyright (c) 2008, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * @test
 * @bug 4486841 7040220
 * @summary Test UTF-8 charset
 */

import java.nio.charset.*;
import java.nio.*;
import java.util.*;

/**
 * Exercises a charset's encoder and decoder (main() runs it against "UTF-8"):
 * round-tripping, surrogates written as two 3-byte sequences, a table of
 * malformed inputs, and under/overflow reporting. Every check is run against
 * both heap and direct buffers.
 */
public class TestUTF8 {

    /** Returns a readable buffer over {@code bytes}: a heap wrapper, or a direct copy. */
    private static ByteBuffer readableBytes(byte[] bytes, boolean direct) {
        if (!direct) {
            return ByteBuffer.wrap(bytes);
        }
        ByteBuffer buf = ByteBuffer.allocateDirect(bytes.length);
        buf.put(bytes);
        buf.flip();
        return buf;
    }

    /** Returns a readable buffer over {@code chars}: a heap wrapper, or a direct copy. */
    private static CharBuffer readableChars(char[] chars, boolean direct) {
        if (!direct) {
            return CharBuffer.wrap(chars);
        }
        CharBuffer buf = newCharBuffer(chars.length, true);
        buf.put(chars);
        buf.flip();
        return buf;
    }

    /** Allocates an empty byte buffer of {@code capacity} bytes, heap or direct. */
    private static ByteBuffer newByteBuffer(int capacity, boolean direct) {
        return direct ? ByteBuffer.allocateDirect(capacity)
                      : ByteBuffer.allocate(capacity);
    }

    /** Allocates an empty char buffer of {@code capacity} chars, heap or direct-backed. */
    private static CharBuffer newCharBuffer(int capacity, boolean direct) {
        return direct ? ByteBuffer.allocateDirect(capacity * 2).asCharBuffer()
                      : CharBuffer.allocate(capacity);
    }

    /**
     * Formats bytes as space-separated, two-digit, lower-case hex
     * (for example {@code "c0 80"}); used in failure diagnostics.
     *
     * @param bytes the bytes to format
     * @return the hex dump, empty for an empty array
     */
    static String toHexString(byte[] bytes) {
        StringBuilder sb = new StringBuilder(bytes.length * 3);
        for (int i = 0; i < bytes.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(String.format("%02x", bytes[i] & 0xff));
        }
        return sb.toString();
    }

    /**
     * Decodes {@code bb} with a fresh decoder of charset {@code csn} in a
     * single call with end-of-input set.
     *
     * @param bb         bytes to decode
     * @param csn        charset name
     * @param testDirect whether to use direct buffers
     * @return the decoded chars
     * @throws RuntimeException if the decoder reports anything but UNDERFLOW
     */
    static char[] decode(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = readableBytes(bb, testDirect);
        CharBuffer cbf = newCharBuffer(bb.length, testDirect);
        CoderResult cr = dec.decode(bbf, cbf, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Decoding err: " + csn);
        }
        cbf.flip();
        char[] cc = new char[cbf.remaining()];
        cbf.get(cc);
        return cc;
    }

    /**
     * Same set-up as {@link #decode(byte[], String, boolean)} but returns the
     * raw {@link CoderResult} instead of the decoded chars.
     */
    static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = readableBytes(bb, testDirect);
        CharBuffer cbf = newCharBuffer(bb.length, testDirect);
        return dec.decode(bbf, cbf, true);
    }

    /**
     * Decodes {@code len} bytes of {@code ba} starting at {@code off},
     * replacing malformed and unmappable input.
     * Copy/paste of the StringCoding.decode(), kept as a reference
     * implementation to compare {@code new String(byte[], csn)} against.
     *
     * @return the decoded chars, trimmed to the number actually produced
     * @throws Error wrapping any CharacterCodingException
     */
    static char[] decode(Charset cs, byte[] ba, int off, int len) {
        CharsetDecoder cd = cs.newDecoder();
        int en = (int)(len * cd.maxCharsPerByte());
        char[] ca = new char[en];
        if (len == 0) {
            return ca;
        }
        cd.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE)
          .reset();

        ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
        CharBuffer cb = CharBuffer.wrap(ca);
        try {
            CoderResult cr = cd.decode(bb, cb, true);
            if (!cr.isUnderflow()) {
                cr.throwException();
            }
            cr = cd.flush(cb);
            if (!cr.isUnderflow()) {
                cr.throwException();
            }
        } catch (CharacterCodingException x) {
            throw new Error(x);
        }
        return Arrays.copyOf(ca, cb.position());
    }

    /**
     * Encodes {@code cc} with a fresh encoder of charset {@code csn} in a
     * single call with end-of-input set. The output buffer holds 4 bytes per
     * input char.
     *
     * @param cc         chars to encode
     * @param csn        charset name
     * @param testDirect whether to use direct buffers
     * @return the encoded bytes
     * @throws RuntimeException if the encoder reports anything but UNDERFLOW
     */
    static byte[] encode(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = newByteBuffer(cc.length * 4, testDirect);
        CharBuffer cbf = readableChars(cc, testDirect);

        CoderResult cr = enc.encode(cbf, bbf, true);
        if (cr != CoderResult.UNDERFLOW) {
            throw new RuntimeException("Encoding err: " + csn);
        }
        bbf.flip();
        byte[] bb = new byte[bbf.remaining()];
        bbf.get(bb);
        return bb;
    }

    /**
     * Same set-up as {@link #encode(char[], String, boolean)} but returns the
     * raw {@link CoderResult} instead of the encoded bytes.
     */
    static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = newByteBuffer(cc.length * 4, testDirect);
        CharBuffer cbf = readableChars(cc, testDirect);
        return enc.encode(cbf, bbf, true);
    }

    /**
     * Builds the UTF-16 form of every code point from U+0000 to U+10FFFF in
     * ascending order, skipping the surrogate range U+D800..U+DFFF;
     * supplementary code points are stored as surrogate pairs.
     */
    static char[] getUTFChars() {
        char[] cc = new char[0x10000 - 0xe000 + 0xd800 +    //bmp
                             (0x110000 - 0x10000) * 2];     //supp
        int pos = 0;
        for (int cp = 0; cp < 0xd800; cp++) {
            cc[pos++] = (char)cp;
        }
        for (int cp = 0xe000; cp < 0x10000; cp++) {
            cc[pos++] = (char)cp;
        }
        for (int cp = 0x10000; cp < 0x110000; cp++) {
            pos += Character.toChars(cp, cc, pos);
        }
        return cc;
    }

    /**
     * Writes {@code c} into {@code bb} at {@code pos} using the 3-byte UTF-8
     * bit layout (1110xxxx 10xxxxxx 10xxxxxx), whatever the value of
     * {@code c}, so it can also produce 3-byte encoded surrogates.
     *
     * @return the number of bytes written, always 3
     */
    static int to3ByteUTF8(char c, byte[] bb, int pos) {
        bb[pos]     = (byte)(0xe0 | (c >> 12));
        bb[pos + 1] = (byte)(0x80 | ((c >> 6) & 0x3f));
        bb[pos + 2] = (byte)(0x80 | (c & 0x3f));
        return 3;
    }

    /**
     * Encodes and decodes the whole of {@link #getUTFChars()} through heap
     * buffers, direct buffers and the String API and checks that each path
     * round-trips.
     *
     * @throws RuntimeException if any path does not round-trip
     */
    static void checkRoundtrip(String csn) throws Exception {
        System.out.printf(" Check roundtrip <%s>...", csn);
        boolean failed = false;
        char[] cc = getUTFChars();

        byte[] bb = encode(cc, csn, false);
        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" non-direct failed");
            failed = true;
        }

        bb = encode(cc, csn, true);
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.print(" (direct) failed");
            failed = true;
        }

        // String.getBytes()/toCharArray() goes to ArrayDe/Encoder path
        if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
            System.out.printf(" String.getBytes() failed");
            failed = true;
        }
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
            System.out.printf(" String.toCharArray() failed");
            failed = true;
        }
        System.out.println();
        if (failed) {
            throw new RuntimeException("Check roundtrip failed " + csn);
        }
    }

    /**
     * Encodes every supplementary code point as two 3-byte sequences (one per
     * surrogate) and checks that charset {@code csn} decodes them back to the
     * surrogate pairs.
     *
     * @throws RuntimeException if any decoding path yields different chars
     */
    static void check6ByteSurrs(String csn) throws Exception {
        System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn);
        boolean failed = false;
        byte[] bb = new byte[(0x110000 - 0x10000) * 6];
        char[] cc = new char[(0x110000 - 0x10000) * 2];
        int bpos = 0;
        int cpos = 0;
        for (int cp = 0x10000; cp < 0x110000; cp++) {
            Character.toChars(cp, cc, cpos);
            bpos += to3ByteUTF8(cc[cpos], bb, bpos);
            bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
            cpos += 2;
        }

        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding(direct) failed%n");
            failed = true;
        }
        // new String(bb, csn).getBytes(csn) will not return
        // the 6 bytes surrogates as in bb, so only test
        // toCharArray() here.
        if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
            System.out.printf(" String.toCharArray() failed%n");
            failed = true;
        }
        if (failed) {
            throw new RuntimeException("Check 6-byte Surrogates failed " + csn);
        }
    }

    /**
     * Checks that two charsets encode {@link #getUTFChars()} to the same
     * bytes and decode the bytes produced by {@code csn1} to the same chars,
     * for heap and direct buffers.
     *
     * @throws RuntimeException if the two charsets disagree anywhere
     */
    static void compare(String csn1, String csn2) throws Exception {
        System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2);
        boolean failed = false;
        char[] cc = getUTFChars();

        byte[] bb1 = encode(cc, csn1, false);
        byte[] bb2 = encode(cc, csn2, false);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding failed%n");
            failed = true;
        }
        char[] cc1 = decode(bb1, csn1, false);
        char[] cc2 = decode(bb1, csn2, false);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }

        bb1 = encode(cc, csn1, true);
        bb2 = encode(cc, csn2, true);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding (direct) failed%n");
            failed = true;
        }
        cc1 = decode(bb1, csn1, true);
        cc2 = decode(bb1, csn2, true);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding (direct) failed%n");
            failed = true;
        }
        if (failed) {
            throw new RuntimeException("Diff failed " + csn1 + " " + csn2);
        }
    }

    // Malformed inputs. The first byte of each row is the malformed length
    // the decoder is expected to report; the remaining bytes are the input.
    static byte[][] malformed = {
        // One-byte sequences:
        {1, (byte)0xFF },
        {1, (byte)0xC0 },
        {1, (byte)0x80 },

        {1, (byte)0xFF, (byte)0xFF}, // all ones
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble

        // Two-byte sequences:
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte

        // Three-byte sequences
        {1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded

        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte

        // Four-byte sequences
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte

        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte

        // Five-byte sequences
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },

        // Six-byte sequences
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
    };

    /**
     * Decodes every row of {@link #malformed} and checks that the decoder
     * reports a malformed-input result of the expected length, and that
     * {@code new String(bytes, csn)} agrees with the replacing reference
     * decoder {@link #decode(Charset, byte[], int, int)}.
     *
     * @throws RuntimeException if any row fails
     */
    static void checkMalformed(String csn) throws Exception {
        boolean failed = false;
        System.out.printf(" Check malformed <%s>...%n", csn);
        Charset cs = Charset.forName(csn);
        for (boolean direct : new boolean[] {false, true}) {
            for (byte[] row : malformed) {
                int expectedLen = row[0];
                byte[] input = Arrays.copyOfRange(row, 1, row.length);
                CoderResult cr = decodeCR(input, csn, direct);
                String ashex = toHexString(input);
                if (!cr.isMalformed()) {
                    System.out.printf(" FAIL(direct=%b): [%s] not malformed.%n",
                                      direct, ashex);
                    failed = true;
                } else if (cr.length() != expectedLen) {
                    System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].%n",
                                      direct, ashex, cr.length());
                    failed = true;
                }
                if (!Arrays.equals(decode(cs, input, 0, input.length),
                                   new String(input, csn).toCharArray())) {
                    System.out.printf(" FAIL(new String(bb, %s)) failed%n", csn);
                    failed = true;
                }
            }
        }
        if (failed) {
            throw new RuntimeException("Check malformed failed " + csn);
        }
    }

    /**
     * Runs one decode step (end-of-input not set) and compares the outcome
     * with the expectation described by {@code flow}.
     *
     * @param dec    decoder to use; it is reset before decoding
     * @param utf8s  input bytes, copied into the input buffer at flow[0]
     * @param direct whether to use direct buffers
     * @param flow   {inPos, inLen, outPos, outLen, expected inPos,
     *               expected outPos, expected result (0 = UNDERFLOW,
     *               anything else = OVERFLOW)}
     * @return true if the result and both final positions are as expected;
     *         otherwise prints a diagnostic and returns false
     */
    static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
        int inPos = flow[0];
        int inLen = flow[1];
        int outPos = flow[2];
        int outLen = flow[3];
        int expedInPos = flow[4];
        int expedOutPos = flow[5];
        CoderResult expedCR = (flow[6] == 0) ? CoderResult.UNDERFLOW
                                             : CoderResult.OVERFLOW;

        ByteBuffer bbf = newByteBuffer(inPos + utf8s.length, direct);
        CharBuffer cbf = newCharBuffer(outPos + outLen, direct);

        // Place the input at inPos, then expose only inLen bytes of it.
        bbf.position(inPos);
        bbf.put(utf8s);
        bbf.flip();
        bbf.position(inPos);
        bbf.limit(inPos + inLen);
        cbf.position(outPos);

        dec.reset();
        CoderResult cr = dec.decode(bbf, cbf, false);
        if (cr == expedCR &&
            bbf.position() == expedInPos &&
            cbf.position() == expedOutPos) {
            return true;
        }
        System.out.printf("Expected(direct=%5b): [", direct);
        for (int i : flow) {
            System.out.print(" " + i);
        }
        System.out.println("] CR=" + cr +
                           ", inPos=" + bbf.position() +
                           ", outPos=" + cbf.position());
        return false;
    }

    /**
     * Decodes a 10-byte input made of a 1-, 2-, 3- and 4-byte sequence with
     * varying input lengths, output capacities and buffer offsets, and checks
     * the reported under/overflow and the final buffer positions.
     *
     * @throws RuntimeException if any combination fails
     */
    static void checkUnderOverflow(String csn) throws Exception {
        System.out.printf(" Check under/overflow <%s>...%n", csn);
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        boolean failed = false;
        byte[] utf8s = "\u007f\u07ff\ue000\ud800\udc00".getBytes("UTF-8");
        int inlen = utf8s.length;

        for (int inoff = 0; inoff < 20; inoff++) {
            for (int outoff = 0; outoff < 20; outoff++) {
                int[][] flows = {
                    //inpos, inLen, outPos, outLen, inPosEP, outposEP, under(0)/over(1)
                    {inoff, inlen, outoff, 1, inoff + 1, outoff + 1, 1},
                    {inoff, inlen, outoff, 2, inoff + 3, outoff + 2, 1},
                    {inoff, inlen, outoff, 3, inoff + 6, outoff + 3, 1},
                    {inoff, inlen, outoff, 4, inoff + 6, outoff + 3, 1},
                    {inoff, inlen, outoff, 5, inoff + 10,outoff + 5, 0},
                    // underflow
                    {inoff, 1, outoff, 5, inoff + 1, outoff + 1, 0},
                    {inoff, 2, outoff, 5, inoff + 1, outoff + 1, 0},
                    {inoff, 3, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 4, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 5, outoff, 5, inoff + 3, outoff + 2, 0},
                    {inoff, 6, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 7, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 8, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 9, outoff, 5, inoff + 6, outoff + 3, 0},
                    {inoff, 10, outoff, 5, inoff + 10,outoff + 5, 0},
                    // 2-byte underflow/overflow
                    {inoff, 2, outoff, 1, inoff + 1, outoff + 1, 0},
                    {inoff, 3, outoff, 1, inoff + 1, outoff + 1, 1},
                    // 3-byte underflow/overflow
                    {inoff, 4, outoff, 2, inoff + 3, outoff + 2, 0},
                    {inoff, 5, outoff, 2, inoff + 3, outoff + 2, 0},
                    {inoff, 6, outoff, 2, inoff + 3, outoff + 2, 1},
                    // 4-byte underflow/overflow
                    {inoff, 7, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 8, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 9, outoff, 4, inoff + 6, outoff + 3, 0},
                    {inoff, 10, outoff, 4, inoff + 6, outoff + 3, 1},
                };
                for (boolean direct : new boolean[] {false, true}) {
                    for (int[] flow : flows) {
                        if (!check(dec, utf8s, direct, flow)) {
                            failed = true;
                        }
                    }
                }
            }
        }
        if (failed) {
            throw new RuntimeException("Check under/overflow failed " + csn);
        }
    }

    /** Runs every enabled check against the "UTF-8" charset. */
    public static void main(String[] args) throws Exception {
        checkRoundtrip("UTF-8");
        check6ByteSurrs("UTF-8");
        //compare("UTF-8", "UTF-8-OLD");
        checkMalformed("UTF-8");
        checkUnderOverflow("UTF-8");
    }
}