/*
 * Copyright (c) 2008, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 * @test
 * @bug 4486841
 * @summary Test UTF-8 charset
 */

import java.nio.charset.*;
import java.nio.*;
import java.util.*;

/**
 * Exercises a charset's encoder and decoder through both heap and direct
 * NIO buffers: full roundtrip, surrogates written as two 3-byte sequences,
 * malformed-input lengths, and underflow/overflow positions.
 *
 * Every check reports its details on standard output and then throws a
 * {@link RuntimeException} if anything did not match.
 */
public class TestUTF8 {

    /**
     * Returns a buffer positioned at 0 and limited to {@code bb.length} that
     * holds the given bytes: a wrapper around {@code bb} itself for the heap
     * case, a freshly filled direct buffer otherwise.
     */
    private static ByteBuffer sourceBytes(byte[] bb, boolean direct) {
        if (!direct) {
            return ByteBuffer.wrap(bb);
        }
        ByteBuffer bbf = ByteBuffer.allocateDirect(bb.length);
        bbf.put(bb).flip();
        return bbf;
    }

    /**
     * Returns a buffer positioned at 0 that holds the given chars: a wrapper
     * around {@code cc} itself for the heap case, a freshly filled char view
     * of a direct byte buffer otherwise.
     */
    private static CharBuffer sourceChars(char[] cc, boolean direct) {
        if (!direct) {
            return CharBuffer.wrap(cc);
        }
        CharBuffer cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
        cbf.put(cc).flip();
        return cbf;
    }

    /** Returns an empty heap or direct-backed char buffer holding {@code chars} chars. */
    private static CharBuffer targetChars(int chars, boolean direct) {
        return direct ? ByteBuffer.allocateDirect(chars * 2).asCharBuffer()
                      : CharBuffer.allocate(chars);
    }

    /** Returns an empty heap or direct byte buffer holding {@code bytes} bytes. */
    private static ByteBuffer targetBytes(int bytes, boolean direct) {
        return direct ? ByteBuffer.allocateDirect(bytes)
                      : ByteBuffer.allocate(bytes);
    }

    /**
     * Decodes {@code bb} with a new decoder of charset {@code csn} in a single
     * {@code decode} call that signals end of input.
     *
     * @param bb         the bytes to decode
     * @param csn        the charset name
     * @param testDirect use direct buffers instead of array-backed ones
     * @return the decoded chars
     * @throws RuntimeException if the call does not end in UNDERFLOW
     */
    static char[] decode(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = sourceBytes(bb, testDirect);
        // One char per input byte is the capacity the test has always used.
        CharBuffer cbf = targetChars(bb.length, testDirect);
        CoderResult cr = dec.decode(bbf, cbf, true);
        if (!cr.isUnderflow()) {
            throw new RuntimeException("Decoding err: " + csn);
        }
        char[] cc = new char[cbf.position()];
        cbf.flip();
        cbf.get(cc);
        return cc;
    }

    /**
     * Same setup as {@link #decode}, but hands back the raw result of the
     * {@code decode} call instead of the decoded chars.
     *
     * @param bb         the bytes to decode
     * @param csn        the charset name
     * @param testDirect use direct buffers instead of array-backed ones
     * @return the {@link CoderResult} of the single decode call
     */
    static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
        throws Exception {
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        ByteBuffer bbf = sourceBytes(bb, testDirect);
        CharBuffer cbf = targetChars(bb.length, testDirect);
        return dec.decode(bbf, cbf, true);
    }

    /**
     * Encodes {@code cc} with a new encoder of charset {@code csn} in a single
     * {@code encode} call that signals end of input. The output buffer has
     * room for four bytes per input char.
     *
     * @param cc         the chars to encode
     * @param csn        the charset name
     * @param testDirect use direct buffers instead of array-backed ones
     * @return the encoded bytes
     * @throws RuntimeException if the call does not end in UNDERFLOW
     */
    static byte[] encode(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = targetBytes(cc.length * 4, testDirect);
        CharBuffer cbf = sourceChars(cc, testDirect);
        CoderResult cr = enc.encode(cbf, bbf, true);
        if (!cr.isUnderflow()) {
            throw new RuntimeException("Encoding err: " + csn);
        }
        byte[] bb = new byte[bbf.position()];
        bbf.flip();
        bbf.get(bb);
        return bb;
    }

    /**
     * Same setup as {@link #encode}, but hands back the raw result of the
     * {@code encode} call instead of the encoded bytes.
     *
     * @param cc         the chars to encode
     * @param csn        the charset name
     * @param testDirect use direct buffers instead of array-backed ones
     * @return the {@link CoderResult} of the single encode call
     */
    static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
        throws Exception {
        CharsetEncoder enc = Charset.forName(csn).newEncoder();
        ByteBuffer bbf = targetBytes(cc.length * 4, testDirect);
        CharBuffer cbf = sourceChars(cc, testDirect);
        return enc.encode(cbf, bbf, true);
    }

    /**
     * Builds the test corpus: every BMP char outside 0xd800..0xdfff in
     * ascending order, followed by the surrogate pair of every code point
     * from 0x10000 up to 0x10ffff.
     */
    static char[] getUTFChars() {
        char[] cc = new char[0x10000 - 0xe000 + 0xd800 +    //bmp
                             (0x110000 - 0x10000) * 2];     //supp
        int pos = 0;
        for (int i = 0; i < 0xd800; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0xe000; i < 0x10000; i++) {
            cc[pos++] = (char)i;
        }
        for (int i = 0x10000; i < 0x110000; i++) {
            pos += Character.toChars(i, cc, pos);
        }
        return cc;
    }

    /**
     * Writes {@code c} at {@code bb[pos..pos+2]} using the 3-byte UTF-8 bit
     * layout, whatever the value of {@code c} is (callers use it to write
     * each half of a surrogate pair on its own).
     *
     * @return the number of bytes written, always 3
     */
    static int to3ByteUTF8(char c, byte[] bb, int pos) {
        bb[pos]     = (byte)(0xe0 | (c >> 12));
        bb[pos + 1] = (byte)(0x80 | ((c >> 6) & 0x3f));
        bb[pos + 2] = (byte)(0x80 | (c & 0x3f));
        return 3;
    }

    /**
     * Encodes the whole corpus of {@link #getUTFChars()} and decodes it
     * again, once with heap buffers and once with direct buffers.
     *
     * @throws RuntimeException if either roundtrip does not reproduce the input
     */
    static void checkRoundtrip(String csn) throws Exception {
        System.out.printf(" Check roundtrip <%s>...", csn);
        boolean failed = false;
        char[] cc = getUTFChars();

        byte[] bb = encode(cc, csn, false);
        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" non-direct failed");
            failed = true;
        }

        bb = encode(cc, csn, true);
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" (direct) failed");
            failed = true;
        }
        System.out.println();

        // A mismatch must fail the test, not just leave a line in the log.
        if (failed) {
            throw new RuntimeException("Check roundtrip failed " + csn);
        }
    }

    /**
     * Writes every supplementary code point as two 3-byte sequences (one per
     * surrogate) and checks that decoding yields the surrogate pairs back,
     * with heap and with direct buffers.
     *
     * @throws RuntimeException if decoding fails or the result differs
     */
    static void check6ByteSurrs(String csn) throws Exception {
        System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn);
        boolean failed = false;
        byte[] bb = new byte[(0x110000 - 0x10000) * 6];
        char[] cc = new char[(0x110000 - 0x10000) * 2];
        int bpos = 0;
        int cpos = 0;
        for (int i = 0x10000; i < 0x110000; i++) {
            Character.toChars(i, cc, cpos);
            bpos += to3ByteUTF8(cc[cpos], bb, bpos);
            bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
            cpos += 2;
        }

        char[] ccO = decode(bb, csn, false);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }
        ccO = decode(bb, csn, true);
        if (!Arrays.equals(cc, ccO)) {
            System.out.printf(" decoding(direct) failed%n");
            failed = true;
        }

        if (failed) {
            throw new RuntimeException("Check 6-byte Surrogates failed " + csn);
        }
    }

    /**
     * Checks that two charsets encode the corpus to the same bytes and decode
     * the bytes produced by {@code csn1} to the same chars, with heap and
     * with direct buffers.
     *
     * @throws RuntimeException if any of the four comparisons differs
     */
    static void compare(String csn1, String csn2) throws Exception {
        System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2);
        boolean failed = false;
        char[] cc = getUTFChars();

        byte[] bb1 = encode(cc, csn1, false);
        byte[] bb2 = encode(cc, csn2, false);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding failed%n");
            failed = true;
        }
        // Both decoders are fed the output of csn1's encoder.
        char[] cc1 = decode(bb1, csn1, false);
        char[] cc2 = decode(bb1, csn2, false);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding failed%n");
            failed = true;
        }

        bb1 = encode(cc, csn1, true);
        bb2 = encode(cc, csn2, true);
        if (!Arrays.equals(bb1, bb2)) {
            System.out.printf(" encoding (direct) failed%n");
            failed = true;
        }
        cc1 = decode(bb1, csn1, true);
        cc2 = decode(bb1, csn2, true);
        if (!Arrays.equals(cc1, cc2)) {
            System.out.printf(" decoding (direct) failed%n");
            failed = true;
        }

        if (failed) {
            throw new RuntimeException(
                "Diff failed " + csn1 + " " + csn2);
        }
    }

    // Malformed inputs for checkMalformed(). In each row the first byte is
    // the malformed length the decoder is expected to report; the remaining
    // bytes are the input handed to the decoder.
    static byte[][] malformed = {
        // One-byte sequences:
        {1, (byte)0xFF },
        {1, (byte)0xC0 },
        {1, (byte)0x80 },

        {1, (byte)0xFF, (byte)0xFF}, // all ones
        {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble

        // Two-byte sequences:
        {1, (byte)0xC0, (byte)0x80}, // invalid first byte
        {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
        {1, (byte)0xC2, (byte)0x00}, // invalid second byte
        {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
        {1, (byte)0xD0, (byte)0x00}, // invalid second byte
        {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
        {1, (byte)0xDF, (byte)0x00}, // invalid second byte
        {1, (byte)0xDF, (byte)0xC0}, // invalid second byte

        // Three-byte sequences
        {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
        {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded

        {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
        {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
        {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte

        // Four-byte sequences
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
        {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
        {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte

        {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
        {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
        {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
        {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
        {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte

        // Five-byte sequences
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded

        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },

        // Six-byte sequences
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
        {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
        // NOTE(review): the rows below lead with 0xF8 (a five-byte lead) even
        // though they sit in the six-byte section -- confirm this is intended.
        {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
        {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
        {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
        {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
        {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
    };

    /**
     * Decodes every row of {@link #malformed} with heap and with direct
     * buffers and checks that the result is a malformed-input error of the
     * expected length. Each offending row is printed with its input bytes
     * rendered in binary.
     *
     * @throws RuntimeException if any row was accepted or reported a
     *         different length
     */
    static void checkMalformed(String csn) throws Exception {
        boolean failed = false;
        System.out.printf(" Check malformed <%s>...%n", csn);
        for (boolean direct: new boolean[] {false, true}) {
            for (byte[] bins : malformed) {
                int mlen = bins[0];
                byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
                CoderResult cr = decodeCR(bin, csn, direct);

                // Space-separated binary rendering of the input bytes.
                StringBuilder bits = new StringBuilder();
                for (int i = 0; i < bin.length; i++) {
                    if (i > 0) {
                        bits.append(" ");
                    }
                    bits.append(Integer.toBinaryString((int)bin[i] & 0xff));
                }

                if (!cr.isMalformed()) {
                    System.out.printf(" FAIL(direct=%b): [%s] not malformed.\n", direct, bits);
                    failed = true;
                } else if (cr.length() != mlen) {
                    System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].\n", direct, bits, cr.length());
                    failed = true;
                }
            }
        }
        if (failed) {
            throw new RuntimeException("Check malformed failed " + csn);
        }
    }

    /**
     * Runs one decode call (end of input not signalled) over a window of the
     * input and output buffers and compares result and final positions with
     * the expectation; a mismatch is printed.
     *
     * @param dec    the decoder to use; it is reset before the call
     * @param utf8s  the input bytes, placed at offset {@code flow[0]}
     * @param direct use direct buffers instead of array-backed ones
     * @param flow   {@code {inPos, inLen, outPos, outLen, expected inPos,
     *               expected outPos, 0 for UNDERFLOW or anything else for
     *               OVERFLOW}}
     * @return {@code true} if result and both positions match
     */
    static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
        int inPos = flow[0];
        int inLen = flow[1];
        int outPos = flow[2];
        int outLen = flow[3];
        int expectedInPos = flow[4];
        int expectedOutPos = flow[5];
        CoderResult expectedCR = (flow[6] == 0) ? CoderResult.UNDERFLOW
                                                : CoderResult.OVERFLOW;
        ByteBuffer bbf = targetBytes(inPos + utf8s.length, direct);
        CharBuffer cbf = targetChars(outPos + outLen, direct);

        // Place the input at inPos and expose only inLen bytes of it.
        bbf.position(inPos);
        bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
        cbf.position(outPos);
        dec.reset();
        CoderResult cr = dec.decode(bbf, cbf, false);
        if (cr != expectedCR ||
            bbf.position() != expectedInPos ||
            cbf.position() != expectedOutPos) {
            System.out.printf("Expected(direct=%5b): [", direct);
            for (int i : flow) {
                System.out.print(" " + i);
            }
            System.out.println("] CR=" + cr +
                               ", inPos=" + bbf.position() +
                               ", outPos=" + cbf.position());
            return false;
        }
        return true;
    }

    /**
     * Decodes a 10-byte input made of a 1-, 2-, 3- and 4-byte sequence with
     * varying input lengths, output capacities and buffer offsets, and checks
     * via {@link #check} where the decoder stops and what it reports.
     *
     * @throws RuntimeException if any scenario does not match
     */
    static void checkUnderOverflow(String csn) throws Exception {
        System.out.printf(" Check under/overflow <%s>...%n", csn);
        CharsetDecoder dec = Charset.forName(csn).newDecoder();
        boolean failed = false;
        byte[] utf8s = "\u007f\u07ff\ue000\ud800\udc00".getBytes("UTF-8");
        int inlen = utf8s.length;

        for (int inoff = 0; inoff < 20; inoff++) {
            for (int outoff = 0; outoff < 20; outoff++) {
                int[][] flows = {
                    //inpos, inLen, outPos, outLen, inPosEP,   outposEP,   under(0)/over(1)
                    {inoff,  inlen, outoff, 1,      inoff + 1, outoff + 1, 1},
                    {inoff,  inlen, outoff, 2,      inoff + 3, outoff + 2, 1},
                    {inoff,  inlen, outoff, 3,      inoff + 6, outoff + 3, 1},
                    {inoff,  inlen, outoff, 4,      inoff + 6, outoff + 3, 1},
                    {inoff,  inlen, outoff, 5,      inoff + 10,outoff + 5, 0},
                    // underflow
                    {inoff,  1,     outoff, 5,      inoff + 1, outoff + 1, 0},
                    {inoff,  2,     outoff, 5,      inoff + 1, outoff + 1, 0},
                    {inoff,  3,     outoff, 5,      inoff + 3, outoff + 2, 0},
                    {inoff,  4,     outoff, 5,      inoff + 3, outoff + 2, 0},
                    {inoff,  5,     outoff, 5,      inoff + 3, outoff + 2, 0},
                    {inoff,  6,     outoff, 5,      inoff + 6, outoff + 3, 0},
                    {inoff,  7,     outoff, 5,      inoff + 6, outoff + 3, 0},
                    {inoff,  8,     outoff, 5,      inoff + 6, outoff + 3, 0},
                    {inoff,  9,     outoff, 5,      inoff + 6, outoff + 3, 0},
                    {inoff,  10,    outoff, 5,      inoff + 10,outoff + 5, 0},
                    // 2-byte underflow/overflow
                    {inoff,  2,     outoff, 1,      inoff + 1, outoff + 1, 0},
                    {inoff,  3,     outoff, 1,      inoff + 1, outoff + 1, 1},
                    // 3-byte underflow/overflow
                    {inoff,  4,     outoff, 2,      inoff + 3, outoff + 2, 0},
                    {inoff,  5,     outoff, 2,      inoff + 3, outoff + 2, 0},
                    {inoff,  6,     outoff, 2,      inoff + 3, outoff + 2, 1},
                    // 4-byte underflow/overflow
                    {inoff,  7,     outoff, 4,      inoff + 6, outoff + 3, 0},
                    {inoff,  8,     outoff, 4,      inoff + 6, outoff + 3, 0},
                    {inoff,  9,     outoff, 4,      inoff + 6, outoff + 3, 0},
                    {inoff,  10,    outoff, 4,      inoff + 6, outoff + 3, 1},
                };
                for (boolean direct: new boolean[] {false, true}) {
                    for (int[] flow: flows) {
                        if (!check(dec, utf8s, direct, flow)) {
                            failed = true;
                        }
                    }
                }
            }
        }
        if (failed) {
            throw new RuntimeException("Check under/overflow failed " + csn);
        }
    }

    /** Runs every enabled check against the "UTF-8" charset. */
    public static void main(String[] args) throws Exception {
        checkRoundtrip("UTF-8");
        check6ByteSurrs("UTF-8");
        //compare("UTF-8", "UTF-8-OLD");
        checkMalformed("UTF-8");
        checkUnderOverflow("UTF-8");
    }
}