1 /*
   2  * Copyright (c) 2008, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /*
  25  * @test
  26  * @bug 4486841 7040220
  27  * @summary Test UTF-8 charset
  28  */
  29 
  30 import java.nio.charset.*;
  31 import java.nio.*;
  32 import java.util.*;
  33 
  34 public class TestUTF8 {
  35     static char[] decode(byte[] bb, String csn, boolean testDirect)
  36         throws Exception {
  37         CharsetDecoder dec = Charset.forName(csn).newDecoder();
  38         ByteBuffer bbf;
  39         CharBuffer cbf;
  40         if (testDirect) {
  41             bbf = ByteBuffer.allocateDirect(bb.length);
  42             cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
  43             bbf.put(bb).flip();
  44         } else {
  45             bbf = ByteBuffer.wrap(bb);
  46             cbf = CharBuffer.allocate(bb.length);
  47         }
  48         CoderResult cr = dec.decode(bbf, cbf, true);
  49         if (cr != CoderResult.UNDERFLOW)
  50             throw new RuntimeException("Decoding err: " + csn);
  51         char[] cc = new char[cbf.position()];
  52         cbf.flip(); cbf.get(cc);
  53         return cc;
  54 
  55     }
  56 
  57     static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
  58         throws Exception {
  59         CharsetDecoder dec = Charset.forName(csn).newDecoder();
  60         ByteBuffer bbf;
  61         CharBuffer cbf;
  62         if (testDirect) {
  63             bbf = ByteBuffer.allocateDirect(bb.length);
  64             cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
  65             bbf.put(bb).flip();
  66         } else {
  67             bbf = ByteBuffer.wrap(bb);
  68             cbf = CharBuffer.allocate(bb.length);
  69         }
  70         return dec.decode(bbf, cbf, true);
  71     }
  72 
  73     // copy/paste of the StringCoding.decode()
  74     static char[] decode(Charset cs, byte[] ba, int off, int len) {
  75         CharsetDecoder cd = cs.newDecoder();
  76         int en = (int)(len * cd.maxCharsPerByte());
  77         char[] ca = new char[en];
  78         if (len == 0)
  79             return ca;
  80         cd.onMalformedInput(CodingErrorAction.REPLACE)
  81           .onUnmappableCharacter(CodingErrorAction.REPLACE)
  82           .reset();
  83 
  84         ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
  85         CharBuffer cb = CharBuffer.wrap(ca);
  86         try {
  87             CoderResult cr = cd.decode(bb, cb, true);
  88             if (!cr.isUnderflow())
  89                 cr.throwException();
  90             cr = cd.flush(cb);
  91             if (!cr.isUnderflow())
  92                 cr.throwException();
  93         } catch (CharacterCodingException x) {
  94             throw new Error(x);
  95         }
  96         return Arrays.copyOf(ca, cb.position());
  97     }
  98 
  99     static byte[] encode(char[] cc, String csn, boolean testDirect)
 100         throws Exception {
 101         ByteBuffer bbf;
 102         CharBuffer cbf;
 103         CharsetEncoder enc = Charset.forName(csn).newEncoder();
 104         if (testDirect) {
 105             bbf = ByteBuffer.allocateDirect(cc.length * 4);
 106             cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
 107             cbf.put(cc).flip();
 108         } else {
 109             bbf = ByteBuffer.allocate(cc.length * 4);
 110             cbf = CharBuffer.wrap(cc);
 111         }
 112 
 113         CoderResult cr = enc.encode(cbf, bbf, true);
 114         if (cr != CoderResult.UNDERFLOW)
 115             throw new RuntimeException("Encoding err: " + csn);
 116         byte[] bb = new byte[bbf.position()];
 117         bbf.flip(); bbf.get(bb);
 118         return bb;
 119     }
 120 
 121     static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
 122         throws Exception {
 123         ByteBuffer bbf;
 124         CharBuffer cbf;
 125         CharsetEncoder enc = Charset.forName(csn).newEncoder();
 126         if (testDirect) {
 127             bbf = ByteBuffer.allocateDirect(cc.length * 4);
 128             cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
 129             cbf.put(cc).flip();
 130         } else {
 131             bbf = ByteBuffer.allocate(cc.length * 4);
 132             cbf = CharBuffer.wrap(cc);
 133         }
 134         return enc.encode(cbf, bbf, true);
 135     }
 136 
 137     static char[] getUTFChars() {
 138         char[] cc = new char[0x10000 - 0xe000 + 0xd800 + //bmp
 139                              (0x110000 - 0x10000) * 2];    //supp
 140         int pos = 0;
 141         int i = 0;
 142         for (i = 0; i < 0xd800; i++)
 143             cc[pos++] = (char)i;
 144         for (i = 0xe000; i < 0x10000; i++)
 145             cc[pos++] = (char)i;
 146         for (i = 0x10000; i < 0x110000; i++) {
 147             pos += Character.toChars(i, cc, pos);
 148         }
 149         return cc;
 150     }
 151 
 152     static int to3ByteUTF8(char c, byte[] bb, int pos) {
 153         bb[pos++] = (byte)(0xe0 | ((c >> 12)));
 154         bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f));
 155         bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f));
 156         return 3;
 157     }
 158 
 159     static void checkRoundtrip(String csn) throws Exception {
 160         System.out.printf("    Check roundtrip <%s>...", csn);
 161         char[] cc = getUTFChars();
 162         byte[] bb = encode(cc, csn, false);
 163         char[] ccO = decode(bb, csn, false);
 164 
 165         if (!Arrays.equals(cc, ccO)) {
 166             System.out.printf("    non-direct failed");
 167         }
 168         bb = encode(cc, csn, true);
 169         ccO = decode(bb, csn, true);
 170         if (!Arrays.equals(cc, ccO)) {
 171             System.out.print("    (direct) failed");
 172         }
 173         // String.getBytes()/toCharArray() goes to ArrayDe/Encoder path
 174         if (!Arrays.equals(bb, new String(cc).getBytes(csn))) {
 175             System.out.printf("    String.getBytes() failed");
 176         }
 177         if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
 178             System.out.printf("    String.toCharArray() failed");
 179         }
 180         System.out.println();
 181     }
 182 
 183     static void check6ByteSurrs(String csn) throws Exception {
 184         System.out.printf("    Check 6-byte Surrogates <%s>...%n", csn);
 185         byte[] bb = new byte[(0x110000 - 0x10000) * 6];
 186         char[] cc = new char[(0x110000 - 0x10000) * 2];
 187         int bpos = 0;
 188         int cpos = 0;
 189         for (int i = 0x10000; i < 0x110000; i++) {
 190             Character.toChars(i, cc, cpos);
 191             bpos += to3ByteUTF8(cc[cpos], bb, bpos);
 192             bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
 193             cpos += 2;
 194         }
 195 
 196         char[] ccO = decode(bb, csn, false);
 197         if (!Arrays.equals(cc, ccO)) {
 198             System.out.printf("    decoding failed%n");
 199         }
 200         ccO = decode(bb, csn, true);
 201         if (!Arrays.equals(cc, ccO)) {
 202             System.out.printf("    decoding(direct) failed%n");
 203         }
 204         // new String(bb, csn).getBytes(csn) will not return
 205         // the 6 bytes surrogates as in bb, so only test
 206         // toCharArray() here.
 207         if (!Arrays.equals(cc, new String(bb, csn).toCharArray())) {
 208             System.out.printf("    String.toCharArray() failed");
 209         }
 210     }
 211 
 212     static void compare(String csn1, String csn2) throws Exception {
 213         System.out.printf("    Diff <%s> <%s>...%n", csn1, csn2);
 214         char[] cc = getUTFChars();
 215 
 216         byte[] bb1 = encode(cc, csn1, false);
 217         byte[] bb2 = encode(cc, csn2, false);
 218         if (!Arrays.equals(bb1, bb2))
 219             System.out.printf("        encoding failed%n");
 220         char[] cc1 = decode(bb1, csn1, false);
 221         char[] cc2 = decode(bb1, csn2, false);
 222         if (!Arrays.equals(cc1, cc2)) {
 223             System.out.printf("        decoding failed%n");
 224         }
 225 
 226         bb1 = encode(cc, csn1, true);
 227         bb2 = encode(cc, csn2, true);
 228         if (!Arrays.equals(bb1, bb2))
 229             System.out.printf("        encoding (direct) failed%n");
 230         cc1 = decode(bb1, csn1, true);
 231         cc2 = decode(bb1, csn2, true);
 232         if (!Arrays.equals(cc1, cc2)) {
 233             System.out.printf("        decoding (direct) failed%n");
 234         }
 235     }
 236 
 237     // The first byte is the length of malformed bytes
 238     static byte[][] malformed = {
 239         // One-byte sequences:
 240         {1, (byte)0xFF },
 241         {1, (byte)0xC0 },
 242         {1, (byte)0x80 },
 243 
 244         {1, (byte)0xFF, (byte)0xFF}, // all ones
 245         {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble
 246 
 247         // Two-byte sequences:
 248         {1, (byte)0xC0, (byte)0x80}, // invalid first byte
 249         {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
 250         {1, (byte)0xC2, (byte)0x00}, // invalid second byte
 251         {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
 252         {1, (byte)0xD0, (byte)0x00}, // invalid second byte
 253         {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
 254         {1, (byte)0xDF, (byte)0x00}, // invalid second byte
 255         {1, (byte)0xDF, (byte)0xC0}, // invalid second byte
 256 
 257         // Three-byte sequences
 258         {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
 259         {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 260         {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 261         {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 262 
 263         {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
 264         {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
 265         {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
 266         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 267         {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
 268         {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
 269 
 270         // Four-byte sequences
 271         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 272         {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 273         {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+007F zero-padded
 274         {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+07FF zero-padded
 275 
 276         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 277         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
 278         {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 279         {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
 280         {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid third byte
 281 
 282         {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 283         {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
 284         {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid forth byte
 285         {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 286         {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 287         {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 288 
 289         // Five-byte sequences
 290         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
 291         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 292         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 293         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 294         {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 295 
 296         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
 297         {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
 298         {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
 299         {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
 300 
 301         // Six-byte sequences
 302         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 303         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 304         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 305         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 306         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
 307         {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
 308         {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
 309         {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
 310         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
 311     };
 312 
 313     static void checkMalformed(String csn) throws Exception {
 314         boolean failed = false;
 315         System.out.printf("    Check malformed <%s>...%n", csn);
 316         Charset cs = Charset.forName(csn);
 317         for (boolean direct: new boolean[] {false, true}) {
 318             for (byte[] bins : malformed) {
 319                 int mlen = bins[0];
 320                 byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
 321                 CoderResult cr = decodeCR(bin, csn, direct);
 322                 String ashex = "";
 323                 for (int i = 0; i < bin.length; i++) {
 324                     if (i > 0) ashex += " ";
 325                         ashex += Integer.toBinaryString((int)bin[i] & 0xff);
 326                 }
 327                 if (!cr.isMalformed()) {
 328                     System.out.printf("        FAIL(direct=%b): [%s] not malformed.%n", direct, ashex);
 329                     failed = true;
 330                 } else if (cr.length() != mlen) {
 331                     System.out.printf("        FAIL(direct=%b): [%s] malformed[len=%d].%n", direct, ashex, cr.length());
 332                     failed = true;
 333                 }
 334                 if (!Arrays.equals(decode(cs, bin, 0, bin.length),
 335                                    new String(bin, csn).toCharArray())) {
 336                     System.out.printf("        FAIL(new String(bb, %s)) failed%n", csn);
 337                     failed = true;
 338                 }
 339             }
 340         }
 341         if (failed)
 342             throw new RuntimeException("Check malformed failed " + csn);
 343     }
 344 
 345     static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
 346         int inPos = flow[0];
 347         int inLen = flow[1];
 348         int outPos = flow[2];
 349         int outLen = flow[3];
 350         int expedInPos = flow[4];
 351         int expedOutPos = flow[5];
 352         CoderResult expedCR = (flow[6]==0)?CoderResult.UNDERFLOW
 353                                           :CoderResult.OVERFLOW;
 354         ByteBuffer bbf;
 355         CharBuffer cbf;
 356         if (direct) {
 357             bbf = ByteBuffer.allocateDirect(inPos + utf8s.length);
 358             cbf = ByteBuffer.allocateDirect((outPos + outLen)*2).asCharBuffer();
 359         } else {
 360             bbf = ByteBuffer.allocate(inPos + utf8s.length);
 361             cbf = CharBuffer.allocate(outPos + outLen);
 362         }
 363         bbf.position(inPos);
 364         bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
 365         cbf.position(outPos);
 366         dec.reset();
 367         CoderResult cr = dec.decode(bbf, cbf, false);
 368         if (cr != expedCR ||
 369             bbf.position() != expedInPos ||
 370             cbf.position() != expedOutPos) {
 371             System.out.printf("Expected(direct=%5b): [", direct);
 372             for (int i:flow) System.out.print(" " + i);
 373             System.out.println("]  CR=" + cr +
 374                                ", inPos=" + bbf.position() +
 375                                ", outPos=" + cbf.position());
 376             return false;
 377         }
 378         return true;
 379     }
 380 
 381     static void checkUnderOverflow(String csn) throws Exception {
 382         System.out.printf("    Check under/overflow <%s>...%n", csn);
 383         CharsetDecoder dec = Charset.forName(csn).newDecoder();
 384         boolean failed = false;
 385         byte[] utf8s = new String("\u007f\u07ff\ue000\ud800\udc00").getBytes("UTF-8");
 386         int    inlen = utf8s.length;
 387 
 388         for (int inoff = 0; inoff < 20; inoff++) {
 389             for (int outoff = 0; outoff < 20; outoff++) {
 390         int[][] Flows = {
 391             //inpos, inLen, outPos,  outLen, inPosEP,   outposEP,   under(0)/over(1)
 392             {inoff,  inlen, outoff,  1,      inoff + 1, outoff + 1, 1},
 393             {inoff,  inlen, outoff,  2,      inoff + 3, outoff + 2, 1},
 394             {inoff,  inlen, outoff,  3,      inoff + 6, outoff + 3, 1},
 395             {inoff,  inlen, outoff,  4,      inoff + 6, outoff + 3, 1},
 396             {inoff,  inlen, outoff,  5,      inoff + 10,outoff + 5, 0},
 397              // underflow
 398             {inoff,  1,     outoff,  5,      inoff + 1, outoff + 1, 0},
 399             {inoff,  2,     outoff,  5,      inoff + 1, outoff + 1, 0},
 400             {inoff,  3,     outoff,  5,      inoff + 3, outoff + 2, 0},
 401             {inoff,  4,     outoff,  5,      inoff + 3, outoff + 2, 0},
 402             {inoff,  5,     outoff,  5,      inoff + 3, outoff + 2, 0},
 403             {inoff,  6,     outoff,  5,      inoff + 6, outoff + 3, 0},
 404             {inoff,  7,     outoff,  5,      inoff + 6, outoff + 3, 0},
 405             {inoff,  8,     outoff,  5,      inoff + 6, outoff + 3, 0},
 406             {inoff,  9,     outoff,  5,      inoff + 6, outoff + 3, 0},
 407             {inoff,  10,    outoff,  5,      inoff + 10,outoff + 5, 0},
 408              // 2-byte underflow/overflow
 409             {inoff,  2,     outoff,  1,      inoff + 1, outoff + 1, 0},
 410             {inoff,  3,     outoff,  1,      inoff + 1, outoff + 1, 1},
 411              // 3-byte underflow/overflow
 412             {inoff,  4,     outoff,  2,      inoff + 3, outoff + 2, 0},
 413             {inoff,  5,     outoff,  2,      inoff + 3, outoff + 2, 0},
 414             {inoff,  6,     outoff,  2,      inoff + 3, outoff + 2, 1},
 415              // 4-byte underflow/overflow
 416             {inoff,  7,     outoff,  4,      inoff + 6, outoff + 3, 0},
 417             {inoff,  8,     outoff,  4,      inoff + 6, outoff + 3, 0},
 418             {inoff,  9,     outoff,  4,      inoff + 6, outoff + 3, 0},
 419             {inoff,  10,    outoff,  4,      inoff + 6, outoff + 3, 1},
 420         };
 421         for (boolean direct: new boolean[] {false, true}) {
 422             for (int[] flow: Flows) {
 423                 if (!check(dec, utf8s, direct, flow))
 424                     failed = true;
 425             }
 426         }}}
 427         if (failed)
 428             throw new RuntimeException("Check under/overflow failed " + csn);
 429     }
 430 
 431     public static void main(String[] args) throws Exception {
 432         checkRoundtrip("UTF-8");
 433         check6ByteSurrs("UTF-8");
 434         //compare("UTF-8", "UTF-8-OLD");
 435         checkMalformed("UTF-8");
 436         checkUnderOverflow("UTF-8");
 437     }
 438 }