1 /*
   2  * Copyright (c) 2008, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /*
  25  * @test
  26  * @bug 4486841
  27  * @summary Test UTF-8 charset
  28  */
  29 
  30 import java.nio.charset.*;
  31 import java.nio.*;
  32 import java.util.*;
  33 
  34 public class TestUTF8 {
  35     static char[] decode(byte[] bb, String csn, boolean testDirect)
  36         throws Exception {
  37         CharsetDecoder dec = Charset.forName(csn).newDecoder();
  38         ByteBuffer bbf;
  39         CharBuffer cbf;
  40         if (testDirect) {
  41             bbf = ByteBuffer.allocateDirect(bb.length);
  42             cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
  43             bbf.put(bb).flip();
  44         } else {
  45             bbf = ByteBuffer.wrap(bb);
  46             cbf = CharBuffer.allocate(bb.length);
  47         }
  48         CoderResult cr = dec.decode(bbf, cbf, true);
  49         if (cr != CoderResult.UNDERFLOW)
  50             throw new RuntimeException("Decoding err: " + csn);
  51         char[] cc = new char[cbf.position()];
  52         cbf.flip(); cbf.get(cc);
  53         return cc;
  54 
  55     }
  56 
  57     static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect)
  58         throws Exception {
  59         CharsetDecoder dec = Charset.forName(csn).newDecoder();
  60         ByteBuffer bbf;
  61         CharBuffer cbf;
  62         if (testDirect) {
  63             bbf = ByteBuffer.allocateDirect(bb.length);
  64             cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
  65             bbf.put(bb).flip();
  66         } else {
  67             bbf = ByteBuffer.wrap(bb);
  68             cbf = CharBuffer.allocate(bb.length);
  69         }
  70         return dec.decode(bbf, cbf, true);
  71     }
  72 
  73     static byte[] encode(char[] cc, String csn, boolean testDirect)
  74         throws Exception {
  75         ByteBuffer bbf;
  76         CharBuffer cbf;
  77         CharsetEncoder enc = Charset.forName(csn).newEncoder();
  78         if (testDirect) {
  79             bbf = ByteBuffer.allocateDirect(cc.length * 4);
  80             cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
  81             cbf.put(cc).flip();
  82         } else {
  83             bbf = ByteBuffer.allocate(cc.length * 4);
  84             cbf = CharBuffer.wrap(cc);
  85         }
  86 
  87         CoderResult cr = enc.encode(cbf, bbf, true);
  88         if (cr != CoderResult.UNDERFLOW)
  89             throw new RuntimeException("Encoding err: " + csn);
  90         byte[] bb = new byte[bbf.position()];
  91         bbf.flip(); bbf.get(bb);
  92         return bb;
  93     }
  94 
  95     static CoderResult encodeCR(char[] cc, String csn, boolean testDirect)
  96         throws Exception {
  97         ByteBuffer bbf;
  98         CharBuffer cbf;
  99         CharsetEncoder enc = Charset.forName(csn).newEncoder();
 100         if (testDirect) {
 101             bbf = ByteBuffer.allocateDirect(cc.length * 4);
 102             cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer();
 103             cbf.put(cc).flip();
 104         } else {
 105             bbf = ByteBuffer.allocate(cc.length * 4);
 106             cbf = CharBuffer.wrap(cc);
 107         }
 108         return enc.encode(cbf, bbf, true);
 109     }
 110 
 111     static char[] getUTFChars() {
 112         char[] cc = new char[0x10000 - 0xe000 + 0xd800 + //bmp
 113                              (0x110000 - 0x10000) * 2];    //supp
 114         int pos = 0;
 115         int i = 0;
 116         for (i = 0; i < 0xd800; i++)
 117             cc[pos++] = (char)i;
 118         for (i = 0xe000; i < 0x10000; i++)
 119             cc[pos++] = (char)i;
 120         for (i = 0x10000; i < 0x110000; i++) {
 121             pos += Character.toChars(i, cc, pos);
 122         }
 123         return cc;
 124     }
 125 
 126     static int to3ByteUTF8(char c, byte[] bb, int pos) {
 127         bb[pos++] = (byte)(0xe0 | ((c >> 12)));
 128         bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f));
 129         bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f));
 130         return 3;
 131     }
 132 
 133     static void checkRoundtrip(String csn) throws Exception {
 134         System.out.printf("    Check roundtrip <%s>...", csn);
 135         char[] cc = getUTFChars();
 136         byte[] bb = encode(cc, csn, false);
 137         char[] ccO = decode(bb, csn, false);
 138 
 139         if (!Arrays.equals(cc, ccO)) {
 140             System.out.printf("    non-direct failed");
 141         }
 142         bb = encode(cc, csn, true);
 143         ccO = decode(bb, csn, true);
 144         if (!Arrays.equals(cc, ccO)) {
 145             System.out.printf("    (direct) failed");
 146         }
 147         System.out.println();
 148     }
 149 
 150     static void check6ByteSurrs(String csn) throws Exception {
 151         System.out.printf("    Check 6-byte Surrogates <%s>...%n", csn);
 152         byte[] bb = new byte[(0x110000 - 0x10000) * 6];
 153         char[] cc = new char[(0x110000 - 0x10000) * 2];
 154         int bpos = 0;
 155         int cpos = 0;
 156         for (int i = 0x10000; i < 0x110000; i++) {
 157             Character.toChars(i, cc, cpos);
 158             bpos += to3ByteUTF8(cc[cpos], bb, bpos);
 159             bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos);
 160             cpos += 2;
 161         }
 162 
 163         char[] ccO = decode(bb, csn, false);
 164         if (!Arrays.equals(cc, ccO)) {
 165             System.out.printf("    decoding failed%n");
 166         }
 167         ccO = decode(bb, csn, true);
 168         if (!Arrays.equals(cc, ccO)) {
 169             System.out.printf("    decoding(direct) failed%n");
 170         }
 171     }
 172 
 173     static void compare(String csn1, String csn2) throws Exception {
 174         System.out.printf("    Diff <%s> <%s>...%n", csn1, csn2);
 175         char[] cc = getUTFChars();
 176 
 177         byte[] bb1 = encode(cc, csn1, false);
 178         byte[] bb2 = encode(cc, csn2, false);
 179         if (!Arrays.equals(bb1, bb2))
 180             System.out.printf("        encoding failed%n");
 181         char[] cc1 = decode(bb1, csn1, false);
 182         char[] cc2 = decode(bb1, csn2, false);
 183         if (!Arrays.equals(cc1, cc2)) {
 184             System.out.printf("        decoding failed%n");
 185         }
 186 
 187         bb1 = encode(cc, csn1, true);
 188         bb2 = encode(cc, csn2, true);
 189         if (!Arrays.equals(bb1, bb2))
 190             System.out.printf("        encoding (direct) failed%n");
 191         cc1 = decode(bb1, csn1, true);
 192         cc2 = decode(bb1, csn2, true);
 193         if (!Arrays.equals(cc1, cc2)) {
 194             System.out.printf("        decoding (direct) failed%n");
 195         }
 196     }
 197 
 198     // The first byte is the length of malformed bytes
 199     static byte[][] malformed = {
 200         // One-byte sequences:
 201         {1, (byte)0xFF },
 202         {1, (byte)0xC0 },
 203         {1, (byte)0x80 },
 204 
 205         {1, (byte)0xFF, (byte)0xFF}, // all ones
 206         {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble
 207 
 208         // Two-byte sequences:
 209         {1, (byte)0xC0, (byte)0x80}, // invalid first byte
 210         {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
 211         {1, (byte)0xC2, (byte)0x00}, // invalid second byte
 212         {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
 213         {1, (byte)0xD0, (byte)0x00}, // invalid second byte
 214         {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
 215         {1, (byte)0xDF, (byte)0x00}, // invalid second byte
 216         {1, (byte)0xDF, (byte)0xC0}, // invalid second byte
 217 
 218         // Three-byte sequences
 219         {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
 220         {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 221         {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 222         {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 223 
 224         {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
 225         {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
 226         {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
 227         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 228         {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
 229         {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
 230 
 231         // Four-byte sequences
 232         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 233         {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 234         {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+007F zero-padded
 235         {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+07FF zero-padded
 236 
 237         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 238         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
 239         {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 240         {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
 241         {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid third byte
 242 
 243         {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 244         {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
 245         {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid forth byte
 246         {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 247         {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 248         {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 249 
 250         // Five-byte sequences
 251         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
 252         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 253         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 254         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 255         {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 256 
 257         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
 258         {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
 259         {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
 260         {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
 261 
 262         // Six-byte sequences
 263         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 264         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 265         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 266         {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 267         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
 268         {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
 269         {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },
 270         {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 },
 271         {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 },
 272     };
 273 
 274     static void checkMalformed(String csn) throws Exception {
 275         boolean failed = false;
 276         System.out.printf("    Check malformed <%s>...%n", csn);
 277         for (boolean direct: new boolean[] {false, true}) {
 278             for (byte[] bins : malformed) {
 279                 int mlen = bins[0];
 280                 byte[] bin = Arrays.copyOfRange(bins, 1, bins.length);
 281                 CoderResult cr = decodeCR(bin, csn, direct);
 282                 String ashex = "";
 283                 for (int i = 0; i < bin.length; i++) {
 284                     if (i > 0) ashex += " ";
 285                         ashex += Integer.toBinaryString((int)bin[i] & 0xff);
 286                 }
 287                 if (!cr.isMalformed()) {
 288                     System.out.printf("        FAIL(direct=%b): [%s] not malformed.\n", direct, ashex);
 289                     failed = true;
 290                 } else if (cr.length() != mlen) {
 291                     System.out.printf("        FAIL(direct=%b): [%s] malformed[len=%d].\n", direct, ashex, cr.length());
 292                     failed = true;
 293                 }
 294             }
 295         }
 296         if (failed)
 297             throw new RuntimeException("Check malformed failed " + csn);
 298     }
 299 
 300     static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) {
 301         int inPos = flow[0];
 302         int inLen = flow[1];
 303         int outPos = flow[2];
 304         int outLen = flow[3];
 305         int expedInPos = flow[4];
 306         int expedOutPos = flow[5];
 307         CoderResult expedCR = (flow[6]==0)?CoderResult.UNDERFLOW
 308                                           :CoderResult.OVERFLOW;
 309         ByteBuffer bbf;
 310         CharBuffer cbf;
 311         if (direct) {
 312             bbf = ByteBuffer.allocateDirect(inPos + utf8s.length);
 313             cbf = ByteBuffer.allocateDirect((outPos + outLen)*2).asCharBuffer();
 314         } else {
 315             bbf = ByteBuffer.allocate(inPos + utf8s.length);
 316             cbf = CharBuffer.allocate(outPos + outLen);
 317         }
 318         bbf.position(inPos);
 319         bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen);
 320         cbf.position(outPos);
 321         dec.reset();
 322         CoderResult cr = dec.decode(bbf, cbf, false);
 323         if (cr != expedCR ||
 324             bbf.position() != expedInPos ||
 325             cbf.position() != expedOutPos) {
 326             System.out.printf("Expected(direct=%5b): [", direct);
 327             for (int i:flow) System.out.print(" " + i);
 328             System.out.println("]  CR=" + cr +
 329                                ", inPos=" + bbf.position() +
 330                                ", outPos=" + cbf.position());
 331             return false;
 332         }
 333         return true;
 334     }
 335 
 336     static void checkUnderOverflow(String csn) throws Exception {
 337         System.out.printf("    Check under/overflow <%s>...%n", csn);
 338         CharsetDecoder dec = Charset.forName(csn).newDecoder();
 339         boolean failed = false;
 340         byte[] utf8s = new String("\u007f\u07ff\ue000\ud800\udc00").getBytes("UTF-8");
 341         int    inlen = utf8s.length;
 342 
 343         for (int inoff = 0; inoff < 20; inoff++) {
 344             for (int outoff = 0; outoff < 20; outoff++) {
 345         int[][] Flows = {
 346             //inpos, inLen, outPos,  outLen, inPosEP,   outposEP,   under(0)/over(1)
 347             {inoff,  inlen, outoff,  1,      inoff + 1, outoff + 1, 1},
 348             {inoff,  inlen, outoff,  2,      inoff + 3, outoff + 2, 1},
 349             {inoff,  inlen, outoff,  3,      inoff + 6, outoff + 3, 1},
 350             {inoff,  inlen, outoff,  4,      inoff + 6, outoff + 3, 1},
 351             {inoff,  inlen, outoff,  5,      inoff + 10,outoff + 5, 0},
 352              // underflow
 353             {inoff,  1,     outoff,  5,      inoff + 1, outoff + 1, 0},
 354             {inoff,  2,     outoff,  5,      inoff + 1, outoff + 1, 0},
 355             {inoff,  3,     outoff,  5,      inoff + 3, outoff + 2, 0},
 356             {inoff,  4,     outoff,  5,      inoff + 3, outoff + 2, 0},
 357             {inoff,  5,     outoff,  5,      inoff + 3, outoff + 2, 0},
 358             {inoff,  6,     outoff,  5,      inoff + 6, outoff + 3, 0},
 359             {inoff,  7,     outoff,  5,      inoff + 6, outoff + 3, 0},
 360             {inoff,  8,     outoff,  5,      inoff + 6, outoff + 3, 0},
 361             {inoff,  9,     outoff,  5,      inoff + 6, outoff + 3, 0},
 362             {inoff,  10,    outoff,  5,      inoff + 10,outoff + 5, 0},
 363              // 2-byte underflow/overflow
 364             {inoff,  2,     outoff,  1,      inoff + 1, outoff + 1, 0},
 365             {inoff,  3,     outoff,  1,      inoff + 1, outoff + 1, 1},
 366              // 3-byte underflow/overflow
 367             {inoff,  4,     outoff,  2,      inoff + 3, outoff + 2, 0},
 368             {inoff,  5,     outoff,  2,      inoff + 3, outoff + 2, 0},
 369             {inoff,  6,     outoff,  2,      inoff + 3, outoff + 2, 1},
 370              // 4-byte underflow/overflow
 371             {inoff,  7,     outoff,  4,      inoff + 6, outoff + 3, 0},
 372             {inoff,  8,     outoff,  4,      inoff + 6, outoff + 3, 0},
 373             {inoff,  9,     outoff,  4,      inoff + 6, outoff + 3, 0},
 374             {inoff,  10,    outoff,  4,      inoff + 6, outoff + 3, 1},
 375         };
 376         for (boolean direct: new boolean[] {false, true}) {
 377             for (int[] flow: Flows) {
 378                 if (!check(dec, utf8s, direct, flow))
 379                     failed = true;
 380             }
 381         }}}
 382         if (failed)
 383             throw new RuntimeException("Check under/overflow failed " + csn);
 384     }
 385 
 386     public static void main(String[] args) throws Exception {
 387         checkRoundtrip("UTF-8");
 388         check6ByteSurrs("UTF-8");
 389         //compare("UTF-8", "UTF-8-OLD");
 390         checkMalformed("UTF-8");
 391         checkUnderOverflow("UTF-8");
 392     }
 393 }