test/sun/nio/cs/TestUTF8.java

Print this page




   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /*
  25  * @test
  26  * @bug 4486841 7040220 7096080
  27  * @summary Test UTF-8 charset
  28  */
  29 
  30 import java.nio.charset.*;
  31 import java.nio.*;
  32 import java.util.*;
  33 
  34 public class TestUTF8 {
  35     static char[] decode(byte[] bb, String csn, boolean testDirect)
  36         throws Exception {
  37         CharsetDecoder dec = Charset.forName(csn).newDecoder();
  38         ByteBuffer bbf;
  39         CharBuffer cbf;
  40         if (testDirect) {
  41             bbf = ByteBuffer.allocateDirect(bb.length);
  42             cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
  43             bbf.put(bb).flip();
  44         } else {
  45             bbf = ByteBuffer.wrap(bb);
  46             cbf = CharBuffer.allocate(bb.length);


 274 
 275         // Two-byte sequences:
 276         {1, (byte)0xC0, (byte)0x80}, // invalid first byte
 277         {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
 278         {1, (byte)0xC2, (byte)0x00}, // invalid second byte
 279         {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
 280         {1, (byte)0xD0, (byte)0x00}, // invalid second byte
 281         {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
 282         {1, (byte)0xDF, (byte)0x00}, // invalid second byte
 283         {1, (byte)0xDF, (byte)0xC0}, // invalid second byte
 284 
 285         // Three-byte sequences
 286         {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
 287         {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 288         {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 289         {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 290 
 291         {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
 292         {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
 293         {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte


 294         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 295         {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
 296         {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
 297         {1, (byte)0xE0, (byte)0x41,},             // invalid second byte & 2 bytes

 298         {3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate
 299         {3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate
 300 
 301 

 302         // Four-byte sequences
 303         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 304         {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 305         {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 306         {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 307 
 308         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 309         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
 310         {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 311         {1, (byte)0xF0, (byte)41 },                           // invalid second byte
 312                                                               // & only 2 bytes
 313 
 314         {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
 315         {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
 316         {2, (byte)0xF0, (byte)0x90, (byte)0x41 },             // invalid third byte
 317                                                               // & 3 bytes input
 318 
 319         {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 320         {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
 321         {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
 322         {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 323         {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 324         {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 325 


























 326         // Five-byte sequences
 327         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
 328         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 329         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 330         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 331         {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 332 
 333         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
 334         {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
 335         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
 336         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
 337 
 338         // Six-byte sequences
 339         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 340         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 341         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 342         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 343         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
 344         {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
 345         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },


 536             {inoff,  7,     outoff,  4,      inoff + 6, outoff + 3, 0},
 537             {inoff,  8,     outoff,  4,      inoff + 6, outoff + 3, 0},
 538             {inoff,  9,     outoff,  4,      inoff + 6, outoff + 3, 0},
 539             {inoff,  10,    outoff,  4,      inoff + 6, outoff + 3, 1},
 540         };
 541         for (boolean direct: new boolean[] {false, true}) {
 542             for (int[] flow: Flows) {
 543                 if (!check(dec, utf8s, direct, flow))
 544                     failed = true;
 545             }
 546         }}}
 547         if (failed)
 548             throw new RuntimeException("Check under/overflow failed " + csn);
 549     }
 550 
 551     public static void main(String[] args) throws Exception { // test entry point; args is not read
 552         checkRoundtrip("UTF-8");                    // UTF-8 pass starts here
 553         check4ByteSurrs("UTF-8");                   // UTF-8 uses check4ByteSurrs; CESU-8 below uses check6ByteSurrs
 554         checkMalformed("UTF-8", malformed);         // malformed-input check driven by `malformed`
 555         checkUnderOverflow("UTF-8");                // under/overflow check is invoked for UTF-8 only
 556
 557         checkRoundtrip("CESU-8");                   // CESU-8 pass starts here
 558         check6ByteSurrs("CESU-8");
 559         checkMalformed("CESU-8", malformed_cesu8);  // CESU-8 gets its own data: `malformed_cesu8`
 560     } // no try/catch above: an exception from any check propagates out of main
 561 }


   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /*
  25  * @test
  26  * @bug 4486841 7040220 7096080 8039751
  27  * @summary Test UTF-8 charset
  28  */
  29 
  30 import java.nio.charset.*;
  31 import java.nio.*;
  32 import java.util.*;
  33 
  34 public class TestUTF8 {
  35     static char[] decode(byte[] bb, String csn, boolean testDirect)
  36         throws Exception {
  37         CharsetDecoder dec = Charset.forName(csn).newDecoder();
  38         ByteBuffer bbf;
  39         CharBuffer cbf;
  40         if (testDirect) {
  41             bbf = ByteBuffer.allocateDirect(bb.length);
  42             cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer();
  43             bbf.put(bb).flip();
  44         } else {
  45             bbf = ByteBuffer.wrap(bb);
  46             cbf = CharBuffer.allocate(bb.length);


 274 
 275         // Two-byte sequences:
 276         {1, (byte)0xC0, (byte)0x80}, // invalid first byte
 277         {1, (byte)0xC1, (byte)0xBF}, // invalid first byte
 278         {1, (byte)0xC2, (byte)0x00}, // invalid second byte
 279         {1, (byte)0xC2, (byte)0xC0}, // invalid second byte
 280         {1, (byte)0xD0, (byte)0x00}, // invalid second byte
 281         {1, (byte)0xD0, (byte)0xC0}, // invalid second byte
 282         {1, (byte)0xDF, (byte)0x00}, // invalid second byte
 283         {1, (byte)0xDF, (byte)0xC0}, // invalid second byte
 284 
 285         // Three-byte sequences
 286         {1, (byte)0xE0, (byte)0x80, (byte)0x80},  // 111x first byte first nibble
 287         {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 288         {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 289         {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 290 
 291         {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte
 292         {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte
 293         {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte
 294         {2, (byte)0xE1, (byte)0x80, (byte)0x42},  // invalid third byte
 295 
 296         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 297         {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte
 298         {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte
 299         {1, (byte)0xE0, (byte)0x41,},             // invalid second byte & 2 bytes
 300         {1, (byte)0xE1, (byte)0x40,},             // invalid second byte & 2 bytes
 301         {3, (byte)0xED, (byte)0xAE, (byte)0x80 }, // 3 bytes surrogate
 302         {3, (byte)0xED, (byte)0xB0, (byte)0x80 }, // 3 bytes surrogate
 303 
 304 
 305 
 306         // Four-byte sequences
 307         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 308         {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 309         {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 310         {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 311 
 312         {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones
 313         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid second byte
 314         {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 315         {1, (byte)0xF0, (byte)41 },                           // invalid second byte
 316                                                               // & only 2 bytes
 317 
 318         {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte
 319         {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
 320         {2, (byte)0xF0, (byte)0x90, (byte)0x41 },             // invalid third byte
 321                                                               // & 3 bytes input
 322 
 323         {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte
 324         {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte
 325         {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid fourth byte
 326         {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 327         {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 328         {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte
 329 
 330         // #8039751
 331         {1, (byte)0xF6, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
 332         {1, (byte)0xF6, (byte)0x80, (byte)0x80,  },
 333         {1, (byte)0xF6, (byte)0x80, },
 334         {1, (byte)0xF6, },
 335         {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 1st byte
 336         {1, (byte)0xF5, (byte)0x80, (byte)0x80,  },
 337         {1, (byte)0xF5, (byte)0x80,  },
 338         {1, (byte)0xF5  },
 339 
 340         {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
 341         {1, (byte)0xF4, (byte)0x90, (byte)0x80 },
 342         {1, (byte)0xF4, (byte)0x90 },
 343 
 344         {1, (byte)0xF4, (byte)0x7f, (byte)0x80, (byte)0x80 }, // out-range/ascii 2nd byte
 345         {1, (byte)0xF4, (byte)0x7f, (byte)0x80 },
 346         {1, (byte)0xF4, (byte)0x7f },
 347 
 348         {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
 349         {1, (byte)0xF0, (byte)0x80, (byte)0x80 },
 350         {1, (byte)0xF0, (byte)0x80 },
 351 
 352         {1, (byte)0xF0, (byte)0xc0, (byte)0x80, (byte)0x80 }, // out-range 2nd byte
 353         {1, (byte)0xF0, (byte)0xc0, (byte)0x80 },
 354         {1, (byte)0xF0, (byte)0xc0 },
 355 
 356         // Five-byte sequences
 357         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80},  // invalid first byte
 358         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 359         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 360         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 361         {1, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 362 
 363         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80},
 364         {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 },
 365         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF },
 366         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 },
 367 
 368         // Six-byte sequences
 369         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded
 370         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded
 371         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded
 372         {1, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded
 373         {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 },
 374         {1, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 },
 375         {1, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 },


 566             {inoff,  7,     outoff,  4,      inoff + 6, outoff + 3, 0},
 567             {inoff,  8,     outoff,  4,      inoff + 6, outoff + 3, 0},
 568             {inoff,  9,     outoff,  4,      inoff + 6, outoff + 3, 0},
 569             {inoff,  10,    outoff,  4,      inoff + 6, outoff + 3, 1},
 570         };
 571         for (boolean direct: new boolean[] {false, true}) {
 572             for (int[] flow: Flows) {
 573                 if (!check(dec, utf8s, direct, flow))
 574                     failed = true;
 575             }
 576         }}}
 577         if (failed)
 578             throw new RuntimeException("Check under/overflow failed " + csn);
 579     }
 580 
 581     public static void main(String[] args) throws Exception { // test entry point; args is not read
 582         checkRoundtrip("UTF-8");                    // UTF-8 pass starts here
 583         check4ByteSurrs("UTF-8");                   // UTF-8 uses check4ByteSurrs; CESU-8 below uses check6ByteSurrs
 584         checkMalformed("UTF-8", malformed);         // malformed-input check driven by `malformed`
 585         checkUnderOverflow("UTF-8");                // under/overflow check is invoked for UTF-8 only

 586         checkRoundtrip("CESU-8");                   // CESU-8 pass starts here
 587         check6ByteSurrs("CESU-8");
 588         checkMalformed("CESU-8", malformed_cesu8);  // CESU-8 gets its own data: `malformed_cesu8`
 589     } // no try/catch above: an exception from any check propagates out of main
 590 }