src/share/classes/sun/nio/cs/UTF_8.java

Print this page




  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package sun.nio.cs;
  27 
  28 import java.nio.Buffer;
  29 import java.nio.ByteBuffer;
  30 import java.nio.CharBuffer;
  31 import java.nio.charset.Charset;
  32 import java.nio.charset.CharsetDecoder;
  33 import java.nio.charset.CharsetEncoder;
  34 import java.nio.charset.CoderResult;

  35 
  36 /* Legal UTF-8 Byte Sequences
  37  *
  38  * #    Code Points      Bits   Bit/Byte pattern
  39  * 1                     7      0xxxxxxx
  40  *      U+0000..U+007F          00..7F
  41  *
  42  * 2                     11     110xxxxx    10xxxxxx
  43  *      U+0080..U+07FF          C2..DF      80..BF
  44  *
  45  * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
  46  *      U+0800..U+0FFF          E0          A0..BF      80..BF
  47  *      U+1000..U+FFFF          E1..EF      80..BF      80..BF
  48  *
  49  * 4                     21     11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
  50  *     U+10000..U+3FFFF         F0          90..BF      80..BF      80..BF
  51  *     U+40000..U+FFFFF         F1..F3      80..BF      80..BF      80..BF
  52  *    U+100000..U+10FFFF        F4          80..8F      80..BF      80..BF
  53  *
  54  */


  60     }
  61 
  62     public String historicalName() {
             // Fixed legacy-style name: "UTF8", written without the hyphen.
  63         return "UTF8";
  64     }
  65 
  66     public CharsetDecoder newDecoder() {
             // Every call hands back a new Decoder instance tied to this charset.
  67         return new Decoder(this);
  68     }
  69 
  70     public CharsetEncoder newEncoder() {
             // Every call hands back a new Encoder instance tied to this charset.
  71         return new Encoder(this);
  72     }
  73 
  74     static final void updatePositions(Buffer src, int sp,
  75                                       Buffer dst, int dp) {
             // sp and dp are treated as indices into the buffers' backing arrays:
             // subtracting arrayOffset() converts each one into a buffer position
             // before it is stored. Buffer.arrayOffset() is only usable on buffers
             // that expose an accessible backing array.
  76         src.position(sp - src.arrayOffset());
  77         dst.position(dp - dst.arrayOffset());
  78     }
  79 
  80     private static class Decoder extends CharsetDecoder {

  81         private Decoder(Charset cs) {
                 // CharsetDecoder(cs, averageCharsPerByte, maxCharsPerByte): both
                 // ratios are set to 1.0.
  82             super(cs, 1.0f, 1.0f);
  83         }
  84 
  85         private static boolean isNotContinuation(int b) {
                 // A continuation byte has the form 10xxxxxx; this is true for
                 // every other bit pattern in the low eight bits of b.
  86             return (b & 0xc0) != 0x80;
  87         }
  88 
  89         //  [C2..DF] [80..BF]
  90         private static boolean isMalformed2(int b1, int b2) {
                 // For a 110xxxxx lead byte, (b1 & 0x1e) == 0 means every payload bit
                 // above the lowest one is clear, i.e. the lead is C0 or C1, which is
                 // outside the legal C2..DF range. The second test requires b2 to
                 // be a 10xxxxxx continuation byte.
                 // Assumes the caller has already established that b1 is 110xxxxx.
  91             return (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;
  92         }
  93 
  94         //  [E0]     [A0..BF] [80..BF]
  95         //  [E1..EF] [80..BF] [80..BF]
  96         private static boolean isMalformed3(int b1, int b2, int b3) {
                 // b1 is compared with (byte)0xe0, so it is expected to hold a
                 // sign-extended byte value. A lead of E0 followed by 80..9F is
                 // rejected because only A0..BF is allowed after E0 (see the byte
                 // ranges in the comment above this method); b2 and b3 must both
                 // be 10xxxxxx continuation bytes.
                 // NOTE(review): nothing in this predicate rejects ED A0..BF
                 // (U+D800..U+DFFF) - confirm whether surrogates are screened
                 // elsewhere.
  97             return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
  98                    (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
  99         }
 100 


 336                         return malformed(src, mark, 4);
 337                     }
 338                     dst.put(Character.highSurrogate(uc));
 339                     dst.put(Character.lowSurrogate(uc));
 340                     mark += 4;
 341                 } else {
 342                     return malformed(src, mark, 1);
 343                 }
 344             }
 345             return xflow(src, mark, 0);
 346         }
 347 
 348         protected CoderResult decodeLoop(ByteBuffer src,
 349                                          CharBuffer dst)
 350         {
                 // The array-based loop is chosen only when both buffers expose a
                 // backing array; any other combination goes to the buffer-based loop.
 351             if (src.hasArray() && dst.hasArray())
 352                 return decodeArrayLoop(src, dst);
 353             else
 354                 return decodeBufferLoop(src, dst);
 355         }


























































































































 356     }
 357 
 358     private static class Encoder extends CharsetEncoder {

 359 
 360         private Encoder(Charset cs) {
                 // CharsetEncoder(cs, averageBytesPerChar, maxBytesPerChar): an
                 // average of 1.1 and a maximum of 3.0 bytes per char.
 361             super(cs, 1.1f, 3.0f);
 362         }
 363 
 364         public boolean canEncode(char c) {
                 // A lone char is reported as encodable unless it is a surrogate
                 // code unit.
 365             return !Character.isSurrogate(c);
 366         }
 367 
 368         public boolean isLegalReplacement(byte[] repl) {
                 // A single byte with a non-negative value (0x00..0x7F) is accepted
                 // immediately; every other replacement is left to the superclass
                 // check.
 369             return ((repl.length == 1 && repl[0] >= 0) ||
 370                     super.isLegalReplacement(repl));
 371         }
 372 
 373         private static CoderResult overflow(CharBuffer src, int sp,
 374                                             ByteBuffer dst, int dp) {
                 // Store the progress made so far (sp/dp are backing-array indices,
                 // converted by updatePositions) and then report OVERFLOW.
 375             updatePositions(src, sp, dst, dp);
 376             return CoderResult.OVERFLOW;
 377         }
 378 


 478                     if (dst.remaining() < 3)
 479                         return overflow(src, mark);
 480                     dst.put((byte)(0xe0 | ((c >> 12))));
 481                     dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
 482                     dst.put((byte)(0x80 | (c & 0x3f)));
 483                 }
 484                 mark++;
 485             }
 486             src.position(mark);
 487             return CoderResult.UNDERFLOW;
 488         }
 489 
 490         protected final CoderResult encodeLoop(CharBuffer src,
 491                                                ByteBuffer dst)
 492         {
                 // The array-based loop is chosen only when both buffers expose a
                 // backing array; any other combination goes to the buffer-based loop.
 493             if (src.hasArray() && dst.hasArray())
 494                 return encodeArrayLoop(src, dst);
 495             else
 496                 return encodeBufferLoop(src, dst);
 497         }













































 498     }
 499 }


  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package sun.nio.cs;
  27 
  28 import java.nio.Buffer;
  29 import java.nio.ByteBuffer;
  30 import java.nio.CharBuffer;
  31 import java.nio.charset.Charset;
  32 import java.nio.charset.CharsetDecoder;
  33 import java.nio.charset.CharsetEncoder;
  34 import java.nio.charset.CoderResult;
  35 import java.nio.charset.CodingErrorAction;
  36 
  37 /* Legal UTF-8 Byte Sequences
  38  *
  39  * #    Code Points      Bits   Bit/Byte pattern
  40  * 1                     7      0xxxxxxx
  41  *      U+0000..U+007F          00..7F
  42  *
  43  * 2                     11     110xxxxx    10xxxxxx
  44  *      U+0080..U+07FF          C2..DF      80..BF
  45  *
  46  * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
  47  *      U+0800..U+0FFF          E0          A0..BF      80..BF
  48  *      U+1000..U+FFFF          E1..EF      80..BF      80..BF
  49  *
  50  * 4                     21     11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
  51  *     U+10000..U+3FFFF         F0          90..BF      80..BF      80..BF
  52  *     U+40000..U+FFFFF         F1..F3      80..BF      80..BF      80..BF
  53  *    U+100000..U+10FFFF        F4          80..8F      80..BF      80..BF
  54  *
  55  */


  61     }
  62 
  63     public String historicalName() {
             // Fixed legacy-style name: "UTF8", written without the hyphen.
  64         return "UTF8";
  65     }
  66 
  67     public CharsetDecoder newDecoder() {
             // Every call hands back a new Decoder instance tied to this charset.
  68         return new Decoder(this);
  69     }
  70 
  71     public CharsetEncoder newEncoder() {
             // Every call hands back a new Encoder instance tied to this charset.
  72         return new Encoder(this);
  73     }
  74 
  75     static final void updatePositions(Buffer src, int sp,
  76                                       Buffer dst, int dp) {
             // sp and dp are treated as indices into the buffers' backing arrays:
             // subtracting arrayOffset() converts each one into a buffer position
             // before it is stored. Buffer.arrayOffset() is only usable on buffers
             // that expose an accessible backing array.
  77         src.position(sp - src.arrayOffset());
  78         dst.position(dp - dst.arrayOffset());
  79     }
  80 
  81     private static class Decoder extends CharsetDecoder
  82                                  implements ArrayDecoder {
  83         private Decoder(Charset cs) {
                 // CharsetDecoder(cs, averageCharsPerByte, maxCharsPerByte): both
                 // ratios are set to 1.0.
  84             super(cs, 1.0f, 1.0f);
  85         }
  86 
  87         private static boolean isNotContinuation(int b) {
                 // A continuation byte has the form 10xxxxxx; this is true for
                 // every other bit pattern in the low eight bits of b.
  88             return (b & 0xc0) != 0x80;
  89         }
  90 
  91         //  [C2..DF] [80..BF]
  92         private static boolean isMalformed2(int b1, int b2) {
                 // For a 110xxxxx lead byte, (b1 & 0x1e) == 0 means every payload bit
                 // above the lowest one is clear, i.e. the lead is C0 or C1, which is
                 // outside the legal C2..DF range. The second test requires b2 to
                 // be a 10xxxxxx continuation byte.
                 // Assumes the caller has already established that b1 is 110xxxxx
                 // (decode() does so with its (b1 >> 5) == -2 test).
  93             return (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;
  94         }
  95 
  96         //  [E0]     [A0..BF] [80..BF]
  97         //  [E1..EF] [80..BF] [80..BF]
  98         private static boolean isMalformed3(int b1, int b2, int b3) {
                 // b1 is compared with (byte)0xe0, so it is expected to hold a
                 // sign-extended byte value. A lead of E0 followed by 80..9F is
                 // rejected because only A0..BF is allowed after E0 (see the byte
                 // ranges in the comment above this method); b2 and b3 must both
                 // be 10xxxxxx continuation bytes.
                 // NOTE(review): nothing in this predicate rejects ED A0..BF
                 // (U+D800..U+DFFF) - confirm whether surrogates are screened
                 // elsewhere.
  99             return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 100                    (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
 101         }
 102 


 338                         return malformed(src, mark, 4);
 339                     }
 340                     dst.put(Character.highSurrogate(uc));
 341                     dst.put(Character.lowSurrogate(uc));
 342                     mark += 4;
 343                 } else {
 344                     return malformed(src, mark, 1);
 345                 }
 346             }
 347             return xflow(src, mark, 0);
 348         }
 349 
 350         protected CoderResult decodeLoop(ByteBuffer src,
 351                                          CharBuffer dst)
 352         {
                 // The array-based loop is chosen only when both buffers expose a
                 // backing array; any other combination goes to the buffer-based loop.
 353             if (src.hasArray() && dst.hasArray())
 354                 return decodeArrayLoop(src, dst);
 355             else
 356                 return decodeBufferLoop(src, dst);
 357         }
 358 
 359         private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp)
 360         {
                 // Creates the wrapper over the whole of ba only on the first call
                 // (bb == null); later calls reuse the buffer passed in. Either way
                 // the buffer is positioned at sp and returned. ByteBuffer.wrap(ba)
                 // sets the limit to ba.length, not to the end of the region that
                 // the caller is decoding.
 361             if (bb == null)
 362                 bb = ByteBuffer.wrap(ba);
 363             bb.position(sp);
 364             return bb;
 365         }
 366 
 367         // returns -1 if there is malformed byte(s) and the
 368         // "action" for malformed input is not REPLACE.
 369         public int decode(byte[] sa, int sp, int len, char[] da) {
                 // Decodes the len bytes starting at sa[sp] into da, filling da from
                 // index 0, and returns the number of chars stored.
                 // NOTE(review): outside the ASCII fast path, writes to da are not
                 // bounds-checked here; presumably callers size da for the worst
                 // case - confirm.
 370             final int sl = sp + len;
 371             int dp = 0;
 372             int dlASCII = Math.min(len, da.length);
 373             ByteBuffer bb = null;  // only necessary if malformed
 374 
 375             // ASCII only optimized loop
                 // (dlASCII caps the run at both the requested length and da.length)
 376             while (dp < dlASCII && sa[sp] >= 0)
 377                 da[dp++] = (char) sa[sp++];
 378 
 379             while (sp < sl) {
 380                 int b1 = sa[sp++];
 381                 if (b1 >= 0) {
 382                     // 1 byte, 7 bits: 0xxxxxxx
 383                     da[dp++] = (char) b1;
 384                 } else if ((b1 >> 5) == -2) {
                         // b1 is a sign-extended byte, so a 110xxxxx lead shifted
                         // right by 5 equals -2; the 3- and 4-byte branches below
                         // apply the same test with shifts of 4 and 3.
 385                     // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
 386                     if (sp < sl) {
 387                         int b2 = sa[sp++];
 388                         if (isMalformed2(b1, b2)) {
 389                             if (malformedInputAction() != CodingErrorAction.REPLACE)
 390                                 return -1;
                                 // only the first char of the replacement string is used
 391                             da[dp++] = replacement().charAt(0);
                                 // step back so b2 is examined again as a lead byte
 392                             sp--;            // malformedN(bb, 2) always returns 1
 393                         } else {
                                 // XOR with the pre-shifted 0xC0/0x80 markers clears the
                                 // lead/continuation tag bits and the sign extension of
                                 // both bytes in a single step
 394                             da[dp++] = (char) (((b1 << 6) ^ b2)^
 395                                            (((byte) 0xC0 << 6) ^
 396                                             ((byte) 0x80 << 0)));
 397                         }
 398                         continue;
 399                     }
                         // input ends inside this sequence: fail, or emit a single
                         // replacement char for the truncated tail and stop
 400                     if (malformedInputAction() != CodingErrorAction.REPLACE)
 401                         return -1;
 402                     da[dp++] = replacement().charAt(0);
 403                     return dp;
 404                 } else if ((b1 >> 4) == -2) {
 405                     // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
 406                     if (sp + 1 < sl) {
 407                         int b2 = sa[sp++];
 408                         int b3 = sa[sp++];
 409                         if (isMalformed3(b1, b2, b3)) {
 410                             if (malformedInputAction() != CodingErrorAction.REPLACE)
 411                                 return -1;
 412                             da[dp++] = replacement().charAt(0);
                                 // rewind to the lead byte; malformedN (defined outside
                                 // this view) supplies the number of bytes to skip
 413                             sp -=3;
 414                             bb = getByteBuffer(bb, sa, sp);
 415                             sp += malformedN(bb, 3).length();
 416                         } else {
                                 // same XOR tag-stripping as the 2-byte case, with the
                                 // 0xE0 lead marker
 417                             da[dp++] = (char)((b1 << 12) ^
 418                                               (b2 <<  6) ^
 419                                               (b3 ^
 420                                               (((byte) 0xE0 << 12) ^
 421                                               ((byte) 0x80 <<  6) ^
 422                                               ((byte) 0x80 <<  0))));
 423                         }
 424                         continue;
 425                     }
                         // truncated 3-byte sequence at end of input
 426                     if (malformedInputAction() != CodingErrorAction.REPLACE)
 427                         return -1;
 428                     da[dp++] = replacement().charAt(0);
 429                     return dp;
 430                 } else if ((b1 >> 3) == -2) {
 431                     // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 432                     if (sp + 2 < sl) {
 433                         int b2 = sa[sp++];
 434                         int b3 = sa[sp++];
 435                         int b4 = sa[sp++];
                             // the code point is assembled before validation; it is
                             // only used if the checks below pass
 436                         int uc = ((b1 << 18) ^
 437                                   (b2 << 12) ^
 438                                   (b3 <<  6) ^
 439                                   (b4 ^
 440                                    (((byte) 0xF0 << 18) ^
 441                                    ((byte) 0x80 << 12) ^
 442                                    ((byte) 0x80 <<  6) ^
 443                                    ((byte) 0x80 <<  0))));
 444                         if (isMalformed4(b2, b3, b4) ||
 445                             // shortest form check
 446                             !Character.isSupplementaryCodePoint(uc)) {
 447                             if (malformedInputAction() != CodingErrorAction.REPLACE)
 448                                 return -1;
 449                             da[dp++] = replacement().charAt(0);
                                 // rewind to the lead byte and skip the length that
                                 // malformedN reports
 450                             sp -= 4;
 451                             bb = getByteBuffer(bb, sa, sp);
 452                             sp += malformedN(bb, 4).length();
 453                         } else {
                                 // a supplementary code point occupies two chars
 454                             da[dp++] = Character.highSurrogate(uc);
 455                             da[dp++] = Character.lowSurrogate(uc);
 456                         }
 457                         continue;
 458                     }
                         // truncated 4-byte sequence at end of input
 459                     if (malformedInputAction() != CodingErrorAction.REPLACE)
 460                         return -1;
 461                     da[dp++] = replacement().charAt(0);
 462                     return dp;
 463                 } else {
                         // b1 matched none of the lead-byte patterns tested above
 464                     if (malformedInputAction() != CodingErrorAction.REPLACE)
 465                         return -1;
 466                     da[dp++] = replacement().charAt(0);
 467                     sp--;
 468                     bb = getByteBuffer(bb, sa, sp);
 469                     CoderResult cr = malformedN(bb, 1);
 470                     if (!cr.isError()) {
 471                         // leading byte for 5 or 6-byte, but don't have enough
 472                         // bytes in buffer to check. Consumed rest as malformed.
 473                         return dp;
 474                     }
 475                     sp +=  cr.length();
 476                 }
 477             }
 478             return dp;
 479         }
 480     }
 481 
 482     private static class Encoder extends CharsetEncoder
 483                                  implements ArrayEncoder {
 484 
 485         private Encoder(Charset cs) {
                 // CharsetEncoder(cs, averageBytesPerChar, maxBytesPerChar): an
                 // average of 1.1 and a maximum of 3.0 bytes per char.
 486             super(cs, 1.1f, 3.0f);
 487         }
 488 
 489         public boolean canEncode(char c) {
                 // A lone char is reported as encodable unless it is a surrogate
                 // code unit.
 490             return !Character.isSurrogate(c);
 491         }
 492 
 493         public boolean isLegalReplacement(byte[] repl) {
                 // A single byte with a non-negative value (0x00..0x7F) is accepted
                 // immediately; every other replacement is left to the superclass
                 // check.
 494             return ((repl.length == 1 && repl[0] >= 0) ||
 495                     super.isLegalReplacement(repl));
 496         }
 497 
 498         private static CoderResult overflow(CharBuffer src, int sp,
 499                                             ByteBuffer dst, int dp) {
                 // Store the progress made so far (sp/dp are backing-array indices,
                 // converted by updatePositions) and then report OVERFLOW.
 500             updatePositions(src, sp, dst, dp);
 501             return CoderResult.OVERFLOW;
 502         }
 503 


 603                     if (dst.remaining() < 3)
 604                         return overflow(src, mark);
 605                     dst.put((byte)(0xe0 | ((c >> 12))));
 606                     dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
 607                     dst.put((byte)(0x80 | (c & 0x3f)));
 608                 }
 609                 mark++;
 610             }
 611             src.position(mark);
 612             return CoderResult.UNDERFLOW;
 613         }
 614 
 615         protected final CoderResult encodeLoop(CharBuffer src,
 616                                                ByteBuffer dst)
 617         {
                 // The array-based loop is chosen only when both buffers expose a
                 // backing array; any other combination goes to the buffer-based loop.
 618             if (src.hasArray() && dst.hasArray())
 619                 return encodeArrayLoop(src, dst);
 620             else
 621                 return encodeBufferLoop(src, dst);
 622         }
 623 
 624         // returns -1 if there is malformed char(s) and the
 625         // "action" for malformed input is not REPLACE.
 626         public int encode(char[] sa, int sp, int len, byte[] da) {
                 // Encodes the len chars starting at sa[sp] into da, filling da from
                 // index 0, and returns the number of bytes stored.
                 // NOTE(review): outside the ASCII fast path, writes to da are not
                 // bounds-checked here; presumably callers size da for the worst
                 // case - confirm.
 627             int sl = sp + len;
 628             int dp = 0;
 629             int dlASCII = dp + Math.min(len, da.length);
 630 
 631             // ASCII only optimized loop
                 // (dlASCII caps the run at both the requested length and da.length)
 632             while (dp < dlASCII && sa[sp] < '\u0080')
 633                 da[dp++] = (byte) sa[sp++];
 634 
 635             while (sp < sl) {
 636                 char c = sa[sp++];
 637                 if (c < 0x80) {
 638                     // Have at most seven bits
 639                     da[dp++] = (byte)c;
 640                 } else if (c < 0x800) {
 641                     // 2 bytes, 11 bits
 642                     da[dp++] = (byte)(0xc0 | (c >> 6));
 643                     da[dp++] = (byte)(0x80 | (c & 0x3f));
 644                 } else if (Character.isSurrogate(c)) {
                         // sgp is declared outside this method and is created on
                         // first use. parse() is given the index of c itself
                         // (sp - 1) together with the end of the input region.
 645                     if (sgp == null)
 646                         sgp = new Surrogate.Parser();
 647                     int uc = sgp.parse(c, sa, sp - 1, sl);
 648                     if (uc < 0) {
                             // a negative result is handled as malformed input; only
                             // the first byte of the replacement is written and only
                             // this one char is consumed
 649                         if (malformedInputAction() != CodingErrorAction.REPLACE)
 650                             return -1;
 651                         da[dp++] = replacement()[0];
 652                     } else {
                             // 4 bytes for the code point returned by parse()
 653                         da[dp++] = (byte)(0xf0 | ((uc >> 18)));
 654                         da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 655                         da[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 656                         da[dp++] = (byte)(0x80 | (uc & 0x3f));
 657                         sp++;  // 2 chars
 658                     }
 659                 } else {
 660                     // 3 bytes, 16 bits
 661                     da[dp++] = (byte)(0xe0 | ((c >> 12)));
 662                     da[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 663                     da[dp++] = (byte)(0x80 | (c & 0x3f));
 664                 }
 665             }
 666             return dp;
 667         }
 668     }
 669 }