1 /*
   2  * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package sun.nio.cs;
  27 
  28 import java.nio.Buffer;
  29 import java.nio.ByteBuffer;
  30 import java.nio.CharBuffer;
  31 import java.nio.charset.Charset;
  32 import java.nio.charset.CharsetDecoder;
  33 import java.nio.charset.CharsetEncoder;
  34 import java.nio.charset.CoderResult;
  35 import java.nio.charset.CodingErrorAction;
  36 
/* Legal UTF-8 Byte Sequences
 *
 * #    Code Points      Bits   Bit/Byte pattern
 * 1                     7      0xxxxxxx
 *      U+0000..U+007F          00..7F
 *
 * 2                     11     110xxxxx    10xxxxxx
 *      U+0080..U+07FF          C2..DF      80..BF
 *
 * 3                     16     1110xxxx    10xxxxxx    10xxxxxx
 *      U+0800..U+0FFF          E0          A0..BF      80..BF
 *      U+1000..U+FFFF          E1..EF      80..BF      80..BF
 *
 * 4                     21     11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
 *     U+10000..U+3FFFF         F0          90..BF      80..BF      80..BF
 *     U+40000..U+FFFFF         F1..F3      80..BF      80..BF      80..BF
 *    U+100000..U+10FFFF        F4          80..8F      80..BF      80..BF
 *
 */
  56 
  57 public final class UTF_8 extends Unicode {
  58     public UTF_8() {
  59         super("UTF-8", StandardCharsets.aliases_UTF_8());
  60     }
  61 
  62     public String historicalName() {
  63         return "UTF8";
  64     }
  65 
  66     public CharsetDecoder newDecoder() {
  67         return new Decoder(this);
  68     }
  69 
  70     public CharsetEncoder newEncoder() {
  71         return new Encoder(this);
  72     }
  73 
  74     static final void updatePositions(Buffer src, int sp,
  75                                               Buffer dst, int dp) {
  76         src.position(sp - src.arrayOffset());
  77         dst.position(dp - dst.arrayOffset());
  78     }
  79 
  80     private static class Decoder extends CharsetDecoder {
  81 
  82         private Decoder(Charset cs) {
  83             super(cs, 1.0f, 1.0f);
  84         }
  85 
  86         private static boolean isNotContinuation(int b) {
  87             return (b & 0xc0) != 0x80;
  88         }
  89 
  90         //  [E0]     [A0..BF] [80..BF]
  91         //  [E1..EF] [80..BF] [80..BF]
  92         private static boolean isMalformed3(int b1, int b2, int b3) {
  93             return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
  94                    (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
  95         }
  96 
  97         // only used when there is only one byte left in src buffer
  98         private static boolean isMalformed3_2(int b1, int b2) {
  99             return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 100                    (b2 & 0xc0) != 0x80;
 101         }
 102 
 103         //  [F0]     [90..BF] [80..BF] [80..BF]
 104         //  [F1..F3] [80..BF] [80..BF] [80..BF]
 105         //  [F4]     [80..8F] [80..BF] [80..BF]
 106         //  only check 80-be range here, the [0xf0,0x80...] and [0xf4,0x90-...]
 107         //  will be checked by Character.isSupplementaryCodePoint(uc)
 108         private static boolean isMalformed4(int b2, int b3, int b4) {
 109             return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
 110                    (b4 & 0xc0) != 0x80;
 111         }
 112 
 113         // only used when there is less than 4 bytes left in src buffer.
 114         // both b1 and b2 should be "& 0xff" before passed in.
 115         private static boolean isMalformed4_2(int b1, int b2) {
 116             return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
 117                    (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 118                    (b2 & 0xc0) != 0x80;
 119         }
 120 
 121         // tests if b1 and b2 are malformed as the first 2 bytes of a
 122         // legal`4-byte utf-8 byte sequence.
 123         // only used when there is less than 4 bytes left in src buffer,
 124         // after isMalformed4_2 has been invoked.
 125         private static boolean isMalformed4_3(int b3) {
 126             return (b3 & 0xc0) != 0x80;
 127         }
 128 
 129         private static CoderResult lookupN(ByteBuffer src, int n)
 130         {
 131             for (int i = 1; i < n; i++) {
 132                if (isNotContinuation(src.get()))
 133                    return CoderResult.malformedForLength(i);
 134             }
 135             return CoderResult.malformedForLength(n);
 136         }
 137 
 138         private static CoderResult malformedN(ByteBuffer src, int nb) {
 139             switch (nb) {
 140             case 1:
 141             case 2:                    // always 1
 142                 return CoderResult.malformedForLength(1);
 143             case 3:
 144                 int b1 = src.get();
 145                 int b2 = src.get();    // no need to lookup b3
 146                 return CoderResult.malformedForLength(
 147                     ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 148                      isNotContinuation(b2)) ? 1 : 2);
 149             case 4:  // we don't care the speed here
 150                 b1 = src.get() & 0xff;
 151                 b2 = src.get() & 0xff;
 152                 if (b1 > 0xf4 ||
 153                     (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
 154                     (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 155                     isNotContinuation(b2))
 156                     return CoderResult.malformedForLength(1);
 157                 if (isNotContinuation(src.get()))
 158                     return CoderResult.malformedForLength(2);
 159                 return CoderResult.malformedForLength(3);
 160             default:
 161                 assert false;
 162                 return null;
 163             }
 164         }
 165 
 166         private static CoderResult malformed(ByteBuffer src, int sp,
 167                                              CharBuffer dst, int dp,
 168                                              int nb)
 169         {
 170             src.position(sp - src.arrayOffset());
 171             CoderResult cr = malformedN(src, nb);
 172             updatePositions(src, sp, dst, dp);
 173             return cr;
 174         }
 175 
 176 
 177         private static CoderResult malformed(ByteBuffer src,
 178                                              int mark, int nb)
 179         {
 180             src.position(mark);
 181             CoderResult cr = malformedN(src, nb);
 182             src.position(mark);
 183             return cr;
 184         }
 185 
 186         private static CoderResult malformedForLength(ByteBuffer src,
 187                                                       int sp,
 188                                                       CharBuffer dst,
 189                                                       int dp,
 190                                                       int malformedNB)
 191         {
 192             updatePositions(src, sp, dst, dp);
 193             return CoderResult.malformedForLength(malformedNB);
 194         }
 195 
 196         private static CoderResult malformedForLength(ByteBuffer src,
 197                                                       int mark,
 198                                                       int malformedNB)
 199         {
 200             src.position(mark);
 201             return CoderResult.malformedForLength(malformedNB);
 202         }
 203 
 204 
 205         private static CoderResult xflow(Buffer src, int sp, int sl,
 206                                          Buffer dst, int dp, int nb) {
 207             updatePositions(src, sp, dst, dp);
 208             return (nb == 0 || sl - sp < nb)
 209                    ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
 210         }
 211 
 212         private static CoderResult xflow(Buffer src, int mark, int nb) {
 213             src.position(mark);
 214             return (nb == 0 || src.remaining() < nb)
 215                    ? CoderResult.UNDERFLOW : CoderResult.OVERFLOW;
 216         }
 217 
 218         private CoderResult decodeArrayLoop(ByteBuffer src,
 219                                             CharBuffer dst)
 220         {
 221             // This method is optimized for ASCII input.
 222             byte[] sa = src.array();
 223             int sp = src.arrayOffset() + src.position();
 224             int sl = src.arrayOffset() + src.limit();
 225 
 226             char[] da = dst.array();
 227             int dp = dst.arrayOffset() + dst.position();
 228             int dl = dst.arrayOffset() + dst.limit();
 229             int dlASCII = dp + Math.min(sl - sp, dl - dp);
 230 
 231             // ASCII only loop
 232             while (dp < dlASCII && sa[sp] >= 0)
 233                 da[dp++] = (char) sa[sp++];
 234             while (sp < sl) {
 235                 int b1 = sa[sp];
 236                 if (b1 >= 0) {
 237                     // 1 byte, 7 bits: 0xxxxxxx
 238                     if (dp >= dl)
 239                         return xflow(src, sp, sl, dst, dp, 1);
 240                     da[dp++] = (char) b1;
 241                     sp++;
 242                 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
 243                     // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
 244                     //                   [C2..DF] [80..BF]
 245                     if (sl - sp < 2 || dp >= dl)
 246                         return xflow(src, sp, sl, dst, dp, 2);
 247                     int b2 = sa[sp + 1];
 248                     // Now we check the first byte of 2-byte sequence as
 249                     //     if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0)
 250                     // no longer need to check b1 against c1 & c0 for
 251                     // malformed as we did in previous version
 252                     //   (b1 & 0x1e) == 0x0 || (b2 & 0xc0) != 0x80;
 253                     // only need to check the second byte b2.
 254                     if (isNotContinuation(b2))
 255                         return malformedForLength(src, sp, dst, dp, 1);
 256                     da[dp++] = (char) (((b1 << 6) ^ b2)
 257                                        ^
 258                                        (((byte) 0xC0 << 6) ^
 259                                         ((byte) 0x80 << 0)));
 260                     sp += 2;
 261                 } else if ((b1 >> 4) == -2) {
 262                     // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
 263                     int srcRemaining = sl - sp;
 264                     if (srcRemaining < 3 || dp >= dl) {
 265                         if (srcRemaining > 1 && isMalformed3_2(b1, sa[sp + 1]))
 266                             return malformedForLength(src, sp, dst, dp, 1);
 267                         return xflow(src, sp, sl, dst, dp, 3);
 268                     }
 269                     int b2 = sa[sp + 1];
 270                     int b3 = sa[sp + 2];
 271                     if (isMalformed3(b1, b2, b3))
 272                         return malformed(src, sp, dst, dp, 3);
 273                     char c = (char)
 274                         ((b1 << 12) ^
 275                          (b2 <<  6) ^
 276                          (b3 ^
 277                           (((byte) 0xE0 << 12) ^
 278                            ((byte) 0x80 <<  6) ^
 279                            ((byte) 0x80 <<  0))));
 280                     if (Character.isSurrogate(c))
 281                         return malformedForLength(src, sp, dst, dp, 3);
 282                     da[dp++] = c;
 283                     sp += 3;
 284                 } else if ((b1 >> 3) == -2) {
 285                     // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 286                     int srcRemaining = sl - sp;
 287                     if (srcRemaining < 4 || dl - dp < 2) {
 288                         b1 &= 0xff;
 289                         if (b1 > 0xf4 ||
 290                             srcRemaining > 1 && isMalformed4_2(b1, sa[sp + 1] & 0xff))
 291                             return malformedForLength(src, sp, dst, dp, 1);
 292                         if (srcRemaining > 2 && isMalformed4_3(sa[sp + 2]))
 293                             return malformedForLength(src, sp, dst, dp, 2);
 294                         return xflow(src, sp, sl, dst, dp, 4);
 295                     }
 296                     int b2 = sa[sp + 1];
 297                     int b3 = sa[sp + 2];
 298                     int b4 = sa[sp + 3];
 299                     int uc = ((b1 << 18) ^
 300                               (b2 << 12) ^
 301                               (b3 <<  6) ^
 302                               (b4 ^
 303                                (((byte) 0xF0 << 18) ^
 304                                 ((byte) 0x80 << 12) ^
 305                                 ((byte) 0x80 <<  6) ^
 306                                 ((byte) 0x80 <<  0))));
 307                     if (isMalformed4(b2, b3, b4) ||
 308                         // shortest form check
 309                         !Character.isSupplementaryCodePoint(uc)) {
 310                         return malformed(src, sp, dst, dp, 4);
 311                     }
 312                     da[dp++] = Character.highSurrogate(uc);
 313                     da[dp++] = Character.lowSurrogate(uc);
 314                     sp += 4;
 315                 } else
 316                     return malformed(src, sp, dst, dp, 1);
 317             }
 318             return xflow(src, sp, sl, dst, dp, 0);
 319         }
 320 
 321         private CoderResult decodeBufferLoop(ByteBuffer src,
 322                                              CharBuffer dst)
 323         {
 324             int mark = src.position();
 325             int limit = src.limit();
 326             while (mark < limit) {
 327                 int b1 = src.get();
 328                 if (b1 >= 0) {
 329                     // 1 byte, 7 bits: 0xxxxxxx
 330                     if (dst.remaining() < 1)
 331                         return xflow(src, mark, 1); // overflow
 332                     dst.put((char) b1);
 333                     mark++;
 334                 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
 335                     // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
 336                     if (limit - mark < 2|| dst.remaining() < 1)
 337                         return xflow(src, mark, 2);
 338                     int b2 = src.get();
 339                     if (isNotContinuation(b2))
 340                         return malformedForLength(src, mark, 1);
 341                      dst.put((char) (((b1 << 6) ^ b2)
 342                                     ^
 343                                     (((byte) 0xC0 << 6) ^
 344                                      ((byte) 0x80 << 0))));
 345                     mark += 2;
 346                 } else if ((b1 >> 4) == -2) {
 347                     // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
 348                     int srcRemaining = limit - mark;
 349                     if (srcRemaining < 3 || dst.remaining() < 1) {
 350                         if (srcRemaining > 1 && isMalformed3_2(b1, src.get()))
 351                             return malformedForLength(src, mark, 1);
 352                         return xflow(src, mark, 3);
 353                     }
 354                     int b2 = src.get();
 355                     int b3 = src.get();
 356                     if (isMalformed3(b1, b2, b3))
 357                         return malformed(src, mark, 3);
 358                     char c = (char)
 359                         ((b1 << 12) ^
 360                          (b2 <<  6) ^
 361                          (b3 ^
 362                           (((byte) 0xE0 << 12) ^
 363                            ((byte) 0x80 <<  6) ^
 364                            ((byte) 0x80 <<  0))));
 365                     if (Character.isSurrogate(c))
 366                         return malformedForLength(src, mark, 3);
 367                     dst.put(c);
 368                     mark += 3;
 369                 } else if ((b1 >> 3) == -2) {
 370                     // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 371                     int srcRemaining = limit - mark;
 372                     if (srcRemaining < 4 || dst.remaining() < 2) {
 373                         b1 &= 0xff;
 374                         if (b1 > 0xf4 ||
 375                             srcRemaining > 1 && isMalformed4_2(b1, src.get() & 0xff))
 376                             return malformedForLength(src, mark, 1);
 377                         if (srcRemaining > 2 && isMalformed4_3(src.get()))
 378                             return malformedForLength(src, mark, 2);
 379                         return xflow(src, mark, 4);
 380                     }
 381                     int b2 = src.get();
 382                     int b3 = src.get();
 383                     int b4 = src.get();
 384                     int uc = ((b1 << 18) ^
 385                               (b2 << 12) ^
 386                               (b3 <<  6) ^
 387                               (b4 ^
 388                                (((byte) 0xF0 << 18) ^
 389                                 ((byte) 0x80 << 12) ^
 390                                 ((byte) 0x80 <<  6) ^
 391                                 ((byte) 0x80 <<  0))));
 392                     if (isMalformed4(b2, b3, b4) ||
 393                         // shortest form check
 394                         !Character.isSupplementaryCodePoint(uc)) {
 395                         return malformed(src, mark, 4);
 396                     }
 397                     dst.put(Character.highSurrogate(uc));
 398                     dst.put(Character.lowSurrogate(uc));
 399                     mark += 4;
 400                 } else {
 401                     return malformed(src, mark, 1);
 402                 }
 403             }
 404             return xflow(src, mark, 0);
 405         }
 406 
 407         protected CoderResult decodeLoop(ByteBuffer src,
 408                                          CharBuffer dst)
 409         {
 410             if (src.hasArray() && dst.hasArray())
 411                 return decodeArrayLoop(src, dst);
 412             else
 413                 return decodeBufferLoop(src, dst);
 414         }
 415 
 416         private static ByteBuffer getByteBuffer(ByteBuffer bb, byte[] ba, int sp)
 417         {
 418             if (bb == null)
 419                 bb = ByteBuffer.wrap(ba);
 420             bb.position(sp);
 421             return bb;
 422         }
 423     }
 424 
 425     private static final class Encoder extends CharsetEncoder {
 426 
 427         private Encoder(Charset cs) {
 428             super(cs, 1.1f, 3.0f);
 429         }
 430 
 431         public boolean canEncode(char c) {
 432             return !Character.isSurrogate(c);
 433         }
 434 
 435         public boolean isLegalReplacement(byte[] repl) {
 436             return ((repl.length == 1 && repl[0] >= 0) ||
 437                     super.isLegalReplacement(repl));
 438         }
 439 
 440         private static CoderResult overflow(CharBuffer src, int sp,
 441                                             ByteBuffer dst, int dp) {
 442             updatePositions(src, sp, dst, dp);
 443             return CoderResult.OVERFLOW;
 444         }
 445 
 446         private static CoderResult overflow(CharBuffer src, int mark) {
 447             src.position(mark);
 448             return CoderResult.OVERFLOW;
 449         }
 450 
 451         private Surrogate.Parser sgp;
 452         private CoderResult encodeArrayLoop(CharBuffer src,
 453                                             ByteBuffer dst)
 454         {
 455             char[] sa = src.array();
 456             int sp = src.arrayOffset() + src.position();
 457             int sl = src.arrayOffset() + src.limit();
 458 
 459             byte[] da = dst.array();
 460             int dp = dst.arrayOffset() + dst.position();
 461             int dl = dst.arrayOffset() + dst.limit();
 462             int dlASCII = dp + Math.min(sl - sp, dl - dp);
 463 
 464             // ASCII only loop
 465             while (dp < dlASCII && sa[sp] < '\u0080')
 466                 da[dp++] = (byte) sa[sp++];
 467             while (sp < sl) {
 468                 char c = sa[sp];
 469                 if (c < 0x80) {
 470                     // Have at most seven bits
 471                     if (dp >= dl)
 472                         return overflow(src, sp, dst, dp);
 473                     da[dp++] = (byte)c;
 474                 } else if (c < 0x800) {
 475                     // 2 bytes, 11 bits
 476                     if (dl - dp < 2)
 477                         return overflow(src, sp, dst, dp);
 478                     da[dp++] = (byte)(0xc0 | (c >> 6));
 479                     da[dp++] = (byte)(0x80 | (c & 0x3f));
 480                 } else if (Character.isSurrogate(c)) {
 481                     // Have a surrogate pair
 482                     if (sgp == null)
 483                         sgp = new Surrogate.Parser();
 484                     int uc = sgp.parse(c, sa, sp, sl);
 485                     if (uc < 0) {
 486                         updatePositions(src, sp, dst, dp);
 487                         return sgp.error();
 488                     }
 489                     if (dl - dp < 4)
 490                         return overflow(src, sp, dst, dp);
 491                     da[dp++] = (byte)(0xf0 | ((uc >> 18)));
 492                     da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 493                     da[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 494                     da[dp++] = (byte)(0x80 | (uc & 0x3f));
 495                     sp++;  // 2 chars
 496                 } else {
 497                     // 3 bytes, 16 bits
 498                     if (dl - dp < 3)
 499                         return overflow(src, sp, dst, dp);
 500                     da[dp++] = (byte)(0xe0 | ((c >> 12)));
 501                     da[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 502                     da[dp++] = (byte)(0x80 | (c & 0x3f));
 503                 }
 504                 sp++;
 505             }
 506             updatePositions(src, sp, dst, dp);
 507             return CoderResult.UNDERFLOW;
 508         }
 509 
 510         private CoderResult encodeBufferLoop(CharBuffer src,
 511                                              ByteBuffer dst)
 512         {
 513             int mark = src.position();
 514             while (src.hasRemaining()) {
 515                 char c = src.get();
 516                 if (c < 0x80) {
 517                     // Have at most seven bits
 518                     if (!dst.hasRemaining())
 519                         return overflow(src, mark);
 520                     dst.put((byte)c);
 521                 } else if (c < 0x800) {
 522                     // 2 bytes, 11 bits
 523                     if (dst.remaining() < 2)
 524                         return overflow(src, mark);
 525                     dst.put((byte)(0xc0 | (c >> 6)));
 526                     dst.put((byte)(0x80 | (c & 0x3f)));
 527                 } else if (Character.isSurrogate(c)) {
 528                     // Have a surrogate pair
 529                     if (sgp == null)
 530                         sgp = new Surrogate.Parser();
 531                     int uc = sgp.parse(c, src);
 532                     if (uc < 0) {
 533                         src.position(mark);
 534                         return sgp.error();
 535                     }
 536                     if (dst.remaining() < 4)
 537                         return overflow(src, mark);
 538                     dst.put((byte)(0xf0 | ((uc >> 18))));
 539                     dst.put((byte)(0x80 | ((uc >> 12) & 0x3f)));
 540                     dst.put((byte)(0x80 | ((uc >>  6) & 0x3f)));
 541                     dst.put((byte)(0x80 | (uc & 0x3f)));
 542                     mark++;  // 2 chars
 543                 } else {
 544                     // 3 bytes, 16 bits
 545                     if (dst.remaining() < 3)
 546                         return overflow(src, mark);
 547                     dst.put((byte)(0xe0 | ((c >> 12))));
 548                     dst.put((byte)(0x80 | ((c >>  6) & 0x3f)));
 549                     dst.put((byte)(0x80 | (c & 0x3f)));
 550                 }
 551                 mark++;
 552             }
 553             src.position(mark);
 554             return CoderResult.UNDERFLOW;
 555         }
 556 
 557         protected final CoderResult encodeLoop(CharBuffer src,
 558                                                ByteBuffer dst)
 559         {
 560             if (src.hasArray() && dst.hasArray())
 561                 return encodeArrayLoop(src, dst);
 562             else
 563                 return encodeBufferLoop(src, dst);
 564         }
 565 
 566     }
 567 }