1 /*
   2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.lang;
  27 
  28 import java.nio.charset.Charset;
  29 import java.util.Arrays;
  30 
  31 import static java.lang.String.LATIN1;
  32 import static java.lang.String.UTF16;
  33 import static java.lang.String.COMPACT_STRINGS;
  34 import static java.lang.Character.isSurrogate;
  35 import static java.lang.Character.highSurrogate;
  36 import static java.lang.Character.lowSurrogate;
  37 import static java.lang.Character.isSupplementaryCodePoint;
  38 import static java.lang.StringUTF16.putChar;
  39 
  40 class StringDecoderUTF8 extends StringCoding.StringDecoder {
  41 
  42     StringDecoderUTF8(Charset cs, String rcn) {
  43         super(cs, rcn);
  44     }
  45 
  46     private static boolean isNotContinuation(int b) {
  47         return (b & 0xc0) != 0x80;
  48     }
  49 
  50     private static boolean isMalformed3(int b1, int b2, int b3) {
  51         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
  52                (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
  53     }
  54 
  55     private static boolean isMalformed3_2(int b1, int b2) {
  56         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
  57                (b2 & 0xc0) != 0x80;
  58     }
  59 
  60     private static boolean isMalformed4(int b2, int b3, int b4) {
  61         return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
  62                (b4 & 0xc0) != 0x80;
  63     }
  64 
  65     private static boolean isMalformed4_2(int b1, int b2) {
  66         return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
  67                (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
  68                (b2 & 0xc0) != 0x80;
  69     }
  70 
  71     private static boolean isMalformed4_3(int b3) {
  72         return (b3 & 0xc0) != 0x80;
  73     }
  74 
  75     // for nb == 3/4
  76     private static int malformedN(byte[] src, int sp, int nb) {
  77         if (nb == 3) {
  78             int b1 = src[sp++];
  79             int b2 = src[sp++];    // no need to lookup b3
  80             return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
  81                     isNotContinuation(b2)) ? 1 : 2;
  82         } else if (nb == 4) { // we don't care the speed here
  83             int b1 = src[sp++] & 0xff;
  84             int b2 = src[sp++] & 0xff;
  85             if (b1 > 0xf4 ||
  86                 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
  87                 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
  88                 isNotContinuation(b2))
  89                 return 1;
  90             if (isNotContinuation(src[sp++]))
  91                 return 2;
  92             return 3;
  93         }
  94         assert false;
  95         return -1;
  96     }
  97 
  98     private static char repl = '\ufffd';
  99 
 100     StringCoding.Result decode(byte[] src, int sp, int len) {
 101         return decode(src, sp, len, result);
 102     }
 103 
 104     static StringCoding.Result decode(byte[] src, int sp, int len,
 105                                       StringCoding.Result ret) {
 106         int sl = sp + len;
 107         byte[] dst = new byte[len];
 108         int dp = 0;
 109         if (COMPACT_STRINGS) {   // Latin1 only loop
 110             while (sp < sl) {
 111                 int b1 = src[sp];
 112                 if (b1 >= 0) {
 113                     dst[dp++] = (byte)b1;
 114                     sp++;
 115                     continue;
 116                 }
 117                 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
 118                     sp + 1 < sl) {
 119                     int b2 = src[sp + 1];
 120                     if (!isNotContinuation(b2)) {
 121                         dst[dp++] = (byte)(((b1 << 6) ^ b2)^
 122                                            (((byte) 0xC0 << 6) ^
 123                                            ((byte) 0x80 << 0)));
 124                         sp += 2;
 125                         continue;
 126                     }
 127                 }
 128                 // anything not a latin1, including the repl
 129                 // we have to go with the utf16
 130                 break;
 131             }
 132             if (sp == sl) {
 133                 if (dp != dst.length) {
 134                     dst = Arrays.copyOf(dst, dp);
 135                 }
 136                 return ret.with(dst, LATIN1);
 137             }
 138         }
 139         if (dp == 0) {
 140             dst = new byte[len << 1];
 141         } else {
 142             byte[] buf = new byte[len << 1];
 143             StringLatin1.inflate(dst, 0, buf, 0, dp);
 144             dst = buf;
 145         }
 146         while (sp < sl) {
 147             int b1 = src[sp++];
 148             if (b1 >= 0) {
 149                 putChar(dst, dp++, (char) b1);
 150             } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
 151                 if (sp < sl) {
 152                     int b2 = src[sp++];
 153                     if (isNotContinuation(b2)) {
 154                         putChar(dst, dp++, repl);
 155                         sp--;
 156                     } else {
 157                         putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
 158                                                   (((byte) 0xC0 << 6) ^
 159                                                   ((byte) 0x80 << 0))));
 160                     }
 161                     continue;
 162                 }
 163                 putChar(dst, dp++, repl);
 164                 break;
 165             } else if ((b1 >> 4) == -2) {
 166                 if (sp + 1 < sl) {
 167                     int b2 = src[sp++];
 168                     int b3 = src[sp++];
 169                     if (isMalformed3(b1, b2, b3)) {
 170                         putChar(dst, dp++, repl);
 171                         sp -= 3;
 172                         sp += malformedN(src, sp, 3);
 173                     } else {
 174                         char c = (char)((b1 << 12) ^
 175                                         (b2 <<  6) ^
 176                                         (b3 ^
 177                                          (((byte) 0xE0 << 12) ^
 178                                          ((byte) 0x80 <<  6) ^
 179                                          ((byte) 0x80 <<  0))));
 180                         putChar(dst, dp++, isSurrogate(c) ?  repl : c);
 181                     }
 182                     continue;
 183                 }
 184                 if (sp  < sl && isMalformed3_2(b1, src[sp])) {
 185                     putChar(dst, dp++, repl);
 186                     continue;
 187                 }
 188                 putChar(dst, dp++, repl);
 189                 break;
 190             } else if ((b1 >> 3) == -2) {
 191                 if (sp + 2 < sl) {
 192                     int b2 = src[sp++];
 193                     int b3 = src[sp++];
 194                     int b4 = src[sp++];
 195                     int uc = ((b1 << 18) ^
 196                               (b2 << 12) ^
 197                               (b3 <<  6) ^
 198                               (b4 ^
 199                                (((byte) 0xF0 << 18) ^
 200                                ((byte) 0x80 << 12) ^
 201                                ((byte) 0x80 <<  6) ^
 202                                ((byte) 0x80 <<  0))));
 203                     if (isMalformed4(b2, b3, b4) ||
 204                         !isSupplementaryCodePoint(uc)) { // shortest form check
 205                         putChar(dst, dp++, repl);
 206                         sp -= 4;
 207                         sp += malformedN(src, sp, 4);
 208                     } else {
 209                         putChar(dst, dp++, highSurrogate(uc));
 210                         putChar(dst, dp++, lowSurrogate(uc));
 211                     }
 212                     continue;
 213                 }
 214                 b1 &= 0xff;
 215                 if (b1 > 0xf4 ||
 216                     sp  < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
 217                     putChar(dst, dp++, repl);
 218                     continue;
 219                 }
 220                 sp++;
 221                 putChar(dst, dp++, repl);
 222                 if (sp  < sl && isMalformed4_3(src[sp])) {
 223                     continue;
 224                 }
 225                 break;
 226             } else {
 227                 putChar(dst, dp++, repl);
 228             }
 229         }
 230         if (dp != len) {
 231             dst = Arrays.copyOf(dst, dp << 1);
 232         }
 233         return ret.with(dst, UTF16);
 234     }
 235 }