1 /*
   2  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package sun.nio.cs.ext;
  27 
  28 import java.nio.ByteBuffer;
  29 import java.nio.CharBuffer;
  30 import java.nio.charset.Charset;
  31 import java.nio.charset.CharsetDecoder;
  32 import java.nio.charset.CharsetEncoder;
  33 import java.nio.charset.CoderResult;
  34 import java.nio.charset.CharacterCodingException;
  35 import java.nio.charset.MalformedInputException;
  36 import sun.nio.cs.DelegatableDecoder;
  37 import sun.nio.cs.HistoricallyNamedCharset;
  38 import java.security.AccessController;
  39 import java.security.PrivilegedAction;
  40 import sun.nio.cs.*;
  41 import static java.lang.Character.UnicodeBlock;
  42 
  43 
  44 public class JISAutoDetect
  45     extends Charset
  46     implements HistoricallyNamedCharset
  47 {
  48 
  49     private final static int EUCJP_MASK       = 0x01;
  50     private final static int SJIS2B_MASK      = 0x02;
  51     private final static int SJIS1B_MASK      = 0x04;
  52     private final static int EUCJP_KANA1_MASK = 0x08;
  53     private final static int EUCJP_KANA2_MASK = 0x10;
  54 
  55     public JISAutoDetect() {
  56         super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect"));
  57     }
  58 
  59     public boolean contains(Charset cs) {
  60         return ((cs.name().equals("US-ASCII"))
  61                 || (cs instanceof SJIS)
  62                 || (cs instanceof EUC_JP)
  63                 || (cs instanceof ISO2022_JP));
  64     }
  65 
  66     public boolean canEncode() {
  67         return false;
  68     }
  69 
  70     public CharsetDecoder newDecoder() {
  71         return new Decoder(this);
  72     }
  73 
  74     public String historicalName() {
  75         return "JISAutoDetect";
  76     }
  77 
  78     public CharsetEncoder newEncoder() {
  79         throw new UnsupportedOperationException();
  80     }
  81 
  82     // A heuristic algorithm for guessing if EUC-decoded text really
  83     // might be Japanese text.  Better heuristics are possible...
  84     private static boolean looksLikeJapanese(CharBuffer cb) {
  85         int hiragana = 0;       // Fullwidth Hiragana
  86         int katakana = 0;       // Halfwidth Katakana
  87         while (cb.hasRemaining()) {
  88             char c = cb.get();
  89             if (0x3040 <= c && c <= 0x309f && ++hiragana > 1) return true;
  90             if (0xff65 <= c && c <= 0xff9f && ++katakana > 1) return true;
  91         }
  92         return false;
  93     }
  94 
  95     private static class Decoder extends CharsetDecoder {
  96         private final static String osName = AccessController.doPrivileged(
  97             (PrivilegedAction<String>) () -> System.getProperty("os.name"));
  98 
  99         private final static String SJISName = getSJISName();
 100         private final static String EUCJPName = getEUCJPName();
 101         private DelegatableDecoder detectedDecoder = null;
 102 
 103         public Decoder(Charset cs) {
 104             super(cs, 0.5f, 1.0f);
 105         }
 106 
 107         private static boolean isPlainASCII(byte b) {
 108             return b >= 0 && b != 0x1b;
 109         }
 110 
 111         private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) {
 112             int start = src.position();
 113             int limit = start + Math.min(src.remaining(), dst.remaining());
 114             int p;
 115             byte b;
 116             for (p = start; p < limit && isPlainASCII(b = src.get(p)); p++)
 117                 dst.put((char)(b & 0xff));
 118             src.position(p);
 119         }
 120 
 121         private CoderResult decodeLoop(DelegatableDecoder decoder,
 122                                        ByteBuffer src, CharBuffer dst) {
 123             ((CharsetDecoder)decoder).reset();
 124             detectedDecoder = decoder;
 125             return detectedDecoder.decodeLoop(src, dst);
 126         }
 127 
 128         protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) {
 129             if (detectedDecoder == null) {
 130                 copyLeadingASCII(src, dst);
 131 
 132                 // All ASCII?
 133                 if (! src.hasRemaining())
 134                     return CoderResult.UNDERFLOW;
 135                 // Overflow only if there is still ascii but no out buffer.
 136                 if (!dst.hasRemaining() &&
 137                     isPlainASCII(src.get(src.position())))
 138                     return CoderResult.OVERFLOW;
 139 
 140                 // We need to perform double, not float, arithmetic; otherwise
 141                 // we lose low order bits when src is larger than 2**24.
 142                 int cbufsiz = (int)(src.limit() * (double)maxCharsPerByte());
 143                 CharBuffer sandbox = CharBuffer.allocate(cbufsiz);
 144 
 145                 // First try ISO-2022-JP, since there is no ambiguity
 146                 Charset cs2022 = Charset.forName("ISO-2022-JP");
 147                 DelegatableDecoder dd2022
 148                     = (DelegatableDecoder) cs2022.newDecoder();
 149                 ByteBuffer src2022 = src.asReadOnlyBuffer();
 150                 CoderResult res2022 = dd2022.decodeLoop(src2022, sandbox);
 151                 if (! res2022.isError())
 152                     return decodeLoop(dd2022, src, dst);
 153 
 154                 // We must choose between EUC and SJIS
 155                 Charset csEUCJ = Charset.forName(EUCJPName);
 156                 Charset csSJIS = Charset.forName(SJISName);
 157 
 158                 DelegatableDecoder ddEUCJ
 159                     = (DelegatableDecoder) csEUCJ.newDecoder();
 160                 DelegatableDecoder ddSJIS
 161                     = (DelegatableDecoder) csSJIS.newDecoder();
 162 
 163                 ByteBuffer srcEUCJ = src.asReadOnlyBuffer();
 164                 sandbox.clear();
 165                 CoderResult resEUCJ = ddEUCJ.decodeLoop(srcEUCJ, sandbox);
 166                 // If EUC decoding fails, must be SJIS
 167                 if (resEUCJ.isError())
 168                     return decodeLoop(ddSJIS, src, dst);
 169                 ByteBuffer srcSJIS = src.asReadOnlyBuffer();
 170                 CharBuffer sandboxSJIS = CharBuffer.allocate(cbufsiz);
 171                 CoderResult resSJIS = ddSJIS.decodeLoop(srcSJIS, sandboxSJIS);
 172                 // If SJIS decoding fails, must be EUC
 173                 if (resSJIS.isError())
 174                     return decodeLoop(ddEUCJ, src, dst);
 175 
 176                 // From here on, we have some ambiguity, and must guess.
 177 
 178                 // We prefer input that does not appear to end mid-character.
 179                 if (srcEUCJ.position() > srcSJIS.position())
 180                     return decodeLoop(ddEUCJ, src, dst);
 181 
 182                 if (srcEUCJ.position() < srcSJIS.position())
 183                     return decodeLoop(ddSJIS, src, dst);
 184 
 185                 // end-of-input is after the first byte of the first char?
 186                 if (src.position() == srcEUCJ.position())
 187                     return CoderResult.UNDERFLOW;
 188 
 189                 // Use heuristic knowledge of typical Japanese text
 190                 sandbox.flip();
 191                 return decodeLoop(looksLikeJapanese(sandbox) ? ddEUCJ : ddSJIS,
 192                                   src, dst);
 193             }
 194 
 195             return detectedDecoder.decodeLoop(src, dst);
 196         }
 197 
 198         protected void implReset() {
 199             detectedDecoder = null;
 200         }
 201 
 202         protected CoderResult implFlush(CharBuffer out) {
 203             if (detectedDecoder != null)
 204                 return detectedDecoder.implFlush(out);
 205             else
 206                 return super.implFlush(out);
 207         }
 208 
 209         public boolean isAutoDetecting() {
 210             return true;
 211         }
 212 
 213         public boolean isCharsetDetected() {
 214             return detectedDecoder != null;
 215         }
 216 
 217         public Charset detectedCharset() {
 218             if (detectedDecoder == null)
 219                 throw new IllegalStateException("charset not yet detected");
 220             return ((CharsetDecoder) detectedDecoder).charset();
 221         }
 222 
 223 
 224         /**
 225          * Returned Shift_JIS Charset name is OS dependent
 226          */
 227         private static String getSJISName() {
 228             if (osName.equals("Solaris") || osName.equals("SunOS"))
 229                 return("PCK");
 230             else if (osName.startsWith("Windows"))
 231                 return("windows-31J");
 232             else
 233                 return("Shift_JIS");
 234         }
 235 
 236         /**
 237          * Returned EUC-JP Charset name is OS dependent
 238          */
 239 
 240         private static String getEUCJPName() {
 241             if (osName.equals("Solaris") || osName.equals("SunOS"))
 242                 return("x-eucjp-open");
 243             else
 244                 return("EUC_JP");
 245         }
 246 
 247     }
 248 }