1 /* 2 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package sun.nio.cs.ext; 27 28 import java.nio.ByteBuffer; 29 import java.nio.CharBuffer; 30 import java.nio.charset.Charset; 31 import java.nio.charset.CharsetDecoder; 32 import java.nio.charset.CharsetEncoder; 33 import java.nio.charset.CoderResult; 34 import java.nio.charset.CharacterCodingException; 35 import java.nio.charset.MalformedInputException; 36 import sun.nio.cs.DelegatableDecoder; 37 import sun.nio.cs.HistoricallyNamedCharset; 38 import java.security.AccessController; 39 import java.security.PrivilegedAction; 40 import sun.nio.cs.*; 41 import static java.lang.Character.UnicodeBlock; 42 43 44 public class JISAutoDetect 45 extends Charset 46 implements HistoricallyNamedCharset 47 { 48 49 private final static int EUCJP_MASK = 0x01; 50 private final static int SJIS2B_MASK = 0x02; 51 private final static int SJIS1B_MASK = 0x04; 52 private final static int EUCJP_KANA1_MASK = 0x08; 53 private final static int EUCJP_KANA2_MASK = 0x10; 54 55 public JISAutoDetect() { 56 super("x-JISAutoDetect", ExtendedCharsets.aliasesFor("x-JISAutoDetect")); 57 } 58 59 public boolean contains(Charset cs) { 60 return ((cs.name().equals("US-ASCII")) 61 || (cs instanceof SJIS) 62 || (cs instanceof EUC_JP) 63 || (cs instanceof ISO2022_JP)); 64 } 65 66 public boolean canEncode() { 67 return false; 68 } 69 70 public CharsetDecoder newDecoder() { 71 return new Decoder(this); 72 } 73 74 public String historicalName() { 75 return "JISAutoDetect"; 76 } 77 78 public CharsetEncoder newEncoder() { 79 throw new UnsupportedOperationException(); 80 } 81 82 // A heuristic algorithm for guessing if EUC-decoded text really 83 // might be Japanese text. Better heuristics are possible... 84 private static boolean looksLikeJapanese(CharBuffer cb) { 85 int hiragana = 0; // Fullwidth Hiragana 86 int katakana = 0; // Halfwidth Katakana 87 while (cb.hasRemaining()) { 88 char c = cb.get(); 89 if (0x3040 <= c && c <= 0x309f && ++hiragana > 1) return true; 90 if (0xff65 <= c && c <= 0xff9f && ++katakana > 1) return true; 91 } 92 return false; 93 } 94 95 private static class Decoder extends CharsetDecoder { 96 private final static String osName = AccessController.doPrivileged( 97 (PrivilegedAction<String>) () -> System.getProperty("os.name")); 98 99 private final static String SJISName = getSJISName(); 100 private final static String EUCJPName = getEUCJPName(); 101 private DelegatableDecoder detectedDecoder = null; 102 103 public Decoder(Charset cs) { 104 super(cs, 0.5f, 1.0f); 105 } 106 107 private static boolean isPlainASCII(byte b) { 108 return b >= 0 && b != 0x1b; 109 } 110 111 private static void copyLeadingASCII(ByteBuffer src, CharBuffer dst) { 112 int start = src.position(); 113 int limit = start + Math.min(src.remaining(), dst.remaining()); 114 int p; 115 byte b; 116 for (p = start; p < limit && isPlainASCII(b = src.get(p)); p++) 117 dst.put((char)(b & 0xff)); 118 src.position(p); 119 } 120 121 private CoderResult decodeLoop(DelegatableDecoder decoder, 122 ByteBuffer src, CharBuffer dst) { 123 ((CharsetDecoder)decoder).reset(); 124 detectedDecoder = decoder; 125 return detectedDecoder.decodeLoop(src, dst); 126 } 127 128 protected CoderResult decodeLoop(ByteBuffer src, CharBuffer dst) { 129 if (detectedDecoder == null) { 130 copyLeadingASCII(src, dst); 131 132 // All ASCII? 133 if (! src.hasRemaining()) 134 return CoderResult.UNDERFLOW; 135 // Overflow only if there is still ascii but no out buffer. 136 if (!dst.hasRemaining() && 137 isPlainASCII(src.get(src.position()))) 138 return CoderResult.OVERFLOW; 139 140 // We need to perform double, not float, arithmetic; otherwise 141 // we lose low order bits when src is larger than 2**24. 142 int cbufsiz = (int)(src.limit() * (double)maxCharsPerByte()); 143 CharBuffer sandbox = CharBuffer.allocate(cbufsiz); 144 145 // First try ISO-2022-JP, since there is no ambiguity 146 Charset cs2022 = Charset.forName("ISO-2022-JP"); 147 DelegatableDecoder dd2022 148 = (DelegatableDecoder) cs2022.newDecoder(); 149 ByteBuffer src2022 = src.asReadOnlyBuffer(); 150 CoderResult res2022 = dd2022.decodeLoop(src2022, sandbox); 151 if (! res2022.isError()) 152 return decodeLoop(dd2022, src, dst); 153 154 // We must choose between EUC and SJIS 155 Charset csEUCJ = Charset.forName(EUCJPName); 156 Charset csSJIS = Charset.forName(SJISName); 157 158 DelegatableDecoder ddEUCJ 159 = (DelegatableDecoder) csEUCJ.newDecoder(); 160 DelegatableDecoder ddSJIS 161 = (DelegatableDecoder) csSJIS.newDecoder(); 162 163 ByteBuffer srcEUCJ = src.asReadOnlyBuffer(); 164 sandbox.clear(); 165 CoderResult resEUCJ = ddEUCJ.decodeLoop(srcEUCJ, sandbox); 166 // If EUC decoding fails, must be SJIS 167 if (resEUCJ.isError()) 168 return decodeLoop(ddSJIS, src, dst); 169 ByteBuffer srcSJIS = src.asReadOnlyBuffer(); 170 CharBuffer sandboxSJIS = CharBuffer.allocate(cbufsiz); 171 CoderResult resSJIS = ddSJIS.decodeLoop(srcSJIS, sandboxSJIS); 172 // If SJIS decoding fails, must be EUC 173 if (resSJIS.isError()) 174 return decodeLoop(ddEUCJ, src, dst); 175 176 // From here on, we have some ambiguity, and must guess. 177 178 // We prefer input that does not appear to end mid-character. 179 if (srcEUCJ.position() > srcSJIS.position()) 180 return decodeLoop(ddEUCJ, src, dst); 181 182 if (srcEUCJ.position() < srcSJIS.position()) 183 return decodeLoop(ddSJIS, src, dst); 184 185 // end-of-input is after the first byte of the first char? 186 if (src.position() == srcEUCJ.position()) 187 return CoderResult.UNDERFLOW; 188 189 // Use heuristic knowledge of typical Japanese text 190 sandbox.flip(); 191 return decodeLoop(looksLikeJapanese(sandbox) ? ddEUCJ : ddSJIS, 192 src, dst); 193 } 194 195 return detectedDecoder.decodeLoop(src, dst); 196 } 197 198 protected void implReset() { 199 detectedDecoder = null; 200 } 201 202 protected CoderResult implFlush(CharBuffer out) { 203 if (detectedDecoder != null) 204 return detectedDecoder.implFlush(out); 205 else 206 return super.implFlush(out); 207 } 208 209 public boolean isAutoDetecting() { 210 return true; 211 } 212 213 public boolean isCharsetDetected() { 214 return detectedDecoder != null; 215 } 216 217 public Charset detectedCharset() { 218 if (detectedDecoder == null) 219 throw new IllegalStateException("charset not yet detected"); 220 return ((CharsetDecoder) detectedDecoder).charset(); 221 } 222 223 224 /** 225 * Returned Shift_JIS Charset name is OS dependent 226 */ 227 private static String getSJISName() { 228 if (osName.equals("Solaris") || osName.equals("SunOS")) 229 return("PCK"); 230 else if (osName.startsWith("Windows")) 231 return("windows-31J"); 232 else 233 return("Shift_JIS"); 234 } 235 236 /** 237 * Returned EUC-JP Charset name is OS dependent 238 */ 239 240 private static String getEUCJPName() { 241 if (osName.equals("Solaris") || osName.equals("SunOS")) 242 return("x-eucjp-open"); 243 else 244 return("EUC_JP"); 245 } 246 247 } 248 }