1 /* 2 * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 package jnlp.converter.parser.xml; 25 26 import java.io.ByteArrayInputStream; 27 import java.io.EOFException; 28 import java.io.InputStreamReader; 29 import java.io.IOException; 30 import java.io.Reader; 31 import java.io.UnsupportedEncodingException; 32 33 public class XMLEncoding { 34 /** 35 * Decodes a byte stream into a String by testing for a Byte Order Mark 36 * (BOM) or an XML declaration. 37 * <br /> 38 * Detection begins by examining the first four octets of the stream for a 39 * BOM. If a BOM is not found, then an encoding declaration is looked for 40 * at the beginning of the stream. If the encoding still can not be 41 * determined at this point, then UTF-8 is assumed. 42 * 43 * @param data an array of bytes containing an encoded XML document. 44 * 45 * @return A string containing the decoded XML document. 46 */ 47 public static String decodeXML(byte [] data) throws IOException { 48 int start = 0; 49 String encoding; 50 51 if (data.length < BOM_LENGTH) { 52 throw (new EOFException("encoding.error.not.xml")); 53 } 54 // no else required; successfully read stream 55 int firstFour = ((0xff000000 & ((int) data[0] << 24)) | 56 (0x00ff0000 & ((int) data[1] << 16)) | 57 (0x0000ff00 & ((int) data[2] << 8)) | 58 (0x000000ff & (int) data[3])); 59 60 // start by examining the first four bytes for a BOM 61 switch (firstFour) { 62 case EBCDIC: 63 // examine the encoding declaration 64 encoding = examineEncodingDeclaration(data, IBM037_ENC); 65 break; 66 67 case XML_DECLARATION: 68 // assume UTF-8, but examine the encoding declaration 69 encoding = examineEncodingDeclaration(data, UTF_8_ENC); 70 break; 71 72 case UTF_16BE: 73 encoding = UTF_16BE_ENC; 74 break; 75 76 case UTF_16LE: 77 encoding = UTF_16LE_ENC; 78 break; 79 80 case UNUSUAL_OCTET_1: 81 case UNUSUAL_OCTET_2: 82 throw (new UnsupportedEncodingException("encoding.error.unusual.octet")); 83 84 case UTF_32_BE_BOM: 85 case UTF_32_LE_BOM: 86 encoding = UTF_32_ENC; 87 break; 88 89 default: 90 int firstThree = firstFour & 0xffffff00; 91 92 switch (firstThree) { 93 case UTF_8_BOM: 94 // the InputStreamReader class doen't properly handle 95 // the Byte Order Mark (BOM) in UTF-8 streams, so don't 96 // putback those 3 bytes. 97 start = 3; 98 encoding = UTF_8_ENC; 99 break; 100 101 default: 102 int firstTwo = firstFour & 0xffff0000; 103 104 switch (firstTwo) { 105 case UTF_16_BE_BOM: 106 case UTF_16_LE_BOM: 107 encoding = UTF_16_ENC; 108 break; 109 110 default: 111 // this is probably UTF-8 without the encoding 112 // declaration 113 encoding = UTF_8_ENC; 114 break; 115 } 116 break; 117 } 118 break; 119 } 120 121 return (new String(data, start, data.length - start, encoding)); 122 } 123 124 /** 125 * [3] S ::= ( #x20 | #x09 | #x0d | #x0a ) 126 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 127 * [24] VersionInfo ::= S 'version' Eq ( '"' VersionNum '"' | 128 * "'" VersionNum "'" ) 129 * [25] Eq ::= S? '=' S? 130 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')+ 131 * [80] EncodingDecl ::= S 'encoding' Eq ( '"' EncName '"' | 132 * "'" EncName "'" ) 133 * [81] EncName ::= [a-zA-Z] ([a-zA-Z0-9_.] | '-')* 134 */ 135 private static String examineEncodingDeclaration(byte [] data, 136 String encoding) throws IOException { 137 boolean loop = false; 138 boolean recognized = false; 139 boolean almost = false; 140 boolean question = false; 141 boolean done = false; 142 boolean found = false; 143 int pos = 0; 144 int ch = -1; 145 Reader reader = null; 146 String result = ((encoding != null) ? encoding : UTF_8_ENC); 147 148 reader = new InputStreamReader(new ByteArrayInputStream(data), result); 149 ch = reader.read(); 150 151 // if this is an XML declaration, it will start with the text '<?xml' 152 for (int i = 0; ((i < XML_DECL_START.length()) && (done == false)); i++) { 153 if (ch != XML_DECL_START.charAt(i)) { 154 // This doesn't look like an XML declaration. This method 155 // should only be called if the stream contains an XML 156 // declaration in the encoding that is passed into the method. 157 done = true; 158 break; 159 } 160 // no else required; still matches 161 ch = reader.read(); 162 } 163 164 // there must be at least one whitespace character next. 165 loop = true; 166 while ((loop == true) && (done == false)) { 167 switch (ch) { 168 case SPACE: 169 case TAB: // intentional 170 case LINEFEED: // fall 171 case RETURN: // through 172 ch = reader.read(); 173 break; 174 175 case -1: 176 // unexpected EOF 177 done = true; 178 break; 179 180 default: 181 // non-whitespace 182 loop = false; 183 break; 184 } 185 } 186 187 // now look for the text 'encoding', but if the end of the XML 188 // declaration (signified by the text '?>') comes first, then 189 // assume the encoding is UTF-8 190 loop = true; 191 while ((loop == true) && (done == false)) { 192 if (ch == -1) { 193 // unexpected EOF 194 done = true; 195 break; 196 } else if (recognized == true) { 197 // this is the encoding declaration as long as the next few 198 // characters are whitespace and/or the equals ('=') sign 199 switch (ch) { 200 case SPACE: // intentional 201 case TAB: // fall 202 case LINEFEED: // through 203 case RETURN: 204 // don't need to do anything 205 break; 206 207 case EQUAL: 208 if (almost == false) { 209 // got the equal, now find a quote 210 almost = true; 211 } else { 212 // this is not valid XML, so punt 213 recognized = false; 214 done = true; 215 } 216 break; 217 218 case DOUBLE_QUOTE: // intentional 219 case SINGLE_QUOTE: // fall through 220 if (almost == true) { 221 // got the quote, so move on to get the value 222 loop = false; 223 } else { 224 // got a quote before the equal; this is not valid 225 // XML, so punt 226 recognized = false; 227 done = true; 228 } 229 break; 230 231 default: 232 // non-whitespace 233 recognized = false; 234 if (almost == true) { 235 // this is not valid XML, so punt 236 done = true; 237 } 238 // no else required; this wasn't the encoding 239 // declaration 240 break; 241 } 242 243 if (recognized == false) { 244 // this isn't the encoding declaration, so go back to the 245 // top without reading the next character 246 pos = 0; 247 continue; 248 } 249 // no else required; still looking good 250 } else if (ch == ENCODING_DECL.charAt(pos++)) { 251 if (ENCODING_DECL.length() == pos) { 252 // this looks like the encoding declaration 253 recognized = true; 254 } 255 // no else required; this might be the encoding declaration 256 } else if (ch == '?') { 257 question = true; 258 pos = 0; 259 } else if ((ch == '>') && (question == true)) { 260 // there is no encoding declaration, so assume that the initial 261 // encoding guess was correct 262 done = true; 263 continue; 264 } else { 265 // still searching for the encoding declaration 266 pos = 0; 267 } 268 269 ch = reader.read(); 270 } 271 272 if (done == false) { 273 StringBuilder buffer = new StringBuilder(MAX_ENC_NAME); 274 275 if (((ch >= 'a') && (ch <= 'z')) | 276 ((ch >= 'A') && (ch <= 'Z'))) { 277 // add the character to the result 278 buffer.append((char) ch); 279 280 loop = true; 281 while ((loop == true) && (done == false)) { 282 ch = reader.read(); 283 284 if (((ch >= 'a') && (ch <= 'z')) || 285 ((ch >= 'A') && (ch <= 'Z')) || 286 ((ch >= '0') && (ch <= '9')) || 287 (ch == '_') || (ch == '.') || (ch == '-')) { 288 // add the character to the result 289 buffer.append((char) ch); 290 } else if ((ch == DOUBLE_QUOTE) || (ch == SINGLE_QUOTE)) { 291 // finished! 292 found = true; 293 done = true; 294 result = buffer.toString(); 295 } else { 296 // this is not a valid encoding name, so punt 297 done = true; 298 } 299 } 300 } else { 301 // this is not a valid encoding name, so punt 302 done = true; 303 } 304 } 305 // no else required; already failed to find the encoding somewhere else 306 307 return (result); 308 } 309 310 private static final int BOM_LENGTH = 4; 311 private static final int MAX_ENC_NAME = 512; 312 313 private static final int SPACE = 0x00000020; 314 private static final int TAB = 0x00000009; 315 private static final int LINEFEED = 0x0000000a; 316 private static final int RETURN = 0x0000000d; 317 private static final int EQUAL = '='; 318 private static final int DOUBLE_QUOTE = '\"'; 319 private static final int SINGLE_QUOTE = '\''; 320 321 private static final int UTF_32_BE_BOM = 0x0000feff; 322 private static final int UTF_32_LE_BOM = 0xfffe0000; 323 private static final int UTF_16_BE_BOM = 0xfeff0000; 324 private static final int UTF_16_LE_BOM = 0xfffe0000; 325 private static final int UTF_8_BOM = 0xefbbbf00; 326 private static final int UNUSUAL_OCTET_1 = 0x00003c00; 327 private static final int UNUSUAL_OCTET_2 = 0x003c0000; 328 private static final int UTF_16BE = 0x003c003f; 329 private static final int UTF_16LE = 0x3c003f00; 330 private static final int EBCDIC = 0x4c6fa794; 331 private static final int XML_DECLARATION = 0x3c3f786d; 332 333 private static final String UTF_32_ENC = "UTF-32"; 334 private static final String UTF_16_ENC = "UTF-16"; 335 private static final String UTF_16BE_ENC = "UTF-16BE"; 336 private static final String UTF_16LE_ENC = "UTF-16LE"; 337 private static final String UTF_8_ENC = "UTF-8"; 338 private static final String IBM037_ENC = "IBM037"; 339 340 private static final String XML_DECL_START = "<?xml"; 341 private static final String ENCODING_DECL = "encoding"; 342 }