2287 * Returns the IANA encoding name that is auto-detected from 2288 * the bytes specified, with the endian-ness of that encoding where appropriate. 2289 * 2290 * @param b4 The first four bytes of the input. 2291 * @param count The number of bytes actually read. 2292 * @return a 2-element array: the first element, an IANA-encoding string, 2293 * the second element a Boolean which is true iff the document is big endian, false 2294 * if it's little-endian, and null if the distinction isn't relevant. 2295 */ 2296 protected Object[] getEncodingName(byte[] b4, int count) { 2297 2298 if (count < 2) { 2299 return defaultEncoding; 2300 } 2301 2302 // UTF-16, with BOM 2303 int b0 = b4[0] & 0xFF; 2304 int b1 = b4[1] & 0xFF; 2305 if (b0 == 0xFE && b1 == 0xFF) { 2306 // UTF-16, big-endian 2307 return new Object [] {"UTF-16BE", new Boolean(true)}; 2308 } 2309 if (b0 == 0xFF && b1 == 0xFE) { 2310 // UTF-16, little-endian 2311 return new Object [] {"UTF-16LE", new Boolean(false)}; 2312 } 2313 2314 // default to UTF-8 if we don't have enough bytes to make a 2315 // good determination of the encoding 2316 if (count < 3) { 2317 return defaultEncoding; 2318 } 2319 2320 // UTF-8 with a BOM 2321 int b2 = b4[2] & 0xFF; 2322 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 2323 return defaultEncoding; 2324 } 2325 2326 // default to UTF-8 if we don't have enough bytes to make a 2327 // good determination of the encoding 2328 if (count < 4) { 2329 return defaultEncoding; 2330 } 2331 2332 // other encodings 2333 int b3 = b4[3] & 0xFF; 2334 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 2335 // UCS-4, big endian (1234) 2336 return new Object [] {"ISO-10646-UCS-4", new Boolean(true)}; 2337 } 2338 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 2339 // UCS-4, little endian (4321) 2340 return new Object [] {"ISO-10646-UCS-4", new Boolean(false)}; 2341 } 2342 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 2343 // UCS-4, unusual octet order (2143) 2344 // REVISIT: What should this be? 2345 return new Object [] {"ISO-10646-UCS-4", null}; 2346 } 2347 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 2348 // UCS-4, unusual octect order (3412) 2349 // REVISIT: What should this be? 2350 return new Object [] {"ISO-10646-UCS-4", null}; 2351 } 2352 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 2353 // UTF-16, big-endian, no BOM 2354 // (or could turn out to be UCS-2... 2355 // REVISIT: What should this be? 2356 return new Object [] {"UTF-16BE", new Boolean(true)}; 2357 } 2358 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 2359 // UTF-16, little-endian, no BOM 2360 // (or could turn out to be UCS-2... 2361 return new Object [] {"UTF-16LE", new Boolean(false)}; 2362 } 2363 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 2364 // EBCDIC 2365 // a la xerces1, return CP037 instead of EBCDIC here 2366 return new Object [] {"CP037", null}; 2367 } 2368 2369 return defaultEncoding; 2370 2371 } // getEncodingName(byte[],int):Object[] 2372 2373 /** 2374 * Creates a reader capable of reading the given input stream in 2375 * the specified encoding. 2376 * 2377 * @param inputStream The input stream. 2378 * @param encoding The encoding name that the input stream is 2379 * encoded using. If the user has specified that 2380 * Java encoding names are allowed, then the 2381 * encoding name may be a Java encoding name; | 2287 * Returns the IANA encoding name that is auto-detected from 2288 * the bytes specified, with the endian-ness of that encoding where appropriate. 2289 * 2290 * @param b4 The first four bytes of the input. 2291 * @param count The number of bytes actually read. 2292 * @return a 2-element array: the first element, an IANA-encoding string, 2293 * the second element a Boolean which is true iff the document is big endian, false 2294 * if it's little-endian, and null if the distinction isn't relevant. 2295 */ 2296 protected Object[] getEncodingName(byte[] b4, int count) { 2297 2298 if (count < 2) { 2299 return defaultEncoding; 2300 } 2301 2302 // UTF-16, with BOM 2303 int b0 = b4[0] & 0xFF; 2304 int b1 = b4[1] & 0xFF; 2305 if (b0 == 0xFE && b1 == 0xFF) { 2306 // UTF-16, big-endian 2307 return new Object [] {"UTF-16BE", Boolean.TRUE}; 2308 } 2309 if (b0 == 0xFF && b1 == 0xFE) { 2310 // UTF-16, little-endian 2311 return new Object [] {"UTF-16LE", Boolean.FALSE}; 2312 } 2313 2314 // default to UTF-8 if we don't have enough bytes to make a 2315 // good determination of the encoding 2316 if (count < 3) { 2317 return defaultEncoding; 2318 } 2319 2320 // UTF-8 with a BOM 2321 int b2 = b4[2] & 0xFF; 2322 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 2323 return defaultEncoding; 2324 } 2325 2326 // default to UTF-8 if we don't have enough bytes to make a 2327 // good determination of the encoding 2328 if (count < 4) { 2329 return defaultEncoding; 2330 } 2331 2332 // other encodings 2333 int b3 = b4[3] & 0xFF; 2334 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 2335 // UCS-4, big endian (1234) 2336 return new Object [] {"ISO-10646-UCS-4", Boolean.TRUE}; 2337 } 2338 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 2339 // UCS-4, little endian (4321) 2340 return new Object [] {"ISO-10646-UCS-4", Boolean.FALSE}; 2341 } 2342 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 2343 // UCS-4, unusual octet order (2143) 2344 // REVISIT: What should this be? 2345 return new Object [] {"ISO-10646-UCS-4", null}; 2346 } 2347 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 2348 // UCS-4, unusual octect order (3412) 2349 // REVISIT: What should this be? 2350 return new Object [] {"ISO-10646-UCS-4", null}; 2351 } 2352 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 2353 // UTF-16, big-endian, no BOM 2354 // (or could turn out to be UCS-2... 2355 // REVISIT: What should this be? 2356 return new Object [] {"UTF-16BE", Boolean.TRUE}; 2357 } 2358 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 2359 // UTF-16, little-endian, no BOM 2360 // (or could turn out to be UCS-2... 2361 return new Object [] {"UTF-16LE", Boolean.FALSE}; 2362 } 2363 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 2364 // EBCDIC 2365 // a la xerces1, return CP037 instead of EBCDIC here 2366 return new Object [] {"CP037", null}; 2367 } 2368 2369 return defaultEncoding; 2370 2371 } // getEncodingName(byte[],int):Object[] 2372 2373 /** 2374 * Creates a reader capable of reading the given input stream in 2375 * the specified encoding. 2376 * 2377 * @param inputStream The input stream. 2378 * @param encoding The encoding name that the input stream is 2379 * encoded using. If the user has specified that 2380 * Java encoding names are allowed, then the 2381 * encoding name may be a Java encoding name; |