< prev index next >

src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLEntityManager.java

Print this page




2287      * Returns the IANA encoding name that is auto-detected from
2288      * the bytes specified, with the endian-ness of that encoding where appropriate.
2289      *
2290      * @param b4    The first four bytes of the input.
2291      * @param count The number of bytes actually read.
2292      * @return a 2-element array:  the first element, an IANA-encoding string,
2293      *  the second element a Boolean which is true iff the document is big endian, false
2294      *  if it's little-endian, and null if the distinction isn't relevant.
2295      */
2296     protected Object[] getEncodingName(byte[] b4, int count) {
2297 
2298         if (count < 2) {
2299             return defaultEncoding;
2300         }
2301 
2302         // UTF-16, with BOM
2303         int b0 = b4[0] & 0xFF;
2304         int b1 = b4[1] & 0xFF;
2305         if (b0 == 0xFE && b1 == 0xFF) {
2306             // UTF-16, big-endian
2307             return new Object [] {"UTF-16BE", new Boolean(true)};
2308         }
2309         if (b0 == 0xFF && b1 == 0xFE) {
2310             // UTF-16, little-endian
2311             return new Object [] {"UTF-16LE", new Boolean(false)};
2312         }
2313 
2314         // default to UTF-8 if we don't have enough bytes to make a
2315         // good determination of the encoding
2316         if (count < 3) {
2317             return defaultEncoding;
2318         }
2319 
2320         // UTF-8 with a BOM
2321         int b2 = b4[2] & 0xFF;
2322         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
2323             return defaultEncoding;
2324         }
2325 
2326         // default to UTF-8 if we don't have enough bytes to make a
2327         // good determination of the encoding
2328         if (count < 4) {
2329             return defaultEncoding;
2330         }
2331 
2332         // other encodings
2333         int b3 = b4[3] & 0xFF;
2334         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
2335             // UCS-4, big endian (1234)
2336             return new Object [] {"ISO-10646-UCS-4", new Boolean(true)};
2337         }
2338         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
2339             // UCS-4, little endian (4321)
2340             return new Object [] {"ISO-10646-UCS-4", new Boolean(false)};
2341         }
2342         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
2343             // UCS-4, unusual octet order (2143)
2344             // REVISIT: What should this be?
2345             return new Object [] {"ISO-10646-UCS-4", null};
2346         }
2347         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
2348             // UCS-4, unusual octect order (3412)
2349             // REVISIT: What should this be?
2350             return new Object [] {"ISO-10646-UCS-4", null};
2351         }
2352         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
2353             // UTF-16, big-endian, no BOM
2354             // (or could turn out to be UCS-2...
2355             // REVISIT: What should this be?
2356             return new Object [] {"UTF-16BE", new Boolean(true)};
2357         }
2358         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
2359             // UTF-16, little-endian, no BOM
2360             // (or could turn out to be UCS-2...
2361             return new Object [] {"UTF-16LE", new Boolean(false)};
2362         }
2363         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
2364             // EBCDIC
2365             // a la xerces1, return CP037 instead of EBCDIC here
2366             return new Object [] {"CP037", null};
2367         }
2368 
2369         return defaultEncoding;
2370 
2371     } // getEncodingName(byte[],int):Object[]
2372 
2373     /**
2374      * Creates a reader capable of reading the given input stream in
2375      * the specified encoding.
2376      *
2377      * @param inputStream  The input stream.
2378      * @param encoding     The encoding name that the input stream is
2379      *                     encoded using. If the user has specified that
2380      *                     Java encoding names are allowed, then the
2381      *                     encoding name may be a Java encoding name;




2287      * Returns the IANA encoding name that is auto-detected from
2288      * the bytes specified, with the endian-ness of that encoding where appropriate.
2289      *
2290      * @param b4    The first four bytes of the input.
2291      * @param count The number of bytes actually read.
2292      * @return a 2-element array:  the first element, an IANA-encoding string,
2293      *  the second element a Boolean which is true iff the document is big endian, false
2294      *  if it's little-endian, and null if the distinction isn't relevant.
2295      */
2296     protected Object[] getEncodingName(byte[] b4, int count) {
2297 
2298         if (count < 2) {
2299             return defaultEncoding;
2300         }
2301 
2302         // UTF-16, with BOM
2303         int b0 = b4[0] & 0xFF;
2304         int b1 = b4[1] & 0xFF;
2305         if (b0 == 0xFE && b1 == 0xFF) {
2306             // UTF-16, big-endian
2307             return new Object [] {"UTF-16BE", Boolean.TRUE};
2308         }
2309         if (b0 == 0xFF && b1 == 0xFE) {
2310             // UTF-16, little-endian
2311             return new Object [] {"UTF-16LE", Boolean.FALSE};
2312         }
2313 
2314         // default to UTF-8 if we don't have enough bytes to make a
2315         // good determination of the encoding
2316         if (count < 3) {
2317             return defaultEncoding;
2318         }
2319 
2320         // UTF-8 with a BOM
2321         int b2 = b4[2] & 0xFF;
2322         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
2323             return defaultEncoding;
2324         }
2325 
2326         // default to UTF-8 if we don't have enough bytes to make a
2327         // good determination of the encoding
2328         if (count < 4) {
2329             return defaultEncoding;
2330         }
2331 
2332         // other encodings
2333         int b3 = b4[3] & 0xFF;
2334         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
2335             // UCS-4, big endian (1234)
2336             return new Object [] {"ISO-10646-UCS-4", Boolean.TRUE};
2337         }
2338         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
2339             // UCS-4, little endian (4321)
2340             return new Object [] {"ISO-10646-UCS-4", Boolean.FALSE};
2341         }
2342         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
2343             // UCS-4, unusual octet order (2143)
2344             // REVISIT: What should this be?
2345             return new Object [] {"ISO-10646-UCS-4", null};
2346         }
2347         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
2348             // UCS-4, unusual octect order (3412)
2349             // REVISIT: What should this be?
2350             return new Object [] {"ISO-10646-UCS-4", null};
2351         }
2352         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
2353             // UTF-16, big-endian, no BOM
2354             // (or could turn out to be UCS-2...
2355             // REVISIT: What should this be?
2356             return new Object [] {"UTF-16BE", Boolean.TRUE};
2357         }
2358         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
2359             // UTF-16, little-endian, no BOM
2360             // (or could turn out to be UCS-2...
2361             return new Object [] {"UTF-16LE", Boolean.FALSE};
2362         }
2363         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
2364             // EBCDIC
2365             // a la xerces1, return CP037 instead of EBCDIC here
2366             return new Object [] {"CP037", null};
2367         }
2368 
2369         return defaultEncoding;
2370 
2371     } // getEncodingName(byte[],int):Object[]
2372 
2373     /**
2374      * Creates a reader capable of reading the given input stream in
2375      * the specified encoding.
2376      *
2377      * @param inputStream  The input stream.
2378      * @param encoding     The encoding name that the input stream is
2379      *                     encoded using. If the user has specified that
2380      *                     Java encoding names are allowed, then the
2381      *                     encoding name may be a Java encoding name;


< prev index next >