< prev index next >

src/java.xml/share/classes/com/sun/org/apache/xerces/internal/impl/XMLEntityScanner.java

Print this page




1907      * Returns the IANA encoding name that is auto-detected from
1908      * the bytes specified, with the endian-ness of that encoding where appropriate.
1909      *
1910      * @param b4    The first four bytes of the input.
1911      * @param count The number of bytes actually read.
1912      * @return a 2-element array:  the first element, an IANA-encoding string,
1913      *  the second element a Boolean which is true iff the document is big endian, false
1914      *  if it's little-endian, and null if the distinction isn't relevant.
1915      */
1916     protected Object[] getEncodingName(byte[] b4, int count) {
1917 
1918         if (count < 2) {
1919             return new Object[]{"UTF-8", null};
1920         }
1921 
1922         // UTF-16, with BOM
1923         int b0 = b4[0] & 0xFF;
1924         int b1 = b4[1] & 0xFF;
1925         if (b0 == 0xFE && b1 == 0xFF) {
1926             // UTF-16, big-endian
1927             return new Object [] {"UTF-16BE", new Boolean(true)};
1928         }
1929         if (b0 == 0xFF && b1 == 0xFE) {
1930             // UTF-16, little-endian
1931             return new Object [] {"UTF-16LE", new Boolean(false)};
1932         }
1933 
1934         // default to UTF-8 if we don't have enough bytes to make a
1935         // good determination of the encoding
1936         if (count < 3) {
1937             return new Object [] {"UTF-8", null};
1938         }
1939 
1940         // UTF-8 with a BOM
1941         int b2 = b4[2] & 0xFF;
1942         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
1943             return new Object [] {"UTF-8", null};
1944         }
1945 
1946         // default to UTF-8 if we don't have enough bytes to make a
1947         // good determination of the encoding
1948         if (count < 4) {
1949             return new Object [] {"UTF-8", null};
1950         }
1951 
1952         // other encodings
1953         int b3 = b4[3] & 0xFF;
1954         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
1955             // UCS-4, big endian (1234)
1956             return new Object [] {"ISO-10646-UCS-4", new Boolean(true)};
1957         }
1958         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
1959             // UCS-4, little endian (4321)
1960             return new Object [] {"ISO-10646-UCS-4", new Boolean(false)};
1961         }
1962         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
1963             // UCS-4, unusual octet order (2143)
1964             // REVISIT: What should this be?
1965             return new Object [] {"ISO-10646-UCS-4", null};
1966         }
1967         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
1968             // UCS-4, unusual octect order (3412)
1969             // REVISIT: What should this be?
1970             return new Object [] {"ISO-10646-UCS-4", null};
1971         }
1972         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
1973             // UTF-16, big-endian, no BOM
1974             // (or could turn out to be UCS-2...
1975             // REVISIT: What should this be?
1976             return new Object [] {"UTF-16BE", new Boolean(true)};
1977         }
1978         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
1979             // UTF-16, little-endian, no BOM
1980             // (or could turn out to be UCS-2...
1981             return new Object [] {"UTF-16LE", new Boolean(false)};
1982         }
1983         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
1984             // EBCDIC
1985             // a la xerces1, return CP037 instead of EBCDIC here
1986             return new Object [] {"CP037", null};
1987         }
1988 
1989         // default encoding
1990         return new Object [] {"UTF-8", null};
1991 
1992     } // getEncodingName(byte[],int):Object[]
1993 
1994     /**
1995      * XXX: endEntity() is intentionally not removed, as a reminder that it still needs to be implemented.
1996      * Ends an entity.
1997      *
1998      * @throws XNIException Thrown by entity handler to signal an error.
1999      */
2000     //
2001     /** Prints the contents of the buffer. */




1907      * Returns the IANA encoding name that is auto-detected from
1908      * the bytes specified, with the endian-ness of that encoding where appropriate.
1909      *
1910      * @param b4    The first four bytes of the input.
1911      * @param count The number of bytes actually read.
1912      * @return a 2-element array:  the first element, an IANA-encoding string,
1913      *  the second element a Boolean which is true iff the document is big endian, false
1914      *  if it's little-endian, and null if the distinction isn't relevant.
1915      */
1916     protected Object[] getEncodingName(byte[] b4, int count) {
1917 
1918         if (count < 2) {
1919             return new Object[]{"UTF-8", null};
1920         }
1921 
1922         // UTF-16, with BOM
1923         int b0 = b4[0] & 0xFF;
1924         int b1 = b4[1] & 0xFF;
1925         if (b0 == 0xFE && b1 == 0xFF) {
1926             // UTF-16, big-endian
1927             return new Object [] {"UTF-16BE", Boolean.TRUE};
1928         }
1929         if (b0 == 0xFF && b1 == 0xFE) {
1930             // UTF-16, little-endian
1931             return new Object [] {"UTF-16LE", Boolean.FALSE};
1932         }
1933 
1934         // default to UTF-8 if we don't have enough bytes to make a
1935         // good determination of the encoding
1936         if (count < 3) {
1937             return new Object [] {"UTF-8", null};
1938         }
1939 
1940         // UTF-8 with a BOM
1941         int b2 = b4[2] & 0xFF;
1942         if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
1943             return new Object [] {"UTF-8", null};
1944         }
1945 
1946         // default to UTF-8 if we don't have enough bytes to make a
1947         // good determination of the encoding
1948         if (count < 4) {
1949             return new Object [] {"UTF-8", null};
1950         }
1951 
1952         // other encodings
1953         int b3 = b4[3] & 0xFF;
1954         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
1955             // UCS-4, big endian (1234)
1956             return new Object [] {"ISO-10646-UCS-4", Boolean.TRUE};
1957         }
1958         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
1959             // UCS-4, little endian (4321)
1960             return new Object [] {"ISO-10646-UCS-4", Boolean.FALSE};
1961         }
1962         if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
1963             // UCS-4, unusual octet order (2143)
1964             // REVISIT: What should this be?
1965             return new Object [] {"ISO-10646-UCS-4", null};
1966         }
1967         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
1968             // UCS-4, unusual octect order (3412)
1969             // REVISIT: What should this be?
1970             return new Object [] {"ISO-10646-UCS-4", null};
1971         }
1972         if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
1973             // UTF-16, big-endian, no BOM
1974             // (or could turn out to be UCS-2...
1975             // REVISIT: What should this be?
1976             return new Object [] {"UTF-16BE", Boolean.TRUE};
1977         }
1978         if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
1979             // UTF-16, little-endian, no BOM
1980             // (or could turn out to be UCS-2...
1981             return new Object [] {"UTF-16LE", Boolean.FALSE};
1982         }
1983         if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
1984             // EBCDIC
1985             // a la xerces1, return CP037 instead of EBCDIC here
1986             return new Object [] {"CP037", null};
1987         }
1988 
1989         // default encoding
1990         return new Object [] {"UTF-8", null};
1991 
1992     } // getEncodingName(byte[],int):Object[]
1993 
1994     /**
1995      * XXX: endEntity() is intentionally not removed, as a reminder that it still needs to be implemented.
1996      * Ends an entity.
1997      *
1998      * @throws XNIException Thrown by entity handler to signal an error.
1999      */
2000     //
2001     /** Prints the contents of the buffer. */


< prev index next >