1907 * Returns the IANA encoding name that is auto-detected from 1908 * the bytes specified, with the endian-ness of that encoding where appropriate. 1909 * 1910 * @param b4 The first four bytes of the input. 1911 * @param count The number of bytes actually read. 1912 * @return a 2-element array: the first element, an IANA-encoding string, 1913 * the second element a Boolean which is true iff the document is big endian, false 1914 * if it's little-endian, and null if the distinction isn't relevant. 1915 */ 1916 protected Object[] getEncodingName(byte[] b4, int count) { 1917 1918 if (count < 2) { 1919 return new Object[]{"UTF-8", null}; 1920 } 1921 1922 // UTF-16, with BOM 1923 int b0 = b4[0] & 0xFF; 1924 int b1 = b4[1] & 0xFF; 1925 if (b0 == 0xFE && b1 == 0xFF) { 1926 // UTF-16, big-endian 1927 return new Object [] {"UTF-16BE", new Boolean(true)}; 1928 } 1929 if (b0 == 0xFF && b1 == 0xFE) { 1930 // UTF-16, little-endian 1931 return new Object [] {"UTF-16LE", new Boolean(false)}; 1932 } 1933 1934 // default to UTF-8 if we don't have enough bytes to make a 1935 // good determination of the encoding 1936 if (count < 3) { 1937 return new Object [] {"UTF-8", null}; 1938 } 1939 1940 // UTF-8 with a BOM 1941 int b2 = b4[2] & 0xFF; 1942 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 1943 return new Object [] {"UTF-8", null}; 1944 } 1945 1946 // default to UTF-8 if we don't have enough bytes to make a 1947 // good determination of the encoding 1948 if (count < 4) { 1949 return new Object [] {"UTF-8", null}; 1950 } 1951 1952 // other encodings 1953 int b3 = b4[3] & 0xFF; 1954 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 1955 // UCS-4, big endian (1234) 1956 return new Object [] {"ISO-10646-UCS-4", new Boolean(true)}; 1957 } 1958 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 1959 // UCS-4, little endian (4321) 1960 return new Object [] {"ISO-10646-UCS-4", new Boolean(false)}; 1961 } 1962 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 1963 // UCS-4, unusual octet order (2143) 1964 // REVISIT: What should this be? 1965 return new Object [] {"ISO-10646-UCS-4", null}; 1966 } 1967 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 1968 // UCS-4, unusual octect order (3412) 1969 // REVISIT: What should this be? 1970 return new Object [] {"ISO-10646-UCS-4", null}; 1971 } 1972 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 1973 // UTF-16, big-endian, no BOM 1974 // (or could turn out to be UCS-2... 1975 // REVISIT: What should this be? 1976 return new Object [] {"UTF-16BE", new Boolean(true)}; 1977 } 1978 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 1979 // UTF-16, little-endian, no BOM 1980 // (or could turn out to be UCS-2... 1981 return new Object [] {"UTF-16LE", new Boolean(false)}; 1982 } 1983 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 1984 // EBCDIC 1985 // a la xerces1, return CP037 instead of EBCDIC here 1986 return new Object [] {"CP037", null}; 1987 } 1988 1989 // default encoding 1990 return new Object [] {"UTF-8", null}; 1991 1992 } // getEncodingName(byte[],int):Object[] 1993 1994 /** 1995 * xxx not removing endEntity() so that i remember that we need to implement it. 1996 * Ends an entity. 1997 * 1998 * @throws XNIException Thrown by entity handler to signal an error. 1999 */ 2000 // 2001 /** Prints the contents of the buffer. */ | 1907 * Returns the IANA encoding name that is auto-detected from 1908 * the bytes specified, with the endian-ness of that encoding where appropriate. 1909 * 1910 * @param b4 The first four bytes of the input. 1911 * @param count The number of bytes actually read. 1912 * @return a 2-element array: the first element, an IANA-encoding string, 1913 * the second element a Boolean which is true iff the document is big endian, false 1914 * if it's little-endian, and null if the distinction isn't relevant. 1915 */ 1916 protected Object[] getEncodingName(byte[] b4, int count) { 1917 1918 if (count < 2) { 1919 return new Object[]{"UTF-8", null}; 1920 } 1921 1922 // UTF-16, with BOM 1923 int b0 = b4[0] & 0xFF; 1924 int b1 = b4[1] & 0xFF; 1925 if (b0 == 0xFE && b1 == 0xFF) { 1926 // UTF-16, big-endian 1927 return new Object [] {"UTF-16BE", Boolean.TRUE}; 1928 } 1929 if (b0 == 0xFF && b1 == 0xFE) { 1930 // UTF-16, little-endian 1931 return new Object [] {"UTF-16LE", Boolean.FALSE}; 1932 } 1933 1934 // default to UTF-8 if we don't have enough bytes to make a 1935 // good determination of the encoding 1936 if (count < 3) { 1937 return new Object [] {"UTF-8", null}; 1938 } 1939 1940 // UTF-8 with a BOM 1941 int b2 = b4[2] & 0xFF; 1942 if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { 1943 return new Object [] {"UTF-8", null}; 1944 } 1945 1946 // default to UTF-8 if we don't have enough bytes to make a 1947 // good determination of the encoding 1948 if (count < 4) { 1949 return new Object [] {"UTF-8", null}; 1950 } 1951 1952 // other encodings 1953 int b3 = b4[3] & 0xFF; 1954 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { 1955 // UCS-4, big endian (1234) 1956 return new Object [] {"ISO-10646-UCS-4", Boolean.TRUE}; 1957 } 1958 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { 1959 // UCS-4, little endian (4321) 1960 return new Object [] {"ISO-10646-UCS-4", Boolean.FALSE}; 1961 } 1962 if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { 1963 // UCS-4, unusual octet order (2143) 1964 // REVISIT: What should this be? 1965 return new Object [] {"ISO-10646-UCS-4", null}; 1966 } 1967 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { 1968 // UCS-4, unusual octect order (3412) 1969 // REVISIT: What should this be? 1970 return new Object [] {"ISO-10646-UCS-4", null}; 1971 } 1972 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 1973 // UTF-16, big-endian, no BOM 1974 // (or could turn out to be UCS-2... 1975 // REVISIT: What should this be? 1976 return new Object [] {"UTF-16BE", Boolean.TRUE}; 1977 } 1978 if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 1979 // UTF-16, little-endian, no BOM 1980 // (or could turn out to be UCS-2... 1981 return new Object [] {"UTF-16LE", Boolean.FALSE}; 1982 } 1983 if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { 1984 // EBCDIC 1985 // a la xerces1, return CP037 instead of EBCDIC here 1986 return new Object [] {"CP037", null}; 1987 } 1988 1989 // default encoding 1990 return new Object [] {"UTF-8", null}; 1991 1992 } // getEncodingName(byte[],int):Object[] 1993 1994 /** 1995 * xxx not removing endEntity() so that i remember that we need to implement it. 1996 * Ends an entity. 1997 * 1998 * @throws XNIException Thrown by entity handler to signal an error. 1999 */ 2000 // 2001 /** Prints the contents of the buffer. */ |