2843 } 2844 mIsSAloneSet = false; 2845 if (is.getCharacterStream() != null) { 2846 // Ignore encoding in the xml text decl. 2847 reader = is.getCharacterStream(); 2848 xml(reader); 2849 } else if (is.getByteStream() != null) { 2850 String expenc; 2851 if (is.getEncoding() != null) { 2852 // Ignore encoding in the xml text decl. 2853 expenc = is.getEncoding().toUpperCase(); 2854 if (expenc.equals("UTF-16")) { 2855 reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] 2856 } else { 2857 reader = enc(expenc, is.getByteStream()); 2858 } 2859 xml(reader); 2860 } else { 2861 // Get encoding from BOM or the xml text decl. 2862 reader = bom(is.getByteStream(), ' '); 2863 if (reader == null) { 2864 // Encoding is defined by the xml text decl. 2865 reader = enc("UTF-8", is.getByteStream()); 2866 expenc = xml(reader); 2867 if (expenc.startsWith("UTF-16")) { 2868 panic(FAULT); // UTF-16 must have BOM [#4.3.3] 2869 } 2870 reader = enc(expenc, is.getByteStream()); 2871 } else { 2872 // Encoding is defined by the BOM. 2873 xml(reader); 2874 } 2875 } 2876 } else { 2877 // There is no support for public/system identifiers. 2878 panic(FAULT); 2879 } 2880 mInp.src = reader; 2881 mInp.pubid = is.getPublicId(); 2882 mInp.sysid = is.getSystemId(); 2883 } 2884 2885 /** 2886 * Determines the entity encoding. 2887 * 2888 * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the 2889 * first byte returned by the entity's byte stream has to be the first byte 2890 * in the entity. Also, there is no support for UCS-4. 2939 case 0xd0: 2940 mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); 2941 break; 2942 2943 case 0xe0: 2944 mChars[mChIdx++] = (char) (((val & 0x0f) << 12) 2945 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); 2946 break; 2947 2948 case 0xf0: // UCS-4 character 2949 throw new UnsupportedEncodingException(); 2950 2951 default: 2952 mChars[mChIdx++] = (char) val; 2953 break; 2954 } 2955 return null; 2956 } 2957 } 2958 2959 /** 2960 * Parses the xml text declaration. 2961 * 2962 * This method gets encoding from the xml text declaration [#4.3.1] if any. 2963 * The method assumes the buffer (mChars) is big enough to accommodate whole 2964 * xml text declaration. 2965 * 2966 * @param reader is entity reader. 2967 * @return The xml text declaration encoding or default UTF-8 encoding. 2968 * @exception Exception is parser specific exception form panic method. 2969 * @exception IOException 2970 */ 2971 private String xml(Reader reader) 2972 throws Exception { 2973 String str = null; 2974 String enc = "UTF-8"; 2975 char ch; 2976 int val; 2977 short st; 2978 // Read the xml text declaration into the buffer 2979 if (mChIdx != 0) { 2980 // The bom method have read ONE char into the buffer. 2981 st = (short) ((mChars[0] == '<') ? 1 : -1); 2982 } else { 2983 st = 0; 2984 } 2985 while (st >= 0 && mChIdx < mChars.length) { 2986 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 2987 mChars[mChIdx++] = ch; 2988 switch (st) { 2989 case 0: // read '<' of xml declaration 2990 switch (ch) { 2991 case '<': 2992 st = 1; 2993 break; 2994 2995 case 0xfeff: // the byte order mask 2996 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 2997 mChars[mChIdx - 1] = ch; 2998 st = (short) ((ch == '<') ? 1 : -1); 2999 break; 3000 3001 default: 3002 st = -1; 3003 break; 3004 } 3005 break; 3006 3007 case 1: // read '?' of xml declaration [#4.3.1] | 2843 } 2844 mIsSAloneSet = false; 2845 if (is.getCharacterStream() != null) { 2846 // Ignore encoding in the xml text decl. 2847 reader = is.getCharacterStream(); 2848 xml(reader); 2849 } else if (is.getByteStream() != null) { 2850 String expenc; 2851 if (is.getEncoding() != null) { 2852 // Ignore encoding in the xml text decl. 2853 expenc = is.getEncoding().toUpperCase(); 2854 if (expenc.equals("UTF-16")) { 2855 reader = bom(is.getByteStream(), 'U'); // UTF-16 [#4.3.3] 2856 } else { 2857 reader = enc(expenc, is.getByteStream()); 2858 } 2859 xml(reader); 2860 } else { 2861 // Get encoding from BOM or the xml text decl. 2862 reader = bom(is.getByteStream(), ' '); 2863 /** 2864 * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon 2865 * that it may be missing. A mature technique exists in Xerces 2866 * to further check for possible UTF-16 encoding 2867 */ 2868 if (reader == null) { 2869 reader = utf16(is.getByteStream()); 2870 } 2871 2872 if (reader == null) { 2873 // Encoding is defined by the xml text decl. 2874 reader = enc("UTF-8", is.getByteStream()); 2875 expenc = xml(reader); 2876 if (!expenc.equals("UTF-8")) { 2877 if (expenc.startsWith("UTF-16")) { 2878 panic(FAULT); // UTF-16 must have BOM [#4.3.3] 2879 } 2880 reader = enc(expenc, is.getByteStream()); 2881 } 2882 } else { 2883 // Encoding is defined by the BOM. 2884 xml(reader); 2885 } 2886 } 2887 } else { 2888 // There is no support for public/system identifiers. 2889 panic(FAULT); 2890 } 2891 mInp.src = reader; 2892 mInp.pubid = is.getPublicId(); 2893 mInp.sysid = is.getSystemId(); 2894 } 2895 2896 /** 2897 * Determines the entity encoding. 2898 * 2899 * This method gets encoding from Byte Order Mask [#4.3.3] if any. Note, the 2900 * first byte returned by the entity's byte stream has to be the first byte 2901 * in the entity. Also, there is no support for UCS-4. 2950 case 0xd0: 2951 mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f)); 2952 break; 2953 2954 case 0xe0: 2955 mChars[mChIdx++] = (char) (((val & 0x0f) << 12) 2956 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f)); 2957 break; 2958 2959 case 0xf0: // UCS-4 character 2960 throw new UnsupportedEncodingException(); 2961 2962 default: 2963 mChars[mChIdx++] = (char) val; 2964 break; 2965 } 2966 return null; 2967 } 2968 } 2969 2970 2971 /** 2972 * Using a mature technique from Xerces, this method checks further after 2973 * the bom method above to see if the encoding is UTF-16 2974 * 2975 * @param is A byte stream of the entity. 2976 * @return a reader, may be null 2977 * @exception Exception is parser specific exception form panic method. 2978 * @exception IOException 2979 */ 2980 private Reader utf16(InputStream is) 2981 throws Exception { 2982 if (mChIdx != 0) { 2983 //The bom method has read ONE byte into the buffer. 2984 byte b0 = (byte)mChars[0]; 2985 if (b0 == 0x00 || b0 == 0x3C) { 2986 int b1 = is.read(); 2987 int b2 = is.read(); 2988 int b3 = is.read(); 2989 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { 2990 // UTF-16, big-endian, no BOM 2991 mChars[0] = (char)(b1); 2992 mChars[mChIdx++] = (char)(b3); 2993 return new ReaderUTF16(is, 'b'); 2994 } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { 2995 // UTF-16, little-endian, no BOM 2996 mChars[0] = (char)(b0); 2997 mChars[mChIdx++] = (char)(b2); 2998 return new ReaderUTF16(is, 'l'); 2999 } else { 3000 /**not every InputStream supports reset, so we have to remember 3001 * the state for further parsing 3002 **/ 3003 mChars[0] = (char)(b0); 3004 mChars[mChIdx++] = (char)(b1); 3005 mChars[mChIdx++] = (char)(b2); 3006 mChars[mChIdx++] = (char)(b3); 3007 } 3008 3009 } 3010 } 3011 return null; 3012 } 3013 /** 3014 * Parses the xml text declaration. 3015 * 3016 * This method gets encoding from the xml text declaration [#4.3.1] if any. 3017 * The method assumes the buffer (mChars) is big enough to accommodate whole 3018 * xml text declaration. 3019 * 3020 * @param reader is entity reader. 3021 * @return The xml text declaration encoding or default UTF-8 encoding. 3022 * @exception Exception is parser specific exception form panic method. 3023 * @exception IOException 3024 */ 3025 private String xml(Reader reader) 3026 throws Exception { 3027 String str = null; 3028 String enc = "UTF-8"; 3029 char ch; 3030 int val; 3031 short st = 0; 3032 int byteRead = mChIdx; //number of bytes read prior to entering this method 3033 3034 while (st >= 0 && mChIdx < mChars.length) { 3035 if (st < byteRead) { 3036 ch = mChars[st]; 3037 } else { 3038 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 3039 mChars[mChIdx++] = ch; 3040 } 3041 3042 switch (st) { 3043 case 0: // read '<' of xml declaration 3044 switch (ch) { 3045 case '<': 3046 st = 1; 3047 break; 3048 3049 case 0xfeff: // the byte order mask 3050 ch = ((val = reader.read()) >= 0) ? (char) val : EOS; 3051 mChars[mChIdx - 1] = ch; 3052 st = (short) ((ch == '<') ? 1 : -1); 3053 break; 3054 3055 default: 3056 st = -1; 3057 break; 3058 } 3059 break; 3060 3061 case 1: // read '?' of xml declaration [#4.3.1] |