src/share/classes/jdk/internal/util/xml/impl/Parser.java

Print this page




2843         }
2844         mIsSAloneSet = false;
2845         if (is.getCharacterStream() != null) {
2846             //          Ignore encoding in the xml text decl.
2847             reader = is.getCharacterStream();
2848             xml(reader);
2849         } else if (is.getByteStream() != null) {
2850             String expenc;
2851             if (is.getEncoding() != null) {
2852                 //              Ignore encoding in the xml text decl.
2853                 expenc = is.getEncoding().toUpperCase();
2854                 if (expenc.equals("UTF-16")) {
2855                     reader = bom(is.getByteStream(), 'U');  // UTF-16 [#4.3.3]
2856                 } else {
2857                     reader = enc(expenc, is.getByteStream());
2858                 }
2859                 xml(reader);
2860             } else {
2861                 //              Get encoding from BOM or the xml text decl.
2862                 reader = bom(is.getByteStream(), ' ');





2863                 if (reader == null) {




2864                     //          Encoding is defined by the xml text decl.
2865                     reader = enc("UTF-8", is.getByteStream());
2866                     expenc = xml(reader);

2867                     if (expenc.startsWith("UTF-16")) {
2868                         panic(FAULT);  // UTF-16 must have BOM [#4.3.3]
2869                     }
2870                     reader = enc(expenc, is.getByteStream());

2871                 } else {
2872                     //          Encoding is defined by the BOM.
2873                     xml(reader);
2874                 }
2875             }
2876         } else {
2877             //          There is no support for public/system identifiers.
2878             panic(FAULT);
2879         }
2880         mInp.src = reader;
2881         mInp.pubid = is.getPublicId();
2882         mInp.sysid = is.getSystemId();
2883     }
2884 
2885     /**
2886      * Determines the entity encoding.
2887      *
2888      * This method gets encoding from Byte Order Mark [#4.3.3] if any. Note, the
2889      * first byte returned by the entity's byte stream has to be the first byte
2890      * in the entity. Also, there is no support for UCS-4.


2939                     case 0xd0:
2940                         mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f));
2941                         break;
2942 
2943                     case 0xe0:
2944                         mChars[mChIdx++] = (char) (((val & 0x0f) << 12)
2945                                 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f));
2946                         break;
2947 
2948                     case 0xf0:  // UCS-4 character
2949                         throw new UnsupportedEncodingException();
2950 
2951                     default:
2952                         mChars[mChIdx++] = (char) val;
2953                         break;
2954                 }
2955                 return null;
2956         }
2957     }
2958 

2959     /**










































2960      * Parses the xml text declaration.
2961      *
2962      * This method gets encoding from the xml text declaration [#4.3.1] if any.
2963      * The method assumes the buffer (mChars) is big enough to accommodate whole
2964      * xml text declaration.
2965      *
2966      * @param reader is entity reader.
2967      * @return The xml text declaration encoding or default UTF-8 encoding.
2968      * @exception Exception is parser specific exception from panic method.
2969      * @exception IOException
2970      */
2971     private String xml(Reader reader)
2972             throws Exception {
2973         String str = null;
2974         String enc = "UTF-8";
2975         char ch;
2976         int val;
2977         short st;
2978         //              Read the xml text declaration into the buffer
2979         if (mChIdx != 0) {
2980             //          The bom method have read ONE char into the buffer.
2981             st = (short) ((mChars[0] == '<') ? 1 : -1);
2982         } else {
2983             st = 0;
2984         }
2985         while (st >= 0 && mChIdx < mChars.length) {



2986             ch = ((val = reader.read()) >= 0) ? (char) val : EOS;
2987             mChars[mChIdx++] = ch;


2988             switch (st) {
2989                 case 0:     // read '<' of xml declaration
2990                     switch (ch) {
2991                         case '<':
2992                             st = 1;
2993                             break;
2994 
2995                         case 0xfeff:    // the byte order mask
2996                             ch = ((val = reader.read()) >= 0) ? (char) val : EOS;
2997                             mChars[mChIdx - 1] = ch;
2998                             st = (short) ((ch == '<') ? 1 : -1);
2999                             break;
3000 
3001                         default:
3002                             st = -1;
3003                             break;
3004                     }
3005                     break;
3006 
3007                 case 1:     // read '?' of xml declaration [#4.3.1]




2843         }
2844         mIsSAloneSet = false;
2845         if (is.getCharacterStream() != null) {
2846             //          Ignore encoding in the xml text decl.
2847             reader = is.getCharacterStream();
2848             xml(reader);
2849         } else if (is.getByteStream() != null) {
2850             String expenc;
2851             if (is.getEncoding() != null) {
2852                 //              Ignore encoding in the xml text decl.
2853                 expenc = is.getEncoding().toUpperCase();
2854                 if (expenc.equals("UTF-16")) {
2855                     reader = bom(is.getByteStream(), 'U');  // UTF-16 [#4.3.3]
2856                 } else {
2857                     reader = enc(expenc, is.getByteStream());
2858                 }
2859                 xml(reader);
2860             } else {
2861                 //              Get encoding from BOM or the xml text decl.
2862                 reader = bom(is.getByteStream(), ' ');
2863                 /**
2864                  * [#4.3.3] requires BOM for UTF-16, however, it's not uncommon
2865                  * that it may be missing. A mature technique exists in Xerces
2866                  * to further check for possible UTF-16 encoding
2867                  */
2868                 if (reader == null) {
2869                     reader = utf16(is.getByteStream());
2870                 }
2871 
2872                 if (reader == null) {
2873                     //          Encoding is defined by the xml text decl.
2874                     reader = enc("UTF-8", is.getByteStream());
2875                     expenc = xml(reader);
2876                     if (!expenc.equals("UTF-8")) {
2877                         if (expenc.startsWith("UTF-16")) {
2878                             panic(FAULT);  // UTF-16 must have BOM [#4.3.3]
2879                         }
2880                         reader = enc(expenc, is.getByteStream());
2881                     }
2882                 } else {
2883                     //          Encoding is defined by the BOM.
2884                     xml(reader);
2885                 }
2886             }
2887         } else {
2888             //          There is no support for public/system identifiers.
2889             panic(FAULT);
2890         }
2891         mInp.src = reader;
2892         mInp.pubid = is.getPublicId();
2893         mInp.sysid = is.getSystemId();
2894     }
2895 
2896     /**
2897      * Determines the entity encoding.
2898      *
2899      * This method gets encoding from Byte Order Mark [#4.3.3] if any. Note, the
2900      * first byte returned by the entity's byte stream has to be the first byte
2901      * in the entity. Also, there is no support for UCS-4.


2950                     case 0xd0:
2951                         mChars[mChIdx++] = (char) (((val & 0x1f) << 6) | (is.read() & 0x3f));
2952                         break;
2953 
2954                     case 0xe0:
2955                         mChars[mChIdx++] = (char) (((val & 0x0f) << 12)
2956                                 | ((is.read() & 0x3f) << 6) | (is.read() & 0x3f));
2957                         break;
2958 
2959                     case 0xf0:  // UCS-4 character
2960                         throw new UnsupportedEncodingException();
2961 
2962                     default:
2963                         mChars[mChIdx++] = (char) val;
2964                         break;
2965                 }
2966                 return null;
2967         }
2968     }
2969 
2970 
2971     /**
2972      * Checks, after the bom method, for UTF-16 without a BOM (technique from Xerces):
2973      * the buffered byte plus three more read from the stream are matched against "<?"
2974      * encoded as 00 3C 00 3F (big-endian) or 3C 00 3F 00 (little-endian).
2975      * @param is A byte stream of the entity.
2976      * @return a ReaderUTF16 over the given stream on a match, otherwise null.
2977      * @exception Exception is parser specific exception from panic method.
2978      * @exception IOException
2979      */
2980     private Reader utf16(InputStream is)
2981             throws Exception {
2982         if (mChIdx != 0) {
2983             // The bom method has read ONE byte into the buffer; only the low 8 bits of mChars[0] are inspected.
2984             byte b0 = (byte)mChars[0];
2985             if (b0 == 0x00 || b0 == 0x3C) {
2986                 int b1 = is.read();
2987                 int b2 = is.read();
2988                 int b3 = is.read();
2989                 if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
2990                     // UTF-16, big-endian, no BOM: '<' replaces mChars[0] and '?' is appended
2991                     mChars[0] = (char)(b1);
2992                     mChars[mChIdx++] = (char)(b3);
2993                     return new ReaderUTF16(is, 'b');
2994                 } else if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
2995                     // UTF-16, little-endian, no BOM: '<' stays in mChars[0] and '?' is appended
2996                     mChars[0] = (char)(b0);
2997                     mChars[mChIdx++] = (char)(b2);
2998                     return new ReaderUTF16(is, 'l');
2999                 } else {
3000                     /* Not every InputStream supports reset, so the four bytes already
3001                      * consumed are kept in mChars (one char per byte) for further parsing.
3002                      */
3003                     mChars[0] = (char)(b0);
3004                     mChars[mChIdx++] = (char)(b1);
3005                     mChars[mChIdx++] = (char)(b2);
3006                     mChars[mChIdx++] = (char)(b3);
3007                 }
3008                 
3009             }
3010         }
3011         return null;
3012     }
3013     /**
3014      * Parses the xml text declaration.
3015      *
3016      * This method gets encoding from the xml text declaration [#4.3.1] if any.
3017      * The method assumes the buffer (mChars) is big enough to accommodate whole
3018      * xml text declaration.
3019      *
3020      * @param reader is entity reader.
3021      * @return The xml text declaration encoding or default UTF-8 encoding.
3022      * @exception Exception is parser specific exception from panic method.
3023      * @exception IOException
3024      */
3025     private String xml(Reader reader)
3026             throws Exception {
3027         String str = null;
3028         String enc = "UTF-8";
3029         char ch;
3030         int val;
3031         short st = 0;
3032         int byteRead =  mChIdx; //number of bytes read prior to entering this method
3033 





3034         while (st >= 0 && mChIdx < mChars.length) {
3035             if (st < byteRead) {
3036                 ch = mChars[st];
3037             } else {
3038                 ch = ((val = reader.read()) >= 0) ? (char) val : EOS;            
3039                 mChars[mChIdx++] = ch;
3040             }
3041 
3042             switch (st) {
3043                 case 0:     // read '<' of xml declaration
3044                     switch (ch) {
3045                         case '<':
3046                             st = 1;
3047                             break;
3048 
3049                         case 0xfeff:    // the byte order mask
3050                             ch = ((val = reader.read()) >= 0) ? (char) val : EOS;
3051                             mChars[mChIdx - 1] = ch;
3052                             st = (short) ((ch == '<') ? 1 : -1);
3053                             break;
3054 
3055                         default:
3056                             st = -1;
3057                             break;
3058                     }
3059                     break;
3060 
3061                 case 1:     // read '?' of xml declaration [#4.3.1]