1 /*
   2  * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 package jnlp.converter.parser.xml;
  25 
  26 import java.io.ByteArrayInputStream;
  27 import java.io.EOFException;
  28 import java.io.InputStreamReader;
  29 import java.io.IOException;
  30 import java.io.Reader;
  31 import java.io.UnsupportedEncodingException;
  32 
  33 public class XMLEncoding {
  34     /**
  35      * Decodes a byte stream into a String by testing for a Byte Order Mark
  36      * (BOM) or an XML declaration.
  37      * <br />
  38      * Detection begins by examining the first four octets of the stream for a
  39      * BOM. If a BOM is not found, then an encoding declaration is looked for
  40      * at the beginning of the stream. If the encoding still can not be
  41      * determined at this point, then UTF-8 is assumed.
  42      *
  43      * @param data  an array of bytes containing an encoded XML document.
  44      *
  45      * @return A string containing the decoded XML document.
  46      */
  47     public static String decodeXML(byte [] data) throws IOException {
  48         int start = 0;
  49         String encoding;
  50 
  51         if (data.length < BOM_LENGTH) {
  52             throw (new EOFException("encoding.error.not.xml"));
  53         }
  54         // no else required; successfully read stream
  55         int firstFour = ((0xff000000 & ((int) data[0] << 24)) |
  56                          (0x00ff0000 & ((int) data[1] << 16)) |
  57                          (0x0000ff00 & ((int) data[2] <<  8)) |
  58                          (0x000000ff &  (int) data[3]));
  59 
  60         // start by examining the first four bytes for a BOM
  61         switch (firstFour) {
  62             case EBCDIC:
  63                 // examine the encoding declaration
  64                 encoding = examineEncodingDeclaration(data, IBM037_ENC);
  65                 break;
  66 
  67             case XML_DECLARATION:
  68                 // assume UTF-8, but examine the encoding declaration
  69                 encoding = examineEncodingDeclaration(data, UTF_8_ENC);
  70                 break;
  71 
  72             case UTF_16BE:
  73                 encoding = UTF_16BE_ENC;
  74                 break;
  75 
  76             case UTF_16LE:
  77                 encoding = UTF_16LE_ENC;
  78                 break;
  79 
  80             case UNUSUAL_OCTET_1:
  81             case UNUSUAL_OCTET_2:
  82                 throw (new UnsupportedEncodingException("encoding.error.unusual.octet"));
  83 
  84             case UTF_32_BE_BOM:
  85             case UTF_32_LE_BOM:
  86                 encoding = UTF_32_ENC;
  87                 break;
  88 
  89             default:
  90                 int firstThree = firstFour & 0xffffff00;
  91 
  92                 switch (firstThree) {
  93                     case UTF_8_BOM:
  94                         // the InputStreamReader class doen't properly handle
  95                         // the Byte Order Mark (BOM) in UTF-8 streams, so don't
  96                         // putback those 3 bytes.
  97                         start    = 3;
  98                         encoding = UTF_8_ENC;
  99                         break;
 100 
 101                     default:
 102                         int firstTwo = firstFour & 0xffff0000;
 103 
 104                         switch (firstTwo) {
 105                             case UTF_16_BE_BOM:
 106                             case UTF_16_LE_BOM:
 107                                 encoding = UTF_16_ENC;
 108                                 break;
 109 
 110                             default:
 111                                 // this is probably UTF-8 without the encoding
 112                                 // declaration
 113                                 encoding = UTF_8_ENC;
 114                                 break;
 115                         }
 116                         break;
 117                 }
 118                 break;
 119         }
 120 
 121         return (new String(data, start, data.length - start, encoding));
 122     }
 123 
 124     /**
 125      * [3]  S            ::= ( #x20 | #x09 | #x0d | #x0a )
 126      * [23] XMLDecl      ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
 127      * [24] VersionInfo  ::= S 'version' Eq ( '"' VersionNum '"' |
 128      *                                        "'" VersionNum "'" )
 129      * [25] Eq           ::= S? '=' S?
 130      * [26] VersionNum   ::= ([a-zA-Z0-9_.:] | '-')+
 131      * [80] EncodingDecl ::= S 'encoding' Eq ( '"' EncName '"' |
 132      *                                         "'" EncName "'" )
 133      * [81] EncName      ::= [a-zA-Z] ([a-zA-Z0-9_.] | '-')*
 134      */
 135     private static String examineEncodingDeclaration(byte [] data,
 136                           String    encoding) throws IOException {
 137         boolean loop       = false;
 138         boolean recognized = false;
 139         boolean almost     = false;
 140         boolean question   = false;
 141         boolean done       = false;
 142         boolean found      = false;
 143         int     pos        = 0;
 144         int     ch         = -1;
 145         Reader  reader     = null;
 146         String  result     = ((encoding != null) ? encoding : UTF_8_ENC);
 147 
 148         reader = new InputStreamReader(new ByteArrayInputStream(data), result);
 149         ch     = reader.read();
 150 
 151         // if this is an XML declaration, it will start with the text '<?xml'
 152         for (int i = 0; ((i < XML_DECL_START.length()) && (done == false)); i++) {
 153             if (ch != XML_DECL_START.charAt(i)) {
 154                 // This doesn't look like an XML declaration.  This method
 155                 // should only be called if the stream contains an XML
 156                 // declaration in the encoding that is passed into the method.
 157                 done = true;
 158                 break;
 159             }
 160             // no else required; still matches
 161             ch = reader.read();
 162         }
 163 
 164         // there must be at least one whitespace character next.
 165         loop = true;
 166         while ((loop == true) && (done == false)) {
 167             switch (ch) {
 168                 case SPACE:
 169                 case TAB:         // intentional
 170                 case LINEFEED:    // fall
 171                 case RETURN:      // through
 172                     ch = reader.read();
 173                     break;
 174 
 175                 case -1:
 176                     // unexpected EOF
 177                     done = true;
 178                     break;
 179 
 180                 default:
 181                     // non-whitespace
 182                     loop = false;
 183                     break;
 184             }
 185         }
 186 
 187         // now look for the text 'encoding', but if the end of the XML
 188         // declaration (signified by the text '?>') comes first, then
 189         // assume the encoding is UTF-8
 190         loop = true;
 191         while ((loop == true) && (done == false)) {
 192             if (ch == -1) {
 193                 // unexpected EOF
 194                 done = true;
 195                 break;
 196             } else if (recognized == true) {
 197                 // this is the encoding declaration as long as the next few
 198                 // characters are whitespace and/or the equals ('=') sign
 199                 switch (ch) {
 200                     case SPACE:       // intentional
 201                     case TAB:         // fall
 202                     case LINEFEED:    // through
 203                     case RETURN:
 204                         // don't need to do anything
 205                         break;
 206 
 207                     case EQUAL:
 208                         if (almost == false) {
 209                             // got the equal, now find a quote
 210                             almost = true;
 211                         } else {
 212                             // this is not valid XML, so punt
 213                             recognized = false;
 214                             done       = true;
 215                         }
 216                         break;
 217 
 218                     case DOUBLE_QUOTE:    // intentional
 219                     case SINGLE_QUOTE:    // fall through
 220                         if (almost == true) {
 221                             // got the quote, so move on to get the value
 222                             loop = false;
 223                         } else {
 224                             // got a quote before the equal; this is not valid
 225                             // XML, so punt
 226                             recognized = false;
 227                             done       = true;
 228                         }
 229                         break;
 230 
 231                     default:
 232                         // non-whitespace
 233                         recognized = false;
 234                         if (almost == true) {
 235                             // this is not valid XML, so punt
 236                             done = true;
 237                         }
 238                         // no else required; this wasn't the encoding
 239                         // declaration
 240                         break;
 241                 }
 242 
 243                 if (recognized == false) {
 244                     // this isn't the encoding declaration, so go back to the
 245                     // top without reading the next character
 246                     pos = 0;
 247                     continue;
 248                 }
 249                 // no else required; still looking good
 250             } else if (ch == ENCODING_DECL.charAt(pos++)) {
 251                 if (ENCODING_DECL.length() == pos) {
 252                     // this looks like the encoding declaration
 253                     recognized = true;
 254                 }
 255                 // no else required; this might be the encoding declaration
 256             } else if (ch == '?') {
 257                 question = true;
 258                 pos      = 0;
 259             } else if ((ch == '>') && (question == true)) {
 260                 // there is no encoding declaration, so assume that the initial
 261                 // encoding guess was correct
 262                 done   = true;
 263                 continue;
 264             } else {
 265                 // still searching for the encoding declaration
 266                 pos = 0;
 267             }
 268 
 269             ch = reader.read();
 270         }
 271 
 272         if (done == false) {
 273             StringBuilder buffer = new StringBuilder(MAX_ENC_NAME);
 274 
 275             if (((ch >= 'a') && (ch <= 'z')) |
 276                 ((ch >= 'A') && (ch <= 'Z'))) {
 277                 // add the character to the result
 278                 buffer.append((char) ch);
 279 
 280                 loop = true;
 281                 while ((loop == true) && (done == false)) {
 282                     ch = reader.read();
 283 
 284                     if (((ch >= 'a') && (ch <= 'z')) ||
 285                         ((ch >= 'A') && (ch <= 'Z')) ||
 286                         ((ch >= '0') && (ch <= '9')) ||
 287                         (ch == '_') || (ch == '.') || (ch == '-')) {
 288                         // add the character to the result
 289                         buffer.append((char) ch);
 290                     } else if ((ch == DOUBLE_QUOTE) || (ch == SINGLE_QUOTE)) {
 291                         // finished!
 292                         found  = true;
 293                         done   = true;
 294                         result = buffer.toString();
 295                     } else {
 296                         // this is not a valid encoding name, so punt
 297                         done = true;
 298                     }
 299                 }
 300             } else {
 301                 // this is not a valid encoding name, so punt
 302                 done = true;
 303             }
 304         }
 305         // no else required; already failed to find the encoding somewhere else
 306 
 307         return (result);
 308     }
 309 
 310     private static final int BOM_LENGTH   = 4;
 311     private static final int MAX_ENC_NAME = 512;
 312 
 313     private static final int SPACE        = 0x00000020;
 314     private static final int TAB          = 0x00000009;
 315     private static final int LINEFEED     = 0x0000000a;
 316     private static final int RETURN       = 0x0000000d;
 317     private static final int EQUAL        = '=';
 318     private static final int DOUBLE_QUOTE = '\"';
 319     private static final int SINGLE_QUOTE = '\'';
 320 
 321     private static final int UTF_32_BE_BOM   = 0x0000feff;
 322     private static final int UTF_32_LE_BOM   = 0xfffe0000;
 323     private static final int UTF_16_BE_BOM   = 0xfeff0000;
 324     private static final int UTF_16_LE_BOM   = 0xfffe0000;
 325     private static final int UTF_8_BOM       = 0xefbbbf00;
 326     private static final int UNUSUAL_OCTET_1 = 0x00003c00;
 327     private static final int UNUSUAL_OCTET_2 = 0x003c0000;
 328     private static final int UTF_16BE        = 0x003c003f;
 329     private static final int UTF_16LE        = 0x3c003f00;
 330     private static final int EBCDIC          = 0x4c6fa794;
 331     private static final int XML_DECLARATION = 0x3c3f786d;
 332 
 333     private static final String UTF_32_ENC   = "UTF-32";
 334     private static final String UTF_16_ENC   = "UTF-16";
 335     private static final String UTF_16BE_ENC = "UTF-16BE";
 336     private static final String UTF_16LE_ENC = "UTF-16LE";
 337     private static final String UTF_8_ENC    = "UTF-8";
 338     private static final String IBM037_ENC   = "IBM037";
 339 
 340     private static final String XML_DECL_START = "<?xml";
 341     private static final String ENCODING_DECL  = "encoding";
 342 }