1 /*
   2  * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xerces.internal.impl;
  22 
  23 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
  24 import com.sun.org.apache.xerces.internal.util.XML11Char;
  25 import com.sun.org.apache.xerces.internal.util.XMLChar;
  26 import com.sun.org.apache.xerces.internal.util.XMLStringBuffer;
  27 import com.sun.org.apache.xerces.internal.xni.XMLString;
  28 import com.sun.org.apache.xerces.internal.xni.XNIException;
  29 import java.io.IOException;
  30 
  31 /**
  32  * This class is responsible for scanning XML document structure
  33  * and content. The scanner acts as the source for the document
  34  * information which is communicated to the document handler.
  35  * <p>
  36  * This component requires the following features and properties from the
  37  * component manager that uses it:
  38  * <ul>
  39  *  <li>http://xml.org/sax/features/namespaces</li>
  40  *  <li>http://xml.org/sax/features/validation</li>
  41  *  <li>http://apache.org/xml/features/nonvalidating/load-external-dtd</li>
  42  *  <li>http://apache.org/xml/features/scanner/notify-char-refs</li>
  43  *  <li>http://apache.org/xml/features/scanner/notify-builtin-refs</li>
  44  *  <li>http://apache.org/xml/properties/internal/symbol-table</li>
  45  *  <li>http://apache.org/xml/properties/internal/error-reporter</li>
  46  *  <li>http://apache.org/xml/properties/internal/entity-manager</li>
  47  *  <li>http://apache.org/xml/properties/internal/dtd-scanner</li>
  48  * </ul>
  49  *
  50  * @xerces.internal
  51  *
  52  * @author Glenn Marcy, IBM
  53  * @author Andy Clark, IBM
  54  * @author Arnaud  Le Hors, IBM
  55  * @author Eric Ye, IBM
  56  *
  57  * @version $Id: XML11DocumentScannerImpl.java,v 1.5 2010/08/04 20:59:09 joehw Exp $
  58  */
  59 public class XML11DocumentScannerImpl
  60     extends XMLDocumentScannerImpl {
  61 
  62 
    /**
     * String buffer. Within this class it accumulates the normalized
     * attribute value in scanAttributeValue and the normalized public
     * identifier in scanPubidLiteral.
     */
    private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
    /** Accumulates the non-normalized attribute value in scanAttributeValue. */
    private final XMLStringBuffer fStringBuffer2 = new XMLStringBuffer();
    /** Scratch buffer that receives surrogate pairs scanned in scanAttributeValue. */
    private final XMLStringBuffer fStringBuffer3 = new XMLStringBuffer();
  67 
  68     //
  69     // Constructors
  70     //
  71 
  72     /** Default constructor. */
  73     public XML11DocumentScannerImpl() {super();} // <init>()
  74 
  75     //
  76     // overridden methods
  77     //
  78 
  79     // XMLDocumentFragmentImpl methods
  80 
  81     /**
  82      * Scans element content.
  83      *
  84      * @return Returns the next character on the stream.
  85      */
  86     protected int scanContent(XMLStringBuffer content) throws IOException, XNIException {
  87 
  88         fTempString.length = 0;
  89         int c = fEntityScanner.scanContent(fTempString);
  90         content.append(fTempString);
  91 
  92         if (c == '\r' || c == 0x85 || c == 0x2028) {
  93             // happens when there is the character reference 
  94             // but scanContent doesn't do entity expansions...
  95             // is this *really* necessary???  - NG
  96             fEntityScanner.scanChar(null);
  97             content.append((char)c);
  98             c = -1;
  99         }
 100         /*if (fDocumentHandler != null && content.length > 0) {
 101             fDocumentHandler.characters(content, null);
 102         } */
 103 
 104         if (c == ']') {
 105             content.append((char)fEntityScanner.scanChar(null));
 106             // remember where we are in case we get an endEntity before we
 107             // could flush the buffer out - this happens when we're parsing an
 108             // entity which ends with a ]
 109             fInScanContent = true;
 110             //
 111             // We work on a single character basis to handle cases such as:
 112             // ']]]>' which we might otherwise miss.
 113             //
 114             if (fEntityScanner.skipChar(']', null)) {
 115                 content.append(']');
 116                 while (fEntityScanner.skipChar(']', null)) {
 117                     content.append(']');
 118                 }
 119                 if (fEntityScanner.skipChar('>', null)) {
 120                     reportFatalError("CDEndInContent", null);
 121                 }
 122             }
 123             /*if (fDocumentHandler != null && fStringBuffer.length != 0) {
 124                 fDocumentHandler.characters(fStringBuffer, null);
 125             }*/
 126             fInScanContent = false;
 127             c = -1;
 128         }
 129         return c;
 130 
 131     } // scanContent():int
 132 
    /**
     * Scans an attribute value and normalizes whitespace converting all
     * whitespace characters to space characters.
     *
     * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'"
     *
     * <p>
     * The value is read in runs through the entity scanner's
     * <code>scanLiteral</code>. When the first run already stops at the
     * closing quote and contains no character matched by
     * <code>XMLChar.isSpace</code>, the method takes a fast path and
     * returns <code>true</code> without touching the string buffers.
     * Otherwise the normalized value is assembled in fStringBuffer and the
     * non-normalized value in fStringBuffer2, handling the character that
     * stopped each run (references, '&lt;', '%', ']', line breaks,
     * surrogates and invalid characters) before scanning the next run.
     *
     * @param value The XMLString to fill in with the value.
     * @param nonNormalizedValue The XMLString to fill in with the
     *                           non-normalized value.
     * @param atName The name of the attribute being parsed (for error msgs).
     * @param checkEntities true if undeclared entities should be reported as VC violation,
     *                      false if undeclared entities should be reported as WFC violation.
     * @param eleName The name of element to which this attribute belongs.
     * @param isNSURI The flag indicating whether the content is a namespace URI
     *
     * @return true if the non-normalized and normalized value are the same
     *
     * <strong>Note:</strong> This method uses fStringBuffer, fStringBuffer2
     * and fStringBuffer3; anything in them at the time of calling may be
     * lost.
     **/
    protected boolean scanAttributeValue(XMLString value,
                                      XMLString nonNormalizedValue,
                                      String atName,
                                      boolean checkEntities,String eleName, boolean isNSURI)
        throws IOException, XNIException
    {
        // quote
        int quote = fEntityScanner.peekChar();
        if (quote != '\'' && quote != '"') {
            reportFatalError("OpenQuoteExpected", new Object[]{eleName,atName});
        }

        // consume the opening quote and remember the entity nesting depth at
        // which the attribute value started
        fEntityScanner.scanChar(NameType.ATTRIBUTE);
        int entityDepth = fEntityDepth;

        int c = fEntityScanner.scanLiteral(quote, value, isNSURI);
        if (DEBUG_ATTR_NORMALIZATION) {
            System.out.println("** scanLiteral -> \""
                               + value.toString() + "\"");
        }

        // fromIndex is the index of the first whitespace character in the
        // first run, or -1 when the run contains none
        int fromIndex = 0;
        if (c == quote && (fromIndex = isUnchangedByNormalization(value)) == -1) {
            /** Both the non-normalized and normalized attribute values are equal. **/
            nonNormalizedValue.setValues(value);
            int cquote = fEntityScanner.scanChar(NameType.ATTRIBUTE);
            if (cquote != quote) {
                reportFatalError("CloseQuoteExpected", new Object[]{eleName,atName});
            }
            return true;
        }
        // slow path: keep a copy of the raw run before normalizing it in place
        fStringBuffer2.clear();
        fStringBuffer2.append(value);
        normalizeWhitespace(value, fromIndex);
        if (DEBUG_ATTR_NORMALIZATION) {
            System.out.println("** normalizeWhitespace -> \""
                               + value.toString() + "\"");
        }
        if (c != quote) {
            // the first run stopped before the closing quote; loop, handling
            // the stopping character and then scanning the next run
            fScanningAttribute = true;
            fStringBuffer.clear();
            do {
                // append the run that was just scanned and normalized
                fStringBuffer.append(value);
                if (DEBUG_ATTR_NORMALIZATION) {
                    System.out.println("** value2: \""
                                       + fStringBuffer.toString() + "\"");
                }
                if (c == '&') {
                    fEntityScanner.skipChar('&', NameType.REFERENCE);
                    // raw text is only recorded in fStringBuffer2 while the
                    // entity depth equals the depth at which the value began
                    if (entityDepth == fEntityDepth) {
                        fStringBuffer2.append('&');
                    }
                    if (fEntityScanner.skipChar('#', NameType.REFERENCE)) {
                        // character reference
                        if (entityDepth == fEntityDepth) {
                            fStringBuffer2.append('#');
                        }
                        int ch = scanCharReferenceValue(fStringBuffer, fStringBuffer2);
                        if (ch != -1) {
                            if (DEBUG_ATTR_NORMALIZATION) {
                                System.out.println("** value3: \""
                                                   + fStringBuffer.toString()
                                                   + "\"");
                            }
                        }
                    }
                    else {
                        // named entity reference
                        String entityName = fEntityScanner.scanName(NameType.REFERENCE);
                        if (entityName == null) {
                            reportFatalError("NameRequiredInReference", null);
                        }
                        else if (entityDepth == fEntityDepth) {
                            fStringBuffer2.append(entityName);
                        }
                        if (!fEntityScanner.skipChar(';', NameType.REFERENCE)) {
                            reportFatalError("SemicolonRequiredInReference",
                                             new Object []{entityName});
                        }
                        else if (entityDepth == fEntityDepth) {
                            fStringBuffer2.append(';');
                        }
                        // resolveCharacter is given fStringBuffer as its output
                        // buffer; when it returns true no entity is started here
                        if (resolveCharacter(entityName, fStringBuffer)) {
                            checkEntityLimit(false, fEntityScanner.fCurrentEntity.name, 1);
                        }
                        else {
                            if (fEntityManager.isExternalEntity(entityName)) {
                                reportFatalError("ReferenceToExternalEntity",
                                                 new Object[] { entityName });
                            }
                            else {
                                if (!fEntityManager.isDeclaredEntity(entityName)) {
                                    //WFC & VC: Entity Declared
                                    if (checkEntities) {
                                        if (fValidation) {
                                            fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
                                                                       "EntityNotDeclared",
                                                                       new Object[]{entityName},
                                                                       XMLErrorReporter.SEVERITY_ERROR);
                                        }
                                    }
                                    else {
                                        reportFatalError("EntityNotDeclared",
                                                         new Object[]{entityName});
                                    }
                                }
                                fEntityManager.startEntity(true, entityName, true);
                            }
                        }
                    }
                }
                else if (c == '<') {
                    reportFatalError("LessthanInAttValue",
                                     new Object[] { eleName, atName });
                    // consume the '<'; it is recorded in the non-normalized
                    // buffer only, not in fStringBuffer
                    fEntityScanner.scanChar(null);
                    if (entityDepth == fEntityDepth) {
                        fStringBuffer2.append((char)c);
                    }
                }
                else if (c == '%' || c == ']') {
                    // these characters are copied through unchanged
                    fEntityScanner.scanChar(null);
                    fStringBuffer.append((char)c);
                    if (entityDepth == fEntityDepth) {
                        fStringBuffer2.append((char)c);
                    }
                    if (DEBUG_ATTR_NORMALIZATION) {
                        System.out.println("** valueF: \""
                                           + fStringBuffer.toString() + "\"");
                    }
                }
                // note that none of these characters should ever get through
                // XML11EntityScanner.  Not sure why
                // this check was originally necessary.  - NG
                else if (c == '\n' || c == '\r' || c == 0x85 || c == 0x2028) {
                    // a line break becomes a space in the normalized value
                    // and '\n' in the non-normalized value
                    fEntityScanner.scanChar(null);
                    fStringBuffer.append(' ');
                    if (entityDepth == fEntityDepth) {
                        fStringBuffer2.append('\n');
                    }
                }
                else if (c != -1 && XMLChar.isHighSurrogate(c)) {
                    // surrogates are scanned into the scratch buffer and
                    // copied only when scanSurrogates reports success
                    fStringBuffer3.clear();
                    if (scanSurrogates(fStringBuffer3)) {
                        fStringBuffer.append(fStringBuffer3);
                        if (entityDepth == fEntityDepth) {
                            fStringBuffer2.append(fStringBuffer3);
                        }
                        if (DEBUG_ATTR_NORMALIZATION) {
                            System.out.println("** valueI: \""
                                               + fStringBuffer.toString()
                                               + "\"");
                        }
                    }
                }
                else if (c != -1 && isInvalidLiteral(c)) {
                    reportFatalError("InvalidCharInAttValue",
                                     new Object[] {eleName, atName, Integer.toString(c, 16)});
                    // consume the invalid character; it is recorded in the
                    // non-normalized buffer only
                    fEntityScanner.scanChar(null);
                    if (entityDepth == fEntityDepth) {
                        fStringBuffer2.append((char)c);
                    }
                }
                // scan the next run; the raw text is recorded before the run
                // is normalized in place
                c = fEntityScanner.scanLiteral(quote, value, isNSURI);
                if (entityDepth == fEntityDepth) {
                    fStringBuffer2.append(value);
                }
                normalizeWhitespace(value);
            // stop only at a quote seen at the entity depth where the value began
            } while (c != quote || entityDepth != fEntityDepth);
            // append the final run, which the loop body has not yet added
            fStringBuffer.append(value);
            if (DEBUG_ATTR_NORMALIZATION) {
                System.out.println("** valueN: \""
                                   + fStringBuffer.toString() + "\"");
            }
            value.setValues(fStringBuffer);
            fScanningAttribute = false;
        }
        nonNormalizedValue.setValues(fStringBuffer2);

        // quote
        int cquote = fEntityScanner.scanChar(null);
        if (cquote != quote) {
            reportFatalError("CloseQuoteExpected", new Object[]{eleName,atName});
        }
        return nonNormalizedValue.equals(value.ch, value.offset, value.length);
    } // scanAttributeValue()
 336 
 337     //
 338     // XMLScanner methods
 339     //
 340     // NOTE:  this is a carbon copy of the code in XML11DTDScannerImpl;
 341     // we need to override these methods in both places.
 342     // this needs to be refactored!!!  - NG
 343     /**
 344      * Scans public ID literal.
 345      *
 346      * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
 347      * [13] PubidChar::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
 348      *
 349      * The returned string is normalized according to the following rule,
 350      * from http://www.w3.org/TR/REC-xml#dt-pubid:
 351      *
 352      * Before a match is attempted, all strings of white space in the public
 353      * identifier must be normalized to single space characters (#x20), and
 354      * leading and trailing white space must be removed.
 355      *
 356      * @param literal The string to fill in with the public ID literal.
 357      * @return True on success.
 358      *
 359      * <strong>Note:</strong> This method uses fStringBuffer, anything in it at
 360      * the time of calling is lost.
 361      */
 362     protected boolean scanPubidLiteral(XMLString literal)
 363         throws IOException, XNIException
 364     {
 365         int quote = fEntityScanner.scanChar(null);
 366         if (quote != '\'' && quote != '"') {
 367             reportFatalError("QuoteRequiredInPublicID", null);
 368             return false;
 369         }
 370 
 371         fStringBuffer.clear();
 372         // skip leading whitespace
 373         boolean skipSpace = true;
 374         boolean dataok = true;
 375         while (true) {
 376             int c = fEntityScanner.scanChar(null);
 377             // REVISIT:  none of these except \n and 0x20 should make it past the entity scanner
 378             if (c == ' ' || c == '\n' || c == '\r' || c == 0x85 || c == 0x2028) {
 379                 if (!skipSpace) {
 380                     // take the first whitespace as a space and skip the others
 381                     fStringBuffer.append(' ');
 382                     skipSpace = true;
 383                 }
 384             }
 385             else if (c == quote) {
 386                 if (skipSpace) {
 387                     // if we finished on a space let's trim it
 388                     fStringBuffer.length--;
 389                 }
 390                 literal.setValues(fStringBuffer);
 391                 break;
 392             }
 393             else if (XMLChar.isPubid(c)) {
 394                 fStringBuffer.append((char)c);
 395                 skipSpace = false;
 396             }
 397             else if (c == -1) {
 398                 reportFatalError("PublicIDUnterminated", null);
 399                 return false;
 400             }
 401             else {
 402                 dataok = false;
 403                 reportFatalError("InvalidCharInPublicID",
 404                                  new Object[]{Integer.toHexString(c)});
 405             }
 406         }
 407         return dataok;
 408    }
 409 
 410     /**
 411      * Normalize whitespace in an XMLString converting all whitespace
 412      * characters to space characters.
 413      */
 414     protected void normalizeWhitespace(XMLString value) {
 415         int end = value.offset + value.length;
 416             for (int i = value.offset; i < end; ++i) {
 417            int c = value.ch[i];
 418            if (XMLChar.isSpace(c)) {
 419                value.ch[i] = ' ';
 420            }
 421        }
 422     }
 423 
 424     /**
 425      * Normalize whitespace in an XMLString converting all whitespace
 426      * characters to space characters.
 427      */
 428     protected void normalizeWhitespace(XMLString value, int fromIndex) {
 429         int end = value.offset + value.length;
 430         for (int i = value.offset + fromIndex; i < end; ++i) {
 431             int c = value.ch[i];
 432             if (XMLChar.isSpace(c)) {
 433                 value.ch[i] = ' ';
 434             }
 435         }
 436     }
 437 
 438     /**
 439      * Checks whether this string would be unchanged by normalization.
 440      *
 441      * @return -1 if the value would be unchanged by normalization,
 442      * otherwise the index of the first whitespace character which
 443      * would be transformed.
 444      */
 445     protected int isUnchangedByNormalization(XMLString value) {
 446         int end = value.offset + value.length;
 447         for (int i = value.offset; i < end; ++i) {
 448             int c = value.ch[i];
 449             if (XMLChar.isSpace(c)) {
 450                 return i - value.offset;
 451             }
 452         }
 453         return -1;
 454     }
 455 
 456     // returns true if the given character is not
 457     // valid with respect to the version of
 458     // XML understood by this scanner.
 459     protected boolean isInvalid(int value) {
 460         return (XML11Char.isXML11Invalid(value));
 461     } // isInvalid(int):  boolean
 462 
 463     // returns true if the given character is not
 464     // valid or may not be used outside a character reference
 465     // with respect to the version of XML understood by this scanner.
 466     protected boolean isInvalidLiteral(int value) {
 467         return (!XML11Char.isXML11ValidLiteral(value));
 468     } // isInvalidLiteral(int):  boolean
 469 
 470     // returns true if the given character is
 471     // a valid nameChar with respect to the version of
 472     // XML understood by this scanner.
 473     protected boolean isValidNameChar(int value) {
 474         return (XML11Char.isXML11Name(value));
 475     } // isValidNameChar(int):  boolean
 476 
 477     // returns true if the given character is
 478     // a valid nameStartChar with respect to the version of
 479     // XML understood by this scanner.
 480     protected boolean isValidNameStartChar(int value) {
 481         return (XML11Char.isXML11NameStart(value));
 482     } // isValidNameStartChar(int):  boolean
 483 
 484     // returns true if the given character is
 485     // a valid NCName character with respect to the version of
 486     // XML understood by this scanner.
 487     protected boolean isValidNCName(int value) {
 488         return (XML11Char.isXML11NCName(value));
 489     } // isValidNCName(int):  boolean
 490 
 491     // returns true if the given character is
 492     // a valid high surrogate for a nameStartChar
 493     // with respect to the version of XML understood
 494     // by this scanner.
 495     protected boolean isValidNameStartHighSurrogate(int value) {
 496         return XML11Char.isXML11NameHighSurrogate(value);
 497     } // isValidNameStartHighSurrogate(int):  boolean
 498 
 499     protected boolean versionSupported(String version) {
 500         return (version.equals("1.1") || version.equals("1.0"));
 501     } // versionSupported(String):  boolean
 502 
 503     // returns the error message key for unsupported
 504     // versions of XML with respect to the version of
 505     // XML understood by this scanner.
 506     protected String getVersionNotSupportedKey () {
 507         return "VersionNotSupported11";
 508     } // getVersionNotSupportedKey: String
 509 
 510 } // class XML11DocumentScannerImpl