1 /*
   2  * Copyright (c) 2015, 2017 Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 // Aug 21, 2000:
  22 //   Fixed bug in isElement and made HTMLdtd public.
  23 //   Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com>
  24 
  25 
  26 package com.sun.org.apache.xml.internal.serialize;
  27 
  28 import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter;
  29 import java.io.BufferedReader;
  30 import java.io.InputStream;
  31 import java.io.InputStreamReader;
  32 import java.util.HashMap;
  33 import java.util.Locale;
  34 import java.util.Map;
  35 
  36 
  37 /**
  38  * Utility class for accessing information specific to HTML documents.
  39  * The HTML DTD is expressed as three utility function groups. Two methods
  40  * allow for checking whether an element requires an open tag on printing
  41  * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
  42  * <P>
  43  * Two other methods translate character references from name to value and
  44  * from value to name. A small entities resource is loaded into memory the
  45  * first time any of these methods is called for fast and efficient access.
  46  *
  47  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  48  *
  49  * @deprecated As of JDK 9, Xerces 2.9.0, Xerces DOM L3 Serializer implementation
  50  * is replaced by that of Xalan. Main class
  51  * {@link com.sun.org.apache.xml.internal.serialize.DOMSerializerImpl} is replaced
  52  * by {@link com.sun.org.apache.xml.internal.serializer.dom3.LSSerializerImpl}.
  53  */
  54 @Deprecated
  55 public final class HTMLdtd
  56 {
  57 
  58     /**
  59      * Public identifier for HTML 4.01 (Strict) document type.
  60      */
  61     public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
  62 
  63     /**
  64      * System identifier for HTML 4.01 (Strict) document type.
  65      */
  66     public static final String HTMLSystemId =
  67         "http://www.w3.org/TR/html4/strict.dtd";
  68 
  69     /**
  70      * Public identifier for XHTML 1.0 (Strict) document type.
  71      */
  72     public static final String XHTMLPublicId =
  73         "-//W3C//DTD XHTML 1.0 Strict//EN";
  74 
  75     /**
  76      * System identifier for XHTML 1.0 (Strict) document type.
  77      */
  78     public static final String XHTMLSystemId =
  79         "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
  80 
  81     /**
  82      * Table of reverse character reference mapping. Character codes are held
  83      * as single-character strings, mapped to their reference name.
  84      */
  85     private static Map<Integer, String> _byChar;
  86 
  87 
  88     /**
  89      * Table of entity name to value mapping. Entities are held as strings,
  90      * character references as <TT>Character</TT> objects.
  91      */
  92     private static Map<String, Integer> _byName;
  93 
  94 
  95     private static final Map<String, String[]> _boolAttrs;
  96 
  97 
  98     /**
  99      * Holds element definitions.
 100      */
 101     private static final Map<String, Integer> _elemDefs;
 102 
 103 
 104     /**
 105      * Locates the HTML entities file that is loaded upon initialization.
 106      * This file is a resource loaded with the default class loader.
 107      */
 108     private static final String     ENTITIES_RESOURCE = "HTMLEntities.res";
 109 
 110 
 111     /**
 112      * Only opening tag should be printed.
 113      */
 114     private static final int ONLY_OPENING = 0x0001;
 115 
 116     /**
 117      * Element contains element content only.
 118      */
 119     private static final int ELEM_CONTENT = 0x0002;
 120 
 121 
 122     /**
 123      * Element preserve spaces.
 124      */
 125     private static final int PRESERVE     = 0x0004;
 126 
 127 
 128     /**
 129      * Optional closing tag.
 130      */
 131     private static final int OPT_CLOSING  = 0x0008;
 132 
 133 
 134     /**
 135      * Element is empty (also means only opening tag)
 136      */
 137     private static final int EMPTY        = 0x0010 | ONLY_OPENING;
 138 
 139 
 140     /**
 141      * Allowed to appear in head.
 142      */
 143     private static final int ALLOWED_HEAD = 0x0020;
 144 
 145 
 146     /**
 147      * When opened, closes P.
 148      */
 149     private static final int CLOSE_P      = 0x0040;
 150 
 151 
 152     /**
 153      * When opened, closes DD or DT.
 154      */
 155     private static final int CLOSE_DD_DT  = 0x0080;
 156 
 157 
 158     /**
 159      * When opened, closes itself.
 160      */
 161     private static final int CLOSE_SELF   = 0x0100;
 162 
 163 
 164     /**
 165      * When opened, closes another table section.
 166      */
 167     private static final int CLOSE_TABLE  = 0x0200;
 168 
 169 
 170     /**
 171      * When opened, closes TH or TD.
 172      */
 173     private static final int CLOSE_TH_TD  = 0x04000;
 174 
 175 
 176     /**
 177      * Returns true if element is declared to be empty. HTML elements are
 178      * defines as empty in the DTD, not by the document syntax.
 179      *
 180      * @param tagName The element tag name (upper case)
 181      * @return True if element is empty
 182      */
 183     public static boolean isEmptyTag( String tagName )
 184     {
 185         return isElement( tagName, EMPTY );
 186     }
 187 
 188 
 189     /**
 190      * Returns true if element is declared to have element content.
 191      * Whitespaces appearing inside element content will be ignored,
 192      * other text will simply report an error.
 193      *
 194      * @param tagName The element tag name (upper case)
 195      * @return True if element content
 196      */
 197     public static boolean isElementContent( String tagName )
 198     {
 199         return isElement( tagName, ELEM_CONTENT );
 200     }
 201 
 202 
 203     /**
 204      * Returns true if element's textual contents preserves spaces.
 205      * This only applies to PRE and TEXTAREA, all other HTML elements
 206      * do not preserve space.
 207      *
 208      * @param tagName The element tag name (upper case)
 209      * @return True if element's text content preserves spaces
 210      */
 211     public static boolean isPreserveSpace( String tagName )
 212     {
 213         return isElement( tagName, PRESERVE );
 214     }
 215 
 216 
 217     /**
 218      * Returns true if element's closing tag is optional and need not
 219      * exist. An error will not be reported for such elements if they
 220      * are not closed. For example, <tt>LI</tt> is most often not closed.
 221      *
 222      * @param tagName The element tag name (upper case)
 223      * @return True if closing tag implied
 224      */
 225     public static boolean isOptionalClosing( String tagName )
 226     {
 227         return isElement( tagName, OPT_CLOSING );
 228     }
 229 
 230 
 231     /**
 232      * Returns true if element's closing tag is generally not printed.
 233      * For example, <tt>LI</tt> should not print the closing tag.
 234      *
 235      * @param tagName The element tag name (upper case)
 236      * @return True if only opening tag should be printed
 237      */
 238     public static boolean isOnlyOpening( String tagName )
 239     {
 240         return isElement( tagName, ONLY_OPENING );
 241     }
 242 
 243 
 244     /**
 245      * Returns true if the opening of one element (<tt>tagName</tt>) implies
 246      * the closing of another open element (<tt>openTag</tt>). For example,
 247      * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
 248      * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
 249      *
 250      * @param tagName The newly opened element
 251      * @param openTag The already opened element
 252      * @return True if closing tag closes opening tag
 253      */
 254     public static boolean isClosing( String tagName, String openTag )
 255     {
 256         // Several elements are defined as closing the HEAD
 257         if ( openTag.equalsIgnoreCase( "HEAD" ) )
 258             return ! isElement( tagName, ALLOWED_HEAD );
 259         // P closes iteself
 260         if ( openTag.equalsIgnoreCase( "P" ) )
 261             return isElement( tagName, CLOSE_P );
 262         // DT closes DD, DD closes DT
 263         if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
 264             return isElement( tagName, CLOSE_DD_DT );
 265         // LI and OPTION close themselves
 266         if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
 267             return isElement( tagName, CLOSE_SELF );
 268         // Each of these table sections closes all the others
 269         if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
 270              openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
 271              openTag.equalsIgnoreCase( "COLGROUP" ) )
 272             return isElement( tagName, CLOSE_TABLE );
 273         // TD closes TH and TH closes TD
 274         if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
 275             return isElement( tagName, CLOSE_TH_TD );
 276         return false;
 277     }
 278 
 279 
 280     /**
 281      * Returns true if the specified attribute it a URI and should be
 282      * escaped appropriately. In HTML URIs are escaped differently
 283      * than normal attributes.
 284      *
 285      * @param tagName The element's tag name
 286      * @param attrName The attribute's name
 287      */
 288     public static boolean isURI( String tagName, String attrName )
 289     {
 290         // Stupid checks.
 291         return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
 292     }
 293 
 294 
 295     /**
 296      * Returns true if the specified attribute is a boolean and should be
 297      * printed without the value. This applies to attributes that are true
 298      * if they exist, such as selected (OPTION/INPUT).
 299      *
 300      * @param tagName The element's tag name
 301      * @param attrName The attribute's name
 302      */
 303     public static boolean isBoolean( String tagName, String attrName )
 304     {
 305         String[] attrNames;
 306 
 307         attrNames = _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
 308         if ( attrNames == null )
 309             return false;
 310         for ( int i = 0 ; i < attrNames.length ; ++i )
 311             if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
 312                 return true;
 313         return false;
 314     }
 315 
 316 
 317     /**
 318      * Returns the value of an HTML character reference by its name. If the
 319      * reference is not found or was not defined as a character reference,
 320      * returns EOF (-1).
 321      *
 322      * @param name Name of character reference
 323      * @return Character code or EOF (-1)
 324      */
 325     public static int charFromName( String name )
 326     {
 327         Object    value;
 328 
 329         initialize();
 330         value = _byName.get( name );
 331         if ( value != null && value instanceof Integer )
 332             return ( (Integer) value ).intValue();
 333         else
 334             return -1;
 335     }
 336 
 337 
 338     /**
 339      * Returns the name of an HTML character reference based on its character
 340      * value. Only valid for entities defined from character references. If no
 341      * such character value was defined, return null.
 342      *
 343      * @param value Character value of entity
 344      * @return Entity's name or null
 345      */
 346     public static String fromChar(int value )
 347     {
 348        if (value > 0xffff)
 349             return null;
 350 
 351         String name;
 352 
 353         initialize();
 354         name = _byChar.get(value);
 355         return name;
 356     }
 357 
 358 
 359     /**
 360      * Initialize upon first access. Will load all the HTML character references
 361      * into a list that is accessible by name or character value and is optimized
 362      * for character substitution. This method may be called any number of times
 363      * but will execute only once.
 364      */
 365     private static void initialize()
 366     {
 367         InputStream     is = null;
 368         BufferedReader  reader = null;
 369         int             index;
 370         String          name;
 371         String          value;
 372         int             code;
 373         String          line;
 374 
 375         // Make sure not to initialize twice.
 376         if ( _byName != null )
 377             return;
 378         try {
 379             _byName = new HashMap<>();
 380             _byChar = new HashMap<>();
 381             is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
 382             if ( is == null ) {
 383                 throw new RuntimeException(
 384                                     DOMMessageFormatter.formatMessage(
 385                                     DOMMessageFormatter.SERIALIZER_DOMAIN,
 386                     "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
 387             }
 388             reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
 389             line = reader.readLine();
 390             while ( line != null ) {
 391                 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
 392                     line = reader.readLine();
 393                     continue;
 394                 }
 395                 index = line.indexOf( ' ' );
 396                 if ( index > 1 ) {
 397                     name = line.substring( 0, index );
 398                     ++index;
 399                     if ( index < line.length() ) {
 400                         value = line.substring( index );
 401                         index = value.indexOf( ' ' );
 402                         if ( index > 0 )
 403                             value = value.substring( 0, index );
 404                         code = Integer.parseInt( value );
 405                                         defineEntity( name, (char) code );
 406                     }
 407                 }
 408                 line = reader.readLine();
 409             }
 410             is.close();
 411         }  catch ( Exception except ) {
 412                         throw new RuntimeException(
 413                                 DOMMessageFormatter.formatMessage(
 414                                 DOMMessageFormatter.SERIALIZER_DOMAIN,
 415                 "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));
 416         } finally {
 417             if ( is != null ) {
 418                 try {
 419                     is.close();
 420                 } catch ( Exception except ) { }
 421             }
 422         }
 423     }
 424 
 425 
 426     /**
 427      * Defines a new character reference. The reference's name and value are
 428      * supplied. Nothing happens if the character reference is already defined.
 429      * <P>
 430      * Unlike internal entities, character references are a string to single
 431      * character mapping. They are used to map non-ASCII characters both on
 432      * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
 433      * example of a character reference.
 434      *
 435      * @param name The entity's name
 436      * @param value The entity's value
 437      */
 438     private static void defineEntity( String name, char value )
 439     {
 440         if ( _byName.get( name ) == null ) {
 441             _byName.put( name, new Integer( value ) );
 442             _byChar.put( new Integer( value ), name );
 443         }
 444     }
 445 
 446 
 447     private static void defineElement( String name, int flags )
 448     {
 449         _elemDefs.put(name, flags);
 450     }
 451 
 452 
 453     private static void defineBoolean( String tagName, String attrName )
 454     {
 455         defineBoolean( tagName, new String[] { attrName } );
 456     }
 457 
 458 
 459     private static void defineBoolean( String tagName, String[] attrNames )
 460     {
 461         _boolAttrs.put( tagName, attrNames );
 462     }
 463 
 464 
 465     private static boolean isElement( String name, int flag )
 466     {
 467         Integer flags;
 468 
 469         flags = _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
 470         if ( flags == null )
 471             return false;
 472         else
 473             return ( ( flags.intValue() & flag ) == flag );
 474     }
 475 
 476 
 477     static
 478     {
 479         _elemDefs = new HashMap<>();
 480         defineElement( "ADDRESS", CLOSE_P );
 481         defineElement( "AREA", EMPTY );
 482         defineElement( "BASE",  EMPTY | ALLOWED_HEAD );
 483         defineElement( "BASEFONT", EMPTY );
 484         defineElement( "BLOCKQUOTE", CLOSE_P );
 485         defineElement( "BODY", OPT_CLOSING );
 486         defineElement( "BR", EMPTY );
 487         defineElement( "COL", EMPTY );
 488         defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 489         defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
 490         defineElement( "DIV", CLOSE_P );
 491         defineElement( "DL", ELEM_CONTENT | CLOSE_P );
 492         defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
 493         defineElement( "FIELDSET", CLOSE_P );
 494         defineElement( "FORM", CLOSE_P );
 495         defineElement( "FRAME", EMPTY | OPT_CLOSING );
 496         defineElement( "H1", CLOSE_P );
 497         defineElement( "H2", CLOSE_P );
 498         defineElement( "H3", CLOSE_P );
 499         defineElement( "H4", CLOSE_P );
 500         defineElement( "H5", CLOSE_P );
 501         defineElement( "H6", CLOSE_P );
 502         defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
 503         defineElement( "HR", EMPTY | CLOSE_P );
 504         defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
 505         defineElement( "IMG", EMPTY );
 506         defineElement( "INPUT", EMPTY );
 507         defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
 508         defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
 509         defineElement( "LINK", EMPTY | ALLOWED_HEAD );
 510         defineElement( "MAP", ALLOWED_HEAD );
 511         defineElement( "META", EMPTY | ALLOWED_HEAD );
 512         defineElement( "OL", ELEM_CONTENT | CLOSE_P );
 513         defineElement( "OPTGROUP", ELEM_CONTENT );
 514         defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
 515         defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
 516         defineElement( "PARAM", EMPTY );
 517         defineElement( "PRE", PRESERVE | CLOSE_P );
 518         defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
 519         defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
 520         defineElement( "SELECT", ELEM_CONTENT );
 521         defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
 522         defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
 523         defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 524         defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
 525         defineElement( "TEXTAREA", PRESERVE );
 526         defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 527         defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
 528         defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 529         defineElement( "TITLE", ALLOWED_HEAD );
 530         defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 531         defineElement( "UL", ELEM_CONTENT | CLOSE_P );
 532 
 533         _boolAttrs = new HashMap<>();
 534         defineBoolean( "AREA", "href" );
 535         defineBoolean( "BUTTON", "disabled" );
 536         defineBoolean( "DIR", "compact" );
 537         defineBoolean( "DL", "compact" );
 538         defineBoolean( "FRAME", "noresize" );
 539         defineBoolean( "HR", "noshade" );
 540         defineBoolean( "IMAGE", "ismap" );
 541         defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
 542         defineBoolean( "LINK", "link" );
 543         defineBoolean( "MENU", "compact" );
 544         defineBoolean( "OBJECT", "declare" );
 545         defineBoolean( "OL", "compact" );
 546         defineBoolean( "OPTGROUP", "disabled" );
 547         defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
 548         defineBoolean( "SCRIPT", "defer" );
 549         defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
 550         defineBoolean( "STYLE", "disabled" );
 551         defineBoolean( "TD", "nowrap" );
 552         defineBoolean( "TH", "nowrap" );
 553         defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
 554         defineBoolean( "UL", "compact" );
 555 
 556         initialize();
 557     }
 558 
 559 
 560 
 561 }