1 /*
   2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 // Aug 21, 2000:
  22 //   Fixed bug in isElement and made HTMLdtd public.
//   Contributed by Eric SCHAEFFER <eschaeffer@posterconseil.com>
  24 
  25 
  26 package com.sun.org.apache.xml.internal.serialize;
  27 
  28 import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter;
  29 import java.io.BufferedReader;
  30 import java.io.InputStream;
  31 import java.io.InputStreamReader;
  32 import java.util.HashMap;
  33 import java.util.Locale;
  34 import java.util.Map;
  35 
  36 
  37 /**
  38  * Utility class for accessing information specific to HTML documents.
  39  * The HTML DTD is expressed as three utility function groups. Two methods
  40  * allow for checking whether an element requires an open tag on printing
  41  * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}).
  42  * <P>
  43  * Two other methods translate character references from name to value and
  44  * from value to name. A small entities resource is loaded into memory the
  45  * first time any of these methods is called for fast and efficient access.
  46  *
  47  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  48  *
  49  * @deprecated As of JDK 9, Xerces 2.9.0, Xerces DOM L3 Serializer implementation
  50  * is replaced by that of Xalan. Main class
  51  * {@link com.sun.org.apache.xml.internal.serialize.DOMSerializerImpl} is replaced
  52  * by {@link com.sun.org.apache.xml.internal.serializer.dom3.LSSerializerImpl}.
  53  */
  54 public final class HTMLdtd
  55 {
  56 
  57     /**
  58      * Public identifier for HTML 4.01 (Strict) document type.
  59      */
  60     public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN";
  61 
  62     /**
  63      * System identifier for HTML 4.01 (Strict) document type.
  64      */
  65     public static final String HTMLSystemId =
  66         "http://www.w3.org/TR/html4/strict.dtd";
  67 
  68     /**
  69      * Public identifier for XHTML 1.0 (Strict) document type.
  70      */
  71     public static final String XHTMLPublicId =
  72         "-//W3C//DTD XHTML 1.0 Strict//EN";
  73 
  74     /**
  75      * System identifier for XHTML 1.0 (Strict) document type.
  76      */
  77     public static final String XHTMLSystemId =
  78         "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
  79 
  80     /**
  81      * Table of reverse character reference mapping. Character codes are held
  82      * as single-character strings, mapped to their reference name.
  83      */
  84     private static Map<Integer, String> _byChar;
  85 
  86 
  87     /**
  88      * Table of entity name to value mapping. Entities are held as strings,
  89      * character references as <TT>Character</TT> objects.
  90      */
  91     private static Map<String, Integer> _byName;
  92 
  93 
  94     private static final Map<String, String[]> _boolAttrs;
  95 
  96 
  97     /**
  98      * Holds element definitions.
  99      */
 100     private static final Map<String, Integer> _elemDefs;
 101 
 102 
 103     /**
 104      * Locates the HTML entities file that is loaded upon initialization.
 105      * This file is a resource loaded with the default class loader.
 106      */
 107     private static final String     ENTITIES_RESOURCE = "HTMLEntities.res";
 108 
 109 
 110     /**
 111      * Only opening tag should be printed.
 112      */
 113     private static final int ONLY_OPENING = 0x0001;
 114 
 115     /**
 116      * Element contains element content only.
 117      */
 118     private static final int ELEM_CONTENT = 0x0002;
 119 
 120 
 121     /**
 122      * Element preserve spaces.
 123      */
 124     private static final int PRESERVE     = 0x0004;
 125 
 126 
 127     /**
 128      * Optional closing tag.
 129      */
 130     private static final int OPT_CLOSING  = 0x0008;
 131 
 132 
 133     /**
 134      * Element is empty (also means only opening tag)
 135      */
 136     private static final int EMPTY        = 0x0010 | ONLY_OPENING;
 137 
 138 
 139     /**
 140      * Allowed to appear in head.
 141      */
 142     private static final int ALLOWED_HEAD = 0x0020;
 143 
 144 
 145     /**
 146      * When opened, closes P.
 147      */
 148     private static final int CLOSE_P      = 0x0040;
 149 
 150 
 151     /**
 152      * When opened, closes DD or DT.
 153      */
 154     private static final int CLOSE_DD_DT  = 0x0080;
 155 
 156 
 157     /**
 158      * When opened, closes itself.
 159      */
 160     private static final int CLOSE_SELF   = 0x0100;
 161 
 162 
 163     /**
 164      * When opened, closes another table section.
 165      */
 166     private static final int CLOSE_TABLE  = 0x0200;
 167 
 168 
 169     /**
 170      * When opened, closes TH or TD.
 171      */
 172     private static final int CLOSE_TH_TD  = 0x04000;
 173 
 174 
 175     /**
 176      * Returns true if element is declared to be empty. HTML elements are
 177      * defines as empty in the DTD, not by the document syntax.
 178      *
 179      * @param tagName The element tag name (upper case)
 180      * @return True if element is empty
 181      */
 182     public static boolean isEmptyTag( String tagName )
 183     {
 184         return isElement( tagName, EMPTY );
 185     }
 186 
 187 
 188     /**
 189      * Returns true if element is declared to have element content.
 190      * Whitespaces appearing inside element content will be ignored,
 191      * other text will simply report an error.
 192      *
 193      * @param tagName The element tag name (upper case)
 194      * @return True if element content
 195      */
 196     public static boolean isElementContent( String tagName )
 197     {
 198         return isElement( tagName, ELEM_CONTENT );
 199     }
 200 
 201 
 202     /**
 203      * Returns true if element's textual contents preserves spaces.
 204      * This only applies to PRE and TEXTAREA, all other HTML elements
 205      * do not preserve space.
 206      *
 207      * @param tagName The element tag name (upper case)
 208      * @return True if element's text content preserves spaces
 209      */
 210     public static boolean isPreserveSpace( String tagName )
 211     {
 212         return isElement( tagName, PRESERVE );
 213     }
 214 
 215 
 216     /**
 217      * Returns true if element's closing tag is optional and need not
 218      * exist. An error will not be reported for such elements if they
 219      * are not closed. For example, <tt>LI</tt> is most often not closed.
 220      *
 221      * @param tagName The element tag name (upper case)
 222      * @return True if closing tag implied
 223      */
 224     public static boolean isOptionalClosing( String tagName )
 225     {
 226         return isElement( tagName, OPT_CLOSING );
 227     }
 228 
 229 
 230     /**
 231      * Returns true if element's closing tag is generally not printed.
 232      * For example, <tt>LI</tt> should not print the closing tag.
 233      *
 234      * @param tagName The element tag name (upper case)
 235      * @return True if only opening tag should be printed
 236      */
 237     public static boolean isOnlyOpening( String tagName )
 238     {
 239         return isElement( tagName, ONLY_OPENING );
 240     }
 241 
 242 
 243     /**
 244      * Returns true if the opening of one element (<tt>tagName</tt>) implies
 245      * the closing of another open element (<tt>openTag</tt>). For example,
 246      * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>,
 247      * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>.
 248      *
 249      * @param tagName The newly opened element
 250      * @param openTag The already opened element
 251      * @return True if closing tag closes opening tag
 252      */
 253     public static boolean isClosing( String tagName, String openTag )
 254     {
 255         // Several elements are defined as closing the HEAD
 256         if ( openTag.equalsIgnoreCase( "HEAD" ) )
 257             return ! isElement( tagName, ALLOWED_HEAD );
 258         // P closes iteself
 259         if ( openTag.equalsIgnoreCase( "P" ) )
 260             return isElement( tagName, CLOSE_P );
 261         // DT closes DD, DD closes DT
 262         if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) )
 263             return isElement( tagName, CLOSE_DD_DT );
 264         // LI and OPTION close themselves
 265         if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) )
 266             return isElement( tagName, CLOSE_SELF );
 267         // Each of these table sections closes all the others
 268         if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) ||
 269              openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) ||
 270              openTag.equalsIgnoreCase( "COLGROUP" ) )
 271             return isElement( tagName, CLOSE_TABLE );
 272         // TD closes TH and TH closes TD
 273         if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) )
 274             return isElement( tagName, CLOSE_TH_TD );
 275         return false;
 276     }
 277 
 278 
 279     /**
 280      * Returns true if the specified attribute it a URI and should be
 281      * escaped appropriately. In HTML URIs are escaped differently
 282      * than normal attributes.
 283      *
 284      * @param tagName The element's tag name
 285      * @param attrName The attribute's name
 286      */
 287     public static boolean isURI( String tagName, String attrName )
 288     {
 289         // Stupid checks.
 290         return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) );
 291     }
 292 
 293 
 294     /**
 295      * Returns true if the specified attribute is a boolean and should be
 296      * printed without the value. This applies to attributes that are true
 297      * if they exist, such as selected (OPTION/INPUT).
 298      *
 299      * @param tagName The element's tag name
 300      * @param attrName The attribute's name
 301      */
 302     public static boolean isBoolean( String tagName, String attrName )
 303     {
 304         String[] attrNames;
 305 
 306         attrNames = _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) );
 307         if ( attrNames == null )
 308             return false;
 309         for ( int i = 0 ; i < attrNames.length ; ++i )
 310             if ( attrNames[ i ].equalsIgnoreCase( attrName ) )
 311                 return true;
 312         return false;
 313     }
 314 
 315 
 316     /**
 317      * Returns the value of an HTML character reference by its name. If the
 318      * reference is not found or was not defined as a character reference,
 319      * returns EOF (-1).
 320      *
 321      * @param name Name of character reference
 322      * @return Character code or EOF (-1)
 323      */
 324     public static int charFromName( String name )
 325     {
 326         Object    value;
 327 
 328         initialize();
 329         value = _byName.get( name );
 330         if ( value != null && value instanceof Integer )
 331             return ( (Integer) value ).intValue();
 332         else
 333             return -1;
 334     }
 335 
 336 
 337     /**
 338      * Returns the name of an HTML character reference based on its character
 339      * value. Only valid for entities defined from character references. If no
 340      * such character value was defined, return null.
 341      *
 342      * @param value Character value of entity
 343      * @return Entity's name or null
 344      */
 345     public static String fromChar(int value )
 346     {
 347        if (value > 0xffff)
 348             return null;
 349 
 350         String name;
 351 
 352         initialize();
 353         name = _byChar.get(value);
 354         return name;
 355     }
 356 
 357 
 358     /**
 359      * Initialize upon first access. Will load all the HTML character references
 360      * into a list that is accessible by name or character value and is optimized
 361      * for character substitution. This method may be called any number of times
 362      * but will execute only once.
 363      */
 364     private static void initialize()
 365     {
 366         InputStream     is = null;
 367         BufferedReader  reader = null;
 368         int             index;
 369         String          name;
 370         String          value;
 371         int             code;
 372         String          line;
 373 
 374         // Make sure not to initialize twice.
 375         if ( _byName != null )
 376             return;
 377         try {
 378             _byName = new HashMap<>();
 379             _byChar = new HashMap<>();
 380             is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE );
 381             if ( is == null ) {
 382                 throw new RuntimeException(
 383                                     DOMMessageFormatter.formatMessage(
 384                                     DOMMessageFormatter.SERIALIZER_DOMAIN,
 385                     "ResourceNotFound", new Object[] {ENTITIES_RESOURCE}));
 386             }
 387             reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
 388             line = reader.readLine();
 389             while ( line != null ) {
 390                 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) {
 391                     line = reader.readLine();
 392                     continue;
 393                 }
 394                 index = line.indexOf( ' ' );
 395                 if ( index > 1 ) {
 396                     name = line.substring( 0, index );
 397                     ++index;
 398                     if ( index < line.length() ) {
 399                         value = line.substring( index );
 400                         index = value.indexOf( ' ' );
 401                         if ( index > 0 )
 402                             value = value.substring( 0, index );
 403                         code = Integer.parseInt( value );
 404                                         defineEntity( name, (char) code );
 405                     }
 406                 }
 407                 line = reader.readLine();
 408             }
 409             is.close();
 410         }  catch ( Exception except ) {
 411                         throw new RuntimeException(
 412                                 DOMMessageFormatter.formatMessage(
 413                                 DOMMessageFormatter.SERIALIZER_DOMAIN,
 414                 "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()}));
 415         } finally {
 416             if ( is != null ) {
 417                 try {
 418                     is.close();
 419                 } catch ( Exception except ) { }
 420             }
 421         }
 422     }
 423 
 424 
 425     /**
 426      * Defines a new character reference. The reference's name and value are
 427      * supplied. Nothing happens if the character reference is already defined.
 428      * <P>
 429      * Unlike internal entities, character references are a string to single
 430      * character mapping. They are used to map non-ASCII characters both on
 431      * parsing and printing, primarily for HTML documents. '&lt;amp;' is an
 432      * example of a character reference.
 433      *
 434      * @param name The entity's name
 435      * @param value The entity's value
 436      */
 437     private static void defineEntity( String name, char value )
 438     {
 439         if ( _byName.get( name ) == null ) {
 440             _byName.put( name, new Integer( value ) );
 441             _byChar.put( new Integer( value ), name );
 442         }
 443     }
 444 
 445 
 446     private static void defineElement( String name, int flags )
 447     {
 448         _elemDefs.put(name, flags);
 449     }
 450 
 451 
 452     private static void defineBoolean( String tagName, String attrName )
 453     {
 454         defineBoolean( tagName, new String[] { attrName } );
 455     }
 456 
 457 
 458     private static void defineBoolean( String tagName, String[] attrNames )
 459     {
 460         _boolAttrs.put( tagName, attrNames );
 461     }
 462 
 463 
 464     private static boolean isElement( String name, int flag )
 465     {
 466         Integer flags;
 467 
 468         flags = _elemDefs.get( name.toUpperCase(Locale.ENGLISH) );
 469         if ( flags == null )
 470             return false;
 471         else
 472             return ( ( flags.intValue() & flag ) == flag );
 473     }
 474 
 475 
 476     static
 477     {
 478         _elemDefs = new HashMap<>();
 479         defineElement( "ADDRESS", CLOSE_P );
 480         defineElement( "AREA", EMPTY );
 481         defineElement( "BASE",  EMPTY | ALLOWED_HEAD );
 482         defineElement( "BASEFONT", EMPTY );
 483         defineElement( "BLOCKQUOTE", CLOSE_P );
 484         defineElement( "BODY", OPT_CLOSING );
 485         defineElement( "BR", EMPTY );
 486         defineElement( "COL", EMPTY );
 487         defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 488         defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
 489         defineElement( "DIV", CLOSE_P );
 490         defineElement( "DL", ELEM_CONTENT | CLOSE_P );
 491         defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT );
 492         defineElement( "FIELDSET", CLOSE_P );
 493         defineElement( "FORM", CLOSE_P );
 494         defineElement( "FRAME", EMPTY | OPT_CLOSING );
 495         defineElement( "H1", CLOSE_P );
 496         defineElement( "H2", CLOSE_P );
 497         defineElement( "H3", CLOSE_P );
 498         defineElement( "H4", CLOSE_P );
 499         defineElement( "H5", CLOSE_P );
 500         defineElement( "H6", CLOSE_P );
 501         defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING );
 502         defineElement( "HR", EMPTY | CLOSE_P );
 503         defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING );
 504         defineElement( "IMG", EMPTY );
 505         defineElement( "INPUT", EMPTY );
 506         defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD );
 507         defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
 508         defineElement( "LINK", EMPTY | ALLOWED_HEAD );
 509         defineElement( "MAP", ALLOWED_HEAD );
 510         defineElement( "META", EMPTY | ALLOWED_HEAD );
 511         defineElement( "OL", ELEM_CONTENT | CLOSE_P );
 512         defineElement( "OPTGROUP", ELEM_CONTENT );
 513         defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF );
 514         defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF );
 515         defineElement( "PARAM", EMPTY );
 516         defineElement( "PRE", PRESERVE | CLOSE_P );
 517         defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE );
 518         defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE );
 519         defineElement( "SELECT", ELEM_CONTENT );
 520         defineElement( "STYLE", ALLOWED_HEAD | PRESERVE );
 521         defineElement( "TABLE", ELEM_CONTENT | CLOSE_P );
 522         defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 523         defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD );
 524         defineElement( "TEXTAREA", PRESERVE );
 525         defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 526         defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD );
 527         defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 528         defineElement( "TITLE", ALLOWED_HEAD );
 529         defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE );
 530         defineElement( "UL", ELEM_CONTENT | CLOSE_P );
 531 
 532         _boolAttrs = new HashMap<>();
 533         defineBoolean( "AREA", "href" );
 534         defineBoolean( "BUTTON", "disabled" );
 535         defineBoolean( "DIR", "compact" );
 536         defineBoolean( "DL", "compact" );
 537         defineBoolean( "FRAME", "noresize" );
 538         defineBoolean( "HR", "noshade" );
 539         defineBoolean( "IMAGE", "ismap" );
 540         defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } );
 541         defineBoolean( "LINK", "link" );
 542         defineBoolean( "MENU", "compact" );
 543         defineBoolean( "OBJECT", "declare" );
 544         defineBoolean( "OL", "compact" );
 545         defineBoolean( "OPTGROUP", "disabled" );
 546         defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } );
 547         defineBoolean( "SCRIPT", "defer" );
 548         defineBoolean( "SELECT", new String[] { "multiple", "disabled" } );
 549         defineBoolean( "STYLE", "disabled" );
 550         defineBoolean( "TD", "nowrap" );
 551         defineBoolean( "TH", "nowrap" );
 552         defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } );
 553         defineBoolean( "UL", "compact" );
 554 
 555         initialize();
 556     }
 557 
 558 
 559 
 560 }