/* * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. */ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // Aug 21, 2000: // Fixed bug in isElement and made HTMLdtd public. // Contributed by Eric SCHAEFFER" package com.sun.org.apache.xml.internal.serialize; import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Locale; import java.util.Map; /** * Utility class for accessing information specific to HTML documents. * The HTML DTD is expressed as three utility function groups. Two methods * allow for checking whether an element requires an open tag on printing * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}). *

* Two other methods translate character references from name to value and * from value to name. A small entities resource is loaded into memory the * first time any of these methods is called for fast and efficient access. * * @author Assaf Arkin * * @deprecated As of JDK 9, Xerces 2.9.0, Xerces DOM L3 Serializer implementation * is replaced by that of Xalan. Main class * {@link com.sun.org.apache.xml.internal.serialize.DOMSerializerImpl} is replaced * by {@link com.sun.org.apache.xml.internal.serializer.dom3.LSSerializerImpl}. */ public final class HTMLdtd { /** * Public identifier for HTML 4.01 (Strict) document type. */ public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN"; /** * System identifier for HTML 4.01 (Strict) document type. */ public static final String HTMLSystemId = "http://www.w3.org/TR/html4/strict.dtd"; /** * Public identifier for XHTML 1.0 (Strict) document type. */ public static final String XHTMLPublicId = "-//W3C//DTD XHTML 1.0 Strict//EN"; /** * System identifier for XHTML 1.0 (Strict) document type. */ public static final String XHTMLSystemId = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; /** * Table of reverse character reference mapping. Character codes are held * as single-character strings, mapped to their reference name. */ private static Map _byChar; /** * Table of entity name to value mapping. Entities are held as strings, * character references as Character objects. */ private static Map _byName; private static final Map _boolAttrs; /** * Holds element definitions. */ private static final Map _elemDefs; /** * Locates the HTML entities file that is loaded upon initialization. * This file is a resource loaded with the default class loader. */ private static final String ENTITIES_RESOURCE = "HTMLEntities.res"; /** * Only opening tag should be printed. */ private static final int ONLY_OPENING = 0x0001; /** * Element contains element content only. */ private static final int ELEM_CONTENT = 0x0002; /** * Element preserve spaces. */ private static final int PRESERVE = 0x0004; /** * Optional closing tag. */ private static final int OPT_CLOSING = 0x0008; /** * Element is empty (also means only opening tag) */ private static final int EMPTY = 0x0010 | ONLY_OPENING; /** * Allowed to appear in head. */ private static final int ALLOWED_HEAD = 0x0020; /** * When opened, closes P. */ private static final int CLOSE_P = 0x0040; /** * When opened, closes DD or DT. */ private static final int CLOSE_DD_DT = 0x0080; /** * When opened, closes itself. */ private static final int CLOSE_SELF = 0x0100; /** * When opened, closes another table section. */ private static final int CLOSE_TABLE = 0x0200; /** * When opened, closes TH or TD. */ private static final int CLOSE_TH_TD = 0x04000; /** * Returns true if element is declared to be empty. HTML elements are * defines as empty in the DTD, not by the document syntax. * * @param tagName The element tag name (upper case) * @return True if element is empty */ public static boolean isEmptyTag( String tagName ) { return isElement( tagName, EMPTY ); } /** * Returns true if element is declared to have element content. * Whitespaces appearing inside element content will be ignored, * other text will simply report an error. * * @param tagName The element tag name (upper case) * @return True if element content */ public static boolean isElementContent( String tagName ) { return isElement( tagName, ELEM_CONTENT ); } /** * Returns true if element's textual contents preserves spaces. * This only applies to PRE and TEXTAREA, all other HTML elements * do not preserve space. * * @param tagName The element tag name (upper case) * @return True if element's text content preserves spaces */ public static boolean isPreserveSpace( String tagName ) { return isElement( tagName, PRESERVE ); } /** * Returns true if element's closing tag is optional and need not * exist. An error will not be reported for such elements if they * are not closed. For example, LI is most often not closed. * * @param tagName The element tag name (upper case) * @return True if closing tag implied */ public static boolean isOptionalClosing( String tagName ) { return isElement( tagName, OPT_CLOSING ); } /** * Returns true if element's closing tag is generally not printed. * For example, LI should not print the closing tag. * * @param tagName The element tag name (upper case) * @return True if only opening tag should be printed */ public static boolean isOnlyOpening( String tagName ) { return isElement( tagName, ONLY_OPENING ); } /** * Returns true if the opening of one element (tagName) implies * the closing of another open element (openTag). For example, * every opening LI will close the previously open LI, * and every opening BODY will close the previously open HEAD. * * @param tagName The newly opened element * @param openTag The already opened element * @return True if closing tag closes opening tag */ public static boolean isClosing( String tagName, String openTag ) { // Several elements are defined as closing the HEAD if ( openTag.equalsIgnoreCase( "HEAD" ) ) return ! isElement( tagName, ALLOWED_HEAD ); // P closes iteself if ( openTag.equalsIgnoreCase( "P" ) ) return isElement( tagName, CLOSE_P ); // DT closes DD, DD closes DT if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) ) return isElement( tagName, CLOSE_DD_DT ); // LI and OPTION close themselves if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) ) return isElement( tagName, CLOSE_SELF ); // Each of these table sections closes all the others if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) || openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) || openTag.equalsIgnoreCase( "COLGROUP" ) ) return isElement( tagName, CLOSE_TABLE ); // TD closes TH and TH closes TD if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) ) return isElement( tagName, CLOSE_TH_TD ); return false; } /** * Returns true if the specified attribute it a URI and should be * escaped appropriately. In HTML URIs are escaped differently * than normal attributes. * * @param tagName The element's tag name * @param attrName The attribute's name */ public static boolean isURI( String tagName, String attrName ) { // Stupid checks. return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) ); } /** * Returns true if the specified attribute is a boolean and should be * printed without the value. This applies to attributes that are true * if they exist, such as selected (OPTION/INPUT). * * @param tagName The element's tag name * @param attrName The attribute's name */ public static boolean isBoolean( String tagName, String attrName ) { String[] attrNames; attrNames = _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) ); if ( attrNames == null ) return false; for ( int i = 0 ; i < attrNames.length ; ++i ) if ( attrNames[ i ].equalsIgnoreCase( attrName ) ) return true; return false; } /** * Returns the value of an HTML character reference by its name. If the * reference is not found or was not defined as a character reference, * returns EOF (-1). * * @param name Name of character reference * @return Character code or EOF (-1) */ public static int charFromName( String name ) { Object value; initialize(); value = _byName.get( name ); if ( value != null && value instanceof Integer ) return ( (Integer) value ).intValue(); else return -1; } /** * Returns the name of an HTML character reference based on its character * value. Only valid for entities defined from character references. If no * such character value was defined, return null. * * @param value Character value of entity * @return Entity's name or null */ public static String fromChar(int value ) { if (value > 0xffff) return null; String name; initialize(); name = _byChar.get(value); return name; } /** * Initialize upon first access. Will load all the HTML character references * into a list that is accessible by name or character value and is optimized * for character substitution. This method may be called any number of times * but will execute only once. */ private static void initialize() { InputStream is = null; BufferedReader reader = null; int index; String name; String value; int code; String line; // Make sure not to initialize twice. if ( _byName != null ) return; try { _byName = new HashMap<>(); _byChar = new HashMap<>(); is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE ); if ( is == null ) { throw new RuntimeException( DOMMessageFormatter.formatMessage( DOMMessageFormatter.SERIALIZER_DOMAIN, "ResourceNotFound", new Object[] {ENTITIES_RESOURCE})); } reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) ); line = reader.readLine(); while ( line != null ) { if ( line.length() == 0 || line.charAt( 0 ) == '#' ) { line = reader.readLine(); continue; } index = line.indexOf( ' ' ); if ( index > 1 ) { name = line.substring( 0, index ); ++index; if ( index < line.length() ) { value = line.substring( index ); index = value.indexOf( ' ' ); if ( index > 0 ) value = value.substring( 0, index ); code = Integer.parseInt( value ); defineEntity( name, (char) code ); } } line = reader.readLine(); } is.close(); } catch ( Exception except ) { throw new RuntimeException( DOMMessageFormatter.formatMessage( DOMMessageFormatter.SERIALIZER_DOMAIN, "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()})); } finally { if ( is != null ) { try { is.close(); } catch ( Exception except ) { } } } } /** * Defines a new character reference. The reference's name and value are * supplied. Nothing happens if the character reference is already defined. *

* Unlike internal entities, character references are a string to single * character mapping. They are used to map non-ASCII characters both on * parsing and printing, primarily for HTML documents. '<amp;' is an * example of a character reference. * * @param name The entity's name * @param value The entity's value */ private static void defineEntity( String name, char value ) { if ( _byName.get( name ) == null ) { _byName.put( name, new Integer( value ) ); _byChar.put( new Integer( value ), name ); } } private static void defineElement( String name, int flags ) { _elemDefs.put(name, flags); } private static void defineBoolean( String tagName, String attrName ) { defineBoolean( tagName, new String[] { attrName } ); } private static void defineBoolean( String tagName, String[] attrNames ) { _boolAttrs.put( tagName, attrNames ); } private static boolean isElement( String name, int flag ) { Integer flags; flags = _elemDefs.get( name.toUpperCase(Locale.ENGLISH) ); if ( flags == null ) return false; else return ( ( flags.intValue() & flag ) == flag ); } static { _elemDefs = new HashMap<>(); defineElement( "ADDRESS", CLOSE_P ); defineElement( "AREA", EMPTY ); defineElement( "BASE", EMPTY | ALLOWED_HEAD ); defineElement( "BASEFONT", EMPTY ); defineElement( "BLOCKQUOTE", CLOSE_P ); defineElement( "BODY", OPT_CLOSING ); defineElement( "BR", EMPTY ); defineElement( "COL", EMPTY ); defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); defineElement( "DIV", CLOSE_P ); defineElement( "DL", ELEM_CONTENT | CLOSE_P ); defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); defineElement( "FIELDSET", CLOSE_P ); defineElement( "FORM", CLOSE_P ); defineElement( "FRAME", EMPTY | OPT_CLOSING ); defineElement( "H1", CLOSE_P ); defineElement( "H2", CLOSE_P ); defineElement( "H3", CLOSE_P ); defineElement( "H4", CLOSE_P ); defineElement( "H5", CLOSE_P ); defineElement( "H6", CLOSE_P ); defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING ); defineElement( "HR", EMPTY | CLOSE_P ); defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING ); defineElement( "IMG", EMPTY ); defineElement( "INPUT", EMPTY ); defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD ); defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); defineElement( "LINK", EMPTY | ALLOWED_HEAD ); defineElement( "MAP", ALLOWED_HEAD ); defineElement( "META", EMPTY | ALLOWED_HEAD ); defineElement( "OL", ELEM_CONTENT | CLOSE_P ); defineElement( "OPTGROUP", ELEM_CONTENT ); defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF ); defineElement( "PARAM", EMPTY ); defineElement( "PRE", PRESERVE | CLOSE_P ); defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE ); defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE ); defineElement( "SELECT", ELEM_CONTENT ); defineElement( "STYLE", ALLOWED_HEAD | PRESERVE ); defineElement( "TABLE", ELEM_CONTENT | CLOSE_P ); defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD ); defineElement( "TEXTAREA", PRESERVE ); defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD ); defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); defineElement( "TITLE", ALLOWED_HEAD ); defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); defineElement( "UL", ELEM_CONTENT | CLOSE_P ); _boolAttrs = new HashMap<>(); defineBoolean( "AREA", "href" ); defineBoolean( "BUTTON", "disabled" ); defineBoolean( "DIR", "compact" ); defineBoolean( "DL", "compact" ); defineBoolean( "FRAME", "noresize" ); defineBoolean( "HR", "noshade" ); defineBoolean( "IMAGE", "ismap" ); defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } ); defineBoolean( "LINK", "link" ); defineBoolean( "MENU", "compact" ); defineBoolean( "OBJECT", "declare" ); defineBoolean( "OL", "compact" ); defineBoolean( "OPTGROUP", "disabled" ); defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } ); defineBoolean( "SCRIPT", "defer" ); defineBoolean( "SELECT", new String[] { "multiple", "disabled" } ); defineBoolean( "STYLE", "disabled" ); defineBoolean( "TD", "nowrap" ); defineBoolean( "TH", "nowrap" ); defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } ); defineBoolean( "UL", "compact" ); initialize(); } }