1 /* 2 * Copyright (c) 2015, 2017 Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 // Aug 21, 2000: 22 // Fixed bug in isElement and made HTMLdtd public. 23 // Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com> 24 25 26 package com.sun.org.apache.xml.internal.serialize; 27 28 import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter; 29 import java.io.BufferedReader; 30 import java.io.InputStream; 31 import java.io.InputStreamReader; 32 import java.util.HashMap; 33 import java.util.Locale; 34 import java.util.Map; 35 36 37 /** 38 * Utility class for accessing information specific to HTML documents. 39 * The HTML DTD is expressed as three utility function groups. Two methods 40 * allow for checking whether an element requires an open tag on printing 41 * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}). 42 * <P> 43 * Two other methods translate character references from name to value and 44 * from value to name. A small entities resource is loaded into memory the 45 * first time any of these methods is called for fast and efficient access. 46 * 47 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a> 48 * 49 * @deprecated As of JDK 9, Xerces 2.9.0, Xerces DOM L3 Serializer implementation 50 * is replaced by that of Xalan. Main class 51 * {@link com.sun.org.apache.xml.internal.serialize.DOMSerializerImpl} is replaced 52 * by {@link com.sun.org.apache.xml.internal.serializer.dom3.LSSerializerImpl}. 53 */ 54 @Deprecated 55 public final class HTMLdtd 56 { 57 58 /** 59 * Public identifier for HTML 4.01 (Strict) document type. 60 */ 61 public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN"; 62 63 /** 64 * System identifier for HTML 4.01 (Strict) document type. 65 */ 66 public static final String HTMLSystemId = 67 "http://www.w3.org/TR/html4/strict.dtd"; 68 69 /** 70 * Public identifier for XHTML 1.0 (Strict) document type. 71 */ 72 public static final String XHTMLPublicId = 73 "-//W3C//DTD XHTML 1.0 Strict//EN"; 74 75 /** 76 * System identifier for XHTML 1.0 (Strict) document type. 77 */ 78 public static final String XHTMLSystemId = 79 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 80 81 /** 82 * Table of reverse character reference mapping. Character codes are held 83 * as single-character strings, mapped to their reference name. 84 */ 85 private static Map<Integer, String> _byChar; 86 87 88 /** 89 * Table of entity name to value mapping. Entities are held as strings, 90 * character references as <TT>Character</TT> objects. 91 */ 92 private static Map<String, Integer> _byName; 93 94 95 private static final Map<String, String[]> _boolAttrs; 96 97 98 /** 99 * Holds element definitions. 100 */ 101 private static final Map<String, Integer> _elemDefs; 102 103 104 /** 105 * Locates the HTML entities file that is loaded upon initialization. 106 * This file is a resource loaded with the default class loader. 107 */ 108 private static final String ENTITIES_RESOURCE = "HTMLEntities.res"; 109 110 111 /** 112 * Only opening tag should be printed. 113 */ 114 private static final int ONLY_OPENING = 0x0001; 115 116 /** 117 * Element contains element content only. 118 */ 119 private static final int ELEM_CONTENT = 0x0002; 120 121 122 /** 123 * Element preserve spaces. 124 */ 125 private static final int PRESERVE = 0x0004; 126 127 128 /** 129 * Optional closing tag. 130 */ 131 private static final int OPT_CLOSING = 0x0008; 132 133 134 /** 135 * Element is empty (also means only opening tag) 136 */ 137 private static final int EMPTY = 0x0010 | ONLY_OPENING; 138 139 140 /** 141 * Allowed to appear in head. 142 */ 143 private static final int ALLOWED_HEAD = 0x0020; 144 145 146 /** 147 * When opened, closes P. 148 */ 149 private static final int CLOSE_P = 0x0040; 150 151 152 /** 153 * When opened, closes DD or DT. 154 */ 155 private static final int CLOSE_DD_DT = 0x0080; 156 157 158 /** 159 * When opened, closes itself. 160 */ 161 private static final int CLOSE_SELF = 0x0100; 162 163 164 /** 165 * When opened, closes another table section. 166 */ 167 private static final int CLOSE_TABLE = 0x0200; 168 169 170 /** 171 * When opened, closes TH or TD. 172 */ 173 private static final int CLOSE_TH_TD = 0x04000; 174 175 176 /** 177 * Returns true if element is declared to be empty. HTML elements are 178 * defines as empty in the DTD, not by the document syntax. 179 * 180 * @param tagName The element tag name (upper case) 181 * @return True if element is empty 182 */ 183 public static boolean isEmptyTag( String tagName ) 184 { 185 return isElement( tagName, EMPTY ); 186 } 187 188 189 /** 190 * Returns true if element is declared to have element content. 191 * Whitespaces appearing inside element content will be ignored, 192 * other text will simply report an error. 193 * 194 * @param tagName The element tag name (upper case) 195 * @return True if element content 196 */ 197 public static boolean isElementContent( String tagName ) 198 { 199 return isElement( tagName, ELEM_CONTENT ); 200 } 201 202 203 /** 204 * Returns true if element's textual contents preserves spaces. 205 * This only applies to PRE and TEXTAREA, all other HTML elements 206 * do not preserve space. 207 * 208 * @param tagName The element tag name (upper case) 209 * @return True if element's text content preserves spaces 210 */ 211 public static boolean isPreserveSpace( String tagName ) 212 { 213 return isElement( tagName, PRESERVE ); 214 } 215 216 217 /** 218 * Returns true if element's closing tag is optional and need not 219 * exist. An error will not be reported for such elements if they 220 * are not closed. For example, <tt>LI</tt> is most often not closed. 221 * 222 * @param tagName The element tag name (upper case) 223 * @return True if closing tag implied 224 */ 225 public static boolean isOptionalClosing( String tagName ) 226 { 227 return isElement( tagName, OPT_CLOSING ); 228 } 229 230 231 /** 232 * Returns true if element's closing tag is generally not printed. 233 * For example, <tt>LI</tt> should not print the closing tag. 234 * 235 * @param tagName The element tag name (upper case) 236 * @return True if only opening tag should be printed 237 */ 238 public static boolean isOnlyOpening( String tagName ) 239 { 240 return isElement( tagName, ONLY_OPENING ); 241 } 242 243 244 /** 245 * Returns true if the opening of one element (<tt>tagName</tt>) implies 246 * the closing of another open element (<tt>openTag</tt>). For example, 247 * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>, 248 * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>. 249 * 250 * @param tagName The newly opened element 251 * @param openTag The already opened element 252 * @return True if closing tag closes opening tag 253 */ 254 public static boolean isClosing( String tagName, String openTag ) 255 { 256 // Several elements are defined as closing the HEAD 257 if ( openTag.equalsIgnoreCase( "HEAD" ) ) 258 return ! isElement( tagName, ALLOWED_HEAD ); 259 // P closes iteself 260 if ( openTag.equalsIgnoreCase( "P" ) ) 261 return isElement( tagName, CLOSE_P ); 262 // DT closes DD, DD closes DT 263 if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) ) 264 return isElement( tagName, CLOSE_DD_DT ); 265 // LI and OPTION close themselves 266 if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) ) 267 return isElement( tagName, CLOSE_SELF ); 268 // Each of these table sections closes all the others 269 if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) || 270 openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) || 271 openTag.equalsIgnoreCase( "COLGROUP" ) ) 272 return isElement( tagName, CLOSE_TABLE ); 273 // TD closes TH and TH closes TD 274 if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) ) 275 return isElement( tagName, CLOSE_TH_TD ); 276 return false; 277 } 278 279 280 /** 281 * Returns true if the specified attribute it a URI and should be 282 * escaped appropriately. In HTML URIs are escaped differently 283 * than normal attributes. 284 * 285 * @param tagName The element's tag name 286 * @param attrName The attribute's name 287 */ 288 public static boolean isURI( String tagName, String attrName ) 289 { 290 // Stupid checks. 291 return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) ); 292 } 293 294 295 /** 296 * Returns true if the specified attribute is a boolean and should be 297 * printed without the value. This applies to attributes that are true 298 * if they exist, such as selected (OPTION/INPUT). 299 * 300 * @param tagName The element's tag name 301 * @param attrName The attribute's name 302 */ 303 public static boolean isBoolean( String tagName, String attrName ) 304 { 305 String[] attrNames; 306 307 attrNames = _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) ); 308 if ( attrNames == null ) 309 return false; 310 for ( int i = 0 ; i < attrNames.length ; ++i ) 311 if ( attrNames[ i ].equalsIgnoreCase( attrName ) ) 312 return true; 313 return false; 314 } 315 316 317 /** 318 * Returns the value of an HTML character reference by its name. If the 319 * reference is not found or was not defined as a character reference, 320 * returns EOF (-1). 321 * 322 * @param name Name of character reference 323 * @return Character code or EOF (-1) 324 */ 325 public static int charFromName( String name ) 326 { 327 Object value; 328 329 initialize(); 330 value = _byName.get( name ); 331 if ( value != null && value instanceof Integer ) 332 return ( (Integer) value ).intValue(); 333 else 334 return -1; 335 } 336 337 338 /** 339 * Returns the name of an HTML character reference based on its character 340 * value. Only valid for entities defined from character references. If no 341 * such character value was defined, return null. 342 * 343 * @param value Character value of entity 344 * @return Entity's name or null 345 */ 346 public static String fromChar(int value ) 347 { 348 if (value > 0xffff) 349 return null; 350 351 String name; 352 353 initialize(); 354 name = _byChar.get(value); 355 return name; 356 } 357 358 359 /** 360 * Initialize upon first access. Will load all the HTML character references 361 * into a list that is accessible by name or character value and is optimized 362 * for character substitution. This method may be called any number of times 363 * but will execute only once. 364 */ 365 private static void initialize() 366 { 367 InputStream is = null; 368 BufferedReader reader = null; 369 int index; 370 String name; 371 String value; 372 int code; 373 String line; 374 375 // Make sure not to initialize twice. 376 if ( _byName != null ) 377 return; 378 try { 379 _byName = new HashMap<>(); 380 _byChar = new HashMap<>(); 381 is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE ); 382 if ( is == null ) { 383 throw new RuntimeException( 384 DOMMessageFormatter.formatMessage( 385 DOMMessageFormatter.SERIALIZER_DOMAIN, 386 "ResourceNotFound", new Object[] {ENTITIES_RESOURCE})); 387 } 388 reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) ); 389 line = reader.readLine(); 390 while ( line != null ) { 391 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) { 392 line = reader.readLine(); 393 continue; 394 } 395 index = line.indexOf( ' ' ); 396 if ( index > 1 ) { 397 name = line.substring( 0, index ); 398 ++index; 399 if ( index < line.length() ) { 400 value = line.substring( index ); 401 index = value.indexOf( ' ' ); 402 if ( index > 0 ) 403 value = value.substring( 0, index ); 404 code = Integer.parseInt( value ); 405 defineEntity( name, (char) code ); 406 } 407 } 408 line = reader.readLine(); 409 } 410 is.close(); 411 } catch ( Exception except ) { 412 throw new RuntimeException( 413 DOMMessageFormatter.formatMessage( 414 DOMMessageFormatter.SERIALIZER_DOMAIN, 415 "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()})); 416 } finally { 417 if ( is != null ) { 418 try { 419 is.close(); 420 } catch ( Exception except ) { } 421 } 422 } 423 } 424 425 426 /** 427 * Defines a new character reference. The reference's name and value are 428 * supplied. Nothing happens if the character reference is already defined. 429 * <P> 430 * Unlike internal entities, character references are a string to single 431 * character mapping. They are used to map non-ASCII characters both on 432 * parsing and printing, primarily for HTML documents. '<amp;' is an 433 * example of a character reference. 434 * 435 * @param name The entity's name 436 * @param value The entity's value 437 */ 438 private static void defineEntity( String name, char value ) 439 { 440 if ( _byName.get( name ) == null ) { 441 _byName.put( name, new Integer( value ) ); 442 _byChar.put( new Integer( value ), name ); 443 } 444 } 445 446 447 private static void defineElement( String name, int flags ) 448 { 449 _elemDefs.put(name, flags); 450 } 451 452 453 private static void defineBoolean( String tagName, String attrName ) 454 { 455 defineBoolean( tagName, new String[] { attrName } ); 456 } 457 458 459 private static void defineBoolean( String tagName, String[] attrNames ) 460 { 461 _boolAttrs.put( tagName, attrNames ); 462 } 463 464 465 private static boolean isElement( String name, int flag ) 466 { 467 Integer flags; 468 469 flags = _elemDefs.get( name.toUpperCase(Locale.ENGLISH) ); 470 if ( flags == null ) 471 return false; 472 else 473 return ( ( flags.intValue() & flag ) == flag ); 474 } 475 476 477 static 478 { 479 _elemDefs = new HashMap<>(); 480 defineElement( "ADDRESS", CLOSE_P ); 481 defineElement( "AREA", EMPTY ); 482 defineElement( "BASE", EMPTY | ALLOWED_HEAD ); 483 defineElement( "BASEFONT", EMPTY ); 484 defineElement( "BLOCKQUOTE", CLOSE_P ); 485 defineElement( "BODY", OPT_CLOSING ); 486 defineElement( "BR", EMPTY ); 487 defineElement( "COL", EMPTY ); 488 defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 489 defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); 490 defineElement( "DIV", CLOSE_P ); 491 defineElement( "DL", ELEM_CONTENT | CLOSE_P ); 492 defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); 493 defineElement( "FIELDSET", CLOSE_P ); 494 defineElement( "FORM", CLOSE_P ); 495 defineElement( "FRAME", EMPTY | OPT_CLOSING ); 496 defineElement( "H1", CLOSE_P ); 497 defineElement( "H2", CLOSE_P ); 498 defineElement( "H3", CLOSE_P ); 499 defineElement( "H4", CLOSE_P ); 500 defineElement( "H5", CLOSE_P ); 501 defineElement( "H6", CLOSE_P ); 502 defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING ); 503 defineElement( "HR", EMPTY | CLOSE_P ); 504 defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING ); 505 defineElement( "IMG", EMPTY ); 506 defineElement( "INPUT", EMPTY ); 507 defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD ); 508 defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); 509 defineElement( "LINK", EMPTY | ALLOWED_HEAD ); 510 defineElement( "MAP", ALLOWED_HEAD ); 511 defineElement( "META", EMPTY | ALLOWED_HEAD ); 512 defineElement( "OL", ELEM_CONTENT | CLOSE_P ); 513 defineElement( "OPTGROUP", ELEM_CONTENT ); 514 defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); 515 defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF ); 516 defineElement( "PARAM", EMPTY ); 517 defineElement( "PRE", PRESERVE | CLOSE_P ); 518 defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE ); 519 defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE ); 520 defineElement( "SELECT", ELEM_CONTENT ); 521 defineElement( "STYLE", ALLOWED_HEAD | PRESERVE ); 522 defineElement( "TABLE", ELEM_CONTENT | CLOSE_P ); 523 defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 524 defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD ); 525 defineElement( "TEXTAREA", PRESERVE ); 526 defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 527 defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD ); 528 defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 529 defineElement( "TITLE", ALLOWED_HEAD ); 530 defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 531 defineElement( "UL", ELEM_CONTENT | CLOSE_P ); 532 533 _boolAttrs = new HashMap<>(); 534 defineBoolean( "AREA", "href" ); 535 defineBoolean( "BUTTON", "disabled" ); 536 defineBoolean( "DIR", "compact" ); 537 defineBoolean( "DL", "compact" ); 538 defineBoolean( "FRAME", "noresize" ); 539 defineBoolean( "HR", "noshade" ); 540 defineBoolean( "IMAGE", "ismap" ); 541 defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } ); 542 defineBoolean( "LINK", "link" ); 543 defineBoolean( "MENU", "compact" ); 544 defineBoolean( "OBJECT", "declare" ); 545 defineBoolean( "OL", "compact" ); 546 defineBoolean( "OPTGROUP", "disabled" ); 547 defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } ); 548 defineBoolean( "SCRIPT", "defer" ); 549 defineBoolean( "SELECT", new String[] { "multiple", "disabled" } ); 550 defineBoolean( "STYLE", "disabled" ); 551 defineBoolean( "TD", "nowrap" ); 552 defineBoolean( "TH", "nowrap" ); 553 defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } ); 554 defineBoolean( "UL", "compact" ); 555 556 initialize(); 557 } 558 559 560 561 }