1 /* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 // Aug 21, 2000: 22 // Fixed bug in isElement and made HTMLdtd public. 23 // Contributed by Eric SCHAEFFER" <eschaeffer@posterconseil.com> 24 25 26 package com.sun.org.apache.xml.internal.serialize; 27 28 import com.sun.org.apache.xerces.internal.dom.DOMMessageFormatter; 29 import java.io.BufferedReader; 30 import java.io.InputStream; 31 import java.io.InputStreamReader; 32 import java.util.HashMap; 33 import java.util.Locale; 34 import java.util.Map; 35 36 37 /** 38 * Utility class for accessing information specific to HTML documents. 39 * The HTML DTD is expressed as three utility function groups. Two methods 40 * allow for checking whether an element requires an open tag on printing 41 * ({@link #isEmptyTag}) or on parsing ({@link #isOptionalClosing}). 42 * <P> 43 * Two other methods translate character references from name to value and 44 * from value to name. A small entities resource is loaded into memory the 45 * first time any of these methods is called for fast and efficient access. 46 * 47 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a> 48 * 49 * @deprecated As of JDK 9, Xerces 2.9.0, Xerces DOM L3 Serializer implementation 50 * is replaced by that of Xalan. Main class 51 * {@link com.sun.org.apache.xml.internal.serialize.DOMSerializerImpl} is replaced 52 * by {@link com.sun.org.apache.xml.internal.serializer.dom3.LSSerializerImpl}. 53 */ 54 public final class HTMLdtd 55 { 56 57 /** 58 * Public identifier for HTML 4.01 (Strict) document type. 59 */ 60 public static final String HTMLPublicId = "-//W3C//DTD HTML 4.01//EN"; 61 62 /** 63 * System identifier for HTML 4.01 (Strict) document type. 64 */ 65 public static final String HTMLSystemId = 66 "http://www.w3.org/TR/html4/strict.dtd"; 67 68 /** 69 * Public identifier for XHTML 1.0 (Strict) document type. 70 */ 71 public static final String XHTMLPublicId = 72 "-//W3C//DTD XHTML 1.0 Strict//EN"; 73 74 /** 75 * System identifier for XHTML 1.0 (Strict) document type. 76 */ 77 public static final String XHTMLSystemId = 78 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; 79 80 /** 81 * Table of reverse character reference mapping. Character codes are held 82 * as single-character strings, mapped to their reference name. 83 */ 84 private static Map<Integer, String> _byChar; 85 86 87 /** 88 * Table of entity name to value mapping. Entities are held as strings, 89 * character references as <TT>Character</TT> objects. 90 */ 91 private static Map<String, Integer> _byName; 92 93 94 private static final Map<String, String[]> _boolAttrs; 95 96 97 /** 98 * Holds element definitions. 99 */ 100 private static final Map<String, Integer> _elemDefs; 101 102 103 /** 104 * Locates the HTML entities file that is loaded upon initialization. 105 * This file is a resource loaded with the default class loader. 106 */ 107 private static final String ENTITIES_RESOURCE = "HTMLEntities.res"; 108 109 110 /** 111 * Only opening tag should be printed. 112 */ 113 private static final int ONLY_OPENING = 0x0001; 114 115 /** 116 * Element contains element content only. 117 */ 118 private static final int ELEM_CONTENT = 0x0002; 119 120 121 /** 122 * Element preserve spaces. 123 */ 124 private static final int PRESERVE = 0x0004; 125 126 127 /** 128 * Optional closing tag. 129 */ 130 private static final int OPT_CLOSING = 0x0008; 131 132 133 /** 134 * Element is empty (also means only opening tag) 135 */ 136 private static final int EMPTY = 0x0010 | ONLY_OPENING; 137 138 139 /** 140 * Allowed to appear in head. 141 */ 142 private static final int ALLOWED_HEAD = 0x0020; 143 144 145 /** 146 * When opened, closes P. 147 */ 148 private static final int CLOSE_P = 0x0040; 149 150 151 /** 152 * When opened, closes DD or DT. 153 */ 154 private static final int CLOSE_DD_DT = 0x0080; 155 156 157 /** 158 * When opened, closes itself. 159 */ 160 private static final int CLOSE_SELF = 0x0100; 161 162 163 /** 164 * When opened, closes another table section. 165 */ 166 private static final int CLOSE_TABLE = 0x0200; 167 168 169 /** 170 * When opened, closes TH or TD. 171 */ 172 private static final int CLOSE_TH_TD = 0x04000; 173 174 175 /** 176 * Returns true if element is declared to be empty. HTML elements are 177 * defines as empty in the DTD, not by the document syntax. 178 * 179 * @param tagName The element tag name (upper case) 180 * @return True if element is empty 181 */ 182 public static boolean isEmptyTag( String tagName ) 183 { 184 return isElement( tagName, EMPTY ); 185 } 186 187 188 /** 189 * Returns true if element is declared to have element content. 190 * Whitespaces appearing inside element content will be ignored, 191 * other text will simply report an error. 192 * 193 * @param tagName The element tag name (upper case) 194 * @return True if element content 195 */ 196 public static boolean isElementContent( String tagName ) 197 { 198 return isElement( tagName, ELEM_CONTENT ); 199 } 200 201 202 /** 203 * Returns true if element's textual contents preserves spaces. 204 * This only applies to PRE and TEXTAREA, all other HTML elements 205 * do not preserve space. 206 * 207 * @param tagName The element tag name (upper case) 208 * @return True if element's text content preserves spaces 209 */ 210 public static boolean isPreserveSpace( String tagName ) 211 { 212 return isElement( tagName, PRESERVE ); 213 } 214 215 216 /** 217 * Returns true if element's closing tag is optional and need not 218 * exist. An error will not be reported for such elements if they 219 * are not closed. For example, <tt>LI</tt> is most often not closed. 220 * 221 * @param tagName The element tag name (upper case) 222 * @return True if closing tag implied 223 */ 224 public static boolean isOptionalClosing( String tagName ) 225 { 226 return isElement( tagName, OPT_CLOSING ); 227 } 228 229 230 /** 231 * Returns true if element's closing tag is generally not printed. 232 * For example, <tt>LI</tt> should not print the closing tag. 233 * 234 * @param tagName The element tag name (upper case) 235 * @return True if only opening tag should be printed 236 */ 237 public static boolean isOnlyOpening( String tagName ) 238 { 239 return isElement( tagName, ONLY_OPENING ); 240 } 241 242 243 /** 244 * Returns true if the opening of one element (<tt>tagName</tt>) implies 245 * the closing of another open element (<tt>openTag</tt>). For example, 246 * every opening <tt>LI</tt> will close the previously open <tt>LI</tt>, 247 * and every opening <tt>BODY</tt> will close the previously open <tt>HEAD</tt>. 248 * 249 * @param tagName The newly opened element 250 * @param openTag The already opened element 251 * @return True if closing tag closes opening tag 252 */ 253 public static boolean isClosing( String tagName, String openTag ) 254 { 255 // Several elements are defined as closing the HEAD 256 if ( openTag.equalsIgnoreCase( "HEAD" ) ) 257 return ! isElement( tagName, ALLOWED_HEAD ); 258 // P closes iteself 259 if ( openTag.equalsIgnoreCase( "P" ) ) 260 return isElement( tagName, CLOSE_P ); 261 // DT closes DD, DD closes DT 262 if ( openTag.equalsIgnoreCase( "DT" ) || openTag.equalsIgnoreCase( "DD" ) ) 263 return isElement( tagName, CLOSE_DD_DT ); 264 // LI and OPTION close themselves 265 if ( openTag.equalsIgnoreCase( "LI" ) || openTag.equalsIgnoreCase( "OPTION" ) ) 266 return isElement( tagName, CLOSE_SELF ); 267 // Each of these table sections closes all the others 268 if ( openTag.equalsIgnoreCase( "THEAD" ) || openTag.equalsIgnoreCase( "TFOOT" ) || 269 openTag.equalsIgnoreCase( "TBODY" ) || openTag.equalsIgnoreCase( "TR" ) || 270 openTag.equalsIgnoreCase( "COLGROUP" ) ) 271 return isElement( tagName, CLOSE_TABLE ); 272 // TD closes TH and TH closes TD 273 if ( openTag.equalsIgnoreCase( "TH" ) || openTag.equalsIgnoreCase( "TD" ) ) 274 return isElement( tagName, CLOSE_TH_TD ); 275 return false; 276 } 277 278 279 /** 280 * Returns true if the specified attribute it a URI and should be 281 * escaped appropriately. In HTML URIs are escaped differently 282 * than normal attributes. 283 * 284 * @param tagName The element's tag name 285 * @param attrName The attribute's name 286 */ 287 public static boolean isURI( String tagName, String attrName ) 288 { 289 // Stupid checks. 290 return ( attrName.equalsIgnoreCase( "href" ) || attrName.equalsIgnoreCase( "src" ) ); 291 } 292 293 294 /** 295 * Returns true if the specified attribute is a boolean and should be 296 * printed without the value. This applies to attributes that are true 297 * if they exist, such as selected (OPTION/INPUT). 298 * 299 * @param tagName The element's tag name 300 * @param attrName The attribute's name 301 */ 302 public static boolean isBoolean( String tagName, String attrName ) 303 { 304 String[] attrNames; 305 306 attrNames = _boolAttrs.get( tagName.toUpperCase(Locale.ENGLISH) ); 307 if ( attrNames == null ) 308 return false; 309 for ( int i = 0 ; i < attrNames.length ; ++i ) 310 if ( attrNames[ i ].equalsIgnoreCase( attrName ) ) 311 return true; 312 return false; 313 } 314 315 316 /** 317 * Returns the value of an HTML character reference by its name. If the 318 * reference is not found or was not defined as a character reference, 319 * returns EOF (-1). 320 * 321 * @param name Name of character reference 322 * @return Character code or EOF (-1) 323 */ 324 public static int charFromName( String name ) 325 { 326 Object value; 327 328 initialize(); 329 value = _byName.get( name ); 330 if ( value != null && value instanceof Integer ) 331 return ( (Integer) value ).intValue(); 332 else 333 return -1; 334 } 335 336 337 /** 338 * Returns the name of an HTML character reference based on its character 339 * value. Only valid for entities defined from character references. If no 340 * such character value was defined, return null. 341 * 342 * @param value Character value of entity 343 * @return Entity's name or null 344 */ 345 public static String fromChar(int value ) 346 { 347 if (value > 0xffff) 348 return null; 349 350 String name; 351 352 initialize(); 353 name = _byChar.get(value); 354 return name; 355 } 356 357 358 /** 359 * Initialize upon first access. Will load all the HTML character references 360 * into a list that is accessible by name or character value and is optimized 361 * for character substitution. This method may be called any number of times 362 * but will execute only once. 363 */ 364 private static void initialize() 365 { 366 InputStream is = null; 367 BufferedReader reader = null; 368 int index; 369 String name; 370 String value; 371 int code; 372 String line; 373 374 // Make sure not to initialize twice. 375 if ( _byName != null ) 376 return; 377 try { 378 _byName = new HashMap<>(); 379 _byChar = new HashMap<>(); 380 is = HTMLdtd.class.getResourceAsStream( ENTITIES_RESOURCE ); 381 if ( is == null ) { 382 throw new RuntimeException( 383 DOMMessageFormatter.formatMessage( 384 DOMMessageFormatter.SERIALIZER_DOMAIN, 385 "ResourceNotFound", new Object[] {ENTITIES_RESOURCE})); 386 } 387 reader = new BufferedReader( new InputStreamReader( is, "ASCII" ) ); 388 line = reader.readLine(); 389 while ( line != null ) { 390 if ( line.length() == 0 || line.charAt( 0 ) == '#' ) { 391 line = reader.readLine(); 392 continue; 393 } 394 index = line.indexOf( ' ' ); 395 if ( index > 1 ) { 396 name = line.substring( 0, index ); 397 ++index; 398 if ( index < line.length() ) { 399 value = line.substring( index ); 400 index = value.indexOf( ' ' ); 401 if ( index > 0 ) 402 value = value.substring( 0, index ); 403 code = Integer.parseInt( value ); 404 defineEntity( name, (char) code ); 405 } 406 } 407 line = reader.readLine(); 408 } 409 is.close(); 410 } catch ( Exception except ) { 411 throw new RuntimeException( 412 DOMMessageFormatter.formatMessage( 413 DOMMessageFormatter.SERIALIZER_DOMAIN, 414 "ResourceNotLoaded", new Object[] {ENTITIES_RESOURCE, except.toString()})); 415 } finally { 416 if ( is != null ) { 417 try { 418 is.close(); 419 } catch ( Exception except ) { } 420 } 421 } 422 } 423 424 425 /** 426 * Defines a new character reference. The reference's name and value are 427 * supplied. Nothing happens if the character reference is already defined. 428 * <P> 429 * Unlike internal entities, character references are a string to single 430 * character mapping. They are used to map non-ASCII characters both on 431 * parsing and printing, primarily for HTML documents. '<amp;' is an 432 * example of a character reference. 433 * 434 * @param name The entity's name 435 * @param value The entity's value 436 */ 437 private static void defineEntity( String name, char value ) 438 { 439 if ( _byName.get( name ) == null ) { 440 _byName.put( name, new Integer( value ) ); 441 _byChar.put( new Integer( value ), name ); 442 } 443 } 444 445 446 private static void defineElement( String name, int flags ) 447 { 448 _elemDefs.put(name, flags); 449 } 450 451 452 private static void defineBoolean( String tagName, String attrName ) 453 { 454 defineBoolean( tagName, new String[] { attrName } ); 455 } 456 457 458 private static void defineBoolean( String tagName, String[] attrNames ) 459 { 460 _boolAttrs.put( tagName, attrNames ); 461 } 462 463 464 private static boolean isElement( String name, int flag ) 465 { 466 Integer flags; 467 468 flags = _elemDefs.get( name.toUpperCase(Locale.ENGLISH) ); 469 if ( flags == null ) 470 return false; 471 else 472 return ( ( flags.intValue() & flag ) == flag ); 473 } 474 475 476 static 477 { 478 _elemDefs = new HashMap<>(); 479 defineElement( "ADDRESS", CLOSE_P ); 480 defineElement( "AREA", EMPTY ); 481 defineElement( "BASE", EMPTY | ALLOWED_HEAD ); 482 defineElement( "BASEFONT", EMPTY ); 483 defineElement( "BLOCKQUOTE", CLOSE_P ); 484 defineElement( "BODY", OPT_CLOSING ); 485 defineElement( "BR", EMPTY ); 486 defineElement( "COL", EMPTY ); 487 defineElement( "COLGROUP", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 488 defineElement( "DD", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); 489 defineElement( "DIV", CLOSE_P ); 490 defineElement( "DL", ELEM_CONTENT | CLOSE_P ); 491 defineElement( "DT", OPT_CLOSING | ONLY_OPENING | CLOSE_DD_DT ); 492 defineElement( "FIELDSET", CLOSE_P ); 493 defineElement( "FORM", CLOSE_P ); 494 defineElement( "FRAME", EMPTY | OPT_CLOSING ); 495 defineElement( "H1", CLOSE_P ); 496 defineElement( "H2", CLOSE_P ); 497 defineElement( "H3", CLOSE_P ); 498 defineElement( "H4", CLOSE_P ); 499 defineElement( "H5", CLOSE_P ); 500 defineElement( "H6", CLOSE_P ); 501 defineElement( "HEAD", ELEM_CONTENT | OPT_CLOSING ); 502 defineElement( "HR", EMPTY | CLOSE_P ); 503 defineElement( "HTML", ELEM_CONTENT | OPT_CLOSING ); 504 defineElement( "IMG", EMPTY ); 505 defineElement( "INPUT", EMPTY ); 506 defineElement( "ISINDEX", EMPTY | ALLOWED_HEAD ); 507 defineElement( "LI", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); 508 defineElement( "LINK", EMPTY | ALLOWED_HEAD ); 509 defineElement( "MAP", ALLOWED_HEAD ); 510 defineElement( "META", EMPTY | ALLOWED_HEAD ); 511 defineElement( "OL", ELEM_CONTENT | CLOSE_P ); 512 defineElement( "OPTGROUP", ELEM_CONTENT ); 513 defineElement( "OPTION", OPT_CLOSING | ONLY_OPENING | CLOSE_SELF ); 514 defineElement( "P", OPT_CLOSING | CLOSE_P | CLOSE_SELF ); 515 defineElement( "PARAM", EMPTY ); 516 defineElement( "PRE", PRESERVE | CLOSE_P ); 517 defineElement( "SCRIPT", ALLOWED_HEAD | PRESERVE ); 518 defineElement( "NOSCRIPT", ALLOWED_HEAD | PRESERVE ); 519 defineElement( "SELECT", ELEM_CONTENT ); 520 defineElement( "STYLE", ALLOWED_HEAD | PRESERVE ); 521 defineElement( "TABLE", ELEM_CONTENT | CLOSE_P ); 522 defineElement( "TBODY", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 523 defineElement( "TD", OPT_CLOSING | CLOSE_TH_TD ); 524 defineElement( "TEXTAREA", PRESERVE ); 525 defineElement( "TFOOT", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 526 defineElement( "TH", OPT_CLOSING | CLOSE_TH_TD ); 527 defineElement( "THEAD", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 528 defineElement( "TITLE", ALLOWED_HEAD ); 529 defineElement( "TR", ELEM_CONTENT | OPT_CLOSING | CLOSE_TABLE ); 530 defineElement( "UL", ELEM_CONTENT | CLOSE_P ); 531 532 _boolAttrs = new HashMap<>(); 533 defineBoolean( "AREA", "href" ); 534 defineBoolean( "BUTTON", "disabled" ); 535 defineBoolean( "DIR", "compact" ); 536 defineBoolean( "DL", "compact" ); 537 defineBoolean( "FRAME", "noresize" ); 538 defineBoolean( "HR", "noshade" ); 539 defineBoolean( "IMAGE", "ismap" ); 540 defineBoolean( "INPUT", new String[] { "defaultchecked", "checked", "readonly", "disabled" } ); 541 defineBoolean( "LINK", "link" ); 542 defineBoolean( "MENU", "compact" ); 543 defineBoolean( "OBJECT", "declare" ); 544 defineBoolean( "OL", "compact" ); 545 defineBoolean( "OPTGROUP", "disabled" ); 546 defineBoolean( "OPTION", new String[] { "default-selected", "selected", "disabled" } ); 547 defineBoolean( "SCRIPT", "defer" ); 548 defineBoolean( "SELECT", new String[] { "multiple", "disabled" } ); 549 defineBoolean( "STYLE", "disabled" ); 550 defineBoolean( "TD", "nowrap" ); 551 defineBoolean( "TH", "nowrap" ); 552 defineBoolean( "TEXTAREA", new String[] { "disabled", "readonly" } ); 553 defineBoolean( "UL", "compact" ); 554 555 initialize(); 556 } 557 558 559 560 }