1 /* 2 * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package com.sun.xml.internal.dtdparser; 27 28 29 /** 30 * Methods in this class are used to determine whether characters may 31 * appear in certain roles in XML documents. Such methods are used 32 * both to parse and to create such documents. 33 * 34 * @author David Brownell 35 * @version 1.1, 00/08/05 36 */ 37 public class XmlChars { 38 // can't construct instances 39 private XmlChars() { 40 } 41 42 /** 43 * Returns true if the argument, a UCS-4 character code, is valid in 44 * XML documents. Unicode characters fit into the low sixteen 45 * bits of a UCS-4 character, and pairs of Unicode <em>surrogate 46 * characters</em> can be combined to encode UCS-4 characters in 47 * documents containing only Unicode. (The <code>char</code> datatype 48 * in the Java Programming Language represents Unicode characters, 49 * including unpaired surrogates.) 50 * <p/> 51 * <P> In XML, UCS-4 characters can also be encoded by the use of 52 * <em>character references</em> such as <b>&#x12345678;</b>, which 53 * happens to refer to a character that is disallowed in XML documents. 54 * UCS-4 characters allowed in XML documents can be expressed with 55 * one or two Unicode characters. 56 * 57 * @param ucs4char The 32-bit UCS-4 character being tested. 58 */ 59 static public boolean isChar(int ucs4char) { 60 // [2] Char ::= #x0009 | #x000A | #x000D 61 // | [#x0020-#xD7FF] 62 // ... surrogates excluded! 63 // | [#xE000-#xFFFD] 64 // | [#x10000-#x10ffff] 65 return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF) 66 || ucs4char == 0x000A || ucs4char == 0x0009 67 || ucs4char == 0x000D 68 || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD) 69 || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff)); 70 } 71 72 /** 73 * Returns true if the character is allowed to be a non-initial 74 * character in names according to the XML recommendation. 75 * 76 * @see #isNCNameChar(char) 77 * @see #isLetter(char) 78 */ 79 public static boolean isNameChar(char c) { 80 // [4] NameChar ::= Letter | Digit | '.' | '_' | ':' 81 // | CombiningChar | Extender 82 83 if (isLetter2(c)) 84 return true; 85 else if (c == '>') 86 return false; 87 else if (c == '.' || c == '-' || c == '_' || c == ':' 88 || isExtender(c)) 89 return true; 90 else 91 return false; 92 } 93 94 /** 95 * Returns true if the character is allowed to be a non-initial 96 * character in unscoped names according to the rules of the XML 97 * Namespaces proposed recommendation. Except for precluding 98 * the colon (used to separate names from their scopes) these 99 * characters are just as allowed by the XML recommendation. 100 * 101 * @see #isNameChar(char) 102 * @see #isLetter(char) 103 */ 104 public static boolean isNCNameChar(char c) { 105 // [NC 5] NCNameChar ::= Letter | Digit | '.' | '_' 106 // | CombiningChar | Extender 107 return c != ':' && isNameChar(c); 108 } 109 110 /** 111 * Returns true if the character is allowed where XML supports 112 * whitespace characters, false otherwise. 113 */ 114 public static boolean isSpace(char c) { 115 return c == ' ' || c == '\t' || c == '\n' || c == '\r'; 116 } 117 118 119 /* 120 * NOTE: java.lang.Character.getType() values are: 121 * 122 * UNASSIGNED = 0, 123 * 124 * UPPERCASE_LETTER = 1, // Lu 125 * LOWERCASE_LETTER = 2, // Ll 126 * TITLECASE_LETTER = 3, // Lt 127 * MODIFIER_LETTER = 4, // Lm 128 * OTHER_LETTER = 5, // Lo 129 * NON_SPACING_MARK = 6, // Mn 130 * ENCLOSING_MARK = 7, // Me 131 * COMBINING_SPACING_MARK = 8, // Mc 132 * DECIMAL_DIGIT_NUMBER = 9, // Nd 133 * LETTER_NUMBER = 10, // Nl 134 * OTHER_NUMBER = 11, // No 135 * SPACE_SEPARATOR = 12, // Zs 136 * LINE_SEPARATOR = 13, // Zl 137 * PARAGRAPH_SEPARATOR = 14, // Zp 138 * CONTROL = 15, // Cc 139 * FORMAT = 16, // Cf 140 * // 17 reserved for proposed Ci category 141 * PRIVATE_USE = 18, // Co 142 * SURROGATE = 19, // Cs 143 * DASH_PUNCTUATION = 20, // Pd 144 * START_PUNCTUATION = 21, // Ps 145 * END_PUNCTUATION = 22, // Pe 146 * CONNECTOR_PUNCTUATION = 23, // Pc 147 * OTHER_PUNCTUATION = 24, // Po 148 * MATH_SYMBOL = 25, // Sm 149 * CURRENCY_SYMBOL = 26, // Sc 150 * MODIFIER_SYMBOL = 27, // Sk 151 * OTHER_SYMBOL = 28; // So 152 */ 153 154 /** 155 * Returns true if the character is an XML "letter". XML Names must 156 * start with Letters or a few other characters, but other characters 157 * in names must only satisfy the <em>isNameChar</em> predicate. 158 * 159 * @see #isNameChar(char) 160 * @see #isNCNameChar(char) 161 */ 162 public static boolean isLetter(char c) { 163 // [84] Letter ::= BaseChar | Ideographic 164 // [85] BaseChar ::= ... too much to repeat 165 // [86] Ideographic ::= ... too much to repeat 166 167 // 168 // Optimize the typical case. 169 // 170 if (c >= 'a' && c <= 'z') 171 return true; 172 if (c == '/') 173 return false; 174 if (c >= 'A' && c <= 'Z') 175 return true; 176 177 // 178 // Since the tables are too ridiculous to use in code, 179 // we're using the footnotes here to drive this test. 180 // 181 switch (Character.getType(c)) { 182 // app. B footnote says these are 'name start' 183 // chars' ... 184 case Character.LOWERCASE_LETTER: // Ll 185 case Character.UPPERCASE_LETTER: // Lu 186 case Character.OTHER_LETTER: // Lo 187 case Character.TITLECASE_LETTER: // Lt 188 case Character.LETTER_NUMBER: // Nl 189 190 // OK, here we just have some exceptions to check... 191 return !isCompatibilityChar(c) 192 // per "5.14 of Unicode", rule out some combiners 193 && !(c >= 0x20dd && c <= 0x20e0); 194 195 default: 196 // check for some exceptions: these are "alphabetic" 197 return ((c >= 0x02bb && c <= 0x02c1) 198 || c == 0x0559 || c == 0x06e5 || c == 0x06e6); 199 } 200 } 201 202 // 203 // XML 1.0 discourages "compatibility" characters in names; these 204 // were defined to permit passing through some information stored in 205 // older non-Unicode character sets. These always have alternative 206 // representations in Unicode, e.g. using combining chars. 207 // 208 private static boolean isCompatibilityChar(char c) { 209 // the numerous comparisions here seem unavoidable, 210 // but the switch can reduce the number which must 211 // actually be executed. 212 213 switch ((c >> 8) & 0x0ff) { 214 case 0x00: 215 // ISO Latin/1 has a few compatibility characters 216 return c == 0x00aa || c == 0x00b5 || c == 0x00ba; 217 218 case 0x01: 219 // as do Latin Extended A and (parts of) B 220 return (c >= 0x0132 && c <= 0x0133) 221 || (c >= 0x013f && c <= 0x0140) 222 || c == 0x0149 223 || c == 0x017f 224 || (c >= 0x01c4 && c <= 0x01cc) 225 || (c >= 0x01f1 && c <= 0x01f3); 226 227 case 0x02: 228 // some spacing modifiers 229 return (c >= 0x02b0 && c <= 0x02b8) 230 || (c >= 0x02e0 && c <= 0x02e4); 231 232 case 0x03: 233 return c == 0x037a; // Greek 234 235 case 0x05: 236 return c == 0x0587; // Armenian 237 238 case 0x0e: 239 return c >= 0x0edc && c <= 0x0edd; // Laotian 240 241 case 0x11: 242 // big chunks of Hangul Jamo are all "compatibility" 243 return c == 0x1101 244 || c == 0x1104 245 || c == 0x1108 246 || c == 0x110a 247 || c == 0x110d 248 || (c >= 0x1113 && c <= 0x113b) 249 || c == 0x113d 250 || c == 0x113f 251 || (c >= 0x1141 && c <= 0x114b) 252 || c == 0x114d 253 || c == 0x114f 254 || (c >= 0x1151 && c <= 0x1153) 255 || (c >= 0x1156 && c <= 0x1158) 256 || c == 0x1162 257 || c == 0x1164 258 || c == 0x1166 259 || c == 0x1168 260 || (c >= 0x116a && c <= 0x116c) 261 || (c >= 0x116f && c <= 0x1171) 262 || c == 0x1174 263 || (c >= 0x1176 && c <= 0x119d) 264 || (c >= 0x119f && c <= 0x11a2) 265 || (c >= 0x11a9 && c <= 0x11aa) 266 || (c >= 0x11ac && c <= 0x11ad) 267 || (c >= 0x11b0 && c <= 0x11b6) 268 || c == 0x11b9 269 || c == 0x11bb 270 || (c >= 0x11c3 && c <= 0x11ea) 271 || (c >= 0x11ec && c <= 0x11ef) 272 || (c >= 0x11f1 && c <= 0x11f8) 273 ; 274 275 case 0x20: 276 return c == 0x207f; // superscript 277 278 case 0x21: 279 return 280 // various letterlike symbols 281 c == 0x2102 282 || c == 0x2107 283 || (c >= 0x210a && c <= 0x2113) 284 || c == 0x2115 285 || (c >= 0x2118 && c <= 0x211d) 286 || c == 0x2124 287 || c == 0x2128 288 || (c >= 0x212c && c <= 0x212d) 289 || (c >= 0x212f && c <= 0x2138) 290 291 // most Roman numerals (less 1K, 5K, 10K) 292 || (c >= 0x2160 && c <= 0x217f) 293 ; 294 295 case 0x30: 296 // some Hiragana 297 return c >= 0x309b && c <= 0x309c; 298 299 case 0x31: 300 // all Hangul Compatibility Jamo 301 return c >= 0x3131 && c <= 0x318e; 302 303 case 0xf9: 304 case 0xfa: 305 case 0xfb: 306 case 0xfc: 307 case 0xfd: 308 case 0xfe: 309 case 0xff: 310 // the whole "compatibility" area is for that purpose! 311 return true; 312 313 default: 314 // most of Unicode isn't flagged as being for compatibility 315 return false; 316 } 317 } 318 319 // guts of isNameChar/isNCNameChar 320 private static boolean isLetter2(char c) { 321 // [84] Letter ::= BaseChar | Ideographic 322 // [85] BaseChar ::= ... too much to repeat 323 // [86] Ideographic ::= ... too much to repeat 324 // [87] CombiningChar ::= ... too much to repeat 325 326 // 327 // Optimize the typical case. 328 // 329 if (c >= 'a' && c <= 'z') 330 return true; 331 if (c == '>') 332 return false; 333 if (c >= 'A' && c <= 'Z') 334 return true; 335 336 // 337 // Since the tables are too ridiculous to use in code, 338 // we're using the footnotes here to drive this test. 339 // 340 switch (Character.getType(c)) { 341 // app. B footnote says these are 'name start' 342 // chars' ... 343 case Character.LOWERCASE_LETTER: // Ll 344 case Character.UPPERCASE_LETTER: // Lu 345 case Character.OTHER_LETTER: // Lo 346 case Character.TITLECASE_LETTER: // Lt 347 case Character.LETTER_NUMBER: // Nl 348 // ... and these are name characters 'other 349 // than name start characters' 350 case Character.COMBINING_SPACING_MARK: // Mc 351 case Character.ENCLOSING_MARK: // Me 352 case Character.NON_SPACING_MARK: // Mn 353 case Character.MODIFIER_LETTER: // Lm 354 case Character.DECIMAL_DIGIT_NUMBER: // Nd 355 356 // OK, here we just have some exceptions to check... 357 return !isCompatibilityChar(c) 358 // per "5.14 of Unicode", rule out some combiners 359 && !(c >= 0x20dd && c <= 0x20e0); 360 361 default: 362 // added a character ... 363 return c == 0x0387; 364 } 365 } 366 367 private static boolean isDigit(char c) { 368 // [88] Digit ::= ... 369 370 // 371 // java.lang.Character.isDigit is correct from the XML point 372 // of view except that it allows "fullwidth" digits. 373 // 374 return Character.isDigit(c) 375 && !((c >= 0xff10) && (c <= 0xff19)); 376 } 377 378 private static boolean isExtender(char c) { 379 // [89] Extender ::= ... 380 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 381 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 382 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035) 383 || (c >= 0x309d && c <= 0x309e) 384 || (c >= 0x30fc && c <= 0x30fe) 385 ; 386 } 387 }