1 /*
   2  * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package com.sun.xml.internal.dtdparser;
  27 
  28 
  29 /**
  30  * Methods in this class are used to determine whether characters may
  31  * appear in certain roles in XML documents.  Such methods are used
  32  * both to parse and to create such documents.
  33  *
  34  * @author David Brownell
  35  * @version 1.1, 00/08/05
  36  */
  37 public class XmlChars {
  38     // can't construct instances
  39     private XmlChars() {
  40     }
  41 
  42     /**
  43      * Returns true if the argument, a UCS-4 character code, is valid in
  44      * XML documents.  Unicode characters fit into the low sixteen
  45      * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
  46      * characters</em> can be combined to encode UCS-4 characters in
  47      * documents containing only Unicode.  (The <code>char</code> datatype
  48      * in the Java Programming Language represents Unicode characters,
  49      * including unpaired surrogates.)
  50      * <p/>
  51      * <P> In XML, UCS-4 characters can also be encoded by the use of
  52      * <em>character references</em> such as <b>&amp;#x12345678;</b>, which
  53      * happens to refer to a character that is disallowed in XML documents.
  54      * UCS-4 characters allowed in XML documents can be expressed with
  55      * one or two Unicode characters.
  56      *
  57      * @param ucs4char The 32-bit UCS-4 character being tested.
  58      */
  59     static public boolean isChar(int ucs4char) {
  60         // [2] Char ::= #x0009 | #x000A | #x000D
  61         //            | [#x0020-#xD7FF]
  62         //    ... surrogates excluded!
  63         //            | [#xE000-#xFFFD]
  64         //             | [#x10000-#x10ffff]
  65         return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
  66                 || ucs4char == 0x000A || ucs4char == 0x0009
  67                 || ucs4char == 0x000D
  68                 || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
  69                 || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
  70     }
  71 
  72     /**
  73      * Returns true if the character is allowed to be a non-initial
  74      * character in names according to the XML recommendation.
  75      *
  76      * @see #isNCNameChar(char)
  77      * @see #isLetter(char)
  78      */
  79     public static boolean isNameChar(char c) {
  80         // [4] NameChar ::= Letter | Digit | '.' | '_' | ':'
  81         //            | CombiningChar | Extender
  82 
  83         if (isLetter2(c))
  84             return true;
  85         else if (c == '>')
  86             return false;
  87         else if (c == '.' || c == '-' || c == '_' || c == ':'
  88                 || isExtender(c))
  89             return true;
  90         else
  91             return false;
  92     }
  93 
  94     /**
  95      * Returns true if the character is allowed to be a non-initial
  96      * character in unscoped names according to the rules of the XML
  97      * Namespaces proposed recommendation.  Except for precluding
  98      * the colon (used to separate names from their scopes) these
  99      * characters are just as allowed by the XML recommendation.
 100      *
 101      * @see #isNameChar(char)
 102      * @see #isLetter(char)
 103      */
 104     public static boolean isNCNameChar(char c) {
 105         // [NC 5] NCNameChar ::= Letter | Digit | '.' | '_'
 106         //            | CombiningChar | Extender
 107         return c != ':' && isNameChar(c);
 108     }
 109 
 110     /**
 111      * Returns true if the character is allowed where XML supports
 112      * whitespace characters, false otherwise.
 113      */
 114     public static boolean isSpace(char c) {
 115         return c == ' ' || c == '\t' || c == '\n' || c == '\r';
 116     }
 117 
 118 
 119     /*
 120      * NOTE:  java.lang.Character.getType() values are:
 121      *
 122      * UNASSIGNED                    = 0,
 123      *
 124      * UPPERCASE_LETTER            = 1,    // Lu
 125      * LOWERCASE_LETTER            = 2,    // Ll
 126      * TITLECASE_LETTER            = 3,    // Lt
 127      * MODIFIER_LETTER             = 4,    // Lm
 128      * OTHER_LETTER                = 5,    // Lo
 129      * NON_SPACING_MARK            = 6,    // Mn
 130      * ENCLOSING_MARK              = 7,    // Me
 131      * COMBINING_SPACING_MARK      = 8,    // Mc
 132      * DECIMAL_DIGIT_NUMBER        = 9,    // Nd
 133      * LETTER_NUMBER               = 10,   // Nl
 134      * OTHER_NUMBER                = 11,   // No
 135      * SPACE_SEPARATOR             = 12,   // Zs
 136      * LINE_SEPARATOR              = 13,   // Zl
 137      * PARAGRAPH_SEPARATOR         = 14,   // Zp
 138      * CONTROL                     = 15,   // Cc
 139      * FORMAT                      = 16,   // Cf
 140      *                         // 17 reserved for proposed Ci category
 141      * PRIVATE_USE                 = 18,   // Co
 142      * SURROGATE                   = 19,   // Cs
 143      * DASH_PUNCTUATION            = 20,   // Pd
 144      * START_PUNCTUATION           = 21,   // Ps
 145      * END_PUNCTUATION             = 22,   // Pe
 146      * CONNECTOR_PUNCTUATION       = 23,   // Pc
 147      * OTHER_PUNCTUATION           = 24,   // Po
 148      * MATH_SYMBOL                 = 25,   // Sm
 149      * CURRENCY_SYMBOL             = 26,   // Sc
 150      * MODIFIER_SYMBOL             = 27,   // Sk
 151      * OTHER_SYMBOL                = 28;   // So
 152      */
 153 
 154     /**
 155      * Returns true if the character is an XML "letter".  XML Names must
 156      * start with Letters or a few other characters, but other characters
 157      * in names must only satisfy the <em>isNameChar</em> predicate.
 158      *
 159      * @see #isNameChar(char)
 160      * @see #isNCNameChar(char)
 161      */
 162     public static boolean isLetter(char c) {
 163         // [84] Letter ::= BaseChar | Ideographic
 164         // [85] BaseChar ::= ... too much to repeat
 165         // [86] Ideographic ::= ... too much to repeat
 166 
 167         //
 168         // Optimize the typical case.
 169         //
 170         if (c >= 'a' && c <= 'z')
 171             return true;
 172         if (c == '/')
 173             return false;
 174         if (c >= 'A' && c <= 'Z')
 175             return true;
 176 
 177         //
 178         // Since the tables are too ridiculous to use in code,
 179         // we're using the footnotes here to drive this test.
 180         //
 181         switch (Character.getType(c)) {
 182         // app. B footnote says these are 'name start'
 183         // chars' ...
 184         case Character.LOWERCASE_LETTER:        // Ll
 185         case Character.UPPERCASE_LETTER:        // Lu
 186         case Character.OTHER_LETTER:            // Lo
 187         case Character.TITLECASE_LETTER:        // Lt
 188         case Character.LETTER_NUMBER:            // Nl
 189 
 190             // OK, here we just have some exceptions to check...
 191             return !isCompatibilityChar(c)
 192                     // per "5.14 of Unicode", rule out some combiners
 193                     && !(c >= 0x20dd && c <= 0x20e0);
 194 
 195         default:
 196             // check for some exceptions:  these are "alphabetic"
 197             return ((c >= 0x02bb && c <= 0x02c1)
 198                     || c == 0x0559 || c == 0x06e5 || c == 0x06e6);
 199         }
 200     }
 201 
 202     //
 203     // XML 1.0 discourages "compatibility" characters in names; these
 204     // were defined to permit passing through some information stored in
 205     // older non-Unicode character sets.  These always have alternative
 206     // representations in Unicode, e.g. using combining chars.
 207     //
 208     private static boolean isCompatibilityChar(char c) {
 209         // the numerous comparisions here seem unavoidable,
 210         // but the switch can reduce the number which must
 211         // actually be executed.
 212 
 213         switch ((c >> 8) & 0x0ff) {
 214         case 0x00:
 215             // ISO Latin/1 has a few compatibility characters
 216             return c == 0x00aa || c == 0x00b5 || c == 0x00ba;
 217 
 218         case 0x01:
 219             // as do Latin Extended A and (parts of) B
 220             return (c >= 0x0132 && c <= 0x0133)
 221                     || (c >= 0x013f && c <= 0x0140)
 222                     || c == 0x0149
 223                     || c == 0x017f
 224                     || (c >= 0x01c4 && c <= 0x01cc)
 225                     || (c >= 0x01f1 && c <= 0x01f3);
 226 
 227         case 0x02:
 228             // some spacing modifiers
 229             return (c >= 0x02b0 && c <= 0x02b8)
 230                     || (c >= 0x02e0 && c <= 0x02e4);
 231 
 232         case 0x03:
 233             return c == 0x037a;            // Greek
 234 
 235         case 0x05:
 236             return c == 0x0587;            // Armenian
 237 
 238         case 0x0e:
 239             return c >= 0x0edc && c <= 0x0edd;    // Laotian
 240 
 241         case 0x11:
 242             // big chunks of Hangul Jamo are all "compatibility"
 243             return c == 0x1101
 244                     || c == 0x1104
 245                     || c == 0x1108
 246                     || c == 0x110a
 247                     || c == 0x110d
 248                     || (c >= 0x1113 && c <= 0x113b)
 249                     || c == 0x113d
 250                     || c == 0x113f
 251                     || (c >= 0x1141 && c <= 0x114b)
 252                     || c == 0x114d
 253                     || c == 0x114f
 254                     || (c >= 0x1151 && c <= 0x1153)
 255                     || (c >= 0x1156 && c <= 0x1158)
 256                     || c == 0x1162
 257                     || c == 0x1164
 258                     || c == 0x1166
 259                     || c == 0x1168
 260                     || (c >= 0x116a && c <= 0x116c)
 261                     || (c >= 0x116f && c <= 0x1171)
 262                     || c == 0x1174
 263                     || (c >= 0x1176 && c <= 0x119d)
 264                     || (c >= 0x119f && c <= 0x11a2)
 265                     || (c >= 0x11a9 && c <= 0x11aa)
 266                     || (c >= 0x11ac && c <= 0x11ad)
 267                     || (c >= 0x11b0 && c <= 0x11b6)
 268                     || c == 0x11b9
 269                     || c == 0x11bb
 270                     || (c >= 0x11c3 && c <= 0x11ea)
 271                     || (c >= 0x11ec && c <= 0x11ef)
 272                     || (c >= 0x11f1 && c <= 0x11f8)
 273                     ;
 274 
 275         case 0x20:
 276             return c == 0x207f;            // superscript
 277 
 278         case 0x21:
 279             return
 280                     // various letterlike symbols
 281                     c == 0x2102
 282                     || c == 0x2107
 283                     || (c >= 0x210a && c <= 0x2113)
 284                     || c == 0x2115
 285                     || (c >= 0x2118 && c <= 0x211d)
 286                     || c == 0x2124
 287                     || c == 0x2128
 288                     || (c >= 0x212c && c <= 0x212d)
 289                     || (c >= 0x212f && c <= 0x2138)
 290 
 291                     // most Roman numerals (less 1K, 5K, 10K)
 292                     || (c >= 0x2160 && c <= 0x217f)
 293                     ;
 294 
 295         case 0x30:
 296             // some Hiragana
 297             return c >= 0x309b && c <= 0x309c;
 298 
 299         case 0x31:
 300             // all Hangul Compatibility Jamo
 301             return c >= 0x3131 && c <= 0x318e;
 302 
 303         case 0xf9:
 304         case 0xfa:
 305         case 0xfb:
 306         case 0xfc:
 307         case 0xfd:
 308         case 0xfe:
 309         case 0xff:
 310             // the whole "compatibility" area is for that purpose!
 311             return true;
 312 
 313         default:
 314             // most of Unicode isn't flagged as being for compatibility
 315             return false;
 316         }
 317     }
 318 
 319     // guts of isNameChar/isNCNameChar
 320     private static boolean isLetter2(char c) {
 321         // [84] Letter ::= BaseChar | Ideographic
 322         // [85] BaseChar ::= ... too much to repeat
 323         // [86] Ideographic ::= ... too much to repeat
 324         // [87] CombiningChar ::= ... too much to repeat
 325 
 326         //
 327         // Optimize the typical case.
 328         //
 329         if (c >= 'a' && c <= 'z')
 330             return true;
 331         if (c == '>')
 332             return false;
 333         if (c >= 'A' && c <= 'Z')
 334             return true;
 335 
 336         //
 337         // Since the tables are too ridiculous to use in code,
 338         // we're using the footnotes here to drive this test.
 339         //
 340         switch (Character.getType(c)) {
 341         // app. B footnote says these are 'name start'
 342         // chars' ...
 343         case Character.LOWERCASE_LETTER:        // Ll
 344         case Character.UPPERCASE_LETTER:        // Lu
 345         case Character.OTHER_LETTER:            // Lo
 346         case Character.TITLECASE_LETTER:        // Lt
 347         case Character.LETTER_NUMBER:            // Nl
 348             // ... and these are name characters 'other
 349             // than name start characters'
 350         case Character.COMBINING_SPACING_MARK:    // Mc
 351         case Character.ENCLOSING_MARK:        // Me
 352         case Character.NON_SPACING_MARK:        // Mn
 353         case Character.MODIFIER_LETTER:        // Lm
 354         case Character.DECIMAL_DIGIT_NUMBER:        // Nd
 355 
 356             // OK, here we just have some exceptions to check...
 357             return !isCompatibilityChar(c)
 358                     // per "5.14 of Unicode", rule out some combiners
 359                     && !(c >= 0x20dd && c <= 0x20e0);
 360 
 361         default:
 362             // added a character ...
 363             return c == 0x0387;
 364         }
 365     }
 366 
 367     private static boolean isDigit(char c) {
 368         // [88] Digit ::= ...
 369 
 370         //
 371         // java.lang.Character.isDigit is correct from the XML point
 372         // of view except that it allows "fullwidth" digits.
 373         //
 374         return Character.isDigit(c)
 375                 && !((c >= 0xff10) && (c <= 0xff19));
 376     }
 377 
 378     private static boolean isExtender(char c) {
 379         // [89] Extender ::= ...
 380         return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
 381                 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
 382                 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
 383                 || (c >= 0x309d && c <= 0x309e)
 384                 || (c >= 0x30fc && c <= 0x30fe)
 385                 ;
 386     }
 387 }