1 /*
2 * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
30 * Methods in this class are used to determine whether characters may
31 * appear in certain roles in XML documents. Such methods are used
32 * both to parse and to create such documents.
33 *
34 * @author David Brownell
35 * @version 1.1, 00/08/05
36 */
37 public class XmlChars {
38 // Private constructor: no code outside this class can create an instance.
39 private XmlChars() {
40 }
41
42 /**
43 * Returns true if the argument, a UCS-4 character code, is valid in
44 * XML documents. Unicode characters fit into the low sixteen
45 * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
46 * characters</em> can be combined to encode UCS-4 characters in
47 * documents containing only Unicode. (The <code>char</code> datatype
48 * in the Java Programming Language represents Unicode characters,
49 * including unpaired surrogates.)
50 * <p/>
51 * <P> In XML, UCS-4 characters can also be encoded by the use of
52 * <em>character references</em> such as <b>&#x12345678;</b>, which
53 * happens to refer to a character that is disallowed in XML documents.
54 * UCS-4 characters allowed in XML documents can be expressed with
55 * one or two Unicode characters.
56 *
57 * @param ucs4char The 32-bit UCS-4 character being tested.
58 */
59 static public boolean isChar(int ucs4char) {
60 // [2] Char ::= #x0009 | #x000A | #x000D
61 // | [#x0020-#xD7FF]
62 // ... surrogates excluded!
63 // | [#xE000-#xFFFD]
64 // | [#x10000-#x10ffff]
65 return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
66 || ucs4char == 0x000A || ucs4char == 0x0009
67 || ucs4char == 0x000D
68 || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
69 || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
70 }
347 case Character.LETTER_NUMBER: // Nl
348 // ... and these are name characters 'other
349 // than name start characters'
350 case Character.COMBINING_SPACING_MARK: // Mc
351 case Character.ENCLOSING_MARK: // Me
352 case Character.NON_SPACING_MARK: // Mn
353 case Character.MODIFIER_LETTER: // Lm
354 case Character.DECIMAL_DIGIT_NUMBER: // Nd
355
356 // OK, here we just have some exceptions to check...
357 return !isCompatibilityChar(c)
358 // per "5.14 of Unicode", rule out some combiners
359 && !(c >= 0x20dd && c <= 0x20e0);
360
361 default:
362 // added a character ...
363 return c == 0x0387;
364 }
365 }
366
367 private static boolean isDigit(char c) {
368 // [88] Digit ::= ...
369
370 //
371 // java.lang.Character.isDigit is correct from the XML point
372 // of view except that it allows "fullwidth" digits.
373 //
374 return Character.isDigit(c)
375 && !((c >= 0xff10) && (c <= 0xff19));
376 }
377
378 private static boolean isExtender(char c) {
379 // [89] Extender ::= ...
380 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
381 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
382 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
383 || (c >= 0x309d && c <= 0x309e)
384 || (c >= 0x30fc && c <= 0x30fe)
385 ;
386 }
387 }
|
1 /*
2 * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
30 * Methods in this class are used to determine whether characters may
31 * appear in certain roles in XML documents. Such methods are used
32 * both to parse and to create such documents.
33 *
34 * @author David Brownell
35 * @version 1.1, 00/08/05
36 */
37 public class XmlChars {
38 // Private constructor: no code outside this class can create an instance.
39 private XmlChars() {
40 }
41
42 /**
43 * Returns true if the argument, a UCS-4 character code, is valid in
44 * XML documents. Unicode characters fit into the low sixteen
45 * bits of a UCS-4 character, and pairs of Unicode <em>surrogate
46 * characters</em> can be combined to encode UCS-4 characters in
47 * documents containing only Unicode. (The <code>char</code> datatype
48 * in the Java Programming Language represents Unicode characters,
49 * including unpaired surrogates.)
50 * <p>
51 * <P> In XML, UCS-4 characters can also be encoded by the use of
52 * <em>character references</em> such as <b>&#x12345678;</b>, which
53 * happens to refer to a character that is disallowed in XML documents.
54 * UCS-4 characters allowed in XML documents can be expressed with
55 * one or two Unicode characters.
56 *
57 * @param ucs4char The 32-bit UCS-4 character being tested.
58 */
59 static public boolean isChar(int ucs4char) {
60 // [2] Char ::= #x0009 | #x000A | #x000D
61 // | [#x0020-#xD7FF]
62 // ... surrogates excluded!
63 // | [#xE000-#xFFFD]
64 // | [#x10000-#x10ffff]
65 return ((ucs4char >= 0x0020 && ucs4char <= 0xD7FF)
66 || ucs4char == 0x000A || ucs4char == 0x0009
67 || ucs4char == 0x000D
68 || (ucs4char >= 0xE000 && ucs4char <= 0xFFFD)
69 || (ucs4char >= 0x10000 && ucs4char <= 0x10ffff));
70 }
347 case Character.LETTER_NUMBER: // Nl
348 // ... and these are name characters 'other
349 // than name start characters'
350 case Character.COMBINING_SPACING_MARK: // Mc
351 case Character.ENCLOSING_MARK: // Me
352 case Character.NON_SPACING_MARK: // Mn
353 case Character.MODIFIER_LETTER: // Lm
354 case Character.DECIMAL_DIGIT_NUMBER: // Nd
355
356 // OK, here we just have some exceptions to check...
357 return !isCompatibilityChar(c)
358 // per "5.14 of Unicode", rule out some combiners
359 && !(c >= 0x20dd && c <= 0x20e0);
360
361 default:
362 // added a character ...
363 return c == 0x0387;
364 }
365 }
366
367 private static boolean isExtender(char c) {
368 // [89] Extender ::= ...
369 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
370 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6
371 || c == 0x3005 || (c >= 0x3031 && c <= 0x3035)
372 || (c >= 0x309d && c <= 0x309e)
373 || (c >= 0x30fc && c <= 0x30fe)
374 ;
375 }
376 }
|