--- old/jdk/make/data/characterdata/CharacterData00.java.template 2015-07-13 16:11:32.000000000 +0900
+++ new/jdk/make/data/characterdata/CharacterData00.java.template 2015-07-13 16:11:31.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -226,6 +226,11 @@
case 0xA77D : mapChar = 0x1D79; break;
case 0xA78D : mapChar = 0x0265; break;
case 0xA7AA : mapChar = 0x0266; break;
+ case 0xA7AB : mapChar = 0x025C; break;
+ case 0xA7AC : mapChar = 0x0261; break;
+ case 0xA7AD : mapChar = 0x026C; break;
+ case 0xA7B0 : mapChar = 0x029E; break;
+ case 0xA7B1 : mapChar = 0x0287; break;
// default mapChar is already set, so no
// need to redo it here.
// default : mapChar = ch;
@@ -284,10 +289,15 @@
case 0x0250 : mapChar = 0x2C6F; break;
case 0x0251 : mapChar = 0x2C6D; break;
case 0x0252 : mapChar = 0x2C70; break;
+ case 0x025C : mapChar = 0xA7AB; break;
+ case 0x0261 : mapChar = 0xA7AC; break;
case 0x0265 : mapChar = 0xA78D; break;
case 0x0266 : mapChar = 0xA7AA; break;
case 0x026B : mapChar = 0x2C62; break;
+ case 0x026C : mapChar = 0xA7AD; break;
case 0x0271 : mapChar = 0x2C6E; break;
+ case 0x0287 : mapChar = 0xA7B1; break;
+ case 0x029E : mapChar = 0xA7B0; break;
case 0x027D : mapChar = 0x2C64; break;
case 0x1D79 : mapChar = 0xA77D; break;
case 0x1D7D : mapChar = 0x2C63; break;
@@ -503,6 +513,22 @@
// This is the only char with RLO
directionality = Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE;
break;
+ case 0x2066 :
+ // This is the only char with LRI
+ directionality = Character.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE;
+ break;
+ case 0x2067 :
+ // This is the only char with RLI
+ directionality = Character.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE;
+ break;
+ case 0x2068 :
+ // This is the only char with FSI
+ directionality = Character.DIRECTIONALITY_FIRST_STRONG_ISOLATE;
+ break;
+ case 0x2069 :
+ // This is the only char with PDI
+ directionality = Character.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE;
+ break;
default :
directionality = Character.DIRECTIONALITY_UNDEFINED;
break;
@@ -537,11 +563,16 @@
case 0x0250 : mapChar = 0x2C6F; break;
case 0x0251 : mapChar = 0x2C6D; break;
case 0x0252 : mapChar = 0x2C70; break;
+ case 0x025C : mapChar = 0xA7AB; break;
+ case 0x0261 : mapChar = 0xA7AC; break;
case 0x0265 : mapChar = 0xA78D; break;
case 0x0266 : mapChar = 0xA7AA; break;
case 0x026B : mapChar = 0x2C62; break;
+ case 0x026C : mapChar = 0xA7AD; break;
case 0x0271 : mapChar = 0x2C6E; break;
case 0x027D : mapChar = 0x2C64; break;
+ case 0x0287 : mapChar = 0xA7B1; break;
+ case 0x029E : mapChar = 0xA7B0; break;
case 0x1D79 : mapChar = 0xA77D; break;
case 0x1D7D : mapChar = 0x2C63; break;
case 0x2C65 : mapChar = 0x023A; break;
--- old/jdk/make/data/characterdata/CharacterData01.java.template 2015-07-13 16:11:32.000000000 +0900
+++ new/jdk/make/data/characterdata/CharacterData01.java.template 2015-07-13 16:11:32.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -244,81 +244,118 @@
case 0x10132: retval = 80000; break; // AEGEAN NUMBER EIGHTY THOUSAND
case 0x10133: retval = 90000; break; // AEGEAN NUMBER NINETY THOUSAND
case 0x10323: retval = 50; break; // OLD ITALIC NUMERAL FIFTY
-
- case 0x010144: retval = 50; break; // ACROPHONIC ATTIC FIFTY
- case 0x010145: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED
- case 0x010146: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND
- case 0x010147: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND
- case 0x01014A: retval = 50; break; // ACROPHONIC ATTIC FIFTY TALENTS
- case 0x01014B: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED TALENTS
- case 0x01014C: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED TALENTS
- case 0x01014D: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND TALENTS
- case 0x01014E: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND TALENTS
- case 0x010151: retval = 50; break; // ACROPHONIC ATTIC FIFTY STATERS
- case 0x010152: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED STATERS
- case 0x010153: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED STATERS
- case 0x010154: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND STATERS
- case 0x010155: retval = 10000; break; // ACROPHONIC ATTIC TEN THOUSAND STATERS
- case 0x010156: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND STATERS
- case 0x010166: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY
- case 0x010167: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY ALTERNATE FORM
- case 0x010168: retval = 50; break; // ACROPHONIC HERMIONIAN FIFTY
- case 0x010169: retval = 50; break; // ACROPHONIC THESPIAN FIFTY
- case 0x01016A: retval = 100; break; // ACROPHONIC THESPIAN ONE HUNDRED
- case 0x01016B: retval = 300; break; // ACROPHONIC THESPIAN THREE HUNDRED
- case 0x01016C: retval = 500; break; // ACROPHONIC EPIDAUREAN FIVE HUNDRED
- case 0x01016D: retval = 500; break; // ACROPHONIC TROEZENIAN FIVE HUNDRED
- case 0x01016E: retval = 500; break; // ACROPHONIC THESPIAN FIVE HUNDRED
- case 0x01016F: retval = 500; break; // ACROPHONIC CARYSTIAN FIVE HUNDRED
- case 0x010170: retval = 500; break; // ACROPHONIC NAXIAN FIVE HUNDRED
- case 0x010171: retval = 1000; break; // ACROPHONIC THESPIAN ONE THOUSAND
- case 0x010172: retval = 5000; break; // ACROPHONIC THESPIAN FIVE THOUSAND
- case 0x010174: retval = 50; break; // ACROPHONIC STRATIAN FIFTY MNAS
- case 0x010341: retval = 90; break; // GOTHIC LETTER NINETY
- case 0x01034A: retval = 900; break; // GOTHIC LETTER NINE HUNDRED
- case 0x0103D5: retval = 100; break; // OLD PERSIAN NUMBER HUNDRED
- case 0x01085D: retval = 100; break; // IMPERIAL ARAMAIC NUMBER ONE HUNDRED
- case 0x01085E: retval = 1000; break; // IMPERIAL ARAMAIC NUMBER ONE THOUSAND
- case 0x01085F: retval = 10000; break; // IMPERIAL ARAMAIC NUMBER TEN THOUSAND
- case 0x010919: retval = 100; break; // PHOENICIAN NUMBER ONE HUNDRED
- case 0x010A46: retval = 100; break; // KHAROSHTHI NUMBER ONE HUNDRED
- case 0x010A47: retval = 1000; break; // KHAROSHTHI NUMBER ONE THOUSAND
- case 0x010A7E: retval = 50; break; // OLD SOUTH ARABIAN NUMBER FIFTY
- case 0x010B5E: retval = 100; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE HUNDRED
- case 0x010B5F: retval = 1000; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND
- case 0x010B7E: retval = 100; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED
- case 0x010B7F: retval = 1000; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND
- case 0x010E6C: retval = 40; break; // RUMI NUMBER FORTY
- case 0x010E6D: retval = 50; break; // RUMI NUMBER FIFTY
- case 0x010E6E: retval = 60; break; // RUMI NUMBER SIXTY
- case 0x010E6F: retval = 70; break; // RUMI NUMBER SEVENTY
- case 0x010E70: retval = 80; break; // RUMI NUMBER EIGHTY
- case 0x010E71: retval = 90; break; // RUMI NUMBER NINETY
- case 0x010E72: retval = 100; break; // RUMI NUMBER ONE HUNDRED
- case 0x010E73: retval = 200; break; // RUMI NUMBER TWO HUNDRED
- case 0x010E74: retval = 300; break; // RUMI NUMBER THREE HUNDRED
- case 0x010E75: retval = 400; break; // RUMI NUMBER FOUR HUNDRED
- case 0x010E76: retval = 500; break; // RUMI NUMBER FIVE HUNDRED
- case 0x010E77: retval = 600; break; // RUMI NUMBER SIX HUNDRED
- case 0x010E78: retval = 700; break; // RUMI NUMBER SEVEN HUNDRED
- case 0x010E79: retval = 800; break; // RUMI NUMBER EIGHT HUNDRED
- case 0x010E7A: retval = 900; break; // RUMI NUMBER NINE HUNDRED
- case 0x01105E: retval = 40; break; // BRAHMI NUMBER FORTY
- case 0x01105F: retval = 50; break; // BRAHMI NUMBER FIFTY
- case 0x011060: retval = 60; break; // BRAHMI NUMBER SIXTY
- case 0x011061: retval = 70; break; // BRAHMI NUMBER SEVENTY
- case 0x011062: retval = 80; break; // BRAHMI NUMBER EIGHTY
- case 0x011063: retval = 90; break; // BRAHMI NUMBER NINETY
- case 0x011064: retval = 100; break; // BRAHMI NUMBER ONE HUNDRED
- case 0x011065: retval = 1000; break; // BRAHMI NUMBER ONE THOUSAND
- case 0x012432: retval = 216000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH
- case 0x012433: retval = 432000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN
- case 0x01D36C: retval = 40; break; // COUNTING ROD TENS DIGIT FOUR
- case 0x01D36D: retval = 50; break; // COUNTING ROD TENS DIGIT FIVE
- case 0x01D36E: retval = 60; break; // COUNTING ROD TENS DIGIT SIX
- case 0x01D36F: retval = 70; break; // COUNTING ROD TENS DIGIT SEVEN
- case 0x01D370: retval = 80; break; // COUNTING ROD TENS DIGIT EIGHT
- case 0x01D371: retval = 90; break; // COUNTING ROD TENS DIGIT NINE
+ case 0x10144: retval = 50; break; // ACROPHONIC ATTIC FIFTY
+ case 0x10145: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED
+ case 0x10146: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND
+ case 0x10147: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND
+ case 0x1014A: retval = 50; break; // ACROPHONIC ATTIC FIFTY TALENTS
+ case 0x1014B: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED TALENTS
+ case 0x1014C: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED TALENTS
+ case 0x1014D: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND TALENTS
+ case 0x1014E: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND TALENTS
+ case 0x10151: retval = 50; break; // ACROPHONIC ATTIC FIFTY STATERS
+ case 0x10152: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED STATERS
+ case 0x10153: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED STATERS
+ case 0x10154: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND STATERS
+ case 0x10155: retval = 10000; break; // ACROPHONIC ATTIC TEN THOUSAND STATERS
+ case 0x10156: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND STATERS
+ case 0x10166: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY
+ case 0x10167: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY ALTERNATE FORM
+ case 0x10168: retval = 50; break; // ACROPHONIC HERMIONIAN FIFTY
+ case 0x10169: retval = 50; break; // ACROPHONIC THESPIAN FIFTY
+ case 0x1016A: retval = 100; break; // ACROPHONIC THESPIAN ONE HUNDRED
+ case 0x1016B: retval = 300; break; // ACROPHONIC THESPIAN THREE HUNDRED
+ case 0x1016C: retval = 500; break; // ACROPHONIC EPIDAUREAN FIVE HUNDRED
+ case 0x1016D: retval = 500; break; // ACROPHONIC TROEZENIAN FIVE HUNDRED
+ case 0x1016E: retval = 500; break; // ACROPHONIC THESPIAN FIVE HUNDRED
+ case 0x1016F: retval = 500; break; // ACROPHONIC CARYSTIAN FIVE HUNDRED
+ case 0x10170: retval = 500; break; // ACROPHONIC NAXIAN FIVE HUNDRED
+ case 0x10171: retval = 1000; break; // ACROPHONIC THESPIAN ONE THOUSAND
+ case 0x10172: retval = 5000; break; // ACROPHONIC THESPIAN FIVE THOUSAND
+ case 0x10174: retval = 50; break; // ACROPHONIC STRATIAN FIFTY MNAS
+ case 0x102ED: retval = 40; break; // COPTIC EPACT NUMBER FORTY
+ case 0x102EE: retval = 50; break; // COPTIC EPACT NUMBER FIFTY
+ case 0x102EF: retval = 60; break; // COPTIC EPACT NUMBER SIXTY
+ case 0x102F0: retval = 70; break; // COPTIC EPACT NUMBER SEVENTY
+ case 0x102F1: retval = 80; break; // COPTIC EPACT NUMBER EIGHTY
+ case 0x102F2: retval = 90; break; // COPTIC EPACT NUMBER NINETY
+ case 0x102F3: retval = 100; break; // COPTIC EPACT NUMBER ONE HUNDRED
+ case 0x102F4: retval = 200; break; // COPTIC EPACT NUMBER TWO HUNDRED
+ case 0x102F5: retval = 300; break; // COPTIC EPACT NUMBER THREE HUNDRED
+ case 0x102F6: retval = 400; break; // COPTIC EPACT NUMBER FOUR HUNDRED
+ case 0x102F7: retval = 500; break; // COPTIC EPACT NUMBER FIVE HUNDRED
+ case 0x102F8: retval = 600; break; // COPTIC EPACT NUMBER SIX HUNDRED
+ case 0x102F9: retval = 700; break; // COPTIC EPACT NUMBER SEVEN HUNDRED
+ case 0x102FA: retval = 800; break; // COPTIC EPACT NUMBER EIGHT HUNDRED
+ case 0x102FB: retval = 900; break; // COPTIC EPACT NUMBER NINE HUNDRED
+ case 0x10341: retval = 90; break; // GOTHIC LETTER NINETY
+ case 0x1034A: retval = 900; break; // GOTHIC LETTER NINE HUNDRED
+ case 0x103D5: retval = 100; break; // OLD PERSIAN NUMBER HUNDRED
+ case 0x1085D: retval = 100; break; // IMPERIAL ARAMAIC NUMBER ONE HUNDRED
+ case 0x1085E: retval = 1000; break; // IMPERIAL ARAMAIC NUMBER ONE THOUSAND
+ case 0x1085F: retval = 10000; break; // IMPERIAL ARAMAIC NUMBER TEN THOUSAND
+ case 0x108AF: retval = 100; break; // NABATAEAN NUMBER ONE HUNDRED
+ case 0x10919: retval = 100; break; // PHOENICIAN NUMBER ONE HUNDRED
+ case 0x10A46: retval = 100; break; // KHAROSHTHI NUMBER ONE HUNDRED
+ case 0x10A47: retval = 1000; break; // KHAROSHTHI NUMBER ONE THOUSAND
+ case 0x10A7E: retval = 50; break; // OLD SOUTH ARABIAN NUMBER FIFTY
+ case 0x10AEF: retval = 100; break; // MANICHAEAN NUMBER ONE HUNDRED
+ case 0x10B5E: retval = 100; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE HUNDRED
+ case 0x10B5F: retval = 1000; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND
+ case 0x10B7E: retval = 100; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED
+ case 0x10B7F: retval = 1000; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND
+ case 0x10BAF: retval = 100; break; // PSALTER PAHLAVI NUMBER ONE HUNDRED
+ case 0x10E6C: retval = 40; break; // RUMI NUMBER FORTY
+ case 0x10E6D: retval = 50; break; // RUMI NUMBER FIFTY
+ case 0x10E6E: retval = 60; break; // RUMI NUMBER SIXTY
+ case 0x10E6F: retval = 70; break; // RUMI NUMBER SEVENTY
+ case 0x10E70: retval = 80; break; // RUMI NUMBER EIGHTY
+ case 0x10E71: retval = 90; break; // RUMI NUMBER NINETY
+ case 0x10E72: retval = 100; break; // RUMI NUMBER ONE HUNDRED
+ case 0x10E73: retval = 200; break; // RUMI NUMBER TWO HUNDRED
+ case 0x10E74: retval = 300; break; // RUMI NUMBER THREE HUNDRED
+ case 0x10E75: retval = 400; break; // RUMI NUMBER FOUR HUNDRED
+ case 0x10E76: retval = 500; break; // RUMI NUMBER FIVE HUNDRED
+ case 0x10E77: retval = 600; break; // RUMI NUMBER SIX HUNDRED
+ case 0x10E78: retval = 700; break; // RUMI NUMBER SEVEN HUNDRED
+ case 0x10E79: retval = 800; break; // RUMI NUMBER EIGHT HUNDRED
+ case 0x10E7A: retval = 900; break; // RUMI NUMBER NINE HUNDRED
+ case 0x1105E: retval = 40; break; // BRAHMI NUMBER FORTY
+ case 0x1105F: retval = 50; break; // BRAHMI NUMBER FIFTY
+ case 0x11060: retval = 60; break; // BRAHMI NUMBER SIXTY
+ case 0x11061: retval = 70; break; // BRAHMI NUMBER SEVENTY
+ case 0x11062: retval = 80; break; // BRAHMI NUMBER EIGHTY
+ case 0x11063: retval = 90; break; // BRAHMI NUMBER NINETY
+ case 0x11064: retval = 100; break; // BRAHMI NUMBER ONE HUNDRED
+ case 0x11065: retval = 1000; break; // BRAHMI NUMBER ONE THOUSAND
+ case 0x111ED: retval = 40; break; // SINHALA ARCHAIC NUMBER FORTY
+ case 0x111EE: retval = 50; break; // SINHALA ARCHAIC NUMBER FIFTY
+ case 0x111EF: retval = 60; break; // SINHALA ARCHAIC NUMBER SIXTY
+ case 0x111F0: retval = 70; break; // SINHALA ARCHAIC NUMBER SEVENTY
+ case 0x111F1: retval = 80; break; // SINHALA ARCHAIC NUMBER EIGHTY
+ case 0x111F2: retval = 90; break; // SINHALA ARCHAIC NUMBER NINETY
+ case 0x111F3: retval = 100; break; // SINHALA ARCHAIC NUMBER ONE HUNDRED
+ case 0x111F4: retval = 1000; break; // SINHALA ARCHAIC NUMBER ONE THOUSAND
+ case 0x118ED: retval = 40; break; // WARANG CITI NUMBER FORTY
+ case 0x118EE: retval = 50; break; // WARANG CITI NUMBER FIFTY
+ case 0x118EF: retval = 60; break; // WARANG CITI NUMBER SIXTY
+ case 0x118F0: retval = 70; break; // WARANG CITI NUMBER SEVENTY
+ case 0x118F1: retval = 80; break; // WARANG CITI NUMBER EIGHTY
+ case 0x118F2: retval = 90; break; // WARANG CITI NUMBER NINETY
+ case 0x12432: retval = 216000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH
+ case 0x12433: retval = 432000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN
+ case 0x12467: retval = 40; break; // CUNEIFORM NUMERIC SIGN ELAMITE FORTY
+ case 0x12468: retval = 50; break; // CUNEIFORM NUMERIC SIGN ELAMITE FIFTY
+ case 0x16B5C: retval = 100; break; // PAHAWH HMONG NUMBER HUNDREDS
+ case 0x16B5D: retval = 10000; break; // PAHAWH HMONG NUMBER TEN THOUSANDS
+ case 0x16B5E: retval = 1000000; break; // PAHAWH HMONG NUMBER MILLIONS
+ case 0x16B5F: retval = 100000000; break; // PAHAWH HMONG NUMBER HUNDRED MILLIONS
+ case 0x1D36C: retval = 40; break; // COUNTING ROD TENS DIGIT FOUR
+ case 0x1D36D: retval = 50; break; // COUNTING ROD TENS DIGIT FIVE
+ case 0x1D36E: retval = 60; break; // COUNTING ROD TENS DIGIT SIX
+ case 0x1D36F: retval = 70; break; // COUNTING ROD TENS DIGIT SEVEN
+ case 0x1D370: retval = 80; break; // COUNTING ROD TENS DIGIT EIGHT
+ case 0x1D371: retval = 90; break; // COUNTING ROD TENS DIGIT NINE
default: retval = -2; break;
}
--- old/jdk/make/data/unicodedata/PropList.txt 2015-07-13 16:11:33.000000000 +0900
+++ new/jdk/make/data/unicodedata/PropList.txt 2015-07-13 16:11:33.000000000 +0900
@@ -1,8 +1,8 @@
-# PropList-6.2.0.txt
-# Date: 2012-05-23, 20:34:59 GMT [MD]
+# PropList-7.0.0.txt
+# Date: 2014-02-19, 15:51:26 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@@ -13,7 +13,6 @@
0085 ; White_Space # Cc
- * Character information is based on the Unicode Standard, version 6.2.0.
+ * Character information is based on the Unicode Standard, version 7.0.0.
*
* The methods and data of class {@code Character} are defined by
* the information in the UnicodeData file that is part of the
@@ -490,6 +490,30 @@
public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18;
/**
+ * Weak bidirectional character type "LRI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 19;
+
+ /**
+ * Weak bidirectional character type "RLI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 20;
+
+ /**
+ * Weak bidirectional character type "FSI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_FIRST_STRONG_ISOLATE = 21;
+
+ /**
+ * Weak bidirectional character type "PDI" in the Unicode specification.
+ * @since 1.9
+ */
+ public static final byte DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 22;
+
+ /**
* The minimum value of a
*
* Unicode high-surrogate code unit
@@ -2561,6 +2585,269 @@
"ARABIC MATHEMATICAL ALPHABETIC SYMBOLS",
"ARABICMATHEMATICALALPHABETICSYMBOLS");
+ /**
+ * Constant for the "Combining Diacritical Marks Extended" Unicode
+ * character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_EXTENDED =
+ new UnicodeBlock("COMBINING_DIACRITICAL_MARKS_EXTENDED",
+ "COMBINING DIACRITICAL MARKS EXTENDED",
+ "COMBININGDIACRITICALMARKSEXTENDED");
+
+ /**
+ * Constant for the "Myanmar Extended-B" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MYANMAR_EXTENDED_B =
+ new UnicodeBlock("MYANMAR_EXTENDED_B",
+ "MYANMAR EXTENDED-B",
+ "MYANMAREXTENDED-B");
+
+ /**
+ * Constant for the "Latin Extended-E" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock LATIN_EXTENDED_E =
+ new UnicodeBlock("LATIN_EXTENDED_E",
+ "LATIN EXTENDED-E",
+ "LATINEXTENDED-E");
+
+ /**
+ * Constant for the "Coptic Epact Numbers" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock COPTIC_EPACT_NUMBERS =
+ new UnicodeBlock("COPTIC_EPACT_NUMBERS",
+ "COPTIC EPACT NUMBERS",
+ "COPTICEPACTNUMBERS");
+
+ /**
+ * Constant for the "Old Permic" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock OLD_PERMIC =
+ new UnicodeBlock("OLD_PERMIC",
+ "OLD PERMIC",
+ "OLDPERMIC");
+
+ /**
+ * Constant for the "Elbasan" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock ELBASAN =
+ new UnicodeBlock("ELBASAN");
+
+ /**
+ * Constant for the "Caucasian Albanian" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock CAUCASIAN_ALBANIAN =
+ new UnicodeBlock("CAUCASIAN_ALBANIAN",
+ "CAUCASIAN ALBANIAN",
+ "CAUCASIANALBANIAN");
+
+ /**
+ * Constant for the "Linear A" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock LINEAR_A =
+ new UnicodeBlock("LINEAR_A",
+ "LINEAR A",
+ "LINEARA");
+
+ /**
+ * Constant for the "Palmyrene" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PALMYRENE =
+ new UnicodeBlock("PALMYRENE");
+
+ /**
+ * Constant for the "Nabataean" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock NABATAEAN =
+ new UnicodeBlock("NABATAEAN");
+
+ /**
+ * Constant for the "Old North Arabian" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock OLD_NORTH_ARABIAN =
+ new UnicodeBlock("OLD_NORTH_ARABIAN",
+ "OLD NORTH ARABIAN",
+ "OLDNORTHARABIAN");
+
+ /**
+ * Constant for the "Manichaean" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MANICHAEAN =
+ new UnicodeBlock("MANICHAEAN");
+
+ /**
+ * Constant for the "Psalter Pahlavi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PSALTER_PAHLAVI =
+ new UnicodeBlock("PSALTER_PAHLAVI",
+ "PSALTER PAHLAVI",
+ "PSALTERPAHLAVI");
+
+ /**
+ * Constant for the "Mahajani" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MAHAJANI =
+ new UnicodeBlock("MAHAJANI");
+
+ /**
+ * Constant for the "Sinhala Archaic Numbers" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SINHALA_ARCHAIC_NUMBERS =
+ new UnicodeBlock("SINHALA_ARCHAIC_NUMBERS",
+ "SINHALA ARCHAIC NUMBERS",
+ "SINHALAARCHAICNUMBERS");
+
+ /**
+ * Constant for the "Khojki" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock KHOJKI =
+ new UnicodeBlock("KHOJKI");
+
+ /**
+ * Constant for the "Khudawadi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock KHUDAWADI =
+ new UnicodeBlock("KHUDAWADI");
+
+ /**
+ * Constant for the "Grantha" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock GRANTHA =
+ new UnicodeBlock("GRANTHA");
+
+ /**
+ * Constant for the "Tirhuta" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock TIRHUTA =
+ new UnicodeBlock("TIRHUTA");
+
+ /**
+ * Constant for the "Siddham" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SIDDHAM =
+ new UnicodeBlock("SIDDHAM");
+
+ /**
+ * Constant for the "Modi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MODI =
+ new UnicodeBlock("MODI");
+
+ /**
+ * Constant for the "Warang Citi" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock WARANG_CITI =
+ new UnicodeBlock("WARANG_CITI",
+ "WARANG CITI",
+ "WARANGCITI");
+
+ /**
+ * Constant for the "Pau Cin Hau" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PAU_CIN_HAU =
+ new UnicodeBlock("PAU_CIN_HAU",
+ "PAU CIN HAU",
+ "PAUCINHAU");
+
+ /**
+ * Constant for the "Mro" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MRO =
+ new UnicodeBlock("MRO");
+
+ /**
+ * Constant for the "Bassa Vah" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock BASSA_VAH =
+ new UnicodeBlock("BASSA_VAH",
+ "BASSA VAH",
+ "BASSAVAH");
+
+ /**
+ * Constant for the "Pahawh Hmong" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock PAHAWH_HMONG =
+ new UnicodeBlock("PAHAWH_HMONG",
+ "PAHAWH HMONG",
+ "PAHAWHHMONG");
+
+ /**
+ * Constant for the "Duployan" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock DUPLOYAN =
+ new UnicodeBlock("DUPLOYAN");
+
+ /**
+ * Constant for the "Shorthand Format Controls" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SHORTHAND_FORMAT_CONTROLS =
+ new UnicodeBlock("SHORTHAND_FORMAT_CONTROLS",
+ "SHORTHAND FORMAT CONTROLS",
+ "SHORTHANDFORMATCONTROLS");
+
+ /**
+ * Constant for the "Mende Kikakui" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock MENDE_KIKAKUI =
+ new UnicodeBlock("MENDE_KIKAKUI",
+ "MENDE KIKAKUI",
+ "MENDEKIKAKUI");
+
+ /**
+ * Constant for the "Ornamental Dingbats" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock ORNAMENTAL_DINGBATS =
+ new UnicodeBlock("ORNAMENTAL_DINGBATS",
+ "ORNAMENTAL DINGBATS",
+ "ORNAMENTALDINGBATS");
+
+ /**
+ * Constant for the "Geometric Shapes Extended" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock GEOMETRIC_SHAPES_EXTENDED =
+ new UnicodeBlock("GEOMETRIC_SHAPES_EXTENDED",
+ "GEOMETRIC SHAPES EXTENDED",
+ "GEOMETRICSHAPESEXTENDED");
+
+ /**
+ * Constant for the "Supplemental Arrows-C" Unicode character block.
+ * @since 1.9
+ */
+ public static final UnicodeBlock SUPPLEMENTAL_ARROWS_C =
+ new UnicodeBlock("SUPPLEMENTAL_ARROWS_C",
+ "SUPPLEMENTAL ARROWS-C",
+ "SUPPLEMENTALARROWS-C");
+
private static final int blockStarts[] = {
0x0000, // 0000..007F; Basic Latin
0x0080, // 0080..00FF; Latin-1 Supplement
@@ -2618,7 +2905,7 @@
0x19E0, // 19E0..19FF; Khmer Symbols
0x1A00, // 1A00..1A1F; Buginese
0x1A20, // 1A20..1AAF; Tai Tham
- 0x1AB0, // unassigned
+ 0x1AB0, // 1AB0..1AFF; Combining Diacritical Marks Extended
0x1B00, // 1B00..1B7F; Balinese
0x1B80, // 1B80..1BBF; Sundanese
0x1BC0, // 1BC0..1BFF; Batak
@@ -2699,13 +2986,14 @@
0xA930, // A930..A95F; Rejang
0xA960, // A960..A97F; Hangul Jamo Extended-A
0xA980, // A980..A9DF; Javanese
- 0xA9E0, // unassigned
+ 0xA9E0, // A9E0..A9FF; Myanmar Extended-B
0xAA00, // AA00..AA5F; Cham
0xAA60, // AA60..AA7F; Myanmar Extended-A
0xAA80, // AA80..AADF; Tai Viet
0xAAE0, // AAE0..AAFF; Meetei Mayek Extensions
0xAB00, // AB00..AB2F; Ethiopic Extended-A
- 0xAB30, // unassigned
+ 0xAB30, // AB30..AB6F; Latin Extended-E
+ 0xAB70, // unassigned
0xABC0, // ABC0..ABFF; Meetei Mayek
0xAC00, // AC00..D7AF; Hangul Syllables
0xD7B0, // D7B0..D7FF; Hangul Jamo Extended-B
@@ -2733,10 +3021,10 @@
0x10200, // unassigned
0x10280, // 10280..1029F; Lycian
0x102A0, // 102A0..102DF; Carian
- 0x102E0, // unassigned
+ 0x102E0, // 102E0..102FF; Coptic Epact Numbers
0x10300, // 10300..1032F; Old Italic
0x10330, // 10330..1034F; Gothic
- 0x10350, // unassigned
+ 0x10350, // 10350..1037F; Old Permic
0x10380, // 10380..1039F; Ugaritic
0x103A0, // 103A0..103DF; Old Persian
0x103E0, // unassigned
@@ -2744,9 +3032,16 @@
0x10450, // 10450..1047F; Shavian
0x10480, // 10480..104AF; Osmanya
0x104B0, // unassigned
+ 0x10500, // 10500..1052F; Elbasan
+ 0x10530, // 10530..1056F; Caucasian Albanian
+ 0x10570, // unassigned
+ 0x10600, // 10600..1077F; Linear A
+ 0x10780, // unassigned
0x10800, // 10800..1083F; Cypriot Syllabary
0x10840, // 10840..1085F; Imperial Aramaic
- 0x10860, // unassigned
+ 0x10860, // 10860..1087F; Palmyrene
+ 0x10880, // 10880..108AF; Nabataean
+ 0x108B0, // unassigned
0x10900, // 10900..1091F; Phoenician
0x10920, // 10920..1093F; Lydian
0x10940, // unassigned
@@ -2754,11 +3049,14 @@
0x109A0, // 109A0..109FF; Meroitic Cursive
0x10A00, // 10A00..10A5F; Kharoshthi
0x10A60, // 10A60..10A7F; Old South Arabian
- 0x10A80, // unassigned
+ 0x10A80, // 10A80..10A9F; Old North Arabian
+ 0x10AA0, // unassigned
+ 0x10AC0, // 10AC0..10AFF; Manichaean
0x10B00, // 10B00..10B3F; Avestan
0x10B40, // 10B40..10B5F; Inscriptional Parthian
0x10B60, // 10B60..10B7F; Inscriptional Pahlavi
- 0x10B80, // unassigned
+ 0x10B80, // 10B80..10BAF; Psalter Pahlavi
+ 0x10BB0, // unassigned
0x10C00, // 10C00..10C4F; Old Turkic
0x10C50, // unassigned
0x10E60, // 10E60..10E7F; Rumi Numeral Symbols
@@ -2767,22 +3065,43 @@
0x11080, // 11080..110CF; Kaithi
0x110D0, // 110D0..110FF; Sora Sompeng
0x11100, // 11100..1114F; Chakma
- 0x11150, // unassigned
+ 0x11150, // 11150..1117F; Mahajani
0x11180, // 11180..111DF; Sharada
- 0x111E0, // unassigned
+ 0x111E0, // 111E0..111FF; Sinhala Archaic Numbers
+ 0x11200, // 11200..1124F; Khojki
+ 0x11250, // unassigned
+ 0x112B0, // 112B0..112FF; Khudawadi
+ 0x11300, // 11300..1137F; Grantha
+ 0x11380, // unassigned
+ 0x11480, // 11480..114DF; Tirhuta
+ 0x114E0, // unassigned
+ 0x11580, // 11580..115FF; Siddham
+ 0x11600, // 11600..1165F; Modi
+ 0x11660, // unassigned
0x11680, // 11680..116CF; Takri
0x116D0, // unassigned
+ 0x118A0, // 118A0..118FF; Warang Citi
+ 0x11900, // unassigned
+ 0x11AC0, // 11AC0..11AFF; Pau Cin Hau
+ 0x11B00, // unassigned
0x12000, // 12000..123FF; Cuneiform
0x12400, // 12400..1247F; Cuneiform Numbers and Punctuation
0x12480, // unassigned
0x13000, // 13000..1342F; Egyptian Hieroglyphs
0x13430, // unassigned
0x16800, // 16800..16A3F; Bamum Supplement
- 0x16A40, // unassigned
+ 0x16A40, // 16A40..16A6F; Mro
+ 0x16A70, // unassigned
+ 0x16AD0, // 16AD0..16AFF; Bassa Vah
+ 0x16B00, // 16B00..16B8F; Pahawh Hmong
+ 0x16B90, // unassigned
0x16F00, // 16F00..16F9F; Miao
0x16FA0, // unassigned
0x1B000, // 1B000..1B0FF; Kana Supplement
0x1B100, // unassigned
+ 0x1BC00, // 1BC00..1BC9F; Duployan
+ 0x1BCA0, // 1BCA0..1BCAF; Shorthand Format Controls
+ 0x1BCB0, // unassigned
0x1D000, // 1D000..1D0FF; Byzantine Musical Symbols
0x1D100, // 1D100..1D1FF; Musical Symbols
0x1D200, // 1D200..1D24F; Ancient Greek Musical Notation
@@ -2792,6 +3111,8 @@
0x1D380, // unassigned
0x1D400, // 1D400..1D7FF; Mathematical Alphanumeric Symbols
0x1D800, // unassigned
+ 0x1E800, // 1E800..1E8DF; Mende Kikakui
+ 0x1E8E0, // unassigned
0x1EE00, // 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols
0x1EF00, // unassigned
0x1F000, // 1F000..1F02F; Mahjong Tiles
@@ -2801,10 +3122,12 @@
0x1F200, // 1F200..1F2FF; Enclosed Ideographic Supplement
0x1F300, // 1F300..1F5FF; Miscellaneous Symbols And Pictographs
0x1F600, // 1F600..1F64F; Emoticons
- 0x1F650, // unassigned
+ 0x1F650, // 1F650..1F67F; Ornamental Dingbats
0x1F680, // 1F680..1F6FF; Transport And Map Symbols
0x1F700, // 1F700..1F77F; Alchemical Symbols
- 0x1F780, // unassigned
+ 0x1F780, // 1F780..1F7FF; Geometric Shapes Extended
+ 0x1F800, // 1F800..1F8FF; Supplemental Arrows-C
+ 0x1F900, // unassigned
0x20000, // 20000..2A6DF; CJK Unified Ideographs Extension B
0x2A6E0, // unassigned
0x2A700, // 2A700..2B73F; CJK Unified Ideographs Extension C
@@ -2877,7 +3200,7 @@
KHMER_SYMBOLS,
BUGINESE,
TAI_THAM,
- null,
+ COMBINING_DIACRITICAL_MARKS_EXTENDED,
BALINESE,
SUNDANESE,
BATAK,
@@ -2958,12 +3281,13 @@
REJANG,
HANGUL_JAMO_EXTENDED_A,
JAVANESE,
- null,
+ MYANMAR_EXTENDED_B,
CHAM,
MYANMAR_EXTENDED_A,
TAI_VIET,
MEETEI_MAYEK_EXTENSIONS,
ETHIOPIC_EXTENDED_A,
+ LATIN_EXTENDED_E,
null,
MEETEI_MAYEK,
HANGUL_SYLLABLES,
@@ -2992,10 +3316,10 @@
null,
LYCIAN,
CARIAN,
- null,
+ COPTIC_EPACT_NUMBERS,
OLD_ITALIC,
GOTHIC,
- null,
+ OLD_PERMIC,
UGARITIC,
OLD_PERSIAN,
null,
@@ -3003,8 +3327,15 @@
SHAVIAN,
OSMANYA,
null,
+ ELBASAN,
+ CAUCASIAN_ALBANIAN,
+ null,
+ LINEAR_A,
+ null,
CYPRIOT_SYLLABARY,
IMPERIAL_ARAMAIC,
+ PALMYRENE,
+ NABATAEAN,
null,
PHOENICIAN,
LYDIAN,
@@ -3013,10 +3344,13 @@
MEROITIC_CURSIVE,
KHAROSHTHI,
OLD_SOUTH_ARABIAN,
+ OLD_NORTH_ARABIAN,
null,
+ MANICHAEAN,
AVESTAN,
INSCRIPTIONAL_PARTHIAN,
INSCRIPTIONAL_PAHLAVI,
+ PSALTER_PAHLAVI,
null,
OLD_TURKIC,
null,
@@ -3026,22 +3360,43 @@
KAITHI,
SORA_SOMPENG,
CHAKMA,
- null,
+ MAHAJANI,
SHARADA,
+ SINHALA_ARCHAIC_NUMBERS,
+ KHOJKI,
+ null,
+ KHUDAWADI,
+ GRANTHA,
+ null,
+ TIRHUTA,
+ null,
+ SIDDHAM,
+ MODI,
null,
TAKRI,
null,
+ WARANG_CITI,
+ null,
+ PAU_CIN_HAU,
+ null,
CUNEIFORM,
CUNEIFORM_NUMBERS_AND_PUNCTUATION,
null,
EGYPTIAN_HIEROGLYPHS,
null,
BAMUM_SUPPLEMENT,
+ MRO,
+ null,
+ BASSA_VAH,
+ PAHAWH_HMONG,
null,
MIAO,
null,
KANA_SUPPLEMENT,
null,
+ DUPLOYAN,
+ SHORTHAND_FORMAT_CONTROLS,
+ null,
BYZANTINE_MUSICAL_SYMBOLS,
MUSICAL_SYMBOLS,
ANCIENT_GREEK_MUSICAL_NOTATION,
@@ -3051,6 +3406,8 @@
null,
MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
null,
+ MENDE_KIKAKUI,
+ null,
ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS,
null,
MAHJONG_TILES,
@@ -3060,9 +3417,11 @@
ENCLOSED_IDEOGRAPHIC_SUPPLEMENT,
MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS,
EMOTICONS,
- null,
+ ORNAMENTAL_DINGBATS,
TRANSPORT_AND_MAP_SYMBOLS,
ALCHEMICAL_SYMBOLS,
+ GEOMETRIC_SHAPES_EXTENDED,
+ SUPPLEMENTAL_ARROWS_C,
null,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
null,
@@ -3675,40 +4034,185 @@
/**
* Unicode script "Meroitic Hieroglyphs".
+ * @since 1.8
*/
MEROITIC_HIEROGLYPHS,
/**
* Unicode script "Meroitic Cursive".
+ * @since 1.8
*/
MEROITIC_CURSIVE,
/**
* Unicode script "Sora Sompeng".
+ * @since 1.8
*/
SORA_SOMPENG,
/**
* Unicode script "Chakma".
+ * @since 1.8
*/
CHAKMA,
/**
* Unicode script "Sharada".
+ * @since 1.8
*/
SHARADA,
/**
* Unicode script "Takri".
+ * @since 1.8
*/
TAKRI,
/**
* Unicode script "Miao".
+ * @since 1.8
*/
MIAO,
/**
+ * Unicode script "Caucasian Albanian".
+ * @since 1.9
+ */
+ CAUCASIAN_ALBANIAN,
+
+ /**
+ * Unicode script "Bassa Vah".
+ * @since 1.9
+ */
+ BASSA_VAH,
+
+ /**
+ * Unicode script "Duployan".
+ * @since 1.9
+ */
+ DUPLOYAN,
+
+ /**
+ * Unicode script "Elbasan".
+ * @since 1.9
+ */
+ ELBASAN,
+
+ /**
+ * Unicode script "Grantha".
+ * @since 1.9
+ */
+ GRANTHA,
+
+ /**
+ * Unicode script "Pahawh Hmong".
+ * @since 1.9
+ */
+ PAHAWH_HMONG,
+
+ /**
+ * Unicode script "Khojki".
+ * @since 1.9
+ */
+ KHOJKI,
+
+ /**
+ * Unicode script "Linear A".
+ * @since 1.9
+ */
+ LINEAR_A,
+
+ /**
+ * Unicode script "Mahajani".
+ * @since 1.9
+ */
+ MAHAJANI,
+
+ /**
+ * Unicode script "Manichaean".
+ * @since 1.9
+ */
+ MANICHAEAN,
+
+ /**
+ * Unicode script "Mende Kikakui".
+ * @since 1.9
+ */
+ MENDE_KIKAKUI,
+
+ /**
+ * Unicode script "Modi".
+ * @since 1.9
+ */
+ MODI,
+
+ /**
+ * Unicode script "Mro".
+ * @since 1.9
+ */
+ MRO,
+
+ /**
+ * Unicode script "Old North Arabian".
+ * @since 1.9
+ */
+ OLD_NORTH_ARABIAN,
+
+ /**
+ * Unicode script "Nabataean".
+ * @since 1.9
+ */
+ NABATAEAN,
+
+ /**
+ * Unicode script "Palmyrene".
+ * @since 1.9
+ */
+ PALMYRENE,
+
+ /**
+ * Unicode script "Pau Cin Hau".
+ * @since 1.9
+ */
+ PAU_CIN_HAU,
+
+ /**
+ * Unicode script "Old Permic".
+ * @since 1.9
+ */
+ OLD_PERMIC,
+
+ /**
+ * Unicode script "Psalter Pahlavi".
+ * @since 1.9
+ */
+ PSALTER_PAHLAVI,
+
+ /**
+ * Unicode script "Siddham".
+ * @since 1.9
+ */
+ SIDDHAM,
+
+ /**
+ * Unicode script "Khudawadi".
+ * @since 1.9
+ */
+ KHUDAWADI,
+
+ /**
+ * Unicode script "Tirhuta".
+ * @since 1.9
+ */
+ TIRHUTA,
+
+ /**
+ * Unicode script "Warang Citi".
+ * @since 1.9
+ */
+ WARANG_CITI,
+
+ /**
* Unicode script "Unknown".
*/
UNKNOWN;
@@ -3719,14 +4223,14 @@
0x005B, // 005B..0060; COMMON
0x0061, // 0061..007A; LATIN
0x007B, // 007B..00A9; COMMON
- 0x00AA, // 00AA..00AA; LATIN
+ 0x00AA, // 00AA ; LATIN
0x00AB, // 00AB..00B9; COMMON
- 0x00BA, // 00BA..00BA; LATIN
+ 0x00BA, // 00BA ; LATIN
0x00BB, // 00BB..00BF; COMMON
0x00C0, // 00C0..00D6; LATIN
- 0x00D7, // 00D7..00D7; COMMON
+ 0x00D7, // 00D7 ; COMMON
0x00D8, // 00D8..00F6; LATIN
- 0x00F7, // 00F7..00F7; COMMON
+ 0x00F7, // 00F7 ; COMMON
0x00F8, // 00F8..02B8; LATIN
0x02B9, // 02B9..02DF; COMMON
0x02E0, // 02E0..02E4; LATIN
@@ -3735,284 +4239,1178 @@
0x02EC, // 02EC..02FF; COMMON
0x0300, // 0300..036F; INHERITED
0x0370, // 0370..0373; GREEK
- 0x0374, // 0374..0374; COMMON
- 0x0375, // 0375..037D; GREEK
- 0x037E, // 037E..0383; COMMON
- 0x0384, // 0384..0384; GREEK
- 0x0385, // 0385..0385; COMMON
- 0x0386, // 0386..0386; GREEK
- 0x0387, // 0387..0387; COMMON
- 0x0388, // 0388..03E1; GREEK
+ 0x0374, // 0374 ; COMMON
+ 0x0375, // 0375..0377; GREEK
+ 0x0378, // 0378..0379; UNKNOWN
+ 0x037A, // 037A..037D; GREEK
+ 0x037E, // 037E ; COMMON
+ 0x037F, // 037F ; GREEK
+ 0x0380, // 0380..0383; UNKNOWN
+ 0x0384, // 0384 ; GREEK
+ 0x0385, // 0385 ; COMMON
+ 0x0386, // 0386 ; GREEK
+ 0x0387, // 0387 ; COMMON
+ 0x0388, // 0388..038A; GREEK
+ 0x038B, // 038B ; UNKNOWN
+ 0x038C, // 038C ; GREEK
+ 0x038D, // 038D ; UNKNOWN
+ 0x038E, // 038E..03A1; GREEK
+ 0x03A2, // 03A2 ; UNKNOWN
+ 0x03A3, // 03A3..03E1; GREEK
0x03E2, // 03E2..03EF; COPTIC
0x03F0, // 03F0..03FF; GREEK
0x0400, // 0400..0484; CYRILLIC
0x0485, // 0485..0486; INHERITED
- 0x0487, // 0487..0530; CYRILLIC
- 0x0531, // 0531..0588; ARMENIAN
- 0x0589, // 0589..0589; COMMON
- 0x058A, // 058A..0590; ARMENIAN
- 0x0591, // 0591..05FF; HEBREW
- 0x0600, // 0600..060B; ARABIC
- 0x060C, // 060C..060C; COMMON
+ 0x0487, // 0487..052F; CYRILLIC
+ 0x0530, // 0530 ; UNKNOWN
+ 0x0531, // 0531..0556; ARMENIAN
+ 0x0557, // 0557..0558; UNKNOWN
+ 0x0559, // 0559..055F; ARMENIAN
+ 0x0560, // 0560 ; UNKNOWN
+ 0x0561, // 0561..0587; ARMENIAN
+ 0x0588, // 0588 ; UNKNOWN
+ 0x0589, // 0589 ; COMMON
+ 0x058A, // 058A ; ARMENIAN
+ 0x058B, // 058B..058C; UNKNOWN
+ 0x058D, // 058D..058F; ARMENIAN
+ 0x0590, // 0590 ; UNKNOWN
+ 0x0591, // 0591..05C7; HEBREW
+ 0x05C8, // 05C8..05CF; UNKNOWN
+ 0x05D0, // 05D0..05EA; HEBREW
+ 0x05EB, // 05EB..05EF; UNKNOWN
+ 0x05F0, // 05F0..05F4; HEBREW
+ 0x05F5, // 05F5..05FF; UNKNOWN
+ 0x0600, // 0600..0604; ARABIC
+ 0x0605, // 0605 ; COMMON
+ 0x0606, // 0606..060B; ARABIC
+ 0x060C, // 060C ; COMMON
0x060D, // 060D..061A; ARABIC
- 0x061B, // 061B..061D; COMMON
- 0x061E, // 061E..061E; ARABIC
- 0x061F, // 061F..061F; COMMON
+ 0x061B, // 061B..061C; COMMON
+ 0x061D, // 061D ; UNKNOWN
+ 0x061E, // 061E ; ARABIC
+ 0x061F, // 061F ; COMMON
0x0620, // 0620..063F; ARABIC
- 0x0640, // 0640..0640; COMMON
+ 0x0640, // 0640 ; COMMON
0x0641, // 0641..064A; ARABIC
0x064B, // 064B..0655; INHERITED
0x0656, // 0656..065F; ARABIC
0x0660, // 0660..0669; COMMON
0x066A, // 066A..066F; ARABIC
- 0x0670, // 0670..0670; INHERITED
+ 0x0670, // 0670 ; INHERITED
0x0671, // 0671..06DC; ARABIC
- 0x06DD, // 06DD..06DD; COMMON
+ 0x06DD, // 06DD ; COMMON
0x06DE, // 06DE..06FF; ARABIC
- 0x0700, // 0700..074F; SYRIAC
+ 0x0700, // 0700..070D; SYRIAC
+ 0x070E, // 070E ; UNKNOWN
+ 0x070F, // 070F..074A; SYRIAC
+ 0x074B, // 074B..074C; UNKNOWN
+ 0x074D, // 074D..074F; SYRIAC
0x0750, // 0750..077F; ARABIC
- 0x0780, // 0780..07BF; THAANA
- 0x07C0, // 07C0..07FF; NKO
- 0x0800, // 0800..083F; SAMARITAN
- 0x0840, // 0840..089F; MANDAIC
- 0x08A0, // 08A0..08FF; ARABIC
+ 0x0780, // 0780..07B1; THAANA
+ 0x07B2, // 07B2..07BF; UNKNOWN
+ 0x07C0, // 07C0..07FA; NKO
+ 0x07FB, // 07FB..07FF; UNKNOWN
+ 0x0800, // 0800..082D; SAMARITAN
+ 0x082E, // 082E..082F; UNKNOWN
+ 0x0830, // 0830..083E; SAMARITAN
+ 0x083F, // 083F ; UNKNOWN
+ 0x0840, // 0840..085B; MANDAIC
+ 0x085C, // 085C..085D; UNKNOWN
+ 0x085E, // 085E ; MANDAIC
+ 0x085F, // 085F..089F; UNKNOWN
+ 0x08A0, // 08A0..08B2; ARABIC
+ 0x08B3, // 08B3..08E3; UNKNOWN
+ 0x08E4, // 08E4..08FF; ARABIC
0x0900, // 0900..0950; DEVANAGARI
0x0951, // 0951..0952; INHERITED
0x0953, // 0953..0963; DEVANAGARI
0x0964, // 0964..0965; COMMON
- 0x0966, // 0966..0980; DEVANAGARI
- 0x0981, // 0981..0A00; BENGALI
- 0x0A01, // 0A01..0A80; GURMUKHI
- 0x0A81, // 0A81..0B00; GUJARATI
- 0x0B01, // 0B01..0B81; ORIYA
- 0x0B82, // 0B82..0C00; TAMIL
- 0x0C01, // 0C01..0C81; TELUGU
- 0x0C82, // 0C82..0CF0; KANNADA
- 0x0D02, // 0D02..0D81; MALAYALAM
- 0x0D82, // 0D82..0E00; SINHALA
- 0x0E01, // 0E01..0E3E; THAI
- 0x0E3F, // 0E3F..0E3F; COMMON
- 0x0E40, // 0E40..0E80; THAI
- 0x0E81, // 0E81..0EFF; LAO
- 0x0F00, // 0F00..0FD4; TIBETAN
+ 0x0966, // 0966..097F; DEVANAGARI
+ 0x0980, // 0980..0983; BENGALI
+ 0x0984, // 0984 ; UNKNOWN
+ 0x0985, // 0985..098C; BENGALI
+ 0x098D, // 098D..098E; UNKNOWN
+ 0x098F, // 098F..0990; BENGALI
+ 0x0991, // 0991..0992; UNKNOWN
+ 0x0993, // 0993..09A8; BENGALI
+ 0x09A9, // 09A9 ; UNKNOWN
+ 0x09AA, // 09AA..09B0; BENGALI
+ 0x09B1, // 09B1 ; UNKNOWN
+ 0x09B2, // 09B2 ; BENGALI
+ 0x09B3, // 09B3..09B5; UNKNOWN
+ 0x09B6, // 09B6..09B9; BENGALI
+ 0x09BA, // 09BA..09BB; UNKNOWN
+ 0x09BC, // 09BC..09C4; BENGALI
+ 0x09C5, // 09C5..09C6; UNKNOWN
+ 0x09C7, // 09C7..09C8; BENGALI
+ 0x09C9, // 09C9..09CA; UNKNOWN
+ 0x09CB, // 09CB..09CE; BENGALI
+ 0x09CF, // 09CF..09D6; UNKNOWN
+ 0x09D7, // 09D7 ; BENGALI
+ 0x09D8, // 09D8..09DB; UNKNOWN
+ 0x09DC, // 09DC..09DD; BENGALI
+ 0x09DE, // 09DE ; UNKNOWN
+ 0x09DF, // 09DF..09E3; BENGALI
+ 0x09E4, // 09E4..09E5; UNKNOWN
+ 0x09E6, // 09E6..09FB; BENGALI
+ 0x09FC, // 09FC..0A00; UNKNOWN
+ 0x0A01, // 0A01..0A03; GURMUKHI
+ 0x0A04, // 0A04 ; UNKNOWN
+ 0x0A05, // 0A05..0A0A; GURMUKHI
+ 0x0A0B, // 0A0B..0A0E; UNKNOWN
+ 0x0A0F, // 0A0F..0A10; GURMUKHI
+ 0x0A11, // 0A11..0A12; UNKNOWN
+ 0x0A13, // 0A13..0A28; GURMUKHI
+ 0x0A29, // 0A29 ; UNKNOWN
+ 0x0A2A, // 0A2A..0A30; GURMUKHI
+ 0x0A31, // 0A31 ; UNKNOWN
+ 0x0A32, // 0A32..0A33; GURMUKHI
+ 0x0A34, // 0A34 ; UNKNOWN
+ 0x0A35, // 0A35..0A36; GURMUKHI
+ 0x0A37, // 0A37 ; UNKNOWN
+ 0x0A38, // 0A38..0A39; GURMUKHI
+ 0x0A3A, // 0A3A..0A3B; UNKNOWN
+ 0x0A3C, // 0A3C ; GURMUKHI
+ 0x0A3D, // 0A3D ; UNKNOWN
+ 0x0A3E, // 0A3E..0A42; GURMUKHI
+ 0x0A43, // 0A43..0A46; UNKNOWN
+ 0x0A47, // 0A47..0A48; GURMUKHI
+ 0x0A49, // 0A49..0A4A; UNKNOWN
+ 0x0A4B, // 0A4B..0A4D; GURMUKHI
+ 0x0A4E, // 0A4E..0A50; UNKNOWN
+ 0x0A51, // 0A51 ; GURMUKHI
+ 0x0A52, // 0A52..0A58; UNKNOWN
+ 0x0A59, // 0A59..0A5C; GURMUKHI
+ 0x0A5D, // 0A5D ; UNKNOWN
+ 0x0A5E, // 0A5E ; GURMUKHI
+ 0x0A5F, // 0A5F..0A65; UNKNOWN
+ 0x0A66, // 0A66..0A75; GURMUKHI
+ 0x0A76, // 0A76..0A80; UNKNOWN
+ 0x0A81, // 0A81..0A83; GUJARATI
+ 0x0A84, // 0A84 ; UNKNOWN
+ 0x0A85, // 0A85..0A8D; GUJARATI
+ 0x0A8E, // 0A8E ; UNKNOWN
+ 0x0A8F, // 0A8F..0A91; GUJARATI
+ 0x0A92, // 0A92 ; UNKNOWN
+ 0x0A93, // 0A93..0AA8; GUJARATI
+ 0x0AA9, // 0AA9 ; UNKNOWN
+ 0x0AAA, // 0AAA..0AB0; GUJARATI
+ 0x0AB1, // 0AB1 ; UNKNOWN
+ 0x0AB2, // 0AB2..0AB3; GUJARATI
+ 0x0AB4, // 0AB4 ; UNKNOWN
+ 0x0AB5, // 0AB5..0AB9; GUJARATI
+ 0x0ABA, // 0ABA..0ABB; UNKNOWN
+ 0x0ABC, // 0ABC..0AC5; GUJARATI
+ 0x0AC6, // 0AC6 ; UNKNOWN
+ 0x0AC7, // 0AC7..0AC9; GUJARATI
+ 0x0ACA, // 0ACA ; UNKNOWN
+ 0x0ACB, // 0ACB..0ACD; GUJARATI
+ 0x0ACE, // 0ACE..0ACF; UNKNOWN
+ 0x0AD0, // 0AD0 ; GUJARATI
+ 0x0AD1, // 0AD1..0ADF; UNKNOWN
+ 0x0AE0, // 0AE0..0AE3; GUJARATI
+ 0x0AE4, // 0AE4..0AE5; UNKNOWN
+ 0x0AE6, // 0AE6..0AF1; GUJARATI
+ 0x0AF2, // 0AF2..0B00; UNKNOWN
+ 0x0B01, // 0B01..0B03; ORIYA
+ 0x0B04, // 0B04 ; UNKNOWN
+ 0x0B05, // 0B05..0B0C; ORIYA
+ 0x0B0D, // 0B0D..0B0E; UNKNOWN
+ 0x0B0F, // 0B0F..0B10; ORIYA
+ 0x0B11, // 0B11..0B12; UNKNOWN
+ 0x0B13, // 0B13..0B28; ORIYA
+ 0x0B29, // 0B29 ; UNKNOWN
+ 0x0B2A, // 0B2A..0B30; ORIYA
+ 0x0B31, // 0B31 ; UNKNOWN
+ 0x0B32, // 0B32..0B33; ORIYA
+ 0x0B34, // 0B34 ; UNKNOWN
+ 0x0B35, // 0B35..0B39; ORIYA
+ 0x0B3A, // 0B3A..0B3B; UNKNOWN
+ 0x0B3C, // 0B3C..0B44; ORIYA
+ 0x0B45, // 0B45..0B46; UNKNOWN
+ 0x0B47, // 0B47..0B48; ORIYA
+ 0x0B49, // 0B49..0B4A; UNKNOWN
+ 0x0B4B, // 0B4B..0B4D; ORIYA
+ 0x0B4E, // 0B4E..0B55; UNKNOWN
+ 0x0B56, // 0B56..0B57; ORIYA
+ 0x0B58, // 0B58..0B5B; UNKNOWN
+ 0x0B5C, // 0B5C..0B5D; ORIYA
+ 0x0B5E, // 0B5E ; UNKNOWN
+ 0x0B5F, // 0B5F..0B63; ORIYA
+ 0x0B64, // 0B64..0B65; UNKNOWN
+ 0x0B66, // 0B66..0B77; ORIYA
+ 0x0B78, // 0B78..0B81; UNKNOWN
+ 0x0B82, // 0B82..0B83; TAMIL
+ 0x0B84, // 0B84 ; UNKNOWN
+ 0x0B85, // 0B85..0B8A; TAMIL
+ 0x0B8B, // 0B8B..0B8D; UNKNOWN
+ 0x0B8E, // 0B8E..0B90; TAMIL
+ 0x0B91, // 0B91 ; UNKNOWN
+ 0x0B92, // 0B92..0B95; TAMIL
+ 0x0B96, // 0B96..0B98; UNKNOWN
+ 0x0B99, // 0B99..0B9A; TAMIL
+ 0x0B9B, // 0B9B ; UNKNOWN
+ 0x0B9C, // 0B9C ; TAMIL
+ 0x0B9D, // 0B9D ; UNKNOWN
+ 0x0B9E, // 0B9E..0B9F; TAMIL
+ 0x0BA0, // 0BA0..0BA2; UNKNOWN
+ 0x0BA3, // 0BA3..0BA4; TAMIL
+ 0x0BA5, // 0BA5..0BA7; UNKNOWN
+ 0x0BA8, // 0BA8..0BAA; TAMIL
+ 0x0BAB, // 0BAB..0BAD; UNKNOWN
+ 0x0BAE, // 0BAE..0BB9; TAMIL
+ 0x0BBA, // 0BBA..0BBD; UNKNOWN
+ 0x0BBE, // 0BBE..0BC2; TAMIL
+ 0x0BC3, // 0BC3..0BC5; UNKNOWN
+ 0x0BC6, // 0BC6..0BC8; TAMIL
+ 0x0BC9, // 0BC9 ; UNKNOWN
+ 0x0BCA, // 0BCA..0BCD; TAMIL
+ 0x0BCE, // 0BCE..0BCF; UNKNOWN
+ 0x0BD0, // 0BD0 ; TAMIL
+ 0x0BD1, // 0BD1..0BD6; UNKNOWN
+ 0x0BD7, // 0BD7 ; TAMIL
+ 0x0BD8, // 0BD8..0BE5; UNKNOWN
+ 0x0BE6, // 0BE6..0BFA; TAMIL
+ 0x0BFB, // 0BFB..0BFF; UNKNOWN
+ 0x0C00, // 0C00..0C03; TELUGU
+ 0x0C04, // 0C04 ; UNKNOWN
+ 0x0C05, // 0C05..0C0C; TELUGU
+ 0x0C0D, // 0C0D ; UNKNOWN
+ 0x0C0E, // 0C0E..0C10; TELUGU
+ 0x0C11, // 0C11 ; UNKNOWN
+ 0x0C12, // 0C12..0C28; TELUGU
+ 0x0C29, // 0C29 ; UNKNOWN
+ 0x0C2A, // 0C2A..0C39; TELUGU
+ 0x0C3A, // 0C3A..0C3C; UNKNOWN
+ 0x0C3D, // 0C3D..0C44; TELUGU
+ 0x0C45, // 0C45 ; UNKNOWN
+ 0x0C46, // 0C46..0C48; TELUGU
+ 0x0C49, // 0C49 ; UNKNOWN
+ 0x0C4A, // 0C4A..0C4D; TELUGU
+ 0x0C4E, // 0C4E..0C54; UNKNOWN
+ 0x0C55, // 0C55..0C56; TELUGU
+ 0x0C57, // 0C57 ; UNKNOWN
+ 0x0C58, // 0C58..0C59; TELUGU
+ 0x0C5A, // 0C5A..0C5F; UNKNOWN
+ 0x0C60, // 0C60..0C63; TELUGU
+ 0x0C64, // 0C64..0C65; UNKNOWN
+ 0x0C66, // 0C66..0C6F; TELUGU
+ 0x0C70, // 0C70..0C77; UNKNOWN
+ 0x0C78, // 0C78..0C7F; TELUGU
+ 0x0C80, // 0C80 ; UNKNOWN
+ 0x0C81, // 0C81..0C83; KANNADA
+ 0x0C84, // 0C84 ; UNKNOWN
+ 0x0C85, // 0C85..0C8C; KANNADA
+ 0x0C8D, // 0C8D ; UNKNOWN
+ 0x0C8E, // 0C8E..0C90; KANNADA
+ 0x0C91, // 0C91 ; UNKNOWN
+ 0x0C92, // 0C92..0CA8; KANNADA
+ 0x0CA9, // 0CA9 ; UNKNOWN
+ 0x0CAA, // 0CAA..0CB3; KANNADA
+ 0x0CB4, // 0CB4 ; UNKNOWN
+ 0x0CB5, // 0CB5..0CB9; KANNADA
+ 0x0CBA, // 0CBA..0CBB; UNKNOWN
+ 0x0CBC, // 0CBC..0CC4; KANNADA
+ 0x0CC5, // 0CC5 ; UNKNOWN
+ 0x0CC6, // 0CC6..0CC8; KANNADA
+ 0x0CC9, // 0CC9 ; UNKNOWN
+ 0x0CCA, // 0CCA..0CCD; KANNADA
+ 0x0CCE, // 0CCE..0CD4; UNKNOWN
+ 0x0CD5, // 0CD5..0CD6; KANNADA
+ 0x0CD7, // 0CD7..0CDD; UNKNOWN
+ 0x0CDE, // 0CDE ; KANNADA
+ 0x0CDF, // 0CDF ; UNKNOWN
+ 0x0CE0, // 0CE0..0CE3; KANNADA
+ 0x0CE4, // 0CE4..0CE5; UNKNOWN
+ 0x0CE6, // 0CE6..0CEF; KANNADA
+ 0x0CF0, // 0CF0 ; UNKNOWN
+ 0x0CF1, // 0CF1..0CF2; KANNADA
+ 0x0CF3, // 0CF3..0D00; UNKNOWN
+ 0x0D01, // 0D01..0D03; MALAYALAM
+ 0x0D04, // 0D04 ; UNKNOWN
+ 0x0D05, // 0D05..0D0C; MALAYALAM
+ 0x0D0D, // 0D0D ; UNKNOWN
+ 0x0D0E, // 0D0E..0D10; MALAYALAM
+ 0x0D11, // 0D11 ; UNKNOWN
+ 0x0D12, // 0D12..0D3A; MALAYALAM
+ 0x0D3B, // 0D3B..0D3C; UNKNOWN
+ 0x0D3D, // 0D3D..0D44; MALAYALAM
+ 0x0D45, // 0D45 ; UNKNOWN
+ 0x0D46, // 0D46..0D48; MALAYALAM
+ 0x0D49, // 0D49 ; UNKNOWN
+ 0x0D4A, // 0D4A..0D4E; MALAYALAM
+ 0x0D4F, // 0D4F..0D56; UNKNOWN
+ 0x0D57, // 0D57 ; MALAYALAM
+ 0x0D58, // 0D58..0D5F; UNKNOWN
+ 0x0D60, // 0D60..0D63; MALAYALAM
+ 0x0D64, // 0D64..0D65; UNKNOWN
+ 0x0D66, // 0D66..0D75; MALAYALAM
+ 0x0D76, // 0D76..0D78; UNKNOWN
+ 0x0D79, // 0D79..0D7F; MALAYALAM
+ 0x0D80, // 0D80..0D81; UNKNOWN
+ 0x0D82, // 0D82..0D83; SINHALA
+ 0x0D84, // 0D84 ; UNKNOWN
+ 0x0D85, // 0D85..0D96; SINHALA
+ 0x0D97, // 0D97..0D99; UNKNOWN
+ 0x0D9A, // 0D9A..0DB1; SINHALA
+ 0x0DB2, // 0DB2 ; UNKNOWN
+ 0x0DB3, // 0DB3..0DBB; SINHALA
+ 0x0DBC, // 0DBC ; UNKNOWN
+ 0x0DBD, // 0DBD ; SINHALA
+ 0x0DBE, // 0DBE..0DBF; UNKNOWN
+ 0x0DC0, // 0DC0..0DC6; SINHALA
+ 0x0DC7, // 0DC7..0DC9; UNKNOWN
+ 0x0DCA, // 0DCA ; SINHALA
+ 0x0DCB, // 0DCB..0DCE; UNKNOWN
+ 0x0DCF, // 0DCF..0DD4; SINHALA
+ 0x0DD5, // 0DD5 ; UNKNOWN
+ 0x0DD6, // 0DD6 ; SINHALA
+ 0x0DD7, // 0DD7 ; UNKNOWN
+ 0x0DD8, // 0DD8..0DDF; SINHALA
+ 0x0DE0, // 0DE0..0DE5; UNKNOWN
+ 0x0DE6, // 0DE6..0DEF; SINHALA
+ 0x0DF0, // 0DF0..0DF1; UNKNOWN
+ 0x0DF2, // 0DF2..0DF4; SINHALA
+ 0x0DF5, // 0DF5..0E00; UNKNOWN
+ 0x0E01, // 0E01..0E3A; THAI
+ 0x0E3B, // 0E3B..0E3E; UNKNOWN
+ 0x0E3F, // 0E3F ; COMMON
+ 0x0E40, // 0E40..0E5B; THAI
+ 0x0E5C, // 0E5C..0E80; UNKNOWN
+ 0x0E81, // 0E81..0E82; LAO
+ 0x0E83, // 0E83 ; UNKNOWN
+ 0x0E84, // 0E84 ; LAO
+ 0x0E85, // 0E85..0E86; UNKNOWN
+ 0x0E87, // 0E87..0E88; LAO
+ 0x0E89, // 0E89 ; UNKNOWN
+ 0x0E8A, // 0E8A ; LAO
+ 0x0E8B, // 0E8B..0E8C; UNKNOWN
+ 0x0E8D, // 0E8D ; LAO
+ 0x0E8E, // 0E8E..0E93; UNKNOWN
+ 0x0E94, // 0E94..0E97; LAO
+ 0x0E98, // 0E98 ; UNKNOWN
+ 0x0E99, // 0E99..0E9F; LAO
+ 0x0EA0, // 0EA0 ; UNKNOWN
+ 0x0EA1, // 0EA1..0EA3; LAO
+ 0x0EA4, // 0EA4 ; UNKNOWN
+ 0x0EA5, // 0EA5 ; LAO
+ 0x0EA6, // 0EA6 ; UNKNOWN
+ 0x0EA7, // 0EA7 ; LAO
+ 0x0EA8, // 0EA8..0EA9; UNKNOWN
+ 0x0EAA, // 0EAA..0EAB; LAO
+ 0x0EAC, // 0EAC ; UNKNOWN
+ 0x0EAD, // 0EAD..0EB9; LAO
+ 0x0EBA, // 0EBA ; UNKNOWN
+ 0x0EBB, // 0EBB..0EBD; LAO
+ 0x0EBE, // 0EBE..0EBF; UNKNOWN
+ 0x0EC0, // 0EC0..0EC4; LAO
+ 0x0EC5, // 0EC5 ; UNKNOWN
+ 0x0EC6, // 0EC6 ; LAO
+ 0x0EC7, // 0EC7 ; UNKNOWN
+ 0x0EC8, // 0EC8..0ECD; LAO
+ 0x0ECE, // 0ECE..0ECF; UNKNOWN
+ 0x0ED0, // 0ED0..0ED9; LAO
+ 0x0EDA, // 0EDA..0EDB; UNKNOWN
+ 0x0EDC, // 0EDC..0EDF; LAO
+ 0x0EE0, // 0EE0..0EFF; UNKNOWN
+ 0x0F00, // 0F00..0F47; TIBETAN
+ 0x0F48, // 0F48 ; UNKNOWN
+ 0x0F49, // 0F49..0F6C; TIBETAN
+ 0x0F6D, // 0F6D..0F70; UNKNOWN
+ 0x0F71, // 0F71..0F97; TIBETAN
+ 0x0F98, // 0F98 ; UNKNOWN
+ 0x0F99, // 0F99..0FBC; TIBETAN
+ 0x0FBD, // 0FBD ; UNKNOWN
+ 0x0FBE, // 0FBE..0FCC; TIBETAN
+ 0x0FCD, // 0FCD ; UNKNOWN
+ 0x0FCE, // 0FCE..0FD4; TIBETAN
0x0FD5, // 0FD5..0FD8; COMMON
- 0x0FD9, // 0FD9..0FFF; TIBETAN
+ 0x0FD9, // 0FD9..0FDA; TIBETAN
+ 0x0FDB, // 0FDB..0FFF; UNKNOWN
0x1000, // 1000..109F; MYANMAR
- 0x10A0, // 10A0..10FA; GEORGIAN
- 0x10FB, // 10FB..10FB; COMMON
+ 0x10A0, // 10A0..10C5; GEORGIAN
+ 0x10C6, // 10C6 ; UNKNOWN
+ 0x10C7, // 10C7 ; GEORGIAN
+ 0x10C8, // 10C8..10CC; UNKNOWN
+ 0x10CD, // 10CD ; GEORGIAN
+ 0x10CE, // 10CE..10CF; UNKNOWN
+ 0x10D0, // 10D0..10FA; GEORGIAN
+ 0x10FB, // 10FB ; COMMON
0x10FC, // 10FC..10FF; GEORGIAN
0x1100, // 1100..11FF; HANGUL
- 0x1200, // 1200..139F; ETHIOPIC
- 0x13A0, // 13A0..13FF; CHEROKEE
+ 0x1200, // 1200..1248; ETHIOPIC
+ 0x1249, // 1249 ; UNKNOWN
+ 0x124A, // 124A..124D; ETHIOPIC
+ 0x124E, // 124E..124F; UNKNOWN
+ 0x1250, // 1250..1256; ETHIOPIC
+ 0x1257, // 1257 ; UNKNOWN
+ 0x1258, // 1258 ; ETHIOPIC
+ 0x1259, // 1259 ; UNKNOWN
+ 0x125A, // 125A..125D; ETHIOPIC
+ 0x125E, // 125E..125F; UNKNOWN
+ 0x1260, // 1260..1288; ETHIOPIC
+ 0x1289, // 1289 ; UNKNOWN
+ 0x128A, // 128A..128D; ETHIOPIC
+ 0x128E, // 128E..128F; UNKNOWN
+ 0x1290, // 1290..12B0; ETHIOPIC
+ 0x12B1, // 12B1 ; UNKNOWN
+ 0x12B2, // 12B2..12B5; ETHIOPIC
+ 0x12B6, // 12B6..12B7; UNKNOWN
+ 0x12B8, // 12B8..12BE; ETHIOPIC
+ 0x12BF, // 12BF ; UNKNOWN
+ 0x12C0, // 12C0 ; ETHIOPIC
+ 0x12C1, // 12C1 ; UNKNOWN
+ 0x12C2, // 12C2..12C5; ETHIOPIC
+ 0x12C6, // 12C6..12C7; UNKNOWN
+ 0x12C8, // 12C8..12D6; ETHIOPIC
+ 0x12D7, // 12D7 ; UNKNOWN
+ 0x12D8, // 12D8..1310; ETHIOPIC
+ 0x1311, // 1311 ; UNKNOWN
+ 0x1312, // 1312..1315; ETHIOPIC
+ 0x1316, // 1316..1317; UNKNOWN
+ 0x1318, // 1318..135A; ETHIOPIC
+ 0x135B, // 135B..135C; UNKNOWN
+ 0x135D, // 135D..137C; ETHIOPIC
+ 0x137D, // 137D..137F; UNKNOWN
+ 0x1380, // 1380..1399; ETHIOPIC
+ 0x139A, // 139A..139F; UNKNOWN
+ 0x13A0, // 13A0..13F4; CHEROKEE
+ 0x13F5, // 13F5..13FF; UNKNOWN
0x1400, // 1400..167F; CANADIAN_ABORIGINAL
- 0x1680, // 1680..169F; OGHAM
+ 0x1680, // 1680..169C; OGHAM
+ 0x169D, // 169D..169F; UNKNOWN
0x16A0, // 16A0..16EA; RUNIC
0x16EB, // 16EB..16ED; COMMON
- 0x16EE, // 16EE..16FF; RUNIC
- 0x1700, // 1700..171F; TAGALOG
+ 0x16EE, // 16EE..16F8; RUNIC
+ 0x16F9, // 16F9..16FF; UNKNOWN
+ 0x1700, // 1700..170C; TAGALOG
+ 0x170D, // 170D ; UNKNOWN
+ 0x170E, // 170E..1714; TAGALOG
+ 0x1715, // 1715..171F; UNKNOWN
0x1720, // 1720..1734; HANUNOO
- 0x1735, // 1735..173F; COMMON
- 0x1740, // 1740..175F; BUHID
- 0x1760, // 1760..177F; TAGBANWA
- 0x1780, // 1780..17FF; KHMER
+ 0x1735, // 1735..1736; COMMON
+ 0x1737, // 1737..173F; UNKNOWN
+ 0x1740, // 1740..1753; BUHID
+ 0x1754, // 1754..175F; UNKNOWN
+ 0x1760, // 1760..176C; TAGBANWA
+ 0x176D, // 176D ; UNKNOWN
+ 0x176E, // 176E..1770; TAGBANWA
+ 0x1771, // 1771 ; UNKNOWN
+ 0x1772, // 1772..1773; TAGBANWA
+ 0x1774, // 1774..177F; UNKNOWN
+ 0x1780, // 1780..17DD; KHMER
+ 0x17DE, // 17DE..17DF; UNKNOWN
+ 0x17E0, // 17E0..17E9; KHMER
+ 0x17EA, // 17EA..17EF; UNKNOWN
+ 0x17F0, // 17F0..17F9; KHMER
+ 0x17FA, // 17FA..17FF; UNKNOWN
0x1800, // 1800..1801; MONGOLIAN
0x1802, // 1802..1803; COMMON
- 0x1804, // 1804..1804; MONGOLIAN
- 0x1805, // 1805..1805; COMMON
- 0x1806, // 1806..18AF; MONGOLIAN
- 0x18B0, // 18B0..18FF; CANADIAN_ABORIGINAL
- 0x1900, // 1900..194F; LIMBU
- 0x1950, // 1950..197F; TAI_LE
- 0x1980, // 1980..19DF; NEW_TAI_LUE
+ 0x1804, // 1804 ; MONGOLIAN
+ 0x1805, // 1805 ; COMMON
+ 0x1806, // 1806..180E; MONGOLIAN
+ 0x180F, // 180F ; UNKNOWN
+ 0x1810, // 1810..1819; MONGOLIAN
+ 0x181A, // 181A..181F; UNKNOWN
+ 0x1820, // 1820..1877; MONGOLIAN
+ 0x1878, // 1878..187F; UNKNOWN
+ 0x1880, // 1880..18AA; MONGOLIAN
+ 0x18AB, // 18AB..18AF; UNKNOWN
+ 0x18B0, // 18B0..18F5; CANADIAN_ABORIGINAL
+ 0x18F6, // 18F6..18FF; UNKNOWN
+ 0x1900, // 1900..191E; LIMBU
+ 0x191F, // 191F ; UNKNOWN
+ 0x1920, // 1920..192B; LIMBU
+ 0x192C, // 192C..192F; UNKNOWN
+ 0x1930, // 1930..193B; LIMBU
+ 0x193C, // 193C..193F; UNKNOWN
+ 0x1940, // 1940 ; LIMBU
+ 0x1941, // 1941..1943; UNKNOWN
+ 0x1944, // 1944..194F; LIMBU
+ 0x1950, // 1950..196D; TAI_LE
+ 0x196E, // 196E..196F; UNKNOWN
+ 0x1970, // 1970..1974; TAI_LE
+ 0x1975, // 1975..197F; UNKNOWN
+ 0x1980, // 1980..19AB; NEW_TAI_LUE
+ 0x19AC, // 19AC..19AF; UNKNOWN
+ 0x19B0, // 19B0..19C9; NEW_TAI_LUE
+ 0x19CA, // 19CA..19CF; UNKNOWN
+ 0x19D0, // 19D0..19DA; NEW_TAI_LUE
+ 0x19DB, // 19DB..19DD; UNKNOWN
+ 0x19DE, // 19DE..19DF; NEW_TAI_LUE
0x19E0, // 19E0..19FF; KHMER
- 0x1A00, // 1A00..1A1F; BUGINESE
- 0x1A20, // 1A20..1AFF; TAI_THAM
- 0x1B00, // 1B00..1B7F; BALINESE
+ 0x1A00, // 1A00..1A1B; BUGINESE
+ 0x1A1C, // 1A1C..1A1D; UNKNOWN
+ 0x1A1E, // 1A1E..1A1F; BUGINESE
+ 0x1A20, // 1A20..1A5E; TAI_THAM
+ 0x1A5F, // 1A5F ; UNKNOWN
+ 0x1A60, // 1A60..1A7C; TAI_THAM
+ 0x1A7D, // 1A7D..1A7E; UNKNOWN
+ 0x1A7F, // 1A7F..1A89; TAI_THAM
+ 0x1A8A, // 1A8A..1A8F; UNKNOWN
+ 0x1A90, // 1A90..1A99; TAI_THAM
+ 0x1A9A, // 1A9A..1A9F; UNKNOWN
+ 0x1AA0, // 1AA0..1AAD; TAI_THAM
+ 0x1AAE, // 1AAE..1AAF; UNKNOWN
+ 0x1AB0, // 1AB0..1ABE; INHERITED
+ 0x1ABF, // 1ABF..1AFF; UNKNOWN
+ 0x1B00, // 1B00..1B4B; BALINESE
+ 0x1B4C, // 1B4C..1B4F; UNKNOWN
+ 0x1B50, // 1B50..1B7C; BALINESE
+ 0x1B7D, // 1B7D..1B7F; UNKNOWN
0x1B80, // 1B80..1BBF; SUNDANESE
- 0x1BC0, // 1BC0..1BFF; BATAK
- 0x1C00, // 1C00..1C4F; LEPCHA
- 0x1C50, // 1C50..1CBF; OL_CHIKI
- 0x1CC0, // 1CC0..1CCF; SUNDANESE
+ 0x1BC0, // 1BC0..1BF3; BATAK
+ 0x1BF4, // 1BF4..1BFB; UNKNOWN
+ 0x1BFC, // 1BFC..1BFF; BATAK
+ 0x1C00, // 1C00..1C37; LEPCHA
+ 0x1C38, // 1C38..1C3A; UNKNOWN
+ 0x1C3B, // 1C3B..1C49; LEPCHA
+ 0x1C4A, // 1C4A..1C4C; UNKNOWN
+ 0x1C4D, // 1C4D..1C4F; LEPCHA
+ 0x1C50, // 1C50..1C7F; OL_CHIKI
+ 0x1C80, // 1C80..1CBF; UNKNOWN
+ 0x1CC0, // 1CC0..1CC7; SUNDANESE
+ 0x1CC8, // 1CC8..1CCF; UNKNOWN
0x1CD0, // 1CD0..1CD2; INHERITED
- 0x1CD3, // 1CD3..1CD3; COMMON
+ 0x1CD3, // 1CD3 ; COMMON
0x1CD4, // 1CD4..1CE0; INHERITED
- 0x1CE1, // 1CE1..1CE1; COMMON
+ 0x1CE1, // 1CE1 ; COMMON
0x1CE2, // 1CE2..1CE8; INHERITED
0x1CE9, // 1CE9..1CEC; COMMON
- 0x1CED, // 1CED..1CED; INHERITED
+ 0x1CED, // 1CED ; INHERITED
0x1CEE, // 1CEE..1CF3; COMMON
- 0x1CF4, // 1CF4..1CF4; INHERITED
- 0x1CF5, // 1CF5..1CFF; COMMON
+ 0x1CF4, // 1CF4 ; INHERITED
+ 0x1CF5, // 1CF5..1CF6; COMMON
+ 0x1CF7, // 1CF7 ; UNKNOWN
+ 0x1CF8, // 1CF8..1CF9; INHERITED
+ 0x1CFA, // 1CFA..1CFF; UNKNOWN
0x1D00, // 1D00..1D25; LATIN
0x1D26, // 1D26..1D2A; GREEK
- 0x1D2B, // 1D2B..1D2B; CYRILLIC
+ 0x1D2B, // 1D2B ; CYRILLIC
0x1D2C, // 1D2C..1D5C; LATIN
0x1D5D, // 1D5D..1D61; GREEK
0x1D62, // 1D62..1D65; LATIN
0x1D66, // 1D66..1D6A; GREEK
0x1D6B, // 1D6B..1D77; LATIN
- 0x1D78, // 1D78..1D78; CYRILLIC
+ 0x1D78, // 1D78 ; CYRILLIC
0x1D79, // 1D79..1DBE; LATIN
- 0x1DBF, // 1DBF..1DBF; GREEK
- 0x1DC0, // 1DC0..1DFF; INHERITED
+ 0x1DBF, // 1DBF ; GREEK
+ 0x1DC0, // 1DC0..1DF5; INHERITED
+ 0x1DF6, // 1DF6..1DFB; UNKNOWN
+ 0x1DFC, // 1DFC..1DFF; INHERITED
0x1E00, // 1E00..1EFF; LATIN
- 0x1F00, // 1F00..1FFF; GREEK
+ 0x1F00, // 1F00..1F15; GREEK
+ 0x1F16, // 1F16..1F17; UNKNOWN
+ 0x1F18, // 1F18..1F1D; GREEK
+ 0x1F1E, // 1F1E..1F1F; UNKNOWN
+ 0x1F20, // 1F20..1F45; GREEK
+ 0x1F46, // 1F46..1F47; UNKNOWN
+ 0x1F48, // 1F48..1F4D; GREEK
+ 0x1F4E, // 1F4E..1F4F; UNKNOWN
+ 0x1F50, // 1F50..1F57; GREEK
+ 0x1F58, // 1F58 ; UNKNOWN
+ 0x1F59, // 1F59 ; GREEK
+ 0x1F5A, // 1F5A ; UNKNOWN
+ 0x1F5B, // 1F5B ; GREEK
+ 0x1F5C, // 1F5C ; UNKNOWN
+ 0x1F5D, // 1F5D ; GREEK
+ 0x1F5E, // 1F5E ; UNKNOWN
+ 0x1F5F, // 1F5F..1F7D; GREEK
+ 0x1F7E, // 1F7E..1F7F; UNKNOWN
+ 0x1F80, // 1F80..1FB4; GREEK
+ 0x1FB5, // 1FB5 ; UNKNOWN
+ 0x1FB6, // 1FB6..1FC4; GREEK
+ 0x1FC5, // 1FC5 ; UNKNOWN
+ 0x1FC6, // 1FC6..1FD3; GREEK
+ 0x1FD4, // 1FD4..1FD5; UNKNOWN
+ 0x1FD6, // 1FD6..1FDB; GREEK
+ 0x1FDC, // 1FDC ; UNKNOWN
+ 0x1FDD, // 1FDD..1FEF; GREEK
+ 0x1FF0, // 1FF0..1FF1; UNKNOWN
+ 0x1FF2, // 1FF2..1FF4; GREEK
+ 0x1FF5, // 1FF5 ; UNKNOWN
+ 0x1FF6, // 1FF6..1FFE; GREEK
+ 0x1FFF, // 1FFF ; UNKNOWN
0x2000, // 2000..200B; COMMON
0x200C, // 200C..200D; INHERITED
- 0x200E, // 200E..2070; COMMON
- 0x2071, // 2071..2073; LATIN
+ 0x200E, // 200E..2064; COMMON
+ 0x2065, // 2065 ; UNKNOWN
+ 0x2066, // 2066..2070; COMMON
+ 0x2071, // 2071 ; LATIN
+ 0x2072, // 2072..2073; UNKNOWN
0x2074, // 2074..207E; COMMON
- 0x207F, // 207F..207F; LATIN
- 0x2080, // 2080..208F; COMMON
- 0x2090, // 2090..209F; LATIN
- 0x20A0, // 20A0..20CF; COMMON
- 0x20D0, // 20D0..20FF; INHERITED
+ 0x207F, // 207F ; LATIN
+ 0x2080, // 2080..208E; COMMON
+ 0x208F, // 208F ; UNKNOWN
+ 0x2090, // 2090..209C; LATIN
+ 0x209D, // 209D..209F; UNKNOWN
+ 0x20A0, // 20A0..20BD; COMMON
+ 0x20BE, // 20BE..20CF; UNKNOWN
+ 0x20D0, // 20D0..20F0; INHERITED
+ 0x20F1, // 20F1..20FF; UNKNOWN
0x2100, // 2100..2125; COMMON
- 0x2126, // 2126..2126; GREEK
+ 0x2126, // 2126 ; GREEK
0x2127, // 2127..2129; COMMON
0x212A, // 212A..212B; LATIN
0x212C, // 212C..2131; COMMON
- 0x2132, // 2132..2132; LATIN
+ 0x2132, // 2132 ; LATIN
0x2133, // 2133..214D; COMMON
- 0x214E, // 214E..214E; LATIN
+ 0x214E, // 214E ; LATIN
0x214F, // 214F..215F; COMMON
0x2160, // 2160..2188; LATIN
- 0x2189, // 2189..27FF; COMMON
+ 0x2189, // 2189 ; COMMON
+ 0x218A, // 218A..218F; UNKNOWN
+ 0x2190, // 2190..23FA; COMMON
+ 0x23FB, // 23FB..23FF; UNKNOWN
+ 0x2400, // 2400..2426; COMMON
+ 0x2427, // 2427..243F; UNKNOWN
+ 0x2440, // 2440..244A; COMMON
+ 0x244B, // 244B..245F; UNKNOWN
+ 0x2460, // 2460..27FF; COMMON
0x2800, // 2800..28FF; BRAILLE
- 0x2900, // 2900..2BFF; COMMON
- 0x2C00, // 2C00..2C5F; GLAGOLITIC
+ 0x2900, // 2900..2B73; COMMON
+ 0x2B74, // 2B74..2B75; UNKNOWN
+ 0x2B76, // 2B76..2B95; COMMON
+ 0x2B96, // 2B96..2B97; UNKNOWN
+ 0x2B98, // 2B98..2BB9; COMMON
+ 0x2BBA, // 2BBA..2BBC; UNKNOWN
+ 0x2BBD, // 2BBD..2BC8; COMMON
+ 0x2BC9, // 2BC9 ; UNKNOWN
+ 0x2BCA, // 2BCA..2BD1; COMMON
+ 0x2BD2, // 2BD2..2BFF; UNKNOWN
+ 0x2C00, // 2C00..2C2E; GLAGOLITIC
+ 0x2C2F, // 2C2F ; UNKNOWN
+ 0x2C30, // 2C30..2C5E; GLAGOLITIC
+ 0x2C5F, // 2C5F ; UNKNOWN
0x2C60, // 2C60..2C7F; LATIN
- 0x2C80, // 2C80..2CFF; COPTIC
- 0x2D00, // 2D00..2D2F; GEORGIAN
- 0x2D30, // 2D30..2D7F; TIFINAGH
- 0x2D80, // 2D80..2DDF; ETHIOPIC
+ 0x2C80, // 2C80..2CF3; COPTIC
+ 0x2CF4, // 2CF4..2CF8; UNKNOWN
+ 0x2CF9, // 2CF9..2CFF; COPTIC
+ 0x2D00, // 2D00..2D25; GEORGIAN
+ 0x2D26, // 2D26 ; UNKNOWN
+ 0x2D27, // 2D27 ; GEORGIAN
+ 0x2D28, // 2D28..2D2C; UNKNOWN
+ 0x2D2D, // 2D2D ; GEORGIAN
+ 0x2D2E, // 2D2E..2D2F; UNKNOWN
+ 0x2D30, // 2D30..2D67; TIFINAGH
+ 0x2D68, // 2D68..2D6E; UNKNOWN
+ 0x2D6F, // 2D6F..2D70; TIFINAGH
+ 0x2D71, // 2D71..2D7E; UNKNOWN
+ 0x2D7F, // 2D7F ; TIFINAGH
+ 0x2D80, // 2D80..2D96; ETHIOPIC
+ 0x2D97, // 2D97..2D9F; UNKNOWN
+ 0x2DA0, // 2DA0..2DA6; ETHIOPIC
+ 0x2DA7, // 2DA7 ; UNKNOWN
+ 0x2DA8, // 2DA8..2DAE; ETHIOPIC
+ 0x2DAF, // 2DAF ; UNKNOWN
+ 0x2DB0, // 2DB0..2DB6; ETHIOPIC
+ 0x2DB7, // 2DB7 ; UNKNOWN
+ 0x2DB8, // 2DB8..2DBE; ETHIOPIC
+ 0x2DBF, // 2DBF ; UNKNOWN
+ 0x2DC0, // 2DC0..2DC6; ETHIOPIC
+ 0x2DC7, // 2DC7 ; UNKNOWN
+ 0x2DC8, // 2DC8..2DCE; ETHIOPIC
+ 0x2DCF, // 2DCF ; UNKNOWN
+ 0x2DD0, // 2DD0..2DD6; ETHIOPIC
+ 0x2DD7, // 2DD7 ; UNKNOWN
+ 0x2DD8, // 2DD8..2DDE; ETHIOPIC
+ 0x2DDF, // 2DDF ; UNKNOWN
0x2DE0, // 2DE0..2DFF; CYRILLIC
- 0x2E00, // 2E00..2E7F; COMMON
- 0x2E80, // 2E80..2FEF; HAN
- 0x2FF0, // 2FF0..3004; COMMON
- 0x3005, // 3005..3005; HAN
- 0x3006, // 3006..3006; COMMON
- 0x3007, // 3007..3007; HAN
+ 0x2E00, // 2E00..2E42; COMMON
+ 0x2E43, // 2E43..2E7F; UNKNOWN
+ 0x2E80, // 2E80..2E99; HAN
+ 0x2E9A, // 2E9A ; UNKNOWN
+ 0x2E9B, // 2E9B..2EF3; HAN
+ 0x2EF4, // 2EF4..2EFF; UNKNOWN
+ 0x2F00, // 2F00..2FD5; HAN
+ 0x2FD6, // 2FD6..2FEF; UNKNOWN
+ 0x2FF0, // 2FF0..2FFB; COMMON
+ 0x2FFC, // 2FFC..2FFF; UNKNOWN
+ 0x3000, // 3000..3004; COMMON
+ 0x3005, // 3005 ; HAN
+ 0x3006, // 3006 ; COMMON
+ 0x3007, // 3007 ; HAN
0x3008, // 3008..3020; COMMON
0x3021, // 3021..3029; HAN
0x302A, // 302A..302D; INHERITED
0x302E, // 302E..302F; HANGUL
0x3030, // 3030..3037; COMMON
0x3038, // 3038..303B; HAN
- 0x303C, // 303C..3040; COMMON
- 0x3041, // 3041..3098; HIRAGANA
+ 0x303C, // 303C..303F; COMMON
+ 0x3040, // 3040 ; UNKNOWN
+ 0x3041, // 3041..3096; HIRAGANA
+ 0x3097, // 3097..3098; UNKNOWN
0x3099, // 3099..309A; INHERITED
0x309B, // 309B..309C; COMMON
0x309D, // 309D..309F; HIRAGANA
- 0x30A0, // 30A0..30A0; COMMON
+ 0x30A0, // 30A0 ; COMMON
0x30A1, // 30A1..30FA; KATAKANA
0x30FB, // 30FB..30FC; COMMON
- 0x30FD, // 30FD..3104; KATAKANA
- 0x3105, // 3105..3130; BOPOMOFO
- 0x3131, // 3131..318F; HANGUL
+ 0x30FD, // 30FD..30FF; KATAKANA
+ 0x3100, // 3100..3104; UNKNOWN
+ 0x3105, // 3105..312D; BOPOMOFO
+ 0x312E, // 312E..3130; UNKNOWN
+ 0x3131, // 3131..318E; HANGUL
+ 0x318F, // 318F ; UNKNOWN
0x3190, // 3190..319F; COMMON
- 0x31A0, // 31A0..31BF; BOPOMOFO
- 0x31C0, // 31C0..31EF; COMMON
+ 0x31A0, // 31A0..31BA; BOPOMOFO
+ 0x31BB, // 31BB..31BF; UNKNOWN
+ 0x31C0, // 31C0..31E3; COMMON
+ 0x31E4, // 31E4..31EF; UNKNOWN
0x31F0, // 31F0..31FF; KATAKANA
- 0x3200, // 3200..321F; HANGUL
+ 0x3200, // 3200..321E; HANGUL
+ 0x321F, // 321F ; UNKNOWN
0x3220, // 3220..325F; COMMON
0x3260, // 3260..327E; HANGUL
0x327F, // 327F..32CF; COMMON
- 0x32D0, // 32D0..3357; KATAKANA
+ 0x32D0, // 32D0..32FE; KATAKANA
+ 0x32FF, // 32FF ; UNKNOWN
+ 0x3300, // 3300..3357; KATAKANA
0x3358, // 3358..33FF; COMMON
- 0x3400, // 3400..4DBF; HAN
+ 0x3400, // 3400..4DB5; HAN
+ 0x4DB6, // 4DB6..4DBF; UNKNOWN
0x4DC0, // 4DC0..4DFF; COMMON
- 0x4E00, // 4E00..9FFF; HAN
- 0xA000, // A000..A4CF; YI
+ 0x4E00, // 4E00..9FCC; HAN
+ 0x9FCD, // 9FCD..9FFF; UNKNOWN
+ 0xA000, // A000..A48C; YI
+ 0xA48D, // A48D..A48F; UNKNOWN
+ 0xA490, // A490..A4C6; YI
+ 0xA4C7, // A4C7..A4CF; UNKNOWN
0xA4D0, // A4D0..A4FF; LISU
- 0xA500, // A500..A63F; VAI
- 0xA640, // A640..A69F; CYRILLIC
- 0xA6A0, // A6A0..A6FF; BAMUM
+ 0xA500, // A500..A62B; VAI
+ 0xA62C, // A62C..A63F; UNKNOWN
+ 0xA640, // A640..A69D; CYRILLIC
+ 0xA69E, // A69E ; UNKNOWN
+ 0xA69F, // A69F ; CYRILLIC
+ 0xA6A0, // A6A0..A6F7; BAMUM
+ 0xA6F8, // A6F8..A6FF; UNKNOWN
0xA700, // A700..A721; COMMON
0xA722, // A722..A787; LATIN
0xA788, // A788..A78A; COMMON
- 0xA78B, // A78B..A7FF; LATIN
- 0xA800, // A800..A82F; SYLOTI_NAGRI
- 0xA830, // A830..A83F; COMMON
- 0xA840, // A840..A87F; PHAGS_PA
- 0xA880, // A880..A8DF; SAURASHTRA
- 0xA8E0, // A8E0..A8FF; DEVANAGARI
- 0xA900, // A900..A92F; KAYAH_LI
- 0xA930, // A930..A95F; REJANG
- 0xA960, // A960..A97F; HANGUL
- 0xA980, // A980..A9FF; JAVANESE
- 0xAA00, // AA00..AA5F; CHAM
+ 0xA78B, // A78B..A78E; LATIN
+ 0xA78F, // A78F ; UNKNOWN
+ 0xA790, // A790..A7AD; LATIN
+ 0xA7AE, // A7AE..A7AF; UNKNOWN
+ 0xA7B0, // A7B0..A7B1; LATIN
+ 0xA7B2, // A7B2..A7F6; UNKNOWN
+ 0xA7F7, // A7F7..A7FF; LATIN
+ 0xA800, // A800..A82B; SYLOTI_NAGRI
+ 0xA82C, // A82C..A82F; UNKNOWN
+ 0xA830, // A830..A839; COMMON
+ 0xA83A, // A83A..A83F; UNKNOWN
+ 0xA840, // A840..A877; PHAGS_PA
+ 0xA878, // A878..A87F; UNKNOWN
+ 0xA880, // A880..A8C4; SAURASHTRA
+ 0xA8C5, // A8C5..A8CD; UNKNOWN
+ 0xA8CE, // A8CE..A8D9; SAURASHTRA
+ 0xA8DA, // A8DA..A8DF; UNKNOWN
+ 0xA8E0, // A8E0..A8FB; DEVANAGARI
+ 0xA8FC, // A8FC..A8FF; UNKNOWN
+ 0xA900, // A900..A92D; KAYAH_LI
+ 0xA92E, // A92E ; COMMON
+ 0xA92F, // A92F ; KAYAH_LI
+ 0xA930, // A930..A953; REJANG
+ 0xA954, // A954..A95E; UNKNOWN
+ 0xA95F, // A95F ; REJANG
+ 0xA960, // A960..A97C; HANGUL
+ 0xA97D, // A97D..A97F; UNKNOWN
+ 0xA980, // A980..A9CD; JAVANESE
+ 0xA9CE, // A9CE ; UNKNOWN
+ 0xA9CF, // A9CF ; COMMON
+ 0xA9D0, // A9D0..A9D9; JAVANESE
+ 0xA9DA, // A9DA..A9DD; UNKNOWN
+ 0xA9DE, // A9DE..A9DF; JAVANESE
+ 0xA9E0, // A9E0..A9FE; MYANMAR
+ 0xA9FF, // A9FF ; UNKNOWN
+ 0xAA00, // AA00..AA36; CHAM
+ 0xAA37, // AA37..AA3F; UNKNOWN
+ 0xAA40, // AA40..AA4D; CHAM
+ 0xAA4E, // AA4E..AA4F; UNKNOWN
+ 0xAA50, // AA50..AA59; CHAM
+ 0xAA5A, // AA5A..AA5B; UNKNOWN
+ 0xAA5C, // AA5C..AA5F; CHAM
0xAA60, // AA60..AA7F; MYANMAR
- 0xAA80, // AA80..AADF; TAI_VIET
- 0xAAE0, // AAE0..AB00; MEETEI_MAYEK
- 0xAB01, // AB01..ABBF; ETHIOPIC
- 0xABC0, // ABC0..ABFF; MEETEI_MAYEK
- 0xAC00, // AC00..D7FB; HANGUL
+ 0xAA80, // AA80..AAC2; TAI_VIET
+ 0xAAC3, // AAC3..AADA; UNKNOWN
+ 0xAADB, // AADB..AADF; TAI_VIET
+ 0xAAE0, // AAE0..AAF6; MEETEI_MAYEK
+ 0xAAF7, // AAF7..AB00; UNKNOWN
+ 0xAB01, // AB01..AB06; ETHIOPIC
+ 0xAB07, // AB07..AB08; UNKNOWN
+ 0xAB09, // AB09..AB0E; ETHIOPIC
+ 0xAB0F, // AB0F..AB10; UNKNOWN
+ 0xAB11, // AB11..AB16; ETHIOPIC
+ 0xAB17, // AB17..AB1F; UNKNOWN
+ 0xAB20, // AB20..AB26; ETHIOPIC
+ 0xAB27, // AB27 ; UNKNOWN
+ 0xAB28, // AB28..AB2E; ETHIOPIC
+ 0xAB2F, // AB2F ; UNKNOWN
+ 0xAB30, // AB30..AB5A; LATIN
+ 0xAB5B, // AB5B ; COMMON
+ 0xAB5C, // AB5C..AB5F; LATIN
+ 0xAB60, // AB60..AB63; UNKNOWN
+ 0xAB64, // AB64 ; LATIN
+ 0xAB65, // AB65 ; GREEK
+ 0xAB66, // AB66..ABBF; UNKNOWN
+ 0xABC0, // ABC0..ABED; MEETEI_MAYEK
+ 0xABEE, // ABEE..ABEF; UNKNOWN
+ 0xABF0, // ABF0..ABF9; MEETEI_MAYEK
+ 0xABFA, // ABFA..ABFF; UNKNOWN
+ 0xAC00, // AC00..D7A3; HANGUL
+ 0xD7A4, // D7A4..D7AF; UNKNOWN
+ 0xD7B0, // D7B0..D7C6; HANGUL
+ 0xD7C7, // D7C7..D7CA; UNKNOWN
+ 0xD7CB, // D7CB..D7FB; HANGUL
0xD7FC, // D7FC..F8FF; UNKNOWN
- 0xF900, // F900..FAFF; HAN
- 0xFB00, // FB00..FB12; LATIN
- 0xFB13, // FB13..FB1C; ARMENIAN
- 0xFB1D, // FB1D..FB4F; HEBREW
- 0xFB50, // FB50..FD3D; ARABIC
- 0xFD3E, // FD3E..FD4F; COMMON
- 0xFD50, // FD50..FDFC; ARABIC
- 0xFDFD, // FDFD..FDFF; COMMON
+ 0xF900, // F900..FA6D; HAN
+ 0xFA6E, // FA6E..FA6F; UNKNOWN
+ 0xFA70, // FA70..FAD9; HAN
+ 0xFADA, // FADA..FAFF; UNKNOWN
+ 0xFB00, // FB00..FB06; LATIN
+ 0xFB07, // FB07..FB12; UNKNOWN
+ 0xFB13, // FB13..FB17; ARMENIAN
+ 0xFB18, // FB18..FB1C; UNKNOWN
+ 0xFB1D, // FB1D..FB36; HEBREW
+ 0xFB37, // FB37 ; UNKNOWN
+ 0xFB38, // FB38..FB3C; HEBREW
+ 0xFB3D, // FB3D ; UNKNOWN
+ 0xFB3E, // FB3E ; HEBREW
+ 0xFB3F, // FB3F ; UNKNOWN
+ 0xFB40, // FB40..FB41; HEBREW
+ 0xFB42, // FB42 ; UNKNOWN
+ 0xFB43, // FB43..FB44; HEBREW
+ 0xFB45, // FB45 ; UNKNOWN
+ 0xFB46, // FB46..FB4F; HEBREW
+ 0xFB50, // FB50..FBC1; ARABIC
+ 0xFBC2, // FBC2..FBD2; UNKNOWN
+ 0xFBD3, // FBD3..FD3D; ARABIC
+ 0xFD3E, // FD3E..FD3F; COMMON
+ 0xFD40, // FD40..FD4F; UNKNOWN
+ 0xFD50, // FD50..FD8F; ARABIC
+ 0xFD90, // FD90..FD91; UNKNOWN
+ 0xFD92, // FD92..FDC7; ARABIC
+ 0xFDC8, // FDC8..FDEF; UNKNOWN
+ 0xFDF0, // FDF0..FDFD; ARABIC
+ 0xFDFE, // FDFE..FDFF; UNKNOWN
0xFE00, // FE00..FE0F; INHERITED
- 0xFE10, // FE10..FE1F; COMMON
- 0xFE20, // FE20..FE2F; INHERITED
- 0xFE30, // FE30..FE6F; COMMON
- 0xFE70, // FE70..FEFE; ARABIC
- 0xFEFF, // FEFF..FF20; COMMON
+ 0xFE10, // FE10..FE19; COMMON
+ 0xFE1A, // FE1A..FE1F; UNKNOWN
+ 0xFE20, // FE20..FE2D; INHERITED
+ 0xFE2E, // FE2E..FE2F; UNKNOWN
+ 0xFE30, // FE30..FE52; COMMON
+ 0xFE53, // FE53 ; UNKNOWN
+ 0xFE54, // FE54..FE66; COMMON
+ 0xFE67, // FE67 ; UNKNOWN
+ 0xFE68, // FE68..FE6B; COMMON
+ 0xFE6C, // FE6C..FE6F; UNKNOWN
+ 0xFE70, // FE70..FE74; ARABIC
+ 0xFE75, // FE75 ; UNKNOWN
+ 0xFE76, // FE76..FEFC; ARABIC
+ 0xFEFD, // FEFD..FEFE; UNKNOWN
+ 0xFEFF, // FEFF ; COMMON
+ 0xFF00, // FF00 ; UNKNOWN
+ 0xFF01, // FF01..FF20; COMMON
0xFF21, // FF21..FF3A; LATIN
0xFF3B, // FF3B..FF40; COMMON
0xFF41, // FF41..FF5A; LATIN
0xFF5B, // FF5B..FF65; COMMON
0xFF66, // FF66..FF6F; KATAKANA
- 0xFF70, // FF70..FF70; COMMON
+ 0xFF70, // FF70 ; COMMON
0xFF71, // FF71..FF9D; KATAKANA
0xFF9E, // FF9E..FF9F; COMMON
- 0xFFA0, // FFA0..FFDF; HANGUL
- 0xFFE0, // FFE0..FFFF; COMMON
- 0x10000, // 10000..100FF; LINEAR_B
- 0x10100, // 10100..1013F; COMMON
- 0x10140, // 10140..1018F; GREEK
- 0x10190, // 10190..101FC; COMMON
- 0x101FD, // 101FD..1027F; INHERITED
- 0x10280, // 10280..1029F; LYCIAN
- 0x102A0, // 102A0..102FF; CARIAN
- 0x10300, // 10300..1032F; OLD_ITALIC
- 0x10330, // 10330..1037F; GOTHIC
- 0x10380, // 10380..1039F; UGARITIC
- 0x103A0, // 103A0..103FF; OLD_PERSIAN
+ 0xFFA0, // FFA0..FFBE; HANGUL
+ 0xFFBF, // FFBF..FFC1; UNKNOWN
+ 0xFFC2, // FFC2..FFC7; HANGUL
+ 0xFFC8, // FFC8..FFC9; UNKNOWN
+ 0xFFCA, // FFCA..FFCF; HANGUL
+ 0xFFD0, // FFD0..FFD1; UNKNOWN
+ 0xFFD2, // FFD2..FFD7; HANGUL
+ 0xFFD8, // FFD8..FFD9; UNKNOWN
+ 0xFFDA, // FFDA..FFDC; HANGUL
+ 0xFFDD, // FFDD..FFDF; UNKNOWN
+ 0xFFE0, // FFE0..FFE6; COMMON
+ 0xFFE7, // FFE7 ; UNKNOWN
+ 0xFFE8, // FFE8..FFEE; COMMON
+ 0xFFEF, // FFEF..FFF8; UNKNOWN
+ 0xFFF9, // FFF9..FFFD; COMMON
+ 0xFFFE, // FFFE..FFFF; UNKNOWN
+ 0x10000, // 10000..1000B; LINEAR_B
+ 0x1000C, // 1000C ; UNKNOWN
+ 0x1000D, // 1000D..10026; LINEAR_B
+ 0x10027, // 10027 ; UNKNOWN
+ 0x10028, // 10028..1003A; LINEAR_B
+ 0x1003B, // 1003B ; UNKNOWN
+ 0x1003C, // 1003C..1003D; LINEAR_B
+ 0x1003E, // 1003E ; UNKNOWN
+ 0x1003F, // 1003F..1004D; LINEAR_B
+ 0x1004E, // 1004E..1004F; UNKNOWN
+ 0x10050, // 10050..1005D; LINEAR_B
+ 0x1005E, // 1005E..1007F; UNKNOWN
+ 0x10080, // 10080..100FA; LINEAR_B
+ 0x100FB, // 100FB..100FF; UNKNOWN
+ 0x10100, // 10100..10102; COMMON
+ 0x10103, // 10103..10106; UNKNOWN
+ 0x10107, // 10107..10133; COMMON
+ 0x10134, // 10134..10136; UNKNOWN
+ 0x10137, // 10137..1013F; COMMON
+ 0x10140, // 10140..1018C; GREEK
+ 0x1018D, // 1018D..1018F; UNKNOWN
+ 0x10190, // 10190..1019B; COMMON
+ 0x1019C, // 1019C..1019F; UNKNOWN
+ 0x101A0, // 101A0 ; GREEK
+ 0x101A1, // 101A1..101CF; UNKNOWN
+ 0x101D0, // 101D0..101FC; COMMON
+ 0x101FD, // 101FD ; INHERITED
+ 0x101FE, // 101FE..1027F; UNKNOWN
+ 0x10280, // 10280..1029C; LYCIAN
+ 0x1029D, // 1029D..1029F; UNKNOWN
+ 0x102A0, // 102A0..102D0; CARIAN
+ 0x102D1, // 102D1..102DF; UNKNOWN
+ 0x102E0, // 102E0 ; INHERITED
+ 0x102E1, // 102E1..102FB; COMMON
+ 0x102FC, // 102FC..102FF; UNKNOWN
+ 0x10300, // 10300..10323; OLD_ITALIC
+ 0x10324, // 10324..1032F; UNKNOWN
+ 0x10330, // 10330..1034A; GOTHIC
+ 0x1034B, // 1034B..1034F; UNKNOWN
+ 0x10350, // 10350..1037A; OLD_PERMIC
+ 0x1037B, // 1037B..1037F; UNKNOWN
+ 0x10380, // 10380..1039D; UGARITIC
+ 0x1039E, // 1039E ; UNKNOWN
+ 0x1039F, // 1039F ; UGARITIC
+ 0x103A0, // 103A0..103C3; OLD_PERSIAN
+ 0x103C4, // 103C4..103C7; UNKNOWN
+ 0x103C8, // 103C8..103D5; OLD_PERSIAN
+ 0x103D6, // 103D6..103FF; UNKNOWN
0x10400, // 10400..1044F; DESERET
0x10450, // 10450..1047F; SHAVIAN
- 0x10480, // 10480..107FF; OSMANYA
- 0x10800, // 10800..1083F; CYPRIOT
- 0x10840, // 10840..108FF; IMPERIAL_ARAMAIC
- 0x10900, // 10900..1091F; PHOENICIAN
- 0x10920, // 10920..1097F; LYDIAN
+ 0x10480, // 10480..1049D; OSMANYA
+ 0x1049E, // 1049E..1049F; UNKNOWN
+ 0x104A0, // 104A0..104A9; OSMANYA
+ 0x104AA, // 104AA..104FF; UNKNOWN
+ 0x10500, // 10500..10527; ELBASAN
+ 0x10528, // 10528..1052F; UNKNOWN
+ 0x10530, // 10530..10563; CAUCASIAN_ALBANIAN
+ 0x10564, // 10564..1056E; UNKNOWN
+ 0x1056F, // 1056F ; CAUCASIAN_ALBANIAN
+ 0x10570, // 10570..105FF; UNKNOWN
+ 0x10600, // 10600..10736; LINEAR_A
+ 0x10737, // 10737..1073F; UNKNOWN
+ 0x10740, // 10740..10755; LINEAR_A
+ 0x10756, // 10756..1075F; UNKNOWN
+ 0x10760, // 10760..10767; LINEAR_A
+ 0x10768, // 10768..107FF; UNKNOWN
+ 0x10800, // 10800..10805; CYPRIOT
+ 0x10806, // 10806..10807; UNKNOWN
+ 0x10808, // 10808 ; CYPRIOT
+ 0x10809, // 10809 ; UNKNOWN
+ 0x1080A, // 1080A..10835; CYPRIOT
+ 0x10836, // 10836 ; UNKNOWN
+ 0x10837, // 10837..10838; CYPRIOT
+ 0x10839, // 10839..1083B; UNKNOWN
+ 0x1083C, // 1083C ; CYPRIOT
+ 0x1083D, // 1083D..1083E; UNKNOWN
+ 0x1083F, // 1083F ; CYPRIOT
+ 0x10840, // 10840..10855; IMPERIAL_ARAMAIC
+ 0x10856, // 10856 ; UNKNOWN
+ 0x10857, // 10857..1085F; IMPERIAL_ARAMAIC
+ 0x10860, // 10860..1087F; PALMYRENE
+ 0x10880, // 10880..1089E; NABATAEAN
+ 0x1089F, // 1089F..108A6; UNKNOWN
+ 0x108A7, // 108A7..108AF; NABATAEAN
+ 0x108B0, // 108B0..108FF; UNKNOWN
+ 0x10900, // 10900..1091B; PHOENICIAN
+ 0x1091C, // 1091C..1091E; UNKNOWN
+ 0x1091F, // 1091F ; PHOENICIAN
+ 0x10920, // 10920..10939; LYDIAN
+ 0x1093A, // 1093A..1093E; UNKNOWN
+ 0x1093F, // 1093F ; LYDIAN
+ 0x10940, // 10940..1097F; UNKNOWN
0x10980, // 10980..1099F; MEROITIC_HIEROGLYPHS
- 0x109A0, // 109A0..109FF; MEROITIC_CURSIVE
- 0x10A00, // 10A00..10A5F; KHAROSHTHI
- 0x10A60, // 10A60..10AFF; OLD_SOUTH_ARABIAN
- 0x10B00, // 10B00..10B3F; AVESTAN
- 0x10B40, // 10B40..10B5F; INSCRIPTIONAL_PARTHIAN
- 0x10B60, // 10B60..10BFF; INSCRIPTIONAL_PAHLAVI
- 0x10C00, // 10C00..10E5F; OLD_TURKIC
- 0x10E60, // 10E60..10FFF; ARABIC
- 0x11000, // 11000..1107F; BRAHMI
- 0x11080, // 11080..110CF; KAITHI
- 0x110D0, // 110D0..110FF; SORA_SOMPENG
- 0x11100, // 11100..1117F; CHAKMA
- 0x11180, // 11180..1167F; SHARADA
- 0x11680, // 11680..116CF; TAKRI
- 0x12000, // 12000..12FFF; CUNEIFORM
- 0x13000, // 13000..167FF; EGYPTIAN_HIEROGLYPHS
+ 0x109A0, // 109A0..109B7; MEROITIC_CURSIVE
+ 0x109B8, // 109B8..109BD; UNKNOWN
+ 0x109BE, // 109BE..109BF; MEROITIC_CURSIVE
+ 0x109C0, // 109C0..109FF; UNKNOWN
+ 0x10A00, // 10A00..10A03; KHAROSHTHI
+ 0x10A04, // 10A04 ; UNKNOWN
+ 0x10A05, // 10A05..10A06; KHAROSHTHI
+ 0x10A07, // 10A07..10A0B; UNKNOWN
+ 0x10A0C, // 10A0C..10A13; KHAROSHTHI
+ 0x10A14, // 10A14 ; UNKNOWN
+ 0x10A15, // 10A15..10A17; KHAROSHTHI
+ 0x10A18, // 10A18 ; UNKNOWN
+ 0x10A19, // 10A19..10A33; KHAROSHTHI
+ 0x10A34, // 10A34..10A37; UNKNOWN
+ 0x10A38, // 10A38..10A3A; KHAROSHTHI
+ 0x10A3B, // 10A3B..10A3E; UNKNOWN
+ 0x10A3F, // 10A3F..10A47; KHAROSHTHI
+ 0x10A48, // 10A48..10A4F; UNKNOWN
+ 0x10A50, // 10A50..10A58; KHAROSHTHI
+ 0x10A59, // 10A59..10A5F; UNKNOWN
+ 0x10A60, // 10A60..10A7F; OLD_SOUTH_ARABIAN
+ 0x10A80, // 10A80..10A9F; OLD_NORTH_ARABIAN
+ 0x10AA0, // 10AA0..10ABF; UNKNOWN
+ 0x10AC0, // 10AC0..10AE6; MANICHAEAN
+ 0x10AE7, // 10AE7..10AEA; UNKNOWN
+ 0x10AEB, // 10AEB..10AF6; MANICHAEAN
+ 0x10AF7, // 10AF7..10AFF; UNKNOWN
+ 0x10B00, // 10B00..10B35; AVESTAN
+ 0x10B36, // 10B36..10B38; UNKNOWN
+ 0x10B39, // 10B39..10B3F; AVESTAN
+ 0x10B40, // 10B40..10B55; INSCRIPTIONAL_PARTHIAN
+ 0x10B56, // 10B56..10B57; UNKNOWN
+ 0x10B58, // 10B58..10B5F; INSCRIPTIONAL_PARTHIAN
+ 0x10B60, // 10B60..10B72; INSCRIPTIONAL_PAHLAVI
+ 0x10B73, // 10B73..10B77; UNKNOWN
+ 0x10B78, // 10B78..10B7F; INSCRIPTIONAL_PAHLAVI
+ 0x10B80, // 10B80..10B91; PSALTER_PAHLAVI
+ 0x10B92, // 10B92..10B98; UNKNOWN
+ 0x10B99, // 10B99..10B9C; PSALTER_PAHLAVI
+ 0x10B9D, // 10B9D..10BA8; UNKNOWN
+ 0x10BA9, // 10BA9..10BAF; PSALTER_PAHLAVI
+ 0x10BB0, // 10BB0..10BFF; UNKNOWN
+ 0x10C00, // 10C00..10C48; OLD_TURKIC
+ 0x10C49, // 10C49..10E5F; UNKNOWN
+ 0x10E60, // 10E60..10E7E; ARABIC
+ 0x10E7F, // 10E7F..10FFF; UNKNOWN
+ 0x11000, // 11000..1104D; BRAHMI
+ 0x1104E, // 1104E..11051; UNKNOWN
+ 0x11052, // 11052..1106F; BRAHMI
+ 0x11070, // 11070..1107E; UNKNOWN
+ 0x1107F, // 1107F ; BRAHMI
+ 0x11080, // 11080..110C1; KAITHI
+ 0x110C2, // 110C2..110CF; UNKNOWN
+ 0x110D0, // 110D0..110E8; SORA_SOMPENG
+ 0x110E9, // 110E9..110EF; UNKNOWN
+ 0x110F0, // 110F0..110F9; SORA_SOMPENG
+ 0x110FA, // 110FA..110FF; UNKNOWN
+ 0x11100, // 11100..11134; CHAKMA
+ 0x11135, // 11135 ; UNKNOWN
+ 0x11136, // 11136..11143; CHAKMA
+ 0x11144, // 11144..1114F; UNKNOWN
+ 0x11150, // 11150..11176; MAHAJANI
+ 0x11177, // 11177..1117F; UNKNOWN
+ 0x11180, // 11180..111C8; SHARADA
+ 0x111C9, // 111C9..111CC; UNKNOWN
+ 0x111CD, // 111CD ; SHARADA
+ 0x111CE, // 111CE..111CF; UNKNOWN
+ 0x111D0, // 111D0..111DA; SHARADA
+ 0x111DB, // 111DB..111E0; UNKNOWN
+ 0x111E1, // 111E1..111F4; SINHALA
+ 0x111F5, // 111F5..111FF; UNKNOWN
+ 0x11200, // 11200..11211; KHOJKI
+ 0x11212, // 11212 ; UNKNOWN
+ 0x11213, // 11213..1123D; KHOJKI
+ 0x1123E, // 1123E..112AF; UNKNOWN
+ 0x112B0, // 112B0..112EA; KHUDAWADI
+ 0x112EB, // 112EB..112EF; UNKNOWN
+ 0x112F0, // 112F0..112F9; KHUDAWADI
+ 0x112FA, // 112FA..11300; UNKNOWN
+ 0x11301, // 11301..11303; GRANTHA
+ 0x11304, // 11304 ; UNKNOWN
+ 0x11305, // 11305..1130C; GRANTHA
+ 0x1130D, // 1130D..1130E; UNKNOWN
+ 0x1130F, // 1130F..11310; GRANTHA
+ 0x11311, // 11311..11312; UNKNOWN
+ 0x11313, // 11313..11328; GRANTHA
+ 0x11329, // 11329 ; UNKNOWN
+ 0x1132A, // 1132A..11330; GRANTHA
+ 0x11331, // 11331 ; UNKNOWN
+ 0x11332, // 11332..11333; GRANTHA
+ 0x11334, // 11334 ; UNKNOWN
+ 0x11335, // 11335..11339; GRANTHA
+ 0x1133A, // 1133A..1133B; UNKNOWN
+ 0x1133C, // 1133C..11344; GRANTHA
+ 0x11345, // 11345..11346; UNKNOWN
+ 0x11347, // 11347..11348; GRANTHA
+ 0x11349, // 11349..1134A; UNKNOWN
+ 0x1134B, // 1134B..1134D; GRANTHA
+ 0x1134E, // 1134E..11356; UNKNOWN
+ 0x11357, // 11357 ; GRANTHA
+ 0x11358, // 11358..1135C; UNKNOWN
+ 0x1135D, // 1135D..11363; GRANTHA
+ 0x11364, // 11364..11365; UNKNOWN
+ 0x11366, // 11366..1136C; GRANTHA
+ 0x1136D, // 1136D..1136F; UNKNOWN
+ 0x11370, // 11370..11374; GRANTHA
+ 0x11375, // 11375..1147F; UNKNOWN
+ 0x11480, // 11480..114C7; TIRHUTA
+ 0x114C8, // 114C8..114CF; UNKNOWN
+ 0x114D0, // 114D0..114D9; TIRHUTA
+ 0x114DA, // 114DA..1157F; UNKNOWN
+ 0x11580, // 11580..115B5; SIDDHAM
+ 0x115B6, // 115B6..115B7; UNKNOWN
+ 0x115B8, // 115B8..115C9; SIDDHAM
+ 0x115CA, // 115CA..115FF; UNKNOWN
+ 0x11600, // 11600..11644; MODI
+ 0x11645, // 11645..1164F; UNKNOWN
+ 0x11650, // 11650..11659; MODI
+ 0x1165A, // 1165A..1167F; UNKNOWN
+ 0x11680, // 11680..116B7; TAKRI
+ 0x116B8, // 116B8..116BF; UNKNOWN
+ 0x116C0, // 116C0..116C9; TAKRI
+ 0x116CA, // 116CA..1189F; UNKNOWN
+ 0x118A0, // 118A0..118F2; WARANG_CITI
+ 0x118F3, // 118F3..118FE; UNKNOWN
+ 0x118FF, // 118FF ; WARANG_CITI
+ 0x11900, // 11900..11ABF; UNKNOWN
+ 0x11AC0, // 11AC0..11AF8; PAU_CIN_HAU
+ 0x11AF9, // 11AF9..11FFF; UNKNOWN
+ 0x12000, // 12000..12398; CUNEIFORM
+ 0x12399, // 12399..123FF; UNKNOWN
+ 0x12400, // 12400..1246E; CUNEIFORM
+ 0x1246F, // 1246F ; UNKNOWN
+ 0x12470, // 12470..12474; CUNEIFORM
+ 0x12475, // 12475..12FFF; UNKNOWN
+ 0x13000, // 13000..1342E; EGYPTIAN_HIEROGLYPHS
+ 0x1342F, // 1342F..167FF; UNKNOWN
0x16800, // 16800..16A38; BAMUM
- 0x16F00, // 16F00..16F9F; MIAO
- 0x1B000, // 1B000..1B000; KATAKANA
- 0x1B001, // 1B001..1CFFF; HIRAGANA
- 0x1D000, // 1D000..1D166; COMMON
+ 0x16A39, // 16A39..16A3F; UNKNOWN
+ 0x16A40, // 16A40..16A5E; MRO
+ 0x16A5F, // 16A5F ; UNKNOWN
+ 0x16A60, // 16A60..16A69; MRO
+ 0x16A6A, // 16A6A..16A6D; UNKNOWN
+ 0x16A6E, // 16A6E..16A6F; MRO
+ 0x16A70, // 16A70..16ACF; UNKNOWN
+ 0x16AD0, // 16AD0..16AED; BASSA_VAH
+ 0x16AEE, // 16AEE..16AEF; UNKNOWN
+ 0x16AF0, // 16AF0..16AF5; BASSA_VAH
+ 0x16AF6, // 16AF6..16AFF; UNKNOWN
+ 0x16B00, // 16B00..16B45; PAHAWH_HMONG
+ 0x16B46, // 16B46..16B4F; UNKNOWN
+ 0x16B50, // 16B50..16B59; PAHAWH_HMONG
+ 0x16B5A, // 16B5A ; UNKNOWN
+ 0x16B5B, // 16B5B..16B61; PAHAWH_HMONG
+ 0x16B62, // 16B62 ; UNKNOWN
+ 0x16B63, // 16B63..16B77; PAHAWH_HMONG
+ 0x16B78, // 16B78..16B7C; UNKNOWN
+ 0x16B7D, // 16B7D..16B8F; PAHAWH_HMONG
+ 0x16B90, // 16B90..16EFF; UNKNOWN
+ 0x16F00, // 16F00..16F44; MIAO
+ 0x16F45, // 16F45..16F4F; UNKNOWN
+ 0x16F50, // 16F50..16F7E; MIAO
+ 0x16F7F, // 16F7F..16F8E; UNKNOWN
+ 0x16F8F, // 16F8F..16F9F; MIAO
+ 0x16FA0, // 16FA0..1AFFF; UNKNOWN
+ 0x1B000, // 1B000 ; KATAKANA
+ 0x1B001, // 1B001 ; HIRAGANA
+ 0x1B002, // 1B002..1BBFF; UNKNOWN
+ 0x1BC00, // 1BC00..1BC6A; DUPLOYAN
+ 0x1BC6B, // 1BC6B..1BC6F; UNKNOWN
+ 0x1BC70, // 1BC70..1BC7C; DUPLOYAN
+ 0x1BC7D, // 1BC7D..1BC7F; UNKNOWN
+ 0x1BC80, // 1BC80..1BC88; DUPLOYAN
+ 0x1BC89, // 1BC89..1BC8F; UNKNOWN
+ 0x1BC90, // 1BC90..1BC99; DUPLOYAN
+ 0x1BC9A, // 1BC9A..1BC9B; UNKNOWN
+ 0x1BC9C, // 1BC9C..1BC9F; DUPLOYAN
+ 0x1BCA0, // 1BCA0..1BCA3; COMMON
+ 0x1BCA4, // 1BCA4..1CFFF; UNKNOWN
+ 0x1D000, // 1D000..1D0F5; COMMON
+ 0x1D0F6, // 1D0F6..1D0FF; UNKNOWN
+ 0x1D100, // 1D100..1D126; COMMON
+ 0x1D127, // 1D127..1D128; UNKNOWN
+ 0x1D129, // 1D129..1D166; COMMON
0x1D167, // 1D167..1D169; INHERITED
0x1D16A, // 1D16A..1D17A; COMMON
0x1D17B, // 1D17B..1D182; INHERITED
@@ -4020,354 +5418,1635 @@
0x1D185, // 1D185..1D18B; INHERITED
0x1D18C, // 1D18C..1D1A9; COMMON
0x1D1AA, // 1D1AA..1D1AD; INHERITED
- 0x1D1AE, // 1D1AE..1D1FF; COMMON
- 0x1D200, // 1D200..1D2FF; GREEK
- 0x1D300, // 1D300..1EDFF; COMMON
- 0x1EE00, // 1EE00..1EFFF; ARABIC
- 0x1F000, // 1F000..1F1FF; COMMON
- 0x1F200, // 1F200..1F200; HIRAGANA
- 0x1F201, // 1F210..1FFFF; COMMON
- 0x20000, // 20000..E0000; HAN
- 0xE0001, // E0001..E00FF; COMMON
+ 0x1D1AE, // 1D1AE..1D1DD; COMMON
+ 0x1D1DE, // 1D1DE..1D1FF; UNKNOWN
+ 0x1D200, // 1D200..1D245; GREEK
+ 0x1D246, // 1D246..1D2FF; UNKNOWN
+ 0x1D300, // 1D300..1D356; COMMON
+ 0x1D357, // 1D357..1D35F; UNKNOWN
+ 0x1D360, // 1D360..1D371; COMMON
+ 0x1D372, // 1D372..1D3FF; UNKNOWN
+ 0x1D400, // 1D400..1D454; COMMON
+ 0x1D455, // 1D455 ; UNKNOWN
+ 0x1D456, // 1D456..1D49C; COMMON
+ 0x1D49D, // 1D49D ; UNKNOWN
+ 0x1D49E, // 1D49E..1D49F; COMMON
+ 0x1D4A0, // 1D4A0..1D4A1; UNKNOWN
+ 0x1D4A2, // 1D4A2 ; COMMON
+ 0x1D4A3, // 1D4A3..1D4A4; UNKNOWN
+ 0x1D4A5, // 1D4A5..1D4A6; COMMON
+ 0x1D4A7, // 1D4A7..1D4A8; UNKNOWN
+ 0x1D4A9, // 1D4A9..1D4AC; COMMON
+ 0x1D4AD, // 1D4AD ; UNKNOWN
+ 0x1D4AE, // 1D4AE..1D4B9; COMMON
+ 0x1D4BA, // 1D4BA ; UNKNOWN
+ 0x1D4BB, // 1D4BB ; COMMON
+ 0x1D4BC, // 1D4BC ; UNKNOWN
+ 0x1D4BD, // 1D4BD..1D4C3; COMMON
+ 0x1D4C4, // 1D4C4 ; UNKNOWN
+ 0x1D4C5, // 1D4C5..1D505; COMMON
+ 0x1D506, // 1D506 ; UNKNOWN
+ 0x1D507, // 1D507..1D50A; COMMON
+ 0x1D50B, // 1D50B..1D50C; UNKNOWN
+ 0x1D50D, // 1D50D..1D514; COMMON
+ 0x1D515, // 1D515 ; UNKNOWN
+ 0x1D516, // 1D516..1D51C; COMMON
+ 0x1D51D, // 1D51D ; UNKNOWN
+ 0x1D51E, // 1D51E..1D539; COMMON
+ 0x1D53A, // 1D53A ; UNKNOWN
+ 0x1D53B, // 1D53B..1D53E; COMMON
+ 0x1D53F, // 1D53F ; UNKNOWN
+ 0x1D540, // 1D540..1D544; COMMON
+ 0x1D545, // 1D545 ; UNKNOWN
+ 0x1D546, // 1D546 ; COMMON
+ 0x1D547, // 1D547..1D549; UNKNOWN
+ 0x1D54A, // 1D54A..1D550; COMMON
+ 0x1D551, // 1D551 ; UNKNOWN
+ 0x1D552, // 1D552..1D6A5; COMMON
+ 0x1D6A6, // 1D6A6..1D6A7; UNKNOWN
+ 0x1D6A8, // 1D6A8..1D7CB; COMMON
+ 0x1D7CC, // 1D7CC..1D7CD; UNKNOWN
+ 0x1D7CE, // 1D7CE..1D7FF; COMMON
+ 0x1D800, // 1D800..1E7FF; UNKNOWN
+ 0x1E800, // 1E800..1E8C4; MENDE_KIKAKUI
+ 0x1E8C5, // 1E8C5..1E8C6; UNKNOWN
+ 0x1E8C7, // 1E8C7..1E8D6; MENDE_KIKAKUI
+ 0x1E8D7, // 1E8D7..1EDFF; UNKNOWN
+ 0x1EE00, // 1EE00..1EE03; ARABIC
+ 0x1EE04, // 1EE04 ; UNKNOWN
+ 0x1EE05, // 1EE05..1EE1F; ARABIC
+ 0x1EE20, // 1EE20 ; UNKNOWN
+ 0x1EE21, // 1EE21..1EE22; ARABIC
+ 0x1EE23, // 1EE23 ; UNKNOWN
+ 0x1EE24, // 1EE24 ; ARABIC
+ 0x1EE25, // 1EE25..1EE26; UNKNOWN
+ 0x1EE27, // 1EE27 ; ARABIC
+ 0x1EE28, // 1EE28 ; UNKNOWN
+ 0x1EE29, // 1EE29..1EE32; ARABIC
+ 0x1EE33, // 1EE33 ; UNKNOWN
+ 0x1EE34, // 1EE34..1EE37; ARABIC
+ 0x1EE38, // 1EE38 ; UNKNOWN
+ 0x1EE39, // 1EE39 ; ARABIC
+ 0x1EE3A, // 1EE3A ; UNKNOWN
+ 0x1EE3B, // 1EE3B ; ARABIC
+ 0x1EE3C, // 1EE3C..1EE41; UNKNOWN
+ 0x1EE42, // 1EE42 ; ARABIC
+ 0x1EE43, // 1EE43..1EE46; UNKNOWN
+ 0x1EE47, // 1EE47 ; ARABIC
+ 0x1EE48, // 1EE48 ; UNKNOWN
+ 0x1EE49, // 1EE49 ; ARABIC
+ 0x1EE4A, // 1EE4A ; UNKNOWN
+ 0x1EE4B, // 1EE4B ; ARABIC
+ 0x1EE4C, // 1EE4C ; UNKNOWN
+ 0x1EE4D, // 1EE4D..1EE4F; ARABIC
+ 0x1EE50, // 1EE50 ; UNKNOWN
+ 0x1EE51, // 1EE51..1EE52; ARABIC
+ 0x1EE53, // 1EE53 ; UNKNOWN
+ 0x1EE54, // 1EE54 ; ARABIC
+ 0x1EE55, // 1EE55..1EE56; UNKNOWN
+ 0x1EE57, // 1EE57 ; ARABIC
+ 0x1EE58, // 1EE58 ; UNKNOWN
+ 0x1EE59, // 1EE59 ; ARABIC
+ 0x1EE5A, // 1EE5A ; UNKNOWN
+ 0x1EE5B, // 1EE5B ; ARABIC
+ 0x1EE5C, // 1EE5C ; UNKNOWN
+ 0x1EE5D, // 1EE5D ; ARABIC
+ 0x1EE5E, // 1EE5E ; UNKNOWN
+ 0x1EE5F, // 1EE5F ; ARABIC
+ 0x1EE60, // 1EE60 ; UNKNOWN
+ 0x1EE61, // 1EE61..1EE62; ARABIC
+ 0x1EE63, // 1EE63 ; UNKNOWN
+ 0x1EE64, // 1EE64 ; ARABIC
+ 0x1EE65, // 1EE65..1EE66; UNKNOWN
+ 0x1EE67, // 1EE67..1EE6A; ARABIC
+ 0x1EE6B, // 1EE6B ; UNKNOWN
+ 0x1EE6C, // 1EE6C..1EE72; ARABIC
+ 0x1EE73, // 1EE73 ; UNKNOWN
+ 0x1EE74, // 1EE74..1EE77; ARABIC
+ 0x1EE78, // 1EE78 ; UNKNOWN
+ 0x1EE79, // 1EE79..1EE7C; ARABIC
+ 0x1EE7D, // 1EE7D ; UNKNOWN
+ 0x1EE7E, // 1EE7E ; ARABIC
+ 0x1EE7F, // 1EE7F ; UNKNOWN
+ 0x1EE80, // 1EE80..1EE89; ARABIC
+ 0x1EE8A, // 1EE8A ; UNKNOWN
+ 0x1EE8B, // 1EE8B..1EE9B; ARABIC
+ 0x1EE9C, // 1EE9C..1EEA0; UNKNOWN
+ 0x1EEA1, // 1EEA1..1EEA3; ARABIC
+ 0x1EEA4, // 1EEA4 ; UNKNOWN
+ 0x1EEA5, // 1EEA5..1EEA9; ARABIC
+ 0x1EEAA, // 1EEAA ; UNKNOWN
+ 0x1EEAB, // 1EEAB..1EEBB; ARABIC
+ 0x1EEBC, // 1EEBC..1EEEF; UNKNOWN
+ 0x1EEF0, // 1EEF0..1EEF1; ARABIC
+ 0x1EEF2, // 1EEF2..1EFFF; UNKNOWN
+ 0x1F000, // 1F000..1F02B; COMMON
+ 0x1F02C, // 1F02C..1F02F; UNKNOWN
+ 0x1F030, // 1F030..1F093; COMMON
+ 0x1F094, // 1F094..1F09F; UNKNOWN
+ 0x1F0A0, // 1F0A0..1F0AE; COMMON
+ 0x1F0AF, // 1F0AF..1F0B0; UNKNOWN
+ 0x1F0B1, // 1F0B1..1F0BF; COMMON
+ 0x1F0C0, // 1F0C0 ; UNKNOWN
+ 0x1F0C1, // 1F0C1..1F0CF; COMMON
+ 0x1F0D0, // 1F0D0 ; UNKNOWN
+ 0x1F0D1, // 1F0D1..1F0F5; COMMON
+ 0x1F0F6, // 1F0F6..1F0FF; UNKNOWN
+ 0x1F100, // 1F100..1F10C; COMMON
+ 0x1F10D, // 1F10D..1F10F; UNKNOWN
+ 0x1F110, // 1F110..1F12E; COMMON
+ 0x1F12F, // 1F12F ; UNKNOWN
+ 0x1F130, // 1F130..1F16B; COMMON
+ 0x1F16C, // 1F16C..1F16F; UNKNOWN
+ 0x1F170, // 1F170..1F19A; COMMON
+ 0x1F19B, // 1F19B..1F1E5; UNKNOWN
+ 0x1F1E6, // 1F1E6..1F1FF; COMMON
+ 0x1F200, // 1F200 ; HIRAGANA
+ 0x1F201, // 1F201..1F202; COMMON
+ 0x1F203, // 1F203..1F20F; UNKNOWN
+ 0x1F210, // 1F210..1F23A; COMMON
+ 0x1F23B, // 1F23B..1F23F; UNKNOWN
+ 0x1F240, // 1F240..1F248; COMMON
+ 0x1F249, // 1F249..1F24F; UNKNOWN
+ 0x1F250, // 1F250..1F251; COMMON
+ 0x1F252, // 1F252..1F2FF; UNKNOWN
+ 0x1F300, // 1F300..1F32C; COMMON
+ 0x1F32D, // 1F32D..1F32F; UNKNOWN
+ 0x1F330, // 1F330..1F37D; COMMON
+ 0x1F37E, // 1F37E..1F37F; UNKNOWN
+ 0x1F380, // 1F380..1F3CE; COMMON
+ 0x1F3CF, // 1F3CF..1F3D3; UNKNOWN
+ 0x1F3D4, // 1F3D4..1F3F7; COMMON
+ 0x1F3F8, // 1F3F8..1F3FF; UNKNOWN
+ 0x1F400, // 1F400..1F4FE; COMMON
+ 0x1F4FF, // 1F4FF ; UNKNOWN
+ 0x1F500, // 1F500..1F54A; COMMON
+ 0x1F54B, // 1F54B..1F54F; UNKNOWN
+ 0x1F550, // 1F550..1F579; COMMON
+ 0x1F57A, // 1F57A ; UNKNOWN
+ 0x1F57B, // 1F57B..1F5A3; COMMON
+ 0x1F5A4, // 1F5A4 ; UNKNOWN
+ 0x1F5A5, // 1F5A5..1F642; COMMON
+ 0x1F643, // 1F643..1F644; UNKNOWN
+ 0x1F645, // 1F645..1F6CF; COMMON
+ 0x1F6D0, // 1F6D0..1F6DF; UNKNOWN
+ 0x1F6E0, // 1F6E0..1F6EC; COMMON
+ 0x1F6ED, // 1F6ED..1F6EF; UNKNOWN
+ 0x1F6F0, // 1F6F0..1F6F3; COMMON
+ 0x1F6F4, // 1F6F4..1F6FF; UNKNOWN
+ 0x1F700, // 1F700..1F773; COMMON
+ 0x1F774, // 1F774..1F77F; UNKNOWN
+ 0x1F780, // 1F780..1F7D4; COMMON
+ 0x1F7D5, // 1F7D5..1F7FF; UNKNOWN
+ 0x1F800, // 1F800..1F80B; COMMON
+ 0x1F80C, // 1F80C..1F80F; UNKNOWN
+ 0x1F810, // 1F810..1F847; COMMON
+ 0x1F848, // 1F848..1F84F; UNKNOWN
+ 0x1F850, // 1F850..1F859; COMMON
+ 0x1F85A, // 1F85A..1F85F; UNKNOWN
+ 0x1F860, // 1F860..1F887; COMMON
+ 0x1F888, // 1F888..1F88F; UNKNOWN
+ 0x1F890, // 1F890..1F8AD; COMMON
+ 0x1F8AE, // 1F8AE..1FFFF; UNKNOWN
+ 0x20000, // 20000..2A6D6; HAN
+ 0x2A6D7, // 2A6D7..2A6FF; UNKNOWN
+ 0x2A700, // 2A700..2B734; HAN
+ 0x2B735, // 2B735..2B73F; UNKNOWN
+ 0x2B740, // 2B740..2B81D; HAN
+ 0x2B81E, // 2B81E..2F7FF; UNKNOWN
+ 0x2F800, // 2F800..2FA1D; HAN
+ 0x2FA1E, // 2FA1E..E0000; UNKNOWN
+ 0xE0001, // E0001 ; COMMON
+ 0xE0002, // E0002..E001F; UNKNOWN
+ 0xE0020, // E0020..E007F; COMMON
+ 0xE0080, // E0080..E00FF; UNKNOWN
0xE0100, // E0100..E01EF; INHERITED
- 0xE01F0 // E01F0..10FFFF; UNKNOWN
-
+ 0xE01F0, // E01F0..10FFFF; UNKNOWN
};
private static final UnicodeScript[] scripts = {
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- BOPOMOFO,
- COMMON,
- INHERITED,
- GREEK,
- COMMON,
- GREEK,
- COMMON,
- GREEK,
- COMMON,
- GREEK,
- COMMON,
- GREEK,
- COPTIC,
- GREEK,
- CYRILLIC,
- INHERITED,
- CYRILLIC,
- ARMENIAN,
- COMMON,
- ARMENIAN,
- HEBREW,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- ARABIC,
- INHERITED,
- ARABIC,
- COMMON,
- ARABIC,
- INHERITED,
- ARABIC,
- COMMON,
- ARABIC,
- SYRIAC,
- ARABIC,
- THAANA,
- NKO,
- SAMARITAN,
- MANDAIC,
- ARABIC,
- DEVANAGARI,
- INHERITED,
- DEVANAGARI,
- COMMON,
- DEVANAGARI,
- BENGALI,
- GURMUKHI,
- GUJARATI,
- ORIYA,
- TAMIL,
- TELUGU,
- KANNADA,
- MALAYALAM,
- SINHALA,
- THAI,
- COMMON,
- THAI,
- LAO,
- TIBETAN,
- COMMON,
- TIBETAN,
- MYANMAR,
- GEORGIAN,
- COMMON,
- GEORGIAN,
- HANGUL,
- ETHIOPIC,
- CHEROKEE,
- CANADIAN_ABORIGINAL,
- OGHAM,
- RUNIC,
- COMMON,
- RUNIC,
- TAGALOG,
- HANUNOO,
- COMMON,
- BUHID,
- TAGBANWA,
- KHMER,
- MONGOLIAN,
- COMMON,
- MONGOLIAN,
- COMMON,
- MONGOLIAN,
- CANADIAN_ABORIGINAL,
- LIMBU,
- TAI_LE,
- NEW_TAI_LUE,
- KHMER,
- BUGINESE,
- TAI_THAM,
- BALINESE,
- SUNDANESE,
- BATAK,
- LEPCHA,
- OL_CHIKI,
- SUNDANESE,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- LATIN,
- GREEK,
- CYRILLIC,
- LATIN,
- GREEK,
- LATIN,
- GREEK,
- LATIN,
- CYRILLIC,
- LATIN,
- GREEK,
- INHERITED,
- LATIN,
- GREEK,
- COMMON,
- INHERITED,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- INHERITED,
- COMMON,
- GREEK,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- BRAILLE,
- COMMON,
- GLAGOLITIC,
- LATIN,
- COPTIC,
- GEORGIAN,
- TIFINAGH,
- ETHIOPIC,
- CYRILLIC,
- COMMON,
- HAN,
- COMMON,
- HAN,
- COMMON,
- HAN,
- COMMON,
- HAN,
- INHERITED,
- HANGUL,
- COMMON,
- HAN,
- COMMON,
- HIRAGANA,
- INHERITED,
- COMMON,
- HIRAGANA,
- COMMON,
- KATAKANA,
- COMMON,
- KATAKANA,
- BOPOMOFO,
- HANGUL,
- COMMON,
- BOPOMOFO,
- COMMON,
- KATAKANA,
- HANGUL,
- COMMON,
- HANGUL,
- COMMON,
- KATAKANA,
- COMMON,
- HAN,
- COMMON,
- HAN,
- YI,
- LISU,
- VAI,
- CYRILLIC,
- BAMUM,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- SYLOTI_NAGRI,
- COMMON,
- PHAGS_PA,
- SAURASHTRA,
- DEVANAGARI,
- KAYAH_LI,
- REJANG,
- HANGUL,
- JAVANESE,
- CHAM,
- MYANMAR,
- TAI_VIET,
- MEETEI_MAYEK,
- ETHIOPIC,
- MEETEI_MAYEK,
- HANGUL,
- UNKNOWN ,
- HAN,
- LATIN,
- ARMENIAN,
- HEBREW,
- ARABIC,
- COMMON,
- ARABIC,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- ARABIC,
- COMMON,
- LATIN,
- COMMON,
- LATIN,
- COMMON,
- KATAKANA,
- COMMON,
- KATAKANA,
- COMMON,
- HANGUL,
- COMMON,
- LINEAR_B,
- COMMON,
- GREEK,
- COMMON,
- INHERITED,
- LYCIAN,
- CARIAN,
- OLD_ITALIC,
- GOTHIC,
- UGARITIC,
- OLD_PERSIAN,
- DESERET,
- SHAVIAN,
- OSMANYA,
- CYPRIOT,
- IMPERIAL_ARAMAIC,
- PHOENICIAN,
- LYDIAN,
- MEROITIC_HIEROGLYPHS,
- MEROITIC_CURSIVE,
- KHAROSHTHI,
- OLD_SOUTH_ARABIAN,
- AVESTAN,
- INSCRIPTIONAL_PARTHIAN,
- INSCRIPTIONAL_PAHLAVI,
- OLD_TURKIC,
- ARABIC,
- BRAHMI,
- KAITHI,
- SORA_SOMPENG,
- CHAKMA,
- SHARADA,
- TAKRI,
- CUNEIFORM,
- EGYPTIAN_HIEROGLYPHS,
- BAMUM,
- MIAO,
- KATAKANA,
- HIRAGANA,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- INHERITED,
- COMMON,
- GREEK,
- COMMON,
- ARABIC,
- COMMON,
- HIRAGANA,
- COMMON,
- HAN,
- COMMON,
- INHERITED,
- UNKNOWN
+ COMMON, // 0000..0040
+ LATIN, // 0041..005A
+ COMMON, // 005B..0060
+ LATIN, // 0061..007A
+ COMMON, // 007B..00A9
+ LATIN, // 00AA
+ COMMON, // 00AB..00B9
+ LATIN, // 00BA
+ COMMON, // 00BB..00BF
+ LATIN, // 00C0..00D6
+ COMMON, // 00D7
+ LATIN, // 00D8..00F6
+ COMMON, // 00F7
+ LATIN, // 00F8..02B8
+ COMMON, // 02B9..02DF
+ LATIN, // 02E0..02E4
+ COMMON, // 02E5..02E9
+ BOPOMOFO, // 02EA..02EB
+ COMMON, // 02EC..02FF
+ INHERITED, // 0300..036F
+ GREEK, // 0370..0373
+ COMMON, // 0374
+ GREEK, // 0375..0377
+ UNKNOWN, // 0378..0379
+ GREEK, // 037A..037D
+ COMMON, // 037E
+ GREEK, // 037F
+ UNKNOWN, // 0380..0383
+ GREEK, // 0384
+ COMMON, // 0385
+ GREEK, // 0386
+ COMMON, // 0387
+ GREEK, // 0388..038A
+ UNKNOWN, // 038B
+ GREEK, // 038C
+ UNKNOWN, // 038D
+ GREEK, // 038E..03A1
+ UNKNOWN, // 03A2
+ GREEK, // 03A3..03E1
+ COPTIC, // 03E2..03EF
+ GREEK, // 03F0..03FF
+ CYRILLIC, // 0400..0484
+ INHERITED, // 0485..0486
+ CYRILLIC, // 0487..052F
+ UNKNOWN, // 0530
+ ARMENIAN, // 0531..0556
+ UNKNOWN, // 0557..0558
+ ARMENIAN, // 0559..055F
+ UNKNOWN, // 0560
+ ARMENIAN, // 0561..0587
+ UNKNOWN, // 0588
+ COMMON, // 0589
+ ARMENIAN, // 058A
+ UNKNOWN, // 058B..058C
+ ARMENIAN, // 058D..058F
+ UNKNOWN, // 0590
+ HEBREW, // 0591..05C7
+ UNKNOWN, // 05C8..05CF
+ HEBREW, // 05D0..05EA
+ UNKNOWN, // 05EB..05EF
+ HEBREW, // 05F0..05F4
+ UNKNOWN, // 05F5..05FF
+ ARABIC, // 0600..0604
+ COMMON, // 0605
+ ARABIC, // 0606..060B
+ COMMON, // 060C
+ ARABIC, // 060D..061A
+ COMMON, // 061B..061C
+ UNKNOWN, // 061D
+ ARABIC, // 061E
+ COMMON, // 061F
+ ARABIC, // 0620..063F
+ COMMON, // 0640
+ ARABIC, // 0641..064A
+ INHERITED, // 064B..0655
+ ARABIC, // 0656..065F
+ COMMON, // 0660..0669
+ ARABIC, // 066A..066F
+ INHERITED, // 0670
+ ARABIC, // 0671..06DC
+ COMMON, // 06DD
+ ARABIC, // 06DE..06FF
+ SYRIAC, // 0700..070D
+ UNKNOWN, // 070E
+ SYRIAC, // 070F..074A
+ UNKNOWN, // 074B..074C
+ SYRIAC, // 074D..074F
+ ARABIC, // 0750..077F
+ THAANA, // 0780..07B1
+ UNKNOWN, // 07B2..07BF
+ NKO, // 07C0..07FA
+ UNKNOWN, // 07FB..07FF
+ SAMARITAN, // 0800..082D
+ UNKNOWN, // 082E..082F
+ SAMARITAN, // 0830..083E
+ UNKNOWN, // 083F
+ MANDAIC, // 0840..085B
+ UNKNOWN, // 085C..085D
+ MANDAIC, // 085E
+ UNKNOWN, // 085F..089F
+ ARABIC, // 08A0..08B2
+ UNKNOWN, // 08B3..08E3
+ ARABIC, // 08E4..08FF
+ DEVANAGARI, // 0900..0950
+ INHERITED, // 0951..0952
+ DEVANAGARI, // 0953..0963
+ COMMON, // 0964..0965
+ DEVANAGARI, // 0966..097F
+ BENGALI, // 0980..0983
+ UNKNOWN, // 0984
+ BENGALI, // 0985..098C
+ UNKNOWN, // 098D..098E
+ BENGALI, // 098F..0990
+ UNKNOWN, // 0991..0992
+ BENGALI, // 0993..09A8
+ UNKNOWN, // 09A9
+ BENGALI, // 09AA..09B0
+ UNKNOWN, // 09B1
+ BENGALI, // 09B2
+ UNKNOWN, // 09B3..09B5
+ BENGALI, // 09B6..09B9
+ UNKNOWN, // 09BA..09BB
+ BENGALI, // 09BC..09C4
+ UNKNOWN, // 09C5..09C6
+ BENGALI, // 09C7..09C8
+ UNKNOWN, // 09C9..09CA
+ BENGALI, // 09CB..09CE
+ UNKNOWN, // 09CF..09D6
+ BENGALI, // 09D7
+ UNKNOWN, // 09D8..09DB
+ BENGALI, // 09DC..09DD
+ UNKNOWN, // 09DE
+ BENGALI, // 09DF..09E3
+ UNKNOWN, // 09E4..09E5
+ BENGALI, // 09E6..09FB
+ UNKNOWN, // 09FC..0A00
+ GURMUKHI, // 0A01..0A03
+ UNKNOWN, // 0A04
+ GURMUKHI, // 0A05..0A0A
+ UNKNOWN, // 0A0B..0A0E
+ GURMUKHI, // 0A0F..0A10
+ UNKNOWN, // 0A11..0A12
+ GURMUKHI, // 0A13..0A28
+ UNKNOWN, // 0A29
+ GURMUKHI, // 0A2A..0A30
+ UNKNOWN, // 0A31
+ GURMUKHI, // 0A32..0A33
+ UNKNOWN, // 0A34
+ GURMUKHI, // 0A35..0A36
+ UNKNOWN, // 0A37
+ GURMUKHI, // 0A38..0A39
+ UNKNOWN, // 0A3A..0A3B
+ GURMUKHI, // 0A3C
+ UNKNOWN, // 0A3D
+ GURMUKHI, // 0A3E..0A42
+ UNKNOWN, // 0A43..0A46
+ GURMUKHI, // 0A47..0A48
+ UNKNOWN, // 0A49..0A4A
+ GURMUKHI, // 0A4B..0A4D
+ UNKNOWN, // 0A4E..0A50
+ GURMUKHI, // 0A51
+ UNKNOWN, // 0A52..0A58
+ GURMUKHI, // 0A59..0A5C
+ UNKNOWN, // 0A5D
+ GURMUKHI, // 0A5E
+ UNKNOWN, // 0A5F..0A65
+ GURMUKHI, // 0A66..0A75
+ UNKNOWN, // 0A76..0A80
+ GUJARATI, // 0A81..0A83
+ UNKNOWN, // 0A84
+ GUJARATI, // 0A85..0A8D
+ UNKNOWN, // 0A8E
+ GUJARATI, // 0A8F..0A91
+ UNKNOWN, // 0A92
+ GUJARATI, // 0A93..0AA8
+ UNKNOWN, // 0AA9
+ GUJARATI, // 0AAA..0AB0
+ UNKNOWN, // 0AB1
+ GUJARATI, // 0AB2..0AB3
+ UNKNOWN, // 0AB4
+ GUJARATI, // 0AB5..0AB9
+ UNKNOWN, // 0ABA..0ABB
+ GUJARATI, // 0ABC..0AC5
+ UNKNOWN, // 0AC6
+ GUJARATI, // 0AC7..0AC9
+ UNKNOWN, // 0ACA
+ GUJARATI, // 0ACB..0ACD
+ UNKNOWN, // 0ACE..0ACF
+ GUJARATI, // 0AD0
+ UNKNOWN, // 0AD1..0ADF
+ GUJARATI, // 0AE0..0AE3
+ UNKNOWN, // 0AE4..0AE5
+ GUJARATI, // 0AE6..0AF1
+ UNKNOWN, // 0AF2..0B00
+ ORIYA, // 0B01..0B03
+ UNKNOWN, // 0B04
+ ORIYA, // 0B05..0B0C
+ UNKNOWN, // 0B0D..0B0E
+ ORIYA, // 0B0F..0B10
+ UNKNOWN, // 0B11..0B12
+ ORIYA, // 0B13..0B28
+ UNKNOWN, // 0B29
+ ORIYA, // 0B2A..0B30
+ UNKNOWN, // 0B31
+ ORIYA, // 0B32..0B33
+ UNKNOWN, // 0B34
+ ORIYA, // 0B35..0B39
+ UNKNOWN, // 0B3A..0B3B
+ ORIYA, // 0B3C..0B44
+ UNKNOWN, // 0B45..0B46
+ ORIYA, // 0B47..0B48
+ UNKNOWN, // 0B49..0B4A
+ ORIYA, // 0B4B..0B4D
+ UNKNOWN, // 0B4E..0B55
+ ORIYA, // 0B56..0B57
+ UNKNOWN, // 0B58..0B5B
+ ORIYA, // 0B5C..0B5D
+ UNKNOWN, // 0B5E
+ ORIYA, // 0B5F..0B63
+ UNKNOWN, // 0B64..0B65
+ ORIYA, // 0B66..0B77
+ UNKNOWN, // 0B78..0B81
+ TAMIL, // 0B82..0B83
+ UNKNOWN, // 0B84
+ TAMIL, // 0B85..0B8A
+ UNKNOWN, // 0B8B..0B8D
+ TAMIL, // 0B8E..0B90
+ UNKNOWN, // 0B91
+ TAMIL, // 0B92..0B95
+ UNKNOWN, // 0B96..0B98
+ TAMIL, // 0B99..0B9A
+ UNKNOWN, // 0B9B
+ TAMIL, // 0B9C
+ UNKNOWN, // 0B9D
+ TAMIL, // 0B9E..0B9F
+ UNKNOWN, // 0BA0..0BA2
+ TAMIL, // 0BA3..0BA4
+ UNKNOWN, // 0BA5..0BA7
+ TAMIL, // 0BA8..0BAA
+ UNKNOWN, // 0BAB..0BAD
+ TAMIL, // 0BAE..0BB9
+ UNKNOWN, // 0BBA..0BBD
+ TAMIL, // 0BBE..0BC2
+ UNKNOWN, // 0BC3..0BC5
+ TAMIL, // 0BC6..0BC8
+ UNKNOWN, // 0BC9
+ TAMIL, // 0BCA..0BCD
+ UNKNOWN, // 0BCE..0BCF
+ TAMIL, // 0BD0
+ UNKNOWN, // 0BD1..0BD6
+ TAMIL, // 0BD7
+ UNKNOWN, // 0BD8..0BE5
+ TAMIL, // 0BE6..0BFA
+ UNKNOWN, // 0BFB..0BFF
+ TELUGU, // 0C00..0C03
+ UNKNOWN, // 0C04
+ TELUGU, // 0C05..0C0C
+ UNKNOWN, // 0C0D
+ TELUGU, // 0C0E..0C10
+ UNKNOWN, // 0C11
+ TELUGU, // 0C12..0C28
+ UNKNOWN, // 0C29
+ TELUGU, // 0C2A..0C39
+ UNKNOWN, // 0C3A..0C3C
+ TELUGU, // 0C3D..0C44
+ UNKNOWN, // 0C45
+ TELUGU, // 0C46..0C48
+ UNKNOWN, // 0C49
+ TELUGU, // 0C4A..0C4D
+ UNKNOWN, // 0C4E..0C54
+ TELUGU, // 0C55..0C56
+ UNKNOWN, // 0C57
+ TELUGU, // 0C58..0C59
+ UNKNOWN, // 0C5A..0C5F
+ TELUGU, // 0C60..0C63
+ UNKNOWN, // 0C64..0C65
+ TELUGU, // 0C66..0C6F
+ UNKNOWN, // 0C70..0C77
+ TELUGU, // 0C78..0C7F
+ UNKNOWN, // 0C80
+ KANNADA, // 0C81..0C83
+ UNKNOWN, // 0C84
+ KANNADA, // 0C85..0C8C
+ UNKNOWN, // 0C8D
+ KANNADA, // 0C8E..0C90
+ UNKNOWN, // 0C91
+ KANNADA, // 0C92..0CA8
+ UNKNOWN, // 0CA9
+ KANNADA, // 0CAA..0CB3
+ UNKNOWN, // 0CB4
+ KANNADA, // 0CB5..0CB9
+ UNKNOWN, // 0CBA..0CBB
+ KANNADA, // 0CBC..0CC4
+ UNKNOWN, // 0CC5
+ KANNADA, // 0CC6..0CC8
+ UNKNOWN, // 0CC9
+ KANNADA, // 0CCA..0CCD
+ UNKNOWN, // 0CCE..0CD4
+ KANNADA, // 0CD5..0CD6
+ UNKNOWN, // 0CD7..0CDD
+ KANNADA, // 0CDE
+ UNKNOWN, // 0CDF
+ KANNADA, // 0CE0..0CE3
+ UNKNOWN, // 0CE4..0CE5
+ KANNADA, // 0CE6..0CEF
+ UNKNOWN, // 0CF0
+ KANNADA, // 0CF1..0CF2
+ UNKNOWN, // 0CF3..0D00
+ MALAYALAM, // 0D01..0D03
+ UNKNOWN, // 0D04
+ MALAYALAM, // 0D05..0D0C
+ UNKNOWN, // 0D0D
+ MALAYALAM, // 0D0E..0D10
+ UNKNOWN, // 0D11
+ MALAYALAM, // 0D12..0D3A
+ UNKNOWN, // 0D3B..0D3C
+ MALAYALAM, // 0D3D..0D44
+ UNKNOWN, // 0D45
+ MALAYALAM, // 0D46..0D48
+ UNKNOWN, // 0D49
+ MALAYALAM, // 0D4A..0D4E
+ UNKNOWN, // 0D4F..0D56
+ MALAYALAM, // 0D57
+ UNKNOWN, // 0D58..0D5F
+ MALAYALAM, // 0D60..0D63
+ UNKNOWN, // 0D64..0D65
+ MALAYALAM, // 0D66..0D75
+ UNKNOWN, // 0D76..0D78
+ MALAYALAM, // 0D79..0D7F
+ UNKNOWN, // 0D80..0D81
+ SINHALA, // 0D82..0D83
+ UNKNOWN, // 0D84
+ SINHALA, // 0D85..0D96
+ UNKNOWN, // 0D97..0D99
+ SINHALA, // 0D9A..0DB1
+ UNKNOWN, // 0DB2
+ SINHALA, // 0DB3..0DBB
+ UNKNOWN, // 0DBC
+ SINHALA, // 0DBD
+ UNKNOWN, // 0DBE..0DBF
+ SINHALA, // 0DC0..0DC6
+ UNKNOWN, // 0DC7..0DC9
+ SINHALA, // 0DCA
+ UNKNOWN, // 0DCB..0DCE
+ SINHALA, // 0DCF..0DD4
+ UNKNOWN, // 0DD5
+ SINHALA, // 0DD6
+ UNKNOWN, // 0DD7
+ SINHALA, // 0DD8..0DDF
+ UNKNOWN, // 0DE0..0DE5
+ SINHALA, // 0DE6..0DEF
+ UNKNOWN, // 0DF0..0DF1
+ SINHALA, // 0DF2..0DF4
+ UNKNOWN, // 0DF5..0E00
+ THAI, // 0E01..0E3A
+ UNKNOWN, // 0E3B..0E3E
+ COMMON, // 0E3F
+ THAI, // 0E40..0E5B
+ UNKNOWN, // 0E5C..0E80
+ LAO, // 0E81..0E82
+ UNKNOWN, // 0E83
+ LAO, // 0E84
+ UNKNOWN, // 0E85..0E86
+ LAO, // 0E87..0E88
+ UNKNOWN, // 0E89
+ LAO, // 0E8A
+ UNKNOWN, // 0E8B..0E8C
+ LAO, // 0E8D
+ UNKNOWN, // 0E8E..0E93
+ LAO, // 0E94..0E97
+ UNKNOWN, // 0E98
+ LAO, // 0E99..0E9F
+ UNKNOWN, // 0EA0
+ LAO, // 0EA1..0EA3
+ UNKNOWN, // 0EA4
+ LAO, // 0EA5
+ UNKNOWN, // 0EA6
+ LAO, // 0EA7
+ UNKNOWN, // 0EA8..0EA9
+ LAO, // 0EAA..0EAB
+ UNKNOWN, // 0EAC
+ LAO, // 0EAD..0EB9
+ UNKNOWN, // 0EBA
+ LAO, // 0EBB..0EBD
+ UNKNOWN, // 0EBE..0EBF
+ LAO, // 0EC0..0EC4
+ UNKNOWN, // 0EC5
+ LAO, // 0EC6
+ UNKNOWN, // 0EC7
+ LAO, // 0EC8..0ECD
+ UNKNOWN, // 0ECE..0ECF
+ LAO, // 0ED0..0ED9
+ UNKNOWN, // 0EDA..0EDB
+ LAO, // 0EDC..0EDF
+ UNKNOWN, // 0EE0..0EFF
+ TIBETAN, // 0F00..0F47
+ UNKNOWN, // 0F48
+ TIBETAN, // 0F49..0F6C
+ UNKNOWN, // 0F6D..0F70
+ TIBETAN, // 0F71..0F97
+ UNKNOWN, // 0F98
+ TIBETAN, // 0F99..0FBC
+ UNKNOWN, // 0FBD
+ TIBETAN, // 0FBE..0FCC
+ UNKNOWN, // 0FCD
+ TIBETAN, // 0FCE..0FD4
+ COMMON, // 0FD5..0FD8
+ TIBETAN, // 0FD9..0FDA
+ UNKNOWN, // 0FDB..0FFF
+ MYANMAR, // 1000..109F
+ GEORGIAN, // 10A0..10C5
+ UNKNOWN, // 10C6
+ GEORGIAN, // 10C7
+ UNKNOWN, // 10C8..10CC
+ GEORGIAN, // 10CD
+ UNKNOWN, // 10CE..10CF
+ GEORGIAN, // 10D0..10FA
+ COMMON, // 10FB
+ GEORGIAN, // 10FC..10FF
+ HANGUL, // 1100..11FF
+ ETHIOPIC, // 1200..1248
+ UNKNOWN, // 1249
+ ETHIOPIC, // 124A..124D
+ UNKNOWN, // 124E..124F
+ ETHIOPIC, // 1250..1256
+ UNKNOWN, // 1257
+ ETHIOPIC, // 1258
+ UNKNOWN, // 1259
+ ETHIOPIC, // 125A..125D
+ UNKNOWN, // 125E..125F
+ ETHIOPIC, // 1260..1288
+ UNKNOWN, // 1289
+ ETHIOPIC, // 128A..128D
+ UNKNOWN, // 128E..128F
+ ETHIOPIC, // 1290..12B0
+ UNKNOWN, // 12B1
+ ETHIOPIC, // 12B2..12B5
+ UNKNOWN, // 12B6..12B7
+ ETHIOPIC, // 12B8..12BE
+ UNKNOWN, // 12BF
+ ETHIOPIC, // 12C0
+ UNKNOWN, // 12C1
+ ETHIOPIC, // 12C2..12C5
+ UNKNOWN, // 12C6..12C7
+ ETHIOPIC, // 12C8..12D6
+ UNKNOWN, // 12D7
+ ETHIOPIC, // 12D8..1310
+ UNKNOWN, // 1311
+ ETHIOPIC, // 1312..1315
+ UNKNOWN, // 1316..1317
+ ETHIOPIC, // 1318..135A
+ UNKNOWN, // 135B..135C
+ ETHIOPIC, // 135D..137C
+ UNKNOWN, // 137D..137F
+ ETHIOPIC, // 1380..1399
+ UNKNOWN, // 139A..139F
+ CHEROKEE, // 13A0..13F4
+ UNKNOWN, // 13F5..13FF
+ CANADIAN_ABORIGINAL, // 1400..167F
+ OGHAM, // 1680..169C
+ UNKNOWN, // 169D..169F
+ RUNIC, // 16A0..16EA
+ COMMON, // 16EB..16ED
+ RUNIC, // 16EE..16F8
+ UNKNOWN, // 16F9..16FF
+ TAGALOG, // 1700..170C
+ UNKNOWN, // 170D
+ TAGALOG, // 170E..1714
+ UNKNOWN, // 1715..171F
+ HANUNOO, // 1720..1734
+ COMMON, // 1735..1736
+ UNKNOWN, // 1737..173F
+ BUHID, // 1740..1753
+ UNKNOWN, // 1754..175F
+ TAGBANWA, // 1760..176C
+ UNKNOWN, // 176D
+ TAGBANWA, // 176E..1770
+ UNKNOWN, // 1771
+ TAGBANWA, // 1772..1773
+ UNKNOWN, // 1774..177F
+ KHMER, // 1780..17DD
+ UNKNOWN, // 17DE..17DF
+ KHMER, // 17E0..17E9
+ UNKNOWN, // 17EA..17EF
+ KHMER, // 17F0..17F9
+ UNKNOWN, // 17FA..17FF
+ MONGOLIAN, // 1800..1801
+ COMMON, // 1802..1803
+ MONGOLIAN, // 1804
+ COMMON, // 1805
+ MONGOLIAN, // 1806..180E
+ UNKNOWN, // 180F
+ MONGOLIAN, // 1810..1819
+ UNKNOWN, // 181A..181F
+ MONGOLIAN, // 1820..1877
+ UNKNOWN, // 1878..187F
+ MONGOLIAN, // 1880..18AA
+ UNKNOWN, // 18AB..18AF
+ CANADIAN_ABORIGINAL, // 18B0..18F5
+ UNKNOWN, // 18F6..18FF
+ LIMBU, // 1900..191E
+ UNKNOWN, // 191F
+ LIMBU, // 1920..192B
+ UNKNOWN, // 192C..192F
+ LIMBU, // 1930..193B
+ UNKNOWN, // 193C..193F
+ LIMBU, // 1940
+ UNKNOWN, // 1941..1943
+ LIMBU, // 1944..194F
+ TAI_LE, // 1950..196D
+ UNKNOWN, // 196E..196F
+ TAI_LE, // 1970..1974
+ UNKNOWN, // 1975..197F
+ NEW_TAI_LUE, // 1980..19AB
+ UNKNOWN, // 19AC..19AF
+ NEW_TAI_LUE, // 19B0..19C9
+ UNKNOWN, // 19CA..19CF
+ NEW_TAI_LUE, // 19D0..19DA
+ UNKNOWN, // 19DB..19DD
+ NEW_TAI_LUE, // 19DE..19DF
+ KHMER, // 19E0..19FF
+ BUGINESE, // 1A00..1A1B
+ UNKNOWN, // 1A1C..1A1D
+ BUGINESE, // 1A1E..1A1F
+ TAI_THAM, // 1A20..1A5E
+ UNKNOWN, // 1A5F
+ TAI_THAM, // 1A60..1A7C
+ UNKNOWN, // 1A7D..1A7E
+ TAI_THAM, // 1A7F..1A89
+ UNKNOWN, // 1A8A..1A8F
+ TAI_THAM, // 1A90..1A99
+ UNKNOWN, // 1A9A..1A9F
+ TAI_THAM, // 1AA0..1AAD
+ UNKNOWN, // 1AAE..1AAF
+ INHERITED, // 1AB0..1ABE
+ UNKNOWN, // 1ABF..1AFF
+ BALINESE, // 1B00..1B4B
+ UNKNOWN, // 1B4C..1B4F
+ BALINESE, // 1B50..1B7C
+ UNKNOWN, // 1B7D..1B7F
+ SUNDANESE, // 1B80..1BBF
+ BATAK, // 1BC0..1BF3
+ UNKNOWN, // 1BF4..1BFB
+ BATAK, // 1BFC..1BFF
+ LEPCHA, // 1C00..1C37
+ UNKNOWN, // 1C38..1C3A
+ LEPCHA, // 1C3B..1C49
+ UNKNOWN, // 1C4A..1C4C
+ LEPCHA, // 1C4D..1C4F
+ OL_CHIKI, // 1C50..1C7F
+ UNKNOWN, // 1C80..1CBF
+ SUNDANESE, // 1CC0..1CC7
+ UNKNOWN, // 1CC8..1CCF
+ INHERITED, // 1CD0..1CD2
+ COMMON, // 1CD3
+ INHERITED, // 1CD4..1CE0
+ COMMON, // 1CE1
+ INHERITED, // 1CE2..1CE8
+ COMMON, // 1CE9..1CEC
+ INHERITED, // 1CED
+ COMMON, // 1CEE..1CF3
+ INHERITED, // 1CF4
+ COMMON, // 1CF5..1CF6
+ UNKNOWN, // 1CF7
+ INHERITED, // 1CF8..1CF9
+ UNKNOWN, // 1CFA..1CFF
+ LATIN, // 1D00..1D25
+ GREEK, // 1D26..1D2A
+ CYRILLIC, // 1D2B
+ LATIN, // 1D2C..1D5C
+ GREEK, // 1D5D..1D61
+ LATIN, // 1D62..1D65
+ GREEK, // 1D66..1D6A
+ LATIN, // 1D6B..1D77
+ CYRILLIC, // 1D78
+ LATIN, // 1D79..1DBE
+ GREEK, // 1DBF
+ INHERITED, // 1DC0..1DF5
+ UNKNOWN, // 1DF6..1DFB
+ INHERITED, // 1DFC..1DFF
+ LATIN, // 1E00..1EFF
+ GREEK, // 1F00..1F15
+ UNKNOWN, // 1F16..1F17
+ GREEK, // 1F18..1F1D
+ UNKNOWN, // 1F1E..1F1F
+ GREEK, // 1F20..1F45
+ UNKNOWN, // 1F46..1F47
+ GREEK, // 1F48..1F4D
+ UNKNOWN, // 1F4E..1F4F
+ GREEK, // 1F50..1F57
+ UNKNOWN, // 1F58
+ GREEK, // 1F59
+ UNKNOWN, // 1F5A
+ GREEK, // 1F5B
+ UNKNOWN, // 1F5C
+ GREEK, // 1F5D
+ UNKNOWN, // 1F5E
+ GREEK, // 1F5F..1F7D
+ UNKNOWN, // 1F7E..1F7F
+ GREEK, // 1F80..1FB4
+ UNKNOWN, // 1FB5
+ GREEK, // 1FB6..1FC4
+ UNKNOWN, // 1FC5
+ GREEK, // 1FC6..1FD3
+ UNKNOWN, // 1FD4..1FD5
+ GREEK, // 1FD6..1FDB
+ UNKNOWN, // 1FDC
+ GREEK, // 1FDD..1FEF
+ UNKNOWN, // 1FF0..1FF1
+ GREEK, // 1FF2..1FF4
+ UNKNOWN, // 1FF5
+ GREEK, // 1FF6..1FFE
+ UNKNOWN, // 1FFF
+ COMMON, // 2000..200B
+ INHERITED, // 200C..200D
+ COMMON, // 200E..2064
+ UNKNOWN, // 2065
+ COMMON, // 2066..2070
+ LATIN, // 2071
+ UNKNOWN, // 2072..2073
+ COMMON, // 2074..207E
+ LATIN, // 207F
+ COMMON, // 2080..208E
+ UNKNOWN, // 208F
+ LATIN, // 2090..209C
+ UNKNOWN, // 209D..209F
+ COMMON, // 20A0..20BD
+ UNKNOWN, // 20BE..20CF
+ INHERITED, // 20D0..20F0
+ UNKNOWN, // 20F1..20FF
+ COMMON, // 2100..2125
+ GREEK, // 2126
+ COMMON, // 2127..2129
+ LATIN, // 212A..212B
+ COMMON, // 212C..2131
+ LATIN, // 2132
+ COMMON, // 2133..214D
+ LATIN, // 214E
+ COMMON, // 214F..215F
+ LATIN, // 2160..2188
+ COMMON, // 2189
+ UNKNOWN, // 218A..218F
+ COMMON, // 2190..23FA
+ UNKNOWN, // 23FB..23FF
+ COMMON, // 2400..2426
+ UNKNOWN, // 2427..243F
+ COMMON, // 2440..244A
+ UNKNOWN, // 244B..245F
+ COMMON, // 2460..27FF
+ BRAILLE, // 2800..28FF
+ COMMON, // 2900..2B73
+ UNKNOWN, // 2B74..2B75
+ COMMON, // 2B76..2B95
+ UNKNOWN, // 2B96..2B97
+ COMMON, // 2B98..2BB9
+ UNKNOWN, // 2BBA..2BBC
+ COMMON, // 2BBD..2BC8
+ UNKNOWN, // 2BC9
+ COMMON, // 2BCA..2BD1
+ UNKNOWN, // 2BD2..2BFF
+ GLAGOLITIC, // 2C00..2C2E
+ UNKNOWN, // 2C2F
+ GLAGOLITIC, // 2C30..2C5E
+ UNKNOWN, // 2C5F
+ LATIN, // 2C60..2C7F
+ COPTIC, // 2C80..2CF3
+ UNKNOWN, // 2CF4..2CF8
+ COPTIC, // 2CF9..2CFF
+ GEORGIAN, // 2D00..2D25
+ UNKNOWN, // 2D26
+ GEORGIAN, // 2D27
+ UNKNOWN, // 2D28..2D2C
+ GEORGIAN, // 2D2D
+ UNKNOWN, // 2D2E..2D2F
+ TIFINAGH, // 2D30..2D67
+ UNKNOWN, // 2D68..2D6E
+ TIFINAGH, // 2D6F..2D70
+ UNKNOWN, // 2D71..2D7E
+ TIFINAGH, // 2D7F
+ ETHIOPIC, // 2D80..2D96
+ UNKNOWN, // 2D97..2D9F
+ ETHIOPIC, // 2DA0..2DA6
+ UNKNOWN, // 2DA7
+ ETHIOPIC, // 2DA8..2DAE
+ UNKNOWN, // 2DAF
+ ETHIOPIC, // 2DB0..2DB6
+ UNKNOWN, // 2DB7
+ ETHIOPIC, // 2DB8..2DBE
+ UNKNOWN, // 2DBF
+ ETHIOPIC, // 2DC0..2DC6
+ UNKNOWN, // 2DC7
+ ETHIOPIC, // 2DC8..2DCE
+ UNKNOWN, // 2DCF
+ ETHIOPIC, // 2DD0..2DD6
+ UNKNOWN, // 2DD7
+ ETHIOPIC, // 2DD8..2DDE
+ UNKNOWN, // 2DDF
+ CYRILLIC, // 2DE0..2DFF
+ COMMON, // 2E00..2E42
+ UNKNOWN, // 2E43..2E7F
+ HAN, // 2E80..2E99
+ UNKNOWN, // 2E9A
+ HAN, // 2E9B..2EF3
+ UNKNOWN, // 2EF4..2EFF
+ HAN, // 2F00..2FD5
+ UNKNOWN, // 2FD6..2FEF
+ COMMON, // 2FF0..2FFB
+ UNKNOWN, // 2FFC..2FFF
+ COMMON, // 3000..3004
+ HAN, // 3005
+ COMMON, // 3006
+ HAN, // 3007
+ COMMON, // 3008..3020
+ HAN, // 3021..3029
+ INHERITED, // 302A..302D
+ HANGUL, // 302E..302F
+ COMMON, // 3030..3037
+ HAN, // 3038..303B
+ COMMON, // 303C..303F
+ UNKNOWN, // 3040
+ HIRAGANA, // 3041..3096
+ UNKNOWN, // 3097..3098
+ INHERITED, // 3099..309A
+ COMMON, // 309B..309C
+ HIRAGANA, // 309D..309F
+ COMMON, // 30A0
+ KATAKANA, // 30A1..30FA
+ COMMON, // 30FB..30FC
+ KATAKANA, // 30FD..30FF
+ UNKNOWN, // 3100..3104
+ BOPOMOFO, // 3105..312D
+ UNKNOWN, // 312E..3130
+ HANGUL, // 3131..318E
+ UNKNOWN, // 318F
+ COMMON, // 3190..319F
+ BOPOMOFO, // 31A0..31BA
+ UNKNOWN, // 31BB..31BF
+ COMMON, // 31C0..31E3
+ UNKNOWN, // 31E4..31EF
+ KATAKANA, // 31F0..31FF
+ HANGUL, // 3200..321E
+ UNKNOWN, // 321F
+ COMMON, // 3220..325F
+ HANGUL, // 3260..327E
+ COMMON, // 327F..32CF
+ KATAKANA, // 32D0..32FE
+ UNKNOWN, // 32FF
+ KATAKANA, // 3300..3357
+ COMMON, // 3358..33FF
+ HAN, // 3400..4DB5
+ UNKNOWN, // 4DB6..4DBF
+ COMMON, // 4DC0..4DFF
+ HAN, // 4E00..9FCC
+ UNKNOWN, // 9FCD..9FFF
+ YI, // A000..A48C
+ UNKNOWN, // A48D..A48F
+ YI, // A490..A4C6
+ UNKNOWN, // A4C7..A4CF
+ LISU, // A4D0..A4FF
+ VAI, // A500..A62B
+ UNKNOWN, // A62C..A63F
+ CYRILLIC, // A640..A69D
+ UNKNOWN, // A69E
+ CYRILLIC, // A69F
+ BAMUM, // A6A0..A6F7
+ UNKNOWN, // A6F8..A6FF
+ COMMON, // A700..A721
+ LATIN, // A722..A787
+ COMMON, // A788..A78A
+ LATIN, // A78B..A78E
+ UNKNOWN, // A78F
+ LATIN, // A790..A7AD
+ UNKNOWN, // A7AE..A7AF
+ LATIN, // A7B0..A7B1
+ UNKNOWN, // A7B2..A7F6
+ LATIN, // A7F7..A7FF
+ SYLOTI_NAGRI, // A800..A82B
+ UNKNOWN, // A82C..A82F
+ COMMON, // A830..A839
+ UNKNOWN, // A83A..A83F
+ PHAGS_PA, // A840..A877
+ UNKNOWN, // A878..A87F
+ SAURASHTRA, // A880..A8C4
+ UNKNOWN, // A8C5..A8CD
+ SAURASHTRA, // A8CE..A8D9
+ UNKNOWN, // A8DA..A8DF
+ DEVANAGARI, // A8E0..A8FB
+ UNKNOWN, // A8FC..A8FF
+ KAYAH_LI, // A900..A92D
+ COMMON, // A92E
+ KAYAH_LI, // A92F
+ REJANG, // A930..A953
+ UNKNOWN, // A954..A95E
+ REJANG, // A95F
+ HANGUL, // A960..A97C
+ UNKNOWN, // A97D..A97F
+ JAVANESE, // A980..A9CD
+ UNKNOWN, // A9CE
+ COMMON, // A9CF
+ JAVANESE, // A9D0..A9D9
+ UNKNOWN, // A9DA..A9DD
+ JAVANESE, // A9DE..A9DF
+ MYANMAR, // A9E0..A9FE
+ UNKNOWN, // A9FF
+ CHAM, // AA00..AA36
+ UNKNOWN, // AA37..AA3F
+ CHAM, // AA40..AA4D
+ UNKNOWN, // AA4E..AA4F
+ CHAM, // AA50..AA59
+ UNKNOWN, // AA5A..AA5B
+ CHAM, // AA5C..AA5F
+ MYANMAR, // AA60..AA7F
+ TAI_VIET, // AA80..AAC2
+ UNKNOWN, // AAC3..AADA
+ TAI_VIET, // AADB..AADF
+ MEETEI_MAYEK, // AAE0..AAF6
+ UNKNOWN, // AAF7..AB00
+ ETHIOPIC, // AB01..AB06
+ UNKNOWN, // AB07..AB08
+ ETHIOPIC, // AB09..AB0E
+ UNKNOWN, // AB0F..AB10
+ ETHIOPIC, // AB11..AB16
+ UNKNOWN, // AB17..AB1F
+ ETHIOPIC, // AB20..AB26
+ UNKNOWN, // AB27
+ ETHIOPIC, // AB28..AB2E
+ UNKNOWN, // AB2F
+ LATIN, // AB30..AB5A
+ COMMON, // AB5B
+ LATIN, // AB5C..AB5F
+ UNKNOWN, // AB60..AB63
+ LATIN, // AB64
+ GREEK, // AB65
+ UNKNOWN, // AB66..ABBF
+ MEETEI_MAYEK, // ABC0..ABED
+ UNKNOWN, // ABEE..ABEF
+ MEETEI_MAYEK, // ABF0..ABF9
+ UNKNOWN, // ABFA..ABFF
+ HANGUL, // AC00..D7A3
+ UNKNOWN, // D7A4..D7AF
+ HANGUL, // D7B0..D7C6
+ UNKNOWN, // D7C7..D7CA
+ HANGUL, // D7CB..D7FB
+ UNKNOWN, // D7FC..F8FF
+ HAN, // F900..FA6D
+ UNKNOWN, // FA6E..FA6F
+ HAN, // FA70..FAD9
+ UNKNOWN, // FADA..FAFF
+ LATIN, // FB00..FB06
+ UNKNOWN, // FB07..FB12
+ ARMENIAN, // FB13..FB17
+ UNKNOWN, // FB18..FB1C
+ HEBREW, // FB1D..FB36
+ UNKNOWN, // FB37
+ HEBREW, // FB38..FB3C
+ UNKNOWN, // FB3D
+ HEBREW, // FB3E
+ UNKNOWN, // FB3F
+ HEBREW, // FB40..FB41
+ UNKNOWN, // FB42
+ HEBREW, // FB43..FB44
+ UNKNOWN, // FB45
+ HEBREW, // FB46..FB4F
+ ARABIC, // FB50..FBC1
+ UNKNOWN, // FBC2..FBD2
+ ARABIC, // FBD3..FD3D
+ COMMON, // FD3E..FD3F
+ UNKNOWN, // FD40..FD4F
+ ARABIC, // FD50..FD8F
+ UNKNOWN, // FD90..FD91
+ ARABIC, // FD92..FDC7
+ UNKNOWN, // FDC8..FDEF
+ ARABIC, // FDF0..FDFD
+ UNKNOWN, // FDFE..FDFF
+ INHERITED, // FE00..FE0F
+ COMMON, // FE10..FE19
+ UNKNOWN, // FE1A..FE1F
+ INHERITED, // FE20..FE2D
+ UNKNOWN, // FE2E..FE2F
+ COMMON, // FE30..FE52
+ UNKNOWN, // FE53
+ COMMON, // FE54..FE66
+ UNKNOWN, // FE67
+ COMMON, // FE68..FE6B
+ UNKNOWN, // FE6C..FE6F
+ ARABIC, // FE70..FE74
+ UNKNOWN, // FE75
+ ARABIC, // FE76..FEFC
+ UNKNOWN, // FEFD..FEFE
+ COMMON, // FEFF
+ UNKNOWN, // FF00
+ COMMON, // FF01..FF20
+ LATIN, // FF21..FF3A
+ COMMON, // FF3B..FF40
+ LATIN, // FF41..FF5A
+ COMMON, // FF5B..FF65
+ KATAKANA, // FF66..FF6F
+ COMMON, // FF70
+ KATAKANA, // FF71..FF9D
+ COMMON, // FF9E..FF9F
+ HANGUL, // FFA0..FFBE
+ UNKNOWN, // FFBF..FFC1
+ HANGUL, // FFC2..FFC7
+ UNKNOWN, // FFC8..FFC9
+ HANGUL, // FFCA..FFCF
+ UNKNOWN, // FFD0..FFD1
+ HANGUL, // FFD2..FFD7
+ UNKNOWN, // FFD8..FFD9
+ HANGUL, // FFDA..FFDC
+ UNKNOWN, // FFDD..FFDF
+ COMMON, // FFE0..FFE6
+ UNKNOWN, // FFE7
+ COMMON, // FFE8..FFEE
+ UNKNOWN, // FFEF..FFF8
+ COMMON, // FFF9..FFFD
+ UNKNOWN, // FFFE..FFFF
+ LINEAR_B, // 10000..1000B
+ UNKNOWN, // 1000C
+ LINEAR_B, // 1000D..10026
+ UNKNOWN, // 10027
+ LINEAR_B, // 10028..1003A
+ UNKNOWN, // 1003B
+ LINEAR_B, // 1003C..1003D
+ UNKNOWN, // 1003E
+ LINEAR_B, // 1003F..1004D
+ UNKNOWN, // 1004E..1004F
+ LINEAR_B, // 10050..1005D
+ UNKNOWN, // 1005E..1007F
+ LINEAR_B, // 10080..100FA
+ UNKNOWN, // 100FB..100FF
+ COMMON, // 10100..10102
+ UNKNOWN, // 10103..10106
+ COMMON, // 10107..10133
+ UNKNOWN, // 10134..10136
+ COMMON, // 10137..1013F
+ GREEK, // 10140..1018C
+ UNKNOWN, // 1018D..1018F
+ COMMON, // 10190..1019B
+ UNKNOWN, // 1019C..1019F
+ GREEK, // 101A0
+ UNKNOWN, // 101A1..101CF
+ COMMON, // 101D0..101FC
+ INHERITED, // 101FD
+ UNKNOWN, // 101FE..1027F
+ LYCIAN, // 10280..1029C
+ UNKNOWN, // 1029D..1029F
+ CARIAN, // 102A0..102D0
+ UNKNOWN, // 102D1..102DF
+ INHERITED, // 102E0
+ COMMON, // 102E1..102FB
+ UNKNOWN, // 102FC..102FF
+ OLD_ITALIC, // 10300..10323
+ UNKNOWN, // 10324..1032F
+ GOTHIC, // 10330..1034A
+ UNKNOWN, // 1034B..1034F
+ OLD_PERMIC, // 10350..1037A
+ UNKNOWN, // 1037B..1037F
+ UGARITIC, // 10380..1039D
+ UNKNOWN, // 1039E
+ UGARITIC, // 1039F
+ OLD_PERSIAN, // 103A0..103C3
+ UNKNOWN, // 103C4..103C7
+ OLD_PERSIAN, // 103C8..103D5
+ UNKNOWN, // 103D6..103FF
+ DESERET, // 10400..1044F
+ SHAVIAN, // 10450..1047F
+ OSMANYA, // 10480..1049D
+ UNKNOWN, // 1049E..1049F
+ OSMANYA, // 104A0..104A9
+ UNKNOWN, // 104AA..104FF
+ ELBASAN, // 10500..10527
+ UNKNOWN, // 10528..1052F
+ CAUCASIAN_ALBANIAN, // 10530..10563
+ UNKNOWN, // 10564..1056E
+ CAUCASIAN_ALBANIAN, // 1056F
+ UNKNOWN, // 10570..105FF
+ LINEAR_A, // 10600..10736
+ UNKNOWN, // 10737..1073F
+ LINEAR_A, // 10740..10755
+ UNKNOWN, // 10756..1075F
+ LINEAR_A, // 10760..10767
+ UNKNOWN, // 10768..107FF
+ CYPRIOT, // 10800..10805
+ UNKNOWN, // 10806..10807
+ CYPRIOT, // 10808
+ UNKNOWN, // 10809
+ CYPRIOT, // 1080A..10835
+ UNKNOWN, // 10836
+ CYPRIOT, // 10837..10838
+ UNKNOWN, // 10839..1083B
+ CYPRIOT, // 1083C
+ UNKNOWN, // 1083D..1083E
+ CYPRIOT, // 1083F
+ IMPERIAL_ARAMAIC, // 10840..10855
+ UNKNOWN, // 10856
+ IMPERIAL_ARAMAIC, // 10857..1085F
+ PALMYRENE, // 10860..1087F
+ NABATAEAN, // 10880..1089E
+ UNKNOWN, // 1089F..108A6
+ NABATAEAN, // 108A7..108AF
+ UNKNOWN, // 108B0..108FF
+ PHOENICIAN, // 10900..1091B
+ UNKNOWN, // 1091C..1091E
+ PHOENICIAN, // 1091F
+ LYDIAN, // 10920..10939
+ UNKNOWN, // 1093A..1093E
+ LYDIAN, // 1093F
+ UNKNOWN, // 10940..1097F
+ MEROITIC_HIEROGLYPHS, // 10980..1099F
+ MEROITIC_CURSIVE, // 109A0..109B7
+ UNKNOWN, // 109B8..109BD
+ MEROITIC_CURSIVE, // 109BE..109BF
+ UNKNOWN, // 109C0..109FF
+ KHAROSHTHI, // 10A00..10A03
+ UNKNOWN, // 10A04
+ KHAROSHTHI, // 10A05..10A06
+ UNKNOWN, // 10A07..10A0B
+ KHAROSHTHI, // 10A0C..10A13
+ UNKNOWN, // 10A14
+ KHAROSHTHI, // 10A15..10A17
+ UNKNOWN, // 10A18
+ KHAROSHTHI, // 10A19..10A33
+ UNKNOWN, // 10A34..10A37
+ KHAROSHTHI, // 10A38..10A3A
+ UNKNOWN, // 10A3B..10A3E
+ KHAROSHTHI, // 10A3F..10A47
+ UNKNOWN, // 10A48..10A4F
+ KHAROSHTHI, // 10A50..10A58
+ UNKNOWN, // 10A59..10A5F
+ OLD_SOUTH_ARABIAN, // 10A60..10A7F
+ OLD_NORTH_ARABIAN, // 10A80..10A9F
+ UNKNOWN, // 10AA0..10ABF
+ MANICHAEAN, // 10AC0..10AE6
+ UNKNOWN, // 10AE7..10AEA
+ MANICHAEAN, // 10AEB..10AF6
+ UNKNOWN, // 10AF7..10AFF
+ AVESTAN, // 10B00..10B35
+ UNKNOWN, // 10B36..10B38
+ AVESTAN, // 10B39..10B3F
+ INSCRIPTIONAL_PARTHIAN, // 10B40..10B55
+ UNKNOWN, // 10B56..10B57
+ INSCRIPTIONAL_PARTHIAN, // 10B58..10B5F
+ INSCRIPTIONAL_PAHLAVI, // 10B60..10B72
+ UNKNOWN, // 10B73..10B77
+ INSCRIPTIONAL_PAHLAVI, // 10B78..10B7F
+ PSALTER_PAHLAVI, // 10B80..10B91
+ UNKNOWN, // 10B92..10B98
+ PSALTER_PAHLAVI, // 10B99..10B9C
+ UNKNOWN, // 10B9D..10BA8
+ PSALTER_PAHLAVI, // 10BA9..10BAF
+ UNKNOWN, // 10BB0..10BFF
+ OLD_TURKIC, // 10C00..10C48
+ UNKNOWN, // 10C49..10E5F
+ ARABIC, // 10E60..10E7E
+ UNKNOWN, // 10E7F..10FFF
+ BRAHMI, // 11000..1104D
+ UNKNOWN, // 1104E..11051
+ BRAHMI, // 11052..1106F
+ UNKNOWN, // 11070..1107E
+ BRAHMI, // 1107F
+ KAITHI, // 11080..110C1
+ UNKNOWN, // 110C2..110CF
+ SORA_SOMPENG, // 110D0..110E8
+ UNKNOWN, // 110E9..110EF
+ SORA_SOMPENG, // 110F0..110F9
+ UNKNOWN, // 110FA..110FF
+ CHAKMA, // 11100..11134
+ UNKNOWN, // 11135
+ CHAKMA, // 11136..11143
+ UNKNOWN, // 11144..1114F
+ MAHAJANI, // 11150..11176
+ UNKNOWN, // 11177..1117F
+ SHARADA, // 11180..111C8
+ UNKNOWN, // 111C9..111CC
+ SHARADA, // 111CD
+ UNKNOWN, // 111CE..111CF
+ SHARADA, // 111D0..111DA
+ UNKNOWN, // 111DB..111E0
+ SINHALA, // 111E1..111F4
+ UNKNOWN, // 111F5..111FF
+ KHOJKI, // 11200..11211
+ UNKNOWN, // 11212
+ KHOJKI, // 11213..1123D
+ UNKNOWN, // 1123E..112AF
+ KHUDAWADI, // 112B0..112EA
+ UNKNOWN, // 112EB..112EF
+ KHUDAWADI, // 112F0..112F9
+ UNKNOWN, // 112FA..11300
+ GRANTHA, // 11301..11303
+ UNKNOWN, // 11304
+ GRANTHA, // 11305..1130C
+ UNKNOWN, // 1130D..1130E
+ GRANTHA, // 1130F..11310
+ UNKNOWN, // 11311..11312
+ GRANTHA, // 11313..11328
+ UNKNOWN, // 11329
+ GRANTHA, // 1132A..11330
+ UNKNOWN, // 11331
+ GRANTHA, // 11332..11333
+ UNKNOWN, // 11334
+ GRANTHA, // 11335..11339
+ UNKNOWN, // 1133A..1133B
+ GRANTHA, // 1133C..11344
+ UNKNOWN, // 11345..11346
+ GRANTHA, // 11347..11348
+ UNKNOWN, // 11349..1134A
+ GRANTHA, // 1134B..1134D
+ UNKNOWN, // 1134E..11356
+ GRANTHA, // 11357
+ UNKNOWN, // 11358..1135C
+ GRANTHA, // 1135D..11363
+ UNKNOWN, // 11364..11365
+ GRANTHA, // 11366..1136C
+ UNKNOWN, // 1136D..1136F
+ GRANTHA, // 11370..11374
+ UNKNOWN, // 11375..1147F
+ TIRHUTA, // 11480..114C7
+ UNKNOWN, // 114C8..114CF
+ TIRHUTA, // 114D0..114D9
+ UNKNOWN, // 114DA..1157F
+ SIDDHAM, // 11580..115B5
+ UNKNOWN, // 115B6..115B7
+ SIDDHAM, // 115B8..115C9
+ UNKNOWN, // 115CA..115FF
+ MODI, // 11600..11644
+ UNKNOWN, // 11645..1164F
+ MODI, // 11650..11659
+ UNKNOWN, // 1165A..1167F
+ TAKRI, // 11680..116B7
+ UNKNOWN, // 116B8..116BF
+ TAKRI, // 116C0..116C9
+ UNKNOWN, // 116CA..1189F
+ WARANG_CITI, // 118A0..118F2
+ UNKNOWN, // 118F3..118FE
+ WARANG_CITI, // 118FF
+ UNKNOWN, // 11900..11ABF
+ PAU_CIN_HAU, // 11AC0..11AF8
+ UNKNOWN, // 11AF9..11FFF
+ CUNEIFORM, // 12000..12398
+ UNKNOWN, // 12399..123FF
+ CUNEIFORM, // 12400..1246E
+ UNKNOWN, // 1246F
+ CUNEIFORM, // 12470..12474
+ UNKNOWN, // 12475..12FFF
+ EGYPTIAN_HIEROGLYPHS, // 13000..1342E
+ UNKNOWN, // 1342F..167FF
+ BAMUM, // 16800..16A38
+ UNKNOWN, // 16A39..16A3F
+ MRO, // 16A40..16A5E
+ UNKNOWN, // 16A5F
+ MRO, // 16A60..16A69
+ UNKNOWN, // 16A6A..16A6D
+ MRO, // 16A6E..16A6F
+ UNKNOWN, // 16A70..16ACF
+ BASSA_VAH, // 16AD0..16AED
+ UNKNOWN, // 16AEE..16AEF
+ BASSA_VAH, // 16AF0..16AF5
+ UNKNOWN, // 16AF6..16AFF
+ PAHAWH_HMONG, // 16B00..16B45
+ UNKNOWN, // 16B46..16B4F
+ PAHAWH_HMONG, // 16B50..16B59
+ UNKNOWN, // 16B5A
+ PAHAWH_HMONG, // 16B5B..16B61
+ UNKNOWN, // 16B62
+ PAHAWH_HMONG, // 16B63..16B77
+ UNKNOWN, // 16B78..16B7C
+ PAHAWH_HMONG, // 16B7D..16B8F
+ UNKNOWN, // 16B90..16EFF
+ MIAO, // 16F00..16F44
+ UNKNOWN, // 16F45..16F4F
+ MIAO, // 16F50..16F7E
+ UNKNOWN, // 16F7F..16F8E
+ MIAO, // 16F8F..16F9F
+ UNKNOWN, // 16FA0..1AFFF
+ KATAKANA, // 1B000
+ HIRAGANA, // 1B001
+ UNKNOWN, // 1B002..1BBFF
+ DUPLOYAN, // 1BC00..1BC6A
+ UNKNOWN, // 1BC6B..1BC6F
+ DUPLOYAN, // 1BC70..1BC7C
+ UNKNOWN, // 1BC7D..1BC7F
+ DUPLOYAN, // 1BC80..1BC88
+ UNKNOWN, // 1BC89..1BC8F
+ DUPLOYAN, // 1BC90..1BC99
+ UNKNOWN, // 1BC9A..1BC9B
+ DUPLOYAN, // 1BC9C..1BC9F
+ COMMON, // 1BCA0..1BCA3
+ UNKNOWN, // 1BCA4..1CFFF
+ COMMON, // 1D000..1D0F5
+ UNKNOWN, // 1D0F6..1D0FF
+ COMMON, // 1D100..1D126
+ UNKNOWN, // 1D127..1D128
+ COMMON, // 1D129..1D166
+ INHERITED, // 1D167..1D169
+ COMMON, // 1D16A..1D17A
+ INHERITED, // 1D17B..1D182
+ COMMON, // 1D183..1D184
+ INHERITED, // 1D185..1D18B
+ COMMON, // 1D18C..1D1A9
+ INHERITED, // 1D1AA..1D1AD
+ COMMON, // 1D1AE..1D1DD
+ UNKNOWN, // 1D1DE..1D1FF
+ GREEK, // 1D200..1D245
+ UNKNOWN, // 1D246..1D2FF
+ COMMON, // 1D300..1D356
+ UNKNOWN, // 1D357..1D35F
+ COMMON, // 1D360..1D371
+ UNKNOWN, // 1D372..1D3FF
+ COMMON, // 1D400..1D454
+ UNKNOWN, // 1D455
+ COMMON, // 1D456..1D49C
+ UNKNOWN, // 1D49D
+ COMMON, // 1D49E..1D49F
+ UNKNOWN, // 1D4A0..1D4A1
+ COMMON, // 1D4A2
+ UNKNOWN, // 1D4A3..1D4A4
+ COMMON, // 1D4A5..1D4A6
+ UNKNOWN, // 1D4A7..1D4A8
+ COMMON, // 1D4A9..1D4AC
+ UNKNOWN, // 1D4AD
+ COMMON, // 1D4AE..1D4B9
+ UNKNOWN, // 1D4BA
+ COMMON, // 1D4BB
+ UNKNOWN, // 1D4BC
+ COMMON, // 1D4BD..1D4C3
+ UNKNOWN, // 1D4C4
+ COMMON, // 1D4C5..1D505
+ UNKNOWN, // 1D506
+ COMMON, // 1D507..1D50A
+ UNKNOWN, // 1D50B..1D50C
+ COMMON, // 1D50D..1D514
+ UNKNOWN, // 1D515
+ COMMON, // 1D516..1D51C
+ UNKNOWN, // 1D51D
+ COMMON, // 1D51E..1D539
+ UNKNOWN, // 1D53A
+ COMMON, // 1D53B..1D53E
+ UNKNOWN, // 1D53F
+ COMMON, // 1D540..1D544
+ UNKNOWN, // 1D545
+ COMMON, // 1D546
+ UNKNOWN, // 1D547..1D549
+ COMMON, // 1D54A..1D550
+ UNKNOWN, // 1D551
+ COMMON, // 1D552..1D6A5
+ UNKNOWN, // 1D6A6..1D6A7
+ COMMON, // 1D6A8..1D7CB
+ UNKNOWN, // 1D7CC..1D7CD
+ COMMON, // 1D7CE..1D7FF
+ UNKNOWN, // 1D800..1E7FF
+ MENDE_KIKAKUI, // 1E800..1E8C4
+ UNKNOWN, // 1E8C5..1E8C6
+ MENDE_KIKAKUI, // 1E8C7..1E8D6
+ UNKNOWN, // 1E8D7..1EDFF
+ ARABIC, // 1EE00..1EE03
+ UNKNOWN, // 1EE04
+ ARABIC, // 1EE05..1EE1F
+ UNKNOWN, // 1EE20
+ ARABIC, // 1EE21..1EE22
+ UNKNOWN, // 1EE23
+ ARABIC, // 1EE24
+ UNKNOWN, // 1EE25..1EE26
+ ARABIC, // 1EE27
+ UNKNOWN, // 1EE28
+ ARABIC, // 1EE29..1EE32
+ UNKNOWN, // 1EE33
+ ARABIC, // 1EE34..1EE37
+ UNKNOWN, // 1EE38
+ ARABIC, // 1EE39
+ UNKNOWN, // 1EE3A
+ ARABIC, // 1EE3B
+ UNKNOWN, // 1EE3C..1EE41
+ ARABIC, // 1EE42
+ UNKNOWN, // 1EE43..1EE46
+ ARABIC, // 1EE47
+ UNKNOWN, // 1EE48
+ ARABIC, // 1EE49
+ UNKNOWN, // 1EE4A
+ ARABIC, // 1EE4B
+ UNKNOWN, // 1EE4C
+ ARABIC, // 1EE4D..1EE4F
+ UNKNOWN, // 1EE50
+ ARABIC, // 1EE51..1EE52
+ UNKNOWN, // 1EE53
+ ARABIC, // 1EE54
+ UNKNOWN, // 1EE55..1EE56
+ ARABIC, // 1EE57
+ UNKNOWN, // 1EE58
+ ARABIC, // 1EE59
+ UNKNOWN, // 1EE5A
+ ARABIC, // 1EE5B
+ UNKNOWN, // 1EE5C
+ ARABIC, // 1EE5D
+ UNKNOWN, // 1EE5E
+ ARABIC, // 1EE5F
+ UNKNOWN, // 1EE60
+ ARABIC, // 1EE61..1EE62
+ UNKNOWN, // 1EE63
+ ARABIC, // 1EE64
+ UNKNOWN, // 1EE65..1EE66
+ ARABIC, // 1EE67..1EE6A
+ UNKNOWN, // 1EE6B
+ ARABIC, // 1EE6C..1EE72
+ UNKNOWN, // 1EE73
+ ARABIC, // 1EE74..1EE77
+ UNKNOWN, // 1EE78
+ ARABIC, // 1EE79..1EE7C
+ UNKNOWN, // 1EE7D
+ ARABIC, // 1EE7E
+ UNKNOWN, // 1EE7F
+ ARABIC, // 1EE80..1EE89
+ UNKNOWN, // 1EE8A
+ ARABIC, // 1EE8B..1EE9B
+ UNKNOWN, // 1EE9C..1EEA0
+ ARABIC, // 1EEA1..1EEA3
+ UNKNOWN, // 1EEA4
+ ARABIC, // 1EEA5..1EEA9
+ UNKNOWN, // 1EEAA
+ ARABIC, // 1EEAB..1EEBB
+ UNKNOWN, // 1EEBC..1EEEF
+ ARABIC, // 1EEF0..1EEF1
+ UNKNOWN, // 1EEF2..1EFFF
+ COMMON, // 1F000..1F02B
+ UNKNOWN, // 1F02C..1F02F
+ COMMON, // 1F030..1F093
+ UNKNOWN, // 1F094..1F09F
+ COMMON, // 1F0A0..1F0AE
+ UNKNOWN, // 1F0AF..1F0B0
+ COMMON, // 1F0B1..1F0BF
+ UNKNOWN, // 1F0C0
+ COMMON, // 1F0C1..1F0CF
+ UNKNOWN, // 1F0D0
+ COMMON, // 1F0D1..1F0F5
+ UNKNOWN, // 1F0F6..1F0FF
+ COMMON, // 1F100..1F10C
+ UNKNOWN, // 1F10D..1F10F
+ COMMON, // 1F110..1F12E
+ UNKNOWN, // 1F12F
+ COMMON, // 1F130..1F16B
+ UNKNOWN, // 1F16C..1F16F
+ COMMON, // 1F170..1F19A
+ UNKNOWN, // 1F19B..1F1E5
+ COMMON, // 1F1E6..1F1FF
+ HIRAGANA, // 1F200
+ COMMON, // 1F201..1F202
+ UNKNOWN, // 1F203..1F20F
+ COMMON, // 1F210..1F23A
+ UNKNOWN, // 1F23B..1F23F
+ COMMON, // 1F240..1F248
+ UNKNOWN, // 1F249..1F24F
+ COMMON, // 1F250..1F251
+ UNKNOWN, // 1F252..1F2FF
+ COMMON, // 1F300..1F32C
+ UNKNOWN, // 1F32D..1F32F
+ COMMON, // 1F330..1F37D
+ UNKNOWN, // 1F37E..1F37F
+ COMMON, // 1F380..1F3CE
+ UNKNOWN, // 1F3CF..1F3D3
+ COMMON, // 1F3D4..1F3F7
+ UNKNOWN, // 1F3F8..1F3FF
+ COMMON, // 1F400..1F4FE
+ UNKNOWN, // 1F4FF
+ COMMON, // 1F500..1F54A
+ UNKNOWN, // 1F54B..1F54F
+ COMMON, // 1F550..1F579
+ UNKNOWN, // 1F57A
+ COMMON, // 1F57B..1F5A3
+ UNKNOWN, // 1F5A4
+ COMMON, // 1F5A5..1F642
+ UNKNOWN, // 1F643..1F644
+ COMMON, // 1F645..1F6CF
+ UNKNOWN, // 1F6D0..1F6DF
+ COMMON, // 1F6E0..1F6EC
+ UNKNOWN, // 1F6ED..1F6EF
+ COMMON, // 1F6F0..1F6F3
+ UNKNOWN, // 1F6F4..1F6FF
+ COMMON, // 1F700..1F773
+ UNKNOWN, // 1F774..1F77F
+ COMMON, // 1F780..1F7D4
+ UNKNOWN, // 1F7D5..1F7FF
+ COMMON, // 1F800..1F80B
+ UNKNOWN, // 1F80C..1F80F
+ COMMON, // 1F810..1F847
+ UNKNOWN, // 1F848..1F84F
+ COMMON, // 1F850..1F859
+ UNKNOWN, // 1F85A..1F85F
+ COMMON, // 1F860..1F887
+ UNKNOWN, // 1F888..1F88F
+ COMMON, // 1F890..1F8AD
+ UNKNOWN, // 1F8AE..1FFFF
+ HAN, // 20000..2A6D6
+ UNKNOWN, // 2A6D7..2A6FF
+ HAN, // 2A700..2B734
+ UNKNOWN, // 2B735..2B73F
+ HAN, // 2B740..2B81D
+ UNKNOWN, // 2B81E..2F7FF
+ HAN, // 2F800..2FA1D
+ UNKNOWN, // 2FA1E..E0000
+ COMMON, // E0001
+ UNKNOWN, // E0002..E001F
+ COMMON, // E0020..E007F
+ UNKNOWN, // E0080..E00FF
+ INHERITED, // E0100..E01EF
+ UNKNOWN, // E01F0..10FFFF
};
private static HashMap
*
* Note: Libraries that perform a bidirectional algorithm and reorder strings
@@ -106,6 +99,7 @@
* See Also:
*
*
* Constant indicating that the base direction depends on the first strong
@@ -482,7 +592,7 @@
* is assumed to be visual LTR, and the text after reordering is required
* to be the corresponding logical string with appropriate contextual
* direction. The direction of the result string will be RTL if either
- * the righmost or leftmost strong character of the source text is RTL
+ * the rightmost or leftmost strong character of the source text is RTL
* or Arabic Letter, the direction will be LTR otherwise.
*
* If reordering option
*
@@ -508,7 +618,7 @@
* is assumed to be visual LTR, and the text after reordering is required
* to be the corresponding logical string with appropriate contextual
* direction. The direction of the result string will be RTL if either
- * the righmost or leftmost strong character of the source text is RTL
+ * the rightmost or leftmost strong character of the source text is RTL
* or Arabic Letter, or if the text contains no strong character;
* the direction will be LTR otherwise.
*
@@ -520,21 +630,21 @@
* @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL
* @stable ICU 3.8
*/
- public static final byte INTERNAL_LEVEL_DEFAULT_RTL = (byte)0x7f;
+ public static final byte LEVEL_DEFAULT_RTL = (byte)0x7f;
/**
* Maximum explicit embedding level.
* (The maximum resolved level can be up to As return value for This option does not imply corresponding adjustment of the index
+ * mappings. This option does not imply corresponding adjustment of the index
+ * mappings. This has the same effect as calling When the Note: calling this method after setting the reordering mode with
+ *
+ * the same value in such a case.
*
* The text can be composed of multiple paragraphs. Occurrence of a block
* separator in the text terminates a paragraph, and whatever comes next starts
@@ -2421,9 +3498,9 @@
* (same index) character if the level has the
*
*
* @param paragraph a paragraph of text with optional character and
* paragraph attribute information
@@ -2693,13 +3827,14 @@
byte paraLvl;
char ch = paragraph.first();
Boolean runDirection =
- (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION);
+ (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION);
Object shaper = paragraph.getAttribute(TextAttributeConstants.NUMERIC_SHAPING);
+
if (runDirection == null) {
- paraLvl = INTERNAL_LEVEL_DEFAULT_LTR;
+ paraLvl = LEVEL_DEFAULT_LTR;
} else {
paraLvl = (runDirection.equals(TextAttributeConstants.RUN_DIRECTION_LTR)) ?
- (byte)Bidi.DIRECTION_LEFT_TO_RIGHT : (byte)Bidi.DIRECTION_RIGHT_TO_LEFT;
+ LTR : RTL;
}
byte[] lvls = null;
@@ -2717,7 +3852,7 @@
/* no-op */
} else if (level < 0) {
lvls = embeddingLevels;
- embeddingLevels[i] = (byte)((0 - level) | INTERNAL_LEVEL_OVERRIDE);
+ embeddingLevels[i] = (byte)((0 - level) | LEVEL_OVERRIDE);
} else {
lvls = embeddingLevels;
embeddingLevels[i] = level;
@@ -2751,7 +3886,7 @@
* @see #setPara
* @stable ICU 3.8
*/
- private void orderParagraphsLTR(boolean ordarParaLTR) {
+ public void orderParagraphsLTR(boolean ordarParaLTR) {
orderParagraphsLTR = ordarParaLTR;
}
@@ -2771,7 +3906,7 @@
* @see #MIXED
* @stable ICU 3.8
*/
- private byte getDirection()
+ public byte getDirection()
{
verifyValidParaOrLine();
return direction;
@@ -2819,31 +3954,25 @@
}
/**
- * Get the index of a paragraph, given a position within the text.
+ * Retrieves the Bidi class for a given code point.
+ * If a
+ *
+ * Example:
+ *
+ * Note that in right-to-left runs, code like this places
+ * second surrogates before first ones (which is generally a bad idea)
+ * and combining characters before base characters.
+ *
+ * Use of
@@ -3031,19 +4234,10 @@
* Constant indicating that the base direction depends on the first strong
* directional character in the text according to the Unicode Bidirectional
* Algorithm. If no strong directional character is present, the base
- * direction is left-to-right.
- * @stable ICU 3.8
- */
- private static final int INTERNAL_DIRECTION_DEFAULT_LEFT_TO_RIGHT = 0x7e;
-
- /**
- * Constant indicating that the base direction depends on the first strong
- * directional character in the text according to the Unicode Bidirectional
- * Algorithm. If no strong directional character is present, the base
* direction is right-to-left.
* @stable ICU 3.8
*/
- private static final int INTERMAL_DIRECTION_DEFAULT_RIGHT_TO_LEFT = 0x7f;
+ public static final int DIRECTION_DEFAULT_RIGHT_TO_LEFT = LEVEL_DEFAULT_RTL;
/**
* Create Bidi from the given text, embedding, and direction information.
@@ -3080,27 +4274,27 @@
* @stable ICU 3.8
*/
public BidiBase(char[] text,
- int textStart,
- byte[] embeddings,
- int embStart,
- int paragraphLength,
- int flags)
- {
+ int textStart,
+ byte[] embeddings,
+ int embStart,
+ int paragraphLength,
+ int flags)
+ {
this(0, 0);
byte paraLvl;
switch (flags) {
case Bidi.DIRECTION_LEFT_TO_RIGHT:
default:
- paraLvl = Bidi.DIRECTION_LEFT_TO_RIGHT;
+ paraLvl = LTR;
break;
case Bidi.DIRECTION_RIGHT_TO_LEFT:
- paraLvl = Bidi.DIRECTION_RIGHT_TO_LEFT;
+ paraLvl = RTL;
break;
case Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT:
- paraLvl = INTERNAL_LEVEL_DEFAULT_LTR;
+ paraLvl = LEVEL_DEFAULT_LTR;
break;
case Bidi.DIRECTION_DEFAULT_RIGHT_TO_LEFT:
- paraLvl = INTERNAL_LEVEL_DEFAULT_RTL;
+ paraLvl = LEVEL_DEFAULT_RTL;
break;
}
byte[] paraEmbeddings;
@@ -3112,7 +4306,7 @@
for (int i = 0; i < paragraphLength; i++) {
lev = embeddings[i + embStart];
if (lev < 0) {
- lev = (byte)((- lev) | INTERNAL_LEVEL_OVERRIDE);
+ lev = (byte)((- lev) | LEVEL_OVERRIDE);
} else if (lev == 0) {
lev = paraLvl;
if (paraLvl > MAX_EXPLICIT_LEVEL) {
@@ -3122,13 +4316,10 @@
paraEmbeddings[i] = lev;
}
}
- if (textStart == 0 && embStart == 0 && paragraphLength == text.length) {
- setPara(text, paraLvl, paraEmbeddings);
- } else {
- char[] paraText = new char[paragraphLength];
- System.arraycopy(text, textStart, paraText, 0, paragraphLength);
- setPara(paraText, paraLvl, paraEmbeddings);
- }
+
+ char[] paraText = new char[paragraphLength];
+ System.arraycopy(text, textStart, paraText, 0, paragraphLength);
+ setPara(paraText, paraLvl, paraEmbeddings);
}
/**
@@ -3148,7 +4339,7 @@
}
/**
- * Return true if the line is all left-to-right text and the base direction
+ * Return true if the line is all left-to-right text and the base direction
* is left-to-right.
*
* @return true if the line is all left-to-right text and the base direction
@@ -3160,7 +4351,7 @@
*/
public boolean isLeftToRight()
{
- return (getDirection() == Bidi.DIRECTION_LEFT_TO_RIGHT && (paraLevel & 1) == 0);
+ return (getDirection() == LTR && (paraLevel & 1) == 0);
}
/**
@@ -3176,7 +4367,7 @@
*/
public boolean isRightToLeft()
{
- return (getDirection() == Bidi.DIRECTION_RIGHT_TO_LEFT && (paraLevel & 1) == 1);
+ return (getDirection() == RTL && (paraLevel & 1) == 1);
}
/**
@@ -3191,7 +4382,7 @@
*/
public boolean baseIsLeftToRight()
{
- return (getParaLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT);
+ return (getParaLevel() == LTR);
}
/**
@@ -3212,8 +4403,8 @@
/**
* Compute the logical to visual run mapping
*/
- private void getLogicalToVisualRunsMap()
- {
+ void getLogicalToVisualRunsMap()
+ {
if (isGoodLogicalToVisualRunsMap) {
return;
}
@@ -3231,9 +4422,8 @@
for (i = 0; i < count; i++) {
logicalToVisualRunsMap[i] = (int)(keys[i] & 0x00000000FFFFFFFF);
}
- keys = null;
isGoodLogicalToVisualRunsMap = true;
- }
+ }
/**
* Return the level of the nth logical run in this line.
@@ -3252,9 +4442,12 @@
{
verifyValidParaOrLine();
BidiLine.getRuns(this);
+
+ // for backward compatibility
if (run < 0 || run >= runCount) {
return getParaLevel();
}
+
getLogicalToVisualRunsMap();
return runs[logicalToVisualRunsMap[run]].level;
}
@@ -3277,12 +4470,14 @@
{
verifyValidParaOrLine();
BidiLine.getRuns(this);
+
+ // for backward compatibility
if (runCount == 1) {
return 0;
} else if (run == runCount) {
return length;
}
- verifyIndex(run, 0, runCount);
+
getLogicalToVisualRunsMap();
return runs[logicalToVisualRunsMap[run]].start;
}
@@ -3306,10 +4501,12 @@
{
verifyValidParaOrLine();
BidiLine.getRuns(this);
+
+ // for backward compatibility
if (runCount == 1) {
return length;
}
- verifyIndex(run, 0, runCount);
+
getLogicalToVisualRunsMap();
int idx = logicalToVisualRunsMap[run];
int len = idx == 0 ? runs[idx].limit :
@@ -3336,7 +4533,7 @@
int start,
int limit)
{
- final int RTLMask = (1 << Bidi.DIRECTION_RIGHT_TO_LEFT |
+ final int RTLMask = (1 << R |
1 << AL |
1 << RLE |
1 << RLO |
@@ -3346,6 +4543,7 @@
throw new IllegalArgumentException("Value start " + start +
" is out of range 0 to " + limit);
}
+
for (int i = start; i < limit; ++i) {
if (Character.isHighSurrogate(text[i]) && i < (limit-1) &&
Character.isLowSurrogate(text[i+1])) {
@@ -3356,6 +4554,7 @@
return true;
}
}
+
return false;
}
@@ -3382,8 +4581,9 @@
int objectStart,
int count)
{
+ // for backward compatibility
if (0 > levelStart || levels.length <= levelStart) {
- throw new IllegalArgumentException("Value levelStart " +
+ throw new IllegalArgumentException("Value levelStart " +
levelStart + " is out of range 0 to " +
(levels.length-1));
}
@@ -3397,6 +4597,7 @@
levelStart + " is out of range 0 to " +
(objects.length - objectStart));
}
+
byte[] reorderLevels = new byte[count];
System.arraycopy(levels, levelStart, reorderLevels, 0, count);
int[] indexMap = reorderVisual(reorderLevels);
@@ -3408,6 +4609,74 @@
}
/**
+ * Take a The text may have been aliased (only a reference was stored
+ * without copying the contents), thus it must not have been modified
+ * since the Get a value from a folding offset (from the value of a lead surrogate)
- * and a trail surrogate. If the
- * @param leadvalue value associated with the lead surrogate which contains
- * the folding offset
- * @param trail surrogate
- * @return trie data value associated with the trail character
- * @draft 2.1
- */
- public final char getTrailValue(int leadvalue, char trail)
- {
- if (m_dataManipulate_ == null) {
- throw new NullPointerException(
- "The field DataManipulate in this Trie is null");
- }
- int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
- if (offset > 0) {
- return m_data_[getRawOffset(offset,
- (char)(trail & SURROGATE_MASK_))];
- }
- return m_initialValue_;
- }
-
// protected methods -----------------------------------------------
/**
@@ -309,41 +162,14 @@
return -1;
}
- /**
- * Gets the value at the argument index.
- * For use internally in TrieIterator.
- * @param index value at index will be retrieved
- * @return 32 bit value
- * @see com.ibm.icu.impl.TrieIterator
- * @draft 2.1
- */
- protected final int getValue(int index)
- {
- return m_data_[index];
- }
-
- /**
- * Gets the default initial value
- * @return 32 bit value
- * @draft 2.1
- */
- protected final int getInitialValue()
- {
- return m_initialValue_;
- }
-
// private data members --------------------------------------------
/**
- * Default value
- */
+ * Default value
+ */
private char m_initialValue_;
/**
- * Array of char data
- */
- private char m_data_[];
- /**
- * Agent for friends
+ * Array of char data
*/
- private FriendAgent m_friendAgent_;
+ private char m_data_[];
}
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/CharacterIteratorWrapper.java 2015-07-13 16:11:47.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/CharacterIteratorWrapper.java 2015-07-13 16:11:47.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -45,7 +45,7 @@
* @author ram
*/
-public class CharacterIteratorWrapper extends UCharacterIterator {
+class CharacterIteratorWrapper extends UCharacterIterator {
private CharacterIterator iterator;
@@ -111,7 +111,6 @@
iterator.setIndex(index);
}
- //// for StringPrep
/**
* @see UCharacterIterator#getText(char[])
*/
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/ICUBinary.java 2015-07-13 16:11:48.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/ICUBinary.java 2015-07-13 16:11:48.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -25,25 +25,38 @@
/*
*******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
-import java.io.InputStream;
+import java.io.BufferedInputStream;
import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.file.FileSystems;
import java.util.Arrays;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+
+public final class ICUBinary {
+
+ private static final class IsAcceptable implements Authenticate {
+     /**
+      * Accepts data only when the first byte of the supplied version is 1.
+      *
+      * @param version version bytes; only element 0 is examined
+      * @return {@code true} if {@code version[0] == 1}
+      */
+     @Override
+     public boolean isDataVersionAcceptable(byte[] version) {
+         return version[0] == 1;
+     }
+ }
-public final class ICUBinary
-{
// public inner interface ------------------------------------------------
/**
@@ -63,53 +76,44 @@
// public methods --------------------------------------------------------
/**
- * ICU data header reader method.
- * Takes a ICU generated big-endian input stream, parse the ICU standard
- * file header and authenticates them.
- * Header format:
- *
- * Example of use: Assumes that the ByteBuffer position is 0 on input.
+ * The buffer byte order is set according to the data.
+ * The buffer position is advanced past the header (including UDataInfo and comment).
+ *
+ * See C++ ucmndata.h and unicode/udata.h.
+ *
+ * @return dataVersion
+ * @throws IOException if this is not a valid ICU data item of the expected dataFormat
+ */
+ public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate)
+         throws IOException {
+     assert bytes.position() == 0;
+     // Every fixed header field below is read with an absolute get at an
+     // index of at most 23. Reject a shorter buffer with the documented
+     // IOException instead of letting IndexOutOfBoundsException escape.
+     if (bytes.limit() < 24) {
+         throw new IOException(HEADER_AUTHENTICATION_FAILED_);
+     }
+     byte magic1 = bytes.get(2);
+     byte magic2 = bytes.get(3);
+     if (magic1 != MAGIC1 || magic2 != MAGIC2) {
+         throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
+     }
+
+     byte isBigEndian = bytes.get(8);
+     byte charsetFamily = bytes.get(9);
+     byte sizeofUChar = bytes.get(10);
+     if (isBigEndian < 0 || 1 < isBigEndian ||
+             charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) {
+         throw new IOException(HEADER_AUTHENTICATION_FAILED_);
+     }
+     // The byte order must be set before the two-byte size fields are read.
+     bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN);
+
+     int headerSize = bytes.getChar(0);
+     int sizeofUDataInfo = bytes.getChar(4);
+     // headerSize is later used as the new buffer position, so it must not
+     // exceed the limit (ByteBuffer.position would throw IllegalArgumentException).
+     if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4) ||
+             headerSize > bytes.limit()) {
+         throw new IOException("Internal Error: Header size error");
+     }
+     // TODO: Change Authenticate to take int major, int minor, int milli, int micro
+     // to avoid array allocation.
+     byte[] formatVersion = new byte[] {
+         bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19)
+     };
+     // Bytes 12..15 must equal the expected dataFormat, most significant byte first.
+     if (bytes.get(12) != (byte)(dataFormat >> 24) ||
+             bytes.get(13) != (byte)(dataFormat >> 16) ||
+             bytes.get(14) != (byte)(dataFormat >> 8) ||
+             bytes.get(15) != (byte)dataFormat ||
+             (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) {
+         throw new IOException(HEADER_AUTHENTICATION_FAILED_ +
+             String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d",
+                 bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15),
+                 formatVersion[0] & 0xff, formatVersion[1] & 0xff,
+                 formatVersion[2] & 0xff, formatVersion[3] & 0xff));
+     }
+
+     // Leave the buffer positioned just past the header.
+     bytes.position(headerSize);
+     return  // dataVersion
+         ((int)bytes.get(20) << 24) |
+         ((bytes.get(21) & 0xff) << 16) |
+         ((bytes.get(22) & 0xff) << 8) |
+         (bytes.get(23) & 0xff);
+ }
+
+ /**
+  * Advances the buffer position by {@code skipLength} bytes.
+  * A zero or negative {@code skipLength} leaves the position unchanged.
+  */
+ public static void skipBytes(ByteBuffer bytes, int skipLength) {
+     if (skipLength <= 0) {
+         return;
+     }
+     final int target = bytes.position() + skipLength;
+     bytes.position(target);
+ }
+
+ /**
+  * Unpacks a compact version integer into a VersionInfo, taking one byte
+  * per field starting with the most significant byte.
+  */
+ public static VersionInfo getVersionInfoFromCompactInt(int version) {
+     final int major = version >>> 24;
+     final int minor = (version >> 16) & 0xff;
+     final int milli = (version >> 8) & 0xff;
+     final int micro = version & 0xff;
+     return VersionInfo.getInstance(major, minor, milli, micro);
+ }
+
// private variables -------------------------------------------------
/**
@@ -175,7 +253,6 @@
/**
* File format authentication values
*/
- private static final byte BIG_ENDIAN_ = 1;
private static final byte CHAR_SET_ = 0;
private static final byte CHAR_SIZE_ = 2;
@@ -183,7 +260,7 @@
* Error messages
*/
private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ =
- "ICU data file error: Not an ICU data file";
+ "ICUBinary data file error: Magic number authentication failed";
private static final String HEADER_AUTHENTICATION_FAILED_ =
- "ICU data file error: Header authentication failed, please check if you have a valid ICU data file";
+ "ICUBinary data file error: Header authentication failed";
}
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java 2015-07-13 16:11:49.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java 2015-07-13 16:11:49.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,18 +22,13 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 2000-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
-
package sun.text.normalizer;
import java.text.CharacterIterator;
@@ -125,8 +120,8 @@
*
* normalize(FCD) may be implemented with NFD.
*
- * For more details on FCD see the collation design document:
- * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm
+ * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
+ * http://www.unicode.org/notes/tn5/#FCD
*
* ICU collation performs either NFD or FCD normalization automatically if
* normalization is turned on for the collator object. Beyond collation and
@@ -138,26 +133,88 @@
* often do not encode any combining marks by themselves. For conversion to such
* character encodings the Unicode text needs to be normalized to NFC.
* For more usage examples, see the Unicode Standard Annex.
+ *
+ * Note: The Normalizer class also provides API for iterative normalization.
+ * While the setIndex() and getIndex() refer to indices in the
+ * underlying Unicode input text, the next() and previous() methods
+ * iterate through characters in the normalized output.
+ * This means that there is not necessarily a one-to-one correspondence
+ * between characters returned by next() and previous() and the indices
+ * passed to and returned from setIndex() and getIndex().
+ * It is for this reason that Normalizer does not implement the CharacterIterator interface.
+ *
* @stable ICU 2.8
*/
-
+// Original filename in ICU4J: Normalizer.java
public final class NormalizerBase implements Cloneable {
- //-------------------------------------------------------------------------
- // Private data
- //-------------------------------------------------------------------------
- private char[] buffer = new char[100];
- private int bufferStart = 0;
- private int bufferPos = 0;
- private int bufferLimit = 0;
-
// The input text and our position in it
private UCharacterIterator text;
- private Mode mode = NFC;
- private int options = 0;
+ private Normalizer2 norm2;
+ private Mode mode;
+ private int options;
+
+ // The normalization buffer is the result of normalization
+ // of the source in [currentIndex..nextIndex] .
private int currentIndex;
private int nextIndex;
+ // A buffer for holding intermediate results
+ private StringBuilder buffer;
+ private int bufferPos;
+
+ // Helper classes to defer loading of normalization data.
+ private static final class ModeImpl {
+     // The normalizer this holder hands out; fixed at construction.
+     private final Normalizer2 normalizer2;
+
+     private ModeImpl(Normalizer2 n2) {
+         this.normalizer2 = n2;
+     }
+ }
+
+ // Holder: INSTANCE wraps Normalizer2.getNFDInstance() and is created when
+ // this nested class is initialized.
+ private static final class NFDModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
+ }
+
+ // Holder: INSTANCE wraps Normalizer2.getNFKDInstance().
+ private static final class NFKDModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
+ }
+
+ // Holder: INSTANCE wraps Normalizer2.getNFCInstance().
+ private static final class NFCModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
+ }
+
+ // Holder: INSTANCE wraps Normalizer2.getNFKCInstance().
+ private static final class NFKCModeImpl {
+ private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
+ }
+
+ // Holder for a frozen UnicodeSet built from the pattern "[:age=3.2:]".
+ // NOTE(review): presumably the code points assigned as of Unicode 3.2 --
+ // confirm against UnicodeSet's pattern syntax.
+ private static final class Unicode32 {
+ private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
+ }
+
+ // Holder: the NFD normalizer wrapped in a FilteredNormalizer2 together
+ // with Unicode32.INSTANCE.
+ private static final class NFD32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
+ Unicode32.INSTANCE));
+ }
+
+ // Holder: the NFKD normalizer wrapped in a FilteredNormalizer2 together
+ // with Unicode32.INSTANCE.
+ private static final class NFKD32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
+ Unicode32.INSTANCE));
+ }
+
+ // Holder: the NFC normalizer wrapped in a FilteredNormalizer2 together
+ // with Unicode32.INSTANCE.
+ private static final class NFC32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
+ Unicode32.INSTANCE));
+ }
+
+ // Holder: the NFKC normalizer wrapped in a FilteredNormalizer2 together
+ // with Unicode32.INSTANCE.
+ private static final class NFKC32ModeImpl {
+ private static final ModeImpl INSTANCE =
+ new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
+ Unicode32.INSTANCE));
+ }
+
+
/**
* Options bit set value to select Unicode 3.2 normalization
* (except NormalizationCorrections).
@@ -166,6 +223,17 @@
*/
public static final int UNICODE_3_2=0x20;
+ public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2;
+
+ /*
+ * Default option for the latest Unicode normalization. This option is
+ * provided mainly for testing.
+ * The value zero means that normalization is done with the fixes for
+ * - Corrigendum 4 (Five CJK Canonical Mapping Errors)
+ * - Corrigendum 5 (Normalization Idempotency)
+ */
+ public static final int UNICODE_LATEST = 0x00;
+
/**
* Constant indicating that the end of the iteration has been reached.
* This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
@@ -175,392 +243,120 @@
/**
* Constants for normalization modes.
+ *
+ * The Mode class is not intended for public subclassing.
+ * Only the Mode constants provided by the Normalizer class should be used,
+ * and any fields or methods should not be called or overridden by users.
* @stable ICU 2.8
*/
- public static class Mode {
- private int modeValue;
- private Mode(int value) {
- modeValue = value;
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.6
- */
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- int srcLen = (srcLimit - srcStart);
- int destLen = (destLimit - destStart);
- if( srcLen > destLen ) {
- return srcLen;
- }
- System.arraycopy(src,srcStart,dest,destStart,srcLen);
- return srcLen;
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.6
- */
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- int options) {
- return normalize( src, srcStart, srcLimit,
- dest,destStart,destLimit,
- NormalizerImpl.getNX(options)
- );
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.6
- */
- protected String normalize(String src, int options) {
- return src;
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.8
- */
- protected int getMinC() {
- return -1;
- }
-
- /**
- * This method is used for method dispatch
- * @stable ICU 2.8
- */
- protected int getMask() {
- return -1;
- }
+ public static abstract class Mode {
/**
- * This method is used for method dispatch
- * @stable ICU 2.8
+ * Sole constructor
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- protected IsPrevBoundary getPrevBoundary() {
- return null;
+ @Deprecated
+ protected Mode() {
}
/**
- * This method is used for method dispatch
- * @stable ICU 2.8
+ * @internal
+ * @deprecated This API is ICU internal only.
*/
- protected IsNextBoundary getNextBoundary() {
- return null;
- }
+ @Deprecated
+ protected abstract Normalizer2 getNormalizer2(int options);
+ }
- /**
- * This method is used for method dispatch
- * @stable ICU 2.6
- */
- protected QuickCheckResult quickCheck(char[] src,int start, int limit,
- boolean allowMaybe,UnicodeSet nx) {
- if(allowMaybe) {
- return MAYBE;
- }
- return NO;
+ private static Mode toMode(Normalizer.Form form) {
+ switch (form) {
+ case NFC :
+ return NFC;
+ case NFD :
+ return NFD;
+ case NFKC :
+ return NFKC;
+ case NFKD :
+ return NFKD;
}
- /**
- * This method is used for method dispatch
- * @stable ICU 2.8
- */
- protected boolean isNFSkippable(int c) {
- return true;
- }
+ throw new IllegalArgumentException("Unexpected normalization form: " +
+ form);
}
- /**
- * No decomposition/composition.
- * @stable ICU 2.8
- */
- public static final Mode NONE = new Mode(1);
-
- /**
- * Canonical decomposition.
- * @stable ICU 2.8
- */
- public static final Mode NFD = new NFDMode(2);
+ private static final class NONEMode extends Mode {
+     // Always hands back the shared no-op normalizer; options is ignored.
+     protected Normalizer2 getNormalizer2(int options) {
+         return Norm2AllModes.NOOP_NORMALIZER2;
+     }
+ }
private static final class NFDMode extends Mode {
- private NFDMode(int value) {
- super(value);
- }
-
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- int[] trailCC = new int[1];
- return NormalizerImpl.decompose(src, srcStart,srcLimit,
- dest, destStart,destLimit,
- false, trailCC,nx);
- }
-
- protected String normalize( String src, int options) {
- return decompose(src,false,options);
- }
-
- protected int getMinC() {
- return NormalizerImpl.MIN_WITH_LEAD_CC;
- }
-
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevNFDSafe();
- }
-
- protected IsNextBoundary getNextBoundary() {
- return new IsNextNFDSafe();
- }
-
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD);
- }
-
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src, start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE
- ),
- NormalizerImpl.QC_NFD,
- 0,
- allowMaybe,
- nx
- );
- }
-
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c,this,
- (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD)
- );
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFD32ModeImpl.INSTANCE.normalizer2 :
+ NFDModeImpl.INSTANCE.normalizer2;
}
}
- /**
- * Compatibility decomposition.
- * @stable ICU 2.8
- */
- public static final Mode NFKD = new NFKDMode(3);
-
private static final class NFKDMode extends Mode {
- private NFKDMode(int value) {
- super(value);
- }
-
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- int[] trailCC = new int[1];
- return NormalizerImpl.decompose(src, srcStart,srcLimit,
- dest, destStart,destLimit,
- true, trailCC, nx);
- }
-
- protected String normalize( String src, int options) {
- return decompose(src,true,options);
- }
-
- protected int getMinC() {
- return NormalizerImpl.MIN_WITH_LEAD_CC;
- }
-
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevNFDSafe();
- }
-
- protected IsNextBoundary getNextBoundary() {
- return new IsNextNFDSafe();
- }
-
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD);
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFKD32ModeImpl.INSTANCE.normalizer2 :
+ NFKDModeImpl.INSTANCE.normalizer2;
}
+ }
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src,start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE
- ),
- NormalizerImpl.QC_NFKD,
- NormalizerImpl.OPTIONS_COMPAT,
- allowMaybe,
- nx
- );
+ private static final class NFCMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFC32ModeImpl.INSTANCE.normalizer2 :
+ NFCModeImpl.INSTANCE.normalizer2;
}
+ }
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c, this,
- (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD)
- );
+ private static final class NFKCMode extends Mode {
+ protected Normalizer2 getNormalizer2(int options) {
+ return (options&UNICODE_3_2) != 0 ?
+ NFKC32ModeImpl.INSTANCE.normalizer2 :
+ NFKCModeImpl.INSTANCE.normalizer2;
}
}
/**
- * Canonical decomposition followed by canonical composition.
+ * No decomposition/composition.
* @stable ICU 2.8
*/
- public static final Mode NFC = new NFCMode(4);
-
- private static final class NFCMode extends Mode{
- private NFCMode(int value) {
- super(value);
- }
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- return NormalizerImpl.compose( src, srcStart, srcLimit,
- dest,destStart,destLimit,
- 0, nx);
- }
-
- protected String normalize( String src, int options) {
- return compose(src, false, options);
- }
-
- protected int getMinC() {
- return NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
- );
- }
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevTrueStarter();
- }
- protected IsNextBoundary getNextBoundary() {
- return new IsNextTrueStarter();
- }
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC);
- }
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src,start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE
- ),
- NormalizerImpl.QC_NFC,
- 0,
- allowMaybe,
- nx
- );
- }
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c,this,
- ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
- (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO)
- )
- );
- }
- };
+ public static final Mode NONE = new NONEMode();
/**
- * Compatibility decomposition followed by canonical composition.
+ * Canonical decomposition.
* @stable ICU 2.8
*/
- public static final Mode NFKC =new NFKCMode(5);
-
- private static final class NFKCMode extends Mode{
- private NFKCMode(int value) {
- super(value);
- }
- protected int normalize(char[] src, int srcStart, int srcLimit,
- char[] dest,int destStart,int destLimit,
- UnicodeSet nx) {
- return NormalizerImpl.compose(src, srcStart,srcLimit,
- dest, destStart,destLimit,
- NormalizerImpl.OPTIONS_COMPAT, nx);
- }
-
- protected String normalize( String src, int options) {
- return compose(src, true, options);
- }
- protected int getMinC() {
- return NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
- );
- }
- protected IsPrevBoundary getPrevBoundary() {
- return new IsPrevTrueStarter();
- }
- protected IsNextBoundary getNextBoundary() {
- return new IsNextTrueStarter();
- }
- protected int getMask() {
- return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC);
- }
- protected QuickCheckResult quickCheck(char[] src,int start,
- int limit,boolean allowMaybe,
- UnicodeSet nx) {
- return NormalizerImpl.quickCheck(
- src,start,limit,
- NormalizerImpl.getFromIndexesArr(
- NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE
- ),
- NormalizerImpl.QC_NFKC,
- NormalizerImpl.OPTIONS_COMPAT,
- allowMaybe,
- nx
- );
- }
- protected boolean isNFSkippable(int c) {
- return NormalizerImpl.isNFSkippable(c, this,
- ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY|
- (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO)
- )
- );
- }
- };
+ public static final Mode NFD = new NFDMode();
/**
- * Result values for quickCheck().
- * For details see Unicode Technical Report 15.
- * @stable ICU 2.8
- */
- public static final class QuickCheckResult{
- private int resultValue;
- private QuickCheckResult(int value) {
- resultValue=value;
- }
- }
- /**
- * Indicates that string is not in the normalized format
+ * Compatibility decomposition.
* @stable ICU 2.8
*/
- public static final QuickCheckResult NO = new QuickCheckResult(0);
+ public static final Mode NFKD = new NFKDMode();
/**
- * Indicates that string is in the normalized format
+ * Canonical decomposition followed by canonical composition.
* @stable ICU 2.8
*/
- public static final QuickCheckResult YES = new QuickCheckResult(1);
+ public static final Mode NFC = new NFCMode();
- /**
- * Indicates it cannot be determined if string is in the normalized
- * format without further thorough checks.
- * @stable ICU 2.8
- */
- public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
+ public static final Mode NFKC =new NFKCMode();
//-------------------------------------------------------------------------
- // Constructors
+ // Iterator constructors
//-------------------------------------------------------------------------
/**
- * Creates a new {@code Normalizer} object for iterating over the
+ * Creates a new {@code NormalizerBase} object for iterating over the
* normalized form of a given string.
*
* The {@code options} parameter specifies which optional
- * {@code Normalizer} features are to be enabled for this object.
- *
+ * {@code NormalizerBase} features are to be enabled for this object.
+ *
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
@@ -576,25 +372,19 @@
this.text = UCharacterIterator.getInstance(str);
this.mode = mode;
this.options=opt;
+ norm2 = mode.getNormalizer2(opt);
+ buffer = new StringBuilder();
}
- /**
- * Creates a new {@code Normalizer} object for iterating over the
- * normalized form of the given text.
- *
- * @param iter The input text to be normalized. The normalization
- * will start at the beginning of the string.
- *
- * @param mode The normalization mode.
- */
- public NormalizerBase(CharacterIterator iter, Mode mode) {
- this(iter, mode, UNICODE_LATEST);
+ public NormalizerBase(String str, Mode mode) {
+ this(str, mode, 0);
}
+
/**
- * Creates a new {@code Normalizer} object for iterating over the
+ * Creates a new {@code NormalizerBase} object for iterating over the
* normalized form of the given text.
- *
+ *
* @param iter The input text to be normalized. The normalization
* will start at the beginning of the string.
*
@@ -607,15 +397,19 @@
* @stable ICU 2.6
*/
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
- this.text = UCharacterIterator.getInstance(
- (CharacterIterator)iter.clone()
- );
+ this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone());
this.mode = mode;
this.options = opt;
+ norm2 = mode.getNormalizer2(opt);
+ buffer = new StringBuilder();
+ }
+
+ public NormalizerBase(CharacterIterator iter, Mode mode) {
+ this(iter, mode, 0);
}
/**
- * Clones this {@code Normalizer} object. All properties of this
+ * Clones this {@code NormalizerBase} object. All properties of this
* object are duplicated in the new object, including the cloning of any
* {@link CharacterIterator} that was passed in to the constructor
* or to {@link #setText(CharacterIterator) setText}.
@@ -628,11 +422,13 @@
try {
NormalizerBase copy = (NormalizerBase) super.clone();
copy.text = (UCharacterIterator) text.clone();
- //clone the internal buffer
- if (buffer != null) {
- copy.buffer = new char[buffer.length];
- System.arraycopy(buffer,0,copy.buffer,0,buffer.length);
- }
+ copy.mode = mode;
+ copy.options = options;
+ copy.norm2 = norm2;
+ copy.buffer = new StringBuilder(buffer);
+ copy.bufferPos = bufferPos;
+ copy.currentIndex = currentIndex;
+ copy.nextIndex = nextIndex;
return copy;
}
catch (CloneNotSupportedException e) {
@@ -640,150 +436,60 @@
}
}
- //--------------------------------------------------------------------------
- // Static Utility methods
- //--------------------------------------------------------------------------
-
/**
- * Compose a string.
- * The string will be composed according to the specified mode.
- * @param str The string to compose.
- * @param compat If true the string will be composed according to
- * NFKC rules and if false will be composed according to
- * NFC rules.
- * @param options The only recognized option is UNICODE_3_2
- * @return String The composed string
+ * Normalizes a {@code String} using the given normalization operation.
+ *
+ * The {@code options} parameter specifies which optional
+ * {@code NormalizerBase} features are to be enabled for this operation.
+ * Currently the only available option is {@link #UNICODE_3_2}.
+ * If you want the default behavior corresponding to one of the standard
+ * Unicode Normalization Forms, use 0 for this argument.
+ *
+ * @param str the input string to be normalized.
+ * @param mode the normalization mode
+ * @param options the optional features to be enabled.
+ * @return String the normalized string
* @stable ICU 2.6
*/
- public static String compose(String str, boolean compat, int options) {
-
- char[] dest, src;
- if (options == UNICODE_3_2_0_ORIGINAL) {
- String mappedStr = NormalizerImpl.convert(str);
- dest = new char[mappedStr.length()*MAX_BUF_SIZE_COMPOSE];
- src = mappedStr.toCharArray();
- } else {
- dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE];
- src = str.toCharArray();
- }
- int destSize=0;
-
- UnicodeSet nx = NormalizerImpl.getNX(options);
-
- /* reset options bits that should only be set here or inside compose() */
- options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS);
-
- if(compat) {
- options|=NormalizerImpl.OPTIONS_COMPAT;
- }
-
- for(;;) {
- destSize=NormalizerImpl.compose(src,0,src.length,
- dest,0,dest.length,options,
- nx);
- if(destSize<=dest.length) {
- return new String(dest,0,destSize);
- } else {
- dest = new char[destSize];
- }
- }
+ public static String normalize(String str, Mode mode, int options) {
+ return mode.getNormalizer2(options).normalize(str);
}
- private static final int MAX_BUF_SIZE_COMPOSE = 2;
- private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
+ public static String normalize(String str, Normalizer.Form form) {
+ return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST);
+ }
- /**
- * Decompose a string.
- * The string will be decomposed according to the specified mode.
- * @param str The string to decompose.
- * @param compat If true the string will be decomposed according to NFKD
- * rules and if false will be decomposed according to NFD
- * rules.
- * @return String The decomposed string
- * @stable ICU 2.8
- */
- public static String decompose(String str, boolean compat) {
- return decompose(str,compat,UNICODE_LATEST);
+ public static String normalize(String str, Normalizer.Form form, int options) {
+ return NormalizerBase.normalize(str, toMode(form), options);
}
/**
- * Decompose a string.
- * The string will be decomposed according to the specified mode.
- * @param str The string to decompose.
- * @param compat If true the string will be decomposed according to NFKD
- * rules and if false will be decomposed according to NFD
- * rules.
- * @param options The normalization options, ORed together (0 for no options).
- * @return String The decomposed string
+ * Test if a string is in a given normalization form.
+ * This is semantically equivalent to source.equals(normalize(source, mode)).
+ *
+ * Unlike quickCheck(), this function returns a definitive result,
+ * never a "maybe".
+ * For NFD, NFKD, and FCD, both functions work exactly the same.
+ * For NFC and NFKC where quickCheck may return "maybe", this function will
+ * perform further tests to arrive at a true/false result.
+ * @param str the input string to be checked to see if it is
+ * normalized
+ * @param mode the normalization mode
+ * @param options Options for use with exclusion set and tailored Normalization
+ * The only option that is currently recognized is UNICODE_3_2
+ * @see #isNormalized
* @stable ICU 2.6
*/
- public static String decompose(String str, boolean compat, int options) {
-
- int[] trailCC = new int[1];
- int destSize=0;
- UnicodeSet nx = NormalizerImpl.getNX(options);
- char[] dest;
-
- if (options == UNICODE_3_2_0_ORIGINAL) {
- String mappedStr = NormalizerImpl.convert(str);
- dest = new char[mappedStr.length()*MAX_BUF_SIZE_DECOMPOSE];
-
- for(;;) {
- destSize=NormalizerImpl.decompose(mappedStr.toCharArray(),0,mappedStr.length(),
- dest,0,dest.length,
- compat,trailCC, nx);
- if(destSize<=dest.length) {
- return new String(dest,0,destSize);
- } else {
- dest = new char[destSize];
- }
- }
- } else {
- dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE];
-
- for(;;) {
- destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(),
- dest,0,dest.length,
- compat,trailCC, nx);
- if(destSize<=dest.length) {
- return new String(dest,0,destSize);
- } else {
- dest = new char[destSize];
- }
- }
- }
+ public static boolean isNormalized(String str, Mode mode, int options) {
+ return mode.getNormalizer2(options).isNormalized(str);
}
- /**
- * Normalize a string.
- * The string will be normalized according to the specified normalization
- * mode and options.
- * @param src The char array to compose.
- * @param srcStart Start index of the source
- * @param srcLimit Limit index of the source
- * @param dest The char buffer to fill in
- * @param destStart Start index of the destination buffer
- * @param destLimit End index of the destination buffer
- * @param mode The normalization mode; one of Normalizer.NONE,
- * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
- * Normalizer.NFKD, Normalizer.DEFAULT
- * @param options The normalization options, ORed together (0 for no options).
- * @return int The total buffer size needed;if greater than length of
- * result, the output was truncated.
- * @exception IndexOutOfBoundsException if the target capacity is
- * less than the required length
- * @stable ICU 2.6
- */
- public static int normalize(char[] src,int srcStart, int srcLimit,
- char[] dest,int destStart, int destLimit,
- Mode mode, int options) {
- int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options);
+ public static boolean isNormalized(String str, Normalizer.Form form) {
+ return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST);
+ }
- if(length<=(destLimit-destStart)) {
- return length;
- } else {
- throw new IndexOutOfBoundsException(Integer.toString(length));
- }
+ public static boolean isNormalized(String str, Normalizer.Form form, int options) {
+ return NormalizerBase.isNormalized(str, toMode(form), options);
}
//-------------------------------------------------------------------------
@@ -796,8 +502,8 @@
* @stable ICU 2.8
*/
public int current() {
- if(bufferPos
* @param index the desired index in the input text.
*
* @return the first normalized character that is the result of iterating
@@ -882,11 +587,9 @@
*
* @throws IllegalArgumentException if the given index is less than
* {@link #getBeginIndex} or greater than {@link #getEndIndex}.
- * @return The codepoint as an int
- * @deprecated ICU 3.2
+ * deprecated ICU 3.2
* @obsolete ICU 3.2
*/
- @Deprecated
public int setIndex(int index) {
setIndexOnly(index);
return current();
@@ -895,7 +598,7 @@
/**
* Retrieve the index of the start of the input text. This is the begin
* index of the {@code CharacterIterator} or the start (i.e. 0) of the
- * {@code String} over which this {@code Normalizer} is iterating
+ * {@code String} over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use startIndex() instead.
* @return The codepoint as an int
* @see #startIndex
@@ -908,7 +611,7 @@
/**
* Retrieve the index of the end of the input text. This is the end index
* of the {@code CharacterIterator} or the length of the {@code String}
- * over which this {@code Normalizer} is iterating
+ * over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use endIndex() instead.
* @return The codepoint as an int
* @see #endIndex
@@ -934,7 +637,7 @@
* @stable ICU 2.8
*/
public int getIndex() {
- if(bufferPos
+ * If dest is a StringBuilder, then the buffer writes directly to it.
+ * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
+ * until no further changes are necessary and whole segments are appended.
+ * append() methods that take combining-class values always write to the StringBuilder.
+ * Other append() methods flush and append to the Appendable.
+ */
+ public static final class ReorderingBuffer implements Appendable {
+ public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) {
+ impl=ni;
+ app=dest;
+ if (app instanceof StringBuilder) {
+ appIsStringBuilder=true;
+ str=(StringBuilder)dest;
+ // In Java, the constructor subsumes public void init(int destCapacity)
+ str.ensureCapacity(destCapacity);
+ reorderStart=0;
+ if(str.length()==0) {
+ lastCC=0;
} else {
- norm32=0;
- c2=0;
- }
- }else{
- c2=0;
- }
- if(nx_contains(nx, c, c2)) {
- /* excluded: norm32==0 */
- norm32=0;
- }
-
- // check the combining order
- cc=(char)((norm32>>CC_SHIFT)&0xFF);
- if(cc!=0 && cc 1) {
+ prevBoundary=p;
+ }
+ }
+ if(buffer!=null) {
+ // The last lccc==0 character is excluded from the
+ // flush-and-append call in case it needs to be modified.
+ buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
+ buffer.append(s, prevBoundary, src);
+ }
+ // The start of the current character (c).
+ prevSrc=src;
+ } else if(src==limit) {
+ break;
+ }
- if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){
- return true; /* NF*D, passed (a)..(c), is skippable */
+ src+=Character.charCount(c);
+ // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
+ // Check for proper order, and decompose locally if necessary.
+ if((prevFCD16&0xff)<=(fcd16>>8)) {
+ // proper order: prev tccc <= current lccc
+ if((fcd16&0xff)<=1) {
+ prevBoundary=src;
+ }
+ if(buffer!=null) {
+ buffer.appendZeroCC(c);
+ }
+ prevFCD16=fcd16;
+ continue;
+ } else if(buffer==null) {
+ return prevBoundary; // quick check "no"
+ } else {
+ /*
+ * Back out the part of the source that we copied or appended
+ * already but is now going to be decomposed.
+ * prevSrc is set to after what was copied/appended.
+ */
+ buffer.removeSuffix(prevSrc-prevBoundary);
+ /*
+ * Find the part of the source that needs to be decomposed,
+ * up to the next safe boundary.
+ */
+ src=findNextFCDBoundary(s, src, limit);
+ /*
+ * The source text does not fulfill the conditions for FCD.
+ * Decompose and reorder a limited piece of the text.
+ */
+ decomposeShort(s, prevBoundary, src, buffer);
+ prevBoundary=src;
+ prevFCD16=0;
+ }
}
- /* check conditions (a)..(e), see unormimp.h */
+ return src;
+ }
- /* NF*C/FCC, passed (a)..(e) */
- if((norm32& QC_NFD)==0) {
- return true; /* no canonical decomposition, is skippable */
+ // Note: hasDecompBoundary() could be implemented as aliases to
+ // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
+ // at the cost of building the FCD trie for a decomposition normalizer.
+ public boolean hasDecompBoundary(int c, boolean before) {
+ for(;;) {
+ if(c The compositions list has (trail, compositeAndFwd) pair entries,
+ * encoded as either pairs or triples of 16-bit units.
+ * The last entry has the high bit of its first unit set.
+ *
+ * The list is sorted by ascending trail characters (there are no duplicates).
+ * A linear search is used.
+ *
+ * See normalizer2impl.h for a more detailed description
+ * of the compositions list format.
+ */
+ private static int combine(String compositions, int list, int trail) {
+ int key1, firstUnit;
+ if(trail Trie options field. options bit field: Trie options field. options bit field:
- * The UCharacter class provides extensions to the
- *
+ * The UCharacter class provides extensions to the
+ *
* java.lang.Character class. These extensions provide support for
* more Unicode properties and together with the UTF16
* class, provide support for supplementary characters (those with code
* points above U+FFFF).
* Each ICU release supports the latest version of Unicode available at that time.
- *
- * Code points are represented in these API using ints. While it would be
+ *
+ * Code points are represented in these API using ints. While it would be
* more convenient in Java to have a separate primitive datatype for them,
* ints suffice in the meantime.
- *
- * To use this class please add the jar file name icu4j.jar to the
+ *
+ * To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.
- * Aside from the additions for UTF-16 support, and the updated Unicode
+ *
+ * Aside from the additions for UTF-16 support, and the updated Unicode
* properties, the main differences between UCharacter and Character are:
*
- * Further detail differences can be determined from the program
- *
+ * Further detail on differences can be determined using the program
+ *
* com.ibm.icu.dev.test.lang.UCharacterCompare
*
@@ -103,8 +93,11 @@
*
* For more information see
- * "About the Unicode Character Database" (http://www.unicode.org/ucd/)
- * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html).
+ * "About the Unicode Character Database"
+ * (http://www.unicode.org/ucd/)
+ * and the ICU
+ * User Guide chapter on Properties
+ * (http://www.icu-project.org/userguide/properties.html).
*
* There are also functions that provide easy migration from C/POSIX functions
@@ -128,12 +121,15 @@
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
*
* API access for C/POSIX character classes is as follows:
+ *
* The C/POSIX character classes are also available in UnicodeSet patterns,
* using patterns like [:graph:] or \p{graph}.
*
- * Note: There are several ICU (and Java) whitespace functions.
- * Comparison:
- * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
+ *
+ * There are several ICU (and Java) whitespace functions.
+ * Comparison:
- * This class is not subclassable
+ * This class is not subclassable.
* ;
;
,
,
, expressed as character values in hex. If there is more than one character,
+# they are separated by spaces. Other than as used to separate elements, spaces are
+# to be ignored.
#
# The
for internal
* storage. The contents of normalize
which transforms Unicode
--- old/jdk/src/java.base/share/classes/sun/net/idn/StringPrep.java 2015-07-13 16:11:42.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/net/idn/StringPrep.java 2015-07-13 16:11:42.000000000 +0900
@@ -50,7 +50,6 @@
import sun.text.Normalizer;
import sun.text.normalizer.CharTrie;
import sun.text.normalizer.Trie;
-import sun.text.normalizer.NormalizerImpl;
import sun.text.normalizer.VersionInfo;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UCharacterIterator;
@@ -227,7 +226,7 @@
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
- VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion();
+ VersionInfo normUniVer = UCharacter.getUnicodeVersion();
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
@@ -354,7 +353,7 @@
Normalizer.normalize(
src.toString(),
java.text.Normalizer.Form.NFKC,
- Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
+ Normalizer.UNICODE_3_2));
}
/*
boolean isLabelSeparator(int ch){
--- old/jdk/src/java.base/share/classes/sun/text/ComposedCharIter.java 2015-07-13 16:11:43.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/ComposedCharIter.java 2015-07-13 16:11:43.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2001, 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -43,7 +43,7 @@
private static int decompNum;
static {
- int maxNum = 2000; //TBD: Unicode 4.0 only has 1926 canoDecomp...
+ int maxNum = 2100;
chars = new int[maxNum];
decomps = new String[maxNum];
decompNum = NormalizerImpl.getDecompose(chars, decomps);
--- old/jdk/src/java.base/share/classes/sun/text/Normalizer.java 2015-07-13 16:11:44.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/Normalizer.java 2015-07-13 16:11:43.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -26,7 +26,7 @@
package sun.text;
import sun.text.normalizer.NormalizerBase;
-import sun.text.normalizer.NormalizerImpl;
+import sun.text.normalizer.UCharacter;
/**
* This Normalizer is for Unicode 3.2 support for IDNA only.
@@ -93,6 +93,6 @@
* @return combining class of the given character
*/
public static final int getCombiningClass(int ch) {
- return NormalizerImpl.getCombiningClass(ch);
+ return UCharacter.getCombiningClass(ch);
}
}
--- old/jdk/src/java.base/share/classes/sun/text/bidi/BidiBase.java 2015-07-13 16:11:44.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/bidi/BidiBase.java 2015-07-13 16:11:44.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,17 +22,13 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
+*******************************************************************************
+* Copyright (C) 2001-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
/* FOOD FOR THOUGHT: currently the reordering modes are a mixture of
* algorithm for direct BiDi, algorithm for inverse Bidi and the bizarre
@@ -52,12 +48,10 @@
package sun.text.bidi;
-import java.io.IOException;
import java.lang.reflect.Array;
import java.text.AttributedCharacterIterator;
import java.text.Bidi;
import java.util.Arrays;
-import java.util.MissingResourceException;
import sun.misc.JavaAWTFontAccess;
import sun.misc.SharedSecrets;
import sun.text.normalizer.UBiDiProps;
@@ -68,10 +62,9 @@
*
* Bidi algorithm for ICU
*
- * This is an implementation of the Unicode Bidirectional algorithm. The
+ * This is an implementation of the Unicode Bidirectional Algorithm. The
* algorithm is defined in the Unicode Standard Annex #9,
- * version 13, also described in The Unicode Standard, Version 4.0 .
+ * href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9.
* Basic concept: levels
@@ -167,6 +161,7 @@
*
* Basic concept: Reordering Options
* Reordering options can be applied during Bidi text transformations.
+ *
*
*
OPTION_INSERT_MARKS
is set, an RLM may
@@ -493,7 +603,7 @@
* @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL
* @stable ICU 3.8
*/
- public static final byte INTERNAL_LEVEL_DEFAULT_LTR = (byte)0x7e;
+ public static final byte LEVEL_DEFAULT_LTR = (byte)0x7e;
/** Paragraph level settingMAX_EXPLICIT_LEVEL+1
).
* @stable ICU 3.8
*/
- public static final byte MAX_EXPLICIT_LEVEL = 61;
+ public static final byte MAX_EXPLICIT_LEVEL = 125;
/**
* Bit flag for level input.
* Overrides directional properties.
* @stable ICU 3.8
*/
- public static final byte INTERNAL_LEVEL_OVERRIDE = (byte)0x80;
+ public static final byte LEVEL_OVERRIDE = (byte)0x80;
/**
* Special value which can be returned by the mapping methods when a
@@ -555,13 +665,53 @@
public static final int MAP_NOWHERE = -1;
/**
+ * Left-to-right text.
+ * <ul>
+ * <li>As return value for <code>getDirection()</code>, it means
+ * that the source string contains no right-to-left characters, or
+ * that the source string is empty and the paragraph level is even.
+ * <li>As return value for <code>getBaseDirection()</code>, it
+ * means that the first strong character of the source string has
+ * a left-to-right direction.
+ * </ul>
+ * @stable ICU 3.8
+ */
+ public static final byte LTR = 0;
+
+ /**
+ * Right-to-left text.
+ * <ul>
+ * <li>As return value for <code>getDirection()</code>, it means
+ * that the source string contains no left-to-right characters, or
+ * that the source string is empty and the paragraph level is odd.
+ * <li>As return value for <code>getBaseDirection()</code>, it
+ * means that the first strong character of the source string has
+ * a right-to-left direction.
+ * </ul>
+ * @stable ICU 3.8
+ */
+ public static final byte RTL = 1;
+
+ /**
 * Mixed-directional text.
+ * <p>As return value for <code>getDirection()</code>, it means
+ * that the source string contains both left-to-right and
+ * right-to-left characters.
* @stable ICU 3.8
*/
public static final byte MIXED = 2;
/**
* option bit for writeReordered():
+ * keep combining characters after their base characters in RTL runs
+ *
+ * @see #writeReordered
+ * @stable ICU 3.8
+ */
+ public static final short KEEP_BASE_COMBINING = 1;
+
+ /**
+ * option bit for writeReordered():
* replace characters with the "mirrored" property in RTL runs
* by their mirror-image mappings
*
@@ -570,6 +720,50 @@
*/
public static final short DO_MIRRORING = 2;
+ /**
+ * option bit for writeReordered():
+ * surround the run with LRMs if necessary;
+ * this is part of the approximate "inverse Bidi" algorithm
+ *
+ * writeReordered()
+ * first without this option, and then calling
+ * writeReverse()
without mirroring.
+ * Doing this in the same step is faster and avoids a temporary buffer.
+ * An example for using this option is output to a character terminal that
+ * is designed for RTL scripts and stores text in reverse order.setInverse(true)
.
@@ -608,21 +802,21 @@
* @see #setReorderingMode
* @stable ICU 3.8
*/
- private static final short REORDER_INVERSE_NUMBERS_AS_L = 4;
+ static final short REORDER_INVERSE_NUMBERS_AS_L = 4;
/** Reordering mode: Visual to Logical algorithm equivalent to the regular
* Logical to Visual algorithm.
* @see #setReorderingMode
* @stable ICU 3.8
*/
- private static final short REORDER_INVERSE_LIKE_DIRECT = 5;
+ static final short REORDER_INVERSE_LIKE_DIRECT = 5;
/** Reordering mode: Inverse Bidi (Visual to Logical) algorithm for the
 * <code>REORDER_NUMBERS_SPECIAL</code> Bidi algorithm.
* @see #setReorderingMode
* @stable ICU 3.8
*/
- private static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6;
+ static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6;
/* Reordering mode values must be ordered so that all the regular logical to
* visual modes come first, and all inverse Bidi modes come last.
@@ -682,7 +876,7 @@
* @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL
* @stable ICU 3.8
*/
- private static final int OPTION_INSERT_MARKS = 1;
+ static final int OPTION_INSERT_MARKS = 1;
/**
 * Option bit for <code>setReorderingOptions</code>:
@@ -704,7 +898,7 @@
* @see #REMOVE_BIDI_CONTROLS
* @stable ICU 3.8
*/
- private static final int OPTION_REMOVE_CONTROLS = 2;
+ static final int OPTION_REMOVE_CONTROLS = 2;
/**
 * Option bit for <code>setReorderingOptions</code>:
@@ -741,8 +935,7 @@
 * part of the text.
 *
 * <p>When the <code>OPTION_STREAMING</code> option is used, it is
- * recommended to call <code>orderParagraphsLTR()</code> with argument
- * <code>orderParagraphsLTR</code> set to <code>true</code> before calling
+ * recommended to call <code>orderParagraphsLTR(true)</code> before calling
 * <code>setPara()</code> so that later paragraphs may be concatenated to
* previous paragraphs on the right.
* Bidi
object with preallocated memory
* for internal structures.
@@ -1051,7 +1284,7 @@
* @stable ICU 3.8
*/
public BidiBase(int maxLength, int maxRunCount)
- {
+ {
/* check the argument values */
if (maxLength < 0 || maxRunCount < 0) {
throw new IllegalArgumentException();
@@ -1075,12 +1308,7 @@
direction = 0;
*/
/* get Bidi properties */
- try {
- bdp = UBiDiProps.getSingleton();
- }
- catch (IOException e) {
- throw new MissingResourceException(e.getMessage(), "(BidiProps)", "");
- }
+ bdp = UBiDiProps.INSTANCE;
/* allocate memory for arrays as requested */
if (maxLength > 0) {
@@ -1180,18 +1408,68 @@
getLevelsMemory(true, len);
}
- private void getInitialParasMemory(int len)
- {
- Object array = getMemory("Paras", parasMemory, Integer.TYPE, true, len);
- parasMemory = (int[]) array;
- }
-
private void getInitialRunsMemory(int len)
{
getRunsMemory(true, len);
}
-/* perform (P2)..(P3) ------------------------------------------------------- */
+ /**
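+ * Is this <code>Bidi</code> object set to perform the inverse Bidi
+ * algorithm?
+ * <p>Note: calling this method after setting the reordering mode with
+ * <code>setReorderingMode</code> will return <code>true</code> if the
+ * reordering mode was set to
+ * <code>REORDER_INVERSE_NUMBERS_AS_L</code>, <code>false</code>
+ * for all other values.</p>
+ *
+ * @return <code>true</code> if the <code>Bidi</code> object is set to
+ * perform the inverse Bidi algorithm by handling numbers as L.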
+ *
+ * @see #setInverse
+ * @see #setReorderingMode
+ * @see #REORDER_INVERSE_NUMBERS_AS_L
+ * @stable ICU 3.8
+ */
+ public boolean isInverse() {
+ return isInverse;
+ }
+
+ /* perform (P2)..(P3) ------------------------------------------------------- */
+
+ /*
+ * Check that there are enough entries in the arrays paras_limit and paras_level
+ */
+ private void checkParaCount() {
+ int[] saveLimits;
+ byte[] saveLevels;
+ int count = paraCount;
+ if (count <= paras_level.length)
+ return;
+ int oldLength = paras_level.length;
+ saveLimits = paras_limit;
+ saveLevels = paras_level;
+ try {
+ paras_limit = new int[count * 2];
+ paras_level = new byte[count * 2];
+ } catch (Exception e) {
+ throw new OutOfMemoryError("Failed to allocate memory for paras");
+ }
+ System.arraycopy(saveLimits, 0, paras_limit, 0, oldLength);
+ System.arraycopy(saveLevels, 0, paras_level, 0, oldLength);
+ }
+
+ /*
+ * Get the directional properties for the text, calculate the flags bit-set, and
+ * determine the paragraph level if necessary (in paras_level[i]).
+ * FSI initiators are also resolved and their dirProp replaced with LRI or RLI.
+ * When encountering an FSI, it is initially replaced with an LRI, which is the
+ * default. Only if a strong R or AL is found within its scope will the LRI be
+ * replaced by an RLI.
+ */
+ static final int NOT_SEEKING_STRONG = 0; /* 0: not contextual paraLevel, not after FSI */
+ static final int SEEKING_STRONG_FOR_PARA = 1; /* 1: looking for first strong char in para */
+ static final int SEEKING_STRONG_FOR_FSI = 2; /* 2: looking for first strong after FSI */
+ static final int LOOKING_FOR_PDI = 3; /* 3: found strong after FSI, looking for PDI */
private void getDirProps()
{
@@ -1199,32 +1477,44 @@
flags = 0; /* collect all directionalities in the text */
int uchar;
byte dirProp;
- byte paraDirDefault = 0; /* initialize to avoid compiler warnings */
+ byte defaultParaLevel = 0; /* initialize to avoid compiler warnings */
boolean isDefaultLevel = IsDefaultLevel(paraLevel);
/* for inverse Bidi, the default para level is set to RTL if there is a
strong R or AL character at either end of the text */
+ boolean isDefaultLevelInverse=isDefaultLevel &&
+ (reorderingMode == REORDER_INVERSE_LIKE_DIRECT ||
+ reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL);
lastArabicPos = -1;
- controlCount = 0;
+ int controlCount = 0;
+ boolean removeBidiControls = (reorderingOptions & OPTION_REMOVE_CONTROLS) != 0;
- final int NOT_CONTEXTUAL = 0; /* 0: not contextual paraLevel */
- final int LOOKING_FOR_STRONG = 1; /* 1: looking for first strong char */
- final int FOUND_STRONG_CHAR = 2; /* 2: found first strong char */
-
- int state;
- int paraStart = 0; /* index of first char in paragraph */
- byte paraDir; /* == CONTEXT_RTL within paragraphs
- starting with strong R char */
- byte lastStrongDir=0; /* for default level & inverse Bidi */
- int lastStrongLTR=0; /* for STREAMING option */
+ byte state;
+ byte lastStrong = ON; /* for default level & inverse Bidi */
+ /* The following stacks are used to manage isolate sequences. Those
+ sequences may be nested, but obviously never more deeply than the
+ maximum explicit embedding level.
+ lastStack is the index of the last used entry in the stack. A value of -1
+ means that there is no open isolate sequence.
+ lastStack is reset to -1 on paragraph boundaries. */
+ /* The following stack contains the position of the initiator of
+ each open isolate sequence */
+ int[] isolateStartStack= new int[MAX_EXPLICIT_LEVEL+1];
+ /* The following stack contains the last known state before
+ encountering the initiator of an isolate sequence */
+ byte[] previousStateStack = new byte[MAX_EXPLICIT_LEVEL+1];
+ int stackLast=-1;
+
+ if ((reorderingOptions & OPTION_STREAMING) != 0)
+ length = 0;
+ defaultParaLevel = (byte)(paraLevel & 1);
if (isDefaultLevel) {
- paraDirDefault = ((paraLevel & 1) != 0) ? CONTEXT_RTL : 0;
- paraDir = paraDirDefault;
- lastStrongDir = paraDirDefault;
- state = LOOKING_FOR_STRONG;
+ paras_level[0] = defaultParaLevel;
+ lastStrong = defaultParaLevel;
+ state = SEEKING_STRONG_FOR_PARA;
} else {
- state = NOT_CONTEXTUAL;
- paraDir = 0;
+ paras_level[0] = paraLevel;
+ state = NOT_SEEKING_STRONG;
}
/* count paragraphs and determine the paragraph level (P2..P3) */
/*
@@ -1236,90 +1526,509 @@
for (i = 0; i < originalLength; /* i is incremented in the loop */) {
i0 = i; /* index of first code unit */
uchar = UTF16.charAt(text, 0, originalLength, i);
- i += Character.charCount(uchar);
+ i += UTF16.getCharCount(uchar);
i1 = i - 1; /* index of last code unit, gets the directional property */
- dirProp = (byte)bdp.getClass(uchar);
-
+ dirProp = (byte)getCustomizedClass(uchar);
flags |= DirPropFlag(dirProp);
- dirProps[i1] = (byte)(dirProp | paraDir);
+ dirProps[i1] = dirProp;
if (i1 > i0) { /* set previous code units' properties to BN */
flags |= DirPropFlag(BN);
do {
- dirProps[--i1] = (byte)(BN | paraDir);
+ dirProps[--i1] = BN;
} while (i1 > i0);
}
- if (state == LOOKING_FOR_STRONG) {
- if (dirProp == L) {
- state = FOUND_STRONG_CHAR;
- if (paraDir != 0) {
- paraDir = 0;
- for (i1 = paraStart; i1 < i; i1++) {
- dirProps[i1] &= ~CONTEXT_RTL;
- }
- }
- continue;
+ if (removeBidiControls && IsBidiControlChar(uchar)) {
+ controlCount++;
+ }
+ if (dirProp == L) {
+ if (state == SEEKING_STRONG_FOR_PARA) {
+ paras_level[paraCount - 1] = 0;
+ state = NOT_SEEKING_STRONG;
}
- if (dirProp == R || dirProp == AL) {
- state = FOUND_STRONG_CHAR;
- if (paraDir == 0) {
- paraDir = CONTEXT_RTL;
- for (i1 = paraStart; i1 < i; i1++) {
- dirProps[i1] |= CONTEXT_RTL;
- }
+ else if (state == SEEKING_STRONG_FOR_FSI) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ /* no need for next statement, already set by default */
+ /* dirProps[isolateStartStack[stackLast]] = LRI; */
+ flags |= DirPropFlag(LRI);
}
- continue;
+ state = LOOKING_FOR_PDI;
}
+ lastStrong = L;
+ continue;
}
- if (dirProp == L) {
- lastStrongDir = 0;
- lastStrongLTR = i; /* i is index to next character */
+ if (dirProp == R || dirProp == AL) {
+ if (state == SEEKING_STRONG_FOR_PARA) {
+ paras_level[paraCount - 1] = 1;
+ state = NOT_SEEKING_STRONG;
+ }
+ else if (state == SEEKING_STRONG_FOR_FSI) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ dirProps[isolateStartStack[stackLast]] = RLI;
+ flags |= DirPropFlag(RLI);
+ }
+ state = LOOKING_FOR_PDI;
+ }
+ lastStrong = R;
+ if (dirProp == AL)
+ lastArabicPos = i - 1;
+ continue;
}
- else if (dirProp == R) {
- lastStrongDir = CONTEXT_RTL;
+ if (dirProp >= FSI && dirProp <= RLI) { /* FSI, LRI or RLI */
+ stackLast++;
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ isolateStartStack[stackLast] = i - 1;
+ previousStateStack[stackLast] = state;
+ }
+ if (dirProp == FSI) {
+ dirProps[i-1] = LRI; /* default if no strong char */
+ state = SEEKING_STRONG_FOR_FSI;
+ }
+ else
+ state = LOOKING_FOR_PDI;
+ continue;
}
- else if (dirProp == AL) {
- lastStrongDir = CONTEXT_RTL;
- lastArabicPos = i-1;
- }
- else if (dirProp == B) {
- if (i < originalLength) { /* B not last char in text */
- if (!((uchar == (int)CR) && (text[i] == (int)LF))) {
- paraCount++;
+ if (dirProp == PDI) {
+ if (state == SEEKING_STRONG_FOR_FSI) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL) {
+ /* no need for next statement, already set by default */
+ /* dirProps[isolateStartStack[stackLast]] = LRI; */
+ flags |= DirPropFlag(LRI);
}
+ }
+ if (stackLast >= 0) {
+ if (stackLast <= MAX_EXPLICIT_LEVEL)
+ state = previousStateStack[stackLast];
+ stackLast--;
+ }
+ continue;
+ }
+ if (dirProp == B) {
+ if (i < originalLength && uchar == CR && text[i] == LF) /* do nothing on the CR */
+ continue;
+ paras_limit[paraCount - 1] = i;
+ if (isDefaultLevelInverse && lastStrong == R)
+ paras_level[paraCount - 1] = 1;
+ if ((reorderingOptions & OPTION_STREAMING) != 0) {
+ /* When streaming, we only process whole paragraphs
+ thus some updates are only done on paragraph boundaries */
+ length = i; /* i is index to next character */
+ this.controlCount = controlCount;
+ }
+ if (i < originalLength) { /* B not last char in text */
+ paraCount++;
+ checkParaCount(); /* check that there is enough memory for a new para entry */
if (isDefaultLevel) {
- state=LOOKING_FOR_STRONG;
- paraStart = i; /* i is index to next character */
- paraDir = paraDirDefault;
- lastStrongDir = paraDirDefault;
+ paras_level[paraCount - 1] = defaultParaLevel;
+ state = SEEKING_STRONG_FOR_PARA;
+ lastStrong = defaultParaLevel;
+ } else {
+ paras_level[paraCount - 1] = paraLevel;
+ state = NOT_SEEKING_STRONG;
}
+ stackLast = -1;
}
+ continue;
}
}
+ /* Ignore still open isolate sequences with overflow */
+ if (stackLast > MAX_EXPLICIT_LEVEL) {
+ stackLast = MAX_EXPLICIT_LEVEL;
+ state=SEEKING_STRONG_FOR_FSI; /* to be on the safe side */
+ }
+ /* Resolve direction of still unresolved open FSI sequences */
+ while (stackLast >= 0) {
+ if (state == SEEKING_STRONG_FOR_FSI) {
+ /* no need for next statement, already set by default */
+ /* dirProps[isolateStartStack[stackLast]] = LRI; */
+ flags |= DirPropFlag(LRI);
+ break;
+ }
+ state = previousStateStack[stackLast];
+ stackLast--;
+ }
+ /* When streaming, ignore text after the last paragraph separator */
+ if ((reorderingOptions & OPTION_STREAMING) != 0) {
+ if (length < originalLength)
+ paraCount--;
+ } else {
+ paras_limit[paraCount - 1] = originalLength;
+ this.controlCount = controlCount;
+ }
+ /* For inverse bidi, default para direction is RTL if there is
+ a strong R or AL at either end of the paragraph */
+ if (isDefaultLevelInverse && lastStrong == R) {
+ paras_level[paraCount - 1] = 1;
+ }
if (isDefaultLevel) {
- paraLevel = GetParaLevelAt(0);
+ paraLevel = paras_level[0];
}
-
- /* The following line does nothing new for contextual paraLevel, but is
- needed for absolute paraLevel. */
- flags |= DirPropFlagLR(paraLevel);
+ /* The following is needed to resolve the text direction for default level
+ paragraphs containing no strong character */
+ for (i = 0; i < paraCount; i++)
+ flags |= DirPropFlagLR(paras_level[i]);
if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) {
flags |= DirPropFlag(L);
}
}
+ /*
+  * Determine the paragraph level at position pindex.
+  *
+  * paraLevel is returned directly when defaultParaLevel is 0 or when the
+  * position lies before the limit of the first paragraph. Otherwise the
+  * paragraph whose limit is the first one greater than pindex is looked
+  * up; a position at or past the last limit is attributed to the last
+  * paragraph.
+  */
+ byte GetParaLevelAt(int pindex) {
+     if (defaultParaLevel == 0 || pindex < paras_limit[0]) {
+         return paraLevel;
+     }
+     int para = 1;
+     while (para < paraCount && pindex >= paras_limit[para]) {
+         para++;
+     }
+     /* clamp to the last paragraph when pindex is beyond every limit */
+     if (para >= paraCount) {
+         para = paraCount - 1;
+     }
+     return paras_level[para];
+ }
+
+ /* Functions for handling paired brackets ----------------------------------- */
+
+ /* In the isoRuns array, the first entry is used for text outside of any
+ isolate sequence. Higher entries are used for each more deeply nested
+ isolate sequence. isoRunLast is the index of the last used entry. The
+ openings array is used to note the data of opening brackets not yet
+ matched by a closing bracket, or matched but still susceptible to change
+ level.
+ Each isoRun entry contains the index of the first and
+ one-after-last openings entries for pending opening brackets it
+ contains. The next openings entry to use is the one-after-last of the
+ most deeply nested isoRun entry.
+ isoRun entries also contain their current embedding level and the last
+ encountered strong character, since these will be needed to resolve
+ the level of paired brackets. */
+
+ /*
+  * Prepare bd for a new bracket-matching pass: entry 0 of isoRuns (the
+  * text outside of any isolate sequence) is created with the level of the
+  * first paragraph, and a fresh openings array is allocated.
+  */
+ private void bracketInit(BracketData bd) {
+     bd.isoRunLast = 0;
+     IsoRun baseRun = new IsoRun();
+     bd.isoRuns[0] = baseRun;
+     baseRun.start = 0;
+     baseRun.limit = 0;
+     byte firstParaLevel = GetParaLevelAt(0);
+     baseRun.level = firstParaLevel;
+     /* direction (0 or 1) implied by the parity of the paragraph level */
+     byte baseDir = (byte)(firstParaLevel & 1);
+     baseRun.contextDir = baseDir;
+     baseRun.lastBase = baseDir;
+     baseRun.lastStrong = baseDir;
+     baseRun.contextPos = 0;
+     bd.openings = new Opening[SIMPLE_PARAS_COUNT];
+     boolean numbersSpecial = (reorderingMode == REORDER_NUMBERS_SPECIAL);
+     if (!numbersSpecial) {
+         numbersSpecial = (reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL);
+     }
+     bd.isNumbersSpecial = numbersSpecial;
+ }
+
+ /*
+  * Paragraph boundary: drop back to the outermost isolate run and reset
+  * it for a paragraph of the given level. The run's start is left as is.
+  */
+ private void bracketProcessB(BracketData bd, byte level) {
+     bd.isoRunLast = 0;
+     IsoRun baseRun = bd.isoRuns[0];
+     baseRun.limit = 0;
+     baseRun.level = level;
+     /* direction (0 or 1) implied by the parity of the level */
+     byte paraDir = (byte)(level & 1);
+     baseRun.contextDir = paraDir;
+     baseRun.lastBase = paraDir;
+     baseRun.lastStrong = paraDir;
+     baseRun.contextPos = 0;
+ }
+
+ /*
+  * LRE, LRO, RLE, RLO, PDF: the embedding level changed at lastCcPos.
+  * Nothing is done when the character at lastCcPos is an isolate
+  * control. Otherwise the pending openings of the current isolate run
+  * are discarded (limit is reset to start) and the run restarts at
+  * embeddingLevel, with a context direction taken from whichever of the
+  * two levels is the higher one once the override bit is ignored.
+  */
+ private void bracketProcessBoundary(BracketData bd, int lastCcPos,
+                                     byte contextLevel, byte embeddingLevel) {
+     IsoRun run = bd.isoRuns[bd.isoRunLast];
+     boolean afterIsolate = (DirPropFlag(dirProps[lastCcPos]) & MASK_ISO) != 0;
+     if (afterIsolate) {
+         return;
+     }
+     /* entering a deeper level (not a PDF): the new level sets the context */
+     byte effectiveContext =
+         (NoOverride(embeddingLevel) > NoOverride(contextLevel)) ? embeddingLevel
+                                                                 : contextLevel;
+     byte dir = (byte)(effectiveContext & 1);
+     run.limit = run.start;
+     run.level = embeddingLevel;
+     run.contextDir = dir;
+     run.lastBase = dir;
+     run.lastStrong = dir;
+     run.contextPos = lastCcPos;
+ }
+
+ /*
+  * LRI or RLI: open a nested isolate run. The enclosing run records ON as
+  * its last base character; the next isoRuns entry (created on first use)
+  * starts with an empty openings range located right after the enclosing
+  * run's openings and takes the given level.
+  */
+ private void bracketProcessLRI_RLI(BracketData bd, byte level) {
+     IsoRun enclosing = bd.isoRuns[bd.isoRunLast];
+     enclosing.lastBase = ON;
+     short firstFree = enclosing.limit;
+     bd.isoRunLast++;
+     IsoRun nested = bd.isoRuns[bd.isoRunLast];
+     if (nested == null) {
+         nested = new IsoRun();
+         bd.isoRuns[bd.isoRunLast] = nested;
+     }
+     nested.limit = firstFree;
+     nested.start = firstFree;
+     nested.level = level;
+     /* direction (0 or 1) implied by the parity of the level */
+     byte dir = (byte)(level & 1);
+     nested.contextDir = dir;
+     nested.lastBase = dir;
+     nested.lastStrong = dir;
+     nested.contextPos = 0;
+ }
+
+ /*
+  * PDI: close the innermost isolate run and return to the enclosing one,
+  * which records ON as its last base character.
+  */
+ private void bracketProcessPDI(BracketData bd) {
+     bd.isoRunLast--;
+     IsoRun enclosing = bd.isoRuns[bd.isoRunLast];
+     enclosing.lastBase = ON;
+ }
+
+ /*
+  * Newly found opening bracket: record it in the next free openings entry
+  * of the current isolate run.
+  *
+  * match    - the closing character that will pair with this bracket
+  * position - index of the opening bracket in the text
+  *
+  * The openings array is doubled when the run's limit has reached its
+  * length; entries are created lazily and reused afterwards.
+  */
+ private void bracketAddOpening(BracketData bd, char match, int position) {
+     IsoRun run = bd.isoRuns[bd.isoRunLast];
+     if (run.limit >= bd.openings.length) { /* array is full: grow it */
+         Opening[] previous = bd.openings;
+         int previousSize;
+         try {
+             previousSize = bd.openings.length;
+             bd.openings = new Opening[previousSize * 2];
+         } catch (Exception e) {
+             throw new OutOfMemoryError("Failed to allocate memory for openings");
+         }
+         System.arraycopy(previous, 0, bd.openings, 0, previousSize);
+     }
+     Opening entry = bd.openings[run.limit];
+     if (entry == null) {
+         entry = new Opening();
+         bd.openings[run.limit] = entry;
+     }
+     entry.position = position;
+     entry.match = match;
+     /* remember the context in effect when the bracket was opened */
+     entry.contextDir = run.contextDir;
+     entry.contextPos = run.contextPos;
+     entry.flags = 0;
+     run.limit++;
+ }
+
+ /* change N0c1 to N0c2 when a preceding bracket is assigned the embedding level */
+ /* bd              - bracket matching state */
+ /* openingIndex    - index in bd.openings of the pair just resolved; only
+                      entries after it, up to the limit of the current isolate
+                      run, are examined */
+ /* newPropPosition - text position whose property has just become newProp */
+ /* newProp         - the property now found at newPropPosition */
+ private void fixN0c(BracketData bd, int openingIndex, int newPropPosition, byte newProp) {
+ /* This function calls itself recursively */
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ Opening qOpening;
+ int k, openingPosition, closingPosition;
+ for (k = openingIndex+1; k < pLastIsoRun.limit; k++) {
+ qOpening = bd.openings[k];
+ /* a negative match is the negated position of the closing bracket of
+    a pair recorded as not yet stable by bracketProcessClosing() */
+ if (qOpening.match >= 0) /* not an N0c match */
+ continue;
+ /* the changed position precedes the context recorded for this entry:
+    stop scanning */
+ if (newPropPosition < qOpening.contextPos)
+ break;
+ /* the changed position is not before this opening bracket: skip the entry */
+ if (newPropPosition >= qOpening.position)
+ continue;
+ /* the recorded context direction already equals newProp: stop scanning */
+ if (newProp == qOpening.contextDir)
+ break;
+ /* give both brackets of the pair the new property */
+ openingPosition = qOpening.position;
+ dirProps[openingPosition] = newProp;
+ closingPosition = -(qOpening.match);
+ dirProps[closingPosition] = newProp;
+ qOpening.match = 0; /* prevent further changes */
+ /* the two positions just changed may in turn affect later entries */
+ fixN0c(bd, k, openingPosition, newProp);
+ fixN0c(bd, k, closingPosition, newProp);
+ }
+ }
+
+ /* process closing bracket; return L or R if N0b or N0c, ON if N0d */
+ /* bd       - bracket matching state */
+ /* openIdx  - index in bd.openings of the opening bracket being matched */
+ /* position - text position of the closing bracket */
+ /* On an L or R result, dirProps of both brackets are set to that value;
+    on an ON result dirProps is left untouched and the openings from
+    openIdx upward are dropped from the current isolate run. */
+ private byte bracketProcessClosing(BracketData bd, int openIdx, int position) {
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ Opening pOpening, qOpening;
+ byte direction;
+ boolean stable;
+ byte newProp;
+ pOpening = bd.openings[openIdx];
+ /* embedding direction: parity of the level of the current isolate run */
+ direction = (byte)(pLastIsoRun.level & 1);
+ stable = true; /* assume stable until proved otherwise */
+
+ /* The stable flag is set when brackets are paired and their
+ level is resolved and cannot be changed by what will be
+ found later in the source string.
+ An unstable match can occur only when applying N0c, where
+ the resolved level depends on the preceding context, and
+ this context may be affected by text occurring later.
+ Example: RTL paragraph containing: abc[(latin) HEBREW]
+ When the closing parenthesis is encountered, it appears
+ that N0c1 must be applied since 'abc' sets an opposite
+ direction context and both parentheses receive level 2.
+ However, when the closing square bracket is processed,
+ N0b applies because of 'HEBREW' being included within the
+ brackets, thus the square brackets are treated like R and
+ receive level 1. However, this changes the preceding
+ context of the opening parenthesis, and it now appears
+ that N0c2 must be applied to the parentheses rather than
+ N0c1. */
+
+ /* the flags of an opening accumulate the strong directions seen after
+    it (they are updated by bracketProcessChar()) */
+ if ((direction == 0 && (pOpening.flags & FOUND_L) > 0) ||
+ (direction == 1 && (pOpening.flags & FOUND_R) > 0)) { /* N0b */
+ newProp = direction;
+ }
+ else if ((pOpening.flags & (FOUND_L | FOUND_R)) != 0) { /* N0c */
+ /* it is stable if there is no preceding text or in
+ conditions too complicated and not worth checking */
+ stable = (openIdx == pLastIsoRun.start);
+ if (direction != pOpening.contextDir)
+ newProp = pOpening.contextDir; /* N0c1 */
+ else
+ newProp = direction; /* N0c2 */
+ } else {
+ /* forget this and any brackets nested within this pair */
+ pLastIsoRun.limit = (short)openIdx;
+ return ON; /* N0d */
+ }
+ dirProps[pOpening.position] = newProp;
+ dirProps[position] = newProp;
+ /* Update nested N0c pairs that may be affected */
+ fixN0c(bd, openIdx, pOpening.position, newProp);
+ if (stable) {
+ pLastIsoRun.limit = (short)openIdx; /* forget any brackets nested within this pair */
+ /* remove lower located synonyms if any */
+ while (pLastIsoRun.limit > pLastIsoRun.start &&
+ bd.openings[pLastIsoRun.limit - 1].position == pOpening.position)
+ pLastIsoRun.limit--;
+ } else {
+ int k;
+ /* keep the entry, marked with the negated closing position so that
+    fixN0c() can find and revise the pair later */
+ pOpening.match = -position;
+ /* neutralize lower located synonyms if any */
+ k = openIdx - 1;
+ while (k >= pLastIsoRun.start &&
+ bd.openings[k].position == pOpening.position)
+ bd.openings[k--].match = 0;
+ /* neutralize any unmatched opening between the current pair;
+ this will also neutralize higher located synonyms if any */
+ for (k = openIdx + 1; k < pLastIsoRun.limit; k++) {
+ qOpening =bd.openings[k];
+ if (qOpening.position >= position)
+ break;
+ if (qOpening.match > 0)
+ qOpening.match = 0;
+ }
+ }
+ return newProp;
+ }
+
+ /* handle strong characters, digits and candidates for closing brackets */
+ /* bd       - bracket matching state */
+ /* position - index in text, dirProps and levels of the character to process */
+ /* May rewrite dirProps[position] (and, through bracketProcessClosing(),
+    the dirProps of a matched opening bracket), may clear LEVEL_OVERRIDE in
+    levels[] for matched brackets, and updates the state of the current
+    isolate run and the flags of its pending openings. */
+ private void bracketProcessChar(BracketData bd, int position) {
+ IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+ byte dirProp, newProp;
+ byte level;
+ dirProp = dirProps[position];
+ if (dirProp == ON) {
+ char c, match;
+ int idx;
+ /* First see if it is a matching closing bracket. Hopefully, this is
+ more efficient than checking if it is a closing bracket at all */
+ c = text[position];
+ /* search the pending openings of this isolate run, most recent first */
+ for (idx = pLastIsoRun.limit - 1; idx >= pLastIsoRun.start; idx--) {
+ if (bd.openings[idx].match != c)
+ continue;
+ /* We have a match */
+ newProp = bracketProcessClosing(bd, idx, position);
+ if(newProp == ON) { /* N0d */
+ c = 0; /* prevent handling as an opening */
+ break;
+ }
+ /* the pair was resolved to L or R: it becomes the new context */
+ pLastIsoRun.lastBase = ON;
+ pLastIsoRun.contextDir = newProp;
+ pLastIsoRun.contextPos = position;
+ level = levels[position];
+ if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */
+ short flag;
+ int i;
+ newProp = (byte)(level & 1);
+ pLastIsoRun.lastStrong = newProp;
+ flag = (short)DirPropFlag(newProp);
+ /* record the overriding direction on the openings located below
+    the matched one */
+ for (i = pLastIsoRun.start; i < idx; i++)
+ bd.openings[i].flags |= flag;
+ /* matching brackets are not overridden by LRO/RLO */
+ levels[position] &= ~LEVEL_OVERRIDE;
+ }
+ /* matching brackets are not overridden by LRO/RLO */
+ levels[bd.openings[idx].position] &= ~LEVEL_OVERRIDE;
+ return;
+ }
+ /* We get here only if the ON character is not a matching closing
+ bracket or it is a case of N0d */
+ /* Now see if it is an opening bracket */
+ if (c != 0) {
+ match = (char)UCharacter.getBidiPairedBracket(c); /* get the matching char */
+ } else {
+ match = 0;
+ }
+ if (match != c && /* has a matching char */
+ UCharacter.getIntPropertyValue(c, BIDI_PAIRED_BRACKET_TYPE) ==
+ /* opening bracket */ BidiPairedBracketType.OPEN) {
+ /* special case: process synonyms
+ create an opening entry for each synonym */
+ if (match == 0x232A) { /* RIGHT-POINTING ANGLE BRACKET */
+ bracketAddOpening(bd, (char)0x3009, position);
+ }
+ else if (match == 0x3009) { /* RIGHT ANGLE BRACKET */
+ bracketAddOpening(bd, (char)0x232A, position);
+ }
+ bracketAddOpening(bd, match, position);
+ }
+ }
+ /* from here on: update the state of the isolate run according to the
+    property of the character and compute newProp, the direction (if any)
+    to report to the pending openings */
+ level = levels[position];
+ if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */
+ newProp = (byte)(level & 1);
+ if (dirProp != S && dirProp != WS && dirProp != ON)
+ dirProps[position] = newProp;
+ pLastIsoRun.lastBase = newProp;
+ pLastIsoRun.lastStrong = newProp;
+ pLastIsoRun.contextDir = newProp;
+ pLastIsoRun.contextPos = position;
+ }
+ else if (dirProp <= R || dirProp == AL) {
+ newProp = DirFromStrong(dirProp);
+ pLastIsoRun.lastBase = dirProp;
+ pLastIsoRun.lastStrong = dirProp;
+ pLastIsoRun.contextDir = newProp;
+ pLastIsoRun.contextPos = position;
+ }
+ else if(dirProp == EN) {
+ pLastIsoRun.lastBase = EN;
+ if (pLastIsoRun.lastStrong == L) {
+ newProp = L; /* W7 */
+ if (!bd.isNumbersSpecial)
+ dirProps[position] = ENL;
+ pLastIsoRun.contextDir = L;
+ pLastIsoRun.contextPos = position;
+ }
+ else {
+ newProp = R; /* N0 */
+ if (pLastIsoRun.lastStrong == AL)
+ dirProps[position] = AN; /* W2 */
+ else
+ dirProps[position] = ENR;
+ pLastIsoRun.contextDir = R;
+ pLastIsoRun.contextPos = position;
+ }
+ }
+ else if (dirProp == AN) {
+ newProp = R; /* N0 */
+ pLastIsoRun.lastBase = AN;
+ pLastIsoRun.contextDir = R;
+ pLastIsoRun.contextPos = position;
+ }
+ else if (dirProp == NSM) {
+ /* if the last real char was ON, change NSM to ON so that it
+ will stay ON even if the last real char is a bracket which
+ may be changed to L or R */
+ newProp = pLastIsoRun.lastBase;
+ if (newProp == ON)
+ dirProps[position] = newProp;
+ }
+ else {
+ newProp = dirProp;
+ pLastIsoRun.lastBase = dirProp;
+ }
+ /* a strong direction was found: flag it on every pending opening of this
+    isolate run that is located before the current position */
+ if (newProp <= R || newProp == AL) {
+ int i;
+ short flag = (short)DirPropFlag(DirFromStrong(newProp));
+ for (i = pLastIsoRun.start; i < pLastIsoRun.limit; i++)
+ if (position > bd.openings[i].position)
+ bd.openings[i].flags |= flag;
+ }
+ }
+
/* perform (X1)..(X9) ------------------------------------------------------- */
/* determine if the text is mixed-directional or single-directional */
private byte directionFromFlags() {
+
/* if the text contains AN and neutrals, then some neutrals may become RTL */
if (!((flags & MASK_RTL) != 0 ||
((flags & DirPropFlag(AN)) != 0 &&
(flags & MASK_POSSIBLE_N) != 0))) {
- return Bidi.DIRECTION_LEFT_TO_RIGHT;
+ return LTR;
} else if ((flags & MASK_LTR) == 0) {
- return Bidi.DIRECTION_RIGHT_TO_LEFT;
+ return RTL;
} else {
return MIXED;
}
@@ -1330,16 +2039,16 @@
* Recalculate the flags to have them reflect the real properties
* after taking the explicit embeddings into account.
*
- * The Bidi algorithm is designed to result in the same behavior whether embedding
+ * The BiDi algorithm is designed to result in the same behavior whether embedding
* levels are externally specified (from "styled text", supposedly the preferred
- * method) or set by explicit embedding codes (LRx, RLx, PDF) in the plain text.
- * That is why (X9) instructs to remove all explicit codes (and BN).
- * However, in a real implementation, this removal of these codes and their index
+ * method) or set by explicit embedding codes (LRx, RLx, PDF, FSI, PDI) in the plain text.
+ * That is why (X9) instructs to remove all not-isolate explicit codes (and BN).
+ * However, in a real implementation, the removal of these codes and their index
* positions in the plain text is undesirable since it would result in
* reallocated, reindexed text.
* Instead, this implementation leaves the codes in there and just ignores them
* in the subsequent processing.
- * In order to get the same reordering behavior, positions with a BN or an
+ * In order to get the same reordering behavior, positions with a BN or a not-isolate
* explicit embedding code just get the same level assigned as the last "real"
* character.
*
@@ -1351,185 +2060,281 @@
* This limits the scope of the implicit rules in effectively
* the same way as the run limits.
*
- * Instead, this implementation does not modify these codes.
+ * Instead, this implementation does not modify these codes, except for
+ * paired brackets whose properties (ON) may be replaced by L or R.
* On one hand, the paragraph has to be scanned for same-level-runs, but
* on the other hand, this saves another loop to reset these codes,
* or saves making and modifying a copy of dirProps[].
*
*
- * Note that (Pn) and (Xn) changed significantly from version 4 of the Bidi algorithm.
+ * Note that (Pn) and (Xn) changed significantly from version 4 of the BiDi algorithm.
*
*
* Handling the stack of explicit levels (Xn):
*
- * With the Bidi stack of explicit levels,
- * as pushed with each LRE, RLE, LRO, and RLO and popped with each PDF,
- * the explicit level must never exceed MAX_EXPLICIT_LEVEL==61.
+ * With the BiDi stack of explicit levels, as pushed with each
+ * LRE, RLE, LRO, RLO, LRI, RLI and FSI and popped with each PDF and PDI,
+ * the explicit level must never exceed MAX_EXPLICIT_LEVEL.
*
* In order to have a correct push-pop semantics even in the case of overflows,
- * there are two overflow counters:
- * - countOver60 is incremented with each LRx at level 60
- * - from level 60, one RLx increases the level to 61
- * - countOver61 is incremented with each LRx and RLx at level 61
- *
- * Popping levels with PDF must work in the opposite order so that level 61
- * is correct at the correct point. Underflows (too many PDFs) must be checked.
+ * overflow counters and a valid isolate counter are used as described in UAX#9
+ * section 3.3.2 "Explicit Levels and Directions".
*
* This implementation assumes that MAX_EXPLICIT_LEVEL is odd.
+ *
+ * Returns the direction
+ *
*/
private byte resolveExplicitLevels() {
int i = 0;
byte dirProp;
byte level = GetParaLevelAt(0);
-
byte dirct;
- int paraIndex = 0;
+ isolateCount = 0;
/* determine if the text is mixed-directional or single-directional */
dirct = directionFromFlags();
- /* we may not need to resolve any explicit levels, but for multiple
- paragraphs we want to loop on all chars to set the para boundaries */
- if ((dirct != MIXED) && (paraCount == 1)) {
+ /* we may not need to resolve any explicit levels */
+ if (dirct != MIXED) {
/* not mixed directionality: levels don't matter - trailingWSStart will be 0 */
- } else if ((paraCount == 1) &&
- ((flags & MASK_EXPLICIT) == 0)) {
- /* mixed, but all characters are at the same embedding level */
- /* or we are in "inverse Bidi" */
- /* and we don't have contextual multiple paragraphs with some B char */
+ return dirct;
+ }
+
+ if (reorderingMode > REORDER_LAST_LOGICAL_TO_VISUAL) {
+ /* inverse BiDi: mixed, but all characters are at the same embedding level */
/* set all levels to the paragraph level */
- for (i = 0; i < length; ++i) {
- levels[i] = level;
+ int paraIndex, start, limit;
+ for (paraIndex = 0; paraIndex < paraCount; paraIndex++) {
+ if (paraIndex == 0)
+ start = 0;
+ else
+ start = paras_limit[paraIndex - 1];
+ limit = paras_limit[paraIndex];
+ level = paras_level[paraIndex];
+ for (i = start; i < limit; i++)
+ levels[i] =level;
}
- } else {
- /* continue to perform (Xn) */
-
- /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */
- /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */
- byte embeddingLevel = level;
- byte newLevel;
- byte stackTop = 0;
-
- byte[] stack = new byte[MAX_EXPLICIT_LEVEL]; /* we never push anything >=MAX_EXPLICIT_LEVEL */
- int countOver60 = 0;
- int countOver61 = 0; /* count overflows of explicit levels */
-
- /* recalculate the flags */
- flags = 0;
-
- for (i = 0; i < length; ++i) {
- dirProp = NoContextRTL(dirProps[i]);
- switch(dirProp) {
- case LRE:
- case LRO:
- /* (X3, X5) */
- newLevel = (byte)((embeddingLevel+2) & ~(INTERNAL_LEVEL_OVERRIDE | 1)); /* least greater even level */
- if (newLevel <= MAX_EXPLICIT_LEVEL) {
- stack[stackTop] = embeddingLevel;
- ++stackTop;
- embeddingLevel = newLevel;
- if (dirProp == LRO) {
- embeddingLevel |= INTERNAL_LEVEL_OVERRIDE;
- }
- /* we don't need to set LEVEL_OVERRIDE off for LRE
- since this has already been done for newLevel which is
- the source for embeddingLevel.
- */
- } else if ((embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) == MAX_EXPLICIT_LEVEL) {
- ++countOver61;
- } else /* (embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) == MAX_EXPLICIT_LEVEL-1 */ {
- ++countOver60;
- }
- flags |= DirPropFlag(BN);
- break;
- case RLE:
- case RLO:
- /* (X2, X4) */
- newLevel=(byte)(((embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) + 1) | 1); /* least greater odd level */
- if (newLevel<=MAX_EXPLICIT_LEVEL) {
- stack[stackTop] = embeddingLevel;
- ++stackTop;
- embeddingLevel = newLevel;
- if (dirProp == RLO) {
- embeddingLevel |= INTERNAL_LEVEL_OVERRIDE;
- }
- /* we don't need to set LEVEL_OVERRIDE off for RLE
- since this has already been done for newLevel which is
- the source for embeddingLevel.
- */
- } else {
- ++countOver61;
- }
- flags |= DirPropFlag(BN);
- break;
- case PDF:
- /* (X7) */
- /* handle all the overflow cases first */
- if (countOver61 > 0) {
- --countOver61;
- } else if (countOver60 > 0 && (embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) != MAX_EXPLICIT_LEVEL) {
- /* handle LRx overflows from level 60 */
- --countOver60;
- } else if (stackTop > 0) {
- /* this is the pop operation; it also pops level 61 while countOver60>0 */
- --stackTop;
- embeddingLevel = stack[stackTop];
- /* } else { (underflow) */
- }
- flags |= DirPropFlag(BN);
- break;
- case B:
- stackTop = 0;
- countOver60 = 0;
- countOver61 = 0;
- level = GetParaLevelAt(i);
- if ((i + 1) < length) {
- embeddingLevel = GetParaLevelAt(i+1);
- if (!((text[i] == CR) && (text[i + 1] == LF))) {
- paras[paraIndex++] = i+1;
- }
- }
- flags |= DirPropFlag(B);
- break;
- case BN:
- /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */
- /* they will get their levels set correctly in adjustWSLevels() */
- flags |= DirPropFlag(BN);
- break;
- default:
- /* all other types get the "real" level */
- if (level != embeddingLevel) {
- level = embeddingLevel;
- if ((level & INTERNAL_LEVEL_OVERRIDE) != 0) {
- flags |= DirPropFlagO(level) | DirPropFlagMultiRuns;
- } else {
- flags |= DirPropFlagE(level) | DirPropFlagMultiRuns;
+ return dirct; /* no bracket matching for inverse BiDi */
+ }
+ if ((flags & (MASK_EXPLICIT | MASK_ISO)) == 0) {
+ /* no embeddings, set all levels to the paragraph level */
+ /* we still have to perform bracket matching */
+ int paraIndex, start, limit;
+ BracketData bracketData = new BracketData();
+ bracketInit(bracketData);
+ for (paraIndex = 0; paraIndex < paraCount; paraIndex++) {
+ if (paraIndex == 0)
+ start = 0;
+ else
+ start = paras_limit[paraIndex-1];
+ limit = paras_limit[paraIndex];
+ level = paras_level[paraIndex];
+ for (i = start; i < limit; i++) {
+ levels[i] = level;
+ dirProp = dirProps[i];
+ if (dirProp == BN)
+ continue;
+ if (dirProp == B) {
+ if ((i + 1) < length) {
+ if (text[i] == CR && text[i + 1] == LF)
+ continue; /* skip CR when followed by LF */
+ bracketProcessB(bracketData, level);
}
+ continue;
}
- if ((level & INTERNAL_LEVEL_OVERRIDE) == 0) {
- flags |= DirPropFlag(dirProp);
- }
- break;
+ bracketProcessChar(bracketData, i);
}
-
- /*
- * We need to set reasonable levels even on BN codes and
- * explicit codes because we will later look at same-level runs (X10).
- */
- levels[i] = level;
- }
- if ((flags & MASK_EMBEDDING) != 0) {
- flags |= DirPropFlagLR(paraLevel);
- }
- if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) {
- flags |= DirPropFlag(L);
}
+ return dirct;
+ }
+ /* continue to perform (Xn) */
- /* subsequently, ignore the explicit codes and BN (X9) */
+ /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */
+ /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */
+ byte embeddingLevel = level, newLevel;
+ byte previousLevel = level; /* previous level for regular (not CC) characters */
+ int lastCcPos = 0; /* index of last effective LRx,RLx, PDx */
+
+ /* The following stack remembers the embedding level and the ISOLATE flag of level runs.
+ stackLast points to its current entry. */
+ short[] stack = new short[MAX_EXPLICIT_LEVEL + 2]; /* we never push anything >= MAX_EXPLICIT_LEVEL
+ but we need one more entry as base */
+ int stackLast = 0;
+ int overflowIsolateCount = 0;
+ int overflowEmbeddingCount = 0;
+ int validIsolateCount = 0;
+ BracketData bracketData = new BracketData();
+ bracketInit(bracketData);
+ stack[0] = level; /* initialize base entry to para level, no override, no isolate */
- /* again, determine if the text is mixed-directional or single-directional */
- dirct = directionFromFlags();
+ /* recalculate the flags */
+ flags = 0;
+
+ for (i = 0; i < length; i++) {
+ dirProp = dirProps[i];
+ switch (dirProp) {
+ case LRE:
+ case RLE:
+ case LRO:
+ case RLO:
+ /* (X2, X3, X4, X5) */
+ flags |= DirPropFlag(BN);
+ levels[i] = previousLevel;
+ if (dirProp == LRE || dirProp == LRO) {
+ /* least greater even level */
+ newLevel = (byte)((embeddingLevel+2) & ~(LEVEL_OVERRIDE | 1));
+ } else {
+ /* least greater odd level */
+ newLevel = (byte)((NoOverride(embeddingLevel) + 1) | 1);
+ }
+ if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0 &&
+ overflowEmbeddingCount == 0) {
+ lastCcPos = i;
+ embeddingLevel = newLevel;
+ if (dirProp == LRO || dirProp == RLO)
+ embeddingLevel |= LEVEL_OVERRIDE;
+ stackLast++;
+ stack[stackLast] = embeddingLevel;
+ /* we don't need to set LEVEL_OVERRIDE off for LRE and RLE
+ since this has already been done for newLevel which is
+ the source for embeddingLevel.
+ */
+ } else {
+ if (overflowIsolateCount == 0)
+ overflowEmbeddingCount++;
+ }
+ break;
+ case PDF:
+ /* (X7) */
+ flags |= DirPropFlag(BN);
+ levels[i] = previousLevel;
+ /* handle all the overflow cases first */
+ if (overflowIsolateCount > 0) {
+ break;
+ }
+ if (overflowEmbeddingCount > 0) {
+ overflowEmbeddingCount--;
+ break;
+ }
+ if (stackLast > 0 && stack[stackLast] < ISOLATE) { /* not an isolate entry */
+ lastCcPos = i;
+ stackLast--;
+ embeddingLevel = (byte)stack[stackLast];
+ }
+ break;
+ case LRI:
+ case RLI:
+ flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel);
+ levels[i] = NoOverride(embeddingLevel);
+ if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+ bracketProcessBoundary(bracketData, lastCcPos,
+ previousLevel, embeddingLevel);
+ flags |= DirPropFlagMultiRuns;
+ }
+ previousLevel = embeddingLevel;
+ /* (X5a, X5b) */
+ if (dirProp == LRI)
+ /* least greater even level */
+ newLevel=(byte)((embeddingLevel+2)&~(LEVEL_OVERRIDE|1));
+ else
+ /* least greater odd level */
+ newLevel=(byte)((NoOverride(embeddingLevel)+1)|1);
+ if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0
+ && overflowEmbeddingCount == 0) {
+ flags |= DirPropFlag(dirProp);
+ lastCcPos = i;
+ validIsolateCount++;
+ if (validIsolateCount > isolateCount)
+ isolateCount = validIsolateCount;
+ embeddingLevel = newLevel;
+ /* we can increment stackLast without checking because newLevel
+ will exceed UBIDI_MAX_EXPLICIT_LEVEL before stackLast overflows */
+ stackLast++;
+ stack[stackLast] = (short)(embeddingLevel + ISOLATE);
+ bracketProcessLRI_RLI(bracketData, embeddingLevel);
+ } else {
+ /* make it WS so that it is handled by adjustWSLevels() */
+ dirProps[i] = WS;
+ overflowIsolateCount++;
+ }
+ break;
+ case PDI:
+ if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+ bracketProcessBoundary(bracketData, lastCcPos,
+ previousLevel, embeddingLevel);
+ flags |= DirPropFlagMultiRuns;
+ }
+ /* (X6a) */
+ if (overflowIsolateCount > 0) {
+ overflowIsolateCount--;
+ /* make it WS so that it is handled by adjustWSLevels() */
+ dirProps[i] = WS;
+ }
+ else if (validIsolateCount > 0) {
+ flags |= DirPropFlag(PDI);
+ lastCcPos = i;
+ overflowEmbeddingCount = 0;
+ while (stack[stackLast] < ISOLATE) /* pop embedding entries */
+ stackLast--; /* until the last isolate entry */
+ stackLast--; /* pop also the last isolate entry */
+ validIsolateCount--;
+ bracketProcessPDI(bracketData);
+ } else
+ /* make it WS so that it is handled by adjustWSLevels() */
+ dirProps[i] = WS;
+ embeddingLevel = (byte)(stack[stackLast] & ~ISOLATE);
+ flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel);
+ previousLevel = embeddingLevel;
+ levels[i] = NoOverride(embeddingLevel);
+ break;
+ case B:
+ flags |= DirPropFlag(B);
+ levels[i] = GetParaLevelAt(i);
+ if ((i + 1) < length) {
+ if (text[i] == CR && text[i + 1] == LF)
+ break; /* skip CR when followed by LF */
+ overflowEmbeddingCount = overflowIsolateCount = 0;
+ validIsolateCount = 0;
+ stackLast = 0;
+ previousLevel = embeddingLevel = GetParaLevelAt(i + 1);
+ stack[0] = embeddingLevel; /* initialize base entry to para level, no override, no isolate */
+ bracketProcessB(bracketData, embeddingLevel);
+ }
+ break;
+ case BN:
+ /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */
+ /* they will get their levels set correctly in adjustWSLevels() */
+ levels[i] = previousLevel;
+ flags |= DirPropFlag(BN);
+ break;
+ default:
+ /* all other types are normal characters and get the "real" level */
+ if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+ bracketProcessBoundary(bracketData, lastCcPos,
+ previousLevel, embeddingLevel);
+ flags |= DirPropFlagMultiRuns;
+ if ((embeddingLevel & LEVEL_OVERRIDE) != 0)
+ flags |= DirPropFlagO(embeddingLevel);
+ else
+ flags |= DirPropFlagE(embeddingLevel);
+ }
+ previousLevel = embeddingLevel;
+ levels[i] = embeddingLevel;
+ bracketProcessChar(bracketData, i);
+ /* the dirProp may have been changed in bracketProcessChar() */
+ flags |= DirPropFlag(dirProps[i]);
+ break;
+ }
+ }
+ if ((flags & MASK_EMBEDDING) != 0) {
+ flags |= DirPropFlagLR(paraLevel);
}
+ if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) {
+ flags |= DirPropFlag(L);
+ }
+ /* again, determine if the text is mixed-directional or single-directional */
+ dirct = directionFromFlags();
return dirct;
}
@@ -1547,49 +2352,57 @@
private byte checkExplicitLevels() {
byte dirProp;
int i;
+ int isolateCount = 0;
+
this.flags = 0; /* collect all directionalities in the text */
byte level;
- int paraIndex = 0;
+ this.isolateCount = 0;
for (i = 0; i < length; ++i) {
if (levels[i] == 0) {
- levels[i] = paraLevel;
+ levels[i] = paraLevel;
}
+
+ // for backward compatibility
if (MAX_EXPLICIT_LEVEL < (levels[i]&0x7f)) {
- if ((levels[i] & INTERNAL_LEVEL_OVERRIDE) != 0) {
- levels[i] = (byte)(paraLevel|INTERNAL_LEVEL_OVERRIDE);
+ if ((levels[i] & LEVEL_OVERRIDE) != 0) {
+ levels[i] = (byte)(paraLevel|LEVEL_OVERRIDE);
} else {
levels[i] = paraLevel;
}
}
+
level = levels[i];
- dirProp = NoContextRTL(dirProps[i]);
- if ((level & INTERNAL_LEVEL_OVERRIDE) != 0) {
+ dirProp = dirProps[i];
+ if (dirProp == LRI || dirProp == RLI) {
+ isolateCount++;
+ if (isolateCount > this.isolateCount)
+ this.isolateCount = isolateCount;
+ }
+ else if (dirProp == PDI) {
+ isolateCount--;
+ } else if (dirProp == B) {
+ isolateCount = 0;
+ }
+ if ((level & LEVEL_OVERRIDE) != 0) {
/* keep the override flag in levels[i] but adjust the flags */
- level &= ~INTERNAL_LEVEL_OVERRIDE; /* make the range check below simpler */
+ level &= ~LEVEL_OVERRIDE; /* make the range check below simpler */
flags |= DirPropFlagO(level);
} else {
/* set the flags */
flags |= DirPropFlagE(level) | DirPropFlag(dirProp);
}
-
if ((level < GetParaLevelAt(i) &&
!((0 == level) && (dirProp == B))) ||
- (MAX_EXPLICIT_LEVEL LEVEL_OVERRIDE
bit set.
* Except for that bit, it must be
- * {@code paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL},
+ * <code>paraLevel&lt;=embeddingLevels[]&lt;=MAX_EXPLICIT_LEVEL</code>,
* with one exception: a level of zero may be specified for a
- * paragraph separator even if {@code paraLevel > 0} when multiple
+ * paragraph separator even if <code>paraLevel&gt;0</code> when multiple
* paragraphs are submitted in the same call to <code>setPara()</code>.
* Caution: A reference to this array, not a copy
* of the levels, will be stored in the <code>Bidi</code> object;
@@ -2444,22 +3521,28 @@
* @see #MAX_EXPLICIT_LEVEL
* @stable ICU 3.8
*/
- public void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels)
+ void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels)
{
/* check the argument values */
- if (paraLevel < INTERNAL_LEVEL_DEFAULT_LTR) {
+ if (paraLevel < LEVEL_DEFAULT_LTR) {
verifyRange(paraLevel, 0, MAX_EXPLICIT_LEVEL + 1);
}
if (chars == null) {
chars = new char[0];
}
+ /* special treatment for RUNS_ONLY mode */
+ if (reorderingMode == REORDER_RUNS_ONLY) {
+ setParaRunsOnly(chars, paraLevel);
+ return;
+ }
+
/* initialize the Bidi object */
this.paraBidi = null; /* mark unfinished setPara */
this.text = chars;
this.length = this.originalLength = this.resultLength = text.length;
this.paraLevel = paraLevel;
- this.direction = Bidi.DIRECTION_LEFT_TO_RIGHT;
+ this.direction = (byte)(paraLevel & 1);
this.paraCount = 1;
/* Allocate zero-length arrays instead of setting to null here; then
@@ -2475,11 +3558,7 @@
/*
* Save the original paraLevel if contextual; otherwise, set to 0.
*/
- if (IsDefaultLevel(paraLevel)) {
- defaultParaLevel = paraLevel;
- } else {
- defaultParaLevel = 0;
- }
+ defaultParaLevel = IsDefaultLevel(paraLevel) ? paraLevel : 0;
if (length == 0) {
/*
@@ -2491,17 +3570,10 @@
this.paraLevel &= 1;
defaultParaLevel = 0;
}
- if ((this.paraLevel & 1) != 0) {
- flags = DirPropFlag(R);
- direction = Bidi.DIRECTION_RIGHT_TO_LEFT;
- } else {
- flags = DirPropFlag(L);
- direction = Bidi.DIRECTION_LEFT_TO_RIGHT;
- }
-
+ flags = DirPropFlagLR(paraLevel);
runCount = 0;
paraCount = 0;
- paraBidi = this; /* mark successful setPara */
+ setParaSuccess();
return;
}
@@ -2515,21 +3587,9 @@
getDirPropsMemory(length);
dirProps = dirPropsMemory;
getDirProps();
-
/* the processed length may have changed if OPTION_STREAMING is set */
trailingWSStart = length; /* the levels[] will reflect the WS run */
- /* allocate paras memory */
- if (paraCount > 1) {
- getInitialParasMemory(paraCount);
- paras = parasMemory;
- paras[paraCount - 1] = length;
- } else {
- /* initialize paras for single paragraph */
- paras = simpleParas;
- simpleParas[0] = length;
- }
-
/* are explicit levels specified? */
if (embeddingLevels == null) {
/* no: determine explicit levels according to the (Xn) rules */
@@ -2542,28 +3602,62 @@
direction = checkExplicitLevels();
}
+ /* allocate isolate memory */
+ if (isolateCount > 0) {
+ if (isolates == null || isolates.length < isolateCount)
+ isolates = new Isolate[isolateCount + 3]; /* keep some reserve */
+ }
+ isolateCount = -1; /* current isolates stack entry == none */
+
/*
* The steps after (X9) in the Bidi algorithm are performed only if
* the paragraph text has mixed directionality!
*/
switch (direction) {
- case Bidi.DIRECTION_LEFT_TO_RIGHT:
- /* make sure paraLevel is even */
- paraLevel = (byte)((paraLevel + 1) & ~1);
-
+ case LTR:
/* all levels are implicitly at paraLevel (important for getLevels()) */
trailingWSStart = 0;
break;
- case Bidi.DIRECTION_RIGHT_TO_LEFT:
- /* make sure paraLevel is odd */
- paraLevel |= 1;
-
+ case RTL:
/* all levels are implicitly at paraLevel (important for getLevels()) */
trailingWSStart = 0;
break;
default:
- this.impTabPair = impTab_DEFAULT;
-
+ /*
+ * Choose the right implicit state table
+ */
+ switch(reorderingMode) {
+ case REORDER_DEFAULT:
+ this.impTabPair = impTab_DEFAULT;
+ break;
+ case REORDER_NUMBERS_SPECIAL:
+ this.impTabPair = impTab_NUMBERS_SPECIAL;
+ break;
+ case REORDER_GROUP_NUMBERS_WITH_R:
+ this.impTabPair = impTab_GROUP_NUMBERS_WITH_R;
+ break;
+ case REORDER_RUNS_ONLY:
+ /* we should never get here */
+ throw new InternalError("Internal ICU error in setPara");
+ /* break; */
+ case REORDER_INVERSE_NUMBERS_AS_L:
+ this.impTabPair = impTab_INVERSE_NUMBERS_AS_L;
+ break;
+ case REORDER_INVERSE_LIKE_DIRECT:
+ if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) {
+ this.impTabPair = impTab_INVERSE_LIKE_DIRECT_WITH_MARKS;
+ } else {
+ this.impTabPair = impTab_INVERSE_LIKE_DIRECT;
+ }
+ break;
+ case REORDER_INVERSE_FOR_NUMBERS_SPECIAL:
+ if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) {
+ this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS;
+ } else {
+ this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL;
+ }
+ break;
+ }
/*
* If there are no external levels specified and there
* are no significant explicit level codes in the text,
@@ -2601,7 +3695,7 @@
/* the values for this run's start are the same as for the previous run's end */
start = limit;
level = nextLevel;
- if ((start > 0) && (NoContextRTL(dirProps[start - 1]) == B)) {
+ if ((start > 0) && (dirProps[start - 1] == B)) {
/* except if this is a new paragraph, then set sor = para level */
sor = GetLRFromLevel(GetParaLevelAt(start));
} else {
@@ -2609,7 +3703,9 @@
}
/* search for the limit of this run */
- while (++limit < length && levels[limit] == level) {}
+ while ((++limit < length) &&
+ ((levels[limit] == level) ||
+ ((DirPropFlag(dirProps[limit]) & MASK_BN_EXPLICIT) != 0))) {}
/* get the correct level of the next run */
if (limit < length) {
@@ -2619,7 +3715,7 @@
}
/* determine eor from max(level, nextLevel); sor is last run's eor */
- if ((level & ~INTERNAL_LEVEL_OVERRIDE) < (nextLevel & ~INTERNAL_LEVEL_OVERRIDE)) {
+ if (NoOverride(level) < NoOverride(nextLevel)) {
eor = GetLRFromLevel(nextLevel);
} else {
eor = GetLRFromLevel(level);
@@ -2627,12 +3723,12 @@
/* if the run consists of overridden directional types, then there
are no implicit types to be resolved */
- if ((level & INTERNAL_LEVEL_OVERRIDE) == 0) {
+ if ((level & LEVEL_OVERRIDE) == 0) {
resolveImplicitLevels(start, limit, sor, eor);
} else {
/* remove the LEVEL_OVERRIDE flags */
do {
- levels[start++] &= ~INTERNAL_LEVEL_OVERRIDE;
+ levels[start++] &= ~LEVEL_OVERRIDE;
} while (start < limit);
}
} while (limit < length);
@@ -2644,8 +3740,46 @@
break;
}
- resultLength += insertPoints.size;
- paraBidi = this; /* mark successful setPara */
+ /* add RLM for inverse Bidi with contextual orientation resolving
+ * to RTL which would not round-trip otherwise
+ */
+ if ((defaultParaLevel > 0) &&
+ ((reorderingOptions & OPTION_INSERT_MARKS) != 0) &&
+ ((reorderingMode == REORDER_INVERSE_LIKE_DIRECT) ||
+ (reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL))) {
+ int start, last;
+ byte level;
+ byte dirProp;
+ for (int i = 0; i < paraCount; i++) {
+ last = paras_limit[i] - 1;
+ level = paras_level[i];
+ if (level == 0)
+ continue; /* LTR paragraph */
+ start = i == 0 ? 0 : paras_limit[i - 1];
+ for (int j = last; j >= start; j--) {
+ dirProp = dirProps[j];
+ if (dirProp == L) {
+ if (j < last) {
+ while (dirProps[last] == B) {
+ last--;
+ }
+ }
+ addPoint(last, RLM_BEFORE);
+ break;
+ }
+ if ((DirPropFlag(dirProp) & MASK_R_AL) != 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ if ((reorderingOptions & OPTION_REMOVE_CONTROLS) != 0) {
+ resultLength -= controlCount;
+ } else {
+ resultLength += insertPoints.size;
+ }
+ setParaSuccess();
}
/**
@@ -2682,7 +3816,7 @@
* For example, in pure LTR text with numbers the numbers would get
* a resolved level of 2 higher than the surrounding text according to
* the algorithm. This implementation may set all resolved levels to
- * the same value in such a case.
+ * the same value in such a case.BidiClassifier
is defined and returns a value
+ * other than CLASS_DEFAULT
, that value is used; otherwise
+ * the default class determination mechanism is invoked.[0..getProcessedLength()-1]
.
+ * @param c The code point to get a Bidi class for.
*
- * @return The index of the paragraph containing the specified position,
- * starting from 0.
+ * @return The Bidi class for the character <code>c</code> that is in effect
+ * for this <code>Bidi</code> instance.
*
- * @throws IllegalStateException if this call is not preceded by a successful
- * call to setPara
or setLine
- * @throws IllegalArgumentException if charIndex is not within the legal range
- *
- * @see com.ibm.icu.text.BidiRun
- * @see #getProcessedLength
* @stable ICU 3.8
*/
- public int getParagraphIndex(int charIndex)
- {
- verifyValidParaOrLine();
- BidiBase bidi = paraBidi; /* get Para object if Line object */
- verifyRange(charIndex, 0, bidi.length);
- int paraIndex;
- for (paraIndex = 0; charIndex >= bidi.paras[paraIndex]; paraIndex++) {
- }
- return paraIndex;
+ public int getCustomizedClass(int c) {
+ int dir; /* Bidi class that will be returned for c */
+
+ dir = bdp.getClass(c); /* the lookup itself is delegated to the bdp member */
+ if (dir >= CHAR_DIRECTION_COUNT) /* result lies at or beyond CHAR_DIRECTION_COUNT */
+ dir = ON; /* ... so ON is reported instead; NOTE(review): presumably these are classes this code does not handle -- confirm */
+ return dir;
}
/**
@@ -2891,7 +4020,7 @@
verifyRange(start, 0, limit);
verifyRange(limit, 0, length+1);
- return BidiLine.setLine(bidi, this, newBidi, newBidiBase, start, limit);
+ return BidiLine.setLine(this, newBidi, newBidiBase, start, limit);
}
/**
@@ -2911,9 +4040,11 @@
*/
public byte getLevelAt(int charIndex)
{
+ // for backward compatibility
if (charIndex < 0 || charIndex >= length) {
return (byte)getBaseLevel();
}
+
verifyValidParaOrLine();
verifyRange(charIndex, 0, length);
return BidiLine.getLevelAt(this, charIndex);
@@ -2932,7 +4063,7 @@
* call to setPara
or setLine
* @stable ICU 3.8
*/
- private byte[] getLevels()
+ byte[] getLevels()
{
verifyValidParaOrLine();
if (length <= 0) {
@@ -2963,6 +4094,78 @@
}
/**
+ *
+ * Get a <code>BidiRun</code> object according to its index. BidiRun methods
+ * may be used to retrieve the run's logical start, length and level,
+ * which can be even for an LTR run or odd for an RTL run.
+ * In an RTL run, the character at the logical start is
+ * visually on the right of the displayed run.
+ * The length is the number of characters in the run.<p>
+ * <code>countRuns()</code> is normally called
+ * before the runs are retrieved.
+ *
+ * <p>
+ * Example:
+ * <pre>
+ * Bidi bidi = new Bidi();
+ * String text = "abc 123 DEFG xyz";
+ * bidi.setPara(text, Bidi.RTL, null);
+ * int i, count=bidi.countRuns(), logicalStart, visualIndex=0, length;
+ * BidiRun run;
+ * for (i = 0; i &lt; count; ++i) {
+ * run = bidi.getVisualRun(i);
+ * logicalStart = run.getStart();
+ * length = run.getLength();
+ * if (Bidi.LTR == run.getEmbeddingLevel()) {
+ * do { // LTR
+ * show_char(text.charAt(logicalStart++), visualIndex++);
+ * } while (--length &gt; 0);
+ * } else {
+ * logicalStart += length; // logicalLimit
+ * do { // RTL
+ * show_char(text.charAt(--logicalStart), visualIndex++);
+ * } while (--length &gt; 0);
+ * }
+ * }
+ * </pre>
+ * <p>
+ * Note that in right-to-left runs, code like this places
+ * second surrogates before first ones (which is generally a bad idea)
+ * and combining characters before base characters.
+ * <p>
+ * Use of <code>{@link #writeReordered}</code>, optionally with the
+ * <code>{@link #KEEP_BASE_COMBINING}</code> option, can be considered in
+ * order to avoid these issues.
+ *
+ * @param runIndex is the number of the run in visual order, in the
+ * range <code>[0..countRuns()-1]</code>.
+ *
+ * @return a BidiRun object containing the details of the run. The
+ * directionality of the run is
+ * <code>LTR==0</code> or <code>RTL==1</code>,
+ * never <code>MIXED</code>.
+ *
+ * @throws IllegalStateException if this call is not preceded by a successful
+ * call to <code>setPara</code> or <code>setLine</code>
+ * @throws IllegalArgumentException if <code>runIndex</code> is not in
+ * the range <code>0&lt;=runIndex&lt;countRuns()</code>
+ *
+ * @see #countRuns()
+ * @see com.ibm.icu.text.BidiRun
+ * @see com.ibm.icu.text.BidiRun#getStart()
+ * @see com.ibm.icu.text.BidiRun#getLength()
+ * @see com.ibm.icu.text.BidiRun#getEmbeddingLevel()
+ * @stable ICU 3.8
+ */
+ BidiRun getVisualRun(int runIndex)
+ {
+ verifyValidParaOrLine(); /* state check; the Javadoc documents IllegalStateException when no successful setPara/setLine preceded */
+ BidiLine.getRuns(this); /* runs before the range check below, which reads runCount; NOTE(review): presumably getRuns() is what fills runs[] and runCount -- confirm */
+ verifyRange(runIndex, 0, runCount); /* checked against 0 and runCount; the Javadoc documents 0<=runIndex<countRuns() */
+ return BidiLine.getVisualRun(this, runIndex); /* BidiLine builds the BidiRun that is returned */
+ }
+
+ /**
* Get a visual-to-logical index map (array) for the characters in the
* Bidi
(paragraph or line) object.
* Bidi
object containing the reordering
+ * information for a piece of text (one or more paragraphs) set by
+ * setPara()
or for a line of text set by setLine()
+ * and return a string containing the reordered text.
+ *
+ * setPara()
call.options
parameter, and of the option bit flags.
+ *
+ * @param options A bit set of options for the reordering that control
+ * how the reordered text is written.
+ * The options include mirroring the characters on a code
+ * point basis and inserting LRM characters, which is used
+ * especially for transforming visually stored text
+ * to logically stored text (although this is still an
+ * imperfect implementation of an "inverse Bidi" algorithm
+ * because it uses the "forward Bidi" algorithm at its core).
+ * The available options are:
+ * <code>DO_MIRRORING</code>,
+ * <code>INSERT_LRM_FOR_NUMERIC</code>,
+ * <code>KEEP_BASE_COMBINING</code>,
+ * <code>OUTPUT_REVERSE</code>,
+ * <code>REMOVE_BIDI_CONTROLS</code>,
+ * <code>STREAMING</code>
+ *
+ * @return The reordered text.
+ * If the <code>INSERT_LRM_FOR_NUMERIC</code> option is set, then
+ * the length of the returned string could be as large as
+ * <code>getLength()+2*countRuns()</code>.
+ * If the <code>REMOVE_BIDI_CONTROLS</code> option is set, then the
+ * length of the returned string may be less than
+ * <code>getLength()</code>.
+ * If none of these options is set, then the length of the returned
+ * string will be exactly <code>getProcessedLength()</code>.
+ *
+ * @throws IllegalStateException if this call is not preceded by a successful
+ * call to <code>setPara</code> or <code>setLine</code>
+ *
+ * @see #DO_MIRRORING
+ * @see #INSERT_LRM_FOR_NUMERIC
+ * @see #KEEP_BASE_COMBINING
+ * @see #OUTPUT_REVERSE
+ * @see #REMOVE_BIDI_CONTROLS
+ * @see #OPTION_STREAMING
+ * @see #getProcessedLength
+ * @stable ICU 3.8
+ */
+ public String writeReordered(int options)
+ {
+ verifyValidParaOrLine(); /* state check happens first, even for empty text */
+ if (length == 0) {
+ /* nothing to do: empty text yields "" without calling BidiWriter */
+ return "";
+ }
+ return BidiWriter.writeReordered(this, options); /* the reordering itself is delegated; options is passed through unchanged */
+ }
+
+ /**
* Display the bidi internal state, used in debugging.
*/
public String toString() {
@@ -3507,4 +4776,5 @@
}
}
}
+
}
--- old/jdk/src/java.base/share/classes/sun/text/bidi/BidiLine.java 2015-07-13 16:11:45.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/bidi/BidiLine.java 2015-07-13 16:11:45.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,17 +22,13 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
+*******************************************************************************
+* Copyright (C) 2001-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
@@ -42,7 +38,7 @@
import java.text.Bidi;
import java.util.Arrays;
-public final class BidiLine {
+final class BidiLine {
/*
* General remarks about the functions in this file:
@@ -122,13 +118,13 @@
level of B chars from 0 to paraLevel in getLevels when
orderParagraphsLTR==TRUE
*/
- if (BidiBase.NoContextRTL(dirProps[start - 1]) == BidiBase.B) {
+ if (dirProps[start - 1] == BidiBase.B) {
bidiBase.trailingWSStart = start; /* currently == bidiBase.length */
return;
}
/* go backwards across all WS, BN, explicit codes */
while (start > 0 &&
- (BidiBase.DirPropFlagNC(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
+ (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
--start;
}
@@ -140,13 +136,11 @@
bidiBase.trailingWSStart=start;
}
- public static Bidi setLine(Bidi bidi, BidiBase paraBidi,
- Bidi newBidi, BidiBase newBidiBase,
- int start, int limit) {
+ static Bidi setLine(BidiBase paraBidi,
+ Bidi newBidi, BidiBase lineBidi,
+ int start, int limit) {
int length;
- BidiBase lineBidi = newBidiBase;
-
/* set the values in lineBidi from its paraBidi parent */
/* class members are already initialized to 0 */
// lineBidi.paraBidi = null; /* mark unfinished setLine */
@@ -161,6 +155,8 @@
lineBidi.paraLevel = paraBidi.GetParaLevelAt(start);
lineBidi.paraCount = paraBidi.paraCount;
lineBidi.runs = new BidiRun[0];
+ lineBidi.reorderingMode = paraBidi.reorderingMode;
+ lineBidi.reorderingOptions = paraBidi.reorderingOptions;
if (paraBidi.controlCount > 0) {
int j;
for (j = start; j < limit; j++) {
@@ -206,7 +202,7 @@
setTrailingWSStart(lineBidi);
trailingWSStart = lineBidi.trailingWSStart;
- /* recalculate lineBidi.direction */
+ /* recalculate lineBidiBase.direction */
if (trailingWSStart == 0) {
/* all levels are at paraLevel */
lineBidi.direction = (byte)(lineBidi.paraLevel & 1);
@@ -260,7 +256,8 @@
}
}
- newBidiBase.paraBidi = paraBidi; /* mark successful setLine */
+ lineBidi.paraBidi = paraBidi; /* mark successful setLine */
+
return newBidi;
}
@@ -303,30 +300,19 @@
return bidiBase.levels;
}
- static BidiRun getLogicalRun(BidiBase bidiBase, int logicalPosition)
- {
- /* this is done based on runs rather than on levels since levels have
- a special interpretation when REORDER_RUNS_ONLY
- */
- BidiRun newRun = new BidiRun(), iRun;
- getRuns(bidiBase);
- int runCount = bidiBase.runCount;
- int visualStart = 0, logicalLimit = 0;
- iRun = bidiBase.runs[0];
-
- for (int i = 0; i < runCount; i++) {
- iRun = bidiBase.runs[i];
- logicalLimit = iRun.start + iRun.limit - visualStart;
- if ((logicalPosition >= iRun.start) &&
- (logicalPosition < logicalLimit)) {
- break;
- }
- visualStart = iRun.limit;
- }
- newRun.start = iRun.start;
- newRun.limit = logicalLimit;
- newRun.level = iRun.level;
- return newRun;
+ static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) { /* builds a new BidiRun from runs[runIndex]; no bounds check on runIndex is done here */
+ int start = bidiBase.runs[runIndex].start; /* BidiRun.start: first logical position of the run */
+ int limit; /* value handed to the BidiRun constructor: start plus the width computed below */
+ byte level = bidiBase.runs[runIndex].level;
+
+ if (runIndex > 0) {
+ limit = start +
+ bidiBase.runs[runIndex].limit - /* BidiRun.limit: last visual position of the run + 1 */
+ bidiBase.runs[runIndex - 1].limit; /* minus the preceding entry's visual limit; NOTE(review): presumably the difference is this run's length -- confirm */
+ } else {
+ limit = start + bidiBase.runs[0].limit; /* first entry: there is no preceding limit to subtract */
+ }
+ return new BidiRun(start, limit, level); /* a fresh object; the entry in runs[] is not modified */
}
/* in trivial cases there is only one trivial run; called by getRuns() */
@@ -502,7 +488,7 @@
int length = bidiBase.length, limit;
byte[] levels = bidiBase.levels;
int i, runCount;
- byte level = BidiBase.INTERNAL_LEVEL_DEFAULT_LTR; /* initialize with no valid level */
+ byte level = -1; /* initialize with no valid level */
/*
* If there are WS characters at the end of the line
* and the run preceding them has a level different from
@@ -651,7 +637,7 @@
maxLevel = 0;
for (start = levels.length; start>0; ) {
level = levels[--start];
- if (level > BidiBase.MAX_EXPLICIT_LEVEL + 1) {
+ if (level < 0 || level > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) {
return null;
}
if (level < minLevel) {
--- old/jdk/src/java.base/share/classes/sun/text/bidi/BidiRun.java 2015-07-13 16:11:46.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/bidi/BidiRun.java 2015-07-13 16:11:46.000000000 +0900
@@ -55,7 +55,7 @@
*
* @see com.ibm.icu.text.Bidi
*/
-public class BidiRun {
+class BidiRun {
int start; /* first logical position of the run */
int limit; /* last visual position of the run +1 */
@@ -106,7 +106,7 @@
/**
* Get level of run
*/
- public byte getEmbeddingLevel()
+ byte getEmbeddingLevel()
{
return level;
}
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/CharTrie.java 2015-07-13 16:11:47.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/CharTrie.java 2015-07-13 16:11:46.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,22 +22,18 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
+ ******************************************************************************
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ******************************************************************************
*/
package sun.text.normalizer;
-import java.io.InputStream;
import java.io.DataInputStream;
+import java.io.InputStream;
import java.io.IOException;
/**
@@ -73,120 +69,17 @@
throw new IllegalArgumentException(
"Data given does not belong to a char trie.");
}
- m_friendAgent_ = new FriendAgent();
- }
-
- /**
- * Make a dummy CharTrie.
- * A dummy trie is an empty runtime trie, used when a real data trie cannot
- * be loaded.
- *
- * The trie always returns the initialValue,
- * or the leadUnitValue for lead surrogate code points.
- * The Latin-1 part is always set up to be linear.
- *
- * @param initialValue the initial value that is set for all code points
- * @param leadUnitValue the value for lead surrogate code _units_ that do not
- * have associated supplementary data
- * @param dataManipulate object which provides methods to parse the char data
- */
- public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
- super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
-
- int dataLength, latin1Length, i, limit;
- char block;
-
- /* calculate the actual size of the dummy trie data */
-
- /* max(Latin-1, block 0) */
- dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
- if(leadUnitValue!=initialValue) {
- dataLength+=DATA_BLOCK_LENGTH;
- }
- m_data_=new char[dataLength];
- m_dataLength_=dataLength;
-
- m_initialValue_=(char)initialValue;
-
- /* fill the index and data arrays */
-
- /* indexes are preset to 0 (block 0) */
-
- /* Latin-1 data */
- for(i=0; i
- *
- *
- *
- *
- * try {
- * FileInputStream input = new FileInputStream(filename);
- * If (Utility.readICUDataHeader(input, dataformat, dataversion,
- * unicode) {
- * System.out.println("Verified file header, this is a ICU data file");
- * }
- * } catch (IOException e) {
- * System.out.println("This is not a ICU data file");
- * }
- *
- *
- * @param inputStream input stream that contains the ICU data header
- * @param dataFormatIDExpected Data format expected. An array of 4 bytes
- * information about the data format.
- * E.g. data format ID 1.2.3.4. will became an array of
- * {1, 2, 3, 4}
- * @param authenticate user defined extra data authentication. This value
- * can be null, if no extra authentication is needed.
- * @exception IOException thrown if there is a read error or
- * when header authentication fails.
- * @draft 2.1
- */
+ * Loads an ICU binary data file and returns it as a ByteBuffer.
+ * The buffer contents is normally read-only, but its position etc. can be modified.
+ *
+ * @param itemPath Relative ICU data item path, for example "root.res" or "coll/ucadata.icu".
+ * @return The data as a read-only ByteBuffer.
+ */
+ public static ByteBuffer getRequiredData(String itemPath) {
+ final Class
- *
*
* @see #getMode
@@ -983,9 +686,11 @@
*/
public void setMode(Mode newMode) {
mode = newMode;
+ norm2 = mode.getNormalizer2(options); /* re-derive norm2 from the newly assigned mode and the current options so the two stay in step */
}
+
/**
- * Return the basic operation performed by this {@code Normalizer}
+ * Return the basic operation performed by this {@code NormalizerBase}
*
* @see #setMode
* @stable ICU 2.8
@@ -995,688 +700,83 @@
}
/**
- * Set the input text over which this {@code Normalizer} will iterate.
+ * Set the input text over which this {@code NormalizerBase} will iterate.
* The iteration position is set to the beginning of the input text.
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(String newText) {
-
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
if (newIter == null) {
- throw new InternalError("Could not create a new UCharacterIterator");
+ throw new IllegalStateException("Could not create a new UCharacterIterator"); /* thrown before text is reassigned or reset() runs, so the previous input stays in place */
}
text = newIter;
reset();
}
/**
- * Set the input text over which this {@code Normalizer} will iterate.
+ * Set the input text over which this {@code NormalizerBase} will iterate.
* The iteration position is set to the beginning of the input text.
* @param newText The new string to be normalized.
* @stable ICU 2.8
*/
public void setText(CharacterIterator newText) {
-
UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
if (newIter == null) {
- throw new InternalError("Could not create a new UCharacterIterator");
+ throw new IllegalStateException("Could not create a new UCharacterIterator"); /* thrown before text, the indices or the buffer are touched, so the previous input stays in place */
}
text = newIter;
currentIndex=nextIndex=0;
clearBuffer();
}
- //-------------------------------------------------------------------------
- // Private utility methods
- //-------------------------------------------------------------------------
-
-
- /* backward iteration --------------------------------------------------- */
-
- /*
- * read backwards and get norm32
- * return 0 if the character is String
using the given normalization form.
- *
- * @param str the input string to be normalized.
- * @param form the normalization form
- */
- public static String normalize(String str, Normalizer.Form form) {
- return normalize(str, form, UNICODE_LATEST);
- }
-
- /**
- * Normalizes a String
using the given normalization form.
- *
- * @param str the input string to be normalized.
- * @param form the normalization form
- * @param options the optional features to be enabled.
- */
- public static String normalize(String str, Normalizer.Form form, int options) {
- int len = str.length();
- boolean asciiOnly = true;
- if (len < 80) {
- for (int i = 0; i < len; i++) {
- if (str.charAt(i) > 127) {
- asciiOnly = false;
- break;
- }
+ StringBuilder segment=new StringBuilder();
+ int c;
+ while((c=text.previousCodePoint())>=0) {
+ if(c<=0xffff) {
+ segment.insert(0, (char)c);
+ } else {
+ segment.insert(0, Character.toChars(c));
}
- } else {
- char[] a = str.toCharArray();
- for (int i = 0; i < len; i++) {
- if (a[i] > 127) {
- asciiOnly = false;
- break;
- }
+ if(norm2.hasBoundaryBefore(c)) {
+ break;
}
}
-
- switch (form) {
- case NFC :
- return asciiOnly ? str : NFC.normalize(str, options);
- case NFD :
- return asciiOnly ? str : NFD.normalize(str, options);
- case NFKC :
- return asciiOnly ? str : NFKC.normalize(str, options);
- case NFKD :
- return asciiOnly ? str : NFKD.normalize(str, options);
- }
-
- throw new IllegalArgumentException("Unexpected normalization form: " +
- form);
- }
-
- /**
- * Test if a string is in a given normalization form.
- * This is semantically equivalent to source.equals(normalize(source, mode)).
- *
- * Unlike quickCheck(), this function returns a definitive result,
- * never a "maybe".
- * For NFD, NFKD, and FCD, both functions work exactly the same.
- * For NFC and NFKC where quickCheck may return "maybe", this function will
- * perform further tests to arrive at a true/false result.
- * @param str the input string to be checked to see if it is normalized
- * @param form the normalization form
- */
- public static boolean isNormalized(String str, Normalizer.Form form) {
- return isNormalized(str, form, UNICODE_LATEST);
+ currentIndex=text.getIndex();
+ norm2.normalize(segment, buffer);
+ bufferPos=buffer.length();
+ return buffer.length()!=0;
}
- /**
- * Test if a string is in a given normalization form.
- * This is semantically equivalent to source.equals(normalize(source, mode)).
- *
- * Unlike quickCheck(), this function returns a definitive result,
- * never a "maybe".
- * For NFD, NFKD, and FCD, both functions work exactly the same.
- * For NFC and NFKC where quickCheck may return "maybe", this function will
- * perform further tests to arrive at a true/false result.
- * @param str the input string to be checked to see if it is normalized
- * @param form the normalization form
- * @param options the optional features to be enabled.
- */
- public static boolean isNormalized(String str, Normalizer.Form form, int options) {
- switch (form) {
- case NFC:
- return (NFC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- case NFD:
- return (NFD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- case NFKC:
- return (NFKC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- case NFKD:
- return (NFKD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES);
- }
-
- throw new IllegalArgumentException("Unexpected normalization form: " +
- form);
- }
}
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2015-07-13 16:11:50.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2015-07-13 16:11:49.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,2453 +22,1706 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 2009-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
import java.io.IOException;
-import java.io.BufferedInputStream;
-import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.text.Normalizer;
-/**
- * @author Ram Viswanadha
- */
+// Original filename in ICU4J: Normalizer2Impl.java
public final class NormalizerImpl {
- // Static block for the class to initialize its own self
- static final NormalizerImpl IMPL;
-
- static
- {
- try
- {
- IMPL = new NormalizerImpl();
- }
- catch (Exception e)
- {
- throw new RuntimeException(e.getMessage());
- }
- }
-
- static final int UNSIGNED_BYTE_MASK =0xFF;
- static final long UNSIGNED_INT_MASK = 0xffffffffL;
- /*
- * This new implementation of the normalization code loads its data from
- * unorm.icu, which is generated with the gennorm tool.
- * The format of that file is described at the end of this file.
- */
- private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu";
-
- // norm32 value constants
-
- // quick check flags 0..3 set mean "no" for their forms
- public static final int QC_NFC=0x11; /* no|maybe */
- public static final int QC_NFKC=0x22; /* no|maybe */
- public static final int QC_NFD=4; /* no */
- public static final int QC_NFKD=8; /* no */
-
- public static final int QC_ANY_NO=0xf;
-
- /* quick check flags 4..5 mean "maybe" for their forms;
- * test flags>=QC_MAYBE
- */
- public static final int QC_MAYBE=0x10;
- public static final int QC_ANY_MAYBE=0x30;
-
- public static final int QC_MASK=0x3f;
-
- private static final int COMBINES_FWD=0x40;
- private static final int COMBINES_BACK=0x80;
- public static final int COMBINES_ANY=0xc0;
- // UnicodeData.txt combining class in bits 15.
- private static final int CC_SHIFT=8;
- public static final int CC_MASK=0xff00;
- // 16 bits for the index to UChars and other extra data
- private static final int EXTRA_SHIFT=16;
-
- /* norm32 value constants using >16 bits */
- private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK;
- private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK;
- private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK;
-// private static final long MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK;
- private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK;
-
-
- /* indexes[] value names */
- /* number of bytes in normalization trie */
- static final int INDEX_TRIE_SIZE = 0;
- /* number of chars in extra data */
- static final int INDEX_CHAR_COUNT = 1;
- /* number of uint16_t words for combining data */
- static final int INDEX_COMBINE_DATA_COUNT = 2;
- /* first code point with quick check NFC NO/MAYBE */
- public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
- /* first code point with quick check NFKC NO/MAYBE */
- public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
- /* first code point with quick check NFD NO/MAYBE */
- public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
- /* first code point with quick check NFKD NO/MAYBE */
- public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
- /* number of bytes in FCD trie */
- static final int INDEX_FCD_TRIE_SIZE = 10;
- /* number of bytes in the auxiliary trie */
- static final int INDEX_AUX_TRIE_SIZE = 11;
- /* changing this requires a new formatVersion */
- static final int INDEX_TOP = 32;
-
-
- /* AUX constants */
- /* value constants for auxTrie */
- private static final int AUX_UNSAFE_SHIFT = 11;
- private static final int AUX_COMP_EX_SHIFT = 10;
- private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12;
-
- private static final int AUX_MAX_FNC = 1<
+ * Bits 21..1 composite character
+ * Bit 0 set if the composite is a forward-combining starter
+ *
+ * otherwise it returns -1.
+ *
+ * buf
at the time of
@@ -98,7 +92,6 @@
return buf.charAt(offset);
}
- //// for StringPrep
/**
* Copies characters from this object into the destination
* character array. The first character to be copied is at index
@@ -118,6 +111,8 @@
* @stable ICU 2.0
*/
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
- Utility.getChars(buf, srcStart, srcLimit, dst, dstStart);
+ if (srcStart != srcLimit) {
+ buf.getChars(srcStart, srcLimit, dst, dstStart);
+ }
}
}
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableUCharacterIterator.java 2015-07-13 16:11:52.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableUCharacterIterator.java 2015-07-13 16:11:52.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -47,7 +47,7 @@
*
* What are first, last, and getBeginIndex doing here?!?!?!
*/
-public class ReplaceableUCharacterIterator extends UCharacterIterator {
+class ReplaceableUCharacterIterator extends UCharacterIterator {
// public constructor ------------------------------------------------------
@@ -63,7 +63,6 @@
this.currentIndex = 0;
}
- //// for StringPrep
/**
* Public constructor
* @param buf buffer of text on which the iterator will be based
@@ -164,7 +163,6 @@
this.currentIndex = currentIndex;
}
- //// for StringPrep
public int getText(char[] fillIn, int offset){
int length = replaceable.length();
if(offset < 0 || offset + length > fillIn.length){
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/Trie.java 2015-07-13 16:11:53.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Trie.java 2015-07-13 16:11:52.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,16 +22,12 @@
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
+
/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
+ ******************************************************************************
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ******************************************************************************
*/
package sun.text.normalizer;
@@ -135,93 +131,62 @@
unserialize(inputStream);
}
- /**
- * Trie constructor
- * @param index array to be used for index
- * @param options used by the trie
- * @param dataManipulate object containing the information to parse the
- * trie data
- */
- protected Trie(char index[], int options, DataManipulate dataManipulate)
- {
- m_options_ = options;
- if(dataManipulate != null) {
- m_dataManipulate_ = dataManipulate;
- } else {
- m_dataManipulate_ = new DefaultGetFoldingOffset();
- }
- m_isLatin1Linear_ = (m_options_ &
- HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
- m_index_ = index;
- m_dataOffset_ = m_index_.length;
- }
-
// protected data members ------------------------------------------
/**
- * Lead surrogate code points' index displacement in the index array.
- * {@code
- * 0x10000-0xd800=0x2800
- * 0x2800 >> INDEX_STAGE_1_SHIFT_
- * }
- */
+ * Lead surrogate code points' index displacement in the index array.
+ * {@code
+ * 0x10000-0xd800=0x2800
+ * 0x2800 >> INDEX_STAGE_1_SHIFT_
+ * }
+ */
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
/**
- * Shift size for shifting right the input index. 1..9
- */
+ * Shift size for shifting right the input index. 1..9
+ */
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
/**
- * Shift size for shifting left the index array values.
- * Increases possible data size with 16-bit index values at the cost
- * of compactability.
- * This requires blocks of stage 2 data to be aligned by
- * DATA_GRANULARITY.
- * 0..INDEX_STAGE_1_SHIFT
- */
+ * Shift size for shifting left the index array values.
+ * Increases possible data size with 16-bit index values at the cost
+ * of compactability.
+ * This requires blocks of stage 2 data to be aligned by
+ * DATA_GRANULARITY.
+ * 0..INDEX_STAGE_1_SHIFT
+ */
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
/**
* Number of data values in a stage 2 (data array) block.
*/
protected static final int DATA_BLOCK_LENGTH=1<
- * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH
- * 8 0 = 16-bit data, 1=32-bit data
- * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT
- * 3..0 INDEX_STAGE_2_SHIFT // 1..9
- */
+ *
+ * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH
+ * 8 0 = 16-bit data, 1=32-bit data
+ * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT
+ * 3..0 INDEX_STAGE_2_SHIFT // 1..9
+ */
private int m_options_;
// private methods ---------------------------------------------------
/**
- * Authenticates raw data header.
- * Checking the header information, signature and options.
- * @param signature This contains the options and type of a Trie
- * @return true if the header is authenticated valid
- */
+ * Authenticates raw data header.
+ * Checking the header information, signature and options.
+ * @param signature This contains the options and type of a Trie
+ * @return true if the header is authenticated valid
+ */
private final boolean checkHeader(int signature)
{
// check the signature
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/UBiDiProps.java 2015-07-13 16:11:53.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UBiDiProps.java 2015-07-13 16:11:53.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,74 +24,71 @@
*/
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ *
+ * Copyright (C) 2004-2014, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
*******************************************************************************
-* file name: UBiDiProps.java
-* encoding: US-ASCII
-* tab size: 8 (not used)
-* indentation:4
-*
-* created on: 2005jan16
-* created by: Markus W. Scherer
-*
-* Low-level Unicode bidi/shaping properties access.
-* Java port of ubidi_props.h/.c.
-*/
+ * file name: UBiDiProps.java
+ * encoding: US-ASCII
+ * tab size: 8 (not used)
+ * indentation:4
+ *
+ * created on: 2005jan16
+ * created by: Markus W. Scherer
+ *
+ * Low-level Unicode bidi/shaping properties access.
+ * Java port of ubidi_props.h/.c.
+ */
package sun.text.normalizer;
-import java.io.BufferedInputStream;
-import java.io.DataInputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.MissingResourceException;
public final class UBiDiProps {
// constructors etc. --------------------------------------------------- ***
// port of ubidi_openProps()
- public UBiDiProps() throws IOException{
- InputStream is=ICUData.getStream(DATA_FILE_NAME);
- BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */);
- readData(b);
- b.close();
- is.close();
-
+ private UBiDiProps() throws IOException{
+ ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME);
+ readData(bytes);
}
- private void readData(InputStream is) throws IOException {
- DataInputStream inputStream=new DataInputStream(is);
-
+ private void readData(ByteBuffer bytes) throws IOException {
// read the header
- ICUBinary.readHeader(inputStream, FMT, new IsAcceptable());
+ ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
// read indexes[]
int i, count;
- count=inputStream.readInt();
- if(count
* E.g. In Windows
@@ -64,9 +54,8 @@
* unames.icu from the icu4j source subdirectory
* $ICU4J_SRC/src/com.ibm.icu.impl.data to your class directory
* $ICU4J_CLASS/com.ibm.icu.impl.data.
- *
*
* {@code
+ *
{@code
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
- * - punct: ((1<
+ *
*
- * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
+ * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
* @stable ICU 2.1
*/
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
- /**
- * The minimum value for Supplementary code points
- * @stable ICU 2.1
- */
- public static final int SUPPLEMENTARY_MIN_VALUE =
- UTF16.SUPPLEMENTARY_MIN_VALUE;
-
// public methods ----------------------------------------------------
/**
- * Retrieves the numeric value of a decimal digit code point.
+ * Returns the numeric value of a decimal digit code point.
*
This method observes the semantics of
* java.lang.Character.digit()
. Note that this
* will return positive values for code points for which isDigit
@@ -231,15 +288,54 @@
*/
public static int digit(int ch, int radix)
{
- // when ch is out of bounds getProperty == 0
- int props = getProperty(ch);
- int value;
- if (getNumericType(props) == NumericType.DECIMAL) {
- value = UCharacterProperty.getUnsignedValue(props);
+ if (2 <= radix && radix <= 36) {
+ int value = digit(ch);
+ if (value < 0) {
+ // ch is not a decimal digit, try latin letters
+ value = UCharacterProperty.getEuropeanDigit(ch);
+ }
+ return (value < radix) ? value : -1;
} else {
- value = getEuropeanDigit(ch);
+ return -1; // invalid radix
}
- return (0 <= value && value < radix) ? value : -1;
+ }
+
+ /**
+ * Returns the numeric value of a decimal digit code point.
+ *
This is a convenience overload of digit(int, int)
+ * that provides a decimal radix.
+ *
Semantic Change: In release 1.3.1 and prior, this
+ * treated numeric letters and other numbers as digits. This has
+ * been changed to conform to the java semantics.
+ * @param ch the code point to query
+ * @return the numeric value represented by the code point,
+ * or -1 if the code point is not a decimal digit or if its
+ * value is too large for a decimal radix
+ * @stable ICU 2.1
+ */
+ public static int digit(int ch)
+ {
+ return UCharacterProperty.INSTANCE.digit(ch);
+ }
+
+ /**
+ * Returns a value indicating a code point's Unicode category.
+ * Up-to-date Unicode implementation of java.lang.Character.getType()
+ * except for the above mentioned code points that had their category
+ * changed.
+ * Return results are constants from the interface
+ * UCharacterCategory
+ * NOTE: the UCharacterCategory values are not compatible with
+ * those returned by java.lang.Character.getType. UCharacterCategory values
+ * match the ones used in ICU4C, while java.lang.Character type
+ * values, though similar, skip the value 17.
Get the "age" of the code point.
+ * Returns the "age" of the code point. *The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character. @@ -289,143 +445,95 @@ public static VersionInfo getAge(int ch) { if (ch < MIN_VALUE || ch > MAX_VALUE) { - throw new IllegalArgumentException("Codepoint out of bounds"); + throw new IllegalArgumentException("Codepoint out of bounds"); } - return PROPERTY_.getAge(ch); + return UCharacterProperty.INSTANCE.getAge(ch); } - // private variables ------------------------------------------------- - - /** - * Database storing the sets of character property - */ - private static final UCharacterProperty PROPERTY_; /** - * For optimization + * Returns the property value for an Unicode property type of a code point. + * Also returns binary and mask property values.
+ *Unicode, especially in version 3.2, defines many more properties than + * the original set in UnicodeData.txt.
+ *The properties APIs are intended to reflect Unicode properties as + * defined in the Unicode Character Database (UCD) and Unicode Technical + * Reports (UTR). For details about the properties see + * http://www.unicode.org/.
+ *For names of Unicode properties see the UCD file PropertyAliases.txt. + *
+ *+ * Sample usage: + * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH); + * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC); + * boolean b = (ideo == 1) ? true : false; + *+ * @param ch code point to test. + * @param type UProperty selector constant, identifies which binary + * property to check. Must be + * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or + * UProperty.INT_START <= type < UProperty.INT_LIMIT or + * UProperty.MASK_START <= type < UProperty.MASK_LIMIT. + * @return numeric value that is directly the property value or, + * for enumerated properties, corresponds to the numeric value of + * the enumerated constant of the respective property value + * enumeration type (cast to enum type if necessary). + * Returns 0 or 1 (for false / true) for binary Unicode properties. + * Returns a bit-mask for mask properties. + * Returns 0 if 'type' is out of bounds or if the Unicode version + * does not have data for the property at all, or not for this code + * point. 
+ * @see UProperty + * @see #hasBinaryProperty + * @see #getIntPropertyMinValue + * @see #getIntPropertyMaxValue + * @see #getUnicodeVersion + * @stable ICU 2.4 */ - private static final char[] PROPERTY_TRIE_INDEX_; - private static final char[] PROPERTY_TRIE_DATA_; - private static final int PROPERTY_INITIAL_VALUE_; - - private static final UBiDiProps gBdp; - - // block to initialise character property database - static - { - try - { - PROPERTY_ = UCharacterProperty.getInstance(); - PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_; - PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_; - PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_; - } - catch (Exception e) - { - throw new MissingResourceException(e.getMessage(),"",""); - } - - UBiDiProps bdp; - try { - bdp=UBiDiProps.getSingleton(); - } catch(IOException e) { - bdp=UBiDiProps.getDummy(); - } - gBdp=bdp; + // for BiDiBase.java + public static int getIntPropertyValue(int ch, int type) { + return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type); } - /** - * Shift to get numeric type - */ - private static final int NUMERIC_TYPE_SHIFT_ = 5; - /** - * Mask to get numeric type - */ - private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_; - - // private methods --------------------------------------------------- + // private constructor ----------------------------------------------- /** - * Getting the digit values of characters like 'A' - 'Z', normal, - * half-width and full-width. This method assumes that the other digit - * characters are checked by the calling method. - * @param ch character to test - * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise - * its corresponding digit will be returned. 
+ * Private constructor to prevent instantiation */ - private static int getEuropeanDigit(int ch) { - if ((ch > 0x7a && ch < 0xff21) - || ch < 0x41 || (ch > 0x5a && ch < 0x61) - || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { - return -1; - } - if (ch <= 0x7a) { - // ch >= 0x41 or ch < 0x61 - return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); - } - // ch >= 0xff21 - if (ch <= 0xff3a) { - return ch + 10 - 0xff21; - } - // ch >= 0xff41 && ch <= 0xff5a - return ch + 10 - 0xff41; - } + private UCharacter() { } - /** - * Gets the numeric type of the property argument - * @param props 32 bit property - * @return the numeric type - */ - private static int getNumericType(int props) - { - return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_; - } + /* + * Copied from UCharacterEnums.java + */ - /** - * Gets the property value at the index. - * This is optimized. - * Note this is alittle different from CharTrie the index m_trieData_ - * is never negative. - * This is a duplicate of UCharacterProperty.getProperty. For optimization - * purposes, this method calls the trie data directly instead of through - * UCharacterProperty.getProperty. 
- * @param ch code point whose property value is to be retrieved - * @return property value of code point - * @stable ICU 2.6 - */ - private static final int getProperty(int ch) - { - if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE - || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE - && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { - // BMP codepoint 0000..D7FF or DC00..FFFF - try { // using try for ch < 0 is faster than using an if statement - return PROPERTY_TRIE_DATA_[ - (PROPERTY_TRIE_INDEX_[ch >> 5] << 2) - + (ch & 0x1f)]; - } catch (ArrayIndexOutOfBoundsException e) { - return PROPERTY_INITIAL_VALUE_; - } - } - if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - // lead surrogate D800..DBFF - return PROPERTY_TRIE_DATA_[ - (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2) - + (ch & 0x1f)]; - } - // for optimization - if (ch <= UTF16.CODEPOINT_MAX_VALUE) { - // supplementary code point 10000..10FFFF - // look at the construction of supplementary characters - // trail forms the ends of it. - return PROPERTY_.m_trie_.getSurrogateValue( - UTF16.getLeadSurrogate(ch), - (char)(ch & 0x3ff)); - } - // return m_dataOffset_ if there is an error, in this case we return - // the default value: m_initialValue_ - // we cannot assume that m_initialValue_ is at offset 0 - // this is for optimization. 
- return PROPERTY_INITIAL_VALUE_; - } + /** + * Character type Mn + * @stable ICU 2.1 + */ + public static final byte NON_SPACING_MARK = 6; + /** + * Character type Me + * @stable ICU 2.1 + */ + public static final byte ENCLOSING_MARK = 7; + /** + * Character type Mc + * @stable ICU 2.1 + */ + public static final byte COMBINING_SPACING_MARK = 8; + /** + * Character type count + * @stable ICU 2.1 + */ + public static final byte CHAR_CATEGORY_COUNT = 30; + /** + * Directional type R + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT = 1; + /** + * Directional type AL + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_ARABIC = 13; } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterIterator.java 2015-07-13 16:11:55.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterIterator.java 2015-07-13 16:11:55.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,13 +25,8 @@ /* ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2014, International Business Machines Corporation and * + * others. All Rights Reserved. 
* ******************************************************************************* */ @@ -84,7 +79,6 @@ return new ReplaceableUCharacterIterator(source); } - //// for StringPrep /** * Returns a
UCharacterIterator
object given a
* source StringBuffer.
@@ -97,7 +91,7 @@
return new ReplaceableUCharacterIterator(source);
}
- /**
+ /**
* Returns a UCharacterIterator
object given a
* CharacterIterator.
* @param source a valid CharacterIterator object.
@@ -112,21 +106,12 @@
// public methods ----------------------------------------------------------
/**
- * Returns the code unit at the current index. If index is out
- * of range, returns DONE. Index is not changed.
- * @return current code unit
- * @stable ICU 2.4
- */
- public abstract int current();
-
- /**
* Returns the length of the text
* @return length of the text
* @stable ICU 2.4
*/
public abstract int getLength();
-
/**
* Gets the current index in text.
* @return current index in text.
@@ -134,7 +119,6 @@
*/
public abstract int getIndex();
-
/**
* Returns the UTF16 code unit at index, and increments to the next
* code unit (post-increment semantics). If index is out of
@@ -183,6 +167,33 @@
*/
public abstract int previous();
+
+ /**
+ * Retreat to the start of the previous code point in the text,
+ * and return it (pre-decrement semantics). If the index is not
+ * preceded by a valid surrogate pair, the behavior is the same
+ * as previous()
. Otherwise the iterator is
+ * decremented to the start of the surrogate pair, and the code
+ * point represented by the pair is returned.
+ * @return the previous code point in the text, or DONE if the new
+ * index is before the start of the text.
+ * @stable ICU 2.4
+ */
+ public int previousCodePoint(){
+ int ch1 = previous();
+ if(UTF16.isTrailSurrogate((char)ch1)){
+ int ch2 = previous();
+ if(UTF16.isLeadSurrogate((char)ch2)){
+ return UCharacterProperty.getRawSupplementary((char)ch2,
+ (char)ch1);
+ }else if (ch2 != DONE) {
+ //unmatched trail surrogate so back out
+ next();
+ }
+ }
+ return ch1;
+ }
+
/**
* Sets the index to the specified index in the text.
* @param index the index within the text.
@@ -192,7 +203,14 @@
*/
public abstract void setIndex(int index);
- //// for StringPrep
+ /**
+ * Sets the current index to the start.
+ * @stable ICU 2.4
+ */
+ public void setToStart() {
+ setIndex(0);
+ }
+
/**
* Fills the buffer with the underlying text storage of the iterator
* If the buffer capacity is not enough a exception is thrown. The capacity
@@ -222,20 +240,19 @@
* units.
* @param offset the position within the array to start putting the data.
* @return the number of code units added to fillIn, as a convenience
- * @exception IndexOutOfBounds exception if there is not enough
- * room after offset in the array, or if offset {@literal <} 0.
+ * @exception IndexOutOfBoundsException if there is not enough
+ * room after offset in the array, or if offset {@literal <} 0.
* @stable ICU 2.4
*/
public abstract int getText(char[] fillIn, int offset);
- //// for StringPrep
/**
* Convenience override for getText(char[], int)
that provides
* an offset of 0.
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @return the number of code units added to fillIn, as a convenience
- * @exception IndexOutOfBounds exception if there is not enough
+ * @exception IndexOutOfBoundsException if there is not enough
* room in the array.
* @stable ICU 2.4
*/
@@ -243,7 +260,6 @@
return getText(fillIn, 0);
}
- //// for StringPrep
/**
* Convenience method for returning the underlying text storage as a string
* @return the underlying text storage in the iterator as a string
@@ -256,25 +272,32 @@
}
/**
- * Moves the current position by the number of code units
- * specified, either forward or backward depending on the sign
- * of delta (positive or negative respectively). If the resulting
- * index would be less than zero, the index is set to zero, and if
- * the resulting index would be greater than limit, the index is
- * set to limit.
- *
- * @param delta the number of code units to move the current
- * index.
- * @return the new index.
- * @exception IndexOutOfBoundsException is thrown if an invalid index is
+ * Moves the current position by the number of code points
+ * specified, either forward or backward depending on the sign of
+ * delta (positive or negative respectively). If the current index
+ * is at a trail surrogate then the first adjustment is by code
+ * unit, and the remaining adjustments are by code points. If the
+ * resulting index would be less than zero, the index is set to
+ * zero, and if the resulting index would be greater than limit,
+ * the index is set to limit.
+ * @param delta the number of code points to move the current index.
+ * @return the new index
+ * @exception IndexOutOfBoundsException is thrown if an invalid delta is
* supplied
* @stable ICU 2.4
*
*/
- public int moveIndex(int delta) {
- int x = Math.max(0, Math.min(getIndex() + delta, getLength()));
- setIndex(x);
- return x;
+ public int moveCodePointIndex(int delta){
+ if(delta>0){
+ while(delta>0 && nextCodePoint() != DONE){delta--;}
+ }else{
+ while(delta<0 && previousCodePoint() != DONE){delta++;}
+ }
+ if(delta!=0){
+ throw new IndexOutOfBoundsException();
+ }
+
+ return getIndex();
}
/**
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,23 +24,21 @@
*/
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
-import java.io.BufferedInputStream;
-import java.io.InputStream;
import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
import java.util.MissingResourceException;
+import sun.text.normalizer.UCharacter.HangulSyllableType;
+import sun.text.normalizer.UCharacter.NumericType;
+
/**
* Internal class used for Unicode character property database.
*This classes store binary data read from uprops.icu. @@ -56,134 +54,72 @@ * @since release 2.1, february 1st 2002 */ -public final class UCharacterProperty +final class UCharacterProperty { // public data members ----------------------------------------------- + /* + * public singleton instance + */ + public static final UCharacterProperty INSTANCE; + /** * Trie data */ - public CharTrie m_trie_; - /** - * Optimization - * CharTrie index array - */ - public char[] m_trieIndex_; - /** - * Optimization - * CharTrie data array - */ - public char[] m_trieData_; - /** - * Optimization - * CharTrie data offset - */ - public int m_trieInitialValue_; + public Trie2_16 m_trie_; + /** * Unicode version */ public VersionInfo m_unicodeVersion_; + /** + * Character type mask + */ + public static final int TYPE_MASK = 0x1F; + // uprops.h enum UPropertySource --------------------------------------- *** + /** From uchar.c/uprops.icu main trie */ + public static final int SRC_CHAR=1; /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC=2; - /** One more than the highest UPropertySource (SRC_) constant. */ - public static final int SRC_COUNT=9; + /** From ubidi_props.c/ubidi.icu */ + public static final int SRC_BIDI=5; + /** From normalizer2impl.cpp/nfc.nrm */ + public static final int SRC_NFC=8; + /** From normalizer2impl.cpp/nfkc.nrm */ + public static final int SRC_NFKC=9; // public methods ---------------------------------------------------- /** - * Java friends implementation - */ - public void setIndexData(CharTrie.FriendAgent friendagent) - { - m_trieIndex_ = friendagent.getPrivateIndex(); - m_trieData_ = friendagent.getPrivateData(); - m_trieInitialValue_ = friendagent.getPrivateInitialValue(); - } - - /** - * Gets the property value at the index. - * This is optimized. - * Note this is alittle different from CharTrie the index m_trieData_ - * is never negative. + * Gets the main property value for code point ch. 
* @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { - if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE - || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE - && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { - // BMP codepoint 0000..D7FF or DC00..FFFF - // optimized - try { // using try for ch < 0 is faster than using an if statement - return m_trieData_[ - (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } catch (ArrayIndexOutOfBoundsException e) { - return m_trieInitialValue_; - } - } - if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - // lead surrogate D800..DBFF - return m_trieData_[ - (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ - + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } - if (ch <= UTF16.CODEPOINT_MAX_VALUE) { - // supplementary code point 10000..10FFFF - // look at the construction of supplementary characters - // trail forms the ends of it. - return m_trie_.getSurrogateValue( - UTF16.getLeadSurrogate(ch), - (char)(ch & Trie.SURROGATE_MASK_)); - } - // ch is out of bounds - // return m_dataOffset_ if there is an error, in this case we return - // the default value: m_initialValue_ - // we cannot assume that m_initialValue_ is at offset 0 - // this is for optimization. - return m_trieInitialValue_; - - // this all is an inlined form of return m_trie_.getCodePointValue(ch); - } - - /** - * Getting the unsigned numeric value of a character embedded in the property - * argument - * @param prop the character - * @return unsigned numberic value - */ - public static int getUnsignedValue(int prop) - { - return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_; + return m_trie_.get(ch); } /** * Gets the unicode additional properties. - * C version getUnicodeProperties. + * Java version of C u_getUnicodeProperties(). 
* @param codepoint codepoint whose additional properties is to be * retrieved - * @param column + * @param column The column index. * @return unicode properties */ - public int getAdditional(int codepoint, int column) { - if (column == -1) { - return getProperty(codepoint); - } - if (column < 0 || column >= m_additionalColumnsCount_) { - return 0; - } - return m_additionalVectors_[ - m_additionalTrie_.getCodePointValue(codepoint) + column]; - } + public int getAdditional(int codepoint, int column) { + assert column >= 0; + if (column >= m_additionalColumnsCount_) { + return 0; + } + return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; + } - /** + /** *
Get the "age" of the code point.
*The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
@@ -203,6 +139,91 @@
version & LAST_NIBBLE_MASK_, 0, 0);
}
+ // int-value and enumerated properties --------------------------------- ***
+
+ public int getType(int c) { // returns the bits of getProperty(c) that are selected by TYPE_MASK
+ return getProperty(c)&TYPE_MASK; // everything outside TYPE_MASK is cleared
+ }
+
+ /*
+ * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
+ * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
+ */
+ private static final int /* UHangulSyllableType */ gcbToHst[]={ // one HangulSyllableType per GCB value, in the order named by the U_GCB_* comments
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
+ HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
+ HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
+ HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
+ HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
+ HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
+ HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
+ /*
+ * Omit GCB values beyond what we need for hst.
+ * The code below checks for the array length. NOTE(review): no such length check is visible in this hunk - confirm it exists.
+ */
+ };
+
+ private class IntProperty { // describes where one int-valued property is stored: a column plus mask and shift
+ int column; // SRC_PROPSVEC column, or "source" if mask==0
+ int mask; // bits of the column value that hold the property; 0 when only a source is given
+ int shift; // unsigned right shift applied after masking
+
+ IntProperty(int column, int mask, int shift) { // fully specified form: column, mask and shift are stored as given
+ this.column=column;
+ this.mask=mask;
+ this.shift=shift;
+ }
+
+ IntProperty(int source) { // source-only form: mask is 0 and shift keeps its default value of 0
+ this.column=source;
+ this.mask=0;
+ }
+
+ int getValue(int c) { // note: when mask==0 the expression below always evaluates to 0
+ // systematic, directly stored properties
+ return (getAdditional(c, column)&mask)>>>shift;
+ }
+ }
+
+ private class BiDiIntProperty extends IntProperty { // IntProperty whose source is always SRC_BIDI
+ BiDiIntProperty() {
+ super(SRC_BIDI); // uses the source-only constructor, so mask is 0
+ }
+ }
+
+ private class CombiningClassIntProperty extends IntProperty { // source-only IntProperty; adds no fields or overrides of its own here
+ CombiningClassIntProperty(int source) {
+ super(source); // uses the source-only constructor, so mask is 0
+ }
+ }
+
+ private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties
+ int which; // stored from the constructor; presumably the property selector - TODO confirm, it is not read in this class
+ int max; // stored from the constructor; presumably the largest property value - TODO confirm, it is not read in this class
+
+ NormQuickCheckIntProperty(int source, int which, int max) { // source-only IntProperty plus the two extra fields
+ super(source);
+ this.which=which;
+ this.max=max;
+ }
+ }
+
+ private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
+ int getValue(int c) { // replaces the mask/shift lookup of IntProperty with a direct UBiDiProps query
+ return UBiDiProps.INSTANCE.getPairedBracketType(c);
+ }
+ };
+
+ public int getIntPropertyValue(int c, int which) { // only BIDI_PAIRED_BRACKET_TYPE is handled by this method
+ if (which == BIDI_PAIRED_BRACKET_TYPE) {
+ return intProp.getValue(c); // answered by the anonymous BiDiIntProperty held in intProp
+ }
+ return 0; // undefined: every other value of which yields 0
+ }
+
/**
* Forms a supplementary code point from the argument character The UnicodeSet class is not designed to be subclassed.
*
@@ -118,7 +120,7 @@
*
*
* Any character may be preceded by a backslash in order to remove any special
- * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
+ * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are
* ignored, unless they are escaped.
*
* Property patterns specify a set of characters having a certain
@@ -267,18 +269,24 @@
*
*
*
- * To iterate over contents of UnicodeSet, use UnicodeSetIterator class.
+ * To iterate over contents of UnicodeSet, the following are available:
+ * To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
*
* @author Alan Liu
* @stable ICU 2.0
- * @see UnicodeSetIterator
*/
-@SuppressWarnings("deprecation")
-public class UnicodeSet implements UnicodeMatcher {
+class UnicodeSet {
private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
- // 110000 for codepoints
+ // 110000 for codepoints
/**
* Minimum value that can be stored in a UnicodeSet.
@@ -299,7 +307,7 @@
// NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
// is not private so that UnicodeSetIterator can get access
- TreeSet To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param spanCondition The span condition
+ * @return the length of the span
+ * @stable ICU 4.4
+ */
+ public int span(CharSequence s, SpanCondition spanCondition) {
+ return span(s, 0, spanCondition); // delegates to the three-argument overload with start index 0
+ }
+
+ /**
+ * Span a string using this UnicodeSet.
+ * If the start index is less than 0, span will start from 0.
+ * If the start index is greater than the string length, span returns the string length.
+ * To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param start The start index at which the span begins
+ * @param spanCondition The span condition
+ * @return the string index which ends the span (i.e. exclusive)
+ * @stable ICU 4.4
+ */
+ public int span(CharSequence s, int start, SpanCondition spanCondition) {
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
+ }
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for span().
+ return bmpSet.span(s, start, spanCondition, null);
+ }
+ if (stringSpan != null) {
+ return stringSpan.span(s, start, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param outCount An output-only object (must not be null) for returning the count.
+ * @return the limit (exclusive end) of the span
+ */
+ public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) {
+ if (outCount == null) {
+ throw new IllegalArgumentException("outCount must not be null");
+ }
+ int end = s.length();
+ if (start < 0) {
+ start = 0;
+ } else if (start >= end) {
+ return end;
+ }
+ if (stringSpan != null) {
+ // We might also have bmpSet != null,
+ // but fully-contained strings are relevant for counting elements.
+ return stringSpan.spanAndCount(s, start, spanCondition, outCount);
+ } else if (bmpSet != null) {
+ return bmpSet.span(s, start, spanCondition, outCount);
+ } else if (!strings.isEmpty()) {
+ int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.FWD_UTF16_CONTAINED;
+ which |= UnicodeSetStringSpan.WITH_COUNT;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
+ * @param s The string to be spanned
+ * @param fromIndex The index of the char (exclusive) from which the string should be spanned backwards
+ * @param spanCondition The span condition
+ * @return The string index which starts the span (i.e. inclusive).
+ * @stable ICU 4.4
+ */
+ public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) {
+ if (fromIndex <= 0) {
+ return 0;
+ }
+ if (fromIndex > s.length()) {
+ fromIndex = s.length();
+ }
+ if (bmpSet != null) {
+ // Frozen set without strings, or no string is relevant for spanBack().
+ return bmpSet.spanBack(s, fromIndex, spanCondition);
+ }
+ if (stringSpan != null) {
+ return stringSpan.spanBack(s, fromIndex, spanCondition);
+ } else if (!strings.isEmpty()) {
+ int which = (spanCondition == SpanCondition.NOT_CONTAINED)
+ ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED
+ : UnicodeSetStringSpan.BACK_UTF16_CONTAINED;
+ UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList
+ * The functionality is straightforward for sets with only single code points, without strings (which is the common
+ * case):
+ *
+ * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point
+ * boundaries, never in the middle of a surrogate pair.
+ *
+ * @stable ICU 4.4
*/
- public static final int IGNORE_SPACE = 1;
+ public enum SpanCondition {
+ /**
+ * Continues a span() while there is no set element at the current position.
+ * Increments by one code point at a time.
+ * Stops before the first set element (character or string).
+ * (For code points only, this is like while contains(current)==false).
+ *
+ * When span() returns, the substring between where it started and the position it returned consists only of
+ * characters that are not in the set, and none of its strings overlap with the span.
+ *
+ * @stable ICU 4.4
+ */
+ NOT_CONTAINED,
-}
+ /**
+ * Spans the longest substring that is a concatenation of set elements (characters or strings).
+ * (For characters only, this is like while contains(current)==true).
+ *
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ *
+ * If a set contains strings, then the span will be the longest substring for which there
+ * exists at least one non-overlapping concatenation of set elements (characters or strings).
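+ * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
+ * (Java/ICU/Perl regex stops at the first match of an OR.)
+ *
+ * @stable ICU 4.4
+ */
+ CONTAINED,
+
+ /**
+ * Continues a span() while there is a set element at the current position.
+ * Increments by the longest matching element at each position.
+ * (For characters only, this is like while contains(current)==true).
+ * <p>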
+ * When span() returns, the substring between where it started and the position it returned consists only of set
+ * elements (characters or strings) that are in the set.
+ *
+ * If a set only contains single characters, then this is the same as CONTAINED.
+ *
+ * If a set contains strings, then the span will be the longest substring with a match at each position with the
+ * longest single set element (character or string).
+ *
+ * Use this span condition together with other longest-match algorithms, such as ICU converters
+ * (ucnv_getUnicodeSet()).
+ *
+ * @stable ICU 4.4
+ */
+ SIMPLE,
+ }
+
+}
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/Utility.java 2015-07-13 16:11:58.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Utility.java 2015-07-13 16:11:58.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -24,47 +24,26 @@
*/
/*
*******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
+ * Copyright (C) 1996-2011, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
-public final class Utility {
+import java.io.IOException;
+import java.util.Locale;
- /**
- * Convenience utility to compare two Object[]s
- * Ought to be in System.
- * @param len the length to compare.
- * The start indices and start+len must be valid.
- */
- public final static boolean arrayRegionMatches(char[] source, int sourceStart,
- char[] target, int targetStart,
- int len)
- {
- int sourceEnd = sourceStart + len;
- int delta = targetStart - sourceStart;
- for (int i = sourceStart; i < sourceEnd; i++) {
- if (source[i]!=target[i + delta])
- return false;
- }
- return true;
- }
+final class Utility {
/**
* Convert characters outside the range U+0020 to U+007F to
* Unicode escapes, and convert backslash to a double backslash.
*/
public static final String escape(String s) {
- StringBuffer buf = new StringBuffer();
+ StringBuilder buf = new StringBuilder();
for (int i=0; i Creates a new Trie with the settings for the trie data. Unserialize the 32-bit-aligned input stream and use the data for the
- * trie. Parses the input stream and stores its trie content into a index and
- * data array Protected constructor. The result for each iteration is the consecutive range of
- * {@code
- * Hence value(start) = value(start + 1) = .... = value(start + n) = .... =
- * value(limit - 1). However value(start -1) != value(start) and
- * value(limit) != value(start).
- *
- * Most implementations will be created by factory methods, such as the
- * character type iterator in UCharacter.getTypeIterator. See example below.
- *
- * Example of use: Gets the next maximal result range with a common value and returns
- * true if we are not at the end of the iteration, false otherwise. If the return boolean is a false, the contents of elements will not
- * be updated. A symbol table maintains two kinds of mappings. The first is
- * between symbolic names and their values. For example, if the
- * variable with the name "start" is set to the value "alpha"
- * (perhaps, though not necessarily, through an expression such as
- * "$start=alpha"), then the call lookup("start") will return the
- * char[] array ['a', 'l', 'p', 'h', 'a'].
- *
- * The second kind of mapping is between character values and
- * UnicodeMatcher objects. This is used by RuleBasedTransliterator,
- * which uses characters in the private use area to represent objects
- * such as UnicodeSets. If U+E015 is mapped to the UnicodeSet [a-z],
- * then lookupMatcher(0xE015) will return the UnicodeSet [a-z].
- *
- * Finally, a symbol table defines parsing behavior for symbolic
- * names. All symbolic names start with the SYMBOL_REF character.
- * When a parser encounters this character, it calls parseReference()
- * with the position immediately following the SYMBOL_REF. The symbol
- * table parses the name, if there is one, and returns it.
- *
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
-@Deprecated
-public interface SymbolTable {
-
- /**
- * The character preceding a symbol reference name.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- static final char SYMBOL_REF = '$';
-
- /**
- * Lookup the characters associated with this string and return it.
- * Return {@code null} if no such name exists. The resultant
- * array may have length zero.
- * @param s the symbolic name to lookup
- * @return a char array containing the name's value, or null if
- * there is no mapping for s.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- char[] lookup(String s);
-
- /**
- * Lookup the UnicodeMatcher associated with the given character, and
- * return it. Return {@code null} if not found.
- * @param ch a 32-bit code point from 0 to 0x10FFFF inclusive.
- * @return the UnicodeMatcher object represented by the given
- * character, or null if there is no mapping for ch.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- UnicodeMatcher lookupMatcher(int ch);
-
- /**
- * Parse a symbol reference name from the given string, starting
- * at the given position. If no valid symbol reference name is
- * found, return null and leave pos unchanged. That is, if the
- * character at pos cannot start a name, or if pos is at or after
- * text.length(), then return null. This indicates an isolated
- * SYMBOL_REF character.
- * @param text the text to parse for the name
- * @param pos on entry, the index of the first character to parse.
- * This is the character following the SYMBOL_REF character. On
- * exit, the index after the last parsed character. If the parse
- * failed, pos is unchanged on exit.
- * @param limit the index after the last character to be parsed.
- * @return the parsed name, or null if there is no valid symbolic
- * name at the given position.
- * @draft ICU 2.8
- * @deprecated This is a draft API and might change in a future release of ICU.
- */
- @Deprecated
- String parseReference(String text, ParsePosition pos, int limit);
-}
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/TrieIterator.java 2015-07-13 16:12:10.000000000 +0900
+++ /dev/null 2015-07-13 16:12:10.000000000 +0900
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-/**
- * Class enabling iteration of the values in a Trie.
- * Result of each iteration contains the interval of codepoints that have
- * the same value type and the value type itself.
- * The comparison of each codepoint value is done via extract(), which the
- * default implementation is to return the value as it is.
- * Method extract() can be overwritten to perform manipulations on
- * codepoint values in order to perform specialized comparison.
- * TrieIterator is designed to be a generic iterator for the CharTrie
- * and the IntTrie, hence to accommodate both types of data, the return
- * result will be in terms of int (32 bit) values.
- * See com.ibm.icu.text.UCharacterTypeIterator for examples of use.
- * Notes for porting utrie_enum from icu4c to icu4j: There are basically 3 usage scenarios for porting:
- * 1) UTrieEnumValue is the only implemented callback then just implement a
- * subclass of TrieIterator and override the extract(int) method. The
- * extract(int) method is analogus to UTrieEnumValue callback.
- *
- * 2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
- * a subclass of TrieIterator, override the extract method and iterate, e.g. 3) UTrieEnumRange is the only implemented callback then just implement
- * the while loop, when utrie_enum is called
- * Returns true if we are not at the end of the iteration, false
- * otherwise. The next set of codepoints with the same value type will be
- * calculated during this call and returned in the arguement element. Internal reader class for ICU data file uprops.icu containing
-* Unicode codepoint data. This class simply reads uprops.icu, authenticates that it is a valid
-* ICU data file and split its contents up into blocks of data for use in
-* com.ibm.icu.impl.UCharacterProperty.
-* uprops.icu which is in big-endian format is jared together with this
-* package. Protected constructor. Reads uprops.icu, parse it into blocks of data to be stored in
- * UCharacterProperty. To iterate over code points, use a loop like this:
- * To iterate over code point ranges, use a loop like this:
- * The order of iteration is all code points ranges in sorted
- * order, followed by all strings sorted order. Ranges are
- * disjoint and non-contiguous. {@code string} is undefined
- * unless {@code codepoint == IS_STRING}. Do not mix calls to
- * {@code next()} and {@code nextRange()} without calling
- * {@code reset()} between them. The results of doing so are
- * undefined.
- *
- * @return true if there was another element in the set and this
- * object contains the element.
- * @stable ICU 2.0
- */
- public boolean nextRange() {
- if (nextElement <= endElement) {
- codepointEnd = endElement;
- codepoint = nextElement;
- nextElement = endElement+1;
- return true;
- }
- if (range < endRange) {
- loadRange(++range);
- codepointEnd = endElement;
- codepoint = nextElement;
- nextElement = endElement+1;
- return true;
- }
-
- // stringIterator == null iff there are no string elements remaining
-
- if (stringIterator == null) return false;
- codepoint = IS_STRING; // signal that value is actually a string
- string = stringIterator.next();
- if (!stringIterator.hasNext()) stringIterator = null;
- return true;
- }
-
- /**
- * Sets this iterator to visit the elements of the given set and
- * resets it to the start of that set. The iterator is valid only
- * so long as {@code set} is valid.
- * @param uset the set to iterate over.
- * @stable ICU 2.0
- */
- public void reset(UnicodeSet uset) {
- set = uset;
- reset();
- }
-
- /**
- * Resets this iterator to the start of the set.
- * @stable ICU 2.0
- */
- public void reset() {
- endRange = set.getRangeCount() - 1;
- range = 0;
- endElement = -1;
- nextElement = 0;
- if (endRange >= 0) {
- loadRange(range);
- }
- stringIterator = null;
- if (set.strings != null) {
- stringIterator = set.strings.iterator();
- if (!stringIterator.hasNext()) stringIterator = null;
- }
- }
-
- // ======================= PRIVATES ===========================
-
- private UnicodeSet set;
- private int endRange = 0;
- private int range = 0;
- /**
- * @internal
- */
- protected int endElement;
- /**
- * @internal
- */
- protected int nextElement;
- private Iterator
+ * The primary functions are to produce a normalized string and to detect whether
+ * a string is already normalized.
+ * The most commonly used normalization forms are those defined in
+ * http://www.unicode.org/unicode/reports/tr15/
+ * However, this API supports additional normalization forms for specialized purposes.
+ * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
+ * and can be used in implementations of UTS #46.
+ *
+ * Not only are the standard compose and decompose modes supplied,
+ * but additional modes are provided as documented in the Mode enum.
+ *
+ * Some of the functions in this class identify normalization boundaries.
+ * At a normalization boundary, the portions of the string
+ * before it and starting from it do not interact and can be handled independently.
+ *
+ * The spanQuickCheckYes() stops at a normalization boundary.
+ * When the goal is a normalized string, then the text before the boundary
+ * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
+ *
+ * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
+ * a character is guaranteed to be at a normalization boundary,
+ * regardless of context.
+ * This is used for moving from one normalization boundary to the next
+ * or preceding boundary, and for performing iterative normalization.
+ *
+ * Iterative normalization is useful when only a small portion of a
+ * longer string needs to be processed.
+ * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
+ * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
+ * (to process only the substring for which sort key bytes are computed).
+ *
+ * The set of normalization boundaries returned by these functions may not be
+ * complete: There may be more boundaries that could be returned.
+ * Different functions may return different boundaries.
+ * @stable ICU 4.4
+ * @author Markus W. Scherer
+ */
+abstract class Normalizer2 {
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFC normalization.
+ * Same as getInstance(null, "nfc", Mode.COMPOSE).
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFCInstance() {
+ return Norm2AllModes.getNFCInstance().comp; // the comp member of the NFC Norm2AllModes instance
+ }
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFD normalization.
+ * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFDInstance() {
+ return Norm2AllModes.getNFCInstance().decomp; // NFD is served by the decomp member of the NFC Norm2AllModes instance
+ }
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFKC normalization.
+ * Same as getInstance(null, "nfkc", Mode.COMPOSE).
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFKCInstance() {
+ return Norm2AllModes.getNFKCInstance().comp; // the comp member of the NFKC Norm2AllModes instance
+ }
+
+ /**
+ * Returns a Normalizer2 instance for Unicode NFKD normalization.
+ * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
+ * Returns an unmodifiable singleton instance.
+ * @return the requested Normalizer2, if successful
+ * @stable ICU 49
+ */
+ public static Normalizer2 getNFKDInstance() {
+ return Norm2AllModes.getNFKCInstance().decomp; // NFKD is served by the decomp member of the NFKC Norm2AllModes instance
+ }
+
+ /**
+ * Returns the normalized form of the source string.
+ * @param src source string
+ * @return normalized src
+ * @stable ICU 4.4
+ */
+ public String normalize(CharSequence src) {
+ if(src instanceof String) {
+ // Fastpath: Do not construct a new String if the src is a String
+ // and is already normalized.
+ int spanLength=spanQuickCheckYes(src); // end index of the leading part that passes the quick check
+ if(spanLength==src.length()) {
+ return (String)src; // the whole input passed the quick check: return the same String object
+ }
+ StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength); // copy the already-normalized prefix unchanged
+ return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString(); // normalize only the remainder and merge it onto the prefix
+ }
+ return normalize(src, new StringBuilder(src.length())).toString(); // non-String input: normalize into a fresh builder
+ }
+
+ /**
+ * Writes the normalized form of the source string to the destination string
+ * (replacing its contents) and returns the destination string.
+ * The source and destination strings must be different objects.
+ * @param src source string
+ * @param dest destination string; its contents are replaced with normalized src
+ * @return dest
+ * @stable ICU 4.4
+ */
+ public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
+
+ /**
+ * Writes the normalized form of the source string to the destination Appendable
+ * and returns the destination Appendable.
+ * The source and destination strings must be different objects.
+ *
+ * Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
+ *
+ * @param src source string
+ * @param dest destination Appendable; gets normalized src appended
+ * @return dest
+ * @stable ICU 4.6
+ */
+ public abstract Appendable normalize(CharSequence src, Appendable dest);
+
+ /**
+ * Appends the normalized form of the second string to the first string
+ * (merging them at the boundary) and returns the first string.
+ * The result is normalized if the first string was normalized.
+ * The first and second strings must be different objects.
+ * @param first string, should be normalized
+ * @param second string, will be normalized
+ * @return first
+ * @stable ICU 4.4
+ */
+ public abstract StringBuilder normalizeSecondAndAppend(
+ StringBuilder first, CharSequence second);
+
+ /**
+ * Appends the second string to the first string
+ * (merging them at the boundary) and returns the first string.
+ * The result is normalized if both the strings were normalized.
+ * The first and second strings must be different objects.
+ * @param first string, should be normalized
+ * @param second string, should be normalized
+ * @return first
+ * @stable ICU 4.4
+ */
+ public abstract StringBuilder append(StringBuilder first, CharSequence second);
+
+ /**
+ * Gets the decomposition mapping of c.
+ * Roughly equivalent to normalizing the String form of c
+ * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
+ * returns null if c does not have a decomposition mapping in this instance's data.
+ * This function is independent of the mode of the Normalizer2.
+ * @param c code point
+ * @return c's decomposition mapping, if any; otherwise null
+ * @stable ICU 4.6
+ */
+ public abstract String getDecomposition(int c);
+
+ /**
+ * Gets the combining class of c.
+ * The default implementation returns 0
+ * but all standard implementations return the Unicode Canonical_Combining_Class value.
+ * @param c code point
+ * @return c's combining class
+ * @stable ICU 49
+ */
+ public int getCombiningClass(int c) { return 0; } // base-class default: ignores c and returns 0
+
+ /**
+ * Tests if the string is normalized.
+ * Internally, in cases where the quickCheck() method would return "maybe"
+ * (which is only possible for the two COMPOSE modes) this method
+ * resolves to "yes" or "no" to provide a definitive result,
+ * at the cost of doing more work in those cases.
+ * @param s input string
+ * @return true if s is normalized
+ * @stable ICU 4.4
+ */
+ public abstract boolean isNormalized(CharSequence s);
+
+ /**
+ * Returns the end of the normalized substring of the input string.
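+ * In other words, with <code>end=spanQuickCheckYes(s);</code>
+ * the substring <code>s.subSequence(0, end)</code>
+ * will pass the quick check with a "yes" result.
+ * <p>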
+ * The returned end index is usually one or more characters before the
+ * "no" or "maybe" character: The end index is at a normalization boundary.
+ * (See the class documentation for more about normalization boundaries.)
+ *
+ * When the goal is a normalized string and most input strings are expected
+ * to be normalized already, then call this method,
+ * and if it returns a prefix shorter than the input string,
+ * copy that prefix and use normalizeSecondAndAppend() for the remainder.
+ * @param s input string
+ * @return "yes" span end index
+ * @stable ICU 4.4
+ */
+ public abstract int spanQuickCheckYes(CharSequence s);
+
+ /**
+ * Tests if the character always has a normalization boundary before it,
+ * regardless of context.
+ * If true, then the character does not normalization-interact with
+ * preceding characters.
+ * In other words, a string containing this character can be normalized
+ * by processing portions before this character and starting from this
+ * character independently.
+ * This is used for iterative normalization. See the class documentation for details.
+ * @param c character to test
+ * @return true if c has a normalization boundary before it
+ * @stable ICU 4.4
+ */
+ public abstract boolean hasBoundaryBefore(int c);
+
+ /**
+ * Sole constructor. (For invocation by subclass constructors,
+ * typically implicit.)
+ * @internal
+ * deprecated This API is ICU internal only.
+ */
+ protected Normalizer2() { // empty body: nothing is initialised here
+ }
+}
--- /dev/null 2015-07-13 16:12:15.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/OutputInt.java 2015-07-13 16:12:15.000000000 +0900
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ */
+package sun.text.normalizer;
+
+/**
+ * Simple struct-like class for int output parameters.
+ * Like {@code Output<Integer>} but without auto-boxing.
+ *
+ * For proper counting, we cannot ignore strings that are fully contained in code point spans.
+ *
+ * If the set does not have any fully-contained strings, then we could optimize this
+ * like span(), but such sets are likely rare, and this is at least still linear.
+ *
+ * @param s The string to be spanned
+ * @param start The start index that the span begins
+ * @param spanCondition The span condition
+ * @param outCount The count
+ * @return the limit (exclusive end) of the span
+ */
+ public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
+ OutputInt outCount) {
+ if (spanCondition == SpanCondition.NOT_CONTAINED) {
+ return spanNot(s, start, outCount); // NOT_CONTAINED is delegated entirely to spanNot()
+ }
+ // Consider strings; they may overlap with the span,
+ // and they may result in a smaller count than with just code points.
+ if (spanCondition == SpanCondition.CONTAINED) {
+ return spanContainedAndCount(s, start, outCount);
+ }
+ // SIMPLE: take the longest match at each position (not synchronized, does not use offsets)
+ int stringsLength = strings.size();
+ int length = s.length();
+ int pos = start;
+ int rest = length - start; // number of chars from pos to the end of s
+ int count = 0; // number of set elements (code points or strings) matched so far
+ while (rest != 0) {
+ // Try to match the next code point.
+ int cpLength = spanOne(spanSet, s, pos, rest);
+ int maxInc = (cpLength > 0) ? cpLength : 0; // spanOne() returns a negative length when the code point is not in spanSet
+ // Try to match all of the strings.
+ for (int i = 0; i < stringsLength; ++i) {
+ String string = strings.get(i);
+ int length16 = string.length();
+ if (maxInc < length16 && length16 <= rest && // only a string longer than the best match so far is compared
+ matches16CPB(s, pos, length, string, length16)) {
+ maxInc = length16;
+ }
+ }
+ // We are done if there is no match beyond pos.
+ if (maxInc == 0) {
+ outCount.value = count; // outCount is dereferenced unconditionally on this path, so it must not be null
+ return pos;
+ }
+ // Continue from the longest match.
+ ++count;
+ pos += maxInc;
+ rest -= maxInc;
+ }
+ outCount.value = count;
+ return pos;
+ }
+
+ private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
+ // CONTAINED: use the offset list to try all possibilities, keeping the smallest count per offset.
+ offsets.setMaxLength(maxLength16); // also clears any offsets left in the list
+ int stringsLength = strings.size();
+ int length = s.length();
+ int pos = start;
+ int rest = length - start; // number of chars from pos to the end of s
+ int count = 0; // element count of the path that reached pos
+ while (rest != 0) {
+ // Try to match the next code point.
+ int cpLength = spanOne(spanSet, s, pos, rest);
+ if (cpLength > 0) {
+ offsets.addOffsetAndCount(cpLength, count + 1); // one more element reaches pos+cpLength
+ }
+ // Try to match all of the strings.
+ for (int i = 0; i < stringsLength; ++i) {
+ String string = strings.get(i);
+ int length16 = string.length();
+ // Note: If the strings were sorted by length, then we could also
+ // avoid trying to match if there is already a match of the same length.
+ if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && // skip if this offset already has a count <= count+1
+ matches16CPB(s, pos, length, string, length16)) {
+ offsets.addOffsetAndCount(length16, count + 1);
+ }
+ }
+ // We are done if there is no match beyond pos.
+ if (offsets.isEmpty()) {
+ outCount.value = count;
+ return pos;
+ }
+ // Continue from the nearest match.
+ int minOffset = offsets.popMinimum(outCount); // outCount.value receives the count stored for the popped offset
+ count = outCount.value;
+ pos += minOffset;
+ rest -= minOffset;
+ }
+ outCount.value = count;
+ return pos;
+ }
+
+ /**
+ * Span a string backwards, starting at index length and moving toward index 0.
+ * @param s The string to be spanned
+ * @param length The index in s at which the backward span starts (exclusive end of the span)
+ * @param spanCondition The span condition
+ * @return The string index which starts the span (i.e. inclusive).
+ */
+ public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
+ if (spanCondition == SpanCondition.NOT_CONTAINED) {
+ return spanNotBack(s, length);
+ }
+ int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); // first, a plain code point span backward from length
+ if (pos == 0) {
+ return 0;
+ }
+ int spanLength = length - pos; // length of the code point span that ends at length
+
+ // Consider strings; they may overlap with the span.
+ int initSize = 0;
+ if (spanCondition == SpanCondition.CONTAINED) {
+ // Use offset list to try all possibilities.
+ initSize = maxLength16;
+ }
+ offsets.setMaxLength(initSize); // also clears the list; for SIMPLE (initSize 0) it is never enlarged here
+ int i, stringsLength = strings.size();
+ int spanBackLengthsOffset = 0;
+ if (all) { // NOTE(review): presumably the spanBack entries follow the first stringsLength entries of spanLengths when all is set - confirm
+ spanBackLengthsOffset = stringsLength;
+ }
+ for (;;) {
+ if (spanCondition == SpanCondition.CONTAINED) {
+ for (i = 0; i < stringsLength; ++i) {
+ int overlap = spanLengths[spanBackLengthsOffset + i];
+ if (overlap == ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ String string = strings.get(i);
+
+ int length16 = string.length();
+
+ // Try to match this string at pos-(length16-overlap)..pos-length16.
+ if (overlap >= LONG_SPAN) {
+ overlap = length16;
+ // While contained: No point matching fully inside the code point span.
+ int len1 = 0; // NOTE(review): this initial value is overwritten on the next line
+ len1 = string.offsetByCodePoints(0, 1);
+ overlap -= len1; // Length of the string minus the first code point.
+ }
+ if (overlap > spanLength) {
+ overlap = spanLength;
+ }
+ int dec = length16 - overlap; // Keep dec+overlap==length16.
+ for (;;) {
+ if (dec > pos) {
+ break;
+ }
+ // Try to match if the decrement is not listed already.
+ if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
+ if (dec == pos) {
+ return 0; // Reached the start of the string.
+ }
+ offsets.addOffset(dec);
+ }
+ if (overlap == 0) {
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ }
+ } else /* SIMPLE */{
+ int maxDec = 0, maxOverlap = 0;
+ for (i = 0; i < stringsLength; ++i) {
+ int overlap = spanLengths[spanBackLengthsOffset + i];
+ // For longest match, we do need to try to match even an all-contained string
+ // to find the match from the latest end.
+
+ String string = strings.get(i);
+
+ int length16 = string.length();
+
+ // Try to match this string at pos-(length16-overlap)..pos-length16.
+ if (overlap >= LONG_SPAN) {
+ overlap = length16;
+ // Longest match: Need to match fully inside the code point span
+ // to find the match from the latest end.
+ }
+ if (overlap > spanLength) {
+ overlap = spanLength;
+ }
+ int dec = length16 - overlap; // Keep dec+overlap==length16.
+ for (;;) {
+ if (dec > pos || overlap < maxOverlap) {
+ break;
+ }
+ // Try to match if the string is longer or ends later.
+ if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
+ && matches16CPB(s, pos - dec, length, string, length16)) {
+ maxDec = dec; // Longest match from latest end.
+ maxOverlap = overlap;
+ break;
+ }
+ --overlap;
+ ++dec;
+ }
+ }
+
+ if (maxDec != 0 || maxOverlap != 0) {
+ // Longest-match algorithm, and there was a string match.
+ // Simply continue before it.
+ pos -= maxDec;
+ if (pos == 0) {
+ return 0; // Reached the start of the string.
+ }
+ spanLength = 0; // Match strings from before a string match.
+ continue;
+ }
+ }
+ // Finished trying to match all strings at pos.
+
+ if (spanLength != 0 || pos == length) {
+ // The position is before an unlimited code point span (spanLength!=0),
+ // not before a string match.
+ // The only position where spanLength==0 before a span is pos==length.
+ // Otherwise, an unlimited code point span is only tried again when no
+ // strings match, and if such a non-initial span fails we stop.
+ if (offsets.isEmpty()) {
+ return pos; // No strings matched before a span.
+ }
+ // Match strings from before the next string match.
+ } else {
+ // The position is before a string match (or a single code point).
+ if (offsets.isEmpty()) {
+ // No more strings matched before a previous string match.
+ // Try another code point span from before the last string match.
+ int oldPos = pos;
+ pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
+ spanLength = oldPos - pos;
+ if (pos == 0 || // Reached the start of the string, or
+ spanLength == 0 // neither strings nor span progressed.
+ ) {
+ return pos;
+ }
+ continue; // spanLength>0: Match strings from before a span.
+ } else {
+ // Try to match only one code point from before a string match if some
+ // string matched beyond it, so that we try all possible positions
+ // and don't overshoot.
+ spanLength = spanOneBack(spanSet, s, pos);
+ if (spanLength > 0) {
+ if (spanLength == pos) {
+ return 0; // Reached the start of the string.
+ }
+ // Match strings before this code point.
+ // There cannot be any decrements below it because UnicodeSet strings
+ // contain multiple code points.
+ pos -= spanLength;
+ offsets.shift(spanLength);
+ spanLength = 0;
+ continue; // Match strings from before a single code point.
+ }
+ // Match strings from before the next string match.
+ }
+ }
+ pos -= offsets.popMinimum(null); // null: no count is requested when spanning backward
+ spanLength = 0; // Match strings from before a string match.
+ }
+ }
+
+ /**
+ * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
+ *
+ * Theoretical algorithm:
+ * - Iterate through the string, and at each code point boundary:
+ * + If the code point there is in the set, then return with the current position.
+ * + If a set string matches at the current position, then return with the current position.
+ *
+ * Optimized implementation:
+ *
+ * (Same assumption as for span() above.)
+ *
+ * Create and cache a spanNotSet which contains
+ * all of the single code points of the original set but none of its strings.
+ * For each set string add its initial code point to the spanNotSet.
+ * (Also add its final code point for spanNotBack().)
+ *
+ * - Loop:
+ * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
+ * + If the current code point is in the original set, then return the current position.
+ * + If any set string matches at the current position, then return the current position.
+ * + If there is no match at the current position, neither for the code point
+ * there nor for any set string, then skip this code point and continue the loop.
+ * This happens for set-string-initial code points that were added to spanNotSet
+ * when there is not actually a match for such a set string.
+ *
+ * @param s The string to be spanned
+ * @param start The start index that the span begins
+ * @param outCount If not null: Receives the number of code points across the span.
+ * @return the limit (exclusive end) of the span
+ */
+ private int spanNot(CharSequence s, int start, OutputInt outCount) {
+ int length = s.length();
+ int pos = start, rest = length - start;
+ int stringsLength = strings.size();
+ int count = 0; // running total reported through outCount (only maintained meaningfully when outCount != null)
+ do {
+ // Span until we find a code point from the set,
+ // or a code point that starts or ends some string.
+ int spanLimit;
+ if (outCount == null) {
+ spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
+ } else {
+ spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
+ outCount.value = count = count + outCount.value; // add the count reported for this segment to the running total
+ }
+ if (spanLimit == length) {
+ return length; // Reached the end of the string.
+ }
+ pos = spanLimit;
+ rest = length - spanLimit;
+
+ // Check whether the current code point is in the original set,
+ // without the string starts and ends.
+ int cpLength = spanOne(spanSet, s, pos, rest);
+ if (cpLength > 0) {
+ return pos; // There is a set element at pos.
+ }
+
+ // Try to match the strings at pos.
+ for (int i = 0; i < stringsLength; ++i) {
+ if (spanLengths[i] == ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ String string = strings.get(i);
+
+ int length16 = string.length();
+ if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
+ return pos; // There is a set element at pos.
+ }
+ }
+
+ // The span(while not contained) ended on a string start/end which is
+ // not in the original set. Skip this code point and continue.
+ // cpLength<0
+ pos -= cpLength; // subtracting the negative length moves pos forward past the code point
+ rest += cpLength; // and shortens rest by the same amount
+ ++count; // the skipped code point is part of the span
+ } while (rest != 0);
+ if (outCount != null) {
+ outCount.value = count;
+ }
+ return length; // Reached the end of the string.
+ }
+
+ private int spanNotBack(CharSequence s, int length) {
+ int pos = length; // current exclusive end; moves toward 0
+ int i, stringsLength = strings.size();
+ do {
+ // Span until we find a code point from the set,
+ // or a code point that starts or ends some string.
+ pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
+ if (pos == 0) {
+ return 0; // Reached the start of the string.
+ }
+
+ // Check whether the current code point is in the original set,
+ // without the string starts and ends.
+ int cpLength = spanOneBack(spanSet, s, pos);
+ if (cpLength > 0) {
+ return pos; // There is a set element at pos.
+ }
+
+ // Try to match the strings at pos.
+ for (i = 0; i < stringsLength; ++i) {
+ // Use the forward-span entries spanLengths[i] rather than the spanBack
+ // entries because it is easier and we only need to know whether the string
+ // is irrelevant, which is the same in either array.
+ if (spanLengths[i] == ALL_CP_CONTAINED) {
+ continue; // Irrelevant string.
+ }
+ String string = strings.get(i);
+
+ int length16 = string.length();
+ if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
+ return pos; // There is a set element at pos.
+ }
+ }
+
+ // The span(while not contained) ended on a string start/end which is
+ // not in the original set. Skip this code point and continue.
+ // cpLength<0
+ pos += cpLength; // adding the negative length moves pos backward past the code point
+ } while (pos != 0);
+ return 0; // Reached the start of the string.
+ }
+
+ static short makeSpanLengthByte(int spanLength) {
+ // Clamp: any length of LONG_SPAN (0xfe) or more is represented as LONG_SPAN itself.
+ return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
+ }
+
+ // Compares s[start, start+length) with t[0, length), last char first, without any argument checks. Requires length>0.
+ private static boolean matches16(CharSequence s, int start, final String t, int length) {
+ int end = start + length; // exclusive end of the compared region in s
+ while (length-- > 0) {
+ if (s.charAt(--end) != t.charAt(length)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Compare 16-bit Unicode strings (which may be malformed UTF-16) at code point boundaries.
+ * That is, each edge of a match must not be in the middle of a surrogate pair.
+ * @param s The string to match in.
+ * @param start The index in s at which the match is attempted.
+ * @param limit The limit of the subsequence of s being spanned.
+ * @param t The substring to be matched in s.
+ * @param tlength The length of t.
+ * @return true if t matches s at start and neither edge of the match splits a surrogate pair.
+ */
+ static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
+ return matches16(s, start, t, tlength) // the chars themselves must match
+ && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
+ Character.isLowSurrogate(s.charAt(start))) // the match must not begin between a lead and its trail
+ && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
+ Character.isLowSurrogate(s.charAt(start + tlength))); // nor end between a lead and its trail
+ }
+
+ /**
+ * Does the set contain the code point that starts at s[start], with length chars remaining?
+ * If so, return its length in chars (1 or 2); otherwise return its negative length (-1 or -2).
+ */
+ static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
+ char c = s.charAt(start);
+ if (c >= 0xd800 && c <= 0xdbff && length >= 2) { // lead surrogate with at least one more char available
+ char c2 = s.charAt(start + 1);
+ if (UTF16.isTrailSurrogate(c2)) {
+ int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
+ return set.contains(supplementary) ? 2 : -2;
+ }
+ }
+ return set.contains(c) ? 1 : -1; // BMP char, or a surrogate without its partner, tested as a single char
+ }
+
+ static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
+ char c = s.charAt(length - 1); // last char before index length
+ if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { // trail surrogate with at least one char before it
+ char c2 = s.charAt(length - 2);
+ if (UTF16.isLeadSurrogate(c2)) {
+ int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
+ return set.contains(supplementary) ? 2 : -2; // positive if contained, negative otherwise; magnitude is the length in chars
+ }
+ }
+ return set.contains(c) ? 1 : -1; // BMP char, or a surrogate without its partner, tested as a single char
+ }
+
+ /**
+ * Helper class for UnicodeSetStringSpan.
+ *
+ * List of offsets from the current position from where to try matching
+ * a code point or a string.
+ * Stores offsets rather than indexes to simplify the code and use the same list
+ * for both increments (in span()) and decrements (in spanBack()).
+ *
+ * Assumption: The maximum offset is limited, and the offsets that are stored at any one time
+ * are relatively dense, that is,
+ * there are normally no gaps of hundreds or thousands of offset values.
+ *
+ * This class optionally also tracks the minimum non-negative count for each position,
+ * intended to count the smallest number of elements of any path leading to that position.
+ *
+ * The implementation uses a circular buffer of count integers,
+ * each indicating whether the corresponding offset is in the list,
+ * and its path element count.
+ * This avoids inserting into a sorted list of offsets (or absolute indexes)
+ * and physically moving part of the list.
+ *
+ * Note: In principle, the caller should setMaxLength() to
+ * the maximum of the max string length and U16_LENGTH/U8_LENGTH
+ * to account for "long" single code points.
+ *
+ * Note: An earlier version did not track counts and stored only byte flags.
+ * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
+ * the list could be stored as bit flags in a single integer.
+ * Rather than handling a circular buffer with a start list index,
+ * the integer would simply be shifted when lower offsets are removed.
+ * UnicodeSet does not have a limit on the lengths of strings.
+ */
+ private static final class OffsetList {
+ private int[] list; // circular buffer: 0 = offset not stored, otherwise its count (1 when added via addOffset)
+ private int length; // number of offsets currently stored
+ private int start; // buffer index that corresponds to offset 0
+
+ public OffsetList() {
+ list = new int[16]; // default size
+ }
+
+ public void setMaxLength(int maxLength) {
+ if (maxLength > list.length) { // the buffer only ever grows
+ list = new int[maxLength];
+ }
+ clear(); // the list is reset on every call, whether or not it was reallocated
+ }
+
+ public void clear() {
+ for (int i = list.length; i-- > 0;) {
+ list[i] = 0;
+ }
+ start = length = 0;
+ }
+
+ public boolean isEmpty() {
+ return (length == 0);
+ }
+
+ /**
+ * Reduces all stored offsets by delta, used when the current position moves by delta.
+ * There must not be any offsets lower than delta.
+ * If there is an offset equal to delta, it is removed.
+ *
+ * @param delta [1..maxLength]
+ */
+ public void shift(int delta) {
+ int i = start + delta;
+ if (i >= list.length) {
+ i -= list.length; // wrap around the circular buffer
+ }
+ if (list[i] != 0) {
+ list[i] = 0;
+ --length;
+ }
+ start = i; // moving start is what reduces every remaining offset by delta
+ }
+
+ /**
+ * Adds an offset. The list must not contain it yet.
+ * @param offset [1..maxLength]
+ */
+ public void addOffset(int offset) {
+ int i = start + offset;
+ if (i >= list.length) {
+ i -= list.length;
+ }
+ assert list[i] == 0;
+ list[i] = 1; // any non-zero value marks the offset as present
+ ++length;
+ }
+
+ /**
+ * Adds an offset and updates its count.
+ * The list may already contain the offset.
+ * @param offset [1..maxLength]
+ */
+ public void addOffsetAndCount(int offset, int count) {
+ assert count > 0;
+ int i = start + offset;
+ if (i >= list.length) {
+ i -= list.length;
+ }
+ if (list[i] == 0) {
+ list[i] = count;
+ ++length;
+ } else if (count < list[i]) { // already present: keep the smaller count
+ list[i] = count;
+ }
+ }
+
+ /** Tests whether the given offset is currently stored.
+ * @param offset [1..maxLength]
+ */
+ public boolean containsOffset(int offset) {
+ int i = start + offset;
+ if (i >= list.length) {
+ i -= list.length;
+ }
+ return list[i] != 0;
+ }
+
+ /** Tests whether the offset is stored with a count that is no greater than the given count.
+ * @param offset [1..maxLength]
+ */
+ public boolean hasCountAtOffset(int offset, int count) {
+ int i = start + offset;
+ if (i >= list.length) {
+ i -= list.length;
+ }
+ int oldCount = list[i];
+ return oldCount != 0 && oldCount <= count;
+ }
+
+ /**
+ * Finds the lowest stored offset from a non-empty list, removes it, and reduces all other
+ * offsets by this minimum. If outCount is not null, its value is set to the removed offset's count.
+ * @return min=[1..maxLength]
+ */
+ public int popMinimum(OutputInt outCount) {
+ // Look for the next offset in list[start+1..list.length-1].
+ int i = start, result;
+ while (++i < list.length) {
+ int count = list[i];
+ if (count != 0) {
+ list[i] = 0;
+ --length;
+ result = i - start;
+ start = i;
+ if (outCount != null) { outCount.value = count; }
+ return result;
+ }
+ }
+ // i==list.length
+
+ // Wrap around and look for the next offset in list[0..start].
+ // Since the list is not empty, there will be one.
+ result = list.length - start; // distance from start to the end of the buffer
+ i = 0;
+ int count;
+ while ((count = list[i]) == 0) {
+ ++i;
+ }
+ list[i] = 0;
+ --length;
+ start = i;
+ if (outCount != null) { outCount.value = count; }
+ return result + i; // plus the distance from the buffer start to the found slot
+ }
+ }
+}
Binary files /dev/null and new/jdk/src/java.base/share/classes/sun/text/resources/nfc.icu differ
Binary files /dev/null and new/jdk/src/java.base/share/classes/sun/text/resources/nfkc.icu differ
Binary files /dev/null and new/jdk/src/java.base/share/classes/sun/text/resources/nfkc_cf.icu differ
--- /dev/null 2015-07-13 16:12:20.000000000 +0900
+++ new/jdk/test/java/text/BreakIterator/Bug8032446.java 2015-07-13 16:12:19.000000000 +0900
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ * @test
+ * @bug 8032446
+ * @summary Confirm that BreakIterator works as expected with new characters in Unicode 7.
+ */
+
+import java.text.*;
+import java.util.*;
+
+public class Bug8032446 {
+
+ public static void main(String[] args) {
+ boolean err = false; // NOTE(review): never read or updated in this method - candidate for removal
+
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0x10860; i <= 0x10876; i++) { // Palmyrene Letters
+ sb.append(Character.toChars(i));
+ }
+ sb.append(" "); // the single space that separates the two runs
+ for (int i = 0x10879; i <= 0x1087D; i++) { // Palmyrene Numbers
+ sb.append(Character.toChars(i));
+ }
+ String s = sb.toString();
+
+ BreakIterator bi = BreakIterator.getWordInstance(Locale.ROOT);
+ bi.setText(s);
+ bi.first();
+
+ if (bi.next() != s.indexOf(' ')) { // the first boundary after the start must be at the space, i.e. the letters form one word
+ throw new RuntimeException("Unexpected word breaking.");
+ }
+ }
+
+}
* Note this is for internal use hence no checks for the validity of the
@@ -217,42 +238,48 @@
}
/**
- * Loads the property data and initialize the UCharacterProperty instance.
- * @throws MissingResourceException when data is missing or data has been corrupted
- */
- public static UCharacterProperty getInstance()
+ * Gets the type mask
+ * @param type character type
+ * @return mask
+ */
+ public static final int getMask(int type)
{
- if(INSTANCE_ == null) {
- try {
- INSTANCE_ = new UCharacterProperty();
- }
- catch (Exception e) {
- throw new MissingResourceException(e.getMessage(),"","");
- }
- }
- return INSTANCE_;
+ return 1 << type;
}
/**
- * Checks if the argument c is to be treated as a white space in ICU
- * rules. Usually ICU rule white spaces are ignored unless quoted.
- * Equivalent to test for Pattern_White_Space Unicode property.
- * Stable set of characters, won't change.
- * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
- * @param c codepoint to check
- * @return true if c is a ICU white space
- */
- public static boolean isRuleWhiteSpace(int c)
- {
- /* "white space" in the sense of ICU rule parsers
- This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
- See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
- U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
- Equivalent to test for Pattern_White_Space Unicode property.
- */
- return (c >= 0x0009 && c <= 0x2029 &&
- (c <= 0x000D || c == 0x0020 || c == 0x0085 ||
- c == 0x200E || c == 0x200F || c >= 0x2028));
+ * Returns the digit values of the Latin letters 'A' - 'Z' and 'a' - 'z',
+ * in both their ASCII and full-width forms. This method assumes that the other digit
+ * characters are checked by the calling method.
+ * @param ch character to test
+ * @return -1 if ch is not one of these letters, otherwise
+ * its corresponding digit value (10 for 'A'/'a' through 35 for 'Z'/'z').
+ */
+ public static int getEuropeanDigit(int ch) {
+ if ((ch > 0x7a && ch < 0xff21) // reject everything outside the four accepted ranges
+ || ch < 0x41 || (ch > 0x5a && ch < 0x61)
+ || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) {
+ return -1;
+ }
+ if (ch <= 0x7a) {
+ // ch is in 0x41..0x5a ('A'..'Z') or 0x61..0x7a ('a'..'z')
+ return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
+ }
+ // ch >= 0xff21
+ if (ch <= 0xff3a) {
+ return ch + 10 - 0xff21; // first letter of this range maps to 10
+ }
+ // ch >= 0xff41 && ch <= 0xff5a
+ return ch + 10 - 0xff41;
+ }
+
+ public int digit(int c) {
+ int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; // offset of the stored numeric type/value from the first decimal-digit slot
+ if(value<=9) { // NOTE(review): no lower-bound check, so a negative value (e.g. -1 for NTV_NONE_) is returned unchanged
+ return value;
+ } else {
+ return -1; // stored value lies beyond the decimal-digit slots
+ }
+ }
// protected variables -----------------------------------------------
@@ -260,7 +287,7 @@
/**
* Extra property trie
*/
- CharTrie m_additionalTrie_;
+ Trie2_16 m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
@@ -280,40 +307,24 @@
* 0
*/
int m_maxJTGValue_;
+ /**
+ * Script_Extensions data
+ */
+ public char[] m_scriptExtensions_;
// private variables -------------------------------------------------
- /**
- * UnicodeData.txt property object
- */
- private static UCharacterProperty INSTANCE_ = null;
-
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
- * Default buffer size of datafile
- */
- private static final int DATA_BUFFER_SIZE_ = 25000;
-
- /**
- * Numeric value shift
- */
- private static final int VALUE_SHIFT_ = 8;
-
- /**
- * Mask to be applied after shifting to obtain an unsigned numeric value
- */
- private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
-
- /**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
- * Offset to add to combined surrogate pair to avoid msking.
+ * Offset to add to combined surrogate pair to avoid masking.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
@@ -321,7 +332,153 @@
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
- // additional properties ----------------------------------------------
+
+ // property data constants -------------------------------------------------
+
+ /**
+ * Numeric types and values in the main properties words.
+ */
+ private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
+ private static final int getNumericTypeValue(int props) {
+ return props >> NUMERIC_TYPE_VALUE_SHIFT_;
+ }
+
+ /* constants for the storage form of numeric types and values */
+ /** No numeric value. */
+ private static final int NTV_NONE_ = 0;
+ /** Decimal digits: nv=0..9 */
+ private static final int NTV_DECIMAL_START_ = 1;
+ /** Other digits: nv=0..9 */
+ private static final int NTV_DIGIT_START_ = 11;
+ /** Small integers: nv=0..154 */
+ private static final int NTV_NUMERIC_START_ = 21;
+
+ private static final int ntvGetType(int ntv) {
+ return
+ (ntv==NTV_NONE_) ? NumericType.NONE :
+ (ntvbounds(string, offset16) != TRAIL
.
*
* UCharacter.isLegal()
can be used to check
@@ -106,10 +101,10 @@
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
* UTF16.getCharCount()
, as well as random access. If a
* validity check is required, use
@@ -232,19 +261,72 @@
* character will be returned. If a complete supplementary character is
* not found the incomplete character will be returned
* @param source array of UTF-16 chars
- * @param start offset to substring in the source array for analyzing
- * @param limit offset to substring in the source array for analyzing
- * @param offset16 UTF-16 offset relative to start
+ * @param offset16 UTF-16 offset to the start of the character.
* @return UTF-32 value for the UTF-32 value that contains the char at
* offset16. The boundaries of that codepoint are the same as in
* bounds32()
.
- * @exception IndexOutOfBoundsException thrown if offset16 is not within
- * the range of start and limit.
+ * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
* @stable ICU 2.1
*/
- public static int charAt(char source[], int start, int limit,
- int offset16)
- {
+ public static int charAt(CharSequence source, int offset16) {
+ char single = source.charAt(offset16);
+ if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { // fast path: below the surrogate range, so the char is returned as-is
+ return single;
+ }
+ return _charAt(source, offset16, single); // possible surrogate: let the helper look for its partner
+ }
+
+ private static int _charAt(CharSequence source, int offset16, char single) {
+ if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { // above the surrogate range: not a surrogate
+ return single;
+ }
+
+ // Convert the UTF-16 surrogate pair if necessary.
+ // For simplicity in usage, and because the frequency of pairs is
+ // low, look both directions.
+
+ if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+ ++offset16; // look at the following char for a trail surrogate
+ if (source.length() != offset16) {
+ char trail = source.charAt(offset16);
+ if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
+ && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
+ return UCharacterProperty.getRawSupplementary(single, trail);
+ }
+ }
+ } else {
+ --offset16; // look at the preceding char
+ if (offset16 >= 0) {
+ // single is a trail surrogate, so look for a lead surrogate before it
+ char lead = source.charAt(offset16);
+ if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
+ && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+ return UCharacterProperty.getRawSupplementary(lead, single);
+ }
+ }
+ }
+ return single; // return unmatched surrogate
+ }
+
+ /**
+ * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
+ * (with UTF16.getCharCount()), as well as random access. If a validity check is
+ * required, use UCharacter.isLegal()
+ *
+ * on the return value. If the char retrieved is part of a surrogate pair, its supplementary
+ * character will be returned. If a complete supplementary character is not found the incomplete
+ * character will be returned
+ *
+ * @param source Array of UTF-16 chars
+ * @param start Offset to substring in the source array for analyzing
+ * @param limit Offset to substring in the source array for analyzing
+ * @param offset16 UTF-16 offset relative to start
+ * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
+ * of that codepoint are the same as in bounds32().
+ * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
+ * @stable ICU 2.1
+ */
+ public static int charAt(char source[], int start, int limit, int offset16) {
offset16 += start;
if (offset16 < start || offset16 >= limit) {
throw new ArrayIndexOutOfBoundsException(offset16);
@@ -259,7 +341,7 @@
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
- offset16 ++;
+ offset16++;
if (offset16 >= limit) {
return single;
}
@@ -272,7 +354,7 @@
if (offset16 == start) {
return single;
}
- offset16 --;
+ offset16--;
char lead = source[offset16];
if (isLeadSurrogate(lead))
return UCharacterProperty.getRawSupplementary(lead, single);
@@ -300,37 +382,34 @@
/**
* Determines whether the code value is a surrogate.
* @param char16 the input character.
- * @return true iff the input character is a surrogate.
+ * @return true if the input character is a surrogate.
* @stable ICU 2.1
*/
public static boolean isSurrogate(char char16)
{
- return LEAD_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= TRAIL_SURROGATE_MAX_VALUE;
+ return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
}
/**
* Determines whether the character is a trail surrogate.
* @param char16 the input character.
- * @return true iff the input character is a trail surrogate.
+ * @return true if the input character is a trail surrogate.
* @stable ICU 2.1
*/
public static boolean isTrailSurrogate(char char16)
{
- return (TRAIL_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= TRAIL_SURROGATE_MAX_VALUE);
+ return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
}
/**
* Determines whether the character is a lead surrogate.
* @param char16 the input character.
- * @return true iff the input character is a lead surrogate
+ * @return true if the input character is a lead surrogate
* @stable ICU 2.1
*/
public static boolean isLeadSurrogate(char char16)
{
- return LEAD_SURROGATE_MIN_VALUE <= char16 &&
- char16 <= LEAD_SURROGATE_MAX_VALUE;
+ return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
}
/**
@@ -359,7 +438,7 @@
* isLegal()
* on char32 before calling.
* @param char32 the input character.
- * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
+ * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
* the character itself
* @stable ICU 2.1
*/
@@ -370,7 +449,7 @@
(char32 & TRAIL_SURROGATE_MASK_));
}
- return (char)char32;
+ return (char) char32;
}
/**
@@ -415,16 +494,15 @@
// Write the UTF-16 values
if (char32 >= SUPPLEMENTARY_MIN_VALUE)
{
- target.append(getLeadSurrogate(char32));
- target.append(getTrailSurrogate(char32));
- }
+ target.append(getLeadSurrogate(char32));
+ target.append(getTrailSurrogate(char32));
+ }
else {
- target.append((char)char32);
+ target.append((char) char32);
}
return target;
}
- //// for StringPrep
/**
* Shifts offset16 by the argument number of codepoints within a subarray.
* @param source char array
@@ -441,20 +519,20 @@
public static int moveCodePointOffset(char source[], int start, int limit,
int offset16, int shift32)
{
- int size = source.length;
- int count;
- char ch;
- int result = offset16 + start;
- if (start<0 || limit
+ * All of the above can be used in for loops.
+ * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in for loops.
+ * <code>end >
+ * start</code> then an empty set is created.
*
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
@@ -359,7 +371,7 @@
*/
public UnicodeSet(String pattern) {
this();
- applyPattern(pattern, null, null, IGNORE_SPACE);
+ applyPattern(pattern, null);
}
/**
@@ -368,172 +380,29 @@
* copied to this object
* @stable ICU 2.0
*/
- @SuppressWarnings("unchecked") // Casting result of clone of a collection
public UnicodeSet set(UnicodeSet other) {
+ checkFrozen();
list = other.list.clone();
len = other.len;
- pat = other.pat;
- strings = (TreeSet)other.strings.clone();
+ strings = new TreeSettoPattern()
representation of a
- * string to the given <code>StringBuffer</code>.
- */
- private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
- for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
- _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
- }
- }
-
- /**
- * Append the <code>toPattern()</code> representation of a
- * character to the given <code>StringBuffer</code>.
- */
- private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
- if (escapeUnprintable && Utility.isUnprintable(c)) {
- // Use hex escape notation (complement(MIN_VALUE, MAX_VALUE)
.
- * @stable ICU 2.0
- */
- public UnicodeSet complement() {
- if (list[0] == LOW) {
- System.arraycopy(list, 1, list, 0, len-1);
- --len;
- } else {
- ensureCapacity(len+1);
- System.arraycopy(list, 0, list, 1, len);
- list[0] = LOW;
- ++len;
- }
- pat = null;
return this;
}
@@ -743,6 +595,12 @@
if (c < MIN_VALUE || c > MAX_VALUE) {
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
}
+ if (bmpSet != null) {
+ return bmpSet.contains(c);
+ }
+ if (stringSpan != null) {
+ return stringSpan.contains(c);
+ }
/*
// Set i to the index of the start item greater than ch
@@ -751,7 +609,7 @@
while (true) {
if (c < list[++i]) break;
}
- */
+ */
int i = findCodePoint(c);
@@ -790,7 +648,7 @@
// invariant: c < list[hi]
for (;;) {
int i = (lo + hi) >>> 1;
- if (i == lo) return hi;
+ if (i == lo) return hi;
if (c < list[i]) {
hi = i;
} else {
@@ -800,22 +658,6 @@
}
/**
- * Adds all of the elements in the specified set to this set if
- * they're not already present. This operation effectively
- * modifies this set so that its value is the union of the two
- * sets. The behavior of this operation is unspecified if the specified
- * collection is modified while the operation is in progress.
- *
- * @param c set whose elements are to be added to this set.
- * @stable ICU 2.0
- */
- public UnicodeSet addAll(UnicodeSet c) {
- add(c.list, c.len, 0);
- strings.addAll(c.strings);
- return this;
- }
-
- /**
* Retains only the elements in this set that are contained in the
* specified set. In other words, removes from this set all of
* its elements that are not contained in the specified set. This
@@ -826,36 +668,21 @@
* @stable ICU 2.0
*/
public UnicodeSet retainAll(UnicodeSet c) {
+ checkFrozen();
retain(c.list, c.len, 0);
strings.retainAll(c.strings);
return this;
}
/**
- * Removes from this set all of its elements that are contained in the
- * specified set. This operation effectively modifies this
- * set so that its value is the asymmetric set difference of
- * the two sets.
- *
- * @param c set that defines which elements will be removed from
- * this set.
- * @stable ICU 2.0
- */
- public UnicodeSet removeAll(UnicodeSet c) {
- retain(c.list, c.len, 2);
- strings.removeAll(c.strings);
- return this;
- }
-
- /**
* Removes all of the elements from this set. This set will be
* empty after this call returns.
* @stable ICU 2.0
*/
public UnicodeSet clear() {
+ checkFrozen();
list[0] = HIGH;
len = 1;
- pat = null;
strings.clear();
return this;
}
@@ -923,405 +750,18 @@
* of pattern
* @exception java.lang.IllegalArgumentException if the parse fails.
*/
- UnicodeSet applyPattern(String pattern,
- ParsePosition pos,
- SymbolTable symbols,
- int options) {
-
- // Need to build the pattern in a temporary string because
- // _applyPattern calls add() etc., which set pat to empty.
- boolean parsePositionWasNull = pos == null;
- if (parsePositionWasNull) {
- pos = new ParsePosition(0);
- }
-
- StringBuffer rebuiltPat = new StringBuffer();
- RuleCharacterIterator chars =
- new RuleCharacterIterator(pattern, symbols, pos);
- applyPattern(chars, symbols, rebuiltPat, options);
- if (chars.inVariable()) {
- syntaxError(chars, "Extra chars in variable value");
- }
- pat = rebuiltPat.toString();
- if (parsePositionWasNull) {
- int i = pos.getIndex();
-
- // Skip over trailing whitespace
- if ((options & IGNORE_SPACE) != 0) {
- i = Utility.skipWhitespace(pattern, i);
- }
-
- if (i != pattern.length()) {
- throw new IllegalArgumentException("Parse of \"" + pattern +
- "\" failed at " + i);
- }
- }
- return this;
- }
-
- /**
- * Parse the pattern from the given RuleCharacterIterator. The
- * iterator is advanced over the parsed pattern.
- * @param chars iterator over the pattern characters. Upon return
- * it will be advanced to the first character after the parsed
- * pattern, or the end of the iteration if all characters are
- * parsed.
- * @param symbols symbol table to use to parse and dereference
- * variables, or null if none.
- * @param rebuiltPat the pattern that was parsed, rebuilt or
- * copied from the input pattern, as appropriate.
- * @param options a bit mask of zero or more of the following:
- * IGNORE_SPACE, CASE.
- */
- void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
- StringBuffer rebuiltPat, int options) {
- // Syntax characters: [ ] ^ - & { }
-
- // Recognized special forms for chars, sets: c-c s-s s&s
-
- int opts = RuleCharacterIterator.PARSE_VARIABLES |
- RuleCharacterIterator.PARSE_ESCAPES;
- if ((options & IGNORE_SPACE) != 0) {
- opts |= RuleCharacterIterator.SKIP_WHITESPACE;
- }
-
- StringBuffer patBuf = new StringBuffer(), buf = null;
- boolean usePat = false;
- UnicodeSet scratch = null;
- Object backup = null;
-
- // mode: 0=before [, 1=between [...], 2=after ]
- // lastItem: 0=none, 1=char, 2=set
- int lastItem = 0, lastChar = 0, mode = 0;
- char op = 0;
-
- boolean invert = false;
-
- clear();
-
- while (mode != 2 && !chars.atEnd()) {
- if (false) {
- // Debugging assertion
- if (!((lastItem == 0 && op == 0) ||
- (lastItem == 1 && (op == 0 || op == '-')) ||
- (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) {
- throw new IllegalArgumentException();
- }
- }
-
- int c = 0;
- boolean literal = false;
- UnicodeSet nested = null;
-
- // -------- Check for property pattern
-
- // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
- int setMode = 0;
- if (resemblesPropertyPattern(chars, opts)) {
- setMode = 2;
- }
-
- // -------- Parse '[' of opening delimiter OR nested set.
- // If there is a nested set, use `setMode' to define how
- // the set should be parsed. If the '[' is part of the
- // opening delimiter for this pattern, parse special
- // strings "[", "[^", "[-", and "[^-". Check for stand-in
- // characters representing a nested set in the symbol
- // table.
-
- else {
- // Prepare to backup if necessary
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
-
- if (c == '[' && !literal) {
- if (mode == 1) {
- chars.setPos(backup); // backup
- setMode = 1;
- } else {
- // Handle opening '[' delimiter
- mode = 1;
- patBuf.append('[');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '^' && !literal) {
- invert = true;
- patBuf.append('^');
- backup = chars.getPos(backup); // prepare to backup
- c = chars.next(opts);
- literal = chars.isEscaped();
- }
- // Fall through to handle special leading '-';
- // otherwise restart loop for nested [], \p{}, etc.
- if (c == '-') {
- literal = true;
- // Fall through to handle literal '-' below
- } else {
- chars.setPos(backup); // backup
- continue;
- }
- }
- } else if (symbols != null) {
- UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
- if (m != null) {
- try {
- nested = (UnicodeSet) m;
- setMode = 3;
- } catch (ClassCastException e) {
- syntaxError(chars, "Syntax error");
- }
- }
- }
- }
-
- // -------- Handle a nested set. This either is inline in
- // the pattern or represented by a stand-in that has
- // previously been parsed and was looked up in the symbol
- // table.
-
- if (setMode != 0) {
- if (lastItem == 1) {
- if (op != 0) {
- syntaxError(chars, "Char expected after operator");
- }
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastItem = op = 0;
- }
-
- if (op == '-' || op == '&') {
- patBuf.append(op);
- }
-
- if (nested == null) {
- if (scratch == null) scratch = new UnicodeSet();
- nested = scratch;
- }
- switch (setMode) {
- case 1:
- nested.applyPattern(chars, symbols, patBuf, options);
- break;
- case 2:
- chars.skipIgnored(opts);
- nested.applyPropertyPattern(chars, patBuf, symbols);
- break;
- case 3: // `nested' already parsed
- nested._toPattern(patBuf, false);
- break;
- }
-
- usePat = true;
-
- if (mode == 0) {
- // Entire pattern is a category; leave parse loop
- set(nested);
- mode = 2;
- break;
- }
-
- switch (op) {
- case '-':
- removeAll(nested);
- break;
- case '&':
- retainAll(nested);
- break;
- case 0:
- addAll(nested);
- break;
- }
-
- op = 0;
- lastItem = 2;
-
- continue;
- }
-
- if (mode == 0) {
- syntaxError(chars, "Missing '['");
- }
-
- // -------- Parse special (syntax) characters. If the
- // current character is not special, or if it is escaped,
- // then fall through and handle it below.
-
- if (!literal) {
- switch (c) {
- case ']':
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- // Treat final trailing '-' as a literal
- if (op == '-') {
- add_unchecked(op, op);
- patBuf.append(op);
- } else if (op == '&') {
- syntaxError(chars, "Trailing '&'");
- }
- patBuf.append(']');
- mode = 2;
- continue;
- case '-':
- if (op == 0) {
- if (lastItem != 0) {
- op = (char) c;
- continue;
- } else {
- // Treat final trailing '-' as a literal
- add_unchecked(c, c);
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == ']' && !literal) {
- patBuf.append("-]");
- mode = 2;
- continue;
- }
- }
- }
- syntaxError(chars, "'-' not after char or set");
- break;
- case '&':
- if (lastItem == 2 && op == 0) {
- op = (char) c;
- continue;
- }
- syntaxError(chars, "'&' not after set");
- break;
- case '^':
- syntaxError(chars, "'^' not after '['");
- break;
- case '{':
- if (op != 0) {
- syntaxError(chars, "Missing operand after operator");
- }
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- lastItem = 0;
- if (buf == null) {
- buf = new StringBuffer();
- } else {
- buf.setLength(0);
- }
- boolean ok = false;
- while (!chars.atEnd()) {
- c = chars.next(opts);
- literal = chars.isEscaped();
- if (c == '}' && !literal) {
- ok = true;
- break;
- }
- UTF16.append(buf, c);
- }
- if (buf.length() < 1 || !ok) {
- syntaxError(chars, "Invalid multicharacter string");
- }
- // We have new string. Add it to set and continue;
- // we don't need to drop through to the further
- // processing
- add(buf.toString());
- patBuf.append('{');
- _appendToPat(patBuf, buf.toString(), false);
- patBuf.append('}');
- continue;
- case SymbolTable.SYMBOL_REF:
- // symbols nosymbols
- // [a-$] error error (ambiguous)
- // [a$] anchor anchor
- // [a-$x] var "x"* literal '$'
- // [a-$.] error literal '$'
- // *We won't get here in the case of var "x"
- backup = chars.getPos(backup);
- c = chars.next(opts);
- literal = chars.isEscaped();
- boolean anchor = (c == ']' && !literal);
- if (symbols == null && !anchor) {
- c = SymbolTable.SYMBOL_REF;
- chars.setPos(backup);
- break; // literal '$'
- }
- if (anchor && op == 0) {
- if (lastItem == 1) {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- }
- add_unchecked(UnicodeMatcher.ETHER);
- usePat = true;
- patBuf.append(SymbolTable.SYMBOL_REF).append(']');
- mode = 2;
- continue;
- }
- syntaxError(chars, "Unquoted '$'");
- break;
- default:
- break;
- }
- }
-
- // -------- Parse literal characters. This includes both
- // escaped chars ("\u4E01") and non-syntax characters
- // ("a").
-
- switch (lastItem) {
- case 0:
- lastItem = 1;
- lastChar = c;
- break;
- case 1:
- if (op == '-') {
- if (lastChar >= c) {
- // Don't allow redundant (a-a) or empty (b-a) ranges;
- // these are most likely typos.
- syntaxError(chars, "Invalid range");
- }
- add_unchecked(lastChar, c);
- _appendToPat(patBuf, lastChar, false);
- patBuf.append(op);
- _appendToPat(patBuf, c, false);
- lastItem = op = 0;
- } else {
- add_unchecked(lastChar, lastChar);
- _appendToPat(patBuf, lastChar, false);
- lastChar = c;
- }
- break;
- case 2:
- if (op != 0) {
- syntaxError(chars, "Set expected after operator");
- }
- lastChar = c;
- lastItem = 1;
- break;
- }
- }
-
- if (mode != 2) {
- syntaxError(chars, "Missing ']'");
- }
-
- chars.skipIgnored(opts);
-
- if (invert) {
- complement();
- }
-
- // Use the rebuilt pattern (pat) only if necessary. Prefer the
- // generated pattern.
- if (usePat) {
- rebuiltPat.append(patBuf.toString());
+ private UnicodeSet applyPattern(String pattern,
+ ParsePosition pos) {
+ if ("[:age=3.2:]".equals(pattern)) {
+ checkFrozen();
+ VersionInfo version = VersionInfo.getInstance("3.2");
+ applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC);
} else {
- _generatePattern(rebuiltPat, false, true);
+ throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern "
+ + pattern + ")");
}
- }
- private static void syntaxError(RuleCharacterIterator chars, String msg) {
- throw new IllegalArgumentException("Error: " + msg + " at \"" +
- Utility.escape(chars.toString()) +
- '"');
+ return this;
}
//----------------------------------------------------------------
@@ -1397,7 +837,6 @@
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1414,88 +853,87 @@
// change from xor is that we have to check overlapping pairs
// polarity bit 1 means a is second, bit 2 means b is.
main:
- while (true) {
- switch (polarity) {
- case 0: // both first; take lower if unequal
- if (a < b) { // take a
- // Back up over overlapping ranges in buffer[]
- if (k > 0 && a <= buffer[k-1]) {
- // Pick latter end value in buffer[] vs. list[]
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
- buffer[k++] = a;
- a = list[i];
- }
- i++; // Common if/else code factored out
- polarity ^= 1;
- } else if (b < a) { // take b
- if (k > 0 && b <= buffer[k-1]) {
- b = max(other[j], buffer[--k]);
- } else {
- buffer[k++] = b;
- b = other[j];
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; take lower if unequal
+ if (a < b) { // take a
+ // Back up over overlapping ranges in buffer[]
+ if (k > 0 && a <= buffer[k-1]) {
+ // Pick latter end value in buffer[] vs. list[]
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++; // Common if/else code factored out
+ polarity ^= 1;
+ } else if (b < a) { // take b
+ if (k > 0 && b <= buffer[k-1]) {
+ b = max(other[j], buffer[--k]);
+ } else {
+ buffer[k++] = b;
+ b = other[j];
+ }
+ j++;
+ polarity ^= 2;
+ } else { // a == b, take a, drop b
+ if (a == HIGH) break main;
+ // This is symmetrical; it doesn't matter if
+ // we backtrack with a or b. - liu
+ if (k > 0 && a <= buffer[k-1]) {
+ a = max(list[i], buffer[--k]);
+ } else {
+ // No overlap
+ buffer[k++] = a;
+ a = list[i];
+ }
+ i++;
+ polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
}
- j++;
- polarity ^= 2;
- } else { // a == b, take a, drop b
- if (a == HIGH) break main;
- // This is symmetrical; it doesn't matter if
- // we backtrack with a or b. - liu
- if (k > 0 && a <= buffer[k-1]) {
- a = max(list[i], buffer[--k]);
- } else {
- // No overlap
+ break;
+ case 3: // both second; take higher if unequal, and drop other
+ if (b <= a) { // take a
+ if (a == HIGH) break main;
buffer[k++] = a;
- a = list[i];
+ } else { // take b
+ if (b == HIGH) break main;
+ buffer[k++] = b;
}
- i++;
- polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take higher if unequal, and drop other
- if (b <= a) { // take a
- if (a == HIGH) break main;
- buffer[k++] = a;
- } else { // take b
- if (b == HIGH) break main;
- buffer[k++] = b;
- }
- a = list[i++]; polarity ^= 1; // factored common code
- b = other[j++]; polarity ^= 2;
- break;
- case 1: // a second, b first; if b < a, overlap
- if (a < b) { // no overlap, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
+ a = list[i++]; polarity ^= 1; // factored common code
b = other[j++]; polarity ^= 2;
+ break;
+ case 1: // a second, b first; if b < a, overlap
+ if (a < b) { // no overlap, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
}
- break;
}
- }
buffer[k++] = HIGH; // terminate
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1512,61 +950,60 @@
// change from xor is that we have to check overlapping pairs
// polarity bit 1 means a is second, bit 2 means b is.
main:
- while (true) {
- switch (polarity) {
- case 0: // both first; drop the smaller
- if (a < b) { // drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // drop b
- b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 3: // both second; take lower if unequal
- if (a < b) { // take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else if (b < a) { // take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, take one, drop other
- if (a == HIGH) break main;
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 1: // a second, b first;
- if (a < b) { // NO OVERLAP, drop a
- a = list[i++]; polarity ^= 1;
- } else if (b < a) { // OVERLAP, take b
- buffer[k++] = b; b = other[j++]; polarity ^= 2;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
- }
- break;
- case 2: // a first, b second; if a < b, overlap
- if (b < a) { // no overlap, drop b
- b = other[j++]; polarity ^= 2;
- } else if (a < b) { // OVERLAP, take a
- buffer[k++] = a; a = list[i++]; polarity ^= 1;
- } else { // a == b, drop both!
- if (a == HIGH) break main;
- a = list[i++]; polarity ^= 1;
- b = other[j++]; polarity ^= 2;
+ while (true) {
+ switch (polarity) {
+ case 0: // both first; drop the smaller
+ if (a < b) { // drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // drop b
+ b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 3: // both second; take lower if unequal
+ if (a < b) { // take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, take one, drop other
+ if (a == HIGH) break main;
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 1: // a second, b first;
+ if (a < b) { // NO OVERLAP, drop a
+ a = list[i++]; polarity ^= 1;
+ } else if (b < a) { // OVERLAP, take b
+ buffer[k++] = b; b = other[j++]; polarity ^= 2;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
+ case 2: // a first, b second; if a < b, overlap
+ if (b < a) { // no overlap, drop b
+ b = other[j++]; polarity ^= 2;
+ } else if (a < b) { // OVERLAP, take a
+ buffer[k++] = a; a = list[i++]; polarity ^= 1;
+ } else { // a == b, drop both!
+ if (a == HIGH) break main;
+ a = list[i++]; polarity ^= 1;
+ b = other[j++]; polarity ^= 2;
+ }
+ break;
}
- break;
}
- }
buffer[k++] = HIGH; // terminate
len = k;
// swap list and buffer
int[] temp = list;
list = buffer;
buffer = temp;
- pat = null;
return this;
}
@@ -1582,58 +1019,46 @@
boolean contains(int codePoint);
}
- // VersionInfo for unassigned characters
- static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
+ private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
private static class VersionFilter implements Filter {
VersionInfo version;
-
VersionFilter(VersionInfo version) { this.version = version; }
-
public boolean contains(int ch) {
VersionInfo v = UCharacter.getAge(ch);
// Reference comparison ok; VersionInfo caches and reuses
// unique objects.
return v != NO_VERSION &&
- v.compareTo(version) <= 0;
+ v.compareTo(version) <= 0;
}
}
private static synchronized UnicodeSet getInclusions(int src) {
- if (INCLUSIONS == null) {
- INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT];
+ if (src != UCharacterProperty.SRC_PROPSVEC) {
+ throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
}
- if(INCLUSIONS[src] == null) {
+
+ if (INCLUSION == null) {
UnicodeSet incl = new UnicodeSet();
- switch(src) {
- case UCharacterProperty.SRC_PROPSVEC:
- UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl);
- break;
- default:
- throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")");
- }
- INCLUSIONS[src] = incl;
+ UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
+ INCLUSION = incl;
}
- return INCLUSIONS[src];
+ return INCLUSION;
}
/**
* Generic filter-based scanning code for UCD property UnicodeSets.
*/
private UnicodeSet applyFilter(Filter filter, int src) {
- // Walk through all Unicode characters, noting the start
+ // Logically, walk through all Unicode characters, noting the start
// and end of each range for which filter.contain(c) is
// true. Add each range to a set.
//
- // To improve performance, use the INCLUSIONS set, which
+ // To improve performance, use an inclusions set which
// encodes information about character ranges that are known
- // to have identical properties, such as the CJK Ideographs
- // from U+4E00 to U+9FA5. INCLUSIONS contains all characters
- // except the first characters of such ranges.
- //
- // TODO Where possible, instead of scanning over code points,
- // use internal property data to initialize UnicodeSets for
- // those properties. Scanning code points is slow.
+ // to have identical properties.
+ // getInclusions(src) contains exactly the first characters of
+ // same-value ranges for the given properties "source".
clear();
@@ -1668,204 +1093,315 @@
}
/**
- * Remove leading and trailing rule white space and compress
- * internal rule white space to a single space character.
+ * Is this frozen, according to the Freezable interface?
*
- * @see UCharacterProperty#isRuleWhiteSpace
+ * @return value
+ * @stable ICU 3.8
*/
- private static String mungeCharName(String source) {
- StringBuffer buf = new StringBuffer();
- for (int i=0; i
+ *
+ * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in
+ * the set (for example, whether they overlap with each other) and the string that is processed. For a set with
+ * strings:
+ *
+ *
+ * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then
+ * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could
+ * be used.
+ * <code>(OR of each set element)*</code>.
+ * (Java/ICU/Perl regex stops at the first match of an OR.)
+ *
+ * @stable ICU 4.4
+ */
+ CONTAINED,
+ /**
+ * Continues a span() while there is a set element at the current position.
+ * Increments by the longest matching element at each position.
+ * (For characters only, this is like while contains(current)==true).
+ * StringBuffer
in the given radix. This is
- * done recursively since it is easiest to generate the low-
- * order digit first, but it must be appended last.
- *
- * @param result is the <code>StringBuffer</code> to append to
- * @param n is the positive integer
- * @param radix is the radix, from 2 to 36 inclusive
- * @param minDigits is the minimum number of digits to append.
- */
- private static void recursiveAppendNumber(StringBuffer result, int n,
- int radix, int minDigits)
- {
- int digit = n % radix;
-
- if (n >= radix || minDigits > 1) {
- recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
- }
-
- result.append(DIGITS[digit]);
- }
-
- /**
- * Append a number to the given StringBuffer in the given radix.
- * Standard digits '0'-'9' are used and letters 'A'-'Z' for
- * radices 11 through 36.
- * @param result the digits of the number are appended here
- * @param n the number to be converted to digits; may be negative.
- * If negative, a '-' is prepended to the digits.
- * @param radix a radix from 2 to 36 inclusive.
- * @param minDigits the minimum number of digits, not including
- * any '-', to produce. Values less than 2 have no effect. One
- * digit is always emitted regardless of this parameter.
- * @return a reference to result
- */
- public static StringBuffer appendNumber(StringBuffer result, int n,
- int radix, int minDigits)
- throws IllegalArgumentException
- {
- if (radix < 2 || radix > 36) {
- throw new IllegalArgumentException("Illegal radix " + radix);
- }
-
-
- int abs = n;
-
- if (n < 0) {
- abs = -n;
- result.append("-");
- }
-
- recursiveAppendNumber(result, abs, radix, minDigits);
-
- return result;
- }
-
- /**
* Return true if the character is NOT printable ASCII. The tab,
* newline and linefeed characters are considered unprintable.
*/
public static boolean isUnprintable(int c) {
+ //0x20 = 32 and 0x7E = 126
return !(c >= 0x20 && c <= 0x7E);
}
/**
- * Escape unprintable characters using {@code
+
# Math (Math)
Math; N ; No ; F ; False
@@ -1006,12 +1098,14 @@
# Script (sc)
+sc ; Aghb ; Caucasian_Albanian
sc ; Arab ; Arabic
sc ; Armi ; Imperial_Aramaic
sc ; Armn ; Armenian
sc ; Avst ; Avestan
sc ; Bali ; Balinese
sc ; Bamu ; Bamum
+sc ; Bass ; Bassa_Vah
sc ; Batk ; Batak
sc ; Beng ; Bengali
sc ; Bopo ; Bopomofo
@@ -1029,11 +1123,14 @@
sc ; Cyrl ; Cyrillic
sc ; Deva ; Devanagari
sc ; Dsrt ; Deseret
+sc ; Dupl ; Duployan
sc ; Egyp ; Egyptian_Hieroglyphs
+sc ; Elba ; Elbasan
sc ; Ethi ; Ethiopic
sc ; Geor ; Georgian
sc ; Glag ; Glagolitic
sc ; Goth ; Gothic
+sc ; Gran ; Grantha
sc ; Grek ; Greek
sc ; Gujr ; Gujarati
sc ; Guru ; Gurmukhi
@@ -1042,6 +1139,7 @@
sc ; Hano ; Hanunoo
sc ; Hebr ; Hebrew
sc ; Hira ; Hiragana
+sc ; Hmng ; Pahawh_Hmong
sc ; Hrkt ; Katakana_Or_Hiragana
sc ; Ital ; Old_Italic
sc ; Java ; Javanese
@@ -1049,6 +1147,7 @@
sc ; Kana ; Katakana
sc ; Khar ; Kharoshthi
sc ; Khmr ; Khmer
+sc ; Khoj ; Khojki
sc ; Knda ; Kannada
sc ; Kthi ; Kaithi
sc ; Lana ; Tai_Tham
@@ -1056,25 +1155,37 @@
sc ; Latn ; Latin
sc ; Lepc ; Lepcha
sc ; Limb ; Limbu
+sc ; Lina ; Linear_A
sc ; Linb ; Linear_B
sc ; Lisu ; Lisu
sc ; Lyci ; Lycian
sc ; Lydi ; Lydian
+sc ; Mahj ; Mahajani
sc ; Mand ; Mandaic
+sc ; Mani ; Manichaean
+sc ; Mend ; Mende_Kikakui
sc ; Merc ; Meroitic_Cursive
sc ; Mero ; Meroitic_Hieroglyphs
sc ; Mlym ; Malayalam
+sc ; Modi ; Modi
sc ; Mong ; Mongolian
+sc ; Mroo ; Mro
sc ; Mtei ; Meetei_Mayek
sc ; Mymr ; Myanmar
+sc ; Narb ; Old_North_Arabian
+sc ; Nbat ; Nabataean
sc ; Nkoo ; Nko
sc ; Ogam ; Ogham
sc ; Olck ; Ol_Chiki
sc ; Orkh ; Old_Turkic
sc ; Orya ; Oriya
sc ; Osma ; Osmanya
+sc ; Palm ; Palmyrene
+sc ; Pauc ; Pau_Cin_Hau
+sc ; Perm ; Old_Permic
sc ; Phag ; Phags_Pa
sc ; Phli ; Inscriptional_Pahlavi
+sc ; Phlp ; Psalter_Pahlavi
sc ; Phnx ; Phoenician
sc ; Plrd ; Miao
sc ; Prti ; Inscriptional_Parthian
@@ -1085,6 +1196,8 @@
sc ; Saur ; Saurashtra
sc ; Shaw ; Shavian
sc ; Shrd ; Sharada
+sc ; Sidd ; Siddham
+sc ; Sind ; Khudawadi
sc ; Sinh ; Sinhala
sc ; Sora ; Sora_Sompeng
sc ; Sund ; Sundanese
@@ -1102,8 +1215,10 @@
sc ; Thaa ; Thaana
sc ; Thai ; Thai
sc ; Tibt ; Tibetan
+sc ; Tirh ; Tirhuta
sc ; Ugar ; Ugaritic
sc ; Vaii ; Vai
+sc ; Wara ; Warang_Citi
sc ; Xpeo ; Old_Persian
sc ; Xsux ; Cuneiform
sc ; Yiii ; Yi
@@ -1159,6 +1274,10 @@
Term; N ; No ; F ; False
Term; Y ; Yes ; T ; True
+# Titlecase_Mapping (tc)
+
+# @missing: 0000..10FFFF; Titlecase_Mapping;
+
# Unicode_1_Name (na1)
# @missing: 0000..10FFFF; Unicode_1_Name;
+
# Variation_Selector (VS)
VS ; N ; No ; F ; False
@@ -1186,9 +1309,11 @@
# Word_Break (WB)
WB ; CR ; CR
+WB ; DQ ; Double_Quote
WB ; EX ; ExtendNumLet
WB ; Extend ; Extend
WB ; FO ; Format
+WB ; HL ; Hebrew_Letter
WB ; KA ; Katakana
WB ; LE ; ALetter
WB ; LF ; LF
@@ -1198,6 +1323,7 @@
WB ; NL ; Newline
WB ; NU ; Numeric
WB ; RI ; Regional_Indicator
+WB ; SQ ; Single_Quote
WB ; XX ; Other
# XID_Continue (XIDC)
--- old/jdk/test/java/lang/Character/Scripts.txt 2015-07-13 16:12:05.000000000 +0900
+++ new/jdk/test/java/lang/Character/Scripts.txt 2015-07-13 16:12:04.000000000 +0900
@@ -1,8 +1,8 @@
-# Scripts-6.2.0.txt
-# Date: 2012-06-04, 17:21:29 GMT [MD]
+# Scripts-7.0.0.txt
+# Date: 2014-05-15, 00:11:35 GMT [MD]
#
# Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
@@ -83,8 +83,10 @@
0385 ; Common # Sk GREEK DIALYTIKA TONOS
0387 ; Common # Po GREEK ANO TELEIA
0589 ; Common # Po ARMENIAN FULL STOP
+0605 ; Common # Cf ARABIC NUMBER MARK ABOVE
060C ; Common # Po ARABIC COMMA
061B ; Common # Po ARABIC SEMICOLON
+061C ; Common # Cf ARABIC LETTER MARK
061F ; Common # Po ARABIC QUESTION MARK
0640 ; Common # Lm ARABIC TATWEEL
0660..0669 ; Common # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
@@ -136,7 +138,7 @@
2055..205E ; Common # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS
205F ; Common # Zs MEDIUM MATHEMATICAL SPACE
2060..2064 ; Common # Cf [5] WORD JOINER..INVISIBLE PLUS
-206A..206F ; Common # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES
+2066..206F ; Common # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
2070 ; Common # No SUPERSCRIPT ZERO
2074..2079 ; Common # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE
207A..207C ; Common # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN
@@ -146,7 +148,7 @@
208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN
208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS
208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS
-20A0..20BA ; Common # Sc [27] EURO-CURRENCY SIGN..TURKISH LIRA SIGN
+20A0..20BD ; Common # Sc [30] EURO-CURRENCY SIGN..RUBLE SIGN
2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT
2102 ; Common # L& DOUBLE-STRUCK CAPITAL C
2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA
@@ -200,7 +202,10 @@
21D5..21F3 ; Common # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW
21F4..22FF ; Common # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP
2300..2307 ; Common # So [8] DIAMETER SIGN..WAVY LINE
-2308..230B ; Common # Sm [4] LEFT CEILING..RIGHT FLOOR
+2308 ; Common # Ps LEFT CEILING
+2309 ; Common # Pe RIGHT CEILING
+230A ; Common # Ps LEFT FLOOR
+230B ; Common # Pe RIGHT FLOOR
230C..231F ; Common # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER
2320..2321 ; Common # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL
2322..2328 ; Common # So [7] FROWN..KEYBOARD
@@ -212,7 +217,7 @@
239B..23B3 ; Common # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM
23B4..23DB ; Common # So [40] TOP SQUARE BRACKET..FUSE
23DC..23E1 ; Common # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET
-23E2..23F3 ; Common # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND
+23E2..23FA ; Common # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD
2400..2426 ; Common # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO
2440..244A ; Common # So [11] OCR HOOK..OCR DOUBLE BACKSLASH
2460..249B ; Common # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP
@@ -226,8 +231,7 @@
25F8..25FF ; Common # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE
2600..266E ; Common # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN
266F ; Common # Sm MUSIC SHARP SIGN
-2670..26FF ; Common # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE
-2701..2767 ; Common # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET
+2670..2767 ; Common # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET
2768 ; Common # Ps MEDIUM LEFT PARENTHESIS ORNAMENT
2769 ; Common # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT
276A ; Common # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT
@@ -295,7 +299,11 @@
2B30..2B44 ; Common # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET
2B45..2B46 ; Common # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW
2B47..2B4C ; Common # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR
-2B50..2B59 ; Common # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE
+2B4D..2B73 ; Common # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR
+2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW
+2B98..2BB9 ; Common # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX
+2BBD..2BC8 ; Common # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED
+2BCA..2BD1 ; Common # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN
2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER
2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET
2E03 ; Common # Pf RIGHT SUBSTITUTION BRACKET
@@ -329,6 +337,10 @@
2E2F ; Common # Lm VERTICAL TILDE
2E30..2E39 ; Common # Po [10] RING POINT..TOP HALF SECTION SIGN
2E3A..2E3B ; Common # Pd [2] TWO-EM DASH..THREE-EM DASH
+2E3C..2E3F ; Common # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM
+2E40 ; Common # Pd DOUBLE HYPHEN
+2E41 ; Common # Po REVERSED COMMA
+2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK
2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
3000 ; Common # Zs IDEOGRAPHIC SPACE
3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK
@@ -392,9 +404,11 @@
A836..A837 ; Common # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
A838 ; Common # Sc NORTH INDIC RUPEE MARK
A839 ; Common # So NORTH INDIC QUANTITY MARK
-FD3E ; Common # Ps ORNATE LEFT PARENTHESIS
-FD3F ; Common # Pe ORNATE RIGHT PARENTHESIS
-FDFD ; Common # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
+A92E ; Common # Po KAYAH LI SIGN CWI
+A9CF ; Common # Lm JAVANESE PANGRANGKEP
+AB5B ; Common # Sk MODIFIER BREVE WITH INVERTED BREVE
+FD3E ; Common # Pe ORNATE LEFT PARENTHESIS
+FD3F ; Common # Ps ORNATE RIGHT PARENTHESIS
FE10..FE16 ; Common # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK
FE17 ; Common # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
FE18 ; Common # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
@@ -487,6 +501,8 @@
10137..1013F ; Common # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
10190..1019B ; Common # So [12] ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN
101D0..101FC ; Common # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND
+102E1..102FB ; Common # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
+1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO
1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2
1D129..1D164 ; Common # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
@@ -543,10 +559,10 @@
1F000..1F02B ; Common # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK
1F030..1F093 ; Common # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06
1F0A0..1F0AE ; Common # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES
-1F0B1..1F0BE ; Common # So [14] PLAYING CARD ACE OF HEARTS..PLAYING CARD KING OF HEARTS
+1F0B1..1F0BF ; Common # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER
1F0C1..1F0CF ; Common # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER
-1F0D1..1F0DF ; Common # So [15] PLAYING CARD ACE OF CLUBS..PLAYING CARD WHITE JOKER
-1F100..1F10A ; Common # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA
+1F0D1..1F0F5 ; Common # So [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21
+1F100..1F10C ; Common # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO
1F110..1F12E ; Common # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ
1F130..1F16B ; Common # So [60] SQUARED LATIN CAPITAL LETTER A..RAISED MD SIGN
1F170..1F19A ; Common # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS
@@ -555,28 +571,29 @@
1F210..1F23A ; Common # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6
1F240..1F248 ; Common # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
1F250..1F251 ; Common # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
-1F300..1F320 ; Common # So [33] CYCLONE..SHOOTING STAR
-1F330..1F335 ; Common # So [6] CHESTNUT..CACTUS
-1F337..1F37C ; Common # So [70] TULIP..BABY BOTTLE
-1F380..1F393 ; Common # So [20] RIBBON..GRADUATION CAP
-1F3A0..1F3C4 ; Common # So [37] CAROUSEL HORSE..SURFER
-1F3C6..1F3CA ; Common # So [5] TROPHY..SWIMMER
-1F3E0..1F3F0 ; Common # So [17] HOUSE BUILDING..EUROPEAN CASTLE
-1F400..1F43E ; Common # So [63] RAT..PAW PRINTS
-1F440 ; Common # So EYES
-1F442..1F4F7 ; Common # So [182] EAR..CAMERA
-1F4F9..1F4FC ; Common # So [4] VIDEO CAMERA..VIDEOCASSETTE
-1F500..1F53D ; Common # So [62] TWISTED RIGHTWARDS ARROWS..DOWN-POINTING SMALL RED TRIANGLE
-1F540..1F543 ; Common # So [4] CIRCLED CROSS POMMEE..NOTCHED LEFT SEMICIRCLE WITH THREE DOTS
-1F550..1F567 ; Common # So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY
-1F5FB..1F640 ; Common # So [70] MOUNT FUJI..WEARY CAT FACE
-1F645..1F64F ; Common # So [11] FACE WITH NO GOOD GESTURE..PERSON WITH FOLDED HANDS
-1F680..1F6C5 ; Common # So [70] ROCKET..LEFT LUGGAGE
+1F300..1F32C ; Common # So [45] CYCLONE..WIND BLOWING FACE
+1F330..1F37D ; Common # So [78] CHESTNUT..FORK AND KNIFE WITH PLATE
+1F380..1F3CE ; Common # So [79] RIBBON..RACING CAR
+1F3D4..1F3F7 ; Common # So [36] SNOW CAPPED MOUNTAIN..LABEL
+1F400..1F4FE ; Common # So [255] RAT..PORTABLE STEREO
+1F500..1F54A ; Common # So [75] TWISTED RIGHTWARDS ARROWS..DOVE OF PEACE
+1F550..1F579 ; Common # So [42] CLOCK FACE ONE OCLOCK..JOYSTICK
+1F57B..1F5A3 ; Common # So [41] LEFT HAND TELEPHONE RECEIVER..BLACK DOWN POINTING BACKHAND INDEX
+1F5A5..1F642 ; Common # So [158] DESKTOP COMPUTER..SLIGHTLY SMILING FACE
+1F645..1F6CF ; Common # So [139] FACE WITH NO GOOD GESTURE..BED
+1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING
+1F6F0..1F6F3 ; Common # So [4] SATELLITE..PASSENGER SHIP
1F700..1F773 ; Common # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
+1F780..1F7D4 ; Common # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
+1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
+1F810..1F847 ; Common # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW
+1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW
+1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW
+1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS
E0001 ; Common # Cf LANGUAGE TAG
E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG
-# Total code points: 6413
+# Total code points: 7129
# ================================================
@@ -618,16 +635,20 @@
A770 ; Latin # Lm MODIFIER LETTER US
A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T
A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT
-A790..A793 ; Latin # L& [4] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER C WITH BAR
-A7A0..A7AA ; Latin # L& [11] LATIN CAPITAL LETTER G WITH OBLIQUE STROKE..LATIN CAPITAL LETTER H WITH HOOK
+A790..A7AD ; Latin # L& [30] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER L WITH BELT
+A7B0..A7B1 ; Latin # L& [2] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER TURNED T
+A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I
A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
A7FA ; Latin # L& LATIN LETTER SMALL CAPITAL TURNED M
A7FB..A7FF ; Latin # Lo [5] LATIN EPIGRAPHIC LETTER REVERSED F..LATIN EPIGRAPHIC LETTER ARCHAIC M
+AB30..AB5A ; Latin # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
+AB5C..AB5F ; Latin # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
+AB64 ; Latin # L& LATIN SMALL LETTER INVERTED ALPHA
FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z
FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z
-# Total code points: 1272
+# Total code points: 1338
# ================================================
@@ -636,6 +657,7 @@
0376..0377 ; Greek # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
037A ; Greek # Lm GREEK YPOGEGRAMMENI
037B..037D ; Greek # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL
+037F ; Greek # L& GREEK CAPITAL LETTER YOT
0384 ; Greek # Sk GREEK TONOS
0386 ; Greek # L& GREEK CAPITAL LETTER ALPHA WITH TONOS
0388..038A ; Greek # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS
@@ -675,15 +697,18 @@
1FF6..1FFC ; Greek # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
1FFD..1FFE ; Greek # Sk [2] GREEK OXIA..GREEK DASIA
2126 ; Greek # L& OHM SIGN
+AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA
10140..10174 ; Greek # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS
10175..10178 ; Greek # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN
10179..10189 ; Greek # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN
-1018A ; Greek # No GREEK ZERO SIGN
+1018A..1018B ; Greek # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN
+1018C ; Greek # So GREEK SINUSOID SIGN
+101A0 ; Greek # So GREEK SYMBOL TAU RHO
1D200..1D241 ; Greek # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
1D242..1D244 ; Greek # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
1D245 ; Greek # So GREEK MUSICAL LEIMMA
-# Total code points: 511
+# Total code points: 516
# ================================================
@@ -692,7 +717,7 @@
0483..0484 ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC PALATALIZATION
0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE
0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
-048A..0527 ; Cyrillic # L& [158] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER SHHA WITH DESCENDER
+048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER
1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL
1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN
2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
@@ -704,10 +729,11 @@
A674..A67D ; Cyrillic # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK
A67E ; Cyrillic # Po CYRILLIC KAVYKA
A67F ; Cyrillic # Lm CYRILLIC PAYEROK
-A680..A697 ; Cyrillic # L& [24] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER SHWE
+A680..A69B ; Cyrillic # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O
+A69C..A69D ; Cyrillic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN
A69F ; Cyrillic # Mn COMBINING CYRILLIC LETTER IOTIFIED E
-# Total code points: 417
+# Total code points: 431
# ================================================
@@ -716,10 +742,11 @@
055A..055F ; Armenian # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK
0561..0587 ; Armenian # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN
058A ; Armenian # Pd ARMENIAN HYPHEN
+058D..058E ; Armenian # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN
058F ; Armenian # Sc ARMENIAN DRAM SIGN
FB13..FB17 ; Armenian # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
-# Total code points: 91
+# Total code points: 93
# ================================================
@@ -779,9 +806,8 @@
06FD..06FE ; Arabic # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN
06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V
0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE
-08A0 ; Arabic # Lo ARABIC LETTER BEH WITH SMALL V BELOW
-08A2..08AC ; Arabic # Lo [11] ARABIC LETTER JEEM WITH TWO DOTS ABOVE..ARABIC LETTER ROHINGYA YEH
-08E4..08FE ; Arabic # Mn [27] ARABIC CURLY FATHA..ARABIC DAMMA WITH DOT
+08A0..08B2 ; Arabic # Lo [19] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER ZAIN WITH INVERTED V ABOVE
+08E4..08FF ; Arabic # Mn [28] ARABIC CURLY FATHA..ARABIC MARK SIDEWAYS NOON GHUNNA
FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM
FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW
FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM
@@ -789,6 +815,7 @@
FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU
FDFC ; Arabic # Sc RIAL SIGN
+FDFD ; Arabic # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM
FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM
10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS
@@ -827,7 +854,7 @@
1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL
-# Total code points: 1235
+# Total code points: 1244
# ================================================
@@ -870,17 +897,17 @@
0966..096F ; Devanagari # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
0970 ; Devanagari # Po DEVANAGARI ABBREVIATION SIGN
0971 ; Devanagari # Lm DEVANAGARI SIGN HIGH SPACING DOT
-0972..0977 ; Devanagari # Lo [6] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER UUE
-0979..097F ; Devanagari # Lo [7] DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA
+0972..097F ; Devanagari # Lo [14] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER BBA
A8E0..A8F1 ; Devanagari # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA
A8F2..A8F7 ; Devanagari # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA
A8F8..A8FA ; Devanagari # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET
A8FB ; Devanagari # Lo DEVANAGARI HEADSTROKE
-# Total code points: 151
+# Total code points: 152
# ================================================
+0980 ; Bengali # Lo BENGALI ANJI
0981 ; Bengali # Mn BENGALI SIGN CANDRABINDU
0982..0983 ; Bengali # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA
0985..098C ; Bengali # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L
@@ -908,7 +935,7 @@
09FA ; Bengali # So BENGALI ISSHAR
09FB ; Bengali # Sc BENGALI GANDA MARK
-# Total code points: 92
+# Total code points: 93
# ================================================
@@ -1025,12 +1052,12 @@
# ================================================
+0C00 ; Telugu # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
0C01..0C03 ; Telugu # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
0C05..0C0C ; Telugu # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
0C0E..0C10 ; Telugu # Lo [3] TELUGU LETTER E..TELUGU LETTER AI
0C12..0C28 ; Telugu # Lo [23] TELUGU LETTER O..TELUGU LETTER NA
-0C2A..0C33 ; Telugu # Lo [10] TELUGU LETTER PA..TELUGU LETTER LLA
-0C35..0C39 ; Telugu # Lo [5] TELUGU LETTER VA..TELUGU LETTER HA
+0C2A..0C39 ; Telugu # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA
0C3D ; Telugu # Lo TELUGU SIGN AVAGRAHA
0C3E..0C40 ; Telugu # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
0C41..0C44 ; Telugu # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR
@@ -1044,10 +1071,11 @@
0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR
0C7F ; Telugu # So TELUGU SIGN TUUMU
-# Total code points: 93
+# Total code points: 95
# ================================================
+0C81 ; Kannada # Mn KANNADA SIGN CANDRABINDU
0C82..0C83 ; Kannada # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
0C85..0C8C ; Kannada # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L
0C8E..0C90 ; Kannada # Lo [3] KANNADA LETTER E..KANNADA LETTER AI
@@ -1070,10 +1098,11 @@
0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
-# Total code points: 86
+# Total code points: 87
# ================================================
+0D01 ; Malayalam # Mn MALAYALAM SIGN CANDRABINDU
0D02..0D03 ; Malayalam # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
0D05..0D0C ; Malayalam # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L
0D0E..0D10 ; Malayalam # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
@@ -1093,7 +1122,7 @@
0D79 ; Malayalam # So MALAYALAM DATE MARK
0D7A..0D7F ; Malayalam # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K
-# Total code points: 98
+# Total code points: 99
# ================================================
@@ -1108,10 +1137,12 @@
0DD2..0DD4 ; Sinhala # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
0DD6 ; Sinhala # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA
0DD8..0DDF ; Sinhala # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA
+0DE6..0DEF ; Sinhala # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE
0DF2..0DF3 ; Sinhala # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA
0DF4 ; Sinhala # Po SINHALA PUNCTUATION KUNDDALIYA
+111E1..111F4 ; Sinhala # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND
-# Total code points: 80
+# Total code points: 110
# ================================================
@@ -1234,14 +1265,23 @@
109A..109C ; Myanmar # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A
109D ; Myanmar # Mn MYANMAR VOWEL SIGN AITON AI
109E..109F ; Myanmar # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION
+A9E0..A9E4 ; Myanmar # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA
+A9E5 ; Myanmar # Mn MYANMAR SIGN SHAN SAW
+A9E6 ; Myanmar # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION
+A9E7..A9EF ; Myanmar # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA
+A9F0..A9F9 ; Myanmar # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE
+A9FA..A9FE ; Myanmar # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA
AA60..AA6F ; Myanmar # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA
AA70 ; Myanmar # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION
AA71..AA76 ; Myanmar # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM
AA77..AA79 ; Myanmar # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO
AA7A ; Myanmar # Lo MYANMAR LETTER AITON RA
AA7B ; Myanmar # Mc MYANMAR SIGN PAO KAREN TONE
+AA7C ; Myanmar # Mn MYANMAR SIGN TAI LAING TONE-2
+AA7D ; Myanmar # Mc MYANMAR SIGN TAI LAING TONE-5
+AA7E..AA7F ; Myanmar # Lo [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA
-# Total code points: 188
+# Total code points: 223
# ================================================
@@ -1345,8 +1385,9 @@
16A0..16EA ; Runic # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X
16EE..16F0 ; Runic # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL
+16F1..16F8 ; Runic # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC
-# Total code points: 78
+# Total code points: 86
# ================================================
@@ -1377,7 +1418,7 @@
1806 ; Mongolian # Pd MONGOLIAN TODO SOFT HYPHEN
1807..180A ; Mongolian # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU
180B..180D ; Mongolian # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
-180E ; Mongolian # Zs MONGOLIAN VOWEL SEPARATOR
+180E ; Mongolian # Cf MONGOLIAN VOWEL SEPARATOR
1810..1819 ; Mongolian # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE
1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI
1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN
@@ -1452,10 +1493,10 @@
# ================================================
-10300..1031E ; Old_Italic # Lo [31] OLD ITALIC LETTER A..OLD ITALIC LETTER UU
+10300..1031F ; Old_Italic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS
10320..10323 ; Old_Italic # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY
-# Total code points: 35
+# Total code points: 36
# ================================================
@@ -1479,12 +1520,15 @@
064B..0655 ; Inherited # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
0670 ; Inherited # Mn ARABIC LETTER SUPERSCRIPT ALEF
0951..0952 ; Inherited # Mn [2] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI STRESS SIGN ANUDATTA
+1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
+1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY
1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CED ; Inherited # Mn VEDIC SIGN TIRYAK
1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE
-1DC0..1DE6 ; Inherited # Mn [39] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER Z
+1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
+1DC0..1DF5 ; Inherited # Mn [54] COMBINING DOTTED GRAVE ACCENT..COMBINING UP TACK ABOVE
1DFC..1DFF ; Inherited # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
@@ -1495,15 +1539,16 @@
302A..302D ; Inherited # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
3099..309A ; Inherited # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
FE00..FE0F ; Inherited # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
-FE20..FE26 ; Inherited # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON
+FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW
101FD ; Inherited # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
+102E0 ; Inherited # Mn COPTIC EPACT THOUSANDS MARK
1D167..1D169 ; Inherited # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
1D17B..1D182 ; Inherited # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
1D185..1D18B ; Inherited # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
-# Total code points: 523
+# Total code points: 563
# ================================================
@@ -1537,7 +1582,7 @@
# ================================================
-1900..191C ; Limbu # Lo [29] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER HA
+1900..191E ; Limbu # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA
1920..1922 ; Limbu # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
1923..1926 ; Limbu # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU
1927..1928 ; Limbu # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
@@ -1550,7 +1595,7 @@
1944..1945 ; Limbu # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK
1946..194F ; Limbu # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE
-# Total code points: 66
+# Total code points: 68
# ================================================
@@ -1612,7 +1657,8 @@
1A00..1A16 ; Buginese # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA
1A17..1A18 ; Buginese # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
-1A19..1A1B ; Buginese # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE
+1A19..1A1A ; Buginese # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O
+1A1B ; Buginese # Mn BUGINESE VOWEL SIGN AE
1A1E..1A1F ; Buginese # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION
# Total code points: 30
@@ -1724,11 +1770,11 @@
# ================================================
-12000..1236E ; Cuneiform # Lo [879] CUNEIFORM SIGN A..CUNEIFORM SIGN ZUM
-12400..12462 ; Cuneiform # Nl [99] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER
-12470..12473 ; Cuneiform # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON
+12000..12398 ; Cuneiform # Lo [921] CUNEIFORM SIGN A..CUNEIFORM SIGN UM TIMES ME
+12400..1246E ; Cuneiform # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
+12470..12474 ; Cuneiform # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON
-# Total code points: 982
+# Total code points: 1037
# ================================================
@@ -1767,8 +1813,7 @@
1BA6..1BA7 ; Sundanese # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG
1BA8..1BA9 ; Sundanese # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
1BAA ; Sundanese # Mc SUNDANESE SIGN PAMAAEH
-1BAB ; Sundanese # Mn SUNDANESE SIGN VIRAMA
-1BAC..1BAD ; Sundanese # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA
+1BAB..1BAD ; Sundanese # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA
1BAE..1BAF ; Sundanese # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA
1BB0..1BB9 ; Sundanese # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE
1BBA..1BBF ; Sundanese # Lo [6] SUNDANESE AVAGRAHA..SUNDANESE LETTER FINAL M
@@ -1825,9 +1870,9 @@
A900..A909 ; Kayah_Li # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE
A90A..A925 ; Kayah_Li # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO
A926..A92D ; Kayah_Li # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU
-A92E..A92F ; Kayah_Li # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA
+A92F ; Kayah_Li # Po KAYAH LI SIGN SHYA
-# Total code points: 48
+# Total code points: 47
# ================================================
@@ -1974,11 +2019,10 @@
A9BC ; Javanese # Mn JAVANESE VOWEL SIGN PEPET
A9BD..A9C0 ; Javanese # Mc [4] JAVANESE CONSONANT SIGN KERET..JAVANESE PANGKON
A9C1..A9CD ; Javanese # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH
-A9CF ; Javanese # Lm JAVANESE PANGRANGKEP
A9D0..A9D9 ; Javanese # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE
A9DE..A9DF ; Javanese # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN
-# Total code points: 91
+# Total code points: 90
# ================================================
@@ -2080,8 +2124,9 @@
11047..1104D ; Brahmi # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS
11052..11065 ; Brahmi # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND
11066..1106F ; Brahmi # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE
+1107F ; Brahmi # Mn BRAHMI NUMBER JOINER
-# Total code points: 108
+# Total code points: 109
# ================================================
@@ -2136,9 +2181,11 @@
111BF..111C0 ; Sharada # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA
111C1..111C4 ; Sharada # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM
111C5..111C8 ; Sharada # Po [4] SHARADA DANDA..SHARADA SEPARATOR
+111CD ; Sharada # Po SHARADA SUTRA MARK
111D0..111D9 ; Sharada # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE
+111DA ; Sharada # Lo SHARADA EKAM
-# Total code points: 83
+# Total code points: 85
# ================================================
@@ -2161,4 +2208,244 @@
# Total code points: 66
+# ================================================
+
+10530..10563 ; Caucasian_Albanian # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW
+1056F ; Caucasian_Albanian # Po CAUCASIAN ALBANIAN CITATION MARK
+
+# Total code points: 53
+
+# ================================================
+
+16AD0..16AED ; Bassa_Vah # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I
+16AF0..16AF4 ; Bassa_Vah # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
+16AF5 ; Bassa_Vah # Po BASSA VAH FULL STOP
+
+# Total code points: 36
+
+# ================================================
+
+1BC00..1BC6A ; Duployan # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
+1BC70..1BC7C ; Duployan # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK
+1BC80..1BC88 ; Duployan # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL
+1BC90..1BC99 ; Duployan # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW
+1BC9C ; Duployan # So DUPLOYAN SIGN O WITH CROSS
+1BC9D..1BC9E ; Duployan # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK
+1BC9F ; Duployan # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP
+
+# Total code points: 143
+
+# ================================================
+
+10500..10527 ; Elbasan # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE
+
+# Total code points: 40
+
+# ================================================
+
+11301 ; Grantha # Mn GRANTHA SIGN CANDRABINDU
+11302..11303 ; Grantha # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA
+11305..1130C ; Grantha # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L
+1130F..11310 ; Grantha # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI
+11313..11328 ; Grantha # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA
+1132A..11330 ; Grantha # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA
+11332..11333 ; Grantha # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA
+11335..11339 ; Grantha # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA
+1133C ; Grantha # Mn GRANTHA SIGN NUKTA
+1133D ; Grantha # Lo GRANTHA SIGN AVAGRAHA
+1133E..1133F ; Grantha # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I
+11340 ; Grantha # Mn GRANTHA VOWEL SIGN II
+11341..11344 ; Grantha # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR
+11347..11348 ; Grantha # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI
+1134B..1134D ; Grantha # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA
+11357 ; Grantha # Mc GRANTHA AU LENGTH MARK
+1135D..11361 ; Grantha # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL
+11362..11363 ; Grantha # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL
+11366..1136C ; Grantha # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX
+11370..11374 ; Grantha # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA
+
+# Total code points: 83
+
+# ================================================
+
+16B00..16B2F ; Pahawh_Hmong # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU
+16B30..16B36 ; Pahawh_Hmong # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
+16B37..16B3B ; Pahawh_Hmong # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM
+16B3C..16B3F ; Pahawh_Hmong # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB
+16B40..16B43 ; Pahawh_Hmong # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
+16B44 ; Pahawh_Hmong # Po PAHAWH HMONG SIGN XAUS
+16B45 ; Pahawh_Hmong # So PAHAWH HMONG SIGN CIM TSOV ROG
+16B50..16B59 ; Pahawh_Hmong # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
+16B5B..16B61 ; Pahawh_Hmong # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS
+16B63..16B77 ; Pahawh_Hmong # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS
+16B7D..16B8F ; Pahawh_Hmong # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ
+
+# Total code points: 127
+
+# ================================================
+
+11200..11211 ; Khojki # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA
+11213..1122B ; Khojki # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
+1122C..1122E ; Khojki # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II
+1122F..11231 ; Khojki # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI
+11232..11233 ; Khojki # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
+11234 ; Khojki # Mn KHOJKI SIGN ANUSVARA
+11235 ; Khojki # Mc KHOJKI SIGN VIRAMA
+11236..11237 ; Khojki # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
+11238..1123D ; Khojki # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
+
+# Total code points: 61
+
+# ================================================
+
+10600..10736 ; Linear_A # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664
+10740..10755 ; Linear_A # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE
+10760..10767 ; Linear_A # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807
+
+# Total code points: 341
+
+# ================================================
+
+11150..11172 ; Mahajani # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA
+11173 ; Mahajani # Mn MAHAJANI SIGN NUKTA
+11174..11175 ; Mahajani # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK
+11176 ; Mahajani # Lo MAHAJANI LIGATURE SHRI
+
+# Total code points: 39
+
+# ================================================
+
+10AC0..10AC7 ; Manichaean # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW
+10AC8 ; Manichaean # So MANICHAEAN SIGN UD
+10AC9..10AE4 ; Manichaean # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW
+10AE5..10AE6 ; Manichaean # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
+10AEB..10AEF ; Manichaean # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED
+10AF0..10AF6 ; Manichaean # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER
+
+# Total code points: 51
+
+# ================================================
+
+1E800..1E8C4 ; Mende_Kikakui # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON
+1E8C7..1E8CF ; Mende_Kikakui # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE
+1E8D0..1E8D6 ; Mende_Kikakui # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
+
+# Total code points: 213
+
+# ================================================
+
+11600..1162F ; Modi # Lo [48] MODI LETTER A..MODI LETTER LLA
+11630..11632 ; Modi # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II
+11633..1163A ; Modi # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI
+1163B..1163C ; Modi # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU
+1163D ; Modi # Mn MODI SIGN ANUSVARA
+1163E ; Modi # Mc MODI SIGN VISARGA
+1163F..11640 ; Modi # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA
+11641..11643 ; Modi # Po [3] MODI DANDA..MODI ABBREVIATION SIGN
+11644 ; Modi # Lo MODI SIGN HUVA
+11650..11659 ; Modi # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE
+
+# Total code points: 79
+
+# ================================================
+
+16A40..16A5E ; Mro # Lo [31] MRO LETTER TA..MRO LETTER TEK
+16A60..16A69 ; Mro # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE
+16A6E..16A6F ; Mro # Po [2] MRO DANDA..MRO DOUBLE DANDA
+
+# Total code points: 43
+
+# ================================================
+
+10A80..10A9C ; Old_North_Arabian # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH
+10A9D..10A9F ; Old_North_Arabian # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY
+
+# Total code points: 32
+
+# ================================================
+
+10880..1089E ; Nabataean # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW
+108A7..108AF ; Nabataean # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED
+
+# Total code points: 40
+
+# ================================================
+
+10860..10876 ; Palmyrene # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW
+10877..10878 ; Palmyrene # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON
+10879..1087F ; Palmyrene # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY
+
+# Total code points: 32
+
+# ================================================
+
+11AC0..11AF8 ; Pau_Cin_Hau # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
+
+# Total code points: 57
+
+# ================================================
+
+10350..10375 ; Old_Permic # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA
+10376..1037A ; Old_Permic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
+
+# Total code points: 43
+
+# ================================================
+
+10B80..10B91 ; Psalter_Pahlavi # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW
+10B99..10B9C ; Psalter_Pahlavi # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT
+10BA9..10BAF ; Psalter_Pahlavi # No [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED
+
+# Total code points: 29
+
+# ================================================
+
+11580..115AE ; Siddham # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA
+115AF..115B1 ; Siddham # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II
+115B2..115B5 ; Siddham # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR
+115B8..115BB ; Siddham # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU
+115BC..115BD ; Siddham # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA
+115BE ; Siddham # Mc SIDDHAM SIGN VISARGA
+115BF..115C0 ; Siddham # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA
+115C1..115C9 ; Siddham # Po [9] SIDDHAM SIGN SIDDHAM..SIDDHAM END OF TEXT MARK
+
+# Total code points: 72
+
+# ================================================
+
+112B0..112DE ; Khudawadi # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA
+112DF ; Khudawadi # Mn KHUDAWADI SIGN ANUSVARA
+112E0..112E2 ; Khudawadi # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
+112E3..112EA ; Khudawadi # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
+112F0..112F9 ; Khudawadi # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE
+
+# Total code points: 69
+
+# ================================================
+
+11480..114AF ; Tirhuta # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA
+114B0..114B2 ; Tirhuta # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II
+114B3..114B8 ; Tirhuta # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL
+114B9 ; Tirhuta # Mc TIRHUTA VOWEL SIGN E
+114BA ; Tirhuta # Mn TIRHUTA VOWEL SIGN SHORT E
+114BB..114BE ; Tirhuta # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU
+114BF..114C0 ; Tirhuta # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA
+114C1 ; Tirhuta # Mc TIRHUTA SIGN VISARGA
+114C2..114C3 ; Tirhuta # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA
+114C4..114C5 ; Tirhuta # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG
+114C6 ; Tirhuta # Po TIRHUTA ABBREVIATION SIGN
+114C7 ; Tirhuta # Lo TIRHUTA OM
+114D0..114D9 ; Tirhuta # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE
+
+# Total code points: 82
+
+# ================================================
+
+118A0..118DF ; Warang_Citi # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO
+118E0..118E9 ; Warang_Citi # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE
+118EA..118F2 ; Warang_Citi # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY
+118FF ; Warang_Citi # Lo WARANG CITI OM
+
+# Total code points: 84
+
# EOF
--- old/jdk/test/java/text/Bidi/BidiConformance.java 2015-07-13 16:12:05.000000000 +0900
+++ new/jdk/test/java/text/Bidi/BidiConformance.java 2015-07-13 16:12:05.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -23,7 +23,7 @@
/*
* @test
- * @bug 6850113
+ * @bug 6850113 8032446
* @summary confirm the behavior of new Bidi implementation. (Backward compatibility)
*/
@@ -40,6 +40,8 @@
private static boolean verbose = false;
private static boolean abort = false;
+ private static final byte MAX_EXPLICIT_LEVEL = 125;
+
public static void main(String[] args) {
for (int i = 0; i < args.length; i++) {
String arg = args[i];
@@ -368,15 +370,15 @@
AttributedString astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-61),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-MAX_EXPLICIT_LEVEL),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
- if (bidi.getLevelAt(i) != 61) {
+ if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" +
i + ") should not be " + bidi.getLevelAt(i) +
- " but 60 when BIDI_EMBEDDING is -61.");
+ " but MAX_EXPLICIT_LEVEL-1 when BIDI_EMBEDDING is -MAX_EXPLICIT_LEVEL.");
}
}
}
@@ -387,14 +389,14 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-62),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-(MAX_EXPLICIT_LEVEL+1)),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
if (bidi.getLevelAt(i) != 1) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt() " +
- "should be 1 when BIDI_EMBEDDING is -62.");
+ "should be 1 when BIDI_EMBEDDING is -(MAX_EXPLICIT_LEVEL+1).");
}
}
}
@@ -405,14 +407,14 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(60),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL-1),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
- if (bidi.getLevelAt(i) != 61) {
+ if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt() " +
- "should be 61 when BIDI_EMBEDDING is 60.");
+ "should be MAX_EXPLICIT_LEVEL when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL-1.");
}
}
}
@@ -423,15 +425,15 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(61),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
- if (bidi.getLevelAt(i) != 61) {
+ if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) {
errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" +
i + ") should not be " + bidi.getLevelAt(i) +
- " but 61 when BIDI_EMBEDDING is 61.");
+ " but MAX_EXPLICIT_LEVEL when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL.");
}
}
}
@@ -442,15 +444,15 @@
astr = new AttributedString(paragraph);
astr.addAttribute(TextAttribute.RUN_DIRECTION,
TextAttribute.RUN_DIRECTION_RTL);
- astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(62),
+ astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL+1),
start, limit);
try {
bidi = new Bidi(astr.getIterator());
for (int i = start; i < limit; i++) {
if (bidi.getLevelAt(i) != 1) {
- errorHandling("Bidi(AttributedCharacterIterator).getLevelAt()" +
- " should not be " + bidi.getLevelAt(i) +
- " but 1 when BIDI_EMBEDDING is 62.");
+ errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" +
+ i + ") should not be " + bidi.getLevelAt(i) +
+ " but 1 when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL+1.");
}
}
}
@@ -536,8 +538,8 @@
}
byte[] actualLevels = new byte[text.length];
- byte[] validEmbeddings1 = {0, -61, -60, -2, -1};
- byte[] expectedLevels1 = {0, 61, 60, 2, 1};
+ byte[] validEmbeddings1 = {0, -MAX_EXPLICIT_LEVEL, -(MAX_EXPLICIT_LEVEL-1), -2, -1};
+ byte[] expectedLevels1 = {0, MAX_EXPLICIT_LEVEL, MAX_EXPLICIT_LEVEL-1, 2, 1};
try {
bidi = new Bidi(text, 0, validEmbeddings1, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
@@ -553,11 +555,11 @@
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "when embeddings is valid(-61).");
+ "when embeddings is valid(-MAX_EXPLICIT_LEVEL).");
}
- byte[] validEmbeddings2 = {0, 61, 60, 2, 1};
- byte[] expectedLevels2 = {0, 62, 60, 2, 2};
+ byte[] validEmbeddings2 = {0, MAX_EXPLICIT_LEVEL, MAX_EXPLICIT_LEVEL-1, 2, 1};
+ byte[] expectedLevels2 = {0, MAX_EXPLICIT_LEVEL+1, MAX_EXPLICIT_LEVEL-1, 2, 2};
try {
bidi = new Bidi(text, 0, validEmbeddings2, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
@@ -573,35 +575,35 @@
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "when embeddings is valid(61).");
+ "when embeddings is valid(MAX_EXPLICIT_LEVEL).");
}
- byte[] invalidEmbeddings1 = {0, -62, 0, 0, 0};
+ byte[] invalidEmbeddings1 = {0, -(MAX_EXPLICIT_LEVEL+1), 0, 0, 0};
try {
bidi = new Bidi(text, 0, invalidEmbeddings1, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
if (bidi.getLevelAt(1) != 0) {
errorHandling("Bidi(char[], ...).getLevelAt(1) should be 0 " +
- "when embeddings[1] is -62.");
+ "when embeddings[1] is -(MAX_EXPLICIT_LEVEL+1).");
}
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "even when embeddings includes -62.");
+ "even when embeddings includes -(MAX_EXPLICIT_LEVEL+1).");
}
- byte[] invalidEmbeddings2 = {0, 62, 0, 0, 0};
+ byte[] invalidEmbeddings2 = {0, MAX_EXPLICIT_LEVEL+1, 0, 0, 0};
try {
bidi = new Bidi(text, 0, invalidEmbeddings2, 0, 5,
Bidi.DIRECTION_LEFT_TO_RIGHT);
if (bidi.getLevelAt(1) != 0) {
errorHandling("Bidi(char[], ...).getLevelAt(1) should be 0 " +
- "when embeddings[1] is 62.");
+ "when embeddings[1] is MAX_EXPLICIT_LEVEL+1.");
}
}
catch (Exception e) {
errorHandling("Bidi(char[], ...) should not throw an exception " +
- "even when embeddings includes 62.");
+ "even when embeddings includes MAX_EXPLICIT_LEVEL+1.");
}
try {
@@ -1595,6 +1597,10 @@
private static final char PDF = '\u202C';
private static final char LRO = '\u202D';
private static final char RLO = '\u202E';
+ private static final char LRI = '\u2066';
+ private static final char RLI = '\u2067';
+ private static final char FSI = '\u2068';
+ private static final char PDI = '\u2069';
/*
* 0x05D0-0x05EA: [R] Hewbrew letters (Strong)
@@ -2002,8 +2008,8 @@
/* For Text #18 */
{" ABC (" + ArabicABC + " " + Arabic123 + ") 123.",
- "0000001111222112220", "0000001111222112220",
- "0000001111222112220", "1222111111222112221"},
+ "0000001111222002220", "0000001111222002220",
+ "0000001111222002220", "1222111111222112221"},
/* For Text #19 */
{" " + HebrewABC + " (ABC 123) " + NKo123 + ".",
@@ -2028,6 +2034,90 @@
PDF,
"22222221111111111111110", "22222221111111111111110",
"22222221111111111111110", "44444443333333333333331"},
+
+ /* For Text #23 */
+ {" ABC (" + Arabic123 + " " + ArabicABC + ") 123.",
+ "0000002221111002220", "0000002221111002220",
+ "0000002221111002220", "1222112221111112221"},
+
+ /* For Text #24 */
+ {" 123 (" + ArabicABC + " " + Arabic123 + ") ABC.",
+ "1222111111222112221", "1222111111222112221",
+ "0000001111222000000", "1222111111222112221"},
+
+ /* For Text #25 */
+ {" 123 (" + Arabic123 + " " + ArabicABC + ") ABC.",
+ "1222112221111112221", "1222112221111112221",
+ "0000002221111000000", "1222112221111112221"},
+
+ /* For Text #26 */
+ {" " + ArabicABC + " (ABC 123) " + Arabic123 + ".",
+ "1111112222222112221", "1111112222222112221",
+ "0111000000000002220", "1111112222222112221"},
+
+ /* For Text #27 */
+ {" " + ArabicABC + " (123 ABC) " + Arabic123 + ".",
+ "1111112221222112221", "1111112221222112221",
+ "0111002220000002220", "1111112221222112221"},
+
+ /* For Text #28 */
+ {" " + Arabic123 + " (ABC 123) " + ArabicABC + ".",
+ "0222000000000001110", "0222000000000001110",
+ "0222000000000001110", "1222112222222111111"},
+
+ /* For Text #29 */
+ {" " + Arabic123 + " (123 ABC) " + ArabicABC + ".",
+ "0222000000000001110", "0222000000000001110",
+ "0222000000000001110", "1222112221222111111"},
+
+ /* For Text #30 */
+ {RLI + "ABC " + ArabicABC + " " + ArabicABC + "." + PDI,
+ "02221111111110", "14443333333331",
+ "02221111111110", "14443333333331"},
+
+ /* For Text #31 */
+ {"ABC abc \"" + RLI + "IJK " + ArabicABC + " " + ArabicABC + PDI +
+ ".\" \"" + RLI + ArabicABC + " " + ArabicABC + PDI + ",\" xyz XYZ.",
+ "0000000000222111111110000001111111000000000000",
+ "0000000000222111111110000001111111000000000000",
+ "0000000000222111111110000001111111000000000000",
+ "2222222222444333333332222223333333222222222221"},
+
+ /* For Text #32 */
+ {ArabicABC + " " + ArabicABC + " '" + LRI + "abc def \"" + RLI +
+ "xyz " + ArabicABC + " " + ArabicABC + PDI + "\"" + PDI + "'?",
+ "111111111122222222224443333333322111",
+ "111111111122222222224443333333322111",
+ "111111100022222222224443333333322000",
+ "111111111122222222224443333333322111"},
+
+ /* For Text #33 */
+ {FSI + Arabic123 + " ABC " + ArabicABC + " " + ArabicABC + "." + PDI,
+ "044422222333333320", "144422222333333321",
+ "044422222333333320", "144422222333333321"},
+
+ /* For Text #34 */
+ {FSI + "123 ABC " + ArabicABC + " " + ArabicABC + "." + PDI,
+ "022222222333333320", "122222222333333321",
+ "022222222333333320", "122222222333333321"},
+
+ /* For Text #35 */
+ {FSI + "123 " + ArabicABC + " ABC " + ArabicABC + "." + PDI,
+ "022211111222111110", "144433333444333331",
+ "022211111222111110", "144433333444333331"},
+
+ /* For Text #36 */
+ {FSI + Arabic123 + " " + ArabicABC + " ABC " + ArabicABC + "." + PDI,
+ "022211111222111110", "144433333444333331",
+ "022211111222111110", "144433333444333331"},
+
+ /* For Text #37 */
+ {FSI + Arabic123 + " 123." + PDI,
+ "0444222220", "1444222221", "0444222220", "1444222221"},
+
+ /* For Text #38 */
+ {FSI + "123 " + Arabic123 + "." + PDI,
+ "0222244420", "1222244421", "0222244420", "1222244421"},
};
/* Golden data for baseIsLeftToRight() results */
@@ -2060,10 +2150,32 @@
{true, true, true, false},
{false, false, true, false},
- /* For Text #20 - $22 */
+ /* For Text #20 - $24 */
+ {true, true, true, false},
+ {true, true, true, false},
{true, true, true, false},
{true, true, true, false},
+ {false, false, true, false},
+
+ /* For Text #25 - $29 */
+ {false, false, true, false},
+ {false, false, true, false},
+ {false, false, true, false},
{true, true, true, false},
+ {true, true, true, false},
+
+ /* For Text #30 - $34 */
+ {true, false, true, false},
+ {true, true, true, false},
+ {false, false, true, false},
+ {true, false, true, false},
+ {true , false, true, false},
+
+ /* For Text #35 - $38 */
+ {true, false, true, false},
+ {true, false, true, false},
+ {true, false, true, false},
+ {true, false, true, false},
};
/* Golden data for isLeftToRight() & isRightToLeft() results */
@@ -2097,7 +2209,29 @@
{{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
- /* For Text #20 - $22 */
+ /* For Text #20 - $24 */
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+
+ /* For Text #25 - $29 */
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+
+ /* For Text #30 - $34 */
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+ {{false, false, false, false}, {false, false, false, false}},
+
+ /* For Text #35 - $38 */
+ {{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
{{false, false, false, false}, {false, false, false, false}},
@@ -2113,8 +2247,13 @@
true, true, true, true, true,
true, true, true, true, true,
- /* For Text #20 - $22 */
- true, true, true,
+ /* For Text #20 - $29 */
+ true, true, true, true, true,
+ true, true, true, true, true,
+
+ /* For Text #30 - $38 */
+ true, true, true, true, true,
+ true, true, true, true,
};
/* --------------------------------------------------------------------- */
--- old/jdk/test/sun/net/idn/NFS4StringPrep.java 2015-07-13 16:12:06.000000000 +0900
+++ new/jdk/test/sun/net/idn/NFS4StringPrep.java 2015-07-13 16:12:06.000000000 +0900
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -32,7 +32,6 @@
import java.io.UnsupportedEncodingException;
import java.text.ParseException;
-import sun.text.normalizer.ICUData;
import sun.net.idn.StringPrep;
import sun.text.normalizer.UCharacterIterator;
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/ICUData.java 2015-07-13 16:12:07.000000000 +0900
+++ /dev/null 2015-07-13 16:12:07.000000000 +0900
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.io.InputStream;
-import java.net.URL;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-import java.util.MissingResourceException;
-
-/**
- * Provides access to ICU data files as InputStreams. Implements security checking.
- */
-public final class ICUData {
-
- private static InputStream getStream(final Class
- *
- *
- *
- * RangeValueIterator iterator = UCharacter.getTypeIterator();
- * RangeValueIterator.Element result = new RangeValueIterator.Element();
- * while (iterator.next(result)) {
- * System.out.println("Codepoint \\u" +
- * Integer.toHexString(result.start) +
- * " to codepoint \\u" +
- * Integer.toHexString(result.limit - 1) +
- * " has the character type " + result.value);
- * }
- *
- * @author synwee
- * @stable ICU 2.6
- */
-public interface RangeValueIterator
-{
- // public inner class ---------------------------------------------
-
- /**
- * Return result wrapper for com.ibm.icu.util.RangeValueIterator.
- * Stores the start and limit of the continous result range and the
- * common value all integers between [start, limit - 1] has.
- * @stable ICU 2.6
- */
- public class Element
- {
- // public data member ---------------------------------------------
-
- /**
- * Starting integer of the continuous result range that has the same
- * value
- * @stable ICU 2.6
- */
- public int start;
- /**
- * (End + 1) integer of continuous result range that has the same
- * value
- * @stable ICU 2.6
- */
- public int limit;
- /**
- * Gets the common value of the continous result range
- * @stable ICU 2.6
- */
- public int value;
-
- // public constructor --------------------------------------------
-
- /**
- * Empty default constructor to make javadoc happy
- * @stable ICU 2.4
- */
- public Element()
- {
- }
- }
-
- // public methods -------------------------------------------------
-
- /**
- *
- * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
- * sense, the caller will have to pass a object with a callback function
- * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
- * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
- * codepoints with the same value as determined by
- * UTrieEnumValue(const void *context, uint32_t value). for each range,
- * utrie_enum calls the callback function to perform a task. In this way,
- * icu4c performs the iteration within utrie_enum.
- * To follow the JDK model, icu4j is slightly different from icu4c.
- * Instead of requesting the caller to implement an object for a callback.
- * The caller will have to implement a subclass of TrieIterator, fleshing out
- * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
- * the caller will have to code his own iteration and flesh out the task
- * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
- *
- *
- * {@code utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
- * set);}
- * In Java:
- *
- * class TrieIteratorImpl extends TrieIterator{
- * public TrieIteratorImpl(Trie data){
- * super(data);
- * }
- * public int extract(int value){
- * // port the implementation of _enumPropertyStartsValue here
- * }
- * }
- * ....
- * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
- * while(fcdIter.next(result)) {
- * // port the implementation of _enumPropertyStartsRange
- * }
- *
- *
- * {@code
- * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
- * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie);
- * while(fcdIter.next(result)){
- * set.add(result.start);
- * }
- * }
- *
- * @author synwee
- * @see com.ibm.icu.impl.Trie
- * @see com.ibm.icu.lang.UCharacterTypeIterator
- * @since release 2.1, Jan 17 2002
- */
-public class TrieIterator implements RangeValueIterator
-{
-
- // public constructor ---------------------------------------------
-
- /**
- * TrieEnumeration constructor
- * @param trie to be used
- * @exception IllegalArgumentException throw when argument is null.
- */
- public TrieIterator(Trie trie)
- {
- if (trie == null) {
- throw new IllegalArgumentException(
- "Argument trie cannot be null");
- }
- m_trie_ = trie;
- // synwee: check that extract belongs to the child class
- m_initialValue_ = extract(m_trie_.getInitialValue());
- reset();
- }
-
- // public methods -------------------------------------------------
-
- /**
- * UnicodeMatcher
defines a protocol for objects that can
- * match a range of characters in a Replaceable string.
- * @stable ICU 2.0
- */
-public interface UnicodeMatcher {
-
- /**
- * The character at index {@code i}, where
- * {@code i < contextStart || i >= contextLimit},
- * is ETHER. This allows explicit matching by rules and UnicodeSets
- * of text outside the context. In traditional terms, this allows anchoring
- * at the start and/or end.
- * @stable ICU 2.0
- */
- static final char ETHER = '\uFFFF';
-
-}
-
-//eof
--- old/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSetIterator.java 2015-07-13 16:12:11.000000000 +0900
+++ /dev/null 2015-07-13 16:12:11.000000000 +0900
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation. Oracle designates this
- * particular file as subject to the "Classpath" exception as provided
- * by Oracle in the LICENSE file that accompanied this code.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- */
-/*
- *******************************************************************************
- * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
- * *
- * The original version of this source code and documentation is copyrighted *
- * and owned by IBM, These materials are provided under terms of a License *
- * Agreement between IBM and Sun. This technology is protected by multiple *
- * US and International patents. This notice and attribution to IBM may not *
- * to removed. *
- *******************************************************************************
- */
-
-package sun.text.normalizer;
-
-import java.util.Iterator;
-
-/**
- * UnicodeSetIterator iterates over the contents of a UnicodeSet. It
- * iterates over either code points or code point ranges. After all
- * code points or ranges have been returned, it returns the
- * multicharacter strings of the UnicodSet, if any.
- *
- *
- * UnicodeSetIterator it(set);
- * while (set.next()) {
- * if (set.codepoint != UnicodeSetIterator::IS_STRING) {
- * processCodepoint(set.codepoint);
- * } else {
- * processString(set.string);
- * }
- * }
- *
- *
- *
- * UnicodeSetIterator it(set);
- * while (set.nextRange()) {
- * if (set.codepoint != UnicodeSetIterator::IS_STRING) {
- * processCodepointRange(set.codepoint, set.codepointEnd);
- * } else {
- * processString(set.string);
- * }
- * }
- *
- * @author M. Davis
- * @stable ICU 2.0
- */
-public class UnicodeSetIterator {
-
- /**
- * Value of {@code codepoint} if the iterator points to a string.
- * If {@code codepoint == IS_STRING}, then examine
- * {@code string} for the current iteration result.
- * @stable ICU 2.0
- */
- public static int IS_STRING = -1;
-
- /**
- * Current code point, or the special value {@code IS_STRING}, if
- * the iterator points to a string.
- * @stable ICU 2.0
- */
- public int codepoint;
-
- /**
- * When iterating over ranges using {@code nextRange()},
- * {@code codepointEnd} contains the inclusive end of the
- * iteration range, if {@code codepoint != IS_STRING}. If
- * iterating over code points using {@code next()}, or if
- * {@code codepoint == IS_STRING}, then the value of
- * {@code codepointEnd} is undefined.
- * @stable ICU 2.0
- */
- public int codepointEnd;
-
- /**
- * If {@code codepoint == IS_STRING}, then {@code string} points
- * to the current string. If {@code codepoint != IS_STRING}, the
- * value of {@code string} is undefined.
- * @stable ICU 2.0
- */
- public String string;
-
- /**
- * Create an iterator over the given set.
- * @param set set to iterate over
- * @stable ICU 2.0
- */
- public UnicodeSetIterator(UnicodeSet set) {
- reset(set);
- }
-
- /**
- * Returns the next element in the set, either a code point range
- * or a string. If there are no more elements in the set, return
- * false. If {@code codepoint == IS_STRING}, the value is a
- * string in the {@code string} field. Otherwise the value is a
- * range of one or more code points from {@code codepoint} to
- * {@code codepointeEnd} inclusive.
- *
- * end=spanQuickCheckYes(s);
+ * the substring s.subSequence(0, end)
+ * will pass the quick check with a "yes" result.
+ * Like <code>Output&lt;Integer&gt;</code> but without auto-boxing.
+ *
+ * @internal but could become public
+ * deprecated This API is ICU internal only.
+ */
+class OutputInt {
+
+ /**
+ * The int value held by this object; a plain public field with no accessors.
+ *
+ * @internal
+ * deprecated This API is ICU internal only.
+ */
+ public int value;
+}
--- /dev/null 2015-07-13 16:12:16.000000000 +0900
+++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Trie2.java 2015-07-13 16:12:15.000000000 +0900
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ *******************************************************************************
+ * Copyright (C) 2009-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+
+/**
+ * This is the interface and common implementation of a Unicode Trie2.
+ * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff)
+ * to 16- or 32-bit integer values. It works best when there are ranges of
+ * characters with the same value, which is generally the case with Unicode
+ * character properties.
+ *
+ * This is the second common version of a Unicode trie (hence the name Trie2).
+ *
+ */
+abstract class Trie2 implements Iterable