--- old/jdk/make/data/characterdata/CharacterData00.java.template 2015-07-13 16:11:32.000000000 +0900 +++ new/jdk/make/data/characterdata/CharacterData00.java.template 2015-07-13 16:11:31.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -226,6 +226,11 @@ case 0xA77D : mapChar = 0x1D79; break; case 0xA78D : mapChar = 0x0265; break; case 0xA7AA : mapChar = 0x0266; break; + case 0xA7AB : mapChar = 0x025C; break; + case 0xA7AC : mapChar = 0x0261; break; + case 0xA7AD : mapChar = 0x026C; break; + case 0xA7B0 : mapChar = 0x029E; break; + case 0xA7B1 : mapChar = 0x0287; break; // default mapChar is already set, so no // need to redo it here. // default : mapChar = ch; @@ -284,10 +289,15 @@ case 0x0250 : mapChar = 0x2C6F; break; case 0x0251 : mapChar = 0x2C6D; break; case 0x0252 : mapChar = 0x2C70; break; + case 0x025C : mapChar = 0xA7AB; break; + case 0x0261 : mapChar = 0xA7AC; break; case 0x0265 : mapChar = 0xA78D; break; case 0x0266 : mapChar = 0xA7AA; break; case 0x026B : mapChar = 0x2C62; break; + case 0x026C : mapChar = 0xA7AD; break; case 0x0271 : mapChar = 0x2C6E; break; + case 0x0287 : mapChar = 0xA7B1; break; + case 0x029E : mapChar = 0xA7B0; break; case 0x027D : mapChar = 0x2C64; break; case 0x1D79 : mapChar = 0xA77D; break; case 0x1D7D : mapChar = 0x2C63; break; @@ -503,6 +513,22 @@ // This is the only char with RLO directionality = Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE; break; + case 0x2066 : + // This is the only char with LRI + directionality = Character.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE; + break; + case 0x2067 : + // This is the only char with RLI + directionality = Character.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE; + break; + case 0x2068 : + // This is the only 
char with FSI + directionality = Character.DIRECTIONALITY_FIRST_STRONG_ISOLATE; + break; + case 0x2069 : + // This is the only char with PDI + directionality = Character.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE; + break; default : directionality = Character.DIRECTIONALITY_UNDEFINED; break; @@ -537,11 +563,16 @@ case 0x0250 : mapChar = 0x2C6F; break; case 0x0251 : mapChar = 0x2C6D; break; case 0x0252 : mapChar = 0x2C70; break; + case 0x025C : mapChar = 0xA7AB; break; + case 0x0261 : mapChar = 0xA7AC; break; case 0x0265 : mapChar = 0xA78D; break; case 0x0266 : mapChar = 0xA7AA; break; case 0x026B : mapChar = 0x2C62; break; + case 0x026C : mapChar = 0xA7AD; break; case 0x0271 : mapChar = 0x2C6E; break; case 0x027D : mapChar = 0x2C64; break; + case 0x0287 : mapChar = 0xA7B1; break; + case 0x029E : mapChar = 0xA7B0; break; case 0x1D79 : mapChar = 0xA77D; break; case 0x1D7D : mapChar = 0x2C63; break; case 0x2C65 : mapChar = 0x023A; break; --- old/jdk/make/data/characterdata/CharacterData01.java.template 2015-07-13 16:11:32.000000000 +0900 +++ new/jdk/make/data/characterdata/CharacterData01.java.template 2015-07-13 16:11:32.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -244,81 +244,118 @@ case 0x10132: retval = 80000; break; // AEGEAN NUMBER EIGHTY THOUSAND case 0x10133: retval = 90000; break; // AEGEAN NUMBER NINETY THOUSAND case 0x10323: retval = 50; break; // OLD ITALIC NUMERAL FIFTY - - case 0x010144: retval = 50; break; // ACROPHONIC ATTIC FIFTY - case 0x010145: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED - case 0x010146: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND - case 0x010147: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND - case 0x01014A: retval = 50; break; // ACROPHONIC ATTIC FIFTY TALENTS - case 0x01014B: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED TALENTS - case 0x01014C: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED TALENTS - case 0x01014D: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND TALENTS - case 0x01014E: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND TALENTS - case 0x010151: retval = 50; break; // ACROPHONIC ATTIC FIFTY STATERS - case 0x010152: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED STATERS - case 0x010153: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED STATERS - case 0x010154: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND STATERS - case 0x010155: retval = 10000; break; // ACROPHONIC ATTIC TEN THOUSAND STATERS - case 0x010156: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND STATERS - case 0x010166: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY - case 0x010167: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY ALTERNATE FORM - case 0x010168: retval = 50; break; // ACROPHONIC HERMIONIAN FIFTY - case 0x010169: retval = 50; break; // ACROPHONIC THESPIAN FIFTY - case 0x01016A: retval = 100; break; // ACROPHONIC THESPIAN ONE HUNDRED - case 0x01016B: retval = 300; break; // ACROPHONIC THESPIAN THREE HUNDRED - case 0x01016C: retval = 500; break; // ACROPHONIC EPIDAUREAN FIVE HUNDRED - case 0x01016D: retval = 500; break; // 
ACROPHONIC TROEZENIAN FIVE HUNDRED - case 0x01016E: retval = 500; break; // ACROPHONIC THESPIAN FIVE HUNDRED - case 0x01016F: retval = 500; break; // ACROPHONIC CARYSTIAN FIVE HUNDRED - case 0x010170: retval = 500; break; // ACROPHONIC NAXIAN FIVE HUNDRED - case 0x010171: retval = 1000; break; // ACROPHONIC THESPIAN ONE THOUSAND - case 0x010172: retval = 5000; break; // ACROPHONIC THESPIAN FIVE THOUSAND - case 0x010174: retval = 50; break; // ACROPHONIC STRATIAN FIFTY MNAS - case 0x010341: retval = 90; break; // GOTHIC LETTER NINETY - case 0x01034A: retval = 900; break; // GOTHIC LETTER NINE HUNDRED - case 0x0103D5: retval = 100; break; // OLD PERSIAN NUMBER HUNDRED - case 0x01085D: retval = 100; break; // IMPERIAL ARAMAIC NUMBER ONE HUNDRED - case 0x01085E: retval = 1000; break; // IMPERIAL ARAMAIC NUMBER ONE THOUSAND - case 0x01085F: retval = 10000; break; // IMPERIAL ARAMAIC NUMBER TEN THOUSAND - case 0x010919: retval = 100; break; // PHOENICIAN NUMBER ONE HUNDRED - case 0x010A46: retval = 100; break; // KHAROSHTHI NUMBER ONE HUNDRED - case 0x010A47: retval = 1000; break; // KHAROSHTHI NUMBER ONE THOUSAND - case 0x010A7E: retval = 50; break; // OLD SOUTH ARABIAN NUMBER FIFTY - case 0x010B5E: retval = 100; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE HUNDRED - case 0x010B5F: retval = 1000; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND - case 0x010B7E: retval = 100; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED - case 0x010B7F: retval = 1000; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND - case 0x010E6C: retval = 40; break; // RUMI NUMBER FORTY - case 0x010E6D: retval = 50; break; // RUMI NUMBER FIFTY - case 0x010E6E: retval = 60; break; // RUMI NUMBER SIXTY - case 0x010E6F: retval = 70; break; // RUMI NUMBER SEVENTY - case 0x010E70: retval = 80; break; // RUMI NUMBER EIGHTY - case 0x010E71: retval = 90; break; // RUMI NUMBER NINETY - case 0x010E72: retval = 100; break; // RUMI NUMBER ONE HUNDRED - case 0x010E73: retval = 200; break; // RUMI 
NUMBER TWO HUNDRED - case 0x010E74: retval = 300; break; // RUMI NUMBER THREE HUNDRED - case 0x010E75: retval = 400; break; // RUMI NUMBER FOUR HUNDRED - case 0x010E76: retval = 500; break; // RUMI NUMBER FIVE HUNDRED - case 0x010E77: retval = 600; break; // RUMI NUMBER SIX HUNDRED - case 0x010E78: retval = 700; break; // RUMI NUMBER SEVEN HUNDRED - case 0x010E79: retval = 800; break; // RUMI NUMBER EIGHT HUNDRED - case 0x010E7A: retval = 900; break; // RUMI NUMBER NINE HUNDRED - case 0x01105E: retval = 40; break; // BRAHMI NUMBER FORTY - case 0x01105F: retval = 50; break; // BRAHMI NUMBER FIFTY - case 0x011060: retval = 60; break; // BRAHMI NUMBER SIXTY - case 0x011061: retval = 70; break; // BRAHMI NUMBER SEVENTY - case 0x011062: retval = 80; break; // BRAHMI NUMBER EIGHTY - case 0x011063: retval = 90; break; // BRAHMI NUMBER NINETY - case 0x011064: retval = 100; break; // BRAHMI NUMBER ONE HUNDRED - case 0x011065: retval = 1000; break; // BRAHMI NUMBER ONE THOUSAND - case 0x012432: retval = 216000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH - case 0x012433: retval = 432000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN - case 0x01D36C: retval = 40; break; // COUNTING ROD TENS DIGIT FOUR - case 0x01D36D: retval = 50; break; // COUNTING ROD TENS DIGIT FIVE - case 0x01D36E: retval = 60; break; // COUNTING ROD TENS DIGIT SIX - case 0x01D36F: retval = 70; break; // COUNTING ROD TENS DIGIT SEVEN - case 0x01D370: retval = 80; break; // COUNTING ROD TENS DIGIT EIGHT - case 0x01D371: retval = 90; break; // COUNTING ROD TENS DIGIT NINE + case 0x10144: retval = 50; break; // ACROPHONIC ATTIC FIFTY + case 0x10145: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED + case 0x10146: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND + case 0x10147: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND + case 0x1014A: retval = 50; break; // ACROPHONIC ATTIC FIFTY TALENTS + case 0x1014B: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED 
TALENTS + case 0x1014C: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED TALENTS + case 0x1014D: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND TALENTS + case 0x1014E: retval = 5000; break; // ACROPHONIC ATTIC FIVE THOUSAND TALENTS + case 0x10151: retval = 50; break; // ACROPHONIC ATTIC FIFTY STATERS + case 0x10152: retval = 100; break; // ACROPHONIC ATTIC ONE HUNDRED STATERS + case 0x10153: retval = 500; break; // ACROPHONIC ATTIC FIVE HUNDRED STATERS + case 0x10154: retval = 1000; break; // ACROPHONIC ATTIC ONE THOUSAND STATERS + case 0x10155: retval = 10000; break; // ACROPHONIC ATTIC TEN THOUSAND STATERS + case 0x10156: retval = 50000; break; // ACROPHONIC ATTIC FIFTY THOUSAND STATERS + case 0x10166: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY + case 0x10167: retval = 50; break; // ACROPHONIC TROEZENIAN FIFTY ALTERNATE FORM + case 0x10168: retval = 50; break; // ACROPHONIC HERMIONIAN FIFTY + case 0x10169: retval = 50; break; // ACROPHONIC THESPIAN FIFTY + case 0x1016A: retval = 100; break; // ACROPHONIC THESPIAN ONE HUNDRED + case 0x1016B: retval = 300; break; // ACROPHONIC THESPIAN THREE HUNDRED + case 0x1016C: retval = 500; break; // ACROPHONIC EPIDAUREAN FIVE HUNDRED + case 0x1016D: retval = 500; break; // ACROPHONIC TROEZENIAN FIVE HUNDRED + case 0x1016E: retval = 500; break; // ACROPHONIC THESPIAN FIVE HUNDRED + case 0x1016F: retval = 500; break; // ACROPHONIC CARYSTIAN FIVE HUNDRED + case 0x10170: retval = 500; break; // ACROPHONIC NAXIAN FIVE HUNDRED + case 0x10171: retval = 1000; break; // ACROPHONIC THESPIAN ONE THOUSAND + case 0x10172: retval = 5000; break; // ACROPHONIC THESPIAN FIVE THOUSAND + case 0x10174: retval = 50; break; // ACROPHONIC STRATIAN FIFTY MNAS + case 0x102ED: retval = 40; break; // COPTIC EPACT NUMBER FORTY + case 0x102EE: retval = 50; break; // COPTIC EPACT NUMBER FIFTY + case 0x102EF: retval = 60; break; // COPTIC EPACT NUMBER SIXTY + case 0x102F0: retval = 70; break; // COPTIC EPACT NUMBER SEVENTY + case 
0x102F1: retval = 80; break; // COPTIC EPACT NUMBER EIGHTY + case 0x102F2: retval = 90; break; // COPTIC EPACT NUMBER NINETY + case 0x102F3: retval = 100; break; // COPTIC EPACT NUMBER ONE HUNDRED + case 0x102F4: retval = 200; break; // COPTIC EPACT NUMBER TWO HUNDRED + case 0x102F5: retval = 300; break; // COPTIC EPACT NUMBER THREE HUNDRED + case 0x102F6: retval = 400; break; // COPTIC EPACT NUMBER FOUR HUNDRED + case 0x102F7: retval = 500; break; // COPTIC EPACT NUMBER FIVE HUNDRED + case 0x102F8: retval = 600; break; // COPTIC EPACT NUMBER SIX HUNDRED + case 0x102F9: retval = 700; break; // COPTIC EPACT NUMBER SEVEN HUNDRED + case 0x102FA: retval = 800; break; // COPTIC EPACT NUMBER EIGHT HUNDRED + case 0x102FB: retval = 900; break; // COPTIC EPACT NUMBER NINE HUNDRED + case 0x10341: retval = 90; break; // GOTHIC LETTER NINETY + case 0x1034A: retval = 900; break; // GOTHIC LETTER NINE HUNDRED + case 0x103D5: retval = 100; break; // OLD PERSIAN NUMBER HUNDRED + case 0x1085D: retval = 100; break; // IMPERIAL ARAMAIC NUMBER ONE HUNDRED + case 0x1085E: retval = 1000; break; // IMPERIAL ARAMAIC NUMBER ONE THOUSAND + case 0x1085F: retval = 10000; break; // IMPERIAL ARAMAIC NUMBER TEN THOUSAND + case 0x108AF: retval = 100; break; // NABATAEAN NUMBER ONE HUNDRED + case 0x10919: retval = 100; break; // PHOENICIAN NUMBER ONE HUNDRED + case 0x10A46: retval = 100; break; // KHAROSHTHI NUMBER ONE HUNDRED + case 0x10A47: retval = 1000; break; // KHAROSHTHI NUMBER ONE THOUSAND + case 0x10A7E: retval = 50; break; // OLD SOUTH ARABIAN NUMBER FIFTY + case 0x10AEF: retval = 100; break; // MANICHAEAN NUMBER ONE HUNDRED + case 0x10B5E: retval = 100; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE HUNDRED + case 0x10B5F: retval = 1000; break; // INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND + case 0x10B7E: retval = 100; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED + case 0x10B7F: retval = 1000; break; // INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND + case 0x10BAF: retval = 100; break; // 
PSALTER PAHLAVI NUMBER ONE HUNDRED + case 0x10E6C: retval = 40; break; // RUMI NUMBER FORTY + case 0x10E6D: retval = 50; break; // RUMI NUMBER FIFTY + case 0x10E6E: retval = 60; break; // RUMI NUMBER SIXTY + case 0x10E6F: retval = 70; break; // RUMI NUMBER SEVENTY + case 0x10E70: retval = 80; break; // RUMI NUMBER EIGHTY + case 0x10E71: retval = 90; break; // RUMI NUMBER NINETY + case 0x10E72: retval = 100; break; // RUMI NUMBER ONE HUNDRED + case 0x10E73: retval = 200; break; // RUMI NUMBER TWO HUNDRED + case 0x10E74: retval = 300; break; // RUMI NUMBER THREE HUNDRED + case 0x10E75: retval = 400; break; // RUMI NUMBER FOUR HUNDRED + case 0x10E76: retval = 500; break; // RUMI NUMBER FIVE HUNDRED + case 0x10E77: retval = 600; break; // RUMI NUMBER SIX HUNDRED + case 0x10E78: retval = 700; break; // RUMI NUMBER SEVEN HUNDRED + case 0x10E79: retval = 800; break; // RUMI NUMBER EIGHT HUNDRED + case 0x10E7A: retval = 900; break; // RUMI NUMBER NINE HUNDRED + case 0x1105E: retval = 40; break; // BRAHMI NUMBER FORTY + case 0x1105F: retval = 50; break; // BRAHMI NUMBER FIFTY + case 0x11060: retval = 60; break; // BRAHMI NUMBER SIXTY + case 0x11061: retval = 70; break; // BRAHMI NUMBER SEVENTY + case 0x11062: retval = 80; break; // BRAHMI NUMBER EIGHTY + case 0x11063: retval = 90; break; // BRAHMI NUMBER NINETY + case 0x11064: retval = 100; break; // BRAHMI NUMBER ONE HUNDRED + case 0x11065: retval = 1000; break; // BRAHMI NUMBER ONE THOUSAND + case 0x111ED: retval = 40; break; // SINHALA ARCHAIC NUMBER FORTY + case 0x111EE: retval = 50; break; // SINHALA ARCHAIC NUMBER FIFTY + case 0x111EF: retval = 60; break; // SINHALA ARCHAIC NUMBER SIXTY + case 0x111F0: retval = 70; break; // SINHALA ARCHAIC NUMBER SEVENTY + case 0x111F1: retval = 80; break; // SINHALA ARCHAIC NUMBER EIGHTY + case 0x111F2: retval = 90; break; // SINHALA ARCHAIC NUMBER NINETY + case 0x111F3: retval = 100; break; // SINHALA ARCHAIC NUMBER ONE HUNDRED + case 0x111F4: retval = 1000; break; // SINHALA 
ARCHAIC NUMBER ONE THOUSAND + case 0x118ED: retval = 40; break; // WARANG CITI NUMBER FORTY + case 0x118EE: retval = 50; break; // WARANG CITI NUMBER FIFTY + case 0x118EF: retval = 60; break; // WARANG CITI NUMBER SIXTY + case 0x118F0: retval = 70; break; // WARANG CITI NUMBER SEVENTY + case 0x118F1: retval = 80; break; // WARANG CITI NUMBER EIGHTY + case 0x118F2: retval = 90; break; // WARANG CITI NUMBER NINETY + case 0x12432: retval = 216000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH + case 0x12433: retval = 432000; break; // CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN + case 0x12467: retval = 40; break; // CUNEIFORM NUMERIC SIGN ELAMITE FORTY + case 0x12468: retval = 50; break; // CUNEIFORM NUMERIC SIGN ELAMITE FIFTY + case 0x16B5C: retval = 100; break; // PAHAWH HMONG NUMBER HUNDREDS + case 0x16B5D: retval = 10000; break; // PAHAWH HMONG NUMBER TEN THOUSANDS + case 0x16B5E: retval = 1000000; break; // PAHAWH HMONG NUMBER MILLIONS + case 0x16B5F: retval = 100000000; break;// PAHAWH HMONG NUMBER HUNDRED MILLIONS + case 0x1D36C: retval = 40; break; // COUNTING ROD TENS DIGIT FOUR + case 0x1D36D: retval = 50; break; // COUNTING ROD TENS DIGIT FIVE + case 0x1D36E: retval = 60; break; // COUNTING ROD TENS DIGIT SIX + case 0x1D36F: retval = 70; break; // COUNTING ROD TENS DIGIT SEVEN + case 0x1D370: retval = 80; break; // COUNTING ROD TENS DIGIT EIGHT + case 0x1D371: retval = 90; break; // COUNTING ROD TENS DIGIT NINE default: retval = -2; break; } --- old/jdk/make/data/unicodedata/PropList.txt 2015-07-13 16:11:33.000000000 +0900 +++ new/jdk/make/data/unicodedata/PropList.txt 2015-07-13 16:11:33.000000000 +0900 @@ -1,8 +1,8 @@ -# PropList-6.2.0.txt -# Date: 2012-05-23, 20:34:59 GMT [MD] +# PropList-7.0.0.txt +# Date: 2014-02-19, 15:51:26 GMT [MD] # # Unicode Character Database -# Copyright (c) 1991-2012 Unicode, Inc. +# Copyright (c) 1991-2014 Unicode, Inc. 
# For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see http://www.unicode.org/reports/tr44/ @@ -13,7 +13,6 @@ 0085 ; White_Space # Cc 00A0 ; White_Space # Zs NO-BREAK SPACE 1680 ; White_Space # Zs OGHAM SPACE MARK -180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR 2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE 2028 ; White_Space # Zl LINE SEPARATOR 2029 ; White_Space # Zp PARAGRAPH SEPARATOR @@ -21,14 +20,16 @@ 205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE 3000 ; White_Space # Zs IDEOGRAPHIC SPACE -# Total code points: 26 +# Total code points: 25 # ================================================ +061C ; Bidi_Control # Cf ARABIC LETTER MARK 200E..200F ; Bidi_Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK 202A..202E ; Bidi_Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE +2066..2069 ; Bidi_Control # Cf [4] LEFT-TO-RIGHT ISOLATE..POP DIRECTIONAL ISOLATE -# Total code points: 7 +# Total code points: 12 # ================================================ @@ -51,6 +52,7 @@ 2E17 ; Dash # Pd DOUBLE OBLIQUE HYPHEN 2E1A ; Dash # Pd HYPHEN WITH DIAERESIS 2E3A..2E3B ; Dash # Pd [2] TWO-EM DASH..THREE-EM DASH +2E40 ; Dash # Pd DOUBLE HYPHEN 301C ; Dash # Pd WAVE DASH 3030 ; Dash # Pd WAVY DASH 30A0 ; Dash # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN @@ -59,7 +61,7 @@ FE63 ; Dash # Pd SMALL HYPHEN-MINUS FF0D ; Dash # Pd FULLWIDTH HYPHEN-MINUS -# Total code points: 27 +# Total code points: 28 # ================================================ @@ -91,6 +93,7 @@ 201F ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK 2039 ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK 203A ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +2E42 ; Quotation_Mark # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 300C ; Quotation_Mark # Ps LEFT CORNER BRACKET 300D ; Quotation_Mark # Pe RIGHT CORNER BRACKET 300E ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET @@ -106,7 +109,7 @@ FF62 ; Quotation_Mark # Ps 
HALFWIDTH LEFT CORNER BRACKET FF63 ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET -# Total code points: 29 +# Total code points: 30 # ================================================ @@ -136,6 +139,7 @@ 1361..1368 ; Terminal_Punctuation # Po [8] ETHIOPIC WORDSPACE..ETHIOPIC PARAGRAPH SEPARATOR 166D..166E ; Terminal_Punctuation # Po [2] CANADIAN SYLLABICS CHI SIGN..CANADIAN SYLLABICS FULL STOP 16EB..16ED ; Terminal_Punctuation # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION +1735..1736 ; Terminal_Punctuation # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION 17D4..17D6 ; Terminal_Punctuation # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH 17DA ; Terminal_Punctuation # Po KHMER SIGN KOOMUUT 1802..1805 ; Terminal_Punctuation # Po [4] MONGOLIAN COMMA..MONGOLIAN FOUR DOTS @@ -149,6 +153,8 @@ 203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG 2047..2049 ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK 2E2E ; Terminal_Punctuation # Po REVERSED QUESTION MARK +2E3C ; Terminal_Punctuation # Po STENOGRAPHIC FULL STOP +2E41 ; Terminal_Punctuation # Po REVERSED COMMA 3001..3002 ; Terminal_Punctuation # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP A4FE..A4FF ; Terminal_Punctuation # Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP A60D..A60F ; Terminal_Punctuation # Po [3] VAI COMMA..VAI QUESTION MARK @@ -174,14 +180,27 @@ 103D0 ; Terminal_Punctuation # Po OLD PERSIAN WORD DIVIDER 10857 ; Terminal_Punctuation # Po IMPERIAL ARAMAIC SECTION SIGN 1091F ; Terminal_Punctuation # Po PHOENICIAN WORD SEPARATOR +10A56..10A57 ; Terminal_Punctuation # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA +10AF0..10AF5 ; Terminal_Punctuation # Po [6] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS 10B3A..10B3F ; Terminal_Punctuation # Po [6] TINY TWO DOTS OVER ONE DOT PUNCTUATION..LARGE ONE RING OVER TWO RINGS PUNCTUATION +10B99..10B9C ; 
Terminal_Punctuation # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT 11047..1104D ; Terminal_Punctuation # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 110BE..110C1 ; Terminal_Punctuation # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 11141..11143 ; Terminal_Punctuation # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK 111C5..111C6 ; Terminal_Punctuation # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA -12470..12473 ; Terminal_Punctuation # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON +111CD ; Terminal_Punctuation # Po SHARADA SUTRA MARK +11238..1123C ; Terminal_Punctuation # Po [5] KHOJKI DANDA..KHOJKI DOUBLE SECTION MARK +115C2..115C5 ; Terminal_Punctuation # Po [4] SIDDHAM DANDA..SIDDHAM SEPARATOR BAR +115C9 ; Terminal_Punctuation # Po SIDDHAM END OF TEXT MARK +11641..11642 ; Terminal_Punctuation # Po [2] MODI DANDA..MODI DOUBLE DANDA +12470..12474 ; Terminal_Punctuation # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON +16A6E..16A6F ; Terminal_Punctuation # Po [2] MRO DANDA..MRO DOUBLE DANDA +16AF5 ; Terminal_Punctuation # Po BASSA VAH FULL STOP +16B37..16B39 ; Terminal_Punctuation # Po [3] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN CIM CHEEM +16B44 ; Terminal_Punctuation # Po PAHAWH HMONG SIGN XAUS +1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP -# Total code points: 176 +# Total code points: 214 # ================================================ @@ -230,6 +249,10 @@ 21D5..21DB ; Other_Math # So [7] UP DOWN DOUBLE ARROW..RIGHTWARDS TRIPLE ARROW 21DD ; Other_Math # So RIGHTWARDS SQUIGGLE ARROW 21E4..21E5 ; Other_Math # So [2] LEFTWARDS ARROW TO BAR..RIGHTWARDS ARROW TO BAR +2308 ; Other_Math # Ps LEFT CEILING +2309 ; Other_Math # Pe RIGHT CEILING +230A ; Other_Math # Ps LEFT FLOOR +230B ; Other_Math # Pe RIGHT FLOOR 23B4..23B5 ; Other_Math # So [2] TOP SQUARE BRACKET..BOTTOM SQUARE BRACKET 23B7 
; Other_Math # So RADICAL SYMBOL BOTTOM 23D0 ; Other_Math # So VERTICAL LINE EXTENSION @@ -358,7 +381,7 @@ 1EEA5..1EEA9 ; Other_Math # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; Other_Math # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN -# Total code points: 1358 +# Total code points: 1362 # ================================================ @@ -403,8 +426,7 @@ 0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN 08E4..08E9 ; Other_Alphabetic # Mn [6] ARABIC CURLY FATHA..ARABIC CURLY KASRATAN -08F0..08FE ; Other_Alphabetic # Mn [15] ARABIC OPEN FATHATAN..ARABIC DAMMA WITH DOT -0900..0902 ; Other_Alphabetic # Mn [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA +08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA 0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA 093A ; Other_Alphabetic # Mn DEVANAGARI VOWEL SIGN OE 093B ; Other_Alphabetic # Mc DEVANAGARI VOWEL SIGN OOE @@ -457,6 +479,7 @@ 0BC6..0BC8 ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BD7 ; Other_Alphabetic # Mc TAMIL AU LENGTH MARK +0C00 ; Other_Alphabetic # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; Other_Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C3E..0C40 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; Other_Alphabetic # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR @@ -464,6 +487,7 @@ 0C4A..0C4C ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU 0C55..0C56 ; Other_Alphabetic # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; Other_Alphabetic # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL 
+0C81 ; Other_Alphabetic # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; Other_Alphabetic # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0CBE ; Other_Alphabetic # Mc KANNADA VOWEL SIGN AA 0CBF ; Other_Alphabetic # Mn KANNADA VOWEL SIGN I @@ -474,6 +498,7 @@ 0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU 0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL +0D01 ; Other_Alphabetic # Mn MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II 0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR @@ -538,7 +563,8 @@ 19B0..19C0 ; Other_Alphabetic # Mc [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY 19C8..19C9 ; Other_Alphabetic # Mc [2] NEW TAI LUE TONE MARK-1..NEW TAI LUE TONE MARK-2 1A17..1A18 ; Other_Alphabetic # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U -1A19..1A1B ; Other_Alphabetic # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE +1A19..1A1A ; Other_Alphabetic # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O +1A1B ; Other_Alphabetic # Mn BUGINESE VOWEL SIGN AE 1A55 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A56 ; Other_Alphabetic # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A57 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN LA TANG LAI @@ -564,7 +590,7 @@ 1BA2..1BA5 ; Other_Alphabetic # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA6..1BA7 ; Other_Alphabetic # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; Other_Alphabetic # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG -1BAC..1BAD ; Other_Alphabetic # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA +1BAC..1BAD ; 
Other_Alphabetic # Mn [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE7 ; Other_Alphabetic # Mc BATAK VOWEL SIGN E 1BE8..1BE9 ; Other_Alphabetic # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BEA..1BEC ; Other_Alphabetic # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O @@ -575,6 +601,7 @@ 1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1CF2..1CF3 ; Other_Alphabetic # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA +1DE7..1DF4 ; Other_Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 24B6..24E9 ; Other_Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS A674..A67B ; Other_Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA @@ -616,6 +643,7 @@ ABE8 ; Other_Alphabetic # Mn MEETEI MAYEK VOWEL SIGN UNAP ABE9..ABEA ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA +10376..1037A ; Other_Alphabetic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; Other_Alphabetic # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; Other_Alphabetic # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; Other_Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA @@ -636,14 +664,54 @@ 111B3..111B5 ; Other_Alphabetic # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111B6..111BE ; Other_Alphabetic # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111BF ; Other_Alphabetic # Mc SHARADA VOWEL SIGN AU +1122C..1122E ; Other_Alphabetic # Mc [3] KHOJKI VOWEL SIGN 
AA..KHOJKI VOWEL SIGN II +1122F..11231 ; Other_Alphabetic # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI +11232..11233 ; Other_Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU +11234 ; Other_Alphabetic # Mn KHOJKI SIGN ANUSVARA +11237 ; Other_Alphabetic # Mn KHOJKI SIGN SHADDA +112DF ; Other_Alphabetic # Mn KHUDAWADI SIGN ANUSVARA +112E0..112E2 ; Other_Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II +112E3..112E8 ; Other_Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU +11301 ; Other_Alphabetic # Mn GRANTHA SIGN CANDRABINDU +11302..11303 ; Other_Alphabetic # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA +1133E..1133F ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I +11340 ; Other_Alphabetic # Mn GRANTHA VOWEL SIGN II +11341..11344 ; Other_Alphabetic # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR +11347..11348 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI +1134B..1134C ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU +11357 ; Other_Alphabetic # Mc GRANTHA AU LENGTH MARK +11362..11363 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL +114B0..114B2 ; Other_Alphabetic # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II +114B3..114B8 ; Other_Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL +114B9 ; Other_Alphabetic # Mc TIRHUTA VOWEL SIGN E +114BA ; Other_Alphabetic # Mn TIRHUTA VOWEL SIGN SHORT E +114BB..114BE ; Other_Alphabetic # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU +114BF..114C0 ; Other_Alphabetic # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA +114C1 ; Other_Alphabetic # Mc TIRHUTA SIGN VISARGA +115AF..115B1 ; Other_Alphabetic # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II +115B2..115B5 ; Other_Alphabetic # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR +115B8..115BB ; Other_Alphabetic # Mc [4] SIDDHAM VOWEL SIGN 
E..SIDDHAM VOWEL SIGN AU +115BC..115BD ; Other_Alphabetic # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA +115BE ; Other_Alphabetic # Mc SIDDHAM SIGN VISARGA +11630..11632 ; Other_Alphabetic # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II +11633..1163A ; Other_Alphabetic # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI +1163B..1163C ; Other_Alphabetic # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU +1163D ; Other_Alphabetic # Mn MODI SIGN ANUSVARA +1163E ; Other_Alphabetic # Mc MODI SIGN VISARGA +11640 ; Other_Alphabetic # Mn MODI SIGN ARDHACANDRA 116AB ; Other_Alphabetic # Mn TAKRI SIGN ANUSVARA 116AC ; Other_Alphabetic # Mc TAKRI SIGN VISARGA 116AD ; Other_Alphabetic # Mn TAKRI VOWEL SIGN AA 116AE..116AF ; Other_Alphabetic # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B0..116B5 ; Other_Alphabetic # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU +16B30..16B36 ; Other_Alphabetic # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16F51..16F7E ; Other_Alphabetic # Mc [46] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN NG +1BC9E ; Other_Alphabetic # Mn DUPLOYAN DOUBLE MARK +1F130..1F149 ; Other_Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z +1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z +1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 922 +# Total code points: 1116 # ================================================ @@ -746,6 +814,7 @@ 1939..193B ; Diacritic # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A75..1A7C ; Diacritic # Mn [8] TAI THAM SIGN TONE-1..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; Diacritic # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT +1AB0..1ABD ; Diacritic # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1B34 ; Diacritic # Mn BALINESE SIGN REREKAN 1B44 ; Diacritic # Mc BALINESE ADEG ADEG 1B6B..1B73 ; Diacritic 
# Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG @@ -760,8 +829,10 @@ 1CE2..1CE8 ; Diacritic # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Diacritic # Mn VEDIC SIGN TIRYAK 1CF4 ; Diacritic # Mn VEDIC TONE CANDRA ABOVE +1CF8..1CF9 ; Diacritic # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1D2C..1D6A ; Diacritic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW +1DF5 ; Diacritic # Mn COMBINING UP TACK ABOVE 1DFD..1DFF ; Diacritic # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 1FBD ; Diacritic # Sk GREEK KORONIS 1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI @@ -779,6 +850,7 @@ A66F ; Diacritic # Mn COMBINING CYRILLIC VZMET A67C..A67D ; Diacritic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK A67F ; Diacritic # Lm CYRILLIC PAYEROK +A69C..A69D ; Diacritic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A6F0..A6F1 ; Diacritic # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A717..A71F ; Diacritic # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A720..A721 ; Diacritic # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE @@ -791,26 +863,45 @@ A953 ; Diacritic # Mc REJANG VIRAMA A9B3 ; Diacritic # Mn JAVANESE SIGN CECAK TELU A9C0 ; Diacritic # Mc JAVANESE PANGKON +A9E5 ; Diacritic # Mn MYANMAR SIGN SHAN SAW AA7B ; Diacritic # Mc MYANMAR SIGN PAO KAREN TONE +AA7C ; Diacritic # Mn MYANMAR SIGN TAI LAING TONE-2 +AA7D ; Diacritic # Mc MYANMAR SIGN TAI LAING TONE-5 AABF ; Diacritic # Mn TAI VIET TONE MAI EK AAC0 ; Diacritic # Lo TAI VIET TONE MAI NUENG AAC1 ; Diacritic # Mn TAI VIET TONE MAI THO AAC2 ; Diacritic # Lo TAI VIET TONE MAI SONG AAF6 ; Diacritic # Mn MEETEI MAYEK VIRAMA 
+AB5B ; Diacritic # Sk MODIFIER BREVE WITH INVERTED BREVE +AB5C..AB5F ; Diacritic # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK ABEC ; Diacritic # Mc MEETEI MAYEK LUM IYEK ABED ; Diacritic # Mn MEETEI MAYEK APUN IYEK FB1E ; Diacritic # Mn HEBREW POINT JUDEO-SPANISH VARIKA -FE20..FE26 ; Diacritic # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON +FE20..FE2D ; Diacritic # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW FF3E ; Diacritic # Sk FULLWIDTH CIRCUMFLEX ACCENT FF40 ; Diacritic # Sk FULLWIDTH GRAVE ACCENT FF70 ; Diacritic # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF9E..FF9F ; Diacritic # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFE3 ; Diacritic # Sk FULLWIDTH MACRON +102E0 ; Diacritic # Mn COPTIC EPACT THOUSANDS MARK +10AE5..10AE6 ; Diacritic # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 110B9..110BA ; Diacritic # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 11133..11134 ; Diacritic # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA +11173 ; Diacritic # Mn MAHAJANI SIGN NUKTA 111C0 ; Diacritic # Mc SHARADA SIGN VIRAMA +11235 ; Diacritic # Mc KHOJKI SIGN VIRAMA +11236 ; Diacritic # Mn KHOJKI SIGN NUKTA +112E9..112EA ; Diacritic # Mn [2] KHUDAWADI SIGN NUKTA..KHUDAWADI SIGN VIRAMA +1133C ; Diacritic # Mn GRANTHA SIGN NUKTA +1134D ; Diacritic # Mc GRANTHA SIGN VIRAMA +11366..1136C ; Diacritic # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX +11370..11374 ; Diacritic # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA +114C2..114C3 ; Diacritic # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA +115BF..115C0 ; Diacritic # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA +1163F ; Diacritic # Mn MODI SIGN VIRAMA 116B6 ; Diacritic # Mc TAKRI SIGN VIRAMA 116B7 ; Diacritic # Mn TAKRI SIGN NUKTA +16AF0..16AF4 ; Diacritic # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 
16F8F..16F92 ; Diacritic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; Diacritic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 1D167..1D169 ; Diacritic # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 @@ -818,8 +909,9 @@ 1D17B..1D182 ; Diacritic # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO +1E8D0..1E8D6 ; Diacritic # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS -# Total code points: 693 +# Total code points: 766 # ================================================ @@ -841,12 +933,16 @@ A015 ; Extender # Lm YI SYLLABLE WU A60C ; Extender # Lm VAI SYLLABLE LENGTHENER A9CF ; Extender # Lm JAVANESE PANGRANGKEP +A9E6 ; Extender # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION AA70 ; Extender # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AADD ; Extender # Lm TAI VIET SYMBOL SAM AAF3..AAF4 ; Extender # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK +1135D ; Extender # Lo GRANTHA SIGN PLUTA +115C6..115C8 ; Extender # Po [3] SIDDHAM REPETITION MARK-1..SIDDHAM REPETITION MARK-3 +16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM -# Total code points: 31 +# Total code points: 38 # ================================================ @@ -866,17 +962,22 @@ 2170..217F ; Other_Lowercase # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 24D0..24E9 ; Other_Lowercase # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C7C..2C7D ; Other_Lowercase # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V +A69C..A69D ; Other_Lowercase # Lm [2] MODIFIER LETTER CYRILLIC HARD 
SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A770 ; Other_Lowercase # Lm MODIFIER LETTER US A7F8..A7F9 ; Other_Lowercase # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE +AB5C..AB5F ; Other_Lowercase # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK -# Total code points: 183 +# Total code points: 189 # ================================================ 2160..216F ; Other_Uppercase # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND 24B6..24CF ; Other_Uppercase # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z +1F130..1F149 ; Other_Uppercase # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z +1F150..1F169 ; Other_Uppercase # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z +1F170..1F189 ; Other_Uppercase # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 42 +# Total code points: 120 # ================================================ @@ -918,10 +1019,15 @@ 200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 302E..302F ; Other_Grapheme_Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK FF9E..FF9F ; Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +1133E ; Other_Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA +11357 ; Other_Grapheme_Extend # Mc GRANTHA AU LENGTH MARK +114B0 ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN AA +114BD ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN SHORT O +115AF ; Other_Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA 1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM 1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5 -# Total code points: 25 +# Total code points: 30 # ================================================ @@ -966,7 +1072,7 @@ 034F ; 
Other_Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER 115F..1160 ; Other_Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER 17B4..17B5 ; Other_Default_Ignorable_Code_Point # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA -2065..2069 ; Other_Default_Ignorable_Code_Point # Cn [5] .. +2065 ; Other_Default_Ignorable_Code_Point # Cn 3164 ; Other_Default_Ignorable_Code_Point # Lo HANGUL FILLER FFA0 ; Other_Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER FFF0..FFF8 ; Other_Default_Ignorable_Code_Point # Cn [9] .. @@ -975,7 +1081,7 @@ E0080..E00FF ; Other_Default_Ignorable_Code_Point # Cn [128] .. E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] .. -# Total code points: 3780 +# Total code points: 3776 # ================================================ @@ -1060,8 +1166,6 @@ 0021 ; STerm # Po EXCLAMATION MARK 002E ; STerm # Po FULL STOP 003F ; STerm # Po QUESTION MARK -055C ; STerm # Po ARMENIAN EXCLAMATION MARK -055E ; STerm # Po ARMENIAN QUESTION MARK 0589 ; STerm # Po ARMENIAN FULL STOP 061F ; STerm # Po ARABIC QUESTION MARK 06D4 ; STerm # Po ARABIC FULL STOP @@ -1084,6 +1188,7 @@ 203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG 2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK 2E2E ; STerm # Po REVERSED QUESTION MARK +2E3C ; STerm # Po STENOGRAPHIC FULL STOP 3002 ; STerm # Po IDEOGRAPHIC FULL STOP A4FF ; STerm # Po LISU PUNCTUATION FULL STOP A60E..A60F ; STerm # Po [2] VAI FULL STOP..VAI QUESTION MARK @@ -1107,8 +1212,19 @@ 110BE..110C1 ; STerm # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 11141..11143 ; STerm # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK 111C5..111C6 ; STerm # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA +111CD ; STerm # Po SHARADA SUTRA MARK +11238..11239 ; STerm # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA +1123B..1123C ; STerm # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK +115C2..115C3 ; STerm # Po [2] SIDDHAM 
DANDA..SIDDHAM DOUBLE DANDA +115C9 ; STerm # Po SIDDHAM END OF TEXT MARK +11641..11642 ; STerm # Po [2] MODI DANDA..MODI DOUBLE DANDA +16A6E..16A6F ; STerm # Po [2] MRO DANDA..MRO DOUBLE DANDA +16AF5 ; STerm # Po BASSA VAH FULL STOP +16B37..16B38 ; STerm # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB +16B44 ; STerm # Po PAHAWH HMONG SIGN XAUS +1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP -# Total code points: 83 +# Total code points: 99 # ================================================ @@ -1210,7 +1326,10 @@ 21D5..21F3 ; Pattern_Syntax # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 21F4..22FF ; Pattern_Syntax # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP 2300..2307 ; Pattern_Syntax # So [8] DIAMETER SIGN..WAVY LINE -2308..230B ; Pattern_Syntax # Sm [4] LEFT CEILING..RIGHT FLOOR +2308 ; Pattern_Syntax # Ps LEFT CEILING +2309 ; Pattern_Syntax # Pe RIGHT CEILING +230A ; Pattern_Syntax # Ps LEFT FLOOR +230B ; Pattern_Syntax # Pe RIGHT FLOOR 230C..231F ; Pattern_Syntax # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2320..2321 ; Pattern_Syntax # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 2322..2328 ; Pattern_Syntax # So [7] FROWN..KEYBOARD @@ -1222,8 +1341,8 @@ 239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE 23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET -23E2..23F3 ; Pattern_Syntax # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND -23F4..23FF ; Pattern_Syntax # Cn [12] .. +23E2..23FA ; Pattern_Syntax # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD +23FB..23FF ; Pattern_Syntax # Cn [5] .. 2400..2426 ; Pattern_Syntax # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO 2427..243F ; Pattern_Syntax # Cn [25] .. 
2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH @@ -1236,9 +1355,7 @@ 25F8..25FF ; Pattern_Syntax # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2600..266E ; Pattern_Syntax # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 266F ; Pattern_Syntax # Sm MUSIC SHARP SIGN -2670..26FF ; Pattern_Syntax # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE -2700 ; Pattern_Syntax # Cn -2701..2767 ; Pattern_Syntax # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET +2670..2767 ; Pattern_Syntax # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET 2768 ; Pattern_Syntax # Ps MEDIUM LEFT PARENTHESIS ORNAMENT 2769 ; Pattern_Syntax # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT 276A ; Pattern_Syntax # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT @@ -1306,9 +1423,16 @@ 2B30..2B44 ; Pattern_Syntax # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B45..2B46 ; Pattern_Syntax # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Pattern_Syntax # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR -2B4D..2B4F ; Pattern_Syntax # Cn [3] .. -2B50..2B59 ; Pattern_Syntax # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE -2B5A..2BFF ; Pattern_Syntax # Cn [166] .. +2B4D..2B73 ; Pattern_Syntax # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR +2B74..2B75 ; Pattern_Syntax # Cn [2] .. +2B76..2B95 ; Pattern_Syntax # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW +2B96..2B97 ; Pattern_Syntax # Cn [2] .. +2B98..2BB9 ; Pattern_Syntax # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX +2BBA..2BBC ; Pattern_Syntax # Cn [3] .. 
+2BBD..2BC8 ; Pattern_Syntax # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED +2BC9 ; Pattern_Syntax # Cn +2BCA..2BD1 ; Pattern_Syntax # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN +2BD2..2BFF ; Pattern_Syntax # Cn [46] .. 2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Pattern_Syntax # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Pattern_Syntax # Pf RIGHT SUBSTITUTION BRACKET @@ -1342,7 +1466,11 @@ 2E2F ; Pattern_Syntax # Lm VERTICAL TILDE 2E30..2E39 ; Pattern_Syntax # Po [10] RING POINT..TOP HALF SECTION SIGN 2E3A..2E3B ; Pattern_Syntax # Pd [2] TWO-EM DASH..THREE-EM DASH -2E3C..2E7F ; Pattern_Syntax # Cn [68] .. +2E3C..2E3F ; Pattern_Syntax # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM +2E40 ; Pattern_Syntax # Pd DOUBLE HYPHEN +2E41 ; Pattern_Syntax # Po REVERSED COMMA +2E42 ; Pattern_Syntax # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK +2E43..2E7F ; Pattern_Syntax # Cn [61] .. 3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK 3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET 3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET @@ -1368,8 +1496,8 @@ 301E..301F ; Pattern_Syntax # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK 3020 ; Pattern_Syntax # So POSTAL MARK FACE 3030 ; Pattern_Syntax # Pd WAVY DASH -FD3E ; Pattern_Syntax # Ps ORNATE LEFT PARENTHESIS -FD3F ; Pattern_Syntax # Pe ORNATE RIGHT PARENTHESIS +FD3E ; Pattern_Syntax # Pe ORNATE LEFT PARENTHESIS +FD3F ; Pattern_Syntax # Ps ORNATE RIGHT PARENTHESIS FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT # Total code points: 2760 --- old/jdk/make/data/unicodedata/Scripts.txt 2015-07-13 16:11:34.000000000 +0900 +++ new/jdk/make/data/unicodedata/Scripts.txt 2015-07-13 16:11:34.000000000 +0900 @@ -1,8 +1,8 @@ -# Scripts-6.2.0.txt -# Date: 2012-06-04, 17:21:29 GMT [MD] +# Scripts-7.0.0.txt +# Date: 2014-05-15, 00:11:35 GMT [MD] # # Unicode Character Database -# Copyright (c) 
1991-2012 Unicode, Inc. +# Copyright (c) 1991-2014 Unicode, Inc. # For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see http://www.unicode.org/reports/tr44/ @@ -83,8 +83,10 @@ 0385 ; Common # Sk GREEK DIALYTIKA TONOS 0387 ; Common # Po GREEK ANO TELEIA 0589 ; Common # Po ARMENIAN FULL STOP +0605 ; Common # Cf ARABIC NUMBER MARK ABOVE 060C ; Common # Po ARABIC COMMA 061B ; Common # Po ARABIC SEMICOLON +061C ; Common # Cf ARABIC LETTER MARK 061F ; Common # Po ARABIC QUESTION MARK 0640 ; Common # Lm ARABIC TATWEEL 0660..0669 ; Common # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE @@ -136,7 +138,7 @@ 2055..205E ; Common # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS 205F ; Common # Zs MEDIUM MATHEMATICAL SPACE 2060..2064 ; Common # Cf [5] WORD JOINER..INVISIBLE PLUS -206A..206F ; Common # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES +2066..206F ; Common # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES 2070 ; Common # No SUPERSCRIPT ZERO 2074..2079 ; Common # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE 207A..207C ; Common # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN @@ -146,7 +148,7 @@ 208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS 208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS -20A0..20BA ; Common # Sc [27] EURO-CURRENCY SIGN..TURKISH LIRA SIGN +20A0..20BD ; Common # Sc [30] EURO-CURRENCY SIGN..RUBLE SIGN 2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT 2102 ; Common # L& DOUBLE-STRUCK CAPITAL C 2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA @@ -200,7 +202,10 @@ 21D5..21F3 ; Common # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 21F4..22FF ; Common # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP 2300..2307 ; Common # So [8] DIAMETER SIGN..WAVY LINE -2308..230B ; Common # Sm [4] LEFT CEILING..RIGHT FLOOR +2308 ; Common # Ps LEFT CEILING +2309 ; Common # Pe RIGHT CEILING +230A 
; Common # Ps LEFT FLOOR +230B ; Common # Pe RIGHT FLOOR 230C..231F ; Common # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2320..2321 ; Common # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 2322..2328 ; Common # So [7] FROWN..KEYBOARD @@ -212,7 +217,7 @@ 239B..23B3 ; Common # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23DB ; Common # So [40] TOP SQUARE BRACKET..FUSE 23DC..23E1 ; Common # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET -23E2..23F3 ; Common # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND +23E2..23FA ; Common # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD 2400..2426 ; Common # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO 2440..244A ; Common # So [11] OCR HOOK..OCR DOUBLE BACKSLASH 2460..249B ; Common # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP @@ -226,8 +231,7 @@ 25F8..25FF ; Common # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2600..266E ; Common # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 266F ; Common # Sm MUSIC SHARP SIGN -2670..26FF ; Common # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE -2701..2767 ; Common # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET +2670..2767 ; Common # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET 2768 ; Common # Ps MEDIUM LEFT PARENTHESIS ORNAMENT 2769 ; Common # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT 276A ; Common # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT @@ -295,7 +299,11 @@ 2B30..2B44 ; Common # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B45..2B46 ; Common # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Common # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR -2B50..2B59 ; Common # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE +2B4D..2B73 ; Common # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR +2B76..2B95 ; Common # So [32] NORTH WEST 
TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW +2B98..2BB9 ; Common # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX +2BBD..2BC8 ; Common # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED +2BCA..2BD1 ; Common # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN 2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Common # Pf RIGHT SUBSTITUTION BRACKET @@ -329,6 +337,10 @@ 2E2F ; Common # Lm VERTICAL TILDE 2E30..2E39 ; Common # Po [10] RING POINT..TOP HALF SECTION SIGN 2E3A..2E3B ; Common # Pd [2] TWO-EM DASH..THREE-EM DASH +2E3C..2E3F ; Common # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM +2E40 ; Common # Pd DOUBLE HYPHEN +2E41 ; Common # Po REVERSED COMMA +2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID 3000 ; Common # Zs IDEOGRAPHIC SPACE 3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK @@ -392,9 +404,11 @@ A836..A837 ; Common # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK A838 ; Common # Sc NORTH INDIC RUPEE MARK A839 ; Common # So NORTH INDIC QUANTITY MARK -FD3E ; Common # Ps ORNATE LEFT PARENTHESIS -FD3F ; Common # Pe ORNATE RIGHT PARENTHESIS -FDFD ; Common # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM +A92E ; Common # Po KAYAH LI SIGN CWI +A9CF ; Common # Lm JAVANESE PANGRANGKEP +AB5B ; Common # Sk MODIFIER BREVE WITH INVERTED BREVE +FD3E ; Common # Pe ORNATE LEFT PARENTHESIS +FD3F ; Common # Ps ORNATE RIGHT PARENTHESIS FE10..FE16 ; Common # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK FE17 ; Common # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET FE18 ; Common # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET @@ -487,6 +501,8 @@ 10137..1013F ; Common 
# So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT 10190..1019B ; Common # So [12] ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN 101D0..101FC ; Common # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND +102E1..102FB ; Common # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED +1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO 1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 1D129..1D164 ; Common # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE @@ -543,10 +559,10 @@ 1F000..1F02B ; Common # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK 1F030..1F093 ; Common # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06 1F0A0..1F0AE ; Common # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES -1F0B1..1F0BE ; Common # So [14] PLAYING CARD ACE OF HEARTS..PLAYING CARD KING OF HEARTS +1F0B1..1F0BF ; Common # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER 1F0C1..1F0CF ; Common # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER -1F0D1..1F0DF ; Common # So [15] PLAYING CARD ACE OF CLUBS..PLAYING CARD WHITE JOKER -1F100..1F10A ; Common # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA +1F0D1..1F0F5 ; Common # So [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21 +1F100..1F10C ; Common # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO 1F110..1F12E ; Common # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ 1F130..1F16B ; Common # So [60] SQUARED LATIN CAPITAL LETTER A..RAISED MD SIGN 1F170..1F19A ; Common # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS @@ -555,28 +571,29 @@ 1F210..1F23A ; Common # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6 1F240..1F248 ; Common # So [9] TORTOISE SHELL 
BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 1F250..1F251 ; Common # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT -1F300..1F320 ; Common # So [33] CYCLONE..SHOOTING STAR -1F330..1F335 ; Common # So [6] CHESTNUT..CACTUS -1F337..1F37C ; Common # So [70] TULIP..BABY BOTTLE -1F380..1F393 ; Common # So [20] RIBBON..GRADUATION CAP -1F3A0..1F3C4 ; Common # So [37] CAROUSEL HORSE..SURFER -1F3C6..1F3CA ; Common # So [5] TROPHY..SWIMMER -1F3E0..1F3F0 ; Common # So [17] HOUSE BUILDING..EUROPEAN CASTLE -1F400..1F43E ; Common # So [63] RAT..PAW PRINTS -1F440 ; Common # So EYES -1F442..1F4F7 ; Common # So [182] EAR..CAMERA -1F4F9..1F4FC ; Common # So [4] VIDEO CAMERA..VIDEOCASSETTE -1F500..1F53D ; Common # So [62] TWISTED RIGHTWARDS ARROWS..DOWN-POINTING SMALL RED TRIANGLE -1F540..1F543 ; Common # So [4] CIRCLED CROSS POMMEE..NOTCHED LEFT SEMICIRCLE WITH THREE DOTS -1F550..1F567 ; Common # So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY -1F5FB..1F640 ; Common # So [70] MOUNT FUJI..WEARY CAT FACE -1F645..1F64F ; Common # So [11] FACE WITH NO GOOD GESTURE..PERSON WITH FOLDED HANDS -1F680..1F6C5 ; Common # So [70] ROCKET..LEFT LUGGAGE +1F300..1F32C ; Common # So [45] CYCLONE..WIND BLOWING FACE +1F330..1F37D ; Common # So [78] CHESTNUT..FORK AND KNIFE WITH PLATE +1F380..1F3CE ; Common # So [79] RIBBON..RACING CAR +1F3D4..1F3F7 ; Common # So [36] SNOW CAPPED MOUNTAIN..LABEL +1F400..1F4FE ; Common # So [255] RAT..PORTABLE STEREO +1F500..1F54A ; Common # So [75] TWISTED RIGHTWARDS ARROWS..DOVE OF PEACE +1F550..1F579 ; Common # So [42] CLOCK FACE ONE OCLOCK..JOYSTICK +1F57B..1F5A3 ; Common # So [41] LEFT HAND TELEPHONE RECEIVER..BLACK DOWN POINTING BACKHAND INDEX +1F5A5..1F642 ; Common # So [158] DESKTOP COMPUTER..SLIGHTLY SMILING FACE +1F645..1F6CF ; Common # So [139] FACE WITH NO GOOD GESTURE..BED +1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING +1F6F0..1F6F3 ; Common # So [4] 
SATELLITE..PASSENGER SHIP 1F700..1F773 ; Common # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE +1F780..1F7D4 ; Common # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR +1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD +1F810..1F847 ; Common # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW +1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW +1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW +1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS E0001 ; Common # Cf LANGUAGE TAG E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG -# Total code points: 6413 +# Total code points: 7129 # ================================================ @@ -618,16 +635,20 @@ A770 ; Latin # Lm MODIFIER LETTER US A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT -A790..A793 ; Latin # L& [4] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER C WITH BAR -A7A0..A7AA ; Latin # L& [11] LATIN CAPITAL LETTER G WITH OBLIQUE STROKE..LATIN CAPITAL LETTER H WITH HOOK +A790..A7AD ; Latin # L& [30] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER L WITH BELT +A7B0..A7B1 ; Latin # L& [2] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER TURNED T +A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; Latin # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A7FF ; Latin # Lo [5] LATIN EPIGRAPHIC LETTER REVERSED F..LATIN EPIGRAPHIC LETTER ARCHAIC M +AB30..AB5A ; Latin # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL 
LETTER Y WITH SHORT RIGHT LEG +AB5C..AB5F ; Latin # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK +AB64 ; Latin # L& LATIN SMALL LETTER INVERTED ALPHA FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z -# Total code points: 1272 +# Total code points: 1338 # ================================================ @@ -636,6 +657,7 @@ 0376..0377 ; Greek # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; Greek # Lm GREEK YPOGEGRAMMENI 037B..037D ; Greek # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL +037F ; Greek # L& GREEK CAPITAL LETTER YOT 0384 ; Greek # Sk GREEK TONOS 0386 ; Greek # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Greek # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS @@ -675,15 +697,18 @@ 1FF6..1FFC ; Greek # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 1FFD..1FFE ; Greek # Sk [2] GREEK OXIA..GREEK DASIA 2126 ; Greek # L& OHM SIGN +AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA 10140..10174 ; Greek # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10175..10178 ; Greek # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN 10179..10189 ; Greek # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN -1018A ; Greek # No GREEK ZERO SIGN +1018A..1018B ; Greek # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN +1018C ; Greek # So GREEK SINUSOID SIGN +101A0 ; Greek # So GREEK SYMBOL TAU RHO 1D200..1D241 ; Greek # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 1D242..1D244 ; Greek # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1D245 ; Greek # So 
GREEK MUSICAL LEIMMA -# Total code points: 511 +# Total code points: 516 # ================================================ @@ -692,7 +717,7 @@ 0483..0484 ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC PALATALIZATION 0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE 0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN -048A..0527 ; Cyrillic # L& [158] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER SHHA WITH DESCENDER +048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL 1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN 2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS @@ -704,10 +729,11 @@ A674..A67D ; Cyrillic # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A67E ; Cyrillic # Po CYRILLIC KAVYKA A67F ; Cyrillic # Lm CYRILLIC PAYEROK -A680..A697 ; Cyrillic # L& [24] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER SHWE +A680..A69B ; Cyrillic # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O +A69C..A69D ; Cyrillic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A69F ; Cyrillic # Mn COMBINING CYRILLIC LETTER IOTIFIED E -# Total code points: 417 +# Total code points: 431 # ================================================ @@ -716,10 +742,11 @@ 055A..055F ; Armenian # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK 0561..0587 ; Armenian # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN 058A ; Armenian # Pd ARMENIAN HYPHEN +058D..058E ; Armenian # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN 058F ; Armenian # Sc ARMENIAN DRAM SIGN FB13..FB17 ; Armenian # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH -# Total code points: 91 +# Total code points: 93 # 
================================================ @@ -779,9 +806,8 @@ 06FD..06FE ; Arabic # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN 06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V 0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE -08A0 ; Arabic # Lo ARABIC LETTER BEH WITH SMALL V BELOW -08A2..08AC ; Arabic # Lo [11] ARABIC LETTER JEEM WITH TWO DOTS ABOVE..ARABIC LETTER ROHINGYA YEH -08E4..08FE ; Arabic # Mn [27] ARABIC CURLY FATHA..ARABIC DAMMA WITH DOT +08A0..08B2 ; Arabic # Lo [19] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER ZAIN WITH INVERTED V ABOVE +08E4..08FF ; Arabic # Mn [28] ARABIC CURLY FATHA..ARABIC MARK SIDEWAYS NOON GHUNNA FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM @@ -789,6 +815,7 @@ FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FDFC ; Arabic # Sc RIAL SIGN +FDFD ; Arabic # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM 10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS @@ -827,7 +854,7 @@ 1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL -# Total code 
points: 1235 +# Total code points: 1244 # ================================================ @@ -870,17 +897,17 @@ 0966..096F ; Devanagari # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 0970 ; Devanagari # Po DEVANAGARI ABBREVIATION SIGN 0971 ; Devanagari # Lm DEVANAGARI SIGN HIGH SPACING DOT -0972..0977 ; Devanagari # Lo [6] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER UUE -0979..097F ; Devanagari # Lo [7] DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA +0972..097F ; Devanagari # Lo [14] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER BBA A8E0..A8F1 ; Devanagari # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8F2..A8F7 ; Devanagari # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8F8..A8FA ; Devanagari # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET A8FB ; Devanagari # Lo DEVANAGARI HEADSTROKE -# Total code points: 151 +# Total code points: 152 # ================================================ +0980 ; Bengali # Lo BENGALI ANJI 0981 ; Bengali # Mn BENGALI SIGN CANDRABINDU 0982..0983 ; Bengali # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 0985..098C ; Bengali # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L @@ -908,7 +935,7 @@ 09FA ; Bengali # So BENGALI ISSHAR 09FB ; Bengali # Sc BENGALI GANDA MARK -# Total code points: 92 +# Total code points: 93 # ================================================ @@ -1025,12 +1052,12 @@ # ================================================ +0C00 ; Telugu # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; Telugu # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C05..0C0C ; Telugu # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; Telugu # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; Telugu # Lo [23] TELUGU LETTER O..TELUGU LETTER NA -0C2A..0C33 ; Telugu # Lo [10] TELUGU LETTER PA..TELUGU LETTER LLA -0C35..0C39 ; Telugu # Lo [5] TELUGU LETTER VA..TELUGU LETTER HA +0C2A..0C39 ; Telugu # Lo [16] TELUGU LETTER PA..TELUGU 
LETTER HA 0C3D ; Telugu # Lo TELUGU SIGN AVAGRAHA 0C3E..0C40 ; Telugu # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; Telugu # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR @@ -1044,10 +1071,11 @@ 0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR 0C7F ; Telugu # So TELUGU SIGN TUUMU -# Total code points: 93 +# Total code points: 95 # ================================================ +0C81 ; Kannada # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; Kannada # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0C85..0C8C ; Kannada # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; Kannada # Lo [3] KANNADA LETTER E..KANNADA LETTER AI @@ -1070,10 +1098,11 @@ 0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA -# Total code points: 86 +# Total code points: 87 # ================================================ +0D01 ; Malayalam # Mn MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; Malayalam # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D05..0D0C ; Malayalam # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; Malayalam # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI @@ -1093,7 +1122,7 @@ 0D79 ; Malayalam # So MALAYALAM DATE MARK 0D7A..0D7F ; Malayalam # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K -# Total code points: 98 +# Total code points: 99 # ================================================ @@ -1108,10 +1137,12 @@ 0DD2..0DD4 ; Sinhala # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Sinhala # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DD8..0DDF ; Sinhala # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA +0DE6..0DEF ; Sinhala # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE 0DF2..0DF3 ; Sinhala # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL 
SIGN DIGA GAYANUKITTA 0DF4 ; Sinhala # Po SINHALA PUNCTUATION KUNDDALIYA +111E1..111F4 ; Sinhala # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND -# Total code points: 80 +# Total code points: 110 # ================================================ @@ -1234,14 +1265,23 @@ 109A..109C ; Myanmar # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109D ; Myanmar # Mn MYANMAR VOWEL SIGN AITON AI 109E..109F ; Myanmar # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION +A9E0..A9E4 ; Myanmar # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA +A9E5 ; Myanmar # Mn MYANMAR SIGN SHAN SAW +A9E6 ; Myanmar # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION +A9E7..A9EF ; Myanmar # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA +A9F0..A9F9 ; Myanmar # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE +A9FA..A9FE ; Myanmar # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA60..AA6F ; Myanmar # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; Myanmar # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; Myanmar # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA77..AA79 ; Myanmar # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO AA7A ; Myanmar # Lo MYANMAR LETTER AITON RA AA7B ; Myanmar # Mc MYANMAR SIGN PAO KAREN TONE +AA7C ; Myanmar # Mn MYANMAR SIGN TAI LAING TONE-2 +AA7D ; Myanmar # Mc MYANMAR SIGN TAI LAING TONE-5 +AA7E..AA7F ; Myanmar # Lo [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA -# Total code points: 188 +# Total code points: 223 # ================================================ @@ -1345,8 +1385,9 @@ 16A0..16EA ; Runic # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EE..16F0 ; Runic # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL +16F1..16F8 ; Runic # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC -# Total code points: 78 +# Total code points: 86 # 
================================================ @@ -1377,7 +1418,7 @@ 1806 ; Mongolian # Pd MONGOLIAN TODO SOFT HYPHEN 1807..180A ; Mongolian # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU 180B..180D ; Mongolian # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE -180E ; Mongolian # Zs MONGOLIAN VOWEL SEPARATOR +180E ; Mongolian # Cf MONGOLIAN VOWEL SEPARATOR 1810..1819 ; Mongolian # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN @@ -1452,10 +1493,10 @@ # ================================================ -10300..1031E ; Old_Italic # Lo [31] OLD ITALIC LETTER A..OLD ITALIC LETTER UU +10300..1031F ; Old_Italic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 10320..10323 ; Old_Italic # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY -# Total code points: 35 +# Total code points: 36 # ================================================ @@ -1479,12 +1520,15 @@ 064B..0655 ; Inherited # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 0670 ; Inherited # Mn ARABIC LETTER SUPERSCRIPT ALEF 0951..0952 ; Inherited # Mn [2] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI STRESS SIGN ANUDATTA +1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW +1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY 1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Inherited # Mn VEDIC SIGN TIRYAK 1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE -1DC0..1DE6 ; Inherited # Mn [39] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER Z +1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 
+1DC0..1DF5 ; Inherited # Mn [54] COMBINING DOTTED GRAVE ACCENT..COMBINING UP TACK ABOVE 1DFC..1DFF ; Inherited # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE @@ -1495,15 +1539,16 @@ 302A..302D ; Inherited # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 3099..309A ; Inherited # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK FE00..FE0F ; Inherited # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 -FE20..FE26 ; Inherited # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON +FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW 101FD ; Inherited # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE +102E0 ; Inherited # Mn COPTIC EPACT THOUSANDS MARK 1D167..1D169 ; Inherited # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D17B..1D182 ; Inherited # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Inherited # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 523 +# Total code points: 563 # ================================================ @@ -1537,7 +1582,7 @@ # ================================================ -1900..191C ; Limbu # Lo [29] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER HA +1900..191E ; Limbu # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1920..1922 ; Limbu # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1923..1926 ; Limbu # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1927..1928 ; Limbu # Mn [2] LIMBU 
VOWEL SIGN E..LIMBU VOWEL SIGN O @@ -1550,7 +1595,7 @@ 1944..1945 ; Limbu # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 1946..194F ; Limbu # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE -# Total code points: 66 +# Total code points: 68 # ================================================ @@ -1612,7 +1657,8 @@ 1A00..1A16 ; Buginese # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A17..1A18 ; Buginese # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U -1A19..1A1B ; Buginese # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE +1A19..1A1A ; Buginese # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O +1A1B ; Buginese # Mn BUGINESE VOWEL SIGN AE 1A1E..1A1F ; Buginese # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION # Total code points: 30 @@ -1724,11 +1770,11 @@ # ================================================ -12000..1236E ; Cuneiform # Lo [879] CUNEIFORM SIGN A..CUNEIFORM SIGN ZUM -12400..12462 ; Cuneiform # Nl [99] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER -12470..12473 ; Cuneiform # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON +12000..12398 ; Cuneiform # Lo [921] CUNEIFORM SIGN A..CUNEIFORM SIGN UM TIMES ME +12400..1246E ; Cuneiform # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM +12470..12474 ; Cuneiform # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON -# Total code points: 982 +# Total code points: 1037 # ================================================ @@ -1767,8 +1813,7 @@ 1BA6..1BA7 ; Sundanese # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; Sundanese # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAA ; Sundanese # Mc SUNDANESE SIGN PAMAAEH -1BAB ; Sundanese # Mn SUNDANESE SIGN VIRAMA -1BAC..1BAD ; Sundanese # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA 
+1BAB..1BAD ; Sundanese # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BAE..1BAF ; Sundanese # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BB0..1BB9 ; Sundanese # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE 1BBA..1BBF ; Sundanese # Lo [6] SUNDANESE AVAGRAHA..SUNDANESE LETTER FINAL M @@ -1825,9 +1870,9 @@ A900..A909 ; Kayah_Li # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE A90A..A925 ; Kayah_Li # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A926..A92D ; Kayah_Li # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU -A92E..A92F ; Kayah_Li # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA +A92F ; Kayah_Li # Po KAYAH LI SIGN SHYA -# Total code points: 48 +# Total code points: 47 # ================================================ @@ -1974,11 +2019,10 @@ A9BC ; Javanese # Mn JAVANESE VOWEL SIGN PEPET A9BD..A9C0 ; Javanese # Mc [4] JAVANESE CONSONANT SIGN KERET..JAVANESE PANGKON A9C1..A9CD ; Javanese # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH -A9CF ; Javanese # Lm JAVANESE PANGRANGKEP A9D0..A9D9 ; Javanese # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE A9DE..A9DF ; Javanese # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN -# Total code points: 91 +# Total code points: 90 # ================================================ @@ -2080,8 +2124,9 @@ 11047..1104D ; Brahmi # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 11052..11065 ; Brahmi # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND 11066..1106F ; Brahmi # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE +1107F ; Brahmi # Mn BRAHMI NUMBER JOINER -# Total code points: 108 +# Total code points: 109 # ================================================ @@ -2136,9 +2181,11 @@ 111BF..111C0 ; Sharada # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA 111C1..111C4 ; Sharada # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111C5..111C8 ; Sharada # Po [4] SHARADA DANDA..SHARADA SEPARATOR +111CD ; Sharada # Po SHARADA SUTRA MARK 111D0..111D9 ; Sharada # Nd [10] 
SHARADA DIGIT ZERO..SHARADA DIGIT NINE +111DA ; Sharada # Lo SHARADA EKAM -# Total code points: 83 +# Total code points: 85 # ================================================ @@ -2161,4 +2208,244 @@ # Total code points: 66 +# ================================================ + +10530..10563 ; Caucasian_Albanian # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW +1056F ; Caucasian_Albanian # Po CAUCASIAN ALBANIAN CITATION MARK + +# Total code points: 53 + +# ================================================ + +16AD0..16AED ; Bassa_Vah # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I +16AF0..16AF4 ; Bassa_Vah # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE +16AF5 ; Bassa_Vah # Po BASSA VAH FULL STOP + +# Total code points: 36 + +# ================================================ + +1BC00..1BC6A ; Duployan # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M +1BC70..1BC7C ; Duployan # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK +1BC80..1BC88 ; Duployan # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL +1BC90..1BC99 ; Duployan # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW +1BC9C ; Duployan # So DUPLOYAN SIGN O WITH CROSS +1BC9D..1BC9E ; Duployan # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK +1BC9F ; Duployan # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP + +# Total code points: 143 + +# ================================================ + +10500..10527 ; Elbasan # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE + +# Total code points: 40 + +# ================================================ + +11301 ; Grantha # Mn GRANTHA SIGN CANDRABINDU +11302..11303 ; Grantha # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA +11305..1130C ; Grantha # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L +1130F..11310 ; Grantha # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI +11313..11328 ; Grantha # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA +1132A..11330 ; 
Grantha # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA +11332..11333 ; Grantha # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA +11335..11339 ; Grantha # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA +1133C ; Grantha # Mn GRANTHA SIGN NUKTA +1133D ; Grantha # Lo GRANTHA SIGN AVAGRAHA +1133E..1133F ; Grantha # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I +11340 ; Grantha # Mn GRANTHA VOWEL SIGN II +11341..11344 ; Grantha # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR +11347..11348 ; Grantha # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI +1134B..1134D ; Grantha # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA +11357 ; Grantha # Mc GRANTHA AU LENGTH MARK +1135D..11361 ; Grantha # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL +11362..11363 ; Grantha # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL +11366..1136C ; Grantha # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX +11370..11374 ; Grantha # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA + +# Total code points: 83 + +# ================================================ + +16B00..16B2F ; Pahawh_Hmong # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU +16B30..16B36 ; Pahawh_Hmong # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM +16B37..16B3B ; Pahawh_Hmong # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM +16B3C..16B3F ; Pahawh_Hmong # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB +16B40..16B43 ; Pahawh_Hmong # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM +16B44 ; Pahawh_Hmong # Po PAHAWH HMONG SIGN XAUS +16B45 ; Pahawh_Hmong # So PAHAWH HMONG SIGN CIM TSOV ROG +16B50..16B59 ; Pahawh_Hmong # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE +16B5B..16B61 ; Pahawh_Hmong # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS +16B63..16B77 ; Pahawh_Hmong # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS +16B7D..16B8F ; 
Pahawh_Hmong # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ + +# Total code points: 127 + +# ================================================ + +11200..11211 ; Khojki # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA +11213..1122B ; Khojki # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA +1122C..1122E ; Khojki # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II +1122F..11231 ; Khojki # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI +11232..11233 ; Khojki # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU +11234 ; Khojki # Mn KHOJKI SIGN ANUSVARA +11235 ; Khojki # Mc KHOJKI SIGN VIRAMA +11236..11237 ; Khojki # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA +11238..1123D ; Khojki # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN + +# Total code points: 61 + +# ================================================ + +10600..10736 ; Linear_A # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 +10740..10755 ; Linear_A # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE +10760..10767 ; Linear_A # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 + +# Total code points: 341 + +# ================================================ + +11150..11172 ; Mahajani # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA +11173 ; Mahajani # Mn MAHAJANI SIGN NUKTA +11174..11175 ; Mahajani # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK +11176 ; Mahajani # Lo MAHAJANI LIGATURE SHRI + +# Total code points: 39 + +# ================================================ + +10AC0..10AC7 ; Manichaean # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW +10AC8 ; Manichaean # So MANICHAEAN SIGN UD +10AC9..10AE4 ; Manichaean # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW +10AE5..10AE6 ; Manichaean # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW +10AEB..10AEF ; Manichaean # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED +10AF0..10AF6 ; Manichaean # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER + +# Total code 
points: 51 + +# ================================================ + +1E800..1E8C4 ; Mende_Kikakui # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON +1E8C7..1E8CF ; Mende_Kikakui # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE +1E8D0..1E8D6 ; Mende_Kikakui # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS + +# Total code points: 213 + +# ================================================ + +11600..1162F ; Modi # Lo [48] MODI LETTER A..MODI LETTER LLA +11630..11632 ; Modi # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II +11633..1163A ; Modi # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI +1163B..1163C ; Modi # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU +1163D ; Modi # Mn MODI SIGN ANUSVARA +1163E ; Modi # Mc MODI SIGN VISARGA +1163F..11640 ; Modi # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA +11641..11643 ; Modi # Po [3] MODI DANDA..MODI ABBREVIATION SIGN +11644 ; Modi # Lo MODI SIGN HUVA +11650..11659 ; Modi # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE + +# Total code points: 79 + +# ================================================ + +16A40..16A5E ; Mro # Lo [31] MRO LETTER TA..MRO LETTER TEK +16A60..16A69 ; Mro # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE +16A6E..16A6F ; Mro # Po [2] MRO DANDA..MRO DOUBLE DANDA + +# Total code points: 43 + +# ================================================ + +10A80..10A9C ; Old_North_Arabian # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH +10A9D..10A9F ; Old_North_Arabian # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY + +# Total code points: 32 + +# ================================================ + +10880..1089E ; Nabataean # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW +108A7..108AF ; Nabataean # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED + +# Total code points: 40 + +# ================================================ + +10860..10876 ; Palmyrene # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE 
LETTER TAW +10877..10878 ; Palmyrene # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON +10879..1087F ; Palmyrene # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY + +# Total code points: 32 + +# ================================================ + +11AC0..11AF8 ; Pau_Cin_Hau # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL + +# Total code points: 57 + +# ================================================ + +10350..10375 ; Old_Permic # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA +10376..1037A ; Old_Permic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII + +# Total code points: 43 + +# ================================================ + +10B80..10B91 ; Psalter_Pahlavi # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW +10B99..10B9C ; Psalter_Pahlavi # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT +10BA9..10BAF ; Psalter_Pahlavi # No [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED + +# Total code points: 29 + +# ================================================ + +11580..115AE ; Siddham # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA +115AF..115B1 ; Siddham # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II +115B2..115B5 ; Siddham # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR +115B8..115BB ; Siddham # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU +115BC..115BD ; Siddham # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA +115BE ; Siddham # Mc SIDDHAM SIGN VISARGA +115BF..115C0 ; Siddham # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA +115C1..115C9 ; Siddham # Po [9] SIDDHAM SIGN SIDDHAM..SIDDHAM END OF TEXT MARK + +# Total code points: 72 + +# ================================================ + +112B0..112DE ; Khudawadi # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA +112DF ; Khudawadi # Mn KHUDAWADI SIGN ANUSVARA +112E0..112E2 ; Khudawadi # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II +112E3..112EA ; 
Khudawadi # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA +112F0..112F9 ; Khudawadi # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE + +# Total code points: 69 + +# ================================================ + +11480..114AF ; Tirhuta # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA +114B0..114B2 ; Tirhuta # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II +114B3..114B8 ; Tirhuta # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL +114B9 ; Tirhuta # Mc TIRHUTA VOWEL SIGN E +114BA ; Tirhuta # Mn TIRHUTA VOWEL SIGN SHORT E +114BB..114BE ; Tirhuta # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU +114BF..114C0 ; Tirhuta # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA +114C1 ; Tirhuta # Mc TIRHUTA SIGN VISARGA +114C2..114C3 ; Tirhuta # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA +114C4..114C5 ; Tirhuta # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG +114C6 ; Tirhuta # Po TIRHUTA ABBREVIATION SIGN +114C7 ; Tirhuta # Lo TIRHUTA OM +114D0..114D9 ; Tirhuta # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE + +# Total code points: 82 + +# ================================================ + +118A0..118DF ; Warang_Citi # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO +118E0..118E9 ; Warang_Citi # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE +118EA..118F2 ; Warang_Citi # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY +118FF ; Warang_Citi # Lo WARANG CITI OM + +# Total code points: 84 + # EOF --- old/jdk/make/data/unicodedata/SpecialCasing.txt 2015-07-13 16:11:35.000000000 +0900 +++ new/jdk/make/data/unicodedata/SpecialCasing.txt 2015-07-13 16:11:35.000000000 +0900 @@ -1,18 +1,25 @@ -# SpecialCasing-6.2.0.txt -# Date: 2012-05-23, 20:35:15 GMT [MD] +# SpecialCasing-7.0.0.txt +# Date: 2014-03-18, 07:18:02 GMT [MD] # # Unicode Character Database -# Copyright (c) 1991-2012 Unicode, Inc. +# Copyright (c) 1991-2014 Unicode, Inc. 
# For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see http://www.unicode.org/reports/tr44/ # -# Special Casing Properties +# Special Casing # -# This file is a supplement to the UnicodeData file. -# It contains additional information about the casing of Unicode characters. -# (For compatibility, the UnicodeData.txt file only contains case mappings for -# characters where they are 1-1, and independent of context and language. -# For more information, see the discussion of Case Mappings in the Unicode Standard. +# This file is a supplement to the UnicodeData.txt file. It does not define any +# properties, but rather provides additional information about the casing of +# Unicode characters, for situations when casing incurs a change in string length +# or is dependent on context or locale. For compatibility, the UnicodeData.txt +# file only contains simple case mappings for characters where they are one-to-one +# and independent of context and language. The data in this file, combined with +# the simple case mappings in UnicodeData.txt, defines the full case mappings +# Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc). +# +# Note that the preferred mechanism for defining tailored casing operations is +# the Unicode Common Locale Data Repository (CLDR). For more information, see the +# discussion of case mappings and case algorithms in the Unicode Standard. # # All code points not listed in this file that do not have a simple case mappings # in UnicodeData.txt map to themselves. @@ -21,16 +28,17 @@ # ================================================================================ # The entries in this file are in the following machine-readable format: # -# ; ; ; <upper> ; (<condition_list> ;)? # <comment> +# <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> # -# <code>, <lower>, <title>, and <upper> provide character values in hex. 
If there is more -# than one character, they are separated by spaces. Other than as used to separate -# elements, spaces are to be ignored. +# <code>, <lower>, <title>, and <upper> provide the respective full case mappings +# of <code>, expressed as character values in hex. If there is more than one character, +# they are separated by spaces. Other than as used to separate elements, spaces are +# to be ignored. # # The <condition_list> is optional. Where present, it consists of one or more language IDs -# or contexts, separated by spaces. In these conditions: +# or casing contexts, separated by spaces. In these conditions: # - A condition list overrides the normal behavior if all of the listed conditions are true. -# - The context is always the context of the characters in the original string, +# - The casing context is always the context of the characters in the original string, # NOT in the resulting string. # - Case distinctions in the condition list are not significant. # - Conditions preceded by "Not_" represent the negation of the condition. @@ -38,18 +46,14 @@ # # A language ID is defined by BCP 47, with '-' and '_' treated equivalently. # -# A context for a character C is defined by Section 3.13 Default Case -# Operations, of The Unicode Standard, Version 5.0. -# (This is identical to the context defined by Unicode 4.1.0, -# as specified in http://www.unicode.org/versions/Unicode4.1.0/) +# A casing context for a character is defined by Section 3.13 Default Case Algorithms +# of The Unicode Standard. 
# # Parsers of this file must be prepared to deal with future additions to this format: # * Additional contexts # * Additional fields # ================================================================================ -# @missing: 0000..10FFFF; <slc>; <stc>; <suc>; - # ================================================================================ # Unconditional mappings # ================================================================================ @@ -114,7 +118,7 @@ # This process can be achieved by first transforming the text to NFC before casing. # E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA> -# The following cases are already in the UnicodeData file, so are only commented here. +# The following cases are already in the UnicodeData.txt file, so are only commented here. # 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI @@ -205,7 +209,7 @@ 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA -# Note: the following cases for non-final are already in the UnicodeData file. +# Note: the following cases for non-final are already in the UnicodeData.txt file. # 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA # 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA @@ -268,7 +272,7 @@ 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I -# Note: the following case is already in the UnicodeData file. +# Note: the following case is already in the UnicodeData.txt file. 
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I --- old/jdk/make/data/unicodedata/UnicodeData.txt 2015-07-13 16:11:36.000000000 +0900 +++ new/jdk/make/data/unicodedata/UnicodeData.txt 2015-07-13 16:11:35.000000000 +0900 @@ -602,12 +602,12 @@ 0259;LATIN SMALL LETTER SCHWA;Ll;0;L;;;;;N;;;018F;;018F 025A;LATIN SMALL LETTER SCHWA WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER SCHWA HOOK;;;; 025B;LATIN SMALL LETTER OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER EPSILON;;0190;;0190 -025C;LATIN SMALL LETTER REVERSED OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED EPSILON;;;; +025C;LATIN SMALL LETTER REVERSED OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED EPSILON;;A7AB;;A7AB 025D;LATIN SMALL LETTER REVERSED OPEN E WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED EPSILON HOOK;;;; 025E;LATIN SMALL LETTER CLOSED REVERSED OPEN E;Ll;0;L;;;;;N;LATIN SMALL LETTER CLOSED REVERSED EPSILON;;;; 025F;LATIN SMALL LETTER DOTLESS J WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER DOTLESS J BAR;;;; 0260;LATIN SMALL LETTER G WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER G HOOK;;0193;;0193 -0261;LATIN SMALL LETTER SCRIPT G;Ll;0;L;;;;;N;;;;; +0261;LATIN SMALL LETTER SCRIPT G;Ll;0;L;;;;;N;;;A7AC;;A7AC 0262;LATIN LETTER SMALL CAPITAL G;Ll;0;L;;;;;N;;;;; 0263;LATIN SMALL LETTER GAMMA;Ll;0;L;;;;;N;;;0194;;0194 0264;LATIN SMALL LETTER RAMS HORN;Ll;0;L;;;;;N;LATIN SMALL LETTER BABY GAMMA;;;; @@ -618,7 +618,7 @@ 0269;LATIN SMALL LETTER IOTA;Ll;0;L;;;;;N;;;0196;;0196 026A;LATIN LETTER SMALL CAPITAL I;Ll;0;L;;;;;N;;;;; 026B;LATIN SMALL LETTER L WITH MIDDLE TILDE;Ll;0;L;;;;;N;;;2C62;;2C62 -026C;LATIN SMALL LETTER L WITH BELT;Ll;0;L;;;;;N;LATIN SMALL LETTER L BELT;;;; +026C;LATIN SMALL LETTER L WITH BELT;Ll;0;L;;;;;N;LATIN SMALL LETTER L BELT;;A7AD;;A7AD 026D;LATIN SMALL LETTER L WITH RETROFLEX HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER L RETROFLEX HOOK;;;; 026E;LATIN SMALL LETTER LEZH;Ll;0;L;;;;;N;LATIN SMALL LETTER L YOGH;;;; 026F;LATIN SMALL LETTER TURNED M;Ll;0;L;;;;;N;;;019C;;019C @@ -645,7 +645,7 @@ 
0284;LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER DOTLESS J BAR HOOK;;;; 0285;LATIN SMALL LETTER SQUAT REVERSED ESH;Ll;0;L;;;;;N;;;;; 0286;LATIN SMALL LETTER ESH WITH CURL;Ll;0;L;;;;;N;LATIN SMALL LETTER ESH CURL;;;; -0287;LATIN SMALL LETTER TURNED T;Ll;0;L;;;;;N;;;;; +0287;LATIN SMALL LETTER TURNED T;Ll;0;L;;;;;N;;;A7B1;;A7B1 0288;LATIN SMALL LETTER T WITH RETROFLEX HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T RETROFLEX HOOK;;01AE;;01AE 0289;LATIN SMALL LETTER U BAR;Ll;0;L;;;;;N;;;0244;;0244 028A;LATIN SMALL LETTER UPSILON;Ll;0;L;;;;;N;;;01B1;;01B1 @@ -668,7 +668,7 @@ 029B;LATIN LETTER SMALL CAPITAL G WITH HOOK;Ll;0;L;;;;;N;LATIN LETTER SMALL CAPITAL G HOOK;;;; 029C;LATIN LETTER SMALL CAPITAL H;Ll;0;L;;;;;N;;;;; 029D;LATIN SMALL LETTER J WITH CROSSED-TAIL;Ll;0;L;;;;;N;LATIN SMALL LETTER CROSSED-TAIL J;;;; -029E;LATIN SMALL LETTER TURNED K;Ll;0;L;;;;;N;;;;; +029E;LATIN SMALL LETTER TURNED K;Ll;0;L;;;;;N;;;A7B0;;A7B0 029F;LATIN LETTER SMALL CAPITAL L;Ll;0;L;;;;;N;;;;; 02A0;LATIN SMALL LETTER Q WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER Q HOOK;;;; 02A1;LATIN LETTER GLOTTAL STOP WITH STROKE;Ll;0;L;;;;;N;LATIN LETTER GLOTTAL STOP BAR;;;; @@ -891,6 +891,7 @@ 037C;GREEK SMALL DOTTED LUNATE SIGMA SYMBOL;Ll;0;L;;;;;N;;;03FE;;03FE 037D;GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL;Ll;0;L;;;;;N;;;03FF;;03FF 037E;GREEK QUESTION MARK;Po;0;ON;003B;;;;N;;;;; +037F;GREEK CAPITAL LETTER YOT;Lu;0;L;;;;;N;;;;03F3; 0384;GREEK TONOS;Sk;0;ON;<compat> 0020 0301;;;;N;GREEK SPACING TONOS;;;; 0385;GREEK DIALYTIKA TONOS;Sk;0;ON;00A8 0301;;;;N;GREEK SPACING DIAERESIS TONOS;;;; 0386;GREEK CAPITAL LETTER ALPHA WITH TONOS;Lu;0;L;0391 0301;;;;N;GREEK CAPITAL LETTER ALPHA TONOS;;;03AC; @@ -999,7 +1000,7 @@ 03F0;GREEK KAPPA SYMBOL;Ll;0;L;<compat> 03BA;;;;N;GREEK SMALL LETTER SCRIPT KAPPA;;039A;;039A 03F1;GREEK RHO SYMBOL;Ll;0;L;<compat> 03C1;;;;N;GREEK SMALL LETTER TAILED RHO;;03A1;;03A1 03F2;GREEK LUNATE SIGMA SYMBOL;Ll;0;L;<compat> 03C2;;;;N;GREEK SMALL LETTER 
LUNATE SIGMA;;03F9;;03F9 -03F3;GREEK LETTER YOT;Ll;0;L;;;;;N;;;;; +03F3;GREEK LETTER YOT;Ll;0;L;;;;;N;;;037F;;037F 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8; 03F5;GREEK LUNATE EPSILON SYMBOL;Ll;0;L;<compat> 03B5;;;;N;;;0395;;0395 03F6;GREEK REVERSED LUNATE EPSILON SYMBOL;Sm;0;ON;;;;;N;;;;; @@ -1308,6 +1309,14 @@ 0525;CYRILLIC SMALL LETTER PE WITH DESCENDER;Ll;0;L;;;;;N;;;0524;;0524 0526;CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER;Lu;0;L;;;;;N;;;;0527; 0527;CYRILLIC SMALL LETTER SHHA WITH DESCENDER;Ll;0;L;;;;;N;;;0526;;0526 +0528;CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK;Lu;0;L;;;;;N;;;;0529; +0529;CYRILLIC SMALL LETTER EN WITH LEFT HOOK;Ll;0;L;;;;;N;;;0528;;0528 +052A;CYRILLIC CAPITAL LETTER DZZHE;Lu;0;L;;;;;N;;;;052B; +052B;CYRILLIC SMALL LETTER DZZHE;Ll;0;L;;;;;N;;;052A;;052A +052C;CYRILLIC CAPITAL LETTER DCHE;Lu;0;L;;;;;N;;;;052D; +052D;CYRILLIC SMALL LETTER DCHE;Ll;0;L;;;;;N;;;052C;;052C +052E;CYRILLIC CAPITAL LETTER EL WITH DESCENDER;Lu;0;L;;;;;N;;;;052F; +052F;CYRILLIC SMALL LETTER EL WITH DESCENDER;Ll;0;L;;;;;N;;;052E;;052E 0531;ARMENIAN CAPITAL LETTER AYB;Lu;0;L;;;;;N;;;;0561; 0532;ARMENIAN CAPITAL LETTER BEN;Lu;0;L;;;;;N;;;;0562; 0533;ARMENIAN CAPITAL LETTER GIM;Lu;0;L;;;;;N;;;;0563; @@ -1394,6 +1403,8 @@ 0587;ARMENIAN SMALL LIGATURE ECH YIWN;Ll;0;L;<compat> 0565 0582;;;;N;;;;; 0589;ARMENIAN FULL STOP;Po;0;L;;;;;N;ARMENIAN PERIOD;;;; 058A;ARMENIAN HYPHEN;Pd;0;ON;;;;;N;;;;; +058D;RIGHT-FACING ARMENIAN ETERNITY SIGN;So;0;ON;;;;;N;;;;; +058E;LEFT-FACING ARMENIAN ETERNITY SIGN;So;0;ON;;;;;N;;;;; 058F;ARMENIAN DRAM SIGN;Sc;0;ET;;;;;N;;;;; 0591;HEBREW ACCENT ETNAHTA;Mn;220;NSM;;;;;N;;;;; 0592;HEBREW ACCENT SEGOL;Mn;230;NSM;;;;;N;;;;; @@ -1487,6 +1498,7 @@ 0602;ARABIC FOOTNOTE MARKER;Cf;0;AN;;;;;N;;;;; 0603;ARABIC SIGN SAFHA;Cf;0;AN;;;;;N;;;;; 0604;ARABIC SIGN SAMVAT;Cf;0;AN;;;;;N;;;;; +0605;ARABIC NUMBER MARK ABOVE;Cf;0;AN;;;;;N;;;;; 0606;ARABIC-INDIC CUBE ROOT;Sm;0;ON;;;;;N;;;;; 0607;ARABIC-INDIC FOURTH ROOT;Sm;0;ON;;;;;N;;;;; 
0608;ARABIC RAY;Sm;0;AL;;;;;N;;;;; @@ -1509,6 +1521,7 @@ 0619;ARABIC SMALL DAMMA;Mn;31;NSM;;;;;N;;;;; 061A;ARABIC SMALL KASRA;Mn;32;NSM;;;;;N;;;;; 061B;ARABIC SEMICOLON;Po;0;AL;;;;;N;;;;; +061C;ARABIC LETTER MARK;Cf;0;AL;;;;;N;;;;; 061E;ARABIC TRIPLE DOT PUNCTUATION MARK;Po;0;AL;;;;;N;;;;; 061F;ARABIC QUESTION MARK;Po;0;AL;;;;;N;;;;; 0620;ARABIC LETTER KASHMIRI YEH;Lo;0;AL;;;;;N;;;;; @@ -2060,6 +2073,7 @@ 085B;MANDAIC GEMINATION MARK;Mn;220;NSM;;;;;N;;;;; 085E;MANDAIC PUNCTUATION;Po;0;R;;;;;N;;;;; 08A0;ARABIC LETTER BEH WITH SMALL V BELOW;Lo;0;AL;;;;;N;;;;; +08A1;ARABIC LETTER BEH WITH HAMZA ABOVE;Lo;0;AL;;;;;N;;;;; 08A2;ARABIC LETTER JEEM WITH TWO DOTS ABOVE;Lo;0;AL;;;;;N;;;;; 08A3;ARABIC LETTER TAH WITH TWO DOTS ABOVE;Lo;0;AL;;;;;N;;;;; 08A4;ARABIC LETTER FEH WITH DOT BELOW AND THREE DOTS ABOVE;Lo;0;AL;;;;;N;;;;; @@ -2071,6 +2085,12 @@ 08AA;ARABIC LETTER REH WITH LOOP;Lo;0;AL;;;;;N;;;;; 08AB;ARABIC LETTER WAW WITH DOT WITHIN;Lo;0;AL;;;;;N;;;;; 08AC;ARABIC LETTER ROHINGYA YEH;Lo;0;AL;;;;;N;;;;; +08AD;ARABIC LETTER LOW ALEF;Lo;0;AL;;;;;N;;;;; +08AE;ARABIC LETTER DAL WITH THREE DOTS BELOW;Lo;0;AL;;;;;N;;;;; +08AF;ARABIC LETTER SAD WITH THREE DOTS BELOW;Lo;0;AL;;;;;N;;;;; +08B0;ARABIC LETTER GAF WITH INVERTED STROKE;Lo;0;AL;;;;;N;;;;; +08B1;ARABIC LETTER STRAIGHT WAW;Lo;0;AL;;;;;N;;;;; +08B2;ARABIC LETTER ZAIN WITH INVERTED V ABOVE;Lo;0;AL;;;;;N;;;;; 08E4;ARABIC CURLY FATHA;Mn;230;NSM;;;;;N;;;;; 08E5;ARABIC CURLY DAMMA;Mn;230;NSM;;;;;N;;;;; 08E6;ARABIC CURLY KASRA;Mn;220;NSM;;;;;N;;;;; @@ -2098,6 +2118,7 @@ 08FC;ARABIC DOUBLE RIGHT ARROWHEAD ABOVE WITH DOT;Mn;230;NSM;;;;;N;;;;; 08FD;ARABIC RIGHT ARROWHEAD ABOVE WITH DOT;Mn;230;NSM;;;;;N;;;;; 08FE;ARABIC DAMMA WITH DOT;Mn;230;NSM;;;;;N;;;;; +08FF;ARABIC MARK SIDEWAYS NOON GHUNNA;Mn;230;NSM;;;;;N;;;;; 0900;DEVANAGARI SIGN INVERTED CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 0901;DEVANAGARI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 0902;DEVANAGARI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; @@ -2218,6 +2239,7 @@ 0975;DEVANAGARI LETTER 
AW;Lo;0;L;;;;;N;;;;; 0976;DEVANAGARI LETTER UE;Lo;0;L;;;;;N;;;;; 0977;DEVANAGARI LETTER UUE;Lo;0;L;;;;;N;;;;; +0978;DEVANAGARI LETTER MARWARI DDA;Lo;0;L;;;;;N;;;;; 0979;DEVANAGARI LETTER ZHA;Lo;0;L;;;;;N;;;;; 097A;DEVANAGARI LETTER HEAVY YA;Lo;0;L;;;;;N;;;;; 097B;DEVANAGARI LETTER GGA;Lo;0;L;;;;;N;;;;; @@ -2225,6 +2247,7 @@ 097D;DEVANAGARI LETTER GLOTTAL STOP;Lo;0;L;;;;;N;;;;; 097E;DEVANAGARI LETTER DDDA;Lo;0;L;;;;;N;;;;; 097F;DEVANAGARI LETTER BBA;Lo;0;L;;;;;N;;;;; +0980;BENGALI ANJI;Lo;0;L;;;;;N;;;;; 0981;BENGALI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 0982;BENGALI SIGN ANUSVARA;Mc;0;L;;;;;N;;;;; 0983;BENGALI SIGN VISARGA;Mc;0;L;;;;;N;;;;; @@ -2642,6 +2665,7 @@ 0BF8;TAMIL AS ABOVE SIGN;So;0;ON;;;;;N;;;;; 0BF9;TAMIL RUPEE SIGN;Sc;0;ET;;;;;N;;;;; 0BFA;TAMIL NUMBER SIGN;So;0;ON;;;;;N;;;;; +0C00;TELUGU SIGN COMBINING CANDRABINDU ABOVE;Mn;0;NSM;;;;;N;;;;; 0C01;TELUGU SIGN CANDRABINDU;Mc;0;L;;;;;N;;;;; 0C02;TELUGU SIGN ANUSVARA;Mc;0;L;;;;;N;;;;; 0C03;TELUGU SIGN VISARGA;Mc;0;L;;;;;N;;;;; @@ -2689,6 +2713,7 @@ 0C31;TELUGU LETTER RRA;Lo;0;L;;;;;N;;;;; 0C32;TELUGU LETTER LA;Lo;0;L;;;;;N;;;;; 0C33;TELUGU LETTER LLA;Lo;0;L;;;;;N;;;;; +0C34;TELUGU LETTER LLLA;Lo;0;L;;;;;N;;;;; 0C35;TELUGU LETTER VA;Lo;0;L;;;;;N;;;;; 0C36;TELUGU LETTER SHA;Lo;0;L;;;;;N;;;;; 0C37;TELUGU LETTER SSA;Lo;0;L;;;;;N;;;;; @@ -2735,6 +2760,7 @@ 0C7D;TELUGU FRACTION DIGIT TWO FOR EVEN POWERS OF FOUR;No;0;ON;;;;2;N;;;;; 0C7E;TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR;No;0;ON;;;;3;N;;;;; 0C7F;TELUGU SIGN TUUMU;So;0;L;;;;;N;;;;; +0C81;KANNADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 0C82;KANNADA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;; 0C83;KANNADA SIGN VISARGA;Mc;0;L;;;;;N;;;;; 0C85;KANNADA LETTER A;Lo;0;L;;;;;N;;;;; @@ -2821,6 +2847,7 @@ 0CEF;KANNADA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 0CF1;KANNADA SIGN JIHVAMULIYA;Lo;0;L;;;;;N;;;;; 0CF2;KANNADA SIGN UPADHMANIYA;Lo;0;L;;;;;N;;;;; +0D01;MALAYALAM SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;; 0D03;MALAYALAM SIGN 
VISARGA;Mc;0;L;;;;;N;;;;; 0D05;MALAYALAM LETTER A;Lo;0;L;;;;;N;;;;; @@ -2996,6 +3023,16 @@ 0DDD;SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA;Mc;0;L;0DDC 0DCA;;;;N;;;;; 0DDE;SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA;Mc;0;L;0DD9 0DDF;;;;N;;;;; 0DDF;SINHALA VOWEL SIGN GAYANUKITTA;Mc;0;L;;;;;N;;;;; +0DE6;SINHALA LITH DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +0DE7;SINHALA LITH DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +0DE8;SINHALA LITH DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +0DE9;SINHALA LITH DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +0DEA;SINHALA LITH DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +0DEB;SINHALA LITH DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +0DEC;SINHALA LITH DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +0DED;SINHALA LITH DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +0DEE;SINHALA LITH DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +0DEF;SINHALA LITH DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 0DF2;SINHALA VOWEL SIGN DIGA GAETTA-PILLA;Mc;0;L;;;;;N;;;;; 0DF3;SINHALA VOWEL SIGN DIGA GAYANUKITTA;Mc;0;L;;;;;N;;;;; 0DF4;SINHALA PUNCTUATION KUNDDALIYA;Po;0;L;;;;;N;;;;; @@ -5087,6 +5124,14 @@ 16EE;RUNIC ARLAUG SYMBOL;Nl;0;L;;;;17;N;;;;; 16EF;RUNIC TVIMADUR SYMBOL;Nl;0;L;;;;18;N;;;;; 16F0;RUNIC BELGTHOR SYMBOL;Nl;0;L;;;;19;N;;;;; +16F1;RUNIC LETTER K;Lo;0;L;;;;;N;;;;; +16F2;RUNIC LETTER SH;Lo;0;L;;;;;N;;;;; +16F3;RUNIC LETTER OO;Lo;0;L;;;;;N;;;;; +16F4;RUNIC LETTER FRANKS CASKET OS;Lo;0;L;;;;;N;;;;; +16F5;RUNIC LETTER FRANKS CASKET IS;Lo;0;L;;;;;N;;;;; +16F6;RUNIC LETTER FRANKS CASKET EH;Lo;0;L;;;;;N;;;;; +16F7;RUNIC LETTER FRANKS CASKET AC;Lo;0;L;;;;;N;;;;; +16F8;RUNIC LETTER FRANKS CASKET AESC;Lo;0;L;;;;;N;;;;; 1700;TAGALOG LETTER A;Lo;0;L;;;;;N;;;;; 1701;TAGALOG LETTER I;Lo;0;L;;;;;N;;;;; 1702;TAGALOG LETTER U;Lo;0;L;;;;;N;;;;; @@ -5296,7 +5341,7 @@ 180B;MONGOLIAN FREE VARIATION SELECTOR ONE;Mn;0;NSM;;;;;N;;;;; 180C;MONGOLIAN FREE VARIATION SELECTOR TWO;Mn;0;NSM;;;;;N;;;;; 180D;MONGOLIAN FREE VARIATION SELECTOR THREE;Mn;0;NSM;;;;;N;;;;; -180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;; +180E;MONGOLIAN VOWEL SEPARATOR;Cf;0;BN;;;;;N;;;;; 1810;MONGOLIAN DIGIT 
ZERO;Nd;0;L;;0;0;0;N;;;;; 1811;MONGOLIAN DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; 1812;MONGOLIAN DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; @@ -5537,6 +5582,8 @@ 191A;LIMBU LETTER SSA;Lo;0;L;;;;;N;;;;; 191B;LIMBU LETTER SA;Lo;0;L;;;;;N;;;;; 191C;LIMBU LETTER HA;Lo;0;L;;;;;N;;;;; +191D;LIMBU LETTER GYAN;Lo;0;L;;;;;N;;;;; +191E;LIMBU LETTER TRA;Lo;0;L;;;;;N;;;;; 1920;LIMBU VOWEL SIGN A;Mn;0;NSM;;;;;N;;;;; 1921;LIMBU VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;; 1922;LIMBU VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;; @@ -5751,7 +5798,7 @@ 1A18;BUGINESE VOWEL SIGN U;Mn;220;NSM;;;;;N;;;;; 1A19;BUGINESE VOWEL SIGN E;Mc;0;L;;;;;N;;;;; 1A1A;BUGINESE VOWEL SIGN O;Mc;0;L;;;;;N;;;;; -1A1B;BUGINESE VOWEL SIGN AE;Mc;0;L;;;;;N;;;;; +1A1B;BUGINESE VOWEL SIGN AE;Mn;0;NSM;;;;;N;;;;; 1A1E;BUGINESE PALLAWA;Po;0;L;;;;;N;;;;; 1A1F;BUGINESE END OF SECTION;Po;0;L;;;;;N;;;;; 1A20;TAI THAM LETTER HIGH KA;Lo;0;L;;;;;N;;;;; @@ -5881,6 +5928,21 @@ 1AAB;TAI THAM SIGN SATKAANKUU;Po;0;L;;;;;N;;;;; 1AAC;TAI THAM SIGN HANG;Po;0;L;;;;;N;;;;; 1AAD;TAI THAM SIGN CAANG;Po;0;L;;;;;N;;;;; +1AB0;COMBINING DOUBLED CIRCUMFLEX ACCENT;Mn;230;NSM;;;;;N;;;;; +1AB1;COMBINING DIAERESIS-RING;Mn;230;NSM;;;;;N;;;;; +1AB2;COMBINING INFINITY;Mn;230;NSM;;;;;N;;;;; +1AB3;COMBINING DOWNWARDS ARROW;Mn;230;NSM;;;;;N;;;;; +1AB4;COMBINING TRIPLE DOT;Mn;230;NSM;;;;;N;;;;; +1AB5;COMBINING X-X BELOW;Mn;220;NSM;;;;;N;;;;; +1AB6;COMBINING WIGGLY LINE BELOW;Mn;220;NSM;;;;;N;;;;; +1AB7;COMBINING OPEN MARK BELOW;Mn;220;NSM;;;;;N;;;;; +1AB8;COMBINING DOUBLE OPEN MARK BELOW;Mn;220;NSM;;;;;N;;;;; +1AB9;COMBINING LIGHT CENTRALIZATION STROKE BELOW;Mn;220;NSM;;;;;N;;;;; +1ABA;COMBINING STRONG CENTRALIZATION STROKE BELOW;Mn;220;NSM;;;;;N;;;;; +1ABB;COMBINING PARENTHESES ABOVE;Mn;230;NSM;;;;;N;;;;; +1ABC;COMBINING DOUBLE PARENTHESES ABOVE;Mn;230;NSM;;;;;N;;;;; +1ABD;COMBINING PARENTHESES BELOW;Mn;220;NSM;;;;;N;;;;; +1ABE;COMBINING PARENTHESES OVERLAY;Me;0;NSM;;;;;N;;;;; 1B00;BALINESE SIGN ULU RICEM;Mn;0;NSM;;;;;N;;;;; 1B01;BALINESE SIGN ULU CANDRA;Mn;0;NSM;;;;;N;;;;; 1B02;BALINESE SIGN 
CECEK;Mn;0;NSM;;;;;N;;;;; @@ -6046,8 +6108,8 @@ 1BA9;SUNDANESE VOWEL SIGN PANEULEUNG;Mn;0;NSM;;;;;N;;;;; 1BAA;SUNDANESE SIGN PAMAAEH;Mc;9;L;;;;;N;;;;; 1BAB;SUNDANESE SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;; -1BAC;SUNDANESE CONSONANT SIGN PASANGAN MA;Mc;0;L;;;;;N;;;;; -1BAD;SUNDANESE CONSONANT SIGN PASANGAN WA;Mc;0;L;;;;;N;;;;; +1BAC;SUNDANESE CONSONANT SIGN PASANGAN MA;Mn;0;NSM;;;;;N;;;;; +1BAD;SUNDANESE CONSONANT SIGN PASANGAN WA;Mn;0;NSM;;;;;N;;;;; 1BAE;SUNDANESE LETTER KHA;Lo;0;L;;;;;N;;;;; 1BAF;SUNDANESE LETTER SYA;Lo;0;L;;;;;N;;;;; 1BB0;SUNDANESE DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; @@ -6291,6 +6353,8 @@ 1CF4;VEDIC TONE CANDRA ABOVE;Mn;230;NSM;;;;;N;;;;; 1CF5;VEDIC SIGN JIHVAMULIYA;Lo;0;L;;;;;N;;;;; 1CF6;VEDIC SIGN UPADHMANIYA;Lo;0;L;;;;;N;;;;; +1CF8;VEDIC TONE RING ABOVE;Mn;230;NSM;;;;;N;;;;; +1CF9;VEDIC TONE DOUBLE RING ABOVE;Mn;230;NSM;;;;;N;;;;; 1D00;LATIN LETTER SMALL CAPITAL A;Ll;0;L;;;;;N;;;;; 1D01;LATIN LETTER SMALL CAPITAL AE;Ll;0;L;;;;;N;;;;; 1D02;LATIN SMALL LETTER TURNED AE;Ll;0;L;;;;;N;;;;; @@ -6522,6 +6586,21 @@ 1DE4;COMBINING LATIN SMALL LETTER S;Mn;230;NSM;;;;;N;;;;; 1DE5;COMBINING LATIN SMALL LETTER LONG S;Mn;230;NSM;;;;;N;;;;; 1DE6;COMBINING LATIN SMALL LETTER Z;Mn;230;NSM;;;;;N;;;;; +1DE7;COMBINING LATIN SMALL LETTER ALPHA;Mn;230;NSM;;;;;N;;;;; +1DE8;COMBINING LATIN SMALL LETTER B;Mn;230;NSM;;;;;N;;;;; +1DE9;COMBINING LATIN SMALL LETTER BETA;Mn;230;NSM;;;;;N;;;;; +1DEA;COMBINING LATIN SMALL LETTER SCHWA;Mn;230;NSM;;;;;N;;;;; +1DEB;COMBINING LATIN SMALL LETTER F;Mn;230;NSM;;;;;N;;;;; +1DEC;COMBINING LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE;Mn;230;NSM;;;;;N;;;;; +1DED;COMBINING LATIN SMALL LETTER O WITH LIGHT CENTRALIZATION STROKE;Mn;230;NSM;;;;;N;;;;; +1DEE;COMBINING LATIN SMALL LETTER P;Mn;230;NSM;;;;;N;;;;; +1DEF;COMBINING LATIN SMALL LETTER ESH;Mn;230;NSM;;;;;N;;;;; +1DF0;COMBINING LATIN SMALL LETTER U WITH LIGHT CENTRALIZATION STROKE;Mn;230;NSM;;;;;N;;;;; +1DF1;COMBINING LATIN SMALL LETTER W;Mn;230;NSM;;;;;N;;;;; +1DF2;COMBINING LATIN SMALL LETTER 
A WITH DIAERESIS;Mn;230;NSM;;;;;N;;;;; +1DF3;COMBINING LATIN SMALL LETTER O WITH DIAERESIS;Mn;230;NSM;;;;;N;;;;; +1DF4;COMBINING LATIN SMALL LETTER U WITH DIAERESIS;Mn;230;NSM;;;;;N;;;;; +1DF5;COMBINING UP TACK ABOVE;Mn;230;NSM;;;;;N;;;;; 1DFC;COMBINING DOUBLE INVERTED BREVE BELOW;Mn;233;NSM;;;;;N;;;;; 1DFD;COMBINING ALMOST EQUAL TO BELOW;Mn;220;NSM;;;;;N;;;;; 1DFE;COMBINING LEFT ARROWHEAD ABOVE;Mn;230;NSM;;;;;N;;;;; @@ -7116,6 +7195,10 @@ 2062;INVISIBLE TIMES;Cf;0;BN;;;;;N;;;;; 2063;INVISIBLE SEPARATOR;Cf;0;BN;;;;;N;;;;; 2064;INVISIBLE PLUS;Cf;0;BN;;;;;N;;;;; +2066;LEFT-TO-RIGHT ISOLATE;Cf;0;LRI;;;;;N;;;;; +2067;RIGHT-TO-LEFT ISOLATE;Cf;0;RLI;;;;;N;;;;; +2068;FIRST STRONG ISOLATE;Cf;0;FSI;;;;;N;;;;; +2069;POP DIRECTIONAL ISOLATE;Cf;0;PDI;;;;;N;;;;; 206A;INHIBIT SYMMETRIC SWAPPING;Cf;0;BN;;;;;N;;;;; 206B;ACTIVATE SYMMETRIC SWAPPING;Cf;0;BN;;;;;N;;;;; 206C;INHIBIT ARABIC FORM SHAPING;Cf;0;BN;;;;;N;;;;; @@ -7191,6 +7274,9 @@ 20B8;TENGE SIGN;Sc;0;ET;;;;;N;;;;; 20B9;INDIAN RUPEE SIGN;Sc;0;ET;;;;;N;;;;; 20BA;TURKISH LIRA SIGN;Sc;0;ET;;;;;N;;;;; +20BB;NORDIC MARK SIGN;Sc;0;ET;;;;;N;;;;; +20BC;MANAT SIGN;Sc;0;ET;;;;;N;;;;; +20BD;RUBLE SIGN;Sc;0;ET;;;;;N;;;;; 20D0;COMBINING LEFT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING LEFT HARPOON ABOVE;;;; 20D1;COMBINING RIGHT HARPOON ABOVE;Mn;230;NSM;;;;;N;NON-SPACING RIGHT HARPOON ABOVE;;;; 20D2;COMBINING LONG VERTICAL LINE OVERLAY;Mn;1;NSM;;;;;N;NON-SPACING LONG VERTICAL BAR OVERLAY;;;; @@ -7738,10 +7824,10 @@ 2305;PROJECTIVE;So;0;ON;;;;;N;;;;; 2306;PERSPECTIVE;So;0;ON;;;;;N;;;;; 2307;WAVY LINE;So;0;ON;;;;;N;;;;; -2308;LEFT CEILING;Sm;0;ON;;;;;Y;;;;; -2309;RIGHT CEILING;Sm;0;ON;;;;;Y;;;;; -230A;LEFT FLOOR;Sm;0;ON;;;;;Y;;;;; -230B;RIGHT FLOOR;Sm;0;ON;;;;;Y;;;;; +2308;LEFT CEILING;Ps;0;ON;;;;;Y;;;;; +2309;RIGHT CEILING;Pe;0;ON;;;;;Y;;;;; +230A;LEFT FLOOR;Ps;0;ON;;;;;Y;;;;; +230B;RIGHT FLOOR;Pe;0;ON;;;;;Y;;;;; 230C;BOTTOM RIGHT CROP;So;0;ON;;;;;N;;;;; 230D;BOTTOM LEFT CROP;So;0;ON;;;;;N;;;;; 230E;TOP RIGHT CROP;So;0;ON;;;;;N;;;;; @@ 
-7974,6 +8060,13 @@ 23F1;STOPWATCH;So;0;ON;;;;;N;;;;; 23F2;TIMER CLOCK;So;0;ON;;;;;N;;;;; 23F3;HOURGLASS WITH FLOWING SAND;So;0;ON;;;;;N;;;;; +23F4;BLACK MEDIUM LEFT-POINTING TRIANGLE;So;0;ON;;;;;N;;;;; +23F5;BLACK MEDIUM RIGHT-POINTING TRIANGLE;So;0;ON;;;;;N;;;;; +23F6;BLACK MEDIUM UP-POINTING TRIANGLE;So;0;ON;;;;;N;;;;; +23F7;BLACK MEDIUM DOWN-POINTING TRIANGLE;So;0;ON;;;;;N;;;;; +23F8;DOUBLE VERTICAL BAR;So;0;ON;;;;;N;;;;; +23F9;BLACK SQUARE FOR STOP;So;0;ON;;;;;N;;;;; +23FA;BLACK CIRCLE FOR RECORD;So;0;ON;;;;;N;;;;; 2400;SYMBOL FOR NULL;So;0;ON;;;;;N;GRAPHIC FOR NULL;;;; 2401;SYMBOL FOR START OF HEADING;So;0;ON;;;;;N;GRAPHIC FOR START OF HEADING;;;; 2402;SYMBOL FOR START OF TEXT;So;0;ON;;;;;N;GRAPHIC FOR START OF TEXT;;;; @@ -8696,6 +8789,7 @@ 26FD;FUEL PUMP;So;0;ON;;;;;N;;;;; 26FE;CUP ON BLACK SQUARE;So;0;ON;;;;;N;;;;; 26FF;WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE;So;0;ON;;;;;N;;;;; +2700;BLACK SAFETY SCISSORS;So;0;ON;;;;;N;;;;; 2701;UPPER BLADE SCISSORS;So;0;ON;;;;;N;;;;; 2702;BLACK SCISSORS;So;0;ON;;;;;N;;;;; 2703;LOWER BLADE SCISSORS;So;0;ON;;;;;N;;;;; @@ -9796,6 +9890,9 @@ 2B4A;LEFTWARDS ARROW ABOVE ALMOST EQUAL TO;Sm;0;ON;;;;;N;;;;; 2B4B;LEFTWARDS ARROW ABOVE REVERSE TILDE OPERATOR;Sm;0;ON;;;;;N;;;;; 2B4C;RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR;Sm;0;ON;;;;;N;;;;; +2B4D;DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW;So;0;ON;;;;;N;;;;; +2B4E;SHORT SLANTED NORTH ARROW;So;0;ON;;;;;N;;;;; +2B4F;SHORT BACKSLANTED SOUTH ARROW;So;0;ON;;;;;N;;;;; 2B50;WHITE MEDIUM STAR;So;0;ON;;;;;N;;;;; 2B51;BLACK SMALL STAR;So;0;ON;;;;;N;;;;; 2B52;WHITE SMALL STAR;So;0;ON;;;;;N;;;;; @@ -9806,6 +9903,118 @@ 2B57;HEAVY CIRCLE WITH CIRCLE INSIDE;So;0;ON;;;;;N;;;;; 2B58;HEAVY CIRCLE;So;0;ON;;;;;N;;;;; 2B59;HEAVY CIRCLED SALTIRE;So;0;ON;;;;;N;;;;; +2B5A;SLANTED NORTH ARROW WITH HOOKED HEAD;So;0;ON;;;;;N;;;;; +2B5B;BACKSLANTED SOUTH ARROW WITH HOOKED TAIL;So;0;ON;;;;;N;;;;; +2B5C;SLANTED NORTH ARROW WITH HORIZONTAL TAIL;So;0;ON;;;;;N;;;;; +2B5D;BACKSLANTED SOUTH ARROW WITH 
HORIZONTAL TAIL;So;0;ON;;;;;N;;;;; +2B5E;BENT ARROW POINTING DOWNWARDS THEN NORTH EAST;So;0;ON;;;;;N;;;;; +2B5F;SHORT BENT ARROW POINTING DOWNWARDS THEN NORTH EAST;So;0;ON;;;;;N;;;;; +2B60;LEFTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B61;UPWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B62;RIGHTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B63;DOWNWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B64;LEFT RIGHT TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B65;UP DOWN TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B66;NORTH WEST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B67;NORTH EAST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B68;SOUTH EAST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B69;SOUTH WEST TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B6A;LEFTWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;; +2B6B;UPWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;; +2B6C;RIGHTWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;; +2B6D;DOWNWARDS TRIANGLE-HEADED DASHED ARROW;So;0;ON;;;;;N;;;;; +2B6E;CLOCKWISE TRIANGLE-HEADED OPEN CIRCLE ARROW;So;0;ON;;;;;N;;;;; +2B6F;ANTICLOCKWISE TRIANGLE-HEADED OPEN CIRCLE ARROW;So;0;ON;;;;;N;;;;; +2B70;LEFTWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B71;UPWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B72;RIGHTWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B73;DOWNWARDS TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B76;NORTH WEST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B77;NORTH EAST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B78;SOUTH EAST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B79;SOUTH WEST TRIANGLE-HEADED ARROW TO BAR;So;0;ON;;;;;N;;;;; +2B7A;LEFTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL STROKE;So;0;ON;;;;;N;;;;; +2B7B;UPWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL STROKE;So;0;ON;;;;;N;;;;; +2B7C;RIGHTWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL STROKE;So;0;ON;;;;;N;;;;; +2B7D;DOWNWARDS TRIANGLE-HEADED ARROW WITH DOUBLE HORIZONTAL 
STROKE;So;0;ON;;;;;N;;;;; +2B7E;HORIZONTAL TAB KEY;So;0;ON;;;;;N;;;;; +2B7F;VERTICAL TAB KEY;So;0;ON;;;;;N;;;;; +2B80;LEFTWARDS TRIANGLE-HEADED ARROW OVER RIGHTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B81;UPWARDS TRIANGLE-HEADED ARROW LEFTWARDS OF DOWNWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B82;RIGHTWARDS TRIANGLE-HEADED ARROW OVER LEFTWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B83;DOWNWARDS TRIANGLE-HEADED ARROW LEFTWARDS OF UPWARDS TRIANGLE-HEADED ARROW;So;0;ON;;;;;N;;;;; +2B84;LEFTWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;; +2B85;UPWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;; +2B86;RIGHTWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;; +2B87;DOWNWARDS TRIANGLE-HEADED PAIRED ARROWS;So;0;ON;;;;;N;;;;; +2B88;LEFTWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;; +2B89;UPWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;; +2B8A;RIGHTWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;; +2B8B;DOWNWARDS BLACK CIRCLED WHITE ARROW;So;0;ON;;;;;N;;;;; +2B8C;ANTICLOCKWISE TRIANGLE-HEADED RIGHT U-SHAPED ARROW;So;0;ON;;;;;N;;;;; +2B8D;ANTICLOCKWISE TRIANGLE-HEADED BOTTOM U-SHAPED ARROW;So;0;ON;;;;;N;;;;; +2B8E;ANTICLOCKWISE TRIANGLE-HEADED LEFT U-SHAPED ARROW;So;0;ON;;;;;N;;;;; +2B8F;ANTICLOCKWISE TRIANGLE-HEADED TOP U-SHAPED ARROW;So;0;ON;;;;;N;;;;; +2B90;RETURN LEFT;So;0;ON;;;;;N;;;;; +2B91;RETURN RIGHT;So;0;ON;;;;;N;;;;; +2B92;NEWLINE LEFT;So;0;ON;;;;;N;;;;; +2B93;NEWLINE RIGHT;So;0;ON;;;;;N;;;;; +2B94;FOUR CORNER ARROWS CIRCLING ANTICLOCKWISE;So;0;ON;;;;;N;;;;; +2B95;RIGHTWARDS BLACK ARROW;So;0;ON;;;;;N;;;;; +2B98;THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +2B99;THREE-D RIGHT-LIGHTED UPWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +2B9A;THREE-D TOP-LIGHTED RIGHTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +2B9B;THREE-D LEFT-LIGHTED DOWNWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +2B9C;BLACK LEFTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +2B9D;BLACK UPWARDS EQUILATERAL 
ARROWHEAD;So;0;ON;;;;;N;;;;; +2B9E;BLACK RIGHTWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +2B9F;BLACK DOWNWARDS EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +2BA0;DOWNWARDS TRIANGLE-HEADED ARROW WITH LONG TIP LEFTWARDS;So;0;ON;;;;;N;;;;; +2BA1;DOWNWARDS TRIANGLE-HEADED ARROW WITH LONG TIP RIGHTWARDS;So;0;ON;;;;;N;;;;; +2BA2;UPWARDS TRIANGLE-HEADED ARROW WITH LONG TIP LEFTWARDS;So;0;ON;;;;;N;;;;; +2BA3;UPWARDS TRIANGLE-HEADED ARROW WITH LONG TIP RIGHTWARDS;So;0;ON;;;;;N;;;;; +2BA4;LEFTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP UPWARDS;So;0;ON;;;;;N;;;;; +2BA5;RIGHTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP UPWARDS;So;0;ON;;;;;N;;;;; +2BA6;LEFTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP DOWNWARDS;So;0;ON;;;;;N;;;;; +2BA7;RIGHTWARDS TRIANGLE-HEADED ARROW WITH LONG TIP DOWNWARDS;So;0;ON;;;;;N;;;;; +2BA8;BLACK CURVED DOWNWARDS AND LEFTWARDS ARROW;So;0;ON;;;;;N;;;;; +2BA9;BLACK CURVED DOWNWARDS AND RIGHTWARDS ARROW;So;0;ON;;;;;N;;;;; +2BAA;BLACK CURVED UPWARDS AND LEFTWARDS ARROW;So;0;ON;;;;;N;;;;; +2BAB;BLACK CURVED UPWARDS AND RIGHTWARDS ARROW;So;0;ON;;;;;N;;;;; +2BAC;BLACK CURVED LEFTWARDS AND UPWARDS ARROW;So;0;ON;;;;;N;;;;; +2BAD;BLACK CURVED RIGHTWARDS AND UPWARDS ARROW;So;0;ON;;;;;N;;;;; +2BAE;BLACK CURVED LEFTWARDS AND DOWNWARDS ARROW;So;0;ON;;;;;N;;;;; +2BAF;BLACK CURVED RIGHTWARDS AND DOWNWARDS ARROW;So;0;ON;;;;;N;;;;; +2BB0;RIBBON ARROW DOWN LEFT;So;0;ON;;;;;N;;;;; +2BB1;RIBBON ARROW DOWN RIGHT;So;0;ON;;;;;N;;;;; +2BB2;RIBBON ARROW UP LEFT;So;0;ON;;;;;N;;;;; +2BB3;RIBBON ARROW UP RIGHT;So;0;ON;;;;;N;;;;; +2BB4;RIBBON ARROW LEFT UP;So;0;ON;;;;;N;;;;; +2BB5;RIBBON ARROW RIGHT UP;So;0;ON;;;;;N;;;;; +2BB6;RIBBON ARROW LEFT DOWN;So;0;ON;;;;;N;;;;; +2BB7;RIBBON ARROW RIGHT DOWN;So;0;ON;;;;;N;;;;; +2BB8;UPWARDS WHITE ARROW FROM BAR WITH HORIZONTAL BAR;So;0;ON;;;;;N;;;;; +2BB9;UP ARROWHEAD IN A RECTANGLE BOX;So;0;ON;;;;;N;;;;; +2BBD;BALLOT BOX WITH LIGHT X;So;0;ON;;;;;N;;;;; +2BBE;CIRCLED X;So;0;ON;;;;;N;;;;; +2BBF;CIRCLED BOLD X;So;0;ON;;;;;N;;;;; +2BC0;BLACK SQUARE 
CENTRED;So;0;ON;;;;;N;;;;; +2BC1;BLACK DIAMOND CENTRED;So;0;ON;;;;;N;;;;; +2BC2;TURNED BLACK PENTAGON;So;0;ON;;;;;N;;;;; +2BC3;HORIZONTAL BLACK OCTAGON;So;0;ON;;;;;N;;;;; +2BC4;BLACK OCTAGON;So;0;ON;;;;;N;;;;; +2BC5;BLACK MEDIUM UP-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;; +2BC6;BLACK MEDIUM DOWN-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;; +2BC7;BLACK MEDIUM LEFT-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;; +2BC8;BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED;So;0;ON;;;;;N;;;;; +2BCA;TOP HALF BLACK CIRCLE;So;0;ON;;;;;N;;;;; +2BCB;BOTTOM HALF BLACK CIRCLE;So;0;ON;;;;;N;;;;; +2BCC;LIGHT FOUR POINTED BLACK CUSP;So;0;ON;;;;;N;;;;; +2BCD;ROTATED LIGHT FOUR POINTED BLACK CUSP;So;0;ON;;;;;N;;;;; +2BCE;WHITE FOUR POINTED CUSP;So;0;ON;;;;;N;;;;; +2BCF;ROTATED WHITE FOUR POINTED CUSP;So;0;ON;;;;;N;;;;; +2BD0;SQUARE POSITION INDICATOR;So;0;ON;;;;;N;;;;; +2BD1;UNCERTAINTY SIGN;So;0;ON;;;;;N;;;;; 2C00;GLAGOLITIC CAPITAL LETTER AZU;Lu;0;L;;;;;N;;;;2C30; 2C01;GLAGOLITIC CAPITAL LETTER BUKY;Lu;0;L;;;;;N;;;;2C31; 2C02;GLAGOLITIC CAPITAL LETTER VEDE;Lu;0;L;;;;;N;;;;2C32; @@ -10325,6 +10534,13 @@ 2E39;TOP HALF SECTION SIGN;Po;0;ON;;;;;N;;;;; 2E3A;TWO-EM DASH;Pd;0;ON;;;;;N;;;;; 2E3B;THREE-EM DASH;Pd;0;ON;;;;;N;;;;; +2E3C;STENOGRAPHIC FULL STOP;Po;0;ON;;;;;N;;;;; +2E3D;VERTICAL SIX DOTS;Po;0;ON;;;;;N;;;;; +2E3E;WIGGLY VERTICAL LINE;Po;0;ON;;;;;N;;;;; +2E3F;CAPITULUM;Po;0;ON;;;;;N;;;;; +2E40;DOUBLE HYPHEN;Pd;0;ON;;;;;N;;;;; +2E41;REVERSED COMMA;Po;0;ON;;;;;N;;;;; +2E42;DOUBLE LOW-REVERSED-9 QUOTATION MARK;Ps;0;ON;;;;;N;;;;; 2E80;CJK RADICAL REPEAT;So;0;ON;;;;;N;;;;; 2E81;CJK RADICAL CLIFF;So;0;ON;;;;;N;;;;; 2E82;CJK RADICAL SECOND ONE;So;0;ON;;;;;N;;;;; @@ -13383,6 +13599,12 @@ A695;CYRILLIC SMALL LETTER HWE;Ll;0;L;;;;;N;;;A694;;A694 A696;CYRILLIC CAPITAL LETTER SHWE;Lu;0;L;;;;;N;;;;A697; A697;CYRILLIC SMALL LETTER SHWE;Ll;0;L;;;;;N;;;A696;;A696 +A698;CYRILLIC CAPITAL LETTER DOUBLE O;Lu;0;L;;;;;N;;;;A699; +A699;CYRILLIC SMALL LETTER DOUBLE O;Ll;0;L;;;;;N;;;A698;;A698 +A69A;CYRILLIC 
CAPITAL LETTER CROSSED O;Lu;0;L;;;;;N;;;;A69B; +A69B;CYRILLIC SMALL LETTER CROSSED O;Ll;0;L;;;;;N;;;A69A;;A69A +A69C;MODIFIER LETTER CYRILLIC HARD SIGN;Lm;0;L;<super> 044A;;;;N;;;;; +A69D;MODIFIER LETTER CYRILLIC SOFT SIGN;Lm;0;L;<super> 044C;;;;N;;;;; A69F;COMBINING CYRILLIC LETTER IOTIFIED E;Mn;230;NSM;;;;;N;;;;; A6A0;BAMUM LETTER A;Lo;0;L;;;;;N;;;;; A6A1;BAMUM LETTER KA;Lo;0;L;;;;;N;;;;; @@ -13619,6 +13841,18 @@ A791;LATIN SMALL LETTER N WITH DESCENDER;Ll;0;L;;;;;N;;;A790;;A790 A792;LATIN CAPITAL LETTER C WITH BAR;Lu;0;L;;;;;N;;;;A793; A793;LATIN SMALL LETTER C WITH BAR;Ll;0;L;;;;;N;;;A792;;A792 +A794;LATIN SMALL LETTER C WITH PALATAL HOOK;Ll;0;L;;;;;N;;;;; +A795;LATIN SMALL LETTER H WITH PALATAL HOOK;Ll;0;L;;;;;N;;;;; +A796;LATIN CAPITAL LETTER B WITH FLOURISH;Lu;0;L;;;;;N;;;;A797; +A797;LATIN SMALL LETTER B WITH FLOURISH;Ll;0;L;;;;;N;;;A796;;A796 +A798;LATIN CAPITAL LETTER F WITH STROKE;Lu;0;L;;;;;N;;;;A799; +A799;LATIN SMALL LETTER F WITH STROKE;Ll;0;L;;;;;N;;;A798;;A798 +A79A;LATIN CAPITAL LETTER VOLAPUK AE;Lu;0;L;;;;;N;;;;A79B; +A79B;LATIN SMALL LETTER VOLAPUK AE;Ll;0;L;;;;;N;;;A79A;;A79A +A79C;LATIN CAPITAL LETTER VOLAPUK OE;Lu;0;L;;;;;N;;;;A79D; +A79D;LATIN SMALL LETTER VOLAPUK OE;Ll;0;L;;;;;N;;;A79C;;A79C +A79E;LATIN CAPITAL LETTER VOLAPUK UE;Lu;0;L;;;;;N;;;;A79F; +A79F;LATIN SMALL LETTER VOLAPUK UE;Ll;0;L;;;;;N;;;A79E;;A79E A7A0;LATIN CAPITAL LETTER G WITH OBLIQUE STROKE;Lu;0;L;;;;;N;;;;A7A1; A7A1;LATIN SMALL LETTER G WITH OBLIQUE STROKE;Ll;0;L;;;;;N;;;A7A0;;A7A0 A7A2;LATIN CAPITAL LETTER K WITH OBLIQUE STROKE;Lu;0;L;;;;;N;;;;A7A3; @@ -13630,6 +13864,12 @@ A7A8;LATIN CAPITAL LETTER S WITH OBLIQUE STROKE;Lu;0;L;;;;;N;;;;A7A9; A7A9;LATIN SMALL LETTER S WITH OBLIQUE STROKE;Ll;0;L;;;;;N;;;A7A8;;A7A8 A7AA;LATIN CAPITAL LETTER H WITH HOOK;Lu;0;L;;;;;N;;;;0266; +A7AB;LATIN CAPITAL LETTER REVERSED OPEN E;Lu;0;L;;;;;N;;;;025C; +A7AC;LATIN CAPITAL LETTER SCRIPT G;Lu;0;L;;;;;N;;;;0261; +A7AD;LATIN CAPITAL LETTER L WITH BELT;Lu;0;L;;;;;N;;;;026C; +A7B0;LATIN 
CAPITAL LETTER TURNED K;Lu;0;L;;;;;N;;;;029E; +A7B1;LATIN CAPITAL LETTER TURNED T;Lu;0;L;;;;;N;;;;0287; +A7F7;LATIN EPIGRAPHIC LETTER SIDEWAYS I;Lo;0;L;;;;;N;;;;; A7F8;MODIFIER LETTER CAPITAL H WITH STROKE;Lm;0;L;<super> 0126;;;;N;;;;; A7F9;MODIFIER LETTER SMALL LIGATURE OE;Lm;0;L;<super> 0153;;;;N;;;;; A7FA;LATIN LETTER SMALL CAPITAL TURNED M;Ll;0;L;;;;;N;;;;; @@ -14062,6 +14302,37 @@ A9D9;JAVANESE DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; A9DE;JAVANESE PADA TIRTA TUMETES;Po;0;L;;;;;N;;;;; A9DF;JAVANESE PADA ISEN-ISEN;Po;0;L;;;;;N;;;;; +A9E0;MYANMAR LETTER SHAN GHA;Lo;0;L;;;;;N;;;;; +A9E1;MYANMAR LETTER SHAN CHA;Lo;0;L;;;;;N;;;;; +A9E2;MYANMAR LETTER SHAN JHA;Lo;0;L;;;;;N;;;;; +A9E3;MYANMAR LETTER SHAN NNA;Lo;0;L;;;;;N;;;;; +A9E4;MYANMAR LETTER SHAN BHA;Lo;0;L;;;;;N;;;;; +A9E5;MYANMAR SIGN SHAN SAW;Mn;0;NSM;;;;;N;;;;; +A9E6;MYANMAR MODIFIER LETTER SHAN REDUPLICATION;Lm;0;L;;;;;N;;;;; +A9E7;MYANMAR LETTER TAI LAING NYA;Lo;0;L;;;;;N;;;;; +A9E8;MYANMAR LETTER TAI LAING FA;Lo;0;L;;;;;N;;;;; +A9E9;MYANMAR LETTER TAI LAING GA;Lo;0;L;;;;;N;;;;; +A9EA;MYANMAR LETTER TAI LAING GHA;Lo;0;L;;;;;N;;;;; +A9EB;MYANMAR LETTER TAI LAING JA;Lo;0;L;;;;;N;;;;; +A9EC;MYANMAR LETTER TAI LAING JHA;Lo;0;L;;;;;N;;;;; +A9ED;MYANMAR LETTER TAI LAING DDA;Lo;0;L;;;;;N;;;;; +A9EE;MYANMAR LETTER TAI LAING DDHA;Lo;0;L;;;;;N;;;;; +A9EF;MYANMAR LETTER TAI LAING NNA;Lo;0;L;;;;;N;;;;; +A9F0;MYANMAR TAI LAING DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +A9F1;MYANMAR TAI LAING DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +A9F2;MYANMAR TAI LAING DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +A9F3;MYANMAR TAI LAING DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +A9F4;MYANMAR TAI LAING DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +A9F5;MYANMAR TAI LAING DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +A9F6;MYANMAR TAI LAING DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +A9F7;MYANMAR TAI LAING DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +A9F8;MYANMAR TAI LAING DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +A9F9;MYANMAR TAI LAING DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +A9FA;MYANMAR LETTER TAI LAING LLA;Lo;0;L;;;;;N;;;;; +A9FB;MYANMAR LETTER 
TAI LAING DA;Lo;0;L;;;;;N;;;;; +A9FC;MYANMAR LETTER TAI LAING DHA;Lo;0;L;;;;;N;;;;; +A9FD;MYANMAR LETTER TAI LAING BA;Lo;0;L;;;;;N;;;;; +A9FE;MYANMAR LETTER TAI LAING BHA;Lo;0;L;;;;;N;;;;; AA00;CHAM LETTER A;Lo;0;L;;;;;N;;;;; AA01;CHAM LETTER I;Lo;0;L;;;;;N;;;;; AA02;CHAM LETTER U;Lo;0;L;;;;;N;;;;; @@ -14173,6 +14444,10 @@ AA79;MYANMAR SYMBOL AITON TWO;So;0;L;;;;;N;;;;; AA7A;MYANMAR LETTER AITON RA;Lo;0;L;;;;;N;;;;; AA7B;MYANMAR SIGN PAO KAREN TONE;Mc;0;L;;;;;N;;;;; +AA7C;MYANMAR SIGN TAI LAING TONE-2;Mn;0;NSM;;;;;N;;;;; +AA7D;MYANMAR SIGN TAI LAING TONE-5;Mc;0;L;;;;;N;;;;; +AA7E;MYANMAR LETTER SHWE PALAUNG CHA;Lo;0;L;;;;;N;;;;; +AA7F;MYANMAR LETTER SHWE PALAUNG SHA;Lo;0;L;;;;;N;;;;; AA80;TAI VIET LETTER LOW KO;Lo;0;L;;;;;N;;;;; AA81;TAI VIET LETTER HIGH KO;Lo;0;L;;;;;N;;;;; AA82;TAI VIET LETTER LOW KHO;Lo;0;L;;;;;N;;;;; @@ -14300,6 +14575,56 @@ AB2C;ETHIOPIC SYLLABLE BBEE;Lo;0;L;;;;;N;;;;; AB2D;ETHIOPIC SYLLABLE BBE;Lo;0;L;;;;;N;;;;; AB2E;ETHIOPIC SYLLABLE BBO;Lo;0;L;;;;;N;;;;; +AB30;LATIN SMALL LETTER BARRED ALPHA;Ll;0;L;;;;;N;;;;; +AB31;LATIN SMALL LETTER A REVERSED-SCHWA;Ll;0;L;;;;;N;;;;; +AB32;LATIN SMALL LETTER BLACKLETTER E;Ll;0;L;;;;;N;;;;; +AB33;LATIN SMALL LETTER BARRED E;Ll;0;L;;;;;N;;;;; +AB34;LATIN SMALL LETTER E WITH FLOURISH;Ll;0;L;;;;;N;;;;; +AB35;LATIN SMALL LETTER LENIS F;Ll;0;L;;;;;N;;;;; +AB36;LATIN SMALL LETTER SCRIPT G WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;; +AB37;LATIN SMALL LETTER L WITH INVERTED LAZY S;Ll;0;L;;;;;N;;;;; +AB38;LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE;Ll;0;L;;;;;N;;;;; +AB39;LATIN SMALL LETTER L WITH MIDDLE RING;Ll;0;L;;;;;N;;;;; +AB3A;LATIN SMALL LETTER M WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;; +AB3B;LATIN SMALL LETTER N WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;; +AB3C;LATIN SMALL LETTER ENG WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;; +AB3D;LATIN SMALL LETTER BLACKLETTER O;Ll;0;L;;;;;N;;;;; +AB3E;LATIN SMALL LETTER BLACKLETTER O WITH STROKE;Ll;0;L;;;;;N;;;;; +AB3F;LATIN SMALL LETTER OPEN O WITH STROKE;Ll;0;L;;;;;N;;;;; +AB40;LATIN SMALL 
LETTER INVERTED OE;Ll;0;L;;;;;N;;;;; +AB41;LATIN SMALL LETTER TURNED OE WITH STROKE;Ll;0;L;;;;;N;;;;; +AB42;LATIN SMALL LETTER TURNED OE WITH HORIZONTAL STROKE;Ll;0;L;;;;;N;;;;; +AB43;LATIN SMALL LETTER TURNED O OPEN-O;Ll;0;L;;;;;N;;;;; +AB44;LATIN SMALL LETTER TURNED O OPEN-O WITH STROKE;Ll;0;L;;;;;N;;;;; +AB45;LATIN SMALL LETTER STIRRUP R;Ll;0;L;;;;;N;;;;; +AB46;LATIN LETTER SMALL CAPITAL R WITH RIGHT LEG;Ll;0;L;;;;;N;;;;; +AB47;LATIN SMALL LETTER R WITHOUT HANDLE;Ll;0;L;;;;;N;;;;; +AB48;LATIN SMALL LETTER DOUBLE R;Ll;0;L;;;;;N;;;;; +AB49;LATIN SMALL LETTER R WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;; +AB4A;LATIN SMALL LETTER DOUBLE R WITH CROSSED-TAIL;Ll;0;L;;;;;N;;;;; +AB4B;LATIN SMALL LETTER SCRIPT R;Ll;0;L;;;;;N;;;;; +AB4C;LATIN SMALL LETTER SCRIPT R WITH RING;Ll;0;L;;;;;N;;;;; +AB4D;LATIN SMALL LETTER BASELINE ESH;Ll;0;L;;;;;N;;;;; +AB4E;LATIN SMALL LETTER U WITH SHORT RIGHT LEG;Ll;0;L;;;;;N;;;;; +AB4F;LATIN SMALL LETTER U BAR WITH SHORT RIGHT LEG;Ll;0;L;;;;;N;;;;; +AB50;LATIN SMALL LETTER UI;Ll;0;L;;;;;N;;;;; +AB51;LATIN SMALL LETTER TURNED UI;Ll;0;L;;;;;N;;;;; +AB52;LATIN SMALL LETTER U WITH LEFT HOOK;Ll;0;L;;;;;N;;;;; +AB53;LATIN SMALL LETTER CHI;Ll;0;L;;;;;N;;;;; +AB54;LATIN SMALL LETTER CHI WITH LOW RIGHT RING;Ll;0;L;;;;;N;;;;; +AB55;LATIN SMALL LETTER CHI WITH LOW LEFT SERIF;Ll;0;L;;;;;N;;;;; +AB56;LATIN SMALL LETTER X WITH LOW RIGHT RING;Ll;0;L;;;;;N;;;;; +AB57;LATIN SMALL LETTER X WITH LONG LEFT LEG;Ll;0;L;;;;;N;;;;; +AB58;LATIN SMALL LETTER X WITH LONG LEFT LEG AND LOW RIGHT RING;Ll;0;L;;;;;N;;;;; +AB59;LATIN SMALL LETTER X WITH LONG LEFT LEG WITH SERIF;Ll;0;L;;;;;N;;;;; +AB5A;LATIN SMALL LETTER Y WITH SHORT RIGHT LEG;Ll;0;L;;;;;N;;;;; +AB5B;MODIFIER BREVE WITH INVERTED BREVE;Sk;0;L;;;;;N;;;;; +AB5C;MODIFIER LETTER SMALL HENG;Lm;0;L;<super> A727;;;;N;;;;; +AB5D;MODIFIER LETTER SMALL L WITH INVERTED LAZY S;Lm;0;L;<super> AB37;;;;N;;;;; +AB5E;MODIFIER LETTER SMALL L WITH MIDDLE TILDE;Lm;0;L;<super> 026B;;;;N;;;;; +AB5F;MODIFIER LETTER SMALL U WITH LEFT 
HOOK;Lm;0;L;<super> AB52;;;;N;;;;; +AB64;LATIN SMALL LETTER INVERTED ALPHA;Ll;0;L;;;;;N;;;;; +AB65;GREEK LETTER SMALL CAPITAL OMEGA;Ll;0;L;;;;;N;;;;; ABC0;MEETEI MAYEK LETTER KOK;Lo;0;L;;;;;N;;;;; ABC1;MEETEI MAYEK LETTER SAM;Lo;0;L;;;;;N;;;;; ABC2;MEETEI MAYEK LETTER LAI;Lo;0;L;;;;;N;;;;; @@ -15445,8 +15770,8 @@ FD3B;ARABIC LIGATURE ZAH WITH MEEM MEDIAL FORM;Lo;0;AL;<medial> 0638 0645;;;;N;;;;; FD3C;ARABIC LIGATURE ALEF WITH FATHATAN FINAL FORM;Lo;0;AL;<final> 0627 064B;;;;N;;;;; FD3D;ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM;Lo;0;AL;<isolated> 0627 064B;;;;N;;;;; -FD3E;ORNATE LEFT PARENTHESIS;Ps;0;ON;;;;;N;;;;; -FD3F;ORNATE RIGHT PARENTHESIS;Pe;0;ON;;;;;N;;;;; +FD3E;ORNATE LEFT PARENTHESIS;Pe;0;ON;;;;;N;;;;; +FD3F;ORNATE RIGHT PARENTHESIS;Ps;0;ON;;;;;N;;;;; FD50;ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM;Lo;0;AL;<initial> 062A 062C 0645;;;;N;;;;; FD51;ARABIC LIGATURE TEH WITH HAH WITH JEEM FINAL FORM;Lo;0;AL;<final> 062A 062D 062C;;;;N;;;;; FD52;ARABIC LIGATURE TEH WITH HAH WITH JEEM INITIAL FORM;Lo;0;AL;<initial> 062A 062D 062C;;;;N;;;;; @@ -15612,6 +15937,13 @@ FE24;COMBINING MACRON LEFT HALF;Mn;230;NSM;;;;;N;;;;; FE25;COMBINING MACRON RIGHT HALF;Mn;230;NSM;;;;;N;;;;; FE26;COMBINING CONJOINING MACRON;Mn;230;NSM;;;;;N;;;;; +FE27;COMBINING LIGATURE LEFT HALF BELOW;Mn;220;NSM;;;;;N;;;;; +FE28;COMBINING LIGATURE RIGHT HALF BELOW;Mn;220;NSM;;;;;N;;;;; +FE29;COMBINING TILDE LEFT HALF BELOW;Mn;220;NSM;;;;;N;;;;; +FE2A;COMBINING TILDE RIGHT HALF BELOW;Mn;220;NSM;;;;;N;;;;; +FE2B;COMBINING MACRON LEFT HALF BELOW;Mn;220;NSM;;;;;N;;;;; +FE2C;COMBINING MACRON RIGHT HALF BELOW;Mn;220;NSM;;;;;N;;;;; +FE2D;COMBINING CONJOINING MACRON BELOW;Mn;220;NSM;;;;;N;;;;; FE30;PRESENTATION FORM FOR VERTICAL TWO DOT LEADER;Po;0;ON;<vertical> 2025;;;;N;GLYPH FOR VERTICAL TWO DOT LEADER;;;; FE31;PRESENTATION FORM FOR VERTICAL EM DASH;Pd;0;ON;<vertical> 2014;;;;N;GLYPH FOR VERTICAL EM DASH;;;; FE32;PRESENTATION FORM FOR VERTICAL EN DASH;Pd;0;ON;<vertical> 2013;;;;N;GLYPH 
FOR VERTICAL EN DASH;;;; @@ -16384,6 +16716,8 @@ 10188;GREEK GRAMMA SIGN;So;0;ON;;;;;N;;;;; 10189;GREEK TRYBLION BASE SIGN;So;0;ON;;;;;N;;;;; 1018A;GREEK ZERO SIGN;No;0;ON;;;;0;N;;;;; +1018B;GREEK ONE QUARTER SIGN;No;0;ON;;;;1/4;N;;;;; +1018C;GREEK SINUSOID SIGN;So;0;ON;;;;;N;;;;; 10190;ROMAN SEXTANS SIGN;So;0;ON;;;;;N;;;;; 10191;ROMAN UNCIA SIGN;So;0;ON;;;;;N;;;;; 10192;ROMAN SEMUNCIA SIGN;So;0;ON;;;;;N;;;;; @@ -16396,6 +16730,7 @@ 10199;ROMAN DUPONDIUS SIGN;So;0;ON;;;;;N;;;;; 1019A;ROMAN AS SIGN;So;0;ON;;;;;N;;;;; 1019B;ROMAN CENTURIAL SIGN;So;0;ON;;;;;N;;;;; +101A0;GREEK SYMBOL TAU RHO;So;0;ON;;;;;N;;;;; 101D0;PHAISTOS DISC SIGN PEDESTRIAN;So;0;L;;;;;N;;;;; 101D1;PHAISTOS DISC SIGN PLUMED HEAD;So;0;L;;;;;N;;;;; 101D2;PHAISTOS DISC SIGN TATTOOED HEAD;So;0;L;;;;;N;;;;; @@ -16520,6 +16855,34 @@ 102CE;CARIAN LETTER LD2;Lo;0;L;;;;;N;;;;; 102CF;CARIAN LETTER E2;Lo;0;L;;;;;N;;;;; 102D0;CARIAN LETTER UUU3;Lo;0;L;;;;;N;;;;; +102E0;COPTIC EPACT THOUSANDS MARK;Mn;220;NSM;;;;;N;;;;; +102E1;COPTIC EPACT DIGIT ONE;No;0;EN;;;;1;N;;;;; +102E2;COPTIC EPACT DIGIT TWO;No;0;EN;;;;2;N;;;;; +102E3;COPTIC EPACT DIGIT THREE;No;0;EN;;;;3;N;;;;; +102E4;COPTIC EPACT DIGIT FOUR;No;0;EN;;;;4;N;;;;; +102E5;COPTIC EPACT DIGIT FIVE;No;0;EN;;;;5;N;;;;; +102E6;COPTIC EPACT DIGIT SIX;No;0;EN;;;;6;N;;;;; +102E7;COPTIC EPACT DIGIT SEVEN;No;0;EN;;;;7;N;;;;; +102E8;COPTIC EPACT DIGIT EIGHT;No;0;EN;;;;8;N;;;;; +102E9;COPTIC EPACT DIGIT NINE;No;0;EN;;;;9;N;;;;; +102EA;COPTIC EPACT NUMBER TEN;No;0;EN;;;;10;N;;;;; +102EB;COPTIC EPACT NUMBER TWENTY;No;0;EN;;;;20;N;;;;; +102EC;COPTIC EPACT NUMBER THIRTY;No;0;EN;;;;30;N;;;;; +102ED;COPTIC EPACT NUMBER FORTY;No;0;EN;;;;40;N;;;;; +102EE;COPTIC EPACT NUMBER FIFTY;No;0;EN;;;;50;N;;;;; +102EF;COPTIC EPACT NUMBER SIXTY;No;0;EN;;;;60;N;;;;; +102F0;COPTIC EPACT NUMBER SEVENTY;No;0;EN;;;;70;N;;;;; +102F1;COPTIC EPACT NUMBER EIGHTY;No;0;EN;;;;80;N;;;;; +102F2;COPTIC EPACT NUMBER NINETY;No;0;EN;;;;90;N;;;;; +102F3;COPTIC EPACT NUMBER ONE HUNDRED;No;0;EN;;;;100;N;;;;; 
+102F4;COPTIC EPACT NUMBER TWO HUNDRED;No;0;EN;;;;200;N;;;;; +102F5;COPTIC EPACT NUMBER THREE HUNDRED;No;0;EN;;;;300;N;;;;; +102F6;COPTIC EPACT NUMBER FOUR HUNDRED;No;0;EN;;;;400;N;;;;; +102F7;COPTIC EPACT NUMBER FIVE HUNDRED;No;0;EN;;;;500;N;;;;; +102F8;COPTIC EPACT NUMBER SIX HUNDRED;No;0;EN;;;;600;N;;;;; +102F9;COPTIC EPACT NUMBER SEVEN HUNDRED;No;0;EN;;;;700;N;;;;; +102FA;COPTIC EPACT NUMBER EIGHT HUNDRED;No;0;EN;;;;800;N;;;;; +102FB;COPTIC EPACT NUMBER NINE HUNDRED;No;0;EN;;;;900;N;;;;; 10300;OLD ITALIC LETTER A;Lo;0;L;;;;;N;;;;; 10301;OLD ITALIC LETTER BE;Lo;0;L;;;;;N;;;;; 10302;OLD ITALIC LETTER KE;Lo;0;L;;;;;N;;;;; @@ -16551,6 +16914,7 @@ 1031C;OLD ITALIC LETTER CHE;Lo;0;L;;;;;N;;;;; 1031D;OLD ITALIC LETTER II;Lo;0;L;;;;;N;;;;; 1031E;OLD ITALIC LETTER UU;Lo;0;L;;;;;N;;;;; +1031F;OLD ITALIC LETTER ESS;Lo;0;L;;;;;N;;;;; 10320;OLD ITALIC NUMERAL ONE;No;0;L;;;;1;N;;;;; 10321;OLD ITALIC NUMERAL FIVE;No;0;L;;;;5;N;;;;; 10322;OLD ITALIC NUMERAL TEN;No;0;L;;;;10;N;;;;; @@ -16582,6 +16946,49 @@ 10348;GOTHIC LETTER HWAIR;Lo;0;L;;;;;N;;;;; 10349;GOTHIC LETTER OTHAL;Lo;0;L;;;;;N;;;;; 1034A;GOTHIC LETTER NINE HUNDRED;Nl;0;L;;;;900;N;;;;; +10350;OLD PERMIC LETTER AN;Lo;0;L;;;;;N;;;;; +10351;OLD PERMIC LETTER BUR;Lo;0;L;;;;;N;;;;; +10352;OLD PERMIC LETTER GAI;Lo;0;L;;;;;N;;;;; +10353;OLD PERMIC LETTER DOI;Lo;0;L;;;;;N;;;;; +10354;OLD PERMIC LETTER E;Lo;0;L;;;;;N;;;;; +10355;OLD PERMIC LETTER ZHOI;Lo;0;L;;;;;N;;;;; +10356;OLD PERMIC LETTER DZHOI;Lo;0;L;;;;;N;;;;; +10357;OLD PERMIC LETTER ZATA;Lo;0;L;;;;;N;;;;; +10358;OLD PERMIC LETTER DZITA;Lo;0;L;;;;;N;;;;; +10359;OLD PERMIC LETTER I;Lo;0;L;;;;;N;;;;; +1035A;OLD PERMIC LETTER KOKE;Lo;0;L;;;;;N;;;;; +1035B;OLD PERMIC LETTER LEI;Lo;0;L;;;;;N;;;;; +1035C;OLD PERMIC LETTER MENOE;Lo;0;L;;;;;N;;;;; +1035D;OLD PERMIC LETTER NENOE;Lo;0;L;;;;;N;;;;; +1035E;OLD PERMIC LETTER VOOI;Lo;0;L;;;;;N;;;;; +1035F;OLD PERMIC LETTER PEEI;Lo;0;L;;;;;N;;;;; +10360;OLD PERMIC LETTER REI;Lo;0;L;;;;;N;;;;; +10361;OLD PERMIC LETTER 
SII;Lo;0;L;;;;;N;;;;; +10362;OLD PERMIC LETTER TAI;Lo;0;L;;;;;N;;;;; +10363;OLD PERMIC LETTER U;Lo;0;L;;;;;N;;;;; +10364;OLD PERMIC LETTER CHERY;Lo;0;L;;;;;N;;;;; +10365;OLD PERMIC LETTER SHOOI;Lo;0;L;;;;;N;;;;; +10366;OLD PERMIC LETTER SHCHOOI;Lo;0;L;;;;;N;;;;; +10367;OLD PERMIC LETTER YRY;Lo;0;L;;;;;N;;;;; +10368;OLD PERMIC LETTER YERU;Lo;0;L;;;;;N;;;;; +10369;OLD PERMIC LETTER O;Lo;0;L;;;;;N;;;;; +1036A;OLD PERMIC LETTER OO;Lo;0;L;;;;;N;;;;; +1036B;OLD PERMIC LETTER EF;Lo;0;L;;;;;N;;;;; +1036C;OLD PERMIC LETTER HA;Lo;0;L;;;;;N;;;;; +1036D;OLD PERMIC LETTER TSIU;Lo;0;L;;;;;N;;;;; +1036E;OLD PERMIC LETTER VER;Lo;0;L;;;;;N;;;;; +1036F;OLD PERMIC LETTER YER;Lo;0;L;;;;;N;;;;; +10370;OLD PERMIC LETTER YERI;Lo;0;L;;;;;N;;;;; +10371;OLD PERMIC LETTER YAT;Lo;0;L;;;;;N;;;;; +10372;OLD PERMIC LETTER IE;Lo;0;L;;;;;N;;;;; +10373;OLD PERMIC LETTER YU;Lo;0;L;;;;;N;;;;; +10374;OLD PERMIC LETTER YA;Lo;0;L;;;;;N;;;;; +10375;OLD PERMIC LETTER IA;Lo;0;L;;;;;N;;;;; +10376;COMBINING OLD PERMIC LETTER AN;Mn;230;NSM;;;;;N;;;;; +10377;COMBINING OLD PERMIC LETTER DOI;Mn;230;NSM;;;;;N;;;;; +10378;COMBINING OLD PERMIC LETTER ZATA;Mn;230;NSM;;;;;N;;;;; +10379;COMBINING OLD PERMIC LETTER NENOE;Mn;230;NSM;;;;;N;;;;; +1037A;COMBINING OLD PERMIC LETTER SII;Mn;230;NSM;;;;;N;;;;; 10380;UGARITIC LETTER ALPA;Lo;0;L;;;;;N;;;;; 10381;UGARITIC LETTER BETA;Lo;0;L;;;;;N;;;;; 10382;UGARITIC LETTER GAMLA;Lo;0;L;;;;;N;;;;; @@ -16831,6 +17238,440 @@ 104A7;OSMANYA DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; 104A8;OSMANYA DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 104A9;OSMANYA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +10500;ELBASAN LETTER A;Lo;0;L;;;;;N;;;;; +10501;ELBASAN LETTER BE;Lo;0;L;;;;;N;;;;; +10502;ELBASAN LETTER CE;Lo;0;L;;;;;N;;;;; +10503;ELBASAN LETTER CHE;Lo;0;L;;;;;N;;;;; +10504;ELBASAN LETTER DE;Lo;0;L;;;;;N;;;;; +10505;ELBASAN LETTER NDE;Lo;0;L;;;;;N;;;;; +10506;ELBASAN LETTER DHE;Lo;0;L;;;;;N;;;;; +10507;ELBASAN LETTER EI;Lo;0;L;;;;;N;;;;; +10508;ELBASAN LETTER E;Lo;0;L;;;;;N;;;;; +10509;ELBASAN LETTER 
FE;Lo;0;L;;;;;N;;;;; +1050A;ELBASAN LETTER GE;Lo;0;L;;;;;N;;;;; +1050B;ELBASAN LETTER GJE;Lo;0;L;;;;;N;;;;; +1050C;ELBASAN LETTER HE;Lo;0;L;;;;;N;;;;; +1050D;ELBASAN LETTER I;Lo;0;L;;;;;N;;;;; +1050E;ELBASAN LETTER JE;Lo;0;L;;;;;N;;;;; +1050F;ELBASAN LETTER KE;Lo;0;L;;;;;N;;;;; +10510;ELBASAN LETTER LE;Lo;0;L;;;;;N;;;;; +10511;ELBASAN LETTER LLE;Lo;0;L;;;;;N;;;;; +10512;ELBASAN LETTER ME;Lo;0;L;;;;;N;;;;; +10513;ELBASAN LETTER NE;Lo;0;L;;;;;N;;;;; +10514;ELBASAN LETTER NA;Lo;0;L;;;;;N;;;;; +10515;ELBASAN LETTER NJE;Lo;0;L;;;;;N;;;;; +10516;ELBASAN LETTER O;Lo;0;L;;;;;N;;;;; +10517;ELBASAN LETTER PE;Lo;0;L;;;;;N;;;;; +10518;ELBASAN LETTER QE;Lo;0;L;;;;;N;;;;; +10519;ELBASAN LETTER RE;Lo;0;L;;;;;N;;;;; +1051A;ELBASAN LETTER RRE;Lo;0;L;;;;;N;;;;; +1051B;ELBASAN LETTER SE;Lo;0;L;;;;;N;;;;; +1051C;ELBASAN LETTER SHE;Lo;0;L;;;;;N;;;;; +1051D;ELBASAN LETTER TE;Lo;0;L;;;;;N;;;;; +1051E;ELBASAN LETTER THE;Lo;0;L;;;;;N;;;;; +1051F;ELBASAN LETTER U;Lo;0;L;;;;;N;;;;; +10520;ELBASAN LETTER VE;Lo;0;L;;;;;N;;;;; +10521;ELBASAN LETTER XE;Lo;0;L;;;;;N;;;;; +10522;ELBASAN LETTER Y;Lo;0;L;;;;;N;;;;; +10523;ELBASAN LETTER ZE;Lo;0;L;;;;;N;;;;; +10524;ELBASAN LETTER ZHE;Lo;0;L;;;;;N;;;;; +10525;ELBASAN LETTER GHE;Lo;0;L;;;;;N;;;;; +10526;ELBASAN LETTER GHAMMA;Lo;0;L;;;;;N;;;;; +10527;ELBASAN LETTER KHE;Lo;0;L;;;;;N;;;;; +10530;CAUCASIAN ALBANIAN LETTER ALT;Lo;0;L;;;;;N;;;;; +10531;CAUCASIAN ALBANIAN LETTER BET;Lo;0;L;;;;;N;;;;; +10532;CAUCASIAN ALBANIAN LETTER GIM;Lo;0;L;;;;;N;;;;; +10533;CAUCASIAN ALBANIAN LETTER DAT;Lo;0;L;;;;;N;;;;; +10534;CAUCASIAN ALBANIAN LETTER EB;Lo;0;L;;;;;N;;;;; +10535;CAUCASIAN ALBANIAN LETTER ZARL;Lo;0;L;;;;;N;;;;; +10536;CAUCASIAN ALBANIAN LETTER EYN;Lo;0;L;;;;;N;;;;; +10537;CAUCASIAN ALBANIAN LETTER ZHIL;Lo;0;L;;;;;N;;;;; +10538;CAUCASIAN ALBANIAN LETTER TAS;Lo;0;L;;;;;N;;;;; +10539;CAUCASIAN ALBANIAN LETTER CHA;Lo;0;L;;;;;N;;;;; +1053A;CAUCASIAN ALBANIAN LETTER YOWD;Lo;0;L;;;;;N;;;;; +1053B;CAUCASIAN ALBANIAN LETTER ZHA;Lo;0;L;;;;;N;;;;; +1053C;CAUCASIAN 
ALBANIAN LETTER IRB;Lo;0;L;;;;;N;;;;; +1053D;CAUCASIAN ALBANIAN LETTER SHA;Lo;0;L;;;;;N;;;;; +1053E;CAUCASIAN ALBANIAN LETTER LAN;Lo;0;L;;;;;N;;;;; +1053F;CAUCASIAN ALBANIAN LETTER INYA;Lo;0;L;;;;;N;;;;; +10540;CAUCASIAN ALBANIAN LETTER XEYN;Lo;0;L;;;;;N;;;;; +10541;CAUCASIAN ALBANIAN LETTER DYAN;Lo;0;L;;;;;N;;;;; +10542;CAUCASIAN ALBANIAN LETTER CAR;Lo;0;L;;;;;N;;;;; +10543;CAUCASIAN ALBANIAN LETTER JHOX;Lo;0;L;;;;;N;;;;; +10544;CAUCASIAN ALBANIAN LETTER KAR;Lo;0;L;;;;;N;;;;; +10545;CAUCASIAN ALBANIAN LETTER LYIT;Lo;0;L;;;;;N;;;;; +10546;CAUCASIAN ALBANIAN LETTER HEYT;Lo;0;L;;;;;N;;;;; +10547;CAUCASIAN ALBANIAN LETTER QAY;Lo;0;L;;;;;N;;;;; +10548;CAUCASIAN ALBANIAN LETTER AOR;Lo;0;L;;;;;N;;;;; +10549;CAUCASIAN ALBANIAN LETTER CHOY;Lo;0;L;;;;;N;;;;; +1054A;CAUCASIAN ALBANIAN LETTER CHI;Lo;0;L;;;;;N;;;;; +1054B;CAUCASIAN ALBANIAN LETTER CYAY;Lo;0;L;;;;;N;;;;; +1054C;CAUCASIAN ALBANIAN LETTER MAQ;Lo;0;L;;;;;N;;;;; +1054D;CAUCASIAN ALBANIAN LETTER QAR;Lo;0;L;;;;;N;;;;; +1054E;CAUCASIAN ALBANIAN LETTER NOWC;Lo;0;L;;;;;N;;;;; +1054F;CAUCASIAN ALBANIAN LETTER DZYAY;Lo;0;L;;;;;N;;;;; +10550;CAUCASIAN ALBANIAN LETTER SHAK;Lo;0;L;;;;;N;;;;; +10551;CAUCASIAN ALBANIAN LETTER JAYN;Lo;0;L;;;;;N;;;;; +10552;CAUCASIAN ALBANIAN LETTER ON;Lo;0;L;;;;;N;;;;; +10553;CAUCASIAN ALBANIAN LETTER TYAY;Lo;0;L;;;;;N;;;;; +10554;CAUCASIAN ALBANIAN LETTER FAM;Lo;0;L;;;;;N;;;;; +10555;CAUCASIAN ALBANIAN LETTER DZAY;Lo;0;L;;;;;N;;;;; +10556;CAUCASIAN ALBANIAN LETTER CHAT;Lo;0;L;;;;;N;;;;; +10557;CAUCASIAN ALBANIAN LETTER PEN;Lo;0;L;;;;;N;;;;; +10558;CAUCASIAN ALBANIAN LETTER GHEYS;Lo;0;L;;;;;N;;;;; +10559;CAUCASIAN ALBANIAN LETTER RAT;Lo;0;L;;;;;N;;;;; +1055A;CAUCASIAN ALBANIAN LETTER SEYK;Lo;0;L;;;;;N;;;;; +1055B;CAUCASIAN ALBANIAN LETTER VEYZ;Lo;0;L;;;;;N;;;;; +1055C;CAUCASIAN ALBANIAN LETTER TIWR;Lo;0;L;;;;;N;;;;; +1055D;CAUCASIAN ALBANIAN LETTER SHOY;Lo;0;L;;;;;N;;;;; +1055E;CAUCASIAN ALBANIAN LETTER IWN;Lo;0;L;;;;;N;;;;; +1055F;CAUCASIAN ALBANIAN LETTER CYAW;Lo;0;L;;;;;N;;;;; 
+10560;CAUCASIAN ALBANIAN LETTER CAYN;Lo;0;L;;;;;N;;;;; +10561;CAUCASIAN ALBANIAN LETTER YAYD;Lo;0;L;;;;;N;;;;; +10562;CAUCASIAN ALBANIAN LETTER PIWR;Lo;0;L;;;;;N;;;;; +10563;CAUCASIAN ALBANIAN LETTER KIW;Lo;0;L;;;;;N;;;;; +1056F;CAUCASIAN ALBANIAN CITATION MARK;Po;0;L;;;;;N;;;;; +10600;LINEAR A SIGN AB001;Lo;0;L;;;;;N;;;;; +10601;LINEAR A SIGN AB002;Lo;0;L;;;;;N;;;;; +10602;LINEAR A SIGN AB003;Lo;0;L;;;;;N;;;;; +10603;LINEAR A SIGN AB004;Lo;0;L;;;;;N;;;;; +10604;LINEAR A SIGN AB005;Lo;0;L;;;;;N;;;;; +10605;LINEAR A SIGN AB006;Lo;0;L;;;;;N;;;;; +10606;LINEAR A SIGN AB007;Lo;0;L;;;;;N;;;;; +10607;LINEAR A SIGN AB008;Lo;0;L;;;;;N;;;;; +10608;LINEAR A SIGN AB009;Lo;0;L;;;;;N;;;;; +10609;LINEAR A SIGN AB010;Lo;0;L;;;;;N;;;;; +1060A;LINEAR A SIGN AB011;Lo;0;L;;;;;N;;;;; +1060B;LINEAR A SIGN AB013;Lo;0;L;;;;;N;;;;; +1060C;LINEAR A SIGN AB016;Lo;0;L;;;;;N;;;;; +1060D;LINEAR A SIGN AB017;Lo;0;L;;;;;N;;;;; +1060E;LINEAR A SIGN AB020;Lo;0;L;;;;;N;;;;; +1060F;LINEAR A SIGN AB021;Lo;0;L;;;;;N;;;;; +10610;LINEAR A SIGN AB021F;Lo;0;L;;;;;N;;;;; +10611;LINEAR A SIGN AB021M;Lo;0;L;;;;;N;;;;; +10612;LINEAR A SIGN AB022;Lo;0;L;;;;;N;;;;; +10613;LINEAR A SIGN AB022F;Lo;0;L;;;;;N;;;;; +10614;LINEAR A SIGN AB022M;Lo;0;L;;;;;N;;;;; +10615;LINEAR A SIGN AB023;Lo;0;L;;;;;N;;;;; +10616;LINEAR A SIGN AB023M;Lo;0;L;;;;;N;;;;; +10617;LINEAR A SIGN AB024;Lo;0;L;;;;;N;;;;; +10618;LINEAR A SIGN AB026;Lo;0;L;;;;;N;;;;; +10619;LINEAR A SIGN AB027;Lo;0;L;;;;;N;;;;; +1061A;LINEAR A SIGN AB028;Lo;0;L;;;;;N;;;;; +1061B;LINEAR A SIGN A028B;Lo;0;L;;;;;N;;;;; +1061C;LINEAR A SIGN AB029;Lo;0;L;;;;;N;;;;; +1061D;LINEAR A SIGN AB030;Lo;0;L;;;;;N;;;;; +1061E;LINEAR A SIGN AB031;Lo;0;L;;;;;N;;;;; +1061F;LINEAR A SIGN AB034;Lo;0;L;;;;;N;;;;; +10620;LINEAR A SIGN AB037;Lo;0;L;;;;;N;;;;; +10621;LINEAR A SIGN AB038;Lo;0;L;;;;;N;;;;; +10622;LINEAR A SIGN AB039;Lo;0;L;;;;;N;;;;; +10623;LINEAR A SIGN AB040;Lo;0;L;;;;;N;;;;; +10624;LINEAR A SIGN AB041;Lo;0;L;;;;;N;;;;; +10625;LINEAR A SIGN AB044;Lo;0;L;;;;;N;;;;; 
+10626;LINEAR A SIGN AB045;Lo;0;L;;;;;N;;;;; +10627;LINEAR A SIGN AB046;Lo;0;L;;;;;N;;;;; +10628;LINEAR A SIGN AB047;Lo;0;L;;;;;N;;;;; +10629;LINEAR A SIGN AB048;Lo;0;L;;;;;N;;;;; +1062A;LINEAR A SIGN AB049;Lo;0;L;;;;;N;;;;; +1062B;LINEAR A SIGN AB050;Lo;0;L;;;;;N;;;;; +1062C;LINEAR A SIGN AB051;Lo;0;L;;;;;N;;;;; +1062D;LINEAR A SIGN AB053;Lo;0;L;;;;;N;;;;; +1062E;LINEAR A SIGN AB054;Lo;0;L;;;;;N;;;;; +1062F;LINEAR A SIGN AB055;Lo;0;L;;;;;N;;;;; +10630;LINEAR A SIGN AB056;Lo;0;L;;;;;N;;;;; +10631;LINEAR A SIGN AB057;Lo;0;L;;;;;N;;;;; +10632;LINEAR A SIGN AB058;Lo;0;L;;;;;N;;;;; +10633;LINEAR A SIGN AB059;Lo;0;L;;;;;N;;;;; +10634;LINEAR A SIGN AB060;Lo;0;L;;;;;N;;;;; +10635;LINEAR A SIGN AB061;Lo;0;L;;;;;N;;;;; +10636;LINEAR A SIGN AB065;Lo;0;L;;;;;N;;;;; +10637;LINEAR A SIGN AB066;Lo;0;L;;;;;N;;;;; +10638;LINEAR A SIGN AB067;Lo;0;L;;;;;N;;;;; +10639;LINEAR A SIGN AB069;Lo;0;L;;;;;N;;;;; +1063A;LINEAR A SIGN AB070;Lo;0;L;;;;;N;;;;; +1063B;LINEAR A SIGN AB073;Lo;0;L;;;;;N;;;;; +1063C;LINEAR A SIGN AB074;Lo;0;L;;;;;N;;;;; +1063D;LINEAR A SIGN AB076;Lo;0;L;;;;;N;;;;; +1063E;LINEAR A SIGN AB077;Lo;0;L;;;;;N;;;;; +1063F;LINEAR A SIGN AB078;Lo;0;L;;;;;N;;;;; +10640;LINEAR A SIGN AB079;Lo;0;L;;;;;N;;;;; +10641;LINEAR A SIGN AB080;Lo;0;L;;;;;N;;;;; +10642;LINEAR A SIGN AB081;Lo;0;L;;;;;N;;;;; +10643;LINEAR A SIGN AB082;Lo;0;L;;;;;N;;;;; +10644;LINEAR A SIGN AB085;Lo;0;L;;;;;N;;;;; +10645;LINEAR A SIGN AB086;Lo;0;L;;;;;N;;;;; +10646;LINEAR A SIGN AB087;Lo;0;L;;;;;N;;;;; +10647;LINEAR A SIGN A100-102;Lo;0;L;;;;;N;;;;; +10648;LINEAR A SIGN AB118;Lo;0;L;;;;;N;;;;; +10649;LINEAR A SIGN AB120;Lo;0;L;;;;;N;;;;; +1064A;LINEAR A SIGN A120B;Lo;0;L;;;;;N;;;;; +1064B;LINEAR A SIGN AB122;Lo;0;L;;;;;N;;;;; +1064C;LINEAR A SIGN AB123;Lo;0;L;;;;;N;;;;; +1064D;LINEAR A SIGN AB131A;Lo;0;L;;;;;N;;;;; +1064E;LINEAR A SIGN AB131B;Lo;0;L;;;;;N;;;;; +1064F;LINEAR A SIGN A131C;Lo;0;L;;;;;N;;;;; +10650;LINEAR A SIGN AB164;Lo;0;L;;;;;N;;;;; +10651;LINEAR A SIGN AB171;Lo;0;L;;;;;N;;;;; +10652;LINEAR 
A SIGN AB180;Lo;0;L;;;;;N;;;;; +10653;LINEAR A SIGN AB188;Lo;0;L;;;;;N;;;;; +10654;LINEAR A SIGN AB191;Lo;0;L;;;;;N;;;;; +10655;LINEAR A SIGN A301;Lo;0;L;;;;;N;;;;; +10656;LINEAR A SIGN A302;Lo;0;L;;;;;N;;;;; +10657;LINEAR A SIGN A303;Lo;0;L;;;;;N;;;;; +10658;LINEAR A SIGN A304;Lo;0;L;;;;;N;;;;; +10659;LINEAR A SIGN A305;Lo;0;L;;;;;N;;;;; +1065A;LINEAR A SIGN A306;Lo;0;L;;;;;N;;;;; +1065B;LINEAR A SIGN A307;Lo;0;L;;;;;N;;;;; +1065C;LINEAR A SIGN A308;Lo;0;L;;;;;N;;;;; +1065D;LINEAR A SIGN A309A;Lo;0;L;;;;;N;;;;; +1065E;LINEAR A SIGN A309B;Lo;0;L;;;;;N;;;;; +1065F;LINEAR A SIGN A309C;Lo;0;L;;;;;N;;;;; +10660;LINEAR A SIGN A310;Lo;0;L;;;;;N;;;;; +10661;LINEAR A SIGN A311;Lo;0;L;;;;;N;;;;; +10662;LINEAR A SIGN A312;Lo;0;L;;;;;N;;;;; +10663;LINEAR A SIGN A313A;Lo;0;L;;;;;N;;;;; +10664;LINEAR A SIGN A313B;Lo;0;L;;;;;N;;;;; +10665;LINEAR A SIGN A313C;Lo;0;L;;;;;N;;;;; +10666;LINEAR A SIGN A314;Lo;0;L;;;;;N;;;;; +10667;LINEAR A SIGN A315;Lo;0;L;;;;;N;;;;; +10668;LINEAR A SIGN A316;Lo;0;L;;;;;N;;;;; +10669;LINEAR A SIGN A317;Lo;0;L;;;;;N;;;;; +1066A;LINEAR A SIGN A318;Lo;0;L;;;;;N;;;;; +1066B;LINEAR A SIGN A319;Lo;0;L;;;;;N;;;;; +1066C;LINEAR A SIGN A320;Lo;0;L;;;;;N;;;;; +1066D;LINEAR A SIGN A321;Lo;0;L;;;;;N;;;;; +1066E;LINEAR A SIGN A322;Lo;0;L;;;;;N;;;;; +1066F;LINEAR A SIGN A323;Lo;0;L;;;;;N;;;;; +10670;LINEAR A SIGN A324;Lo;0;L;;;;;N;;;;; +10671;LINEAR A SIGN A325;Lo;0;L;;;;;N;;;;; +10672;LINEAR A SIGN A326;Lo;0;L;;;;;N;;;;; +10673;LINEAR A SIGN A327;Lo;0;L;;;;;N;;;;; +10674;LINEAR A SIGN A328;Lo;0;L;;;;;N;;;;; +10675;LINEAR A SIGN A329;Lo;0;L;;;;;N;;;;; +10676;LINEAR A SIGN A330;Lo;0;L;;;;;N;;;;; +10677;LINEAR A SIGN A331;Lo;0;L;;;;;N;;;;; +10678;LINEAR A SIGN A332;Lo;0;L;;;;;N;;;;; +10679;LINEAR A SIGN A333;Lo;0;L;;;;;N;;;;; +1067A;LINEAR A SIGN A334;Lo;0;L;;;;;N;;;;; +1067B;LINEAR A SIGN A335;Lo;0;L;;;;;N;;;;; +1067C;LINEAR A SIGN A336;Lo;0;L;;;;;N;;;;; +1067D;LINEAR A SIGN A337;Lo;0;L;;;;;N;;;;; +1067E;LINEAR A SIGN A338;Lo;0;L;;;;;N;;;;; +1067F;LINEAR A SIGN 
A339;Lo;0;L;;;;;N;;;;; +10680;LINEAR A SIGN A340;Lo;0;L;;;;;N;;;;; +10681;LINEAR A SIGN A341;Lo;0;L;;;;;N;;;;; +10682;LINEAR A SIGN A342;Lo;0;L;;;;;N;;;;; +10683;LINEAR A SIGN A343;Lo;0;L;;;;;N;;;;; +10684;LINEAR A SIGN A344;Lo;0;L;;;;;N;;;;; +10685;LINEAR A SIGN A345;Lo;0;L;;;;;N;;;;; +10686;LINEAR A SIGN A346;Lo;0;L;;;;;N;;;;; +10687;LINEAR A SIGN A347;Lo;0;L;;;;;N;;;;; +10688;LINEAR A SIGN A348;Lo;0;L;;;;;N;;;;; +10689;LINEAR A SIGN A349;Lo;0;L;;;;;N;;;;; +1068A;LINEAR A SIGN A350;Lo;0;L;;;;;N;;;;; +1068B;LINEAR A SIGN A351;Lo;0;L;;;;;N;;;;; +1068C;LINEAR A SIGN A352;Lo;0;L;;;;;N;;;;; +1068D;LINEAR A SIGN A353;Lo;0;L;;;;;N;;;;; +1068E;LINEAR A SIGN A354;Lo;0;L;;;;;N;;;;; +1068F;LINEAR A SIGN A355;Lo;0;L;;;;;N;;;;; +10690;LINEAR A SIGN A356;Lo;0;L;;;;;N;;;;; +10691;LINEAR A SIGN A357;Lo;0;L;;;;;N;;;;; +10692;LINEAR A SIGN A358;Lo;0;L;;;;;N;;;;; +10693;LINEAR A SIGN A359;Lo;0;L;;;;;N;;;;; +10694;LINEAR A SIGN A360;Lo;0;L;;;;;N;;;;; +10695;LINEAR A SIGN A361;Lo;0;L;;;;;N;;;;; +10696;LINEAR A SIGN A362;Lo;0;L;;;;;N;;;;; +10697;LINEAR A SIGN A363;Lo;0;L;;;;;N;;;;; +10698;LINEAR A SIGN A364;Lo;0;L;;;;;N;;;;; +10699;LINEAR A SIGN A365;Lo;0;L;;;;;N;;;;; +1069A;LINEAR A SIGN A366;Lo;0;L;;;;;N;;;;; +1069B;LINEAR A SIGN A367;Lo;0;L;;;;;N;;;;; +1069C;LINEAR A SIGN A368;Lo;0;L;;;;;N;;;;; +1069D;LINEAR A SIGN A369;Lo;0;L;;;;;N;;;;; +1069E;LINEAR A SIGN A370;Lo;0;L;;;;;N;;;;; +1069F;LINEAR A SIGN A371;Lo;0;L;;;;;N;;;;; +106A0;LINEAR A SIGN A400-VAS;Lo;0;L;;;;;N;;;;; +106A1;LINEAR A SIGN A401-VAS;Lo;0;L;;;;;N;;;;; +106A2;LINEAR A SIGN A402-VAS;Lo;0;L;;;;;N;;;;; +106A3;LINEAR A SIGN A403-VAS;Lo;0;L;;;;;N;;;;; +106A4;LINEAR A SIGN A404-VAS;Lo;0;L;;;;;N;;;;; +106A5;LINEAR A SIGN A405-VAS;Lo;0;L;;;;;N;;;;; +106A6;LINEAR A SIGN A406-VAS;Lo;0;L;;;;;N;;;;; +106A7;LINEAR A SIGN A407-VAS;Lo;0;L;;;;;N;;;;; +106A8;LINEAR A SIGN A408-VAS;Lo;0;L;;;;;N;;;;; +106A9;LINEAR A SIGN A409-VAS;Lo;0;L;;;;;N;;;;; +106AA;LINEAR A SIGN A410-VAS;Lo;0;L;;;;;N;;;;; +106AB;LINEAR A SIGN 
A411-VAS;Lo;0;L;;;;;N;;;;; +106AC;LINEAR A SIGN A412-VAS;Lo;0;L;;;;;N;;;;; +106AD;LINEAR A SIGN A413-VAS;Lo;0;L;;;;;N;;;;; +106AE;LINEAR A SIGN A414-VAS;Lo;0;L;;;;;N;;;;; +106AF;LINEAR A SIGN A415-VAS;Lo;0;L;;;;;N;;;;; +106B0;LINEAR A SIGN A416-VAS;Lo;0;L;;;;;N;;;;; +106B1;LINEAR A SIGN A417-VAS;Lo;0;L;;;;;N;;;;; +106B2;LINEAR A SIGN A418-VAS;Lo;0;L;;;;;N;;;;; +106B3;LINEAR A SIGN A501;Lo;0;L;;;;;N;;;;; +106B4;LINEAR A SIGN A502;Lo;0;L;;;;;N;;;;; +106B5;LINEAR A SIGN A503;Lo;0;L;;;;;N;;;;; +106B6;LINEAR A SIGN A504;Lo;0;L;;;;;N;;;;; +106B7;LINEAR A SIGN A505;Lo;0;L;;;;;N;;;;; +106B8;LINEAR A SIGN A506;Lo;0;L;;;;;N;;;;; +106B9;LINEAR A SIGN A508;Lo;0;L;;;;;N;;;;; +106BA;LINEAR A SIGN A509;Lo;0;L;;;;;N;;;;; +106BB;LINEAR A SIGN A510;Lo;0;L;;;;;N;;;;; +106BC;LINEAR A SIGN A511;Lo;0;L;;;;;N;;;;; +106BD;LINEAR A SIGN A512;Lo;0;L;;;;;N;;;;; +106BE;LINEAR A SIGN A513;Lo;0;L;;;;;N;;;;; +106BF;LINEAR A SIGN A515;Lo;0;L;;;;;N;;;;; +106C0;LINEAR A SIGN A516;Lo;0;L;;;;;N;;;;; +106C1;LINEAR A SIGN A520;Lo;0;L;;;;;N;;;;; +106C2;LINEAR A SIGN A521;Lo;0;L;;;;;N;;;;; +106C3;LINEAR A SIGN A523;Lo;0;L;;;;;N;;;;; +106C4;LINEAR A SIGN A524;Lo;0;L;;;;;N;;;;; +106C5;LINEAR A SIGN A525;Lo;0;L;;;;;N;;;;; +106C6;LINEAR A SIGN A526;Lo;0;L;;;;;N;;;;; +106C7;LINEAR A SIGN A527;Lo;0;L;;;;;N;;;;; +106C8;LINEAR A SIGN A528;Lo;0;L;;;;;N;;;;; +106C9;LINEAR A SIGN A529;Lo;0;L;;;;;N;;;;; +106CA;LINEAR A SIGN A530;Lo;0;L;;;;;N;;;;; +106CB;LINEAR A SIGN A531;Lo;0;L;;;;;N;;;;; +106CC;LINEAR A SIGN A532;Lo;0;L;;;;;N;;;;; +106CD;LINEAR A SIGN A534;Lo;0;L;;;;;N;;;;; +106CE;LINEAR A SIGN A535;Lo;0;L;;;;;N;;;;; +106CF;LINEAR A SIGN A536;Lo;0;L;;;;;N;;;;; +106D0;LINEAR A SIGN A537;Lo;0;L;;;;;N;;;;; +106D1;LINEAR A SIGN A538;Lo;0;L;;;;;N;;;;; +106D2;LINEAR A SIGN A539;Lo;0;L;;;;;N;;;;; +106D3;LINEAR A SIGN A540;Lo;0;L;;;;;N;;;;; +106D4;LINEAR A SIGN A541;Lo;0;L;;;;;N;;;;; +106D5;LINEAR A SIGN A542;Lo;0;L;;;;;N;;;;; +106D6;LINEAR A SIGN A545;Lo;0;L;;;;;N;;;;; +106D7;LINEAR A SIGN A547;Lo;0;L;;;;;N;;;;; 
+106D8;LINEAR A SIGN A548;Lo;0;L;;;;;N;;;;; +106D9;LINEAR A SIGN A549;Lo;0;L;;;;;N;;;;; +106DA;LINEAR A SIGN A550;Lo;0;L;;;;;N;;;;; +106DB;LINEAR A SIGN A551;Lo;0;L;;;;;N;;;;; +106DC;LINEAR A SIGN A552;Lo;0;L;;;;;N;;;;; +106DD;LINEAR A SIGN A553;Lo;0;L;;;;;N;;;;; +106DE;LINEAR A SIGN A554;Lo;0;L;;;;;N;;;;; +106DF;LINEAR A SIGN A555;Lo;0;L;;;;;N;;;;; +106E0;LINEAR A SIGN A556;Lo;0;L;;;;;N;;;;; +106E1;LINEAR A SIGN A557;Lo;0;L;;;;;N;;;;; +106E2;LINEAR A SIGN A559;Lo;0;L;;;;;N;;;;; +106E3;LINEAR A SIGN A563;Lo;0;L;;;;;N;;;;; +106E4;LINEAR A SIGN A564;Lo;0;L;;;;;N;;;;; +106E5;LINEAR A SIGN A565;Lo;0;L;;;;;N;;;;; +106E6;LINEAR A SIGN A566;Lo;0;L;;;;;N;;;;; +106E7;LINEAR A SIGN A568;Lo;0;L;;;;;N;;;;; +106E8;LINEAR A SIGN A569;Lo;0;L;;;;;N;;;;; +106E9;LINEAR A SIGN A570;Lo;0;L;;;;;N;;;;; +106EA;LINEAR A SIGN A571;Lo;0;L;;;;;N;;;;; +106EB;LINEAR A SIGN A572;Lo;0;L;;;;;N;;;;; +106EC;LINEAR A SIGN A573;Lo;0;L;;;;;N;;;;; +106ED;LINEAR A SIGN A574;Lo;0;L;;;;;N;;;;; +106EE;LINEAR A SIGN A575;Lo;0;L;;;;;N;;;;; +106EF;LINEAR A SIGN A576;Lo;0;L;;;;;N;;;;; +106F0;LINEAR A SIGN A577;Lo;0;L;;;;;N;;;;; +106F1;LINEAR A SIGN A578;Lo;0;L;;;;;N;;;;; +106F2;LINEAR A SIGN A579;Lo;0;L;;;;;N;;;;; +106F3;LINEAR A SIGN A580;Lo;0;L;;;;;N;;;;; +106F4;LINEAR A SIGN A581;Lo;0;L;;;;;N;;;;; +106F5;LINEAR A SIGN A582;Lo;0;L;;;;;N;;;;; +106F6;LINEAR A SIGN A583;Lo;0;L;;;;;N;;;;; +106F7;LINEAR A SIGN A584;Lo;0;L;;;;;N;;;;; +106F8;LINEAR A SIGN A585;Lo;0;L;;;;;N;;;;; +106F9;LINEAR A SIGN A586;Lo;0;L;;;;;N;;;;; +106FA;LINEAR A SIGN A587;Lo;0;L;;;;;N;;;;; +106FB;LINEAR A SIGN A588;Lo;0;L;;;;;N;;;;; +106FC;LINEAR A SIGN A589;Lo;0;L;;;;;N;;;;; +106FD;LINEAR A SIGN A591;Lo;0;L;;;;;N;;;;; +106FE;LINEAR A SIGN A592;Lo;0;L;;;;;N;;;;; +106FF;LINEAR A SIGN A594;Lo;0;L;;;;;N;;;;; +10700;LINEAR A SIGN A595;Lo;0;L;;;;;N;;;;; +10701;LINEAR A SIGN A596;Lo;0;L;;;;;N;;;;; +10702;LINEAR A SIGN A598;Lo;0;L;;;;;N;;;;; +10703;LINEAR A SIGN A600;Lo;0;L;;;;;N;;;;; +10704;LINEAR A SIGN A601;Lo;0;L;;;;;N;;;;; +10705;LINEAR A 
SIGN A602;Lo;0;L;;;;;N;;;;; +10706;LINEAR A SIGN A603;Lo;0;L;;;;;N;;;;; +10707;LINEAR A SIGN A604;Lo;0;L;;;;;N;;;;; +10708;LINEAR A SIGN A606;Lo;0;L;;;;;N;;;;; +10709;LINEAR A SIGN A608;Lo;0;L;;;;;N;;;;; +1070A;LINEAR A SIGN A609;Lo;0;L;;;;;N;;;;; +1070B;LINEAR A SIGN A610;Lo;0;L;;;;;N;;;;; +1070C;LINEAR A SIGN A611;Lo;0;L;;;;;N;;;;; +1070D;LINEAR A SIGN A612;Lo;0;L;;;;;N;;;;; +1070E;LINEAR A SIGN A613;Lo;0;L;;;;;N;;;;; +1070F;LINEAR A SIGN A614;Lo;0;L;;;;;N;;;;; +10710;LINEAR A SIGN A615;Lo;0;L;;;;;N;;;;; +10711;LINEAR A SIGN A616;Lo;0;L;;;;;N;;;;; +10712;LINEAR A SIGN A617;Lo;0;L;;;;;N;;;;; +10713;LINEAR A SIGN A618;Lo;0;L;;;;;N;;;;; +10714;LINEAR A SIGN A619;Lo;0;L;;;;;N;;;;; +10715;LINEAR A SIGN A620;Lo;0;L;;;;;N;;;;; +10716;LINEAR A SIGN A621;Lo;0;L;;;;;N;;;;; +10717;LINEAR A SIGN A622;Lo;0;L;;;;;N;;;;; +10718;LINEAR A SIGN A623;Lo;0;L;;;;;N;;;;; +10719;LINEAR A SIGN A624;Lo;0;L;;;;;N;;;;; +1071A;LINEAR A SIGN A626;Lo;0;L;;;;;N;;;;; +1071B;LINEAR A SIGN A627;Lo;0;L;;;;;N;;;;; +1071C;LINEAR A SIGN A628;Lo;0;L;;;;;N;;;;; +1071D;LINEAR A SIGN A629;Lo;0;L;;;;;N;;;;; +1071E;LINEAR A SIGN A634;Lo;0;L;;;;;N;;;;; +1071F;LINEAR A SIGN A637;Lo;0;L;;;;;N;;;;; +10720;LINEAR A SIGN A638;Lo;0;L;;;;;N;;;;; +10721;LINEAR A SIGN A640;Lo;0;L;;;;;N;;;;; +10722;LINEAR A SIGN A642;Lo;0;L;;;;;N;;;;; +10723;LINEAR A SIGN A643;Lo;0;L;;;;;N;;;;; +10724;LINEAR A SIGN A644;Lo;0;L;;;;;N;;;;; +10725;LINEAR A SIGN A645;Lo;0;L;;;;;N;;;;; +10726;LINEAR A SIGN A646;Lo;0;L;;;;;N;;;;; +10727;LINEAR A SIGN A648;Lo;0;L;;;;;N;;;;; +10728;LINEAR A SIGN A649;Lo;0;L;;;;;N;;;;; +10729;LINEAR A SIGN A651;Lo;0;L;;;;;N;;;;; +1072A;LINEAR A SIGN A652;Lo;0;L;;;;;N;;;;; +1072B;LINEAR A SIGN A653;Lo;0;L;;;;;N;;;;; +1072C;LINEAR A SIGN A654;Lo;0;L;;;;;N;;;;; +1072D;LINEAR A SIGN A655;Lo;0;L;;;;;N;;;;; +1072E;LINEAR A SIGN A656;Lo;0;L;;;;;N;;;;; +1072F;LINEAR A SIGN A657;Lo;0;L;;;;;N;;;;; +10730;LINEAR A SIGN A658;Lo;0;L;;;;;N;;;;; +10731;LINEAR A SIGN A659;Lo;0;L;;;;;N;;;;; +10732;LINEAR A SIGN 
A660;Lo;0;L;;;;;N;;;;; +10733;LINEAR A SIGN A661;Lo;0;L;;;;;N;;;;; +10734;LINEAR A SIGN A662;Lo;0;L;;;;;N;;;;; +10735;LINEAR A SIGN A663;Lo;0;L;;;;;N;;;;; +10736;LINEAR A SIGN A664;Lo;0;L;;;;;N;;;;; +10740;LINEAR A SIGN A701 A;Lo;0;L;;;;;N;;;;; +10741;LINEAR A SIGN A702 B;Lo;0;L;;;;;N;;;;; +10742;LINEAR A SIGN A703 D;Lo;0;L;;;;;N;;;;; +10743;LINEAR A SIGN A704 E;Lo;0;L;;;;;N;;;;; +10744;LINEAR A SIGN A705 F;Lo;0;L;;;;;N;;;;; +10745;LINEAR A SIGN A706 H;Lo;0;L;;;;;N;;;;; +10746;LINEAR A SIGN A707 J;Lo;0;L;;;;;N;;;;; +10747;LINEAR A SIGN A708 K;Lo;0;L;;;;;N;;;;; +10748;LINEAR A SIGN A709 L;Lo;0;L;;;;;N;;;;; +10749;LINEAR A SIGN A709-2 L2;Lo;0;L;;;;;N;;;;; +1074A;LINEAR A SIGN A709-3 L3;Lo;0;L;;;;;N;;;;; +1074B;LINEAR A SIGN A709-4 L4;Lo;0;L;;;;;N;;;;; +1074C;LINEAR A SIGN A709-6 L6;Lo;0;L;;;;;N;;;;; +1074D;LINEAR A SIGN A710 W;Lo;0;L;;;;;N;;;;; +1074E;LINEAR A SIGN A711 X;Lo;0;L;;;;;N;;;;; +1074F;LINEAR A SIGN A712 Y;Lo;0;L;;;;;N;;;;; +10750;LINEAR A SIGN A713 OMEGA;Lo;0;L;;;;;N;;;;; +10751;LINEAR A SIGN A714 ABB;Lo;0;L;;;;;N;;;;; +10752;LINEAR A SIGN A715 BB;Lo;0;L;;;;;N;;;;; +10753;LINEAR A SIGN A717 DD;Lo;0;L;;;;;N;;;;; +10754;LINEAR A SIGN A726 EYYY;Lo;0;L;;;;;N;;;;; +10755;LINEAR A SIGN A732 JE;Lo;0;L;;;;;N;;;;; +10760;LINEAR A SIGN A800;Lo;0;L;;;;;N;;;;; +10761;LINEAR A SIGN A801;Lo;0;L;;;;;N;;;;; +10762;LINEAR A SIGN A802;Lo;0;L;;;;;N;;;;; +10763;LINEAR A SIGN A803;Lo;0;L;;;;;N;;;;; +10764;LINEAR A SIGN A804;Lo;0;L;;;;;N;;;;; +10765;LINEAR A SIGN A805;Lo;0;L;;;;;N;;;;; +10766;LINEAR A SIGN A806;Lo;0;L;;;;;N;;;;; +10767;LINEAR A SIGN A807;Lo;0;L;;;;;N;;;;; 10800;CYPRIOT SYLLABLE A;Lo;0;R;;;;;N;;;;; 10801;CYPRIOT SYLLABLE E;Lo;0;R;;;;;N;;;;; 10802;CYPRIOT SYLLABLE I;Lo;0;R;;;;;N;;;;; @@ -16917,6 +17758,78 @@ 1085D;IMPERIAL ARAMAIC NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;; 1085E;IMPERIAL ARAMAIC NUMBER ONE THOUSAND;No;0;R;;;;1000;N;;;;; 1085F;IMPERIAL ARAMAIC NUMBER TEN THOUSAND;No;0;R;;;;10000;N;;;;; +10860;PALMYRENE LETTER ALEPH;Lo;0;R;;;;;N;;;;; +10861;PALMYRENE 
LETTER BETH;Lo;0;R;;;;;N;;;;; +10862;PALMYRENE LETTER GIMEL;Lo;0;R;;;;;N;;;;; +10863;PALMYRENE LETTER DALETH;Lo;0;R;;;;;N;;;;; +10864;PALMYRENE LETTER HE;Lo;0;R;;;;;N;;;;; +10865;PALMYRENE LETTER WAW;Lo;0;R;;;;;N;;;;; +10866;PALMYRENE LETTER ZAYIN;Lo;0;R;;;;;N;;;;; +10867;PALMYRENE LETTER HETH;Lo;0;R;;;;;N;;;;; +10868;PALMYRENE LETTER TETH;Lo;0;R;;;;;N;;;;; +10869;PALMYRENE LETTER YODH;Lo;0;R;;;;;N;;;;; +1086A;PALMYRENE LETTER KAPH;Lo;0;R;;;;;N;;;;; +1086B;PALMYRENE LETTER LAMEDH;Lo;0;R;;;;;N;;;;; +1086C;PALMYRENE LETTER MEM;Lo;0;R;;;;;N;;;;; +1086D;PALMYRENE LETTER FINAL NUN;Lo;0;R;;;;;N;;;;; +1086E;PALMYRENE LETTER NUN;Lo;0;R;;;;;N;;;;; +1086F;PALMYRENE LETTER SAMEKH;Lo;0;R;;;;;N;;;;; +10870;PALMYRENE LETTER AYIN;Lo;0;R;;;;;N;;;;; +10871;PALMYRENE LETTER PE;Lo;0;R;;;;;N;;;;; +10872;PALMYRENE LETTER SADHE;Lo;0;R;;;;;N;;;;; +10873;PALMYRENE LETTER QOPH;Lo;0;R;;;;;N;;;;; +10874;PALMYRENE LETTER RESH;Lo;0;R;;;;;N;;;;; +10875;PALMYRENE LETTER SHIN;Lo;0;R;;;;;N;;;;; +10876;PALMYRENE LETTER TAW;Lo;0;R;;;;;N;;;;; +10877;PALMYRENE LEFT-POINTING FLEURON;So;0;R;;;;;N;;;;; +10878;PALMYRENE RIGHT-POINTING FLEURON;So;0;R;;;;;N;;;;; +10879;PALMYRENE NUMBER ONE;No;0;R;;;;1;N;;;;; +1087A;PALMYRENE NUMBER TWO;No;0;R;;;;2;N;;;;; +1087B;PALMYRENE NUMBER THREE;No;0;R;;;;3;N;;;;; +1087C;PALMYRENE NUMBER FOUR;No;0;R;;;;4;N;;;;; +1087D;PALMYRENE NUMBER FIVE;No;0;R;;;;5;N;;;;; +1087E;PALMYRENE NUMBER TEN;No;0;R;;;;10;N;;;;; +1087F;PALMYRENE NUMBER TWENTY;No;0;R;;;;20;N;;;;; +10880;NABATAEAN LETTER FINAL ALEPH;Lo;0;R;;;;;N;;;;; +10881;NABATAEAN LETTER ALEPH;Lo;0;R;;;;;N;;;;; +10882;NABATAEAN LETTER FINAL BETH;Lo;0;R;;;;;N;;;;; +10883;NABATAEAN LETTER BETH;Lo;0;R;;;;;N;;;;; +10884;NABATAEAN LETTER GIMEL;Lo;0;R;;;;;N;;;;; +10885;NABATAEAN LETTER DALETH;Lo;0;R;;;;;N;;;;; +10886;NABATAEAN LETTER FINAL HE;Lo;0;R;;;;;N;;;;; +10887;NABATAEAN LETTER HE;Lo;0;R;;;;;N;;;;; +10888;NABATAEAN LETTER WAW;Lo;0;R;;;;;N;;;;; +10889;NABATAEAN LETTER ZAYIN;Lo;0;R;;;;;N;;;;; +1088A;NABATAEAN LETTER 
HETH;Lo;0;R;;;;;N;;;;; +1088B;NABATAEAN LETTER TETH;Lo;0;R;;;;;N;;;;; +1088C;NABATAEAN LETTER FINAL YODH;Lo;0;R;;;;;N;;;;; +1088D;NABATAEAN LETTER YODH;Lo;0;R;;;;;N;;;;; +1088E;NABATAEAN LETTER FINAL KAPH;Lo;0;R;;;;;N;;;;; +1088F;NABATAEAN LETTER KAPH;Lo;0;R;;;;;N;;;;; +10890;NABATAEAN LETTER FINAL LAMEDH;Lo;0;R;;;;;N;;;;; +10891;NABATAEAN LETTER LAMEDH;Lo;0;R;;;;;N;;;;; +10892;NABATAEAN LETTER FINAL MEM;Lo;0;R;;;;;N;;;;; +10893;NABATAEAN LETTER MEM;Lo;0;R;;;;;N;;;;; +10894;NABATAEAN LETTER FINAL NUN;Lo;0;R;;;;;N;;;;; +10895;NABATAEAN LETTER NUN;Lo;0;R;;;;;N;;;;; +10896;NABATAEAN LETTER SAMEKH;Lo;0;R;;;;;N;;;;; +10897;NABATAEAN LETTER AYIN;Lo;0;R;;;;;N;;;;; +10898;NABATAEAN LETTER PE;Lo;0;R;;;;;N;;;;; +10899;NABATAEAN LETTER SADHE;Lo;0;R;;;;;N;;;;; +1089A;NABATAEAN LETTER QOPH;Lo;0;R;;;;;N;;;;; +1089B;NABATAEAN LETTER RESH;Lo;0;R;;;;;N;;;;; +1089C;NABATAEAN LETTER FINAL SHIN;Lo;0;R;;;;;N;;;;; +1089D;NABATAEAN LETTER SHIN;Lo;0;R;;;;;N;;;;; +1089E;NABATAEAN LETTER TAW;Lo;0;R;;;;;N;;;;; +108A7;NABATAEAN NUMBER ONE;No;0;R;;;;1;N;;;;; +108A8;NABATAEAN NUMBER TWO;No;0;R;;;;2;N;;;;; +108A9;NABATAEAN NUMBER THREE;No;0;R;;;;3;N;;;;; +108AA;NABATAEAN NUMBER FOUR;No;0;R;;;;4;N;;;;; +108AB;NABATAEAN CRUCIFORM NUMBER FOUR;No;0;R;;;;4;N;;;;; +108AC;NABATAEAN NUMBER FIVE;No;0;R;;;;5;N;;;;; +108AD;NABATAEAN NUMBER TEN;No;0;R;;;;10;N;;;;; +108AE;NABATAEAN NUMBER TWENTY;No;0;R;;;;20;N;;;;; +108AF;NABATAEAN NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;; 10900;PHOENICIAN LETTER ALF;Lo;0;R;;;;;N;;;;; 10901;PHOENICIAN LETTER BET;Lo;0;R;;;;;N;;;;; 10902;PHOENICIAN LETTER GAML;Lo;0;R;;;;;N;;;;; @@ -17128,6 +18041,89 @@ 10A7D;OLD SOUTH ARABIAN NUMBER ONE;No;0;R;;;;1;N;;;;; 10A7E;OLD SOUTH ARABIAN NUMBER FIFTY;No;0;R;;;;50;N;;;;; 10A7F;OLD SOUTH ARABIAN NUMERIC INDICATOR;Po;0;R;;;;;N;;;;; +10A80;OLD NORTH ARABIAN LETTER HEH;Lo;0;R;;;;;N;;;;; +10A81;OLD NORTH ARABIAN LETTER LAM;Lo;0;R;;;;;N;;;;; +10A82;OLD NORTH ARABIAN LETTER HAH;Lo;0;R;;;;;N;;;;; +10A83;OLD NORTH ARABIAN LETTER 
MEEM;Lo;0;R;;;;;N;;;;; +10A84;OLD NORTH ARABIAN LETTER QAF;Lo;0;R;;;;;N;;;;; +10A85;OLD NORTH ARABIAN LETTER WAW;Lo;0;R;;;;;N;;;;; +10A86;OLD NORTH ARABIAN LETTER ES-2;Lo;0;R;;;;;N;;;;; +10A87;OLD NORTH ARABIAN LETTER REH;Lo;0;R;;;;;N;;;;; +10A88;OLD NORTH ARABIAN LETTER BEH;Lo;0;R;;;;;N;;;;; +10A89;OLD NORTH ARABIAN LETTER TEH;Lo;0;R;;;;;N;;;;; +10A8A;OLD NORTH ARABIAN LETTER ES-1;Lo;0;R;;;;;N;;;;; +10A8B;OLD NORTH ARABIAN LETTER KAF;Lo;0;R;;;;;N;;;;; +10A8C;OLD NORTH ARABIAN LETTER NOON;Lo;0;R;;;;;N;;;;; +10A8D;OLD NORTH ARABIAN LETTER KHAH;Lo;0;R;;;;;N;;;;; +10A8E;OLD NORTH ARABIAN LETTER SAD;Lo;0;R;;;;;N;;;;; +10A8F;OLD NORTH ARABIAN LETTER ES-3;Lo;0;R;;;;;N;;;;; +10A90;OLD NORTH ARABIAN LETTER FEH;Lo;0;R;;;;;N;;;;; +10A91;OLD NORTH ARABIAN LETTER ALEF;Lo;0;R;;;;;N;;;;; +10A92;OLD NORTH ARABIAN LETTER AIN;Lo;0;R;;;;;N;;;;; +10A93;OLD NORTH ARABIAN LETTER DAD;Lo;0;R;;;;;N;;;;; +10A94;OLD NORTH ARABIAN LETTER GEEM;Lo;0;R;;;;;N;;;;; +10A95;OLD NORTH ARABIAN LETTER DAL;Lo;0;R;;;;;N;;;;; +10A96;OLD NORTH ARABIAN LETTER GHAIN;Lo;0;R;;;;;N;;;;; +10A97;OLD NORTH ARABIAN LETTER TAH;Lo;0;R;;;;;N;;;;; +10A98;OLD NORTH ARABIAN LETTER ZAIN;Lo;0;R;;;;;N;;;;; +10A99;OLD NORTH ARABIAN LETTER THAL;Lo;0;R;;;;;N;;;;; +10A9A;OLD NORTH ARABIAN LETTER YEH;Lo;0;R;;;;;N;;;;; +10A9B;OLD NORTH ARABIAN LETTER THEH;Lo;0;R;;;;;N;;;;; +10A9C;OLD NORTH ARABIAN LETTER ZAH;Lo;0;R;;;;;N;;;;; +10A9D;OLD NORTH ARABIAN NUMBER ONE;No;0;R;;;;1;N;;;;; +10A9E;OLD NORTH ARABIAN NUMBER TEN;No;0;R;;;;10;N;;;;; +10A9F;OLD NORTH ARABIAN NUMBER TWENTY;No;0;R;;;;20;N;;;;; +10AC0;MANICHAEAN LETTER ALEPH;Lo;0;R;;;;;N;;;;; +10AC1;MANICHAEAN LETTER BETH;Lo;0;R;;;;;N;;;;; +10AC2;MANICHAEAN LETTER BHETH;Lo;0;R;;;;;N;;;;; +10AC3;MANICHAEAN LETTER GIMEL;Lo;0;R;;;;;N;;;;; +10AC4;MANICHAEAN LETTER GHIMEL;Lo;0;R;;;;;N;;;;; +10AC5;MANICHAEAN LETTER DALETH;Lo;0;R;;;;;N;;;;; +10AC6;MANICHAEAN LETTER HE;Lo;0;R;;;;;N;;;;; +10AC7;MANICHAEAN LETTER WAW;Lo;0;R;;;;;N;;;;; +10AC8;MANICHAEAN SIGN UD;So;0;R;;;;;N;;;;; 
+10AC9;MANICHAEAN LETTER ZAYIN;Lo;0;R;;;;;N;;;;; +10ACA;MANICHAEAN LETTER ZHAYIN;Lo;0;R;;;;;N;;;;; +10ACB;MANICHAEAN LETTER JAYIN;Lo;0;R;;;;;N;;;;; +10ACC;MANICHAEAN LETTER JHAYIN;Lo;0;R;;;;;N;;;;; +10ACD;MANICHAEAN LETTER HETH;Lo;0;R;;;;;N;;;;; +10ACE;MANICHAEAN LETTER TETH;Lo;0;R;;;;;N;;;;; +10ACF;MANICHAEAN LETTER YODH;Lo;0;R;;;;;N;;;;; +10AD0;MANICHAEAN LETTER KAPH;Lo;0;R;;;;;N;;;;; +10AD1;MANICHAEAN LETTER XAPH;Lo;0;R;;;;;N;;;;; +10AD2;MANICHAEAN LETTER KHAPH;Lo;0;R;;;;;N;;;;; +10AD3;MANICHAEAN LETTER LAMEDH;Lo;0;R;;;;;N;;;;; +10AD4;MANICHAEAN LETTER DHAMEDH;Lo;0;R;;;;;N;;;;; +10AD5;MANICHAEAN LETTER THAMEDH;Lo;0;R;;;;;N;;;;; +10AD6;MANICHAEAN LETTER MEM;Lo;0;R;;;;;N;;;;; +10AD7;MANICHAEAN LETTER NUN;Lo;0;R;;;;;N;;;;; +10AD8;MANICHAEAN LETTER SAMEKH;Lo;0;R;;;;;N;;;;; +10AD9;MANICHAEAN LETTER AYIN;Lo;0;R;;;;;N;;;;; +10ADA;MANICHAEAN LETTER AAYIN;Lo;0;R;;;;;N;;;;; +10ADB;MANICHAEAN LETTER PE;Lo;0;R;;;;;N;;;;; +10ADC;MANICHAEAN LETTER FE;Lo;0;R;;;;;N;;;;; +10ADD;MANICHAEAN LETTER SADHE;Lo;0;R;;;;;N;;;;; +10ADE;MANICHAEAN LETTER QOPH;Lo;0;R;;;;;N;;;;; +10ADF;MANICHAEAN LETTER XOPH;Lo;0;R;;;;;N;;;;; +10AE0;MANICHAEAN LETTER QHOPH;Lo;0;R;;;;;N;;;;; +10AE1;MANICHAEAN LETTER RESH;Lo;0;R;;;;;N;;;;; +10AE2;MANICHAEAN LETTER SHIN;Lo;0;R;;;;;N;;;;; +10AE3;MANICHAEAN LETTER SSHIN;Lo;0;R;;;;;N;;;;; +10AE4;MANICHAEAN LETTER TAW;Lo;0;R;;;;;N;;;;; +10AE5;MANICHAEAN ABBREVIATION MARK ABOVE;Mn;230;NSM;;;;;N;;;;; +10AE6;MANICHAEAN ABBREVIATION MARK BELOW;Mn;220;NSM;;;;;N;;;;; +10AEB;MANICHAEAN NUMBER ONE;No;0;R;;;;1;N;;;;; +10AEC;MANICHAEAN NUMBER FIVE;No;0;R;;;;5;N;;;;; +10AED;MANICHAEAN NUMBER TEN;No;0;R;;;;10;N;;;;; +10AEE;MANICHAEAN NUMBER TWENTY;No;0;R;;;;20;N;;;;; +10AEF;MANICHAEAN NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;; +10AF0;MANICHAEAN PUNCTUATION STAR;Po;0;R;;;;;N;;;;; +10AF1;MANICHAEAN PUNCTUATION FLEURON;Po;0;R;;;;;N;;;;; +10AF2;MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT;Po;0;R;;;;;N;;;;; +10AF3;MANICHAEAN PUNCTUATION DOT WITHIN DOT;Po;0;R;;;;;N;;;;; 
+10AF4;MANICHAEAN PUNCTUATION DOT;Po;0;R;;;;;N;;;;; +10AF5;MANICHAEAN PUNCTUATION TWO DOTS;Po;0;R;;;;;N;;;;; +10AF6;MANICHAEAN PUNCTUATION LINE FILLER;Po;0;R;;;;;N;;;;; 10B00;AVESTAN LETTER A;Lo;0;R;;;;;N;;;;; 10B01;AVESTAN LETTER AA;Lo;0;R;;;;;N;;;;; 10B02;AVESTAN LETTER AO;Lo;0;R;;;;;N;;;;; @@ -17246,6 +18242,35 @@ 10B7D;INSCRIPTIONAL PAHLAVI NUMBER TWENTY;No;0;R;;;;20;N;;;;; 10B7E;INSCRIPTIONAL PAHLAVI NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;; 10B7F;INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND;No;0;R;;;;1000;N;;;;; +10B80;PSALTER PAHLAVI LETTER ALEPH;Lo;0;R;;;;;N;;;;; +10B81;PSALTER PAHLAVI LETTER BETH;Lo;0;R;;;;;N;;;;; +10B82;PSALTER PAHLAVI LETTER GIMEL;Lo;0;R;;;;;N;;;;; +10B83;PSALTER PAHLAVI LETTER DALETH;Lo;0;R;;;;;N;;;;; +10B84;PSALTER PAHLAVI LETTER HE;Lo;0;R;;;;;N;;;;; +10B85;PSALTER PAHLAVI LETTER WAW-AYIN-RESH;Lo;0;R;;;;;N;;;;; +10B86;PSALTER PAHLAVI LETTER ZAYIN;Lo;0;R;;;;;N;;;;; +10B87;PSALTER PAHLAVI LETTER HETH;Lo;0;R;;;;;N;;;;; +10B88;PSALTER PAHLAVI LETTER YODH;Lo;0;R;;;;;N;;;;; +10B89;PSALTER PAHLAVI LETTER KAPH;Lo;0;R;;;;;N;;;;; +10B8A;PSALTER PAHLAVI LETTER LAMEDH;Lo;0;R;;;;;N;;;;; +10B8B;PSALTER PAHLAVI LETTER MEM-QOPH;Lo;0;R;;;;;N;;;;; +10B8C;PSALTER PAHLAVI LETTER NUN;Lo;0;R;;;;;N;;;;; +10B8D;PSALTER PAHLAVI LETTER SAMEKH;Lo;0;R;;;;;N;;;;; +10B8E;PSALTER PAHLAVI LETTER PE;Lo;0;R;;;;;N;;;;; +10B8F;PSALTER PAHLAVI LETTER SADHE;Lo;0;R;;;;;N;;;;; +10B90;PSALTER PAHLAVI LETTER SHIN;Lo;0;R;;;;;N;;;;; +10B91;PSALTER PAHLAVI LETTER TAW;Lo;0;R;;;;;N;;;;; +10B99;PSALTER PAHLAVI SECTION MARK;Po;0;R;;;;;N;;;;; +10B9A;PSALTER PAHLAVI TURNED SECTION MARK;Po;0;R;;;;;N;;;;; +10B9B;PSALTER PAHLAVI FOUR DOTS WITH CROSS;Po;0;R;;;;;N;;;;; +10B9C;PSALTER PAHLAVI FOUR DOTS WITH DOT;Po;0;R;;;;;N;;;;; +10BA9;PSALTER PAHLAVI NUMBER ONE;No;0;R;;;;1;N;;;;; +10BAA;PSALTER PAHLAVI NUMBER TWO;No;0;R;;;;2;N;;;;; +10BAB;PSALTER PAHLAVI NUMBER THREE;No;0;R;;;;3;N;;;;; +10BAC;PSALTER PAHLAVI NUMBER FOUR;No;0;R;;;;4;N;;;;; +10BAD;PSALTER PAHLAVI NUMBER TEN;No;0;R;;;;10;N;;;;; 
+10BAE;PSALTER PAHLAVI NUMBER TWENTY;No;0;R;;;;20;N;;;;; +10BAF;PSALTER PAHLAVI NUMBER ONE HUNDRED;No;0;R;;;;100;N;;;;; 10C00;OLD TURKIC LETTER ORKHON A;Lo;0;R;;;;;N;;;;; 10C01;OLD TURKIC LETTER YENISEI A;Lo;0;R;;;;;N;;;;; 10C02;OLD TURKIC LETTER YENISEI AE;Lo;0;R;;;;;N;;;;; @@ -17458,6 +18483,7 @@ 1106D;BRAHMI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; 1106E;BRAHMI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 1106F;BRAHMI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +1107F;BRAHMI NUMBER JOINER;Mn;9;NSM;;;;;N;;;;; 11080;KAITHI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 11081;KAITHI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; 11082;KAITHI SIGN VISARGA;Mc;0;L;;;;;N;;;;; @@ -17626,6 +18652,45 @@ 11141;CHAKMA DANDA;Po;0;L;;;;;N;;;;; 11142;CHAKMA DOUBLE DANDA;Po;0;L;;;;;N;;;;; 11143;CHAKMA QUESTION MARK;Po;0;L;;;;;N;;;;; +11150;MAHAJANI LETTER A;Lo;0;L;;;;;N;;;;; +11151;MAHAJANI LETTER I;Lo;0;L;;;;;N;;;;; +11152;MAHAJANI LETTER U;Lo;0;L;;;;;N;;;;; +11153;MAHAJANI LETTER E;Lo;0;L;;;;;N;;;;; +11154;MAHAJANI LETTER O;Lo;0;L;;;;;N;;;;; +11155;MAHAJANI LETTER KA;Lo;0;L;;;;;N;;;;; +11156;MAHAJANI LETTER KHA;Lo;0;L;;;;;N;;;;; +11157;MAHAJANI LETTER GA;Lo;0;L;;;;;N;;;;; +11158;MAHAJANI LETTER GHA;Lo;0;L;;;;;N;;;;; +11159;MAHAJANI LETTER CA;Lo;0;L;;;;;N;;;;; +1115A;MAHAJANI LETTER CHA;Lo;0;L;;;;;N;;;;; +1115B;MAHAJANI LETTER JA;Lo;0;L;;;;;N;;;;; +1115C;MAHAJANI LETTER JHA;Lo;0;L;;;;;N;;;;; +1115D;MAHAJANI LETTER NYA;Lo;0;L;;;;;N;;;;; +1115E;MAHAJANI LETTER TTA;Lo;0;L;;;;;N;;;;; +1115F;MAHAJANI LETTER TTHA;Lo;0;L;;;;;N;;;;; +11160;MAHAJANI LETTER DDA;Lo;0;L;;;;;N;;;;; +11161;MAHAJANI LETTER DDHA;Lo;0;L;;;;;N;;;;; +11162;MAHAJANI LETTER NNA;Lo;0;L;;;;;N;;;;; +11163;MAHAJANI LETTER TA;Lo;0;L;;;;;N;;;;; +11164;MAHAJANI LETTER THA;Lo;0;L;;;;;N;;;;; +11165;MAHAJANI LETTER DA;Lo;0;L;;;;;N;;;;; +11166;MAHAJANI LETTER DHA;Lo;0;L;;;;;N;;;;; +11167;MAHAJANI LETTER NA;Lo;0;L;;;;;N;;;;; +11168;MAHAJANI LETTER PA;Lo;0;L;;;;;N;;;;; +11169;MAHAJANI LETTER PHA;Lo;0;L;;;;;N;;;;; +1116A;MAHAJANI LETTER BA;Lo;0;L;;;;;N;;;;; +1116B;MAHAJANI LETTER 
BHA;Lo;0;L;;;;;N;;;;; +1116C;MAHAJANI LETTER MA;Lo;0;L;;;;;N;;;;; +1116D;MAHAJANI LETTER RA;Lo;0;L;;;;;N;;;;; +1116E;MAHAJANI LETTER LA;Lo;0;L;;;;;N;;;;; +1116F;MAHAJANI LETTER VA;Lo;0;L;;;;;N;;;;; +11170;MAHAJANI LETTER SA;Lo;0;L;;;;;N;;;;; +11171;MAHAJANI LETTER HA;Lo;0;L;;;;;N;;;;; +11172;MAHAJANI LETTER RRA;Lo;0;L;;;;;N;;;;; +11173;MAHAJANI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;; +11174;MAHAJANI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;; +11175;MAHAJANI SECTION MARK;Po;0;L;;;;;N;;;;; +11176;MAHAJANI LIGATURE SHRI;Lo;0;L;;;;;N;;;;; 11180;SHARADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; 11181;SHARADA SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; 11182;SHARADA SIGN VISARGA;Mc;0;L;;;;;N;;;;; @@ -17699,6 +18764,7 @@ 111C6;SHARADA DOUBLE DANDA;Po;0;L;;;;;N;;;;; 111C7;SHARADA ABBREVIATION SIGN;Po;0;L;;;;;N;;;;; 111C8;SHARADA SEPARATOR;Po;0;L;;;;;N;;;;; +111CD;SHARADA SUTRA MARK;Po;0;L;;;;;N;;;;; 111D0;SHARADA DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; 111D1;SHARADA DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; 111D2;SHARADA DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; @@ -17709,6 +18775,473 @@ 111D7;SHARADA DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; 111D8;SHARADA DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 111D9;SHARADA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +111DA;SHARADA EKAM;Lo;0;L;;;;;N;;;;; +111E1;SINHALA ARCHAIC DIGIT ONE;No;0;L;;;;1;N;;;;; +111E2;SINHALA ARCHAIC DIGIT TWO;No;0;L;;;;2;N;;;;; +111E3;SINHALA ARCHAIC DIGIT THREE;No;0;L;;;;3;N;;;;; +111E4;SINHALA ARCHAIC DIGIT FOUR;No;0;L;;;;4;N;;;;; +111E5;SINHALA ARCHAIC DIGIT FIVE;No;0;L;;;;5;N;;;;; +111E6;SINHALA ARCHAIC DIGIT SIX;No;0;L;;;;6;N;;;;; +111E7;SINHALA ARCHAIC DIGIT SEVEN;No;0;L;;;;7;N;;;;; +111E8;SINHALA ARCHAIC DIGIT EIGHT;No;0;L;;;;8;N;;;;; +111E9;SINHALA ARCHAIC DIGIT NINE;No;0;L;;;;9;N;;;;; +111EA;SINHALA ARCHAIC NUMBER TEN;No;0;L;;;;10;N;;;;; +111EB;SINHALA ARCHAIC NUMBER TWENTY;No;0;L;;;;20;N;;;;; +111EC;SINHALA ARCHAIC NUMBER THIRTY;No;0;L;;;;30;N;;;;; +111ED;SINHALA ARCHAIC NUMBER FORTY;No;0;L;;;;40;N;;;;; +111EE;SINHALA ARCHAIC NUMBER FIFTY;No;0;L;;;;50;N;;;;; +111EF;SINHALA ARCHAIC 
NUMBER SIXTY;No;0;L;;;;60;N;;;;; +111F0;SINHALA ARCHAIC NUMBER SEVENTY;No;0;L;;;;70;N;;;;; +111F1;SINHALA ARCHAIC NUMBER EIGHTY;No;0;L;;;;80;N;;;;; +111F2;SINHALA ARCHAIC NUMBER NINETY;No;0;L;;;;90;N;;;;; +111F3;SINHALA ARCHAIC NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;; +111F4;SINHALA ARCHAIC NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;; +11200;KHOJKI LETTER A;Lo;0;L;;;;;N;;;;; +11201;KHOJKI LETTER AA;Lo;0;L;;;;;N;;;;; +11202;KHOJKI LETTER I;Lo;0;L;;;;;N;;;;; +11203;KHOJKI LETTER U;Lo;0;L;;;;;N;;;;; +11204;KHOJKI LETTER E;Lo;0;L;;;;;N;;;;; +11205;KHOJKI LETTER AI;Lo;0;L;;;;;N;;;;; +11206;KHOJKI LETTER O;Lo;0;L;;;;;N;;;;; +11207;KHOJKI LETTER AU;Lo;0;L;;;;;N;;;;; +11208;KHOJKI LETTER KA;Lo;0;L;;;;;N;;;;; +11209;KHOJKI LETTER KHA;Lo;0;L;;;;;N;;;;; +1120A;KHOJKI LETTER GA;Lo;0;L;;;;;N;;;;; +1120B;KHOJKI LETTER GGA;Lo;0;L;;;;;N;;;;; +1120C;KHOJKI LETTER GHA;Lo;0;L;;;;;N;;;;; +1120D;KHOJKI LETTER NGA;Lo;0;L;;;;;N;;;;; +1120E;KHOJKI LETTER CA;Lo;0;L;;;;;N;;;;; +1120F;KHOJKI LETTER CHA;Lo;0;L;;;;;N;;;;; +11210;KHOJKI LETTER JA;Lo;0;L;;;;;N;;;;; +11211;KHOJKI LETTER JJA;Lo;0;L;;;;;N;;;;; +11213;KHOJKI LETTER NYA;Lo;0;L;;;;;N;;;;; +11214;KHOJKI LETTER TTA;Lo;0;L;;;;;N;;;;; +11215;KHOJKI LETTER TTHA;Lo;0;L;;;;;N;;;;; +11216;KHOJKI LETTER DDA;Lo;0;L;;;;;N;;;;; +11217;KHOJKI LETTER DDHA;Lo;0;L;;;;;N;;;;; +11218;KHOJKI LETTER NNA;Lo;0;L;;;;;N;;;;; +11219;KHOJKI LETTER TA;Lo;0;L;;;;;N;;;;; +1121A;KHOJKI LETTER THA;Lo;0;L;;;;;N;;;;; +1121B;KHOJKI LETTER DA;Lo;0;L;;;;;N;;;;; +1121C;KHOJKI LETTER DDDA;Lo;0;L;;;;;N;;;;; +1121D;KHOJKI LETTER DHA;Lo;0;L;;;;;N;;;;; +1121E;KHOJKI LETTER NA;Lo;0;L;;;;;N;;;;; +1121F;KHOJKI LETTER PA;Lo;0;L;;;;;N;;;;; +11220;KHOJKI LETTER PHA;Lo;0;L;;;;;N;;;;; +11221;KHOJKI LETTER BA;Lo;0;L;;;;;N;;;;; +11222;KHOJKI LETTER BBA;Lo;0;L;;;;;N;;;;; +11223;KHOJKI LETTER BHA;Lo;0;L;;;;;N;;;;; +11224;KHOJKI LETTER MA;Lo;0;L;;;;;N;;;;; +11225;KHOJKI LETTER YA;Lo;0;L;;;;;N;;;;; +11226;KHOJKI LETTER RA;Lo;0;L;;;;;N;;;;; +11227;KHOJKI LETTER LA;Lo;0;L;;;;;N;;;;; 
+11228;KHOJKI LETTER VA;Lo;0;L;;;;;N;;;;; +11229;KHOJKI LETTER SA;Lo;0;L;;;;;N;;;;; +1122A;KHOJKI LETTER HA;Lo;0;L;;;;;N;;;;; +1122B;KHOJKI LETTER LLA;Lo;0;L;;;;;N;;;;; +1122C;KHOJKI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;; +1122D;KHOJKI VOWEL SIGN I;Mc;0;L;;;;;N;;;;; +1122E;KHOJKI VOWEL SIGN II;Mc;0;L;;;;;N;;;;; +1122F;KHOJKI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;; +11230;KHOJKI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;; +11231;KHOJKI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;; +11232;KHOJKI VOWEL SIGN O;Mc;0;L;;;;;N;;;;; +11233;KHOJKI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;; +11234;KHOJKI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; +11235;KHOJKI SIGN VIRAMA;Mc;9;L;;;;;N;;;;; +11236;KHOJKI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;; +11237;KHOJKI SIGN SHADDA;Mn;0;NSM;;;;;N;;;;; +11238;KHOJKI DANDA;Po;0;L;;;;;N;;;;; +11239;KHOJKI DOUBLE DANDA;Po;0;L;;;;;N;;;;; +1123A;KHOJKI WORD SEPARATOR;Po;0;L;;;;;N;;;;; +1123B;KHOJKI SECTION MARK;Po;0;L;;;;;N;;;;; +1123C;KHOJKI DOUBLE SECTION MARK;Po;0;L;;;;;N;;;;; +1123D;KHOJKI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;; +112B0;KHUDAWADI LETTER A;Lo;0;L;;;;;N;;;;; +112B1;KHUDAWADI LETTER AA;Lo;0;L;;;;;N;;;;; +112B2;KHUDAWADI LETTER I;Lo;0;L;;;;;N;;;;; +112B3;KHUDAWADI LETTER II;Lo;0;L;;;;;N;;;;; +112B4;KHUDAWADI LETTER U;Lo;0;L;;;;;N;;;;; +112B5;KHUDAWADI LETTER UU;Lo;0;L;;;;;N;;;;; +112B6;KHUDAWADI LETTER E;Lo;0;L;;;;;N;;;;; +112B7;KHUDAWADI LETTER AI;Lo;0;L;;;;;N;;;;; +112B8;KHUDAWADI LETTER O;Lo;0;L;;;;;N;;;;; +112B9;KHUDAWADI LETTER AU;Lo;0;L;;;;;N;;;;; +112BA;KHUDAWADI LETTER KA;Lo;0;L;;;;;N;;;;; +112BB;KHUDAWADI LETTER KHA;Lo;0;L;;;;;N;;;;; +112BC;KHUDAWADI LETTER GA;Lo;0;L;;;;;N;;;;; +112BD;KHUDAWADI LETTER GGA;Lo;0;L;;;;;N;;;;; +112BE;KHUDAWADI LETTER GHA;Lo;0;L;;;;;N;;;;; +112BF;KHUDAWADI LETTER NGA;Lo;0;L;;;;;N;;;;; +112C0;KHUDAWADI LETTER CA;Lo;0;L;;;;;N;;;;; +112C1;KHUDAWADI LETTER CHA;Lo;0;L;;;;;N;;;;; +112C2;KHUDAWADI LETTER JA;Lo;0;L;;;;;N;;;;; +112C3;KHUDAWADI LETTER JJA;Lo;0;L;;;;;N;;;;; +112C4;KHUDAWADI LETTER JHA;Lo;0;L;;;;;N;;;;; +112C5;KHUDAWADI LETTER NYA;Lo;0;L;;;;;N;;;;; 
+112C6;KHUDAWADI LETTER TTA;Lo;0;L;;;;;N;;;;; +112C7;KHUDAWADI LETTER TTHA;Lo;0;L;;;;;N;;;;; +112C8;KHUDAWADI LETTER DDA;Lo;0;L;;;;;N;;;;; +112C9;KHUDAWADI LETTER DDDA;Lo;0;L;;;;;N;;;;; +112CA;KHUDAWADI LETTER RRA;Lo;0;L;;;;;N;;;;; +112CB;KHUDAWADI LETTER DDHA;Lo;0;L;;;;;N;;;;; +112CC;KHUDAWADI LETTER NNA;Lo;0;L;;;;;N;;;;; +112CD;KHUDAWADI LETTER TA;Lo;0;L;;;;;N;;;;; +112CE;KHUDAWADI LETTER THA;Lo;0;L;;;;;N;;;;; +112CF;KHUDAWADI LETTER DA;Lo;0;L;;;;;N;;;;; +112D0;KHUDAWADI LETTER DHA;Lo;0;L;;;;;N;;;;; +112D1;KHUDAWADI LETTER NA;Lo;0;L;;;;;N;;;;; +112D2;KHUDAWADI LETTER PA;Lo;0;L;;;;;N;;;;; +112D3;KHUDAWADI LETTER PHA;Lo;0;L;;;;;N;;;;; +112D4;KHUDAWADI LETTER BA;Lo;0;L;;;;;N;;;;; +112D5;KHUDAWADI LETTER BBA;Lo;0;L;;;;;N;;;;; +112D6;KHUDAWADI LETTER BHA;Lo;0;L;;;;;N;;;;; +112D7;KHUDAWADI LETTER MA;Lo;0;L;;;;;N;;;;; +112D8;KHUDAWADI LETTER YA;Lo;0;L;;;;;N;;;;; +112D9;KHUDAWADI LETTER RA;Lo;0;L;;;;;N;;;;; +112DA;KHUDAWADI LETTER LA;Lo;0;L;;;;;N;;;;; +112DB;KHUDAWADI LETTER VA;Lo;0;L;;;;;N;;;;; +112DC;KHUDAWADI LETTER SHA;Lo;0;L;;;;;N;;;;; +112DD;KHUDAWADI LETTER SA;Lo;0;L;;;;;N;;;;; +112DE;KHUDAWADI LETTER HA;Lo;0;L;;;;;N;;;;; +112DF;KHUDAWADI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; +112E0;KHUDAWADI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;; +112E1;KHUDAWADI VOWEL SIGN I;Mc;0;L;;;;;N;;;;; +112E2;KHUDAWADI VOWEL SIGN II;Mc;0;L;;;;;N;;;;; +112E3;KHUDAWADI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;; +112E4;KHUDAWADI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;; +112E5;KHUDAWADI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;; +112E6;KHUDAWADI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;; +112E7;KHUDAWADI VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;; +112E8;KHUDAWADI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;; +112E9;KHUDAWADI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;; +112EA;KHUDAWADI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;; +112F0;KHUDAWADI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +112F1;KHUDAWADI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +112F2;KHUDAWADI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +112F3;KHUDAWADI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +112F4;KHUDAWADI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +112F5;KHUDAWADI 
DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +112F6;KHUDAWADI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +112F7;KHUDAWADI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +112F8;KHUDAWADI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +112F9;KHUDAWADI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +11301;GRANTHA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; +11302;GRANTHA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;; +11303;GRANTHA SIGN VISARGA;Mc;0;L;;;;;N;;;;; +11305;GRANTHA LETTER A;Lo;0;L;;;;;N;;;;; +11306;GRANTHA LETTER AA;Lo;0;L;;;;;N;;;;; +11307;GRANTHA LETTER I;Lo;0;L;;;;;N;;;;; +11308;GRANTHA LETTER II;Lo;0;L;;;;;N;;;;; +11309;GRANTHA LETTER U;Lo;0;L;;;;;N;;;;; +1130A;GRANTHA LETTER UU;Lo;0;L;;;;;N;;;;; +1130B;GRANTHA LETTER VOCALIC R;Lo;0;L;;;;;N;;;;; +1130C;GRANTHA LETTER VOCALIC L;Lo;0;L;;;;;N;;;;; +1130F;GRANTHA LETTER EE;Lo;0;L;;;;;N;;;;; +11310;GRANTHA LETTER AI;Lo;0;L;;;;;N;;;;; +11313;GRANTHA LETTER OO;Lo;0;L;;;;;N;;;;; +11314;GRANTHA LETTER AU;Lo;0;L;;;;;N;;;;; +11315;GRANTHA LETTER KA;Lo;0;L;;;;;N;;;;; +11316;GRANTHA LETTER KHA;Lo;0;L;;;;;N;;;;; +11317;GRANTHA LETTER GA;Lo;0;L;;;;;N;;;;; +11318;GRANTHA LETTER GHA;Lo;0;L;;;;;N;;;;; +11319;GRANTHA LETTER NGA;Lo;0;L;;;;;N;;;;; +1131A;GRANTHA LETTER CA;Lo;0;L;;;;;N;;;;; +1131B;GRANTHA LETTER CHA;Lo;0;L;;;;;N;;;;; +1131C;GRANTHA LETTER JA;Lo;0;L;;;;;N;;;;; +1131D;GRANTHA LETTER JHA;Lo;0;L;;;;;N;;;;; +1131E;GRANTHA LETTER NYA;Lo;0;L;;;;;N;;;;; +1131F;GRANTHA LETTER TTA;Lo;0;L;;;;;N;;;;; +11320;GRANTHA LETTER TTHA;Lo;0;L;;;;;N;;;;; +11321;GRANTHA LETTER DDA;Lo;0;L;;;;;N;;;;; +11322;GRANTHA LETTER DDHA;Lo;0;L;;;;;N;;;;; +11323;GRANTHA LETTER NNA;Lo;0;L;;;;;N;;;;; +11324;GRANTHA LETTER TA;Lo;0;L;;;;;N;;;;; +11325;GRANTHA LETTER THA;Lo;0;L;;;;;N;;;;; +11326;GRANTHA LETTER DA;Lo;0;L;;;;;N;;;;; +11327;GRANTHA LETTER DHA;Lo;0;L;;;;;N;;;;; +11328;GRANTHA LETTER NA;Lo;0;L;;;;;N;;;;; +1132A;GRANTHA LETTER PA;Lo;0;L;;;;;N;;;;; +1132B;GRANTHA LETTER PHA;Lo;0;L;;;;;N;;;;; +1132C;GRANTHA LETTER BA;Lo;0;L;;;;;N;;;;; +1132D;GRANTHA LETTER BHA;Lo;0;L;;;;;N;;;;; +1132E;GRANTHA LETTER MA;Lo;0;L;;;;;N;;;;; 
+1132F;GRANTHA LETTER YA;Lo;0;L;;;;;N;;;;; +11330;GRANTHA LETTER RA;Lo;0;L;;;;;N;;;;; +11332;GRANTHA LETTER LA;Lo;0;L;;;;;N;;;;; +11333;GRANTHA LETTER LLA;Lo;0;L;;;;;N;;;;; +11335;GRANTHA LETTER VA;Lo;0;L;;;;;N;;;;; +11336;GRANTHA LETTER SHA;Lo;0;L;;;;;N;;;;; +11337;GRANTHA LETTER SSA;Lo;0;L;;;;;N;;;;; +11338;GRANTHA LETTER SA;Lo;0;L;;;;;N;;;;; +11339;GRANTHA LETTER HA;Lo;0;L;;;;;N;;;;; +1133C;GRANTHA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;; +1133D;GRANTHA SIGN AVAGRAHA;Lo;0;L;;;;;N;;;;; +1133E;GRANTHA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;; +1133F;GRANTHA VOWEL SIGN I;Mc;0;L;;;;;N;;;;; +11340;GRANTHA VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;; +11341;GRANTHA VOWEL SIGN U;Mc;0;L;;;;;N;;;;; +11342;GRANTHA VOWEL SIGN UU;Mc;0;L;;;;;N;;;;; +11343;GRANTHA VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;; +11344;GRANTHA VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;; +11347;GRANTHA VOWEL SIGN EE;Mc;0;L;;;;;N;;;;; +11348;GRANTHA VOWEL SIGN AI;Mc;0;L;;;;;N;;;;; +1134B;GRANTHA VOWEL SIGN OO;Mc;0;L;11347 1133E;;;;N;;;;; +1134C;GRANTHA VOWEL SIGN AU;Mc;0;L;11347 11357;;;;N;;;;; +1134D;GRANTHA SIGN VIRAMA;Mc;9;L;;;;;N;;;;; +11357;GRANTHA AU LENGTH MARK;Mc;0;L;;;;;N;;;;; +1135D;GRANTHA SIGN PLUTA;Lo;0;L;;;;;N;;;;; +1135E;GRANTHA LETTER VEDIC ANUSVARA;Lo;0;L;;;;;N;;;;; +1135F;GRANTHA LETTER VEDIC DOUBLE ANUSVARA;Lo;0;L;;;;;N;;;;; +11360;GRANTHA LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; +11361;GRANTHA LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;; +11362;GRANTHA VOWEL SIGN VOCALIC L;Mc;0;L;;;;;N;;;;; +11363;GRANTHA VOWEL SIGN VOCALIC LL;Mc;0;L;;;;;N;;;;; +11366;COMBINING GRANTHA DIGIT ZERO;Mn;230;NSM;;;;;N;;;;; +11367;COMBINING GRANTHA DIGIT ONE;Mn;230;NSM;;;;;N;;;;; +11368;COMBINING GRANTHA DIGIT TWO;Mn;230;NSM;;;;;N;;;;; +11369;COMBINING GRANTHA DIGIT THREE;Mn;230;NSM;;;;;N;;;;; +1136A;COMBINING GRANTHA DIGIT FOUR;Mn;230;NSM;;;;;N;;;;; +1136B;COMBINING GRANTHA DIGIT FIVE;Mn;230;NSM;;;;;N;;;;; +1136C;COMBINING GRANTHA DIGIT SIX;Mn;230;NSM;;;;;N;;;;; +11370;COMBINING GRANTHA LETTER A;Mn;230;NSM;;;;;N;;;;; +11371;COMBINING GRANTHA LETTER 
KA;Mn;230;NSM;;;;;N;;;;; +11372;COMBINING GRANTHA LETTER NA;Mn;230;NSM;;;;;N;;;;; +11373;COMBINING GRANTHA LETTER VI;Mn;230;NSM;;;;;N;;;;; +11374;COMBINING GRANTHA LETTER PA;Mn;230;NSM;;;;;N;;;;; +11480;TIRHUTA ANJI;Lo;0;L;;;;;N;;;;; +11481;TIRHUTA LETTER A;Lo;0;L;;;;;N;;;;; +11482;TIRHUTA LETTER AA;Lo;0;L;;;;;N;;;;; +11483;TIRHUTA LETTER I;Lo;0;L;;;;;N;;;;; +11484;TIRHUTA LETTER II;Lo;0;L;;;;;N;;;;; +11485;TIRHUTA LETTER U;Lo;0;L;;;;;N;;;;; +11486;TIRHUTA LETTER UU;Lo;0;L;;;;;N;;;;; +11487;TIRHUTA LETTER VOCALIC R;Lo;0;L;;;;;N;;;;; +11488;TIRHUTA LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; +11489;TIRHUTA LETTER VOCALIC L;Lo;0;L;;;;;N;;;;; +1148A;TIRHUTA LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;; +1148B;TIRHUTA LETTER E;Lo;0;L;;;;;N;;;;; +1148C;TIRHUTA LETTER AI;Lo;0;L;;;;;N;;;;; +1148D;TIRHUTA LETTER O;Lo;0;L;;;;;N;;;;; +1148E;TIRHUTA LETTER AU;Lo;0;L;;;;;N;;;;; +1148F;TIRHUTA LETTER KA;Lo;0;L;;;;;N;;;;; +11490;TIRHUTA LETTER KHA;Lo;0;L;;;;;N;;;;; +11491;TIRHUTA LETTER GA;Lo;0;L;;;;;N;;;;; +11492;TIRHUTA LETTER GHA;Lo;0;L;;;;;N;;;;; +11493;TIRHUTA LETTER NGA;Lo;0;L;;;;;N;;;;; +11494;TIRHUTA LETTER CA;Lo;0;L;;;;;N;;;;; +11495;TIRHUTA LETTER CHA;Lo;0;L;;;;;N;;;;; +11496;TIRHUTA LETTER JA;Lo;0;L;;;;;N;;;;; +11497;TIRHUTA LETTER JHA;Lo;0;L;;;;;N;;;;; +11498;TIRHUTA LETTER NYA;Lo;0;L;;;;;N;;;;; +11499;TIRHUTA LETTER TTA;Lo;0;L;;;;;N;;;;; +1149A;TIRHUTA LETTER TTHA;Lo;0;L;;;;;N;;;;; +1149B;TIRHUTA LETTER DDA;Lo;0;L;;;;;N;;;;; +1149C;TIRHUTA LETTER DDHA;Lo;0;L;;;;;N;;;;; +1149D;TIRHUTA LETTER NNA;Lo;0;L;;;;;N;;;;; +1149E;TIRHUTA LETTER TA;Lo;0;L;;;;;N;;;;; +1149F;TIRHUTA LETTER THA;Lo;0;L;;;;;N;;;;; +114A0;TIRHUTA LETTER DA;Lo;0;L;;;;;N;;;;; +114A1;TIRHUTA LETTER DHA;Lo;0;L;;;;;N;;;;; +114A2;TIRHUTA LETTER NA;Lo;0;L;;;;;N;;;;; +114A3;TIRHUTA LETTER PA;Lo;0;L;;;;;N;;;;; +114A4;TIRHUTA LETTER PHA;Lo;0;L;;;;;N;;;;; +114A5;TIRHUTA LETTER BA;Lo;0;L;;;;;N;;;;; +114A6;TIRHUTA LETTER BHA;Lo;0;L;;;;;N;;;;; +114A7;TIRHUTA LETTER MA;Lo;0;L;;;;;N;;;;; +114A8;TIRHUTA LETTER YA;Lo;0;L;;;;;N;;;;; 
+114A9;TIRHUTA LETTER RA;Lo;0;L;;;;;N;;;;; +114AA;TIRHUTA LETTER LA;Lo;0;L;;;;;N;;;;; +114AB;TIRHUTA LETTER VA;Lo;0;L;;;;;N;;;;; +114AC;TIRHUTA LETTER SHA;Lo;0;L;;;;;N;;;;; +114AD;TIRHUTA LETTER SSA;Lo;0;L;;;;;N;;;;; +114AE;TIRHUTA LETTER SA;Lo;0;L;;;;;N;;;;; +114AF;TIRHUTA LETTER HA;Lo;0;L;;;;;N;;;;; +114B0;TIRHUTA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;; +114B1;TIRHUTA VOWEL SIGN I;Mc;0;L;;;;;N;;;;; +114B2;TIRHUTA VOWEL SIGN II;Mc;0;L;;;;;N;;;;; +114B3;TIRHUTA VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;; +114B4;TIRHUTA VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;; +114B5;TIRHUTA VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;; +114B6;TIRHUTA VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;; +114B7;TIRHUTA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;; +114B8;TIRHUTA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;; +114B9;TIRHUTA VOWEL SIGN E;Mc;0;L;;;;;N;;;;; +114BA;TIRHUTA VOWEL SIGN SHORT E;Mn;0;NSM;;;;;N;;;;; +114BB;TIRHUTA VOWEL SIGN AI;Mc;0;L;114B9 114BA;;;;N;;;;; +114BC;TIRHUTA VOWEL SIGN O;Mc;0;L;114B9 114B0;;;;N;;;;; +114BD;TIRHUTA VOWEL SIGN SHORT O;Mc;0;L;;;;;N;;;;; +114BE;TIRHUTA VOWEL SIGN AU;Mc;0;L;114B9 114BD;;;;N;;;;; +114BF;TIRHUTA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; +114C0;TIRHUTA SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; +114C1;TIRHUTA SIGN VISARGA;Mc;0;L;;;;;N;;;;; +114C2;TIRHUTA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;; +114C3;TIRHUTA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;; +114C4;TIRHUTA SIGN AVAGRAHA;Lo;0;L;;;;;N;;;;; +114C5;TIRHUTA GVANG;Lo;0;L;;;;;N;;;;; +114C6;TIRHUTA ABBREVIATION SIGN;Po;0;L;;;;;N;;;;; +114C7;TIRHUTA OM;Lo;0;L;;;;;N;;;;; +114D0;TIRHUTA DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +114D1;TIRHUTA DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +114D2;TIRHUTA DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +114D3;TIRHUTA DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +114D4;TIRHUTA DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +114D5;TIRHUTA DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +114D6;TIRHUTA DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +114D7;TIRHUTA DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +114D8;TIRHUTA DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +114D9;TIRHUTA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +11580;SIDDHAM LETTER 
A;Lo;0;L;;;;;N;;;;; +11581;SIDDHAM LETTER AA;Lo;0;L;;;;;N;;;;; +11582;SIDDHAM LETTER I;Lo;0;L;;;;;N;;;;; +11583;SIDDHAM LETTER II;Lo;0;L;;;;;N;;;;; +11584;SIDDHAM LETTER U;Lo;0;L;;;;;N;;;;; +11585;SIDDHAM LETTER UU;Lo;0;L;;;;;N;;;;; +11586;SIDDHAM LETTER VOCALIC R;Lo;0;L;;;;;N;;;;; +11587;SIDDHAM LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; +11588;SIDDHAM LETTER VOCALIC L;Lo;0;L;;;;;N;;;;; +11589;SIDDHAM LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;; +1158A;SIDDHAM LETTER E;Lo;0;L;;;;;N;;;;; +1158B;SIDDHAM LETTER AI;Lo;0;L;;;;;N;;;;; +1158C;SIDDHAM LETTER O;Lo;0;L;;;;;N;;;;; +1158D;SIDDHAM LETTER AU;Lo;0;L;;;;;N;;;;; +1158E;SIDDHAM LETTER KA;Lo;0;L;;;;;N;;;;; +1158F;SIDDHAM LETTER KHA;Lo;0;L;;;;;N;;;;; +11590;SIDDHAM LETTER GA;Lo;0;L;;;;;N;;;;; +11591;SIDDHAM LETTER GHA;Lo;0;L;;;;;N;;;;; +11592;SIDDHAM LETTER NGA;Lo;0;L;;;;;N;;;;; +11593;SIDDHAM LETTER CA;Lo;0;L;;;;;N;;;;; +11594;SIDDHAM LETTER CHA;Lo;0;L;;;;;N;;;;; +11595;SIDDHAM LETTER JA;Lo;0;L;;;;;N;;;;; +11596;SIDDHAM LETTER JHA;Lo;0;L;;;;;N;;;;; +11597;SIDDHAM LETTER NYA;Lo;0;L;;;;;N;;;;; +11598;SIDDHAM LETTER TTA;Lo;0;L;;;;;N;;;;; +11599;SIDDHAM LETTER TTHA;Lo;0;L;;;;;N;;;;; +1159A;SIDDHAM LETTER DDA;Lo;0;L;;;;;N;;;;; +1159B;SIDDHAM LETTER DDHA;Lo;0;L;;;;;N;;;;; +1159C;SIDDHAM LETTER NNA;Lo;0;L;;;;;N;;;;; +1159D;SIDDHAM LETTER TA;Lo;0;L;;;;;N;;;;; +1159E;SIDDHAM LETTER THA;Lo;0;L;;;;;N;;;;; +1159F;SIDDHAM LETTER DA;Lo;0;L;;;;;N;;;;; +115A0;SIDDHAM LETTER DHA;Lo;0;L;;;;;N;;;;; +115A1;SIDDHAM LETTER NA;Lo;0;L;;;;;N;;;;; +115A2;SIDDHAM LETTER PA;Lo;0;L;;;;;N;;;;; +115A3;SIDDHAM LETTER PHA;Lo;0;L;;;;;N;;;;; +115A4;SIDDHAM LETTER BA;Lo;0;L;;;;;N;;;;; +115A5;SIDDHAM LETTER BHA;Lo;0;L;;;;;N;;;;; +115A6;SIDDHAM LETTER MA;Lo;0;L;;;;;N;;;;; +115A7;SIDDHAM LETTER YA;Lo;0;L;;;;;N;;;;; +115A8;SIDDHAM LETTER RA;Lo;0;L;;;;;N;;;;; +115A9;SIDDHAM LETTER LA;Lo;0;L;;;;;N;;;;; +115AA;SIDDHAM LETTER VA;Lo;0;L;;;;;N;;;;; +115AB;SIDDHAM LETTER SHA;Lo;0;L;;;;;N;;;;; +115AC;SIDDHAM LETTER SSA;Lo;0;L;;;;;N;;;;; +115AD;SIDDHAM LETTER SA;Lo;0;L;;;;;N;;;;; 
+115AE;SIDDHAM LETTER HA;Lo;0;L;;;;;N;;;;; +115AF;SIDDHAM VOWEL SIGN AA;Mc;0;L;;;;;N;;;;; +115B0;SIDDHAM VOWEL SIGN I;Mc;0;L;;;;;N;;;;; +115B1;SIDDHAM VOWEL SIGN II;Mc;0;L;;;;;N;;;;; +115B2;SIDDHAM VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;; +115B3;SIDDHAM VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;; +115B4;SIDDHAM VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;; +115B5;SIDDHAM VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;; +115B8;SIDDHAM VOWEL SIGN E;Mc;0;L;;;;;N;;;;; +115B9;SIDDHAM VOWEL SIGN AI;Mc;0;L;;;;;N;;;;; +115BA;SIDDHAM VOWEL SIGN O;Mc;0;L;115B8 115AF;;;;N;;;;; +115BB;SIDDHAM VOWEL SIGN AU;Mc;0;L;115B9 115AF;;;;N;;;;; +115BC;SIDDHAM SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;; +115BD;SIDDHAM SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; +115BE;SIDDHAM SIGN VISARGA;Mc;0;L;;;;;N;;;;; +115BF;SIDDHAM SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;; +115C0;SIDDHAM SIGN NUKTA;Mn;7;NSM;;;;;N;;;;; +115C1;SIDDHAM SIGN SIDDHAM;Po;0;L;;;;;N;;;;; +115C2;SIDDHAM DANDA;Po;0;L;;;;;N;;;;; +115C3;SIDDHAM DOUBLE DANDA;Po;0;L;;;;;N;;;;; +115C4;SIDDHAM SEPARATOR DOT;Po;0;L;;;;;N;;;;; +115C5;SIDDHAM SEPARATOR BAR;Po;0;L;;;;;N;;;;; +115C6;SIDDHAM REPETITION MARK-1;Po;0;L;;;;;N;;;;; +115C7;SIDDHAM REPETITION MARK-2;Po;0;L;;;;;N;;;;; +115C8;SIDDHAM REPETITION MARK-3;Po;0;L;;;;;N;;;;; +115C9;SIDDHAM END OF TEXT MARK;Po;0;L;;;;;N;;;;; +11600;MODI LETTER A;Lo;0;L;;;;;N;;;;; +11601;MODI LETTER AA;Lo;0;L;;;;;N;;;;; +11602;MODI LETTER I;Lo;0;L;;;;;N;;;;; +11603;MODI LETTER II;Lo;0;L;;;;;N;;;;; +11604;MODI LETTER U;Lo;0;L;;;;;N;;;;; +11605;MODI LETTER UU;Lo;0;L;;;;;N;;;;; +11606;MODI LETTER VOCALIC R;Lo;0;L;;;;;N;;;;; +11607;MODI LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;; +11608;MODI LETTER VOCALIC L;Lo;0;L;;;;;N;;;;; +11609;MODI LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;; +1160A;MODI LETTER E;Lo;0;L;;;;;N;;;;; +1160B;MODI LETTER AI;Lo;0;L;;;;;N;;;;; +1160C;MODI LETTER O;Lo;0;L;;;;;N;;;;; +1160D;MODI LETTER AU;Lo;0;L;;;;;N;;;;; +1160E;MODI LETTER KA;Lo;0;L;;;;;N;;;;; +1160F;MODI LETTER KHA;Lo;0;L;;;;;N;;;;; +11610;MODI LETTER GA;Lo;0;L;;;;;N;;;;; +11611;MODI LETTER 
GHA;Lo;0;L;;;;;N;;;;; +11612;MODI LETTER NGA;Lo;0;L;;;;;N;;;;; +11613;MODI LETTER CA;Lo;0;L;;;;;N;;;;; +11614;MODI LETTER CHA;Lo;0;L;;;;;N;;;;; +11615;MODI LETTER JA;Lo;0;L;;;;;N;;;;; +11616;MODI LETTER JHA;Lo;0;L;;;;;N;;;;; +11617;MODI LETTER NYA;Lo;0;L;;;;;N;;;;; +11618;MODI LETTER TTA;Lo;0;L;;;;;N;;;;; +11619;MODI LETTER TTHA;Lo;0;L;;;;;N;;;;; +1161A;MODI LETTER DDA;Lo;0;L;;;;;N;;;;; +1161B;MODI LETTER DDHA;Lo;0;L;;;;;N;;;;; +1161C;MODI LETTER NNA;Lo;0;L;;;;;N;;;;; +1161D;MODI LETTER TA;Lo;0;L;;;;;N;;;;; +1161E;MODI LETTER THA;Lo;0;L;;;;;N;;;;; +1161F;MODI LETTER DA;Lo;0;L;;;;;N;;;;; +11620;MODI LETTER DHA;Lo;0;L;;;;;N;;;;; +11621;MODI LETTER NA;Lo;0;L;;;;;N;;;;; +11622;MODI LETTER PA;Lo;0;L;;;;;N;;;;; +11623;MODI LETTER PHA;Lo;0;L;;;;;N;;;;; +11624;MODI LETTER BA;Lo;0;L;;;;;N;;;;; +11625;MODI LETTER BHA;Lo;0;L;;;;;N;;;;; +11626;MODI LETTER MA;Lo;0;L;;;;;N;;;;; +11627;MODI LETTER YA;Lo;0;L;;;;;N;;;;; +11628;MODI LETTER RA;Lo;0;L;;;;;N;;;;; +11629;MODI LETTER LA;Lo;0;L;;;;;N;;;;; +1162A;MODI LETTER VA;Lo;0;L;;;;;N;;;;; +1162B;MODI LETTER SHA;Lo;0;L;;;;;N;;;;; +1162C;MODI LETTER SSA;Lo;0;L;;;;;N;;;;; +1162D;MODI LETTER SA;Lo;0;L;;;;;N;;;;; +1162E;MODI LETTER HA;Lo;0;L;;;;;N;;;;; +1162F;MODI LETTER LLA;Lo;0;L;;;;;N;;;;; +11630;MODI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;; +11631;MODI VOWEL SIGN I;Mc;0;L;;;;;N;;;;; +11632;MODI VOWEL SIGN II;Mc;0;L;;;;;N;;;;; +11633;MODI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;; +11634;MODI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;; +11635;MODI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;; +11636;MODI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;; +11637;MODI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;; +11638;MODI VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;; +11639;MODI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;; +1163A;MODI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;; +1163B;MODI VOWEL SIGN O;Mc;0;L;;;;;N;;;;; +1163C;MODI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;; +1163D;MODI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;; +1163E;MODI SIGN VISARGA;Mc;0;L;;;;;N;;;;; +1163F;MODI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;; +11640;MODI 
SIGN ARDHACANDRA;Mn;0;NSM;;;;;N;;;;; +11641;MODI DANDA;Po;0;L;;;;;N;;;;; +11642;MODI DOUBLE DANDA;Po;0;L;;;;;N;;;;; +11643;MODI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;; +11644;MODI SIGN HUVA;Lo;0;L;;;;;N;;;;; +11650;MODI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +11651;MODI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +11652;MODI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +11653;MODI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +11654;MODI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +11655;MODI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +11656;MODI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +11657;MODI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +11658;MODI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +11659;MODI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 11680;TAKRI LETTER A;Lo;0;L;;;;;N;;;;; 11681;TAKRI LETTER AA;Lo;0;L;;;;;N;;;;; 11682;TAKRI LETTER I;Lo;0;L;;;;;N;;;;; @@ -17775,6 +19308,147 @@ 116C7;TAKRI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; 116C8;TAKRI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; 116C9;TAKRI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +118A0;WARANG CITI CAPITAL LETTER NGAA;Lu;0;L;;;;;N;;;;118C0; +118A1;WARANG CITI CAPITAL LETTER A;Lu;0;L;;;;;N;;;;118C1; +118A2;WARANG CITI CAPITAL LETTER WI;Lu;0;L;;;;;N;;;;118C2; +118A3;WARANG CITI CAPITAL LETTER YU;Lu;0;L;;;;;N;;;;118C3; +118A4;WARANG CITI CAPITAL LETTER YA;Lu;0;L;;;;;N;;;;118C4; +118A5;WARANG CITI CAPITAL LETTER YO;Lu;0;L;;;;;N;;;;118C5; +118A6;WARANG CITI CAPITAL LETTER II;Lu;0;L;;;;;N;;;;118C6; +118A7;WARANG CITI CAPITAL LETTER UU;Lu;0;L;;;;;N;;;;118C7; +118A8;WARANG CITI CAPITAL LETTER E;Lu;0;L;;;;;N;;;;118C8; +118A9;WARANG CITI CAPITAL LETTER O;Lu;0;L;;;;;N;;;;118C9; +118AA;WARANG CITI CAPITAL LETTER ANG;Lu;0;L;;;;;N;;;;118CA; +118AB;WARANG CITI CAPITAL LETTER GA;Lu;0;L;;;;;N;;;;118CB; +118AC;WARANG CITI CAPITAL LETTER KO;Lu;0;L;;;;;N;;;;118CC; +118AD;WARANG CITI CAPITAL LETTER ENY;Lu;0;L;;;;;N;;;;118CD; +118AE;WARANG CITI CAPITAL LETTER YUJ;Lu;0;L;;;;;N;;;;118CE; +118AF;WARANG CITI CAPITAL LETTER UC;Lu;0;L;;;;;N;;;;118CF; +118B0;WARANG CITI CAPITAL LETTER ENN;Lu;0;L;;;;;N;;;;118D0; +118B1;WARANG CITI CAPITAL LETTER 
ODD;Lu;0;L;;;;;N;;;;118D1; +118B2;WARANG CITI CAPITAL LETTER TTE;Lu;0;L;;;;;N;;;;118D2; +118B3;WARANG CITI CAPITAL LETTER NUNG;Lu;0;L;;;;;N;;;;118D3; +118B4;WARANG CITI CAPITAL LETTER DA;Lu;0;L;;;;;N;;;;118D4; +118B5;WARANG CITI CAPITAL LETTER AT;Lu;0;L;;;;;N;;;;118D5; +118B6;WARANG CITI CAPITAL LETTER AM;Lu;0;L;;;;;N;;;;118D6; +118B7;WARANG CITI CAPITAL LETTER BU;Lu;0;L;;;;;N;;;;118D7; +118B8;WARANG CITI CAPITAL LETTER PU;Lu;0;L;;;;;N;;;;118D8; +118B9;WARANG CITI CAPITAL LETTER HIYO;Lu;0;L;;;;;N;;;;118D9; +118BA;WARANG CITI CAPITAL LETTER HOLO;Lu;0;L;;;;;N;;;;118DA; +118BB;WARANG CITI CAPITAL LETTER HORR;Lu;0;L;;;;;N;;;;118DB; +118BC;WARANG CITI CAPITAL LETTER HAR;Lu;0;L;;;;;N;;;;118DC; +118BD;WARANG CITI CAPITAL LETTER SSUU;Lu;0;L;;;;;N;;;;118DD; +118BE;WARANG CITI CAPITAL LETTER SII;Lu;0;L;;;;;N;;;;118DE; +118BF;WARANG CITI CAPITAL LETTER VIYO;Lu;0;L;;;;;N;;;;118DF; +118C0;WARANG CITI SMALL LETTER NGAA;Ll;0;L;;;;;N;;;118A0;;118A0 +118C1;WARANG CITI SMALL LETTER A;Ll;0;L;;;;;N;;;118A1;;118A1 +118C2;WARANG CITI SMALL LETTER WI;Ll;0;L;;;;;N;;;118A2;;118A2 +118C3;WARANG CITI SMALL LETTER YU;Ll;0;L;;;;;N;;;118A3;;118A3 +118C4;WARANG CITI SMALL LETTER YA;Ll;0;L;;;;;N;;;118A4;;118A4 +118C5;WARANG CITI SMALL LETTER YO;Ll;0;L;;;;;N;;;118A5;;118A5 +118C6;WARANG CITI SMALL LETTER II;Ll;0;L;;;;;N;;;118A6;;118A6 +118C7;WARANG CITI SMALL LETTER UU;Ll;0;L;;;;;N;;;118A7;;118A7 +118C8;WARANG CITI SMALL LETTER E;Ll;0;L;;;;;N;;;118A8;;118A8 +118C9;WARANG CITI SMALL LETTER O;Ll;0;L;;;;;N;;;118A9;;118A9 +118CA;WARANG CITI SMALL LETTER ANG;Ll;0;L;;;;;N;;;118AA;;118AA +118CB;WARANG CITI SMALL LETTER GA;Ll;0;L;;;;;N;;;118AB;;118AB +118CC;WARANG CITI SMALL LETTER KO;Ll;0;L;;;;;N;;;118AC;;118AC +118CD;WARANG CITI SMALL LETTER ENY;Ll;0;L;;;;;N;;;118AD;;118AD +118CE;WARANG CITI SMALL LETTER YUJ;Ll;0;L;;;;;N;;;118AE;;118AE +118CF;WARANG CITI SMALL LETTER UC;Ll;0;L;;;;;N;;;118AF;;118AF +118D0;WARANG CITI SMALL LETTER ENN;Ll;0;L;;;;;N;;;118B0;;118B0 +118D1;WARANG CITI SMALL LETTER 
ODD;Ll;0;L;;;;;N;;;118B1;;118B1 +118D2;WARANG CITI SMALL LETTER TTE;Ll;0;L;;;;;N;;;118B2;;118B2 +118D3;WARANG CITI SMALL LETTER NUNG;Ll;0;L;;;;;N;;;118B3;;118B3 +118D4;WARANG CITI SMALL LETTER DA;Ll;0;L;;;;;N;;;118B4;;118B4 +118D5;WARANG CITI SMALL LETTER AT;Ll;0;L;;;;;N;;;118B5;;118B5 +118D6;WARANG CITI SMALL LETTER AM;Ll;0;L;;;;;N;;;118B6;;118B6 +118D7;WARANG CITI SMALL LETTER BU;Ll;0;L;;;;;N;;;118B7;;118B7 +118D8;WARANG CITI SMALL LETTER PU;Ll;0;L;;;;;N;;;118B8;;118B8 +118D9;WARANG CITI SMALL LETTER HIYO;Ll;0;L;;;;;N;;;118B9;;118B9 +118DA;WARANG CITI SMALL LETTER HOLO;Ll;0;L;;;;;N;;;118BA;;118BA +118DB;WARANG CITI SMALL LETTER HORR;Ll;0;L;;;;;N;;;118BB;;118BB +118DC;WARANG CITI SMALL LETTER HAR;Ll;0;L;;;;;N;;;118BC;;118BC +118DD;WARANG CITI SMALL LETTER SSUU;Ll;0;L;;;;;N;;;118BD;;118BD +118DE;WARANG CITI SMALL LETTER SII;Ll;0;L;;;;;N;;;118BE;;118BE +118DF;WARANG CITI SMALL LETTER VIYO;Ll;0;L;;;;;N;;;118BF;;118BF +118E0;WARANG CITI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +118E1;WARANG CITI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +118E2;WARANG CITI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +118E3;WARANG CITI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +118E4;WARANG CITI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +118E5;WARANG CITI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +118E6;WARANG CITI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +118E7;WARANG CITI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +118E8;WARANG CITI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +118E9;WARANG CITI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +118EA;WARANG CITI NUMBER TEN;No;0;L;;;;10;N;;;;; +118EB;WARANG CITI NUMBER TWENTY;No;0;L;;;;20;N;;;;; +118EC;WARANG CITI NUMBER THIRTY;No;0;L;;;;30;N;;;;; +118ED;WARANG CITI NUMBER FORTY;No;0;L;;;;40;N;;;;; +118EE;WARANG CITI NUMBER FIFTY;No;0;L;;;;50;N;;;;; +118EF;WARANG CITI NUMBER SIXTY;No;0;L;;;;60;N;;;;; +118F0;WARANG CITI NUMBER SEVENTY;No;0;L;;;;70;N;;;;; +118F1;WARANG CITI NUMBER EIGHTY;No;0;L;;;;80;N;;;;; +118F2;WARANG CITI NUMBER NINETY;No;0;L;;;;90;N;;;;; +118FF;WARANG CITI OM;Lo;0;L;;;;;N;;;;; +11AC0;PAU CIN HAU LETTER PA;Lo;0;L;;;;;N;;;;; 
+11AC1;PAU CIN HAU LETTER KA;Lo;0;L;;;;;N;;;;; +11AC2;PAU CIN HAU LETTER LA;Lo;0;L;;;;;N;;;;; +11AC3;PAU CIN HAU LETTER MA;Lo;0;L;;;;;N;;;;; +11AC4;PAU CIN HAU LETTER DA;Lo;0;L;;;;;N;;;;; +11AC5;PAU CIN HAU LETTER ZA;Lo;0;L;;;;;N;;;;; +11AC6;PAU CIN HAU LETTER VA;Lo;0;L;;;;;N;;;;; +11AC7;PAU CIN HAU LETTER NGA;Lo;0;L;;;;;N;;;;; +11AC8;PAU CIN HAU LETTER HA;Lo;0;L;;;;;N;;;;; +11AC9;PAU CIN HAU LETTER GA;Lo;0;L;;;;;N;;;;; +11ACA;PAU CIN HAU LETTER KHA;Lo;0;L;;;;;N;;;;; +11ACB;PAU CIN HAU LETTER SA;Lo;0;L;;;;;N;;;;; +11ACC;PAU CIN HAU LETTER BA;Lo;0;L;;;;;N;;;;; +11ACD;PAU CIN HAU LETTER CA;Lo;0;L;;;;;N;;;;; +11ACE;PAU CIN HAU LETTER TA;Lo;0;L;;;;;N;;;;; +11ACF;PAU CIN HAU LETTER THA;Lo;0;L;;;;;N;;;;; +11AD0;PAU CIN HAU LETTER NA;Lo;0;L;;;;;N;;;;; +11AD1;PAU CIN HAU LETTER PHA;Lo;0;L;;;;;N;;;;; +11AD2;PAU CIN HAU LETTER RA;Lo;0;L;;;;;N;;;;; +11AD3;PAU CIN HAU LETTER FA;Lo;0;L;;;;;N;;;;; +11AD4;PAU CIN HAU LETTER CHA;Lo;0;L;;;;;N;;;;; +11AD5;PAU CIN HAU LETTER A;Lo;0;L;;;;;N;;;;; +11AD6;PAU CIN HAU LETTER E;Lo;0;L;;;;;N;;;;; +11AD7;PAU CIN HAU LETTER I;Lo;0;L;;;;;N;;;;; +11AD8;PAU CIN HAU LETTER O;Lo;0;L;;;;;N;;;;; +11AD9;PAU CIN HAU LETTER U;Lo;0;L;;;;;N;;;;; +11ADA;PAU CIN HAU LETTER UA;Lo;0;L;;;;;N;;;;; +11ADB;PAU CIN HAU LETTER IA;Lo;0;L;;;;;N;;;;; +11ADC;PAU CIN HAU LETTER FINAL P;Lo;0;L;;;;;N;;;;; +11ADD;PAU CIN HAU LETTER FINAL K;Lo;0;L;;;;;N;;;;; +11ADE;PAU CIN HAU LETTER FINAL T;Lo;0;L;;;;;N;;;;; +11ADF;PAU CIN HAU LETTER FINAL M;Lo;0;L;;;;;N;;;;; +11AE0;PAU CIN HAU LETTER FINAL N;Lo;0;L;;;;;N;;;;; +11AE1;PAU CIN HAU LETTER FINAL L;Lo;0;L;;;;;N;;;;; +11AE2;PAU CIN HAU LETTER FINAL W;Lo;0;L;;;;;N;;;;; +11AE3;PAU CIN HAU LETTER FINAL NG;Lo;0;L;;;;;N;;;;; +11AE4;PAU CIN HAU LETTER FINAL Y;Lo;0;L;;;;;N;;;;; +11AE5;PAU CIN HAU RISING TONE LONG;Lo;0;L;;;;;N;;;;; +11AE6;PAU CIN HAU RISING TONE;Lo;0;L;;;;;N;;;;; +11AE7;PAU CIN HAU SANDHI GLOTTAL STOP;Lo;0;L;;;;;N;;;;; +11AE8;PAU CIN HAU RISING TONE LONG FINAL;Lo;0;L;;;;;N;;;;; +11AE9;PAU CIN HAU RISING TONE 
FINAL;Lo;0;L;;;;;N;;;;; +11AEA;PAU CIN HAU SANDHI GLOTTAL STOP FINAL;Lo;0;L;;;;;N;;;;; +11AEB;PAU CIN HAU SANDHI TONE LONG;Lo;0;L;;;;;N;;;;; +11AEC;PAU CIN HAU SANDHI TONE;Lo;0;L;;;;;N;;;;; +11AED;PAU CIN HAU SANDHI TONE LONG FINAL;Lo;0;L;;;;;N;;;;; +11AEE;PAU CIN HAU SANDHI TONE FINAL;Lo;0;L;;;;;N;;;;; +11AEF;PAU CIN HAU MID-LEVEL TONE;Lo;0;L;;;;;N;;;;; +11AF0;PAU CIN HAU GLOTTAL STOP VARIANT;Lo;0;L;;;;;N;;;;; +11AF1;PAU CIN HAU MID-LEVEL TONE LONG FINAL;Lo;0;L;;;;;N;;;;; +11AF2;PAU CIN HAU MID-LEVEL TONE FINAL;Lo;0;L;;;;;N;;;;; +11AF3;PAU CIN HAU LOW-FALLING TONE LONG;Lo;0;L;;;;;N;;;;; +11AF4;PAU CIN HAU LOW-FALLING TONE;Lo;0;L;;;;;N;;;;; +11AF5;PAU CIN HAU GLOTTAL STOP;Lo;0;L;;;;;N;;;;; +11AF6;PAU CIN HAU LOW-FALLING TONE LONG FINAL;Lo;0;L;;;;;N;;;;; +11AF7;PAU CIN HAU LOW-FALLING TONE FINAL;Lo;0;L;;;;;N;;;;; +11AF8;PAU CIN HAU GLOTTAL STOP FINAL;Lo;0;L;;;;;N;;;;; 12000;CUNEIFORM SIGN A;Lo;0;L;;;;;N;;;;; 12001;CUNEIFORM SIGN A TIMES A;Lo;0;L;;;;;N;;;;; 12002;CUNEIFORM SIGN A TIMES BAD;Lo;0;L;;;;;N;;;;; @@ -18654,6 +20328,48 @@ 1236C;CUNEIFORM SIGN ZU5 TIMES A;Lo;0;L;;;;;N;;;;; 1236D;CUNEIFORM SIGN ZUBUR;Lo;0;L;;;;;N;;;;; 1236E;CUNEIFORM SIGN ZUM;Lo;0;L;;;;;N;;;;; +1236F;CUNEIFORM SIGN KAP ELAMITE;Lo;0;L;;;;;N;;;;; +12370;CUNEIFORM SIGN AB TIMES NUN;Lo;0;L;;;;;N;;;;; +12371;CUNEIFORM SIGN AB2 TIMES A;Lo;0;L;;;;;N;;;;; +12372;CUNEIFORM SIGN AMAR TIMES KUG;Lo;0;L;;;;;N;;;;; +12373;CUNEIFORM SIGN DAG KISIM5 TIMES U2 PLUS MASH;Lo;0;L;;;;;N;;;;; +12374;CUNEIFORM SIGN DAG3;Lo;0;L;;;;;N;;;;; +12375;CUNEIFORM SIGN DISH PLUS SHU;Lo;0;L;;;;;N;;;;; +12376;CUNEIFORM SIGN DUB TIMES SHE;Lo;0;L;;;;;N;;;;; +12377;CUNEIFORM SIGN EZEN TIMES GUD;Lo;0;L;;;;;N;;;;; +12378;CUNEIFORM SIGN EZEN TIMES SHE;Lo;0;L;;;;;N;;;;; +12379;CUNEIFORM SIGN GA2 TIMES AN PLUS KAK PLUS A;Lo;0;L;;;;;N;;;;; +1237A;CUNEIFORM SIGN GA2 TIMES ASH2;Lo;0;L;;;;;N;;;;; +1237B;CUNEIFORM SIGN GE22;Lo;0;L;;;;;N;;;;; +1237C;CUNEIFORM SIGN GIG;Lo;0;L;;;;;N;;;;; +1237D;CUNEIFORM SIGN HUSH;Lo;0;L;;;;;N;;;;; 
+1237E;CUNEIFORM SIGN KA TIMES ANSHE;Lo;0;L;;;;;N;;;;; +1237F;CUNEIFORM SIGN KA TIMES ASH3;Lo;0;L;;;;;N;;;;; +12380;CUNEIFORM SIGN KA TIMES GISH;Lo;0;L;;;;;N;;;;; +12381;CUNEIFORM SIGN KA TIMES GUD;Lo;0;L;;;;;N;;;;; +12382;CUNEIFORM SIGN KA TIMES HI TIMES ASH2;Lo;0;L;;;;;N;;;;; +12383;CUNEIFORM SIGN KA TIMES LUM;Lo;0;L;;;;;N;;;;; +12384;CUNEIFORM SIGN KA TIMES PA;Lo;0;L;;;;;N;;;;; +12385;CUNEIFORM SIGN KA TIMES SHUL;Lo;0;L;;;;;N;;;;; +12386;CUNEIFORM SIGN KA TIMES TU;Lo;0;L;;;;;N;;;;; +12387;CUNEIFORM SIGN KA TIMES UR2;Lo;0;L;;;;;N;;;;; +12388;CUNEIFORM SIGN LAGAB TIMES GI;Lo;0;L;;;;;N;;;;; +12389;CUNEIFORM SIGN LU2 SHESHIG TIMES BAD;Lo;0;L;;;;;N;;;;; +1238A;CUNEIFORM SIGN LU2 TIMES ESH2 PLUS LAL;Lo;0;L;;;;;N;;;;; +1238B;CUNEIFORM SIGN LU2 TIMES SHU;Lo;0;L;;;;;N;;;;; +1238C;CUNEIFORM SIGN MESH;Lo;0;L;;;;;N;;;;; +1238D;CUNEIFORM SIGN MUSH3 TIMES ZA;Lo;0;L;;;;;N;;;;; +1238E;CUNEIFORM SIGN NA4;Lo;0;L;;;;;N;;;;; +1238F;CUNEIFORM SIGN NIN;Lo;0;L;;;;;N;;;;; +12390;CUNEIFORM SIGN NIN9;Lo;0;L;;;;;N;;;;; +12391;CUNEIFORM SIGN NINDA2 TIMES BAL;Lo;0;L;;;;;N;;;;; +12392;CUNEIFORM SIGN NINDA2 TIMES GI;Lo;0;L;;;;;N;;;;; +12393;CUNEIFORM SIGN NU11 ROTATED NINETY DEGREES;Lo;0;L;;;;;N;;;;; +12394;CUNEIFORM SIGN PESH2 ASTERISK;Lo;0;L;;;;;N;;;;; +12395;CUNEIFORM SIGN PIR2;Lo;0;L;;;;;N;;;;; +12396;CUNEIFORM SIGN SAG TIMES IGI GUNU;Lo;0;L;;;;;N;;;;; +12397;CUNEIFORM SIGN TI2;Lo;0;L;;;;;N;;;;; +12398;CUNEIFORM SIGN UM TIMES ME;Lo;0;L;;;;;N;;;;; 12400;CUNEIFORM NUMERIC SIGN TWO ASH;Nl;0;L;;;;2;N;;;;; 12401;CUNEIFORM NUMERIC SIGN THREE ASH;Nl;0;L;;;;3;N;;;;; 12402;CUNEIFORM NUMERIC SIGN FOUR ASH;Nl;0;L;;;;4;N;;;;; @@ -18740,8 +20456,8 @@ 12453;CUNEIFORM NUMERIC SIGN FOUR BAN2 VARIANT FORM;Nl;0;L;;;;4;N;;;;; 12454;CUNEIFORM NUMERIC SIGN FIVE BAN2;Nl;0;L;;;;5;N;;;;; 12455;CUNEIFORM NUMERIC SIGN FIVE BAN2 VARIANT FORM;Nl;0;L;;;;5;N;;;;; -12456;CUNEIFORM NUMERIC SIGN NIGIDAMIN;Nl;0;L;;;;-1;N;;;;; -12457;CUNEIFORM NUMERIC SIGN NIGIDAESH;Nl;0;L;;;;-1;N;;;;; +12456;CUNEIFORM NUMERIC SIGN 
NIGIDAMIN;Nl;0;L;;;;2;N;;;;; +12457;CUNEIFORM NUMERIC SIGN NIGIDAESH;Nl;0;L;;;;3;N;;;;; 12458;CUNEIFORM NUMERIC SIGN ONE ESHE3;Nl;0;L;;;;1;N;;;;; 12459;CUNEIFORM NUMERIC SIGN TWO ESHE3;Nl;0;L;;;;2;N;;;;; 1245A;CUNEIFORM NUMERIC SIGN ONE THIRD DISH;Nl;0;L;;;;1/3;N;;;;; @@ -18753,10 +20469,23 @@ 12460;CUNEIFORM NUMERIC SIGN ONE QUARTER ASH;Nl;0;L;;;;1/4;N;;;;; 12461;CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE SIXTH;Nl;0;L;;;;1/6;N;;;;; 12462;CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER;Nl;0;L;;;;1/4;N;;;;; +12463;CUNEIFORM NUMERIC SIGN ONE QUARTER GUR;Nl;0;L;;;;1/4;N;;;;; +12464;CUNEIFORM NUMERIC SIGN ONE HALF GUR;Nl;0;L;;;;1/2;N;;;;; +12465;CUNEIFORM NUMERIC SIGN ELAMITE ONE THIRD;Nl;0;L;;;;1/3;N;;;;; +12466;CUNEIFORM NUMERIC SIGN ELAMITE TWO THIRDS;Nl;0;L;;;;2/3;N;;;;; +12467;CUNEIFORM NUMERIC SIGN ELAMITE FORTY;Nl;0;L;;;;40;N;;;;; +12468;CUNEIFORM NUMERIC SIGN ELAMITE FIFTY;Nl;0;L;;;;50;N;;;;; +12469;CUNEIFORM NUMERIC SIGN FOUR U VARIANT FORM;Nl;0;L;;;;4;N;;;;; +1246A;CUNEIFORM NUMERIC SIGN FIVE U VARIANT FORM;Nl;0;L;;;;5;N;;;;; +1246B;CUNEIFORM NUMERIC SIGN SIX U VARIANT FORM;Nl;0;L;;;;6;N;;;;; +1246C;CUNEIFORM NUMERIC SIGN SEVEN U VARIANT FORM;Nl;0;L;;;;7;N;;;;; +1246D;CUNEIFORM NUMERIC SIGN EIGHT U VARIANT FORM;Nl;0;L;;;;8;N;;;;; +1246E;CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM;Nl;0;L;;;;9;N;;;;; 12470;CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER;Po;0;L;;;;;N;;;;; 12471;CUNEIFORM PUNCTUATION SIGN VERTICAL COLON;Po;0;L;;;;;N;;;;; 12472;CUNEIFORM PUNCTUATION SIGN DIAGONAL COLON;Po;0;L;;;;;N;;;;; 12473;CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON;Po;0;L;;;;;N;;;;; +12474;CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON;Po;0;L;;;;;N;;;;; 13000;EGYPTIAN HIEROGLYPH A001;Lo;0;L;;;;;N;;;;; 13001;EGYPTIAN HIEROGLYPH A002;Lo;0;L;;;;;N;;;;; 13002;EGYPTIAN HIEROGLYPH A003;Lo;0;L;;;;;N;;;;; @@ -20397,6 +22126,212 @@ 16A36;BAMUM LETTER PHASE-F KPA;Lo;0;L;;;;;N;;;;; 16A37;BAMUM LETTER PHASE-F SAMBA;Lo;0;L;;;;;N;;;;; 16A38;BAMUM LETTER PHASE-F VUEQ;Lo;0;L;;;;;N;;;;; 
+16A40;MRO LETTER TA;Lo;0;L;;;;;N;;;;; +16A41;MRO LETTER NGI;Lo;0;L;;;;;N;;;;; +16A42;MRO LETTER YO;Lo;0;L;;;;;N;;;;; +16A43;MRO LETTER MIM;Lo;0;L;;;;;N;;;;; +16A44;MRO LETTER BA;Lo;0;L;;;;;N;;;;; +16A45;MRO LETTER DA;Lo;0;L;;;;;N;;;;; +16A46;MRO LETTER A;Lo;0;L;;;;;N;;;;; +16A47;MRO LETTER PHI;Lo;0;L;;;;;N;;;;; +16A48;MRO LETTER KHAI;Lo;0;L;;;;;N;;;;; +16A49;MRO LETTER HAO;Lo;0;L;;;;;N;;;;; +16A4A;MRO LETTER DAI;Lo;0;L;;;;;N;;;;; +16A4B;MRO LETTER CHU;Lo;0;L;;;;;N;;;;; +16A4C;MRO LETTER KEAAE;Lo;0;L;;;;;N;;;;; +16A4D;MRO LETTER OL;Lo;0;L;;;;;N;;;;; +16A4E;MRO LETTER MAEM;Lo;0;L;;;;;N;;;;; +16A4F;MRO LETTER NIN;Lo;0;L;;;;;N;;;;; +16A50;MRO LETTER PA;Lo;0;L;;;;;N;;;;; +16A51;MRO LETTER OO;Lo;0;L;;;;;N;;;;; +16A52;MRO LETTER O;Lo;0;L;;;;;N;;;;; +16A53;MRO LETTER RO;Lo;0;L;;;;;N;;;;; +16A54;MRO LETTER SHI;Lo;0;L;;;;;N;;;;; +16A55;MRO LETTER THEA;Lo;0;L;;;;;N;;;;; +16A56;MRO LETTER EA;Lo;0;L;;;;;N;;;;; +16A57;MRO LETTER WA;Lo;0;L;;;;;N;;;;; +16A58;MRO LETTER E;Lo;0;L;;;;;N;;;;; +16A59;MRO LETTER KO;Lo;0;L;;;;;N;;;;; +16A5A;MRO LETTER LAN;Lo;0;L;;;;;N;;;;; +16A5B;MRO LETTER LA;Lo;0;L;;;;;N;;;;; +16A5C;MRO LETTER HAI;Lo;0;L;;;;;N;;;;; +16A5D;MRO LETTER RI;Lo;0;L;;;;;N;;;;; +16A5E;MRO LETTER TEK;Lo;0;L;;;;;N;;;;; +16A60;MRO DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +16A61;MRO DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +16A62;MRO DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +16A63;MRO DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +16A64;MRO DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +16A65;MRO DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +16A66;MRO DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +16A67;MRO DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +16A68;MRO DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +16A69;MRO DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +16A6E;MRO DANDA;Po;0;L;;;;;N;;;;; +16A6F;MRO DOUBLE DANDA;Po;0;L;;;;;N;;;;; +16AD0;BASSA VAH LETTER ENNI;Lo;0;L;;;;;N;;;;; +16AD1;BASSA VAH LETTER KA;Lo;0;L;;;;;N;;;;; +16AD2;BASSA VAH LETTER SE;Lo;0;L;;;;;N;;;;; +16AD3;BASSA VAH LETTER FA;Lo;0;L;;;;;N;;;;; +16AD4;BASSA VAH LETTER MBE;Lo;0;L;;;;;N;;;;; +16AD5;BASSA VAH LETTER 
YIE;Lo;0;L;;;;;N;;;;; +16AD6;BASSA VAH LETTER GAH;Lo;0;L;;;;;N;;;;; +16AD7;BASSA VAH LETTER DHII;Lo;0;L;;;;;N;;;;; +16AD8;BASSA VAH LETTER KPAH;Lo;0;L;;;;;N;;;;; +16AD9;BASSA VAH LETTER JO;Lo;0;L;;;;;N;;;;; +16ADA;BASSA VAH LETTER HWAH;Lo;0;L;;;;;N;;;;; +16ADB;BASSA VAH LETTER WA;Lo;0;L;;;;;N;;;;; +16ADC;BASSA VAH LETTER ZO;Lo;0;L;;;;;N;;;;; +16ADD;BASSA VAH LETTER GBU;Lo;0;L;;;;;N;;;;; +16ADE;BASSA VAH LETTER DO;Lo;0;L;;;;;N;;;;; +16ADF;BASSA VAH LETTER CE;Lo;0;L;;;;;N;;;;; +16AE0;BASSA VAH LETTER UWU;Lo;0;L;;;;;N;;;;; +16AE1;BASSA VAH LETTER TO;Lo;0;L;;;;;N;;;;; +16AE2;BASSA VAH LETTER BA;Lo;0;L;;;;;N;;;;; +16AE3;BASSA VAH LETTER VU;Lo;0;L;;;;;N;;;;; +16AE4;BASSA VAH LETTER YEIN;Lo;0;L;;;;;N;;;;; +16AE5;BASSA VAH LETTER PA;Lo;0;L;;;;;N;;;;; +16AE6;BASSA VAH LETTER WADDA;Lo;0;L;;;;;N;;;;; +16AE7;BASSA VAH LETTER A;Lo;0;L;;;;;N;;;;; +16AE8;BASSA VAH LETTER O;Lo;0;L;;;;;N;;;;; +16AE9;BASSA VAH LETTER OO;Lo;0;L;;;;;N;;;;; +16AEA;BASSA VAH LETTER U;Lo;0;L;;;;;N;;;;; +16AEB;BASSA VAH LETTER EE;Lo;0;L;;;;;N;;;;; +16AEC;BASSA VAH LETTER E;Lo;0;L;;;;;N;;;;; +16AED;BASSA VAH LETTER I;Lo;0;L;;;;;N;;;;; +16AF0;BASSA VAH COMBINING HIGH TONE;Mn;1;NSM;;;;;N;;;;; +16AF1;BASSA VAH COMBINING LOW TONE;Mn;1;NSM;;;;;N;;;;; +16AF2;BASSA VAH COMBINING MID TONE;Mn;1;NSM;;;;;N;;;;; +16AF3;BASSA VAH COMBINING LOW-MID TONE;Mn;1;NSM;;;;;N;;;;; +16AF4;BASSA VAH COMBINING HIGH-LOW TONE;Mn;1;NSM;;;;;N;;;;; +16AF5;BASSA VAH FULL STOP;Po;0;L;;;;;N;;;;; +16B00;PAHAWH HMONG VOWEL KEEB;Lo;0;L;;;;;N;;;;; +16B01;PAHAWH HMONG VOWEL KEEV;Lo;0;L;;;;;N;;;;; +16B02;PAHAWH HMONG VOWEL KIB;Lo;0;L;;;;;N;;;;; +16B03;PAHAWH HMONG VOWEL KIV;Lo;0;L;;;;;N;;;;; +16B04;PAHAWH HMONG VOWEL KAUB;Lo;0;L;;;;;N;;;;; +16B05;PAHAWH HMONG VOWEL KAUV;Lo;0;L;;;;;N;;;;; +16B06;PAHAWH HMONG VOWEL KUB;Lo;0;L;;;;;N;;;;; +16B07;PAHAWH HMONG VOWEL KUV;Lo;0;L;;;;;N;;;;; +16B08;PAHAWH HMONG VOWEL KEB;Lo;0;L;;;;;N;;;;; +16B09;PAHAWH HMONG VOWEL KEV;Lo;0;L;;;;;N;;;;; +16B0A;PAHAWH HMONG VOWEL KAIB;Lo;0;L;;;;;N;;;;; +16B0B;PAHAWH HMONG 
VOWEL KAIV;Lo;0;L;;;;;N;;;;; +16B0C;PAHAWH HMONG VOWEL KOOB;Lo;0;L;;;;;N;;;;; +16B0D;PAHAWH HMONG VOWEL KOOV;Lo;0;L;;;;;N;;;;; +16B0E;PAHAWH HMONG VOWEL KAWB;Lo;0;L;;;;;N;;;;; +16B0F;PAHAWH HMONG VOWEL KAWV;Lo;0;L;;;;;N;;;;; +16B10;PAHAWH HMONG VOWEL KUAB;Lo;0;L;;;;;N;;;;; +16B11;PAHAWH HMONG VOWEL KUAV;Lo;0;L;;;;;N;;;;; +16B12;PAHAWH HMONG VOWEL KOB;Lo;0;L;;;;;N;;;;; +16B13;PAHAWH HMONG VOWEL KOV;Lo;0;L;;;;;N;;;;; +16B14;PAHAWH HMONG VOWEL KIAB;Lo;0;L;;;;;N;;;;; +16B15;PAHAWH HMONG VOWEL KIAV;Lo;0;L;;;;;N;;;;; +16B16;PAHAWH HMONG VOWEL KAB;Lo;0;L;;;;;N;;;;; +16B17;PAHAWH HMONG VOWEL KAV;Lo;0;L;;;;;N;;;;; +16B18;PAHAWH HMONG VOWEL KWB;Lo;0;L;;;;;N;;;;; +16B19;PAHAWH HMONG VOWEL KWV;Lo;0;L;;;;;N;;;;; +16B1A;PAHAWH HMONG VOWEL KAAB;Lo;0;L;;;;;N;;;;; +16B1B;PAHAWH HMONG VOWEL KAAV;Lo;0;L;;;;;N;;;;; +16B1C;PAHAWH HMONG CONSONANT VAU;Lo;0;L;;;;;N;;;;; +16B1D;PAHAWH HMONG CONSONANT NTSAU;Lo;0;L;;;;;N;;;;; +16B1E;PAHAWH HMONG CONSONANT LAU;Lo;0;L;;;;;N;;;;; +16B1F;PAHAWH HMONG CONSONANT HAU;Lo;0;L;;;;;N;;;;; +16B20;PAHAWH HMONG CONSONANT NLAU;Lo;0;L;;;;;N;;;;; +16B21;PAHAWH HMONG CONSONANT RAU;Lo;0;L;;;;;N;;;;; +16B22;PAHAWH HMONG CONSONANT NKAU;Lo;0;L;;;;;N;;;;; +16B23;PAHAWH HMONG CONSONANT QHAU;Lo;0;L;;;;;N;;;;; +16B24;PAHAWH HMONG CONSONANT YAU;Lo;0;L;;;;;N;;;;; +16B25;PAHAWH HMONG CONSONANT HLAU;Lo;0;L;;;;;N;;;;; +16B26;PAHAWH HMONG CONSONANT MAU;Lo;0;L;;;;;N;;;;; +16B27;PAHAWH HMONG CONSONANT CHAU;Lo;0;L;;;;;N;;;;; +16B28;PAHAWH HMONG CONSONANT NCHAU;Lo;0;L;;;;;N;;;;; +16B29;PAHAWH HMONG CONSONANT HNAU;Lo;0;L;;;;;N;;;;; +16B2A;PAHAWH HMONG CONSONANT PLHAU;Lo;0;L;;;;;N;;;;; +16B2B;PAHAWH HMONG CONSONANT NTHAU;Lo;0;L;;;;;N;;;;; +16B2C;PAHAWH HMONG CONSONANT NAU;Lo;0;L;;;;;N;;;;; +16B2D;PAHAWH HMONG CONSONANT AU;Lo;0;L;;;;;N;;;;; +16B2E;PAHAWH HMONG CONSONANT XAU;Lo;0;L;;;;;N;;;;; +16B2F;PAHAWH HMONG CONSONANT CAU;Lo;0;L;;;;;N;;;;; +16B30;PAHAWH HMONG MARK CIM TUB;Mn;230;NSM;;;;;N;;;;; +16B31;PAHAWH HMONG MARK CIM SO;Mn;230;NSM;;;;;N;;;;; +16B32;PAHAWH HMONG MARK CIM 
KES;Mn;230;NSM;;;;;N;;;;; +16B33;PAHAWH HMONG MARK CIM KHAV;Mn;230;NSM;;;;;N;;;;; +16B34;PAHAWH HMONG MARK CIM SUAM;Mn;230;NSM;;;;;N;;;;; +16B35;PAHAWH HMONG MARK CIM HOM;Mn;230;NSM;;;;;N;;;;; +16B36;PAHAWH HMONG MARK CIM TAUM;Mn;230;NSM;;;;;N;;;;; +16B37;PAHAWH HMONG SIGN VOS THOM;Po;0;L;;;;;N;;;;; +16B38;PAHAWH HMONG SIGN VOS TSHAB CEEB;Po;0;L;;;;;N;;;;; +16B39;PAHAWH HMONG SIGN CIM CHEEM;Po;0;L;;;;;N;;;;; +16B3A;PAHAWH HMONG SIGN VOS THIAB;Po;0;L;;;;;N;;;;; +16B3B;PAHAWH HMONG SIGN VOS FEEM;Po;0;L;;;;;N;;;;; +16B3C;PAHAWH HMONG SIGN XYEEM NTXIV;So;0;L;;;;;N;;;;; +16B3D;PAHAWH HMONG SIGN XYEEM RHO;So;0;L;;;;;N;;;;; +16B3E;PAHAWH HMONG SIGN XYEEM TOV;So;0;L;;;;;N;;;;; +16B3F;PAHAWH HMONG SIGN XYEEM FAIB;So;0;L;;;;;N;;;;; +16B40;PAHAWH HMONG SIGN VOS SEEV;Lm;0;L;;;;;N;;;;; +16B41;PAHAWH HMONG SIGN MEEJ SUAB;Lm;0;L;;;;;N;;;;; +16B42;PAHAWH HMONG SIGN VOS NRUA;Lm;0;L;;;;;N;;;;; +16B43;PAHAWH HMONG SIGN IB YAM;Lm;0;L;;;;;N;;;;; +16B44;PAHAWH HMONG SIGN XAUS;Po;0;L;;;;;N;;;;; +16B45;PAHAWH HMONG SIGN CIM TSOV ROG;So;0;L;;;;;N;;;;; +16B50;PAHAWH HMONG DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;; +16B51;PAHAWH HMONG DIGIT ONE;Nd;0;L;;1;1;1;N;;;;; +16B52;PAHAWH HMONG DIGIT TWO;Nd;0;L;;2;2;2;N;;;;; +16B53;PAHAWH HMONG DIGIT THREE;Nd;0;L;;3;3;3;N;;;;; +16B54;PAHAWH HMONG DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;; +16B55;PAHAWH HMONG DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;; +16B56;PAHAWH HMONG DIGIT SIX;Nd;0;L;;6;6;6;N;;;;; +16B57;PAHAWH HMONG DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;; +16B58;PAHAWH HMONG DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;; +16B59;PAHAWH HMONG DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +16B5B;PAHAWH HMONG NUMBER TENS;No;0;L;;;;10;N;;;;; +16B5C;PAHAWH HMONG NUMBER HUNDREDS;No;0;L;;;;100;N;;;;; +16B5D;PAHAWH HMONG NUMBER TEN THOUSANDS;No;0;L;;;;10000;N;;;;; +16B5E;PAHAWH HMONG NUMBER MILLIONS;No;0;L;;;;1000000;N;;;;; +16B5F;PAHAWH HMONG NUMBER HUNDRED MILLIONS;No;0;L;;;;100000000;N;;;;; +16B60;PAHAWH HMONG NUMBER TEN BILLIONS;No;0;L;;;;10000000000;N;;;;; +16B61;PAHAWH HMONG NUMBER 
TRILLIONS;No;0;L;;;;1000000000000;N;;;;; +16B63;PAHAWH HMONG SIGN VOS LUB;Lo;0;L;;;;;N;;;;; +16B64;PAHAWH HMONG SIGN XYOO;Lo;0;L;;;;;N;;;;; +16B65;PAHAWH HMONG SIGN HLI;Lo;0;L;;;;;N;;;;; +16B66;PAHAWH HMONG SIGN THIRD-STAGE HLI;Lo;0;L;;;;;N;;;;; +16B67;PAHAWH HMONG SIGN ZWJ THAJ;Lo;0;L;;;;;N;;;;; +16B68;PAHAWH HMONG SIGN HNUB;Lo;0;L;;;;;N;;;;; +16B69;PAHAWH HMONG SIGN NQIG;Lo;0;L;;;;;N;;;;; +16B6A;PAHAWH HMONG SIGN XIAB;Lo;0;L;;;;;N;;;;; +16B6B;PAHAWH HMONG SIGN NTUJ;Lo;0;L;;;;;N;;;;; +16B6C;PAHAWH HMONG SIGN AV;Lo;0;L;;;;;N;;;;; +16B6D;PAHAWH HMONG SIGN TXHEEJ CEEV;Lo;0;L;;;;;N;;;;; +16B6E;PAHAWH HMONG SIGN MEEJ TSEEB;Lo;0;L;;;;;N;;;;; +16B6F;PAHAWH HMONG SIGN TAU;Lo;0;L;;;;;N;;;;; +16B70;PAHAWH HMONG SIGN LOS;Lo;0;L;;;;;N;;;;; +16B71;PAHAWH HMONG SIGN MUS;Lo;0;L;;;;;N;;;;; +16B72;PAHAWH HMONG SIGN CIM HAIS LUS NTOG NTOG;Lo;0;L;;;;;N;;;;; +16B73;PAHAWH HMONG SIGN CIM CUAM TSHOOJ;Lo;0;L;;;;;N;;;;; +16B74;PAHAWH HMONG SIGN CIM TXWV;Lo;0;L;;;;;N;;;;; +16B75;PAHAWH HMONG SIGN CIM TXWV CHWV;Lo;0;L;;;;;N;;;;; +16B76;PAHAWH HMONG SIGN CIM PUB DAWB;Lo;0;L;;;;;N;;;;; +16B77;PAHAWH HMONG SIGN CIM NRES TOS;Lo;0;L;;;;;N;;;;; +16B7D;PAHAWH HMONG CLAN SIGN TSHEEJ;Lo;0;L;;;;;N;;;;; +16B7E;PAHAWH HMONG CLAN SIGN YEEG;Lo;0;L;;;;;N;;;;; +16B7F;PAHAWH HMONG CLAN SIGN LIS;Lo;0;L;;;;;N;;;;; +16B80;PAHAWH HMONG CLAN SIGN LAUJ;Lo;0;L;;;;;N;;;;; +16B81;PAHAWH HMONG CLAN SIGN XYOOJ;Lo;0;L;;;;;N;;;;; +16B82;PAHAWH HMONG CLAN SIGN KOO;Lo;0;L;;;;;N;;;;; +16B83;PAHAWH HMONG CLAN SIGN HAWJ;Lo;0;L;;;;;N;;;;; +16B84;PAHAWH HMONG CLAN SIGN MUAS;Lo;0;L;;;;;N;;;;; +16B85;PAHAWH HMONG CLAN SIGN THOJ;Lo;0;L;;;;;N;;;;; +16B86;PAHAWH HMONG CLAN SIGN TSAB;Lo;0;L;;;;;N;;;;; +16B87;PAHAWH HMONG CLAN SIGN PHAB;Lo;0;L;;;;;N;;;;; +16B88;PAHAWH HMONG CLAN SIGN KHAB;Lo;0;L;;;;;N;;;;; +16B89;PAHAWH HMONG CLAN SIGN HAM;Lo;0;L;;;;;N;;;;; +16B8A;PAHAWH HMONG CLAN SIGN VAJ;Lo;0;L;;;;;N;;;;; +16B8B;PAHAWH HMONG CLAN SIGN FAJ;Lo;0;L;;;;;N;;;;; +16B8C;PAHAWH HMONG CLAN SIGN YAJ;Lo;0;L;;;;;N;;;;; +16B8D;PAHAWH HMONG 
CLAN SIGN TSWB;Lo;0;L;;;;;N;;;;; +16B8E;PAHAWH HMONG CLAN SIGN KWM;Lo;0;L;;;;;N;;;;; +16B8F;PAHAWH HMONG CLAN SIGN VWJ;Lo;0;L;;;;;N;;;;; 16F00;MIAO LETTER PA;Lo;0;L;;;;;N;;;;; 16F01;MIAO LETTER BA;Lo;0;L;;;;;N;;;;; 16F02;MIAO LETTER YI PA;Lo;0;L;;;;;N;;;;; @@ -20532,6 +22467,153 @@ 16F9F;MIAO LETTER REFORMED TONE-8;Lm;0;L;;;;;N;;;;; 1B000;KATAKANA LETTER ARCHAIC E;Lo;0;L;;;;;N;;;;; 1B001;HIRAGANA LETTER ARCHAIC YE;Lo;0;L;;;;;N;;;;; +1BC00;DUPLOYAN LETTER H;Lo;0;L;;;;;N;;;;; +1BC01;DUPLOYAN LETTER X;Lo;0;L;;;;;N;;;;; +1BC02;DUPLOYAN LETTER P;Lo;0;L;;;;;N;;;;; +1BC03;DUPLOYAN LETTER T;Lo;0;L;;;;;N;;;;; +1BC04;DUPLOYAN LETTER F;Lo;0;L;;;;;N;;;;; +1BC05;DUPLOYAN LETTER K;Lo;0;L;;;;;N;;;;; +1BC06;DUPLOYAN LETTER L;Lo;0;L;;;;;N;;;;; +1BC07;DUPLOYAN LETTER B;Lo;0;L;;;;;N;;;;; +1BC08;DUPLOYAN LETTER D;Lo;0;L;;;;;N;;;;; +1BC09;DUPLOYAN LETTER V;Lo;0;L;;;;;N;;;;; +1BC0A;DUPLOYAN LETTER G;Lo;0;L;;;;;N;;;;; +1BC0B;DUPLOYAN LETTER R;Lo;0;L;;;;;N;;;;; +1BC0C;DUPLOYAN LETTER P N;Lo;0;L;;;;;N;;;;; +1BC0D;DUPLOYAN LETTER D S;Lo;0;L;;;;;N;;;;; +1BC0E;DUPLOYAN LETTER F N;Lo;0;L;;;;;N;;;;; +1BC0F;DUPLOYAN LETTER K M;Lo;0;L;;;;;N;;;;; +1BC10;DUPLOYAN LETTER R S;Lo;0;L;;;;;N;;;;; +1BC11;DUPLOYAN LETTER TH;Lo;0;L;;;;;N;;;;; +1BC12;DUPLOYAN LETTER SLOAN DH;Lo;0;L;;;;;N;;;;; +1BC13;DUPLOYAN LETTER DH;Lo;0;L;;;;;N;;;;; +1BC14;DUPLOYAN LETTER KK;Lo;0;L;;;;;N;;;;; +1BC15;DUPLOYAN LETTER SLOAN J;Lo;0;L;;;;;N;;;;; +1BC16;DUPLOYAN LETTER HL;Lo;0;L;;;;;N;;;;; +1BC17;DUPLOYAN LETTER LH;Lo;0;L;;;;;N;;;;; +1BC18;DUPLOYAN LETTER RH;Lo;0;L;;;;;N;;;;; +1BC19;DUPLOYAN LETTER M;Lo;0;L;;;;;N;;;;; +1BC1A;DUPLOYAN LETTER N;Lo;0;L;;;;;N;;;;; +1BC1B;DUPLOYAN LETTER J;Lo;0;L;;;;;N;;;;; +1BC1C;DUPLOYAN LETTER S;Lo;0;L;;;;;N;;;;; +1BC1D;DUPLOYAN LETTER M N;Lo;0;L;;;;;N;;;;; +1BC1E;DUPLOYAN LETTER N M;Lo;0;L;;;;;N;;;;; +1BC1F;DUPLOYAN LETTER J M;Lo;0;L;;;;;N;;;;; +1BC20;DUPLOYAN LETTER S J;Lo;0;L;;;;;N;;;;; +1BC21;DUPLOYAN LETTER M WITH DOT;Lo;0;L;;;;;N;;;;; +1BC22;DUPLOYAN LETTER N WITH DOT;Lo;0;L;;;;;N;;;;; 
+1BC23;DUPLOYAN LETTER J WITH DOT;Lo;0;L;;;;;N;;;;; +1BC24;DUPLOYAN LETTER J WITH DOTS INSIDE AND ABOVE;Lo;0;L;;;;;N;;;;; +1BC25;DUPLOYAN LETTER S WITH DOT;Lo;0;L;;;;;N;;;;; +1BC26;DUPLOYAN LETTER S WITH DOT BELOW;Lo;0;L;;;;;N;;;;; +1BC27;DUPLOYAN LETTER M S;Lo;0;L;;;;;N;;;;; +1BC28;DUPLOYAN LETTER N S;Lo;0;L;;;;;N;;;;; +1BC29;DUPLOYAN LETTER J S;Lo;0;L;;;;;N;;;;; +1BC2A;DUPLOYAN LETTER S S;Lo;0;L;;;;;N;;;;; +1BC2B;DUPLOYAN LETTER M N S;Lo;0;L;;;;;N;;;;; +1BC2C;DUPLOYAN LETTER N M S;Lo;0;L;;;;;N;;;;; +1BC2D;DUPLOYAN LETTER J M S;Lo;0;L;;;;;N;;;;; +1BC2E;DUPLOYAN LETTER S J S;Lo;0;L;;;;;N;;;;; +1BC2F;DUPLOYAN LETTER J S WITH DOT;Lo;0;L;;;;;N;;;;; +1BC30;DUPLOYAN LETTER J N;Lo;0;L;;;;;N;;;;; +1BC31;DUPLOYAN LETTER J N S;Lo;0;L;;;;;N;;;;; +1BC32;DUPLOYAN LETTER S T;Lo;0;L;;;;;N;;;;; +1BC33;DUPLOYAN LETTER S T R;Lo;0;L;;;;;N;;;;; +1BC34;DUPLOYAN LETTER S P;Lo;0;L;;;;;N;;;;; +1BC35;DUPLOYAN LETTER S P R;Lo;0;L;;;;;N;;;;; +1BC36;DUPLOYAN LETTER T S;Lo;0;L;;;;;N;;;;; +1BC37;DUPLOYAN LETTER T R S;Lo;0;L;;;;;N;;;;; +1BC38;DUPLOYAN LETTER W;Lo;0;L;;;;;N;;;;; +1BC39;DUPLOYAN LETTER WH;Lo;0;L;;;;;N;;;;; +1BC3A;DUPLOYAN LETTER W R;Lo;0;L;;;;;N;;;;; +1BC3B;DUPLOYAN LETTER S N;Lo;0;L;;;;;N;;;;; +1BC3C;DUPLOYAN LETTER S M;Lo;0;L;;;;;N;;;;; +1BC3D;DUPLOYAN LETTER K R S;Lo;0;L;;;;;N;;;;; +1BC3E;DUPLOYAN LETTER G R S;Lo;0;L;;;;;N;;;;; +1BC3F;DUPLOYAN LETTER S K;Lo;0;L;;;;;N;;;;; +1BC40;DUPLOYAN LETTER S K R;Lo;0;L;;;;;N;;;;; +1BC41;DUPLOYAN LETTER A;Lo;0;L;;;;;N;;;;; +1BC42;DUPLOYAN LETTER SLOAN OW;Lo;0;L;;;;;N;;;;; +1BC43;DUPLOYAN LETTER OA;Lo;0;L;;;;;N;;;;; +1BC44;DUPLOYAN LETTER O;Lo;0;L;;;;;N;;;;; +1BC45;DUPLOYAN LETTER AOU;Lo;0;L;;;;;N;;;;; +1BC46;DUPLOYAN LETTER I;Lo;0;L;;;;;N;;;;; +1BC47;DUPLOYAN LETTER E;Lo;0;L;;;;;N;;;;; +1BC48;DUPLOYAN LETTER IE;Lo;0;L;;;;;N;;;;; +1BC49;DUPLOYAN LETTER SHORT I;Lo;0;L;;;;;N;;;;; +1BC4A;DUPLOYAN LETTER UI;Lo;0;L;;;;;N;;;;; +1BC4B;DUPLOYAN LETTER EE;Lo;0;L;;;;;N;;;;; +1BC4C;DUPLOYAN LETTER SLOAN EH;Lo;0;L;;;;;N;;;;; +1BC4D;DUPLOYAN LETTER 
ROMANIAN I;Lo;0;L;;;;;N;;;;; +1BC4E;DUPLOYAN LETTER SLOAN EE;Lo;0;L;;;;;N;;;;; +1BC4F;DUPLOYAN LETTER LONG I;Lo;0;L;;;;;N;;;;; +1BC50;DUPLOYAN LETTER YE;Lo;0;L;;;;;N;;;;; +1BC51;DUPLOYAN LETTER U;Lo;0;L;;;;;N;;;;; +1BC52;DUPLOYAN LETTER EU;Lo;0;L;;;;;N;;;;; +1BC53;DUPLOYAN LETTER XW;Lo;0;L;;;;;N;;;;; +1BC54;DUPLOYAN LETTER U N;Lo;0;L;;;;;N;;;;; +1BC55;DUPLOYAN LETTER LONG U;Lo;0;L;;;;;N;;;;; +1BC56;DUPLOYAN LETTER ROMANIAN U;Lo;0;L;;;;;N;;;;; +1BC57;DUPLOYAN LETTER UH;Lo;0;L;;;;;N;;;;; +1BC58;DUPLOYAN LETTER SLOAN U;Lo;0;L;;;;;N;;;;; +1BC59;DUPLOYAN LETTER OOH;Lo;0;L;;;;;N;;;;; +1BC5A;DUPLOYAN LETTER OW;Lo;0;L;;;;;N;;;;; +1BC5B;DUPLOYAN LETTER OU;Lo;0;L;;;;;N;;;;; +1BC5C;DUPLOYAN LETTER WA;Lo;0;L;;;;;N;;;;; +1BC5D;DUPLOYAN LETTER WO;Lo;0;L;;;;;N;;;;; +1BC5E;DUPLOYAN LETTER WI;Lo;0;L;;;;;N;;;;; +1BC5F;DUPLOYAN LETTER WEI;Lo;0;L;;;;;N;;;;; +1BC60;DUPLOYAN LETTER WOW;Lo;0;L;;;;;N;;;;; +1BC61;DUPLOYAN LETTER NASAL U;Lo;0;L;;;;;N;;;;; +1BC62;DUPLOYAN LETTER NASAL O;Lo;0;L;;;;;N;;;;; +1BC63;DUPLOYAN LETTER NASAL I;Lo;0;L;;;;;N;;;;; +1BC64;DUPLOYAN LETTER NASAL A;Lo;0;L;;;;;N;;;;; +1BC65;DUPLOYAN LETTER PERNIN AN;Lo;0;L;;;;;N;;;;; +1BC66;DUPLOYAN LETTER PERNIN AM;Lo;0;L;;;;;N;;;;; +1BC67;DUPLOYAN LETTER SLOAN EN;Lo;0;L;;;;;N;;;;; +1BC68;DUPLOYAN LETTER SLOAN AN;Lo;0;L;;;;;N;;;;; +1BC69;DUPLOYAN LETTER SLOAN ON;Lo;0;L;;;;;N;;;;; +1BC6A;DUPLOYAN LETTER VOCALIC M;Lo;0;L;;;;;N;;;;; +1BC70;DUPLOYAN AFFIX LEFT HORIZONTAL SECANT;Lo;0;L;;;;;N;;;;; +1BC71;DUPLOYAN AFFIX MID HORIZONTAL SECANT;Lo;0;L;;;;;N;;;;; +1BC72;DUPLOYAN AFFIX RIGHT HORIZONTAL SECANT;Lo;0;L;;;;;N;;;;; +1BC73;DUPLOYAN AFFIX LOW VERTICAL SECANT;Lo;0;L;;;;;N;;;;; +1BC74;DUPLOYAN AFFIX MID VERTICAL SECANT;Lo;0;L;;;;;N;;;;; +1BC75;DUPLOYAN AFFIX HIGH VERTICAL SECANT;Lo;0;L;;;;;N;;;;; +1BC76;DUPLOYAN AFFIX ATTACHED SECANT;Lo;0;L;;;;;N;;;;; +1BC77;DUPLOYAN AFFIX ATTACHED LEFT-TO-RIGHT SECANT;Lo;0;L;;;;;N;;;;; +1BC78;DUPLOYAN AFFIX ATTACHED TANGENT;Lo;0;L;;;;;N;;;;; +1BC79;DUPLOYAN AFFIX ATTACHED 
TAIL;Lo;0;L;;;;;N;;;;; +1BC7A;DUPLOYAN AFFIX ATTACHED E HOOK;Lo;0;L;;;;;N;;;;; +1BC7B;DUPLOYAN AFFIX ATTACHED I HOOK;Lo;0;L;;;;;N;;;;; +1BC7C;DUPLOYAN AFFIX ATTACHED TANGENT HOOK;Lo;0;L;;;;;N;;;;; +1BC80;DUPLOYAN AFFIX HIGH ACUTE;Lo;0;L;;;;;N;;;;; +1BC81;DUPLOYAN AFFIX HIGH TIGHT ACUTE;Lo;0;L;;;;;N;;;;; +1BC82;DUPLOYAN AFFIX HIGH GRAVE;Lo;0;L;;;;;N;;;;; +1BC83;DUPLOYAN AFFIX HIGH LONG GRAVE;Lo;0;L;;;;;N;;;;; +1BC84;DUPLOYAN AFFIX HIGH DOT;Lo;0;L;;;;;N;;;;; +1BC85;DUPLOYAN AFFIX HIGH CIRCLE;Lo;0;L;;;;;N;;;;; +1BC86;DUPLOYAN AFFIX HIGH LINE;Lo;0;L;;;;;N;;;;; +1BC87;DUPLOYAN AFFIX HIGH WAVE;Lo;0;L;;;;;N;;;;; +1BC88;DUPLOYAN AFFIX HIGH VERTICAL;Lo;0;L;;;;;N;;;;; +1BC90;DUPLOYAN AFFIX LOW ACUTE;Lo;0;L;;;;;N;;;;; +1BC91;DUPLOYAN AFFIX LOW TIGHT ACUTE;Lo;0;L;;;;;N;;;;; +1BC92;DUPLOYAN AFFIX LOW GRAVE;Lo;0;L;;;;;N;;;;; +1BC93;DUPLOYAN AFFIX LOW LONG GRAVE;Lo;0;L;;;;;N;;;;; +1BC94;DUPLOYAN AFFIX LOW DOT;Lo;0;L;;;;;N;;;;; +1BC95;DUPLOYAN AFFIX LOW CIRCLE;Lo;0;L;;;;;N;;;;; +1BC96;DUPLOYAN AFFIX LOW LINE;Lo;0;L;;;;;N;;;;; +1BC97;DUPLOYAN AFFIX LOW WAVE;Lo;0;L;;;;;N;;;;; +1BC98;DUPLOYAN AFFIX LOW VERTICAL;Lo;0;L;;;;;N;;;;; +1BC99;DUPLOYAN AFFIX LOW ARROW;Lo;0;L;;;;;N;;;;; +1BC9C;DUPLOYAN SIGN O WITH CROSS;So;0;L;;;;;N;;;;; +1BC9D;DUPLOYAN THICK LETTER SELECTOR;Mn;0;NSM;;;;;N;;;;; +1BC9E;DUPLOYAN DOUBLE MARK;Mn;1;NSM;;;;;N;;;;; +1BC9F;DUPLOYAN PUNCTUATION CHINOOK FULL STOP;Po;0;L;;;;;N;;;;; +1BCA0;SHORTHAND FORMAT LETTER OVERLAP;Cf;0;BN;;;;;N;;;;; +1BCA1;SHORTHAND FORMAT CONTINUING OVERLAP;Cf;0;BN;;;;;N;;;;; +1BCA2;SHORTHAND FORMAT DOWN STEP;Cf;0;BN;;;;;N;;;;; +1BCA3;SHORTHAND FORMAT UP STEP;Cf;0;BN;;;;;N;;;;; 1D000;BYZANTINE MUSICAL SYMBOL PSILI;So;0;L;;;;;N;;;;; 1D001;BYZANTINE MUSICAL SYMBOL DASEIA;So;0;L;;;;;N;;;;; 1D002;BYZANTINE MUSICAL SYMBOL PERISPOMENI;So;0;L;;;;;N;;;;; @@ -22169,6 +24251,219 @@ 1D7FD;MATHEMATICAL MONOSPACE DIGIT SEVEN;Nd;0;EN;<font> 0037;7;7;7;N;;;;; 1D7FE;MATHEMATICAL MONOSPACE DIGIT EIGHT;Nd;0;EN;<font> 0038;8;8;8;N;;;;; 1D7FF;MATHEMATICAL MONOSPACE 
DIGIT NINE;Nd;0;EN;<font> 0039;9;9;9;N;;;;; +1E800;MENDE KIKAKUI SYLLABLE M001 KI;Lo;0;R;;;;;N;;;;; +1E801;MENDE KIKAKUI SYLLABLE M002 KA;Lo;0;R;;;;;N;;;;; +1E802;MENDE KIKAKUI SYLLABLE M003 KU;Lo;0;R;;;;;N;;;;; +1E803;MENDE KIKAKUI SYLLABLE M065 KEE;Lo;0;R;;;;;N;;;;; +1E804;MENDE KIKAKUI SYLLABLE M095 KE;Lo;0;R;;;;;N;;;;; +1E805;MENDE KIKAKUI SYLLABLE M076 KOO;Lo;0;R;;;;;N;;;;; +1E806;MENDE KIKAKUI SYLLABLE M048 KO;Lo;0;R;;;;;N;;;;; +1E807;MENDE KIKAKUI SYLLABLE M179 KUA;Lo;0;R;;;;;N;;;;; +1E808;MENDE KIKAKUI SYLLABLE M004 WI;Lo;0;R;;;;;N;;;;; +1E809;MENDE KIKAKUI SYLLABLE M005 WA;Lo;0;R;;;;;N;;;;; +1E80A;MENDE KIKAKUI SYLLABLE M006 WU;Lo;0;R;;;;;N;;;;; +1E80B;MENDE KIKAKUI SYLLABLE M126 WEE;Lo;0;R;;;;;N;;;;; +1E80C;MENDE KIKAKUI SYLLABLE M118 WE;Lo;0;R;;;;;N;;;;; +1E80D;MENDE KIKAKUI SYLLABLE M114 WOO;Lo;0;R;;;;;N;;;;; +1E80E;MENDE KIKAKUI SYLLABLE M045 WO;Lo;0;R;;;;;N;;;;; +1E80F;MENDE KIKAKUI SYLLABLE M194 WUI;Lo;0;R;;;;;N;;;;; +1E810;MENDE KIKAKUI SYLLABLE M143 WEI;Lo;0;R;;;;;N;;;;; +1E811;MENDE KIKAKUI SYLLABLE M061 WVI;Lo;0;R;;;;;N;;;;; +1E812;MENDE KIKAKUI SYLLABLE M049 WVA;Lo;0;R;;;;;N;;;;; +1E813;MENDE KIKAKUI SYLLABLE M139 WVE;Lo;0;R;;;;;N;;;;; +1E814;MENDE KIKAKUI SYLLABLE M007 MIN;Lo;0;R;;;;;N;;;;; +1E815;MENDE KIKAKUI SYLLABLE M008 MAN;Lo;0;R;;;;;N;;;;; +1E816;MENDE KIKAKUI SYLLABLE M009 MUN;Lo;0;R;;;;;N;;;;; +1E817;MENDE KIKAKUI SYLLABLE M059 MEN;Lo;0;R;;;;;N;;;;; +1E818;MENDE KIKAKUI SYLLABLE M094 MON;Lo;0;R;;;;;N;;;;; +1E819;MENDE KIKAKUI SYLLABLE M154 MUAN;Lo;0;R;;;;;N;;;;; +1E81A;MENDE KIKAKUI SYLLABLE M189 MUEN;Lo;0;R;;;;;N;;;;; +1E81B;MENDE KIKAKUI SYLLABLE M010 BI;Lo;0;R;;;;;N;;;;; +1E81C;MENDE KIKAKUI SYLLABLE M011 BA;Lo;0;R;;;;;N;;;;; +1E81D;MENDE KIKAKUI SYLLABLE M012 BU;Lo;0;R;;;;;N;;;;; +1E81E;MENDE KIKAKUI SYLLABLE M150 BEE;Lo;0;R;;;;;N;;;;; +1E81F;MENDE KIKAKUI SYLLABLE M097 BE;Lo;0;R;;;;;N;;;;; +1E820;MENDE KIKAKUI SYLLABLE M103 BOO;Lo;0;R;;;;;N;;;;; +1E821;MENDE KIKAKUI SYLLABLE M138 BO;Lo;0;R;;;;;N;;;;; +1E822;MENDE KIKAKUI SYLLABLE 
M013 I;Lo;0;R;;;;;N;;;;; +1E823;MENDE KIKAKUI SYLLABLE M014 A;Lo;0;R;;;;;N;;;;; +1E824;MENDE KIKAKUI SYLLABLE M015 U;Lo;0;R;;;;;N;;;;; +1E825;MENDE KIKAKUI SYLLABLE M163 EE;Lo;0;R;;;;;N;;;;; +1E826;MENDE KIKAKUI SYLLABLE M100 E;Lo;0;R;;;;;N;;;;; +1E827;MENDE KIKAKUI SYLLABLE M165 OO;Lo;0;R;;;;;N;;;;; +1E828;MENDE KIKAKUI SYLLABLE M147 O;Lo;0;R;;;;;N;;;;; +1E829;MENDE KIKAKUI SYLLABLE M137 EI;Lo;0;R;;;;;N;;;;; +1E82A;MENDE KIKAKUI SYLLABLE M131 IN;Lo;0;R;;;;;N;;;;; +1E82B;MENDE KIKAKUI SYLLABLE M135 IN;Lo;0;R;;;;;N;;;;; +1E82C;MENDE KIKAKUI SYLLABLE M195 AN;Lo;0;R;;;;;N;;;;; +1E82D;MENDE KIKAKUI SYLLABLE M178 EN;Lo;0;R;;;;;N;;;;; +1E82E;MENDE KIKAKUI SYLLABLE M019 SI;Lo;0;R;;;;;N;;;;; +1E82F;MENDE KIKAKUI SYLLABLE M020 SA;Lo;0;R;;;;;N;;;;; +1E830;MENDE KIKAKUI SYLLABLE M021 SU;Lo;0;R;;;;;N;;;;; +1E831;MENDE KIKAKUI SYLLABLE M162 SEE;Lo;0;R;;;;;N;;;;; +1E832;MENDE KIKAKUI SYLLABLE M116 SE;Lo;0;R;;;;;N;;;;; +1E833;MENDE KIKAKUI SYLLABLE M136 SOO;Lo;0;R;;;;;N;;;;; +1E834;MENDE KIKAKUI SYLLABLE M079 SO;Lo;0;R;;;;;N;;;;; +1E835;MENDE KIKAKUI SYLLABLE M196 SIA;Lo;0;R;;;;;N;;;;; +1E836;MENDE KIKAKUI SYLLABLE M025 LI;Lo;0;R;;;;;N;;;;; +1E837;MENDE KIKAKUI SYLLABLE M026 LA;Lo;0;R;;;;;N;;;;; +1E838;MENDE KIKAKUI SYLLABLE M027 LU;Lo;0;R;;;;;N;;;;; +1E839;MENDE KIKAKUI SYLLABLE M084 LEE;Lo;0;R;;;;;N;;;;; +1E83A;MENDE KIKAKUI SYLLABLE M073 LE;Lo;0;R;;;;;N;;;;; +1E83B;MENDE KIKAKUI SYLLABLE M054 LOO;Lo;0;R;;;;;N;;;;; +1E83C;MENDE KIKAKUI SYLLABLE M153 LO;Lo;0;R;;;;;N;;;;; +1E83D;MENDE KIKAKUI SYLLABLE M110 LONG LE;Lo;0;R;;;;;N;;;;; +1E83E;MENDE KIKAKUI SYLLABLE M016 DI;Lo;0;R;;;;;N;;;;; +1E83F;MENDE KIKAKUI SYLLABLE M017 DA;Lo;0;R;;;;;N;;;;; +1E840;MENDE KIKAKUI SYLLABLE M018 DU;Lo;0;R;;;;;N;;;;; +1E841;MENDE KIKAKUI SYLLABLE M089 DEE;Lo;0;R;;;;;N;;;;; +1E842;MENDE KIKAKUI SYLLABLE M180 DOO;Lo;0;R;;;;;N;;;;; +1E843;MENDE KIKAKUI SYLLABLE M181 DO;Lo;0;R;;;;;N;;;;; +1E844;MENDE KIKAKUI SYLLABLE M022 TI;Lo;0;R;;;;;N;;;;; +1E845;MENDE KIKAKUI SYLLABLE M023 TA;Lo;0;R;;;;;N;;;;; 
+1E846;MENDE KIKAKUI SYLLABLE M024 TU;Lo;0;R;;;;;N;;;;; +1E847;MENDE KIKAKUI SYLLABLE M091 TEE;Lo;0;R;;;;;N;;;;; +1E848;MENDE KIKAKUI SYLLABLE M055 TE;Lo;0;R;;;;;N;;;;; +1E849;MENDE KIKAKUI SYLLABLE M104 TOO;Lo;0;R;;;;;N;;;;; +1E84A;MENDE KIKAKUI SYLLABLE M069 TO;Lo;0;R;;;;;N;;;;; +1E84B;MENDE KIKAKUI SYLLABLE M028 JI;Lo;0;R;;;;;N;;;;; +1E84C;MENDE KIKAKUI SYLLABLE M029 JA;Lo;0;R;;;;;N;;;;; +1E84D;MENDE KIKAKUI SYLLABLE M030 JU;Lo;0;R;;;;;N;;;;; +1E84E;MENDE KIKAKUI SYLLABLE M157 JEE;Lo;0;R;;;;;N;;;;; +1E84F;MENDE KIKAKUI SYLLABLE M113 JE;Lo;0;R;;;;;N;;;;; +1E850;MENDE KIKAKUI SYLLABLE M160 JOO;Lo;0;R;;;;;N;;;;; +1E851;MENDE KIKAKUI SYLLABLE M063 JO;Lo;0;R;;;;;N;;;;; +1E852;MENDE KIKAKUI SYLLABLE M175 LONG JO;Lo;0;R;;;;;N;;;;; +1E853;MENDE KIKAKUI SYLLABLE M031 YI;Lo;0;R;;;;;N;;;;; +1E854;MENDE KIKAKUI SYLLABLE M032 YA;Lo;0;R;;;;;N;;;;; +1E855;MENDE KIKAKUI SYLLABLE M033 YU;Lo;0;R;;;;;N;;;;; +1E856;MENDE KIKAKUI SYLLABLE M109 YEE;Lo;0;R;;;;;N;;;;; +1E857;MENDE KIKAKUI SYLLABLE M080 YE;Lo;0;R;;;;;N;;;;; +1E858;MENDE KIKAKUI SYLLABLE M141 YOO;Lo;0;R;;;;;N;;;;; +1E859;MENDE KIKAKUI SYLLABLE M121 YO;Lo;0;R;;;;;N;;;;; +1E85A;MENDE KIKAKUI SYLLABLE M034 FI;Lo;0;R;;;;;N;;;;; +1E85B;MENDE KIKAKUI SYLLABLE M035 FA;Lo;0;R;;;;;N;;;;; +1E85C;MENDE KIKAKUI SYLLABLE M036 FU;Lo;0;R;;;;;N;;;;; +1E85D;MENDE KIKAKUI SYLLABLE M078 FEE;Lo;0;R;;;;;N;;;;; +1E85E;MENDE KIKAKUI SYLLABLE M075 FE;Lo;0;R;;;;;N;;;;; +1E85F;MENDE KIKAKUI SYLLABLE M133 FOO;Lo;0;R;;;;;N;;;;; +1E860;MENDE KIKAKUI SYLLABLE M088 FO;Lo;0;R;;;;;N;;;;; +1E861;MENDE KIKAKUI SYLLABLE M197 FUA;Lo;0;R;;;;;N;;;;; +1E862;MENDE KIKAKUI SYLLABLE M101 FAN;Lo;0;R;;;;;N;;;;; +1E863;MENDE KIKAKUI SYLLABLE M037 NIN;Lo;0;R;;;;;N;;;;; +1E864;MENDE KIKAKUI SYLLABLE M038 NAN;Lo;0;R;;;;;N;;;;; +1E865;MENDE KIKAKUI SYLLABLE M039 NUN;Lo;0;R;;;;;N;;;;; +1E866;MENDE KIKAKUI SYLLABLE M117 NEN;Lo;0;R;;;;;N;;;;; +1E867;MENDE KIKAKUI SYLLABLE M169 NON;Lo;0;R;;;;;N;;;;; +1E868;MENDE KIKAKUI SYLLABLE M176 HI;Lo;0;R;;;;;N;;;;; +1E869;MENDE 
KIKAKUI SYLLABLE M041 HA;Lo;0;R;;;;;N;;;;; +1E86A;MENDE KIKAKUI SYLLABLE M186 HU;Lo;0;R;;;;;N;;;;; +1E86B;MENDE KIKAKUI SYLLABLE M040 HEE;Lo;0;R;;;;;N;;;;; +1E86C;MENDE KIKAKUI SYLLABLE M096 HE;Lo;0;R;;;;;N;;;;; +1E86D;MENDE KIKAKUI SYLLABLE M042 HOO;Lo;0;R;;;;;N;;;;; +1E86E;MENDE KIKAKUI SYLLABLE M140 HO;Lo;0;R;;;;;N;;;;; +1E86F;MENDE KIKAKUI SYLLABLE M083 HEEI;Lo;0;R;;;;;N;;;;; +1E870;MENDE KIKAKUI SYLLABLE M128 HOOU;Lo;0;R;;;;;N;;;;; +1E871;MENDE KIKAKUI SYLLABLE M053 HIN;Lo;0;R;;;;;N;;;;; +1E872;MENDE KIKAKUI SYLLABLE M130 HAN;Lo;0;R;;;;;N;;;;; +1E873;MENDE KIKAKUI SYLLABLE M087 HUN;Lo;0;R;;;;;N;;;;; +1E874;MENDE KIKAKUI SYLLABLE M052 HEN;Lo;0;R;;;;;N;;;;; +1E875;MENDE KIKAKUI SYLLABLE M193 HON;Lo;0;R;;;;;N;;;;; +1E876;MENDE KIKAKUI SYLLABLE M046 HUAN;Lo;0;R;;;;;N;;;;; +1E877;MENDE KIKAKUI SYLLABLE M090 NGGI;Lo;0;R;;;;;N;;;;; +1E878;MENDE KIKAKUI SYLLABLE M043 NGGA;Lo;0;R;;;;;N;;;;; +1E879;MENDE KIKAKUI SYLLABLE M082 NGGU;Lo;0;R;;;;;N;;;;; +1E87A;MENDE KIKAKUI SYLLABLE M115 NGGEE;Lo;0;R;;;;;N;;;;; +1E87B;MENDE KIKAKUI SYLLABLE M146 NGGE;Lo;0;R;;;;;N;;;;; +1E87C;MENDE KIKAKUI SYLLABLE M156 NGGOO;Lo;0;R;;;;;N;;;;; +1E87D;MENDE KIKAKUI SYLLABLE M120 NGGO;Lo;0;R;;;;;N;;;;; +1E87E;MENDE KIKAKUI SYLLABLE M159 NGGAA;Lo;0;R;;;;;N;;;;; +1E87F;MENDE KIKAKUI SYLLABLE M127 NGGUA;Lo;0;R;;;;;N;;;;; +1E880;MENDE KIKAKUI SYLLABLE M086 LONG NGGE;Lo;0;R;;;;;N;;;;; +1E881;MENDE KIKAKUI SYLLABLE M106 LONG NGGOO;Lo;0;R;;;;;N;;;;; +1E882;MENDE KIKAKUI SYLLABLE M183 LONG NGGO;Lo;0;R;;;;;N;;;;; +1E883;MENDE KIKAKUI SYLLABLE M155 GI;Lo;0;R;;;;;N;;;;; +1E884;MENDE KIKAKUI SYLLABLE M111 GA;Lo;0;R;;;;;N;;;;; +1E885;MENDE KIKAKUI SYLLABLE M168 GU;Lo;0;R;;;;;N;;;;; +1E886;MENDE KIKAKUI SYLLABLE M190 GEE;Lo;0;R;;;;;N;;;;; +1E887;MENDE KIKAKUI SYLLABLE M166 GUEI;Lo;0;R;;;;;N;;;;; +1E888;MENDE KIKAKUI SYLLABLE M167 GUAN;Lo;0;R;;;;;N;;;;; +1E889;MENDE KIKAKUI SYLLABLE M184 NGEN;Lo;0;R;;;;;N;;;;; +1E88A;MENDE KIKAKUI SYLLABLE M057 NGON;Lo;0;R;;;;;N;;;;; +1E88B;MENDE KIKAKUI SYLLABLE M177 
NGUAN;Lo;0;R;;;;;N;;;;; +1E88C;MENDE KIKAKUI SYLLABLE M068 PI;Lo;0;R;;;;;N;;;;; +1E88D;MENDE KIKAKUI SYLLABLE M099 PA;Lo;0;R;;;;;N;;;;; +1E88E;MENDE KIKAKUI SYLLABLE M050 PU;Lo;0;R;;;;;N;;;;; +1E88F;MENDE KIKAKUI SYLLABLE M081 PEE;Lo;0;R;;;;;N;;;;; +1E890;MENDE KIKAKUI SYLLABLE M051 PE;Lo;0;R;;;;;N;;;;; +1E891;MENDE KIKAKUI SYLLABLE M102 POO;Lo;0;R;;;;;N;;;;; +1E892;MENDE KIKAKUI SYLLABLE M066 PO;Lo;0;R;;;;;N;;;;; +1E893;MENDE KIKAKUI SYLLABLE M145 MBI;Lo;0;R;;;;;N;;;;; +1E894;MENDE KIKAKUI SYLLABLE M062 MBA;Lo;0;R;;;;;N;;;;; +1E895;MENDE KIKAKUI SYLLABLE M122 MBU;Lo;0;R;;;;;N;;;;; +1E896;MENDE KIKAKUI SYLLABLE M047 MBEE;Lo;0;R;;;;;N;;;;; +1E897;MENDE KIKAKUI SYLLABLE M188 MBEE;Lo;0;R;;;;;N;;;;; +1E898;MENDE KIKAKUI SYLLABLE M072 MBE;Lo;0;R;;;;;N;;;;; +1E899;MENDE KIKAKUI SYLLABLE M172 MBOO;Lo;0;R;;;;;N;;;;; +1E89A;MENDE KIKAKUI SYLLABLE M174 MBO;Lo;0;R;;;;;N;;;;; +1E89B;MENDE KIKAKUI SYLLABLE M187 MBUU;Lo;0;R;;;;;N;;;;; +1E89C;MENDE KIKAKUI SYLLABLE M161 LONG MBE;Lo;0;R;;;;;N;;;;; +1E89D;MENDE KIKAKUI SYLLABLE M105 LONG MBOO;Lo;0;R;;;;;N;;;;; +1E89E;MENDE KIKAKUI SYLLABLE M142 LONG MBO;Lo;0;R;;;;;N;;;;; +1E89F;MENDE KIKAKUI SYLLABLE M132 KPI;Lo;0;R;;;;;N;;;;; +1E8A0;MENDE KIKAKUI SYLLABLE M092 KPA;Lo;0;R;;;;;N;;;;; +1E8A1;MENDE KIKAKUI SYLLABLE M074 KPU;Lo;0;R;;;;;N;;;;; +1E8A2;MENDE KIKAKUI SYLLABLE M044 KPEE;Lo;0;R;;;;;N;;;;; +1E8A3;MENDE KIKAKUI SYLLABLE M108 KPE;Lo;0;R;;;;;N;;;;; +1E8A4;MENDE KIKAKUI SYLLABLE M112 KPOO;Lo;0;R;;;;;N;;;;; +1E8A5;MENDE KIKAKUI SYLLABLE M158 KPO;Lo;0;R;;;;;N;;;;; +1E8A6;MENDE KIKAKUI SYLLABLE M124 GBI;Lo;0;R;;;;;N;;;;; +1E8A7;MENDE KIKAKUI SYLLABLE M056 GBA;Lo;0;R;;;;;N;;;;; +1E8A8;MENDE KIKAKUI SYLLABLE M148 GBU;Lo;0;R;;;;;N;;;;; +1E8A9;MENDE KIKAKUI SYLLABLE M093 GBEE;Lo;0;R;;;;;N;;;;; +1E8AA;MENDE KIKAKUI SYLLABLE M107 GBE;Lo;0;R;;;;;N;;;;; +1E8AB;MENDE KIKAKUI SYLLABLE M071 GBOO;Lo;0;R;;;;;N;;;;; +1E8AC;MENDE KIKAKUI SYLLABLE M070 GBO;Lo;0;R;;;;;N;;;;; +1E8AD;MENDE KIKAKUI SYLLABLE M171 RA;Lo;0;R;;;;;N;;;;; +1E8AE;MENDE 
KIKAKUI SYLLABLE M123 NDI;Lo;0;R;;;;;N;;;;; +1E8AF;MENDE KIKAKUI SYLLABLE M129 NDA;Lo;0;R;;;;;N;;;;; +1E8B0;MENDE KIKAKUI SYLLABLE M125 NDU;Lo;0;R;;;;;N;;;;; +1E8B1;MENDE KIKAKUI SYLLABLE M191 NDEE;Lo;0;R;;;;;N;;;;; +1E8B2;MENDE KIKAKUI SYLLABLE M119 NDE;Lo;0;R;;;;;N;;;;; +1E8B3;MENDE KIKAKUI SYLLABLE M067 NDOO;Lo;0;R;;;;;N;;;;; +1E8B4;MENDE KIKAKUI SYLLABLE M064 NDO;Lo;0;R;;;;;N;;;;; +1E8B5;MENDE KIKAKUI SYLLABLE M152 NJA;Lo;0;R;;;;;N;;;;; +1E8B6;MENDE KIKAKUI SYLLABLE M192 NJU;Lo;0;R;;;;;N;;;;; +1E8B7;MENDE KIKAKUI SYLLABLE M149 NJEE;Lo;0;R;;;;;N;;;;; +1E8B8;MENDE KIKAKUI SYLLABLE M134 NJOO;Lo;0;R;;;;;N;;;;; +1E8B9;MENDE KIKAKUI SYLLABLE M182 VI;Lo;0;R;;;;;N;;;;; +1E8BA;MENDE KIKAKUI SYLLABLE M185 VA;Lo;0;R;;;;;N;;;;; +1E8BB;MENDE KIKAKUI SYLLABLE M151 VU;Lo;0;R;;;;;N;;;;; +1E8BC;MENDE KIKAKUI SYLLABLE M173 VEE;Lo;0;R;;;;;N;;;;; +1E8BD;MENDE KIKAKUI SYLLABLE M085 VE;Lo;0;R;;;;;N;;;;; +1E8BE;MENDE KIKAKUI SYLLABLE M144 VOO;Lo;0;R;;;;;N;;;;; +1E8BF;MENDE KIKAKUI SYLLABLE M077 VO;Lo;0;R;;;;;N;;;;; +1E8C0;MENDE KIKAKUI SYLLABLE M164 NYIN;Lo;0;R;;;;;N;;;;; +1E8C1;MENDE KIKAKUI SYLLABLE M058 NYAN;Lo;0;R;;;;;N;;;;; +1E8C2;MENDE KIKAKUI SYLLABLE M170 NYUN;Lo;0;R;;;;;N;;;;; +1E8C3;MENDE KIKAKUI SYLLABLE M098 NYEN;Lo;0;R;;;;;N;;;;; +1E8C4;MENDE KIKAKUI SYLLABLE M060 NYON;Lo;0;R;;;;;N;;;;; +1E8C7;MENDE KIKAKUI DIGIT ONE;No;0;R;;;;1;N;;;;; +1E8C8;MENDE KIKAKUI DIGIT TWO;No;0;R;;;;2;N;;;;; +1E8C9;MENDE KIKAKUI DIGIT THREE;No;0;R;;;;3;N;;;;; +1E8CA;MENDE KIKAKUI DIGIT FOUR;No;0;R;;;;4;N;;;;; +1E8CB;MENDE KIKAKUI DIGIT FIVE;No;0;R;;;;5;N;;;;; +1E8CC;MENDE KIKAKUI DIGIT SIX;No;0;R;;;;6;N;;;;; +1E8CD;MENDE KIKAKUI DIGIT SEVEN;No;0;R;;;;7;N;;;;; +1E8CE;MENDE KIKAKUI DIGIT EIGHT;No;0;R;;;;8;N;;;;; +1E8CF;MENDE KIKAKUI DIGIT NINE;No;0;R;;;;9;N;;;;; +1E8D0;MENDE KIKAKUI COMBINING NUMBER TEENS;Mn;220;NSM;;;;;N;;;;; +1E8D1;MENDE KIKAKUI COMBINING NUMBER TENS;Mn;220;NSM;;;;;N;;;;; +1E8D2;MENDE KIKAKUI COMBINING NUMBER HUNDREDS;Mn;220;NSM;;;;;N;;;;; +1E8D3;MENDE KIKAKUI COMBINING NUMBER 
THOUSANDS;Mn;220;NSM;;;;;N;;;;; +1E8D4;MENDE KIKAKUI COMBINING NUMBER TEN THOUSANDS;Mn;220;NSM;;;;;N;;;;; +1E8D5;MENDE KIKAKUI COMBINING NUMBER HUNDRED THOUSANDS;Mn;220;NSM;;;;;N;;;;; +1E8D6;MENDE KIKAKUI COMBINING NUMBER MILLIONS;Mn;220;NSM;;;;;N;;;;; 1EE00;ARABIC MATHEMATICAL ALEF;Lo;0;AL;<font> 0627;;;;N;;;;; 1EE01;ARABIC MATHEMATICAL BEH;Lo;0;AL;<font> 0628;;;;N;;;;; 1EE02;ARABIC MATHEMATICAL JEEM;Lo;0;AL;<font> 062C;;;;N;;;;; @@ -22485,6 +24780,7 @@ 1F0BC;PLAYING CARD KNIGHT OF HEARTS;So;0;ON;;;;;N;;;;; 1F0BD;PLAYING CARD QUEEN OF HEARTS;So;0;ON;;;;;N;;;;; 1F0BE;PLAYING CARD KING OF HEARTS;So;0;ON;;;;;N;;;;; +1F0BF;PLAYING CARD RED JOKER;So;0;ON;;;;;N;;;;; 1F0C1;PLAYING CARD ACE OF DIAMONDS;So;0;ON;;;;;N;;;;; 1F0C2;PLAYING CARD TWO OF DIAMONDS;So;0;ON;;;;;N;;;;; 1F0C3;PLAYING CARD THREE OF DIAMONDS;So;0;ON;;;;;N;;;;; @@ -22515,6 +24811,28 @@ 1F0DD;PLAYING CARD QUEEN OF CLUBS;So;0;ON;;;;;N;;;;; 1F0DE;PLAYING CARD KING OF CLUBS;So;0;ON;;;;;N;;;;; 1F0DF;PLAYING CARD WHITE JOKER;So;0;ON;;;;;N;;;;; +1F0E0;PLAYING CARD FOOL;So;0;ON;;;;;N;;;;; +1F0E1;PLAYING CARD TRUMP-1;So;0;ON;;;;;N;;;;; +1F0E2;PLAYING CARD TRUMP-2;So;0;ON;;;;;N;;;;; +1F0E3;PLAYING CARD TRUMP-3;So;0;ON;;;;;N;;;;; +1F0E4;PLAYING CARD TRUMP-4;So;0;ON;;;;;N;;;;; +1F0E5;PLAYING CARD TRUMP-5;So;0;ON;;;;;N;;;;; +1F0E6;PLAYING CARD TRUMP-6;So;0;ON;;;;;N;;;;; +1F0E7;PLAYING CARD TRUMP-7;So;0;ON;;;;;N;;;;; +1F0E8;PLAYING CARD TRUMP-8;So;0;ON;;;;;N;;;;; +1F0E9;PLAYING CARD TRUMP-9;So;0;ON;;;;;N;;;;; +1F0EA;PLAYING CARD TRUMP-10;So;0;ON;;;;;N;;;;; +1F0EB;PLAYING CARD TRUMP-11;So;0;ON;;;;;N;;;;; +1F0EC;PLAYING CARD TRUMP-12;So;0;ON;;;;;N;;;;; +1F0ED;PLAYING CARD TRUMP-13;So;0;ON;;;;;N;;;;; +1F0EE;PLAYING CARD TRUMP-14;So;0;ON;;;;;N;;;;; +1F0EF;PLAYING CARD TRUMP-15;So;0;ON;;;;;N;;;;; +1F0F0;PLAYING CARD TRUMP-16;So;0;ON;;;;;N;;;;; +1F0F1;PLAYING CARD TRUMP-17;So;0;ON;;;;;N;;;;; +1F0F2;PLAYING CARD TRUMP-18;So;0;ON;;;;;N;;;;; +1F0F3;PLAYING CARD TRUMP-19;So;0;ON;;;;;N;;;;; +1F0F4;PLAYING CARD 
TRUMP-20;So;0;ON;;;;;N;;;;; +1F0F5;PLAYING CARD TRUMP-21;So;0;ON;;;;;N;;;;; 1F100;DIGIT ZERO FULL STOP;No;0;EN;<compat> 0030 002E;;0;0;N;;;;; 1F101;DIGIT ZERO COMMA;No;0;EN;<compat> 0030 002C;;0;0;N;;;;; 1F102;DIGIT ONE COMMA;No;0;EN;<compat> 0031 002C;;1;1;N;;;;; @@ -22526,6 +24844,8 @@ 1F108;DIGIT SEVEN COMMA;No;0;EN;<compat> 0037 002C;;7;7;N;;;;; 1F109;DIGIT EIGHT COMMA;No;0;EN;<compat> 0038 002C;;8;8;N;;;;; 1F10A;DIGIT NINE COMMA;No;0;EN;<compat> 0039 002C;;9;9;N;;;;; +1F10B;DINGBAT CIRCLED SANS-SERIF DIGIT ZERO;No;0;ON;;;;0;N;;;;; +1F10C;DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO;No;0;ON;;;;0;N;;;;; 1F110;PARENTHESIZED LATIN CAPITAL LETTER A;So;0;L;<compat> 0028 0041 0029;;;;N;;;;; 1F111;PARENTHESIZED LATIN CAPITAL LETTER B;So;0;L;<compat> 0028 0042 0029;;;;N;;;;; 1F112;PARENTHESIZED LATIN CAPITAL LETTER C;So;0;L;<compat> 0028 0043 0029;;;;N;;;;; @@ -22776,12 +25096,25 @@ 1F31E;SUN WITH FACE;So;0;ON;;;;;N;;;;; 1F31F;GLOWING STAR;So;0;ON;;;;;N;;;;; 1F320;SHOOTING STAR;So;0;ON;;;;;N;;;;; +1F321;THERMOMETER;So;0;ON;;;;;N;;;;; +1F322;BLACK DROPLET;So;0;ON;;;;;N;;;;; +1F323;WHITE SUN;So;0;ON;;;;;N;;;;; +1F324;WHITE SUN WITH SMALL CLOUD;So;0;ON;;;;;N;;;;; +1F325;WHITE SUN BEHIND CLOUD;So;0;ON;;;;;N;;;;; +1F326;WHITE SUN BEHIND CLOUD WITH RAIN;So;0;ON;;;;;N;;;;; +1F327;CLOUD WITH RAIN;So;0;ON;;;;;N;;;;; +1F328;CLOUD WITH SNOW;So;0;ON;;;;;N;;;;; +1F329;CLOUD WITH LIGHTNING;So;0;ON;;;;;N;;;;; +1F32A;CLOUD WITH TORNADO;So;0;ON;;;;;N;;;;; +1F32B;FOG;So;0;ON;;;;;N;;;;; +1F32C;WIND BLOWING FACE;So;0;ON;;;;;N;;;;; 1F330;CHESTNUT;So;0;ON;;;;;N;;;;; 1F331;SEEDLING;So;0;ON;;;;;N;;;;; 1F332;EVERGREEN TREE;So;0;ON;;;;;N;;;;; 1F333;DECIDUOUS TREE;So;0;ON;;;;;N;;;;; 1F334;PALM TREE;So;0;ON;;;;;N;;;;; 1F335;CACTUS;So;0;ON;;;;;N;;;;; +1F336;HOT PEPPER;So;0;ON;;;;;N;;;;; 1F337;TULIP;So;0;ON;;;;;N;;;;; 1F338;CHERRY BLOSSOM;So;0;ON;;;;;N;;;;; 1F339;ROSE;So;0;ON;;;;;N;;;;; @@ -22852,6 +25185,7 @@ 1F37A;BEER MUG;So;0;ON;;;;;N;;;;; 1F37B;CLINKING BEER MUGS;So;0;ON;;;;;N;;;;; 
1F37C;BABY BOTTLE;So;0;ON;;;;;N;;;;; +1F37D;FORK AND KNIFE WITH PLATE;So;0;ON;;;;;N;;;;; 1F380;RIBBON;So;0;ON;;;;;N;;;;; 1F381;WRAPPED PRESENT;So;0;ON;;;;;N;;;;; 1F382;BIRTHDAY CAKE;So;0;ON;;;;;N;;;;; @@ -22872,6 +25206,18 @@ 1F391;MOON VIEWING CEREMONY;So;0;ON;;;;;N;;;;; 1F392;SCHOOL SATCHEL;So;0;ON;;;;;N;;;;; 1F393;GRADUATION CAP;So;0;ON;;;;;N;;;;; +1F394;HEART WITH TIP ON THE LEFT;So;0;ON;;;;;N;;;;; +1F395;BOUQUET OF FLOWERS;So;0;ON;;;;;N;;;;; +1F396;MILITARY MEDAL;So;0;ON;;;;;N;;;;; +1F397;REMINDER RIBBON;So;0;ON;;;;;N;;;;; +1F398;MUSICAL KEYBOARD WITH JACKS;So;0;ON;;;;;N;;;;; +1F399;STUDIO MICROPHONE;So;0;ON;;;;;N;;;;; +1F39A;LEVEL SLIDER;So;0;ON;;;;;N;;;;; +1F39B;CONTROL KNOBS;So;0;ON;;;;;N;;;;; +1F39C;BEAMED ASCENDING MUSICAL NOTES;So;0;ON;;;;;N;;;;; +1F39D;BEAMED DESCENDING MUSICAL NOTES;So;0;ON;;;;;N;;;;; +1F39E;FILM FRAMES;So;0;ON;;;;;N;;;;; +1F39F;ADMISSION TICKETS;So;0;ON;;;;;N;;;;; 1F3A0;CAROUSEL HORSE;So;0;ON;;;;;N;;;;; 1F3A1;FERRIS WHEEL;So;0;ON;;;;;N;;;;; 1F3A2;ROLLER COASTER;So;0;ON;;;;;N;;;;; @@ -22909,11 +25255,28 @@ 1F3C2;SNOWBOARDER;So;0;ON;;;;;N;;;;; 1F3C3;RUNNER;So;0;ON;;;;;N;;;;; 1F3C4;SURFER;So;0;ON;;;;;N;;;;; +1F3C5;SPORTS MEDAL;So;0;ON;;;;;N;;;;; 1F3C6;TROPHY;So;0;ON;;;;;N;;;;; 1F3C7;HORSE RACING;So;0;ON;;;;;N;;;;; 1F3C8;AMERICAN FOOTBALL;So;0;ON;;;;;N;;;;; 1F3C9;RUGBY FOOTBALL;So;0;ON;;;;;N;;;;; 1F3CA;SWIMMER;So;0;ON;;;;;N;;;;; +1F3CB;WEIGHT LIFTER;So;0;ON;;;;;N;;;;; +1F3CC;GOLFER;So;0;ON;;;;;N;;;;; +1F3CD;RACING MOTORCYCLE;So;0;ON;;;;;N;;;;; +1F3CE;RACING CAR;So;0;ON;;;;;N;;;;; +1F3D4;SNOW CAPPED MOUNTAIN;So;0;ON;;;;;N;;;;; +1F3D5;CAMPING;So;0;ON;;;;;N;;;;; +1F3D6;BEACH WITH UMBRELLA;So;0;ON;;;;;N;;;;; +1F3D7;BUILDING CONSTRUCTION;So;0;ON;;;;;N;;;;; +1F3D8;HOUSE BUILDINGS;So;0;ON;;;;;N;;;;; +1F3D9;CITYSCAPE;So;0;ON;;;;;N;;;;; +1F3DA;DERELICT HOUSE BUILDING;So;0;ON;;;;;N;;;;; +1F3DB;CLASSICAL BUILDING;So;0;ON;;;;;N;;;;; +1F3DC;DESERT;So;0;ON;;;;;N;;;;; +1F3DD;DESERT ISLAND;So;0;ON;;;;;N;;;;; +1F3DE;NATIONAL PARK;So;0;ON;;;;;N;;;;; 
+1F3DF;STADIUM;So;0;ON;;;;;N;;;;; 1F3E0;HOUSE BUILDING;So;0;ON;;;;;N;;;;; 1F3E1;HOUSE WITH GARDEN;So;0;ON;;;;;N;;;;; 1F3E2;OFFICE BUILDING;So;0;ON;;;;;N;;;;; @@ -22931,6 +25294,13 @@ 1F3EE;IZAKAYA LANTERN;So;0;ON;;;;;N;;;;; 1F3EF;JAPANESE CASTLE;So;0;ON;;;;;N;;;;; 1F3F0;EUROPEAN CASTLE;So;0;ON;;;;;N;;;;; +1F3F1;WHITE PENNANT;So;0;ON;;;;;N;;;;; +1F3F2;BLACK PENNANT;So;0;ON;;;;;N;;;;; +1F3F3;WAVING WHITE FLAG;So;0;ON;;;;;N;;;;; +1F3F4;WAVING BLACK FLAG;So;0;ON;;;;;N;;;;; +1F3F5;ROSETTE;So;0;ON;;;;;N;;;;; +1F3F6;BLACK ROSETTE;So;0;ON;;;;;N;;;;; +1F3F7;LABEL;So;0;ON;;;;;N;;;;; 1F400;RAT;So;0;ON;;;;;N;;;;; 1F401;MOUSE;So;0;ON;;;;;N;;;;; 1F402;OX;So;0;ON;;;;;N;;;;; @@ -22994,7 +25364,9 @@ 1F43C;PANDA FACE;So;0;ON;;;;;N;;;;; 1F43D;PIG NOSE;So;0;ON;;;;;N;;;;; 1F43E;PAW PRINTS;So;0;ON;;;;;N;;;;; +1F43F;CHIPMUNK;So;0;ON;;;;;N;;;;; 1F440;EYES;So;0;ON;;;;;N;;;;; +1F441;EYE;So;0;ON;;;;;N;;;;; 1F442;EAR;So;0;ON;;;;;N;;;;; 1F443;NOSE;So;0;ON;;;;;N;;;;; 1F444;MOUTH;So;0;ON;;;;;N;;;;; @@ -23177,10 +25549,13 @@ 1F4F5;NO MOBILE PHONES;So;0;ON;;;;;N;;;;; 1F4F6;ANTENNA WITH BARS;So;0;ON;;;;;N;;;;; 1F4F7;CAMERA;So;0;ON;;;;;N;;;;; +1F4F8;CAMERA WITH FLASH;So;0;ON;;;;;N;;;;; 1F4F9;VIDEO CAMERA;So;0;ON;;;;;N;;;;; 1F4FA;TELEVISION;So;0;ON;;;;;N;;;;; 1F4FB;RADIO;So;0;ON;;;;;N;;;;; 1F4FC;VIDEOCASSETTE;So;0;ON;;;;;N;;;;; +1F4FD;FILM PROJECTOR;So;0;ON;;;;;N;;;;; +1F4FE;PORTABLE STEREO;So;0;ON;;;;;N;;;;; 1F500;TWISTED RIGHTWARDS ARROWS;So;0;ON;;;;;N;;;;; 1F501;CLOCKWISE RIGHTWARDS AND LEFTWARDS OPEN CIRCLE ARROWS;So;0;ON;;;;;N;;;;; 1F502;CLOCKWISE RIGHTWARDS AND LEFTWARDS OPEN CIRCLE ARROWS WITH CIRCLED ONE OVERLAY;So;0;ON;;;;;N;;;;; @@ -23243,10 +25618,19 @@ 1F53B;DOWN-POINTING RED TRIANGLE;So;0;ON;;;;;N;;;;; 1F53C;UP-POINTING SMALL RED TRIANGLE;So;0;ON;;;;;N;;;;; 1F53D;DOWN-POINTING SMALL RED TRIANGLE;So;0;ON;;;;;N;;;;; +1F53E;LOWER RIGHT SHADOWED WHITE CIRCLE;So;0;ON;;;;;N;;;;; +1F53F;UPPER RIGHT SHADOWED WHITE CIRCLE;So;0;ON;;;;;N;;;;; 1F540;CIRCLED CROSS POMMEE;So;0;ON;;;;;N;;;;; 
1F541;CROSS POMMEE WITH HALF-CIRCLE BELOW;So;0;ON;;;;;N;;;;; 1F542;CROSS POMMEE;So;0;ON;;;;;N;;;;; 1F543;NOTCHED LEFT SEMICIRCLE WITH THREE DOTS;So;0;ON;;;;;N;;;;; +1F544;NOTCHED RIGHT SEMICIRCLE WITH THREE DOTS;So;0;ON;;;;;N;;;;; +1F545;SYMBOL FOR MARKS CHAPTER;So;0;ON;;;;;N;;;;; +1F546;WHITE LATIN CROSS;So;0;ON;;;;;N;;;;; +1F547;HEAVY LATIN CROSS;So;0;ON;;;;;N;;;;; +1F548;CELTIC CROSS;So;0;ON;;;;;N;;;;; +1F549;OM SYMBOL;So;0;ON;;;;;N;;;;; +1F54A;DOVE OF PEACE;So;0;ON;;;;;N;;;;; 1F550;CLOCK FACE ONE OCLOCK;So;0;ON;;;;;N;;;;; 1F551;CLOCK FACE TWO OCLOCK;So;0;ON;;;;;N;;;;; 1F552;CLOCK FACE THREE OCLOCK;So;0;ON;;;;;N;;;;; @@ -23271,6 +25655,151 @@ 1F565;CLOCK FACE TEN-THIRTY;So;0;ON;;;;;N;;;;; 1F566;CLOCK FACE ELEVEN-THIRTY;So;0;ON;;;;;N;;;;; 1F567;CLOCK FACE TWELVE-THIRTY;So;0;ON;;;;;N;;;;; +1F568;RIGHT SPEAKER;So;0;ON;;;;;N;;;;; +1F569;RIGHT SPEAKER WITH ONE SOUND WAVE;So;0;ON;;;;;N;;;;; +1F56A;RIGHT SPEAKER WITH THREE SOUND WAVES;So;0;ON;;;;;N;;;;; +1F56B;BULLHORN;So;0;ON;;;;;N;;;;; +1F56C;BULLHORN WITH SOUND WAVES;So;0;ON;;;;;N;;;;; +1F56D;RINGING BELL;So;0;ON;;;;;N;;;;; +1F56E;BOOK;So;0;ON;;;;;N;;;;; +1F56F;CANDLE;So;0;ON;;;;;N;;;;; +1F570;MANTELPIECE CLOCK;So;0;ON;;;;;N;;;;; +1F571;BLACK SKULL AND CROSSBONES;So;0;ON;;;;;N;;;;; +1F572;NO PIRACY;So;0;ON;;;;;N;;;;; +1F573;HOLE;So;0;ON;;;;;N;;;;; +1F574;MAN IN BUSINESS SUIT LEVITATING;So;0;ON;;;;;N;;;;; +1F575;SLEUTH OR SPY;So;0;ON;;;;;N;;;;; +1F576;DARK SUNGLASSES;So;0;ON;;;;;N;;;;; +1F577;SPIDER;So;0;ON;;;;;N;;;;; +1F578;SPIDER WEB;So;0;ON;;;;;N;;;;; +1F579;JOYSTICK;So;0;ON;;;;;N;;;;; +1F57B;LEFT HAND TELEPHONE RECEIVER;So;0;ON;;;;;N;;;;; +1F57C;TELEPHONE RECEIVER WITH PAGE;So;0;ON;;;;;N;;;;; +1F57D;RIGHT HAND TELEPHONE RECEIVER;So;0;ON;;;;;N;;;;; +1F57E;WHITE TOUCHTONE TELEPHONE;So;0;ON;;;;;N;;;;; +1F57F;BLACK TOUCHTONE TELEPHONE;So;0;ON;;;;;N;;;;; +1F580;TELEPHONE ON TOP OF MODEM;So;0;ON;;;;;N;;;;; +1F581;CLAMSHELL MOBILE PHONE;So;0;ON;;;;;N;;;;; +1F582;BACK OF ENVELOPE;So;0;ON;;;;;N;;;;; +1F583;STAMPED 
ENVELOPE;So;0;ON;;;;;N;;;;; +1F584;ENVELOPE WITH LIGHTNING;So;0;ON;;;;;N;;;;; +1F585;FLYING ENVELOPE;So;0;ON;;;;;N;;;;; +1F586;PEN OVER STAMPED ENVELOPE;So;0;ON;;;;;N;;;;; +1F587;LINKED PAPERCLIPS;So;0;ON;;;;;N;;;;; +1F588;BLACK PUSHPIN;So;0;ON;;;;;N;;;;; +1F589;LOWER LEFT PENCIL;So;0;ON;;;;;N;;;;; +1F58A;LOWER LEFT BALLPOINT PEN;So;0;ON;;;;;N;;;;; +1F58B;LOWER LEFT FOUNTAIN PEN;So;0;ON;;;;;N;;;;; +1F58C;LOWER LEFT PAINTBRUSH;So;0;ON;;;;;N;;;;; +1F58D;LOWER LEFT CRAYON;So;0;ON;;;;;N;;;;; +1F58E;LEFT WRITING HAND;So;0;ON;;;;;N;;;;; +1F58F;TURNED OK HAND SIGN;So;0;ON;;;;;N;;;;; +1F590;RAISED HAND WITH FINGERS SPLAYED;So;0;ON;;;;;N;;;;; +1F591;REVERSED RAISED HAND WITH FINGERS SPLAYED;So;0;ON;;;;;N;;;;; +1F592;REVERSED THUMBS UP SIGN;So;0;ON;;;;;N;;;;; +1F593;REVERSED THUMBS DOWN SIGN;So;0;ON;;;;;N;;;;; +1F594;REVERSED VICTORY HAND;So;0;ON;;;;;N;;;;; +1F595;REVERSED HAND WITH MIDDLE FINGER EXTENDED;So;0;ON;;;;;N;;;;; +1F596;RAISED HAND WITH PART BETWEEN MIDDLE AND RING FINGERS;So;0;ON;;;;;N;;;;; +1F597;WHITE DOWN POINTING LEFT HAND INDEX;So;0;ON;;;;;N;;;;; +1F598;SIDEWAYS WHITE LEFT POINTING INDEX;So;0;ON;;;;;N;;;;; +1F599;SIDEWAYS WHITE RIGHT POINTING INDEX;So;0;ON;;;;;N;;;;; +1F59A;SIDEWAYS BLACK LEFT POINTING INDEX;So;0;ON;;;;;N;;;;; +1F59B;SIDEWAYS BLACK RIGHT POINTING INDEX;So;0;ON;;;;;N;;;;; +1F59C;BLACK LEFT POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;; +1F59D;BLACK RIGHT POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;; +1F59E;SIDEWAYS WHITE UP POINTING INDEX;So;0;ON;;;;;N;;;;; +1F59F;SIDEWAYS WHITE DOWN POINTING INDEX;So;0;ON;;;;;N;;;;; +1F5A0;SIDEWAYS BLACK UP POINTING INDEX;So;0;ON;;;;;N;;;;; +1F5A1;SIDEWAYS BLACK DOWN POINTING INDEX;So;0;ON;;;;;N;;;;; +1F5A2;BLACK UP POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;; +1F5A3;BLACK DOWN POINTING BACKHAND INDEX;So;0;ON;;;;;N;;;;; +1F5A5;DESKTOP COMPUTER;So;0;ON;;;;;N;;;;; +1F5A6;KEYBOARD AND MOUSE;So;0;ON;;;;;N;;;;; +1F5A7;THREE NETWORKED COMPUTERS;So;0;ON;;;;;N;;;;; +1F5A8;PRINTER;So;0;ON;;;;;N;;;;; +1F5A9;POCKET 
CALCULATOR;So;0;ON;;;;;N;;;;; +1F5AA;BLACK HARD SHELL FLOPPY DISK;So;0;ON;;;;;N;;;;; +1F5AB;WHITE HARD SHELL FLOPPY DISK;So;0;ON;;;;;N;;;;; +1F5AC;SOFT SHELL FLOPPY DISK;So;0;ON;;;;;N;;;;; +1F5AD;TAPE CARTRIDGE;So;0;ON;;;;;N;;;;; +1F5AE;WIRED KEYBOARD;So;0;ON;;;;;N;;;;; +1F5AF;ONE BUTTON MOUSE;So;0;ON;;;;;N;;;;; +1F5B0;TWO BUTTON MOUSE;So;0;ON;;;;;N;;;;; +1F5B1;THREE BUTTON MOUSE;So;0;ON;;;;;N;;;;; +1F5B2;TRACKBALL;So;0;ON;;;;;N;;;;; +1F5B3;OLD PERSONAL COMPUTER;So;0;ON;;;;;N;;;;; +1F5B4;HARD DISK;So;0;ON;;;;;N;;;;; +1F5B5;SCREEN;So;0;ON;;;;;N;;;;; +1F5B6;PRINTER ICON;So;0;ON;;;;;N;;;;; +1F5B7;FAX ICON;So;0;ON;;;;;N;;;;; +1F5B8;OPTICAL DISC ICON;So;0;ON;;;;;N;;;;; +1F5B9;DOCUMENT WITH TEXT;So;0;ON;;;;;N;;;;; +1F5BA;DOCUMENT WITH TEXT AND PICTURE;So;0;ON;;;;;N;;;;; +1F5BB;DOCUMENT WITH PICTURE;So;0;ON;;;;;N;;;;; +1F5BC;FRAME WITH PICTURE;So;0;ON;;;;;N;;;;; +1F5BD;FRAME WITH TILES;So;0;ON;;;;;N;;;;; +1F5BE;FRAME WITH AN X;So;0;ON;;;;;N;;;;; +1F5BF;BLACK FOLDER;So;0;ON;;;;;N;;;;; +1F5C0;FOLDER;So;0;ON;;;;;N;;;;; +1F5C1;OPEN FOLDER;So;0;ON;;;;;N;;;;; +1F5C2;CARD INDEX DIVIDERS;So;0;ON;;;;;N;;;;; +1F5C3;CARD FILE BOX;So;0;ON;;;;;N;;;;; +1F5C4;FILE CABINET;So;0;ON;;;;;N;;;;; +1F5C5;EMPTY NOTE;So;0;ON;;;;;N;;;;; +1F5C6;EMPTY NOTE PAGE;So;0;ON;;;;;N;;;;; +1F5C7;EMPTY NOTE PAD;So;0;ON;;;;;N;;;;; +1F5C8;NOTE;So;0;ON;;;;;N;;;;; +1F5C9;NOTE PAGE;So;0;ON;;;;;N;;;;; +1F5CA;NOTE PAD;So;0;ON;;;;;N;;;;; +1F5CB;EMPTY DOCUMENT;So;0;ON;;;;;N;;;;; +1F5CC;EMPTY PAGE;So;0;ON;;;;;N;;;;; +1F5CD;EMPTY PAGES;So;0;ON;;;;;N;;;;; +1F5CE;DOCUMENT;So;0;ON;;;;;N;;;;; +1F5CF;PAGE;So;0;ON;;;;;N;;;;; +1F5D0;PAGES;So;0;ON;;;;;N;;;;; +1F5D1;WASTEBASKET;So;0;ON;;;;;N;;;;; +1F5D2;SPIRAL NOTE PAD;So;0;ON;;;;;N;;;;; +1F5D3;SPIRAL CALENDAR PAD;So;0;ON;;;;;N;;;;; +1F5D4;DESKTOP WINDOW;So;0;ON;;;;;N;;;;; +1F5D5;MINIMIZE;So;0;ON;;;;;N;;;;; +1F5D6;MAXIMIZE;So;0;ON;;;;;N;;;;; +1F5D7;OVERLAP;So;0;ON;;;;;N;;;;; +1F5D8;CLOCKWISE RIGHT AND LEFT SEMICIRCLE ARROWS;So;0;ON;;;;;N;;;;; +1F5D9;CANCELLATION 
X;So;0;ON;;;;;N;;;;; +1F5DA;INCREASE FONT SIZE SYMBOL;So;0;ON;;;;;N;;;;; +1F5DB;DECREASE FONT SIZE SYMBOL;So;0;ON;;;;;N;;;;; +1F5DC;COMPRESSION;So;0;ON;;;;;N;;;;; +1F5DD;OLD KEY;So;0;ON;;;;;N;;;;; +1F5DE;ROLLED-UP NEWSPAPER;So;0;ON;;;;;N;;;;; +1F5DF;PAGE WITH CIRCLED TEXT;So;0;ON;;;;;N;;;;; +1F5E0;STOCK CHART;So;0;ON;;;;;N;;;;; +1F5E1;DAGGER KNIFE;So;0;ON;;;;;N;;;;; +1F5E2;LIPS;So;0;ON;;;;;N;;;;; +1F5E3;SPEAKING HEAD IN SILHOUETTE;So;0;ON;;;;;N;;;;; +1F5E4;THREE RAYS ABOVE;So;0;ON;;;;;N;;;;; +1F5E5;THREE RAYS BELOW;So;0;ON;;;;;N;;;;; +1F5E6;THREE RAYS LEFT;So;0;ON;;;;;N;;;;; +1F5E7;THREE RAYS RIGHT;So;0;ON;;;;;N;;;;; +1F5E8;LEFT SPEECH BUBBLE;So;0;ON;;;;;N;;;;; +1F5E9;RIGHT SPEECH BUBBLE;So;0;ON;;;;;N;;;;; +1F5EA;TWO SPEECH BUBBLES;So;0;ON;;;;;N;;;;; +1F5EB;THREE SPEECH BUBBLES;So;0;ON;;;;;N;;;;; +1F5EC;LEFT THOUGHT BUBBLE;So;0;ON;;;;;N;;;;; +1F5ED;RIGHT THOUGHT BUBBLE;So;0;ON;;;;;N;;;;; +1F5EE;LEFT ANGER BUBBLE;So;0;ON;;;;;N;;;;; +1F5EF;RIGHT ANGER BUBBLE;So;0;ON;;;;;N;;;;; +1F5F0;MOOD BUBBLE;So;0;ON;;;;;N;;;;; +1F5F1;LIGHTNING MOOD BUBBLE;So;0;ON;;;;;N;;;;; +1F5F2;LIGHTNING MOOD;So;0;ON;;;;;N;;;;; +1F5F3;BALLOT BOX WITH BALLOT;So;0;ON;;;;;N;;;;; +1F5F4;BALLOT SCRIPT X;So;0;ON;;;;;N;;;;; +1F5F5;BALLOT BOX WITH SCRIPT X;So;0;ON;;;;;N;;;;; +1F5F6;BALLOT BOLD SCRIPT X;So;0;ON;;;;;N;;;;; +1F5F7;BALLOT BOX WITH BOLD SCRIPT X;So;0;ON;;;;;N;;;;; +1F5F8;LIGHT CHECK MARK;So;0;ON;;;;;N;;;;; +1F5F9;BALLOT BOX WITH BOLD CHECK;So;0;ON;;;;;N;;;;; +1F5FA;WORLD MAP;So;0;ON;;;;;N;;;;; 1F5FB;MOUNT FUJI;So;0;ON;;;;;N;;;;; 1F5FC;TOKYO TOWER;So;0;ON;;;;;N;;;;; 1F5FD;STATUE OF LIBERTY;So;0;ON;;;;;N;;;;; @@ -23341,6 +25870,8 @@ 1F63E;POUTING CAT FACE;So;0;ON;;;;;N;;;;; 1F63F;CRYING CAT FACE;So;0;ON;;;;;N;;;;; 1F640;WEARY CAT FACE;So;0;ON;;;;;N;;;;; +1F641;SLIGHTLY FROWNING FACE;So;0;ON;;;;;N;;;;; +1F642;SLIGHTLY SMILING FACE;So;0;ON;;;;;N;;;;; 1F645;FACE WITH NO GOOD GESTURE;So;0;ON;;;;;N;;;;; 1F646;FACE WITH OK GESTURE;So;0;ON;;;;;N;;;;; 1F647;PERSON BOWING DEEPLY;So;0;ON;;;;;N;;;;; @@ 
-23352,6 +25883,54 @@ 1F64D;PERSON FROWNING;So;0;ON;;;;;N;;;;; 1F64E;PERSON WITH POUTING FACE;So;0;ON;;;;;N;;;;; 1F64F;PERSON WITH FOLDED HANDS;So;0;ON;;;;;N;;;;; +1F650;NORTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F651;SOUTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F652;NORTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F653;SOUTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F654;TURNED NORTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F655;TURNED SOUTH WEST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F656;TURNED NORTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F657;TURNED SOUTH EAST POINTING LEAF;So;0;ON;;;;;N;;;;; +1F658;NORTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F659;SOUTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F65A;NORTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F65B;SOUTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F65C;HEAVY NORTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F65D;HEAVY SOUTH WEST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F65E;HEAVY NORTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F65F;HEAVY SOUTH EAST POINTING VINE LEAF;So;0;ON;;;;;N;;;;; +1F660;NORTH WEST POINTING BUD;So;0;ON;;;;;N;;;;; +1F661;SOUTH WEST POINTING BUD;So;0;ON;;;;;N;;;;; +1F662;NORTH EAST POINTING BUD;So;0;ON;;;;;N;;;;; +1F663;SOUTH EAST POINTING BUD;So;0;ON;;;;;N;;;;; +1F664;HEAVY NORTH WEST POINTING BUD;So;0;ON;;;;;N;;;;; +1F665;HEAVY SOUTH WEST POINTING BUD;So;0;ON;;;;;N;;;;; +1F666;HEAVY NORTH EAST POINTING BUD;So;0;ON;;;;;N;;;;; +1F667;HEAVY SOUTH EAST POINTING BUD;So;0;ON;;;;;N;;;;; +1F668;HOLLOW QUILT SQUARE ORNAMENT;So;0;ON;;;;;N;;;;; +1F669;HOLLOW QUILT SQUARE ORNAMENT IN BLACK SQUARE;So;0;ON;;;;;N;;;;; +1F66A;SOLID QUILT SQUARE ORNAMENT;So;0;ON;;;;;N;;;;; +1F66B;SOLID QUILT SQUARE ORNAMENT IN BLACK SQUARE;So;0;ON;;;;;N;;;;; +1F66C;LEFTWARDS ROCKET;So;0;ON;;;;;N;;;;; +1F66D;UPWARDS ROCKET;So;0;ON;;;;;N;;;;; +1F66E;RIGHTWARDS ROCKET;So;0;ON;;;;;N;;;;; +1F66F;DOWNWARDS ROCKET;So;0;ON;;;;;N;;;;; +1F670;SCRIPT LIGATURE ET ORNAMENT;So;0;ON;;;;;N;;;;; +1F671;HEAVY SCRIPT LIGATURE 
ET ORNAMENT;So;0;ON;;;;;N;;;;; +1F672;LIGATURE OPEN ET ORNAMENT;So;0;ON;;;;;N;;;;; +1F673;HEAVY LIGATURE OPEN ET ORNAMENT;So;0;ON;;;;;N;;;;; +1F674;HEAVY AMPERSAND ORNAMENT;So;0;ON;;;;;N;;;;; +1F675;SWASH AMPERSAND ORNAMENT;So;0;ON;;;;;N;;;;; +1F676;SANS-SERIF HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT;So;0;ON;;;;;N;;;;; +1F677;SANS-SERIF HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT;So;0;ON;;;;;N;;;;; +1F678;SANS-SERIF HEAVY LOW DOUBLE COMMA QUOTATION MARK ORNAMENT;So;0;ON;;;;;N;;;;; +1F679;HEAVY INTERROBANG ORNAMENT;So;0;ON;;;;;N;;;;; +1F67A;SANS-SERIF INTERROBANG ORNAMENT;So;0;ON;;;;;N;;;;; +1F67B;HEAVY SANS-SERIF INTERROBANG ORNAMENT;So;0;ON;;;;;N;;;;; +1F67C;VERY HEAVY SOLIDUS;So;0;ON;;;;;N;;;;; +1F67D;VERY HEAVY REVERSE SOLIDUS;So;0;ON;;;;;N;;;;; +1F67E;CHECKER BOARD;So;0;ON;;;;;N;;;;; +1F67F;REVERSE CHECKER BOARD;So;0;ON;;;;;N;;;;; 1F680;ROCKET;So;0;ON;;;;;N;;;;; 1F681;HELICOPTER;So;0;ON;;;;;N;;;;; 1F682;STEAM LOCOMOTIVE;So;0;ON;;;;;N;;;;; @@ -23422,6 +26001,33 @@ 1F6C3;CUSTOMS;So;0;ON;;;;;N;;;;; 1F6C4;BAGGAGE CLAIM;So;0;ON;;;;;N;;;;; 1F6C5;LEFT LUGGAGE;So;0;ON;;;;;N;;;;; +1F6C6;TRIANGLE WITH ROUNDED CORNERS;So;0;ON;;;;;N;;;;; +1F6C7;PROHIBITED SIGN;So;0;ON;;;;;N;;;;; +1F6C8;CIRCLED INFORMATION SOURCE;So;0;ON;;;;;N;;;;; +1F6C9;BOYS SYMBOL;So;0;ON;;;;;N;;;;; +1F6CA;GIRLS SYMBOL;So;0;ON;;;;;N;;;;; +1F6CB;COUCH AND LAMP;So;0;ON;;;;;N;;;;; +1F6CC;SLEEPING ACCOMMODATION;So;0;ON;;;;;N;;;;; +1F6CD;SHOPPING BAGS;So;0;ON;;;;;N;;;;; +1F6CE;BELLHOP BELL;So;0;ON;;;;;N;;;;; +1F6CF;BED;So;0;ON;;;;;N;;;;; +1F6E0;HAMMER AND WRENCH;So;0;ON;;;;;N;;;;; +1F6E1;SHIELD;So;0;ON;;;;;N;;;;; +1F6E2;OIL DRUM;So;0;ON;;;;;N;;;;; +1F6E3;MOTORWAY;So;0;ON;;;;;N;;;;; +1F6E4;RAILWAY TRACK;So;0;ON;;;;;N;;;;; +1F6E5;MOTOR BOAT;So;0;ON;;;;;N;;;;; +1F6E6;UP-POINTING MILITARY AIRPLANE;So;0;ON;;;;;N;;;;; +1F6E7;UP-POINTING AIRPLANE;So;0;ON;;;;;N;;;;; +1F6E8;UP-POINTING SMALL AIRPLANE;So;0;ON;;;;;N;;;;; +1F6E9;SMALL AIRPLANE;So;0;ON;;;;;N;;;;; +1F6EA;NORTHEAST-POINTING 
AIRPLANE;So;0;ON;;;;;N;;;;; +1F6EB;AIRPLANE DEPARTURE;So;0;ON;;;;;N;;;;; +1F6EC;AIRPLANE ARRIVING;So;0;ON;;;;;N;;;;; +1F6F0;SATELLITE;So;0;ON;;;;;N;;;;; +1F6F1;ONCOMING FIRE ENGINE;So;0;ON;;;;;N;;;;; +1F6F2;DIESEL LOCOMOTIVE;So;0;ON;;;;;N;;;;; +1F6F3;PASSENGER SHIP;So;0;ON;;;;;N;;;;; 1F700;ALCHEMICAL SYMBOL FOR QUINTESSENCE;So;0;ON;;;;;N;;;;; 1F701;ALCHEMICAL SYMBOL FOR AIR;So;0;ON;;;;;N;;;;; 1F702;ALCHEMICAL SYMBOL FOR FIRE;So;0;ON;;;;;N;;;;; @@ -23538,6 +26144,239 @@ 1F771;ALCHEMICAL SYMBOL FOR MONTH;So;0;ON;;;;;N;;;;; 1F772;ALCHEMICAL SYMBOL FOR HALF DRAM;So;0;ON;;;;;N;;;;; 1F773;ALCHEMICAL SYMBOL FOR HALF OUNCE;So;0;ON;;;;;N;;;;; +1F780;BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;; +1F781;BLACK UP-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;; +1F782;BLACK RIGHT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;; +1F783;BLACK DOWN-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;; +1F784;BLACK SLIGHTLY SMALL CIRCLE;So;0;ON;;;;;N;;;;; +1F785;MEDIUM BOLD WHITE CIRCLE;So;0;ON;;;;;N;;;;; +1F786;BOLD WHITE CIRCLE;So;0;ON;;;;;N;;;;; +1F787;HEAVY WHITE CIRCLE;So;0;ON;;;;;N;;;;; +1F788;VERY HEAVY WHITE CIRCLE;So;0;ON;;;;;N;;;;; +1F789;EXTREMELY HEAVY WHITE CIRCLE;So;0;ON;;;;;N;;;;; +1F78A;WHITE CIRCLE CONTAINING BLACK SMALL CIRCLE;So;0;ON;;;;;N;;;;; +1F78B;ROUND TARGET;So;0;ON;;;;;N;;;;; +1F78C;BLACK TINY SQUARE;So;0;ON;;;;;N;;;;; +1F78D;BLACK SLIGHTLY SMALL SQUARE;So;0;ON;;;;;N;;;;; +1F78E;LIGHT WHITE SQUARE;So;0;ON;;;;;N;;;;; +1F78F;MEDIUM WHITE SQUARE;So;0;ON;;;;;N;;;;; +1F790;BOLD WHITE SQUARE;So;0;ON;;;;;N;;;;; +1F791;HEAVY WHITE SQUARE;So;0;ON;;;;;N;;;;; +1F792;VERY HEAVY WHITE SQUARE;So;0;ON;;;;;N;;;;; +1F793;EXTREMELY HEAVY WHITE SQUARE;So;0;ON;;;;;N;;;;; +1F794;WHITE SQUARE CONTAINING BLACK VERY SMALL SQUARE;So;0;ON;;;;;N;;;;; +1F795;WHITE SQUARE CONTAINING BLACK MEDIUM SQUARE;So;0;ON;;;;;N;;;;; +1F796;SQUARE TARGET;So;0;ON;;;;;N;;;;; +1F797;BLACK TINY DIAMOND;So;0;ON;;;;;N;;;;; +1F798;BLACK VERY SMALL DIAMOND;So;0;ON;;;;;N;;;;; 
+1F799;BLACK MEDIUM SMALL DIAMOND;So;0;ON;;;;;N;;;;; +1F79A;WHITE DIAMOND CONTAINING BLACK VERY SMALL DIAMOND;So;0;ON;;;;;N;;;;; +1F79B;WHITE DIAMOND CONTAINING BLACK MEDIUM DIAMOND;So;0;ON;;;;;N;;;;; +1F79C;DIAMOND TARGET;So;0;ON;;;;;N;;;;; +1F79D;BLACK TINY LOZENGE;So;0;ON;;;;;N;;;;; +1F79E;BLACK VERY SMALL LOZENGE;So;0;ON;;;;;N;;;;; +1F79F;BLACK MEDIUM SMALL LOZENGE;So;0;ON;;;;;N;;;;; +1F7A0;WHITE LOZENGE CONTAINING BLACK SMALL LOZENGE;So;0;ON;;;;;N;;;;; +1F7A1;THIN GREEK CROSS;So;0;ON;;;;;N;;;;; +1F7A2;LIGHT GREEK CROSS;So;0;ON;;;;;N;;;;; +1F7A3;MEDIUM GREEK CROSS;So;0;ON;;;;;N;;;;; +1F7A4;BOLD GREEK CROSS;So;0;ON;;;;;N;;;;; +1F7A5;VERY BOLD GREEK CROSS;So;0;ON;;;;;N;;;;; +1F7A6;VERY HEAVY GREEK CROSS;So;0;ON;;;;;N;;;;; +1F7A7;EXTREMELY HEAVY GREEK CROSS;So;0;ON;;;;;N;;;;; +1F7A8;THIN SALTIRE;So;0;ON;;;;;N;;;;; +1F7A9;LIGHT SALTIRE;So;0;ON;;;;;N;;;;; +1F7AA;MEDIUM SALTIRE;So;0;ON;;;;;N;;;;; +1F7AB;BOLD SALTIRE;So;0;ON;;;;;N;;;;; +1F7AC;HEAVY SALTIRE;So;0;ON;;;;;N;;;;; +1F7AD;VERY HEAVY SALTIRE;So;0;ON;;;;;N;;;;; +1F7AE;EXTREMELY HEAVY SALTIRE;So;0;ON;;;;;N;;;;; +1F7AF;LIGHT FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B0;MEDIUM FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B1;BOLD FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B2;HEAVY FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B3;VERY HEAVY FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B4;EXTREMELY HEAVY FIVE SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B5;LIGHT SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B6;MEDIUM SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B7;BOLD SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B8;HEAVY SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7B9;VERY HEAVY SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7BA;EXTREMELY HEAVY SIX SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7BB;LIGHT EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7BC;MEDIUM EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7BD;BOLD EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7BE;HEAVY EIGHT SPOKED ASTERISK;So;0;ON;;;;;N;;;;; +1F7BF;VERY HEAVY EIGHT SPOKED 
ASTERISK;So;0;ON;;;;;N;;;;; +1F7C0;LIGHT THREE POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7C1;MEDIUM THREE POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7C2;THREE POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7C3;MEDIUM THREE POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;; +1F7C4;LIGHT FOUR POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7C5;MEDIUM FOUR POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7C6;FOUR POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7C7;MEDIUM FOUR POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;; +1F7C8;REVERSE LIGHT FOUR POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;; +1F7C9;LIGHT FIVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7CA;HEAVY FIVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7CB;MEDIUM SIX POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7CC;HEAVY SIX POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7CD;SIX POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;; +1F7CE;MEDIUM EIGHT POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7CF;HEAVY EIGHT POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7D0;VERY HEAVY EIGHT POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7D1;HEAVY EIGHT POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;; +1F7D2;LIGHT TWELVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7D3;HEAVY TWELVE POINTED BLACK STAR;So;0;ON;;;;;N;;;;; +1F7D4;HEAVY TWELVE POINTED PINWHEEL STAR;So;0;ON;;;;;N;;;;; +1F800;LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F801;UPWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F802;RIGHTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F803;DOWNWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F804;LEFTWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F805;UPWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F806;RIGHTWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F807;DOWNWARDS ARROW WITH MEDIUM TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F808;LEFTWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F809;UPWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F80A;RIGHTWARDS ARROW WITH LARGE TRIANGLE 
ARROWHEAD;So;0;ON;;;;;N;;;;; +1F80B;DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F810;LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F811;UPWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F812;RIGHTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F813;DOWNWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F814;LEFTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F815;UPWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F816;RIGHTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F817;DOWNWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F818;HEAVY LEFTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F819;HEAVY UPWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F81A;HEAVY RIGHTWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F81B;HEAVY DOWNWARDS ARROW WITH EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F81C;HEAVY LEFTWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F81D;HEAVY UPWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F81E;HEAVY RIGHTWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F81F;HEAVY DOWNWARDS ARROW WITH LARGE EQUILATERAL ARROWHEAD;So;0;ON;;;;;N;;;;; +1F820;LEFTWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;; +1F821;UPWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;; +1F822;RIGHTWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;; +1F823;DOWNWARDS TRIANGLE-HEADED ARROW WITH NARROW SHAFT;So;0;ON;;;;;N;;;;; +1F824;LEFTWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;; +1F825;UPWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;; +1F826;RIGHTWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;; +1F827;DOWNWARDS TRIANGLE-HEADED ARROW WITH MEDIUM SHAFT;So;0;ON;;;;;N;;;;; +1F828;LEFTWARDS TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;; +1F829;UPWARDS 
TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;; +1F82A;RIGHTWARDS TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;; +1F82B;DOWNWARDS TRIANGLE-HEADED ARROW WITH BOLD SHAFT;So;0;ON;;;;;N;;;;; +1F82C;LEFTWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F82D;UPWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F82E;RIGHTWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F82F;DOWNWARDS TRIANGLE-HEADED ARROW WITH HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F830;LEFTWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F831;UPWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F832;RIGHTWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F833;DOWNWARDS TRIANGLE-HEADED ARROW WITH VERY HEAVY SHAFT;So;0;ON;;;;;N;;;;; +1F834;LEFTWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;; +1F835;UPWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;; +1F836;RIGHTWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;; +1F837;DOWNWARDS FINGER-POST ARROW;So;0;ON;;;;;N;;;;; +1F838;LEFTWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;; +1F839;UPWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;; +1F83A;RIGHTWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;; +1F83B;DOWNWARDS SQUARED ARROW;So;0;ON;;;;;N;;;;; +1F83C;LEFTWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F83D;UPWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F83E;RIGHTWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F83F;DOWNWARDS COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F840;LEFTWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F841;UPWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F842;RIGHTWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F843;DOWNWARDS HEAVY COMPRESSED ARROW;So;0;ON;;;;;N;;;;; +1F844;LEFTWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;; +1F845;UPWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;; +1F846;RIGHTWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;; +1F847;DOWNWARDS HEAVY ARROW;So;0;ON;;;;;N;;;;; +1F850;LEFTWARDS SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F851;UPWARDS SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F852;RIGHTWARDS 
SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F853;DOWNWARDS SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F854;NORTH WEST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F855;NORTH EAST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F856;SOUTH EAST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F857;SOUTH WEST SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F858;LEFT RIGHT SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F859;UP DOWN SANS-SERIF ARROW;So;0;ON;;;;;N;;;;; +1F860;WIDE-HEADED LEFTWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F861;WIDE-HEADED UPWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F862;WIDE-HEADED RIGHTWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F863;WIDE-HEADED DOWNWARDS LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F864;WIDE-HEADED NORTH WEST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F865;WIDE-HEADED NORTH EAST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F866;WIDE-HEADED SOUTH EAST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F867;WIDE-HEADED SOUTH WEST LIGHT BARB ARROW;So;0;ON;;;;;N;;;;; +1F868;WIDE-HEADED LEFTWARDS BARB ARROW;So;0;ON;;;;;N;;;;; +1F869;WIDE-HEADED UPWARDS BARB ARROW;So;0;ON;;;;;N;;;;; +1F86A;WIDE-HEADED RIGHTWARDS BARB ARROW;So;0;ON;;;;;N;;;;; +1F86B;WIDE-HEADED DOWNWARDS BARB ARROW;So;0;ON;;;;;N;;;;; +1F86C;WIDE-HEADED NORTH WEST BARB ARROW;So;0;ON;;;;;N;;;;; +1F86D;WIDE-HEADED NORTH EAST BARB ARROW;So;0;ON;;;;;N;;;;; +1F86E;WIDE-HEADED SOUTH EAST BARB ARROW;So;0;ON;;;;;N;;;;; +1F86F;WIDE-HEADED SOUTH WEST BARB ARROW;So;0;ON;;;;;N;;;;; +1F870;WIDE-HEADED LEFTWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F871;WIDE-HEADED UPWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F872;WIDE-HEADED RIGHTWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F873;WIDE-HEADED DOWNWARDS MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F874;WIDE-HEADED NORTH WEST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F875;WIDE-HEADED NORTH EAST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F876;WIDE-HEADED SOUTH EAST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F877;WIDE-HEADED SOUTH WEST MEDIUM BARB ARROW;So;0;ON;;;;;N;;;;; +1F878;WIDE-HEADED LEFTWARDS HEAVY BARB 
ARROW;So;0;ON;;;;;N;;;;; +1F879;WIDE-HEADED UPWARDS HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F87A;WIDE-HEADED RIGHTWARDS HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F87B;WIDE-HEADED DOWNWARDS HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F87C;WIDE-HEADED NORTH WEST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F87D;WIDE-HEADED NORTH EAST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F87E;WIDE-HEADED SOUTH EAST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F87F;WIDE-HEADED SOUTH WEST HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F880;WIDE-HEADED LEFTWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F881;WIDE-HEADED UPWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F882;WIDE-HEADED RIGHTWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F883;WIDE-HEADED DOWNWARDS VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F884;WIDE-HEADED NORTH WEST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F885;WIDE-HEADED NORTH EAST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F886;WIDE-HEADED SOUTH EAST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F887;WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW;So;0;ON;;;;;N;;;;; +1F890;LEFTWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F891;UPWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F892;RIGHTWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F893;DOWNWARDS TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F894;LEFTWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F895;UPWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F896;RIGHTWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F897;DOWNWARDS WHITE ARROW WITHIN TRIANGLE ARROWHEAD;So;0;ON;;;;;N;;;;; +1F898;LEFTWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;; +1F899;UPWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;; +1F89A;RIGHTWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;; +1F89B;DOWNWARDS ARROW WITH NOTCHED TAIL;So;0;ON;;;;;N;;;;; +1F89C;HEAVY ARROW SHAFT WIDTH ONE;So;0;ON;;;;;N;;;;; +1F89D;HEAVY ARROW SHAFT WIDTH TWO THIRDS;So;0;ON;;;;;N;;;;; +1F89E;HEAVY ARROW SHAFT WIDTH ONE HALF;So;0;ON;;;;;N;;;;; +1F89F;HEAVY ARROW SHAFT 
WIDTH ONE THIRD;So;0;ON;;;;;N;;;;; +1F8A0;LEFTWARDS BOTTOM-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A1;RIGHTWARDS BOTTOM SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A2;LEFTWARDS TOP SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A3;RIGHTWARDS TOP SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A4;LEFTWARDS LEFT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A5;RIGHTWARDS RIGHT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A6;LEFTWARDS RIGHT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A7;RIGHTWARDS LEFT-SHADED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A8;LEFTWARDS BACK-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8A9;RIGHTWARDS BACK-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8AA;LEFTWARDS FRONT-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8AB;RIGHTWARDS FRONT-TILTED SHADOWED WHITE ARROW;So;0;ON;;;;;N;;;;; +1F8AC;WHITE ARROW SHAFT WIDTH ONE;So;0;ON;;;;;N;;;;; +1F8AD;WHITE ARROW SHAFT WIDTH TWO THIRDS;So;0;ON;;;;;N;;;;; 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;; 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;; 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;; --- old/jdk/make/data/unicodedata/VERSION 2015-07-13 16:11:37.000000000 +0900 +++ new/jdk/make/data/unicodedata/VERSION 2015-07-13 16:11:37.000000000 +0900 @@ -1 +1 @@ -6.2.0 +7.0.0 --- old/jdk/make/src/classes/build/tools/generatecharacter/GenerateCharacter.java 2015-07-13 16:11:38.000000000 +0900 +++ new/jdk/make/src/classes/build/tools/generatecharacter/GenerateCharacter.java 2015-07-13 16:11:38.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -906,6 +906,14 @@ return Integer.toString(UnicodeSpec.DIRECTIONALITY_WHITESPACE); if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS][UnicodeSpec.LONG])) return Integer.toString(UnicodeSpec.DIRECTIONALITY_OTHER_NEUTRALS); + if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE][UnicodeSpec.LONG])) + return Integer.toString(UnicodeSpec.DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE); + if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE][UnicodeSpec.LONG])) + return Integer.toString(UnicodeSpec.DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE); + if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE][UnicodeSpec.LONG])) + return Integer.toString(UnicodeSpec.DIRECTIONALITY_FIRST_STRONG_ISOLATE); + if (x.equals(UnicodeSpec.bidiCategoryList[UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE][UnicodeSpec.LONG])) + return Integer.toString(UnicodeSpec.DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE); FAIL("Unknown text substitution marker " + commandMarker + x); return commandMarker + x; } --- old/jdk/make/src/classes/build/tools/generatecharacter/UnicodeSpec.java 2015-07-13 16:11:39.000000000 +0900 +++ new/jdk/make/src/classes/build/tools/generatecharacter/UnicodeSpec.java 2015-07-13 16:11:39.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -121,7 +121,7 @@ String[] tokens = null; try { - tokens = tokenSeparator.split(s, REQUIRED_FIELDS); + tokens = tokenSeparator.split(s, REQUIRED_FIELDS); spec = new UnicodeSpec(); spec.setCodePoint(parseCodePoint(tokens[FIELD_VALUE])); spec.setName(parseName(tokens[FIELD_NAME])); @@ -672,7 +672,8 @@ * Bidirectional categories */ public static final byte - DIRECTIONALITY_UNDEFINED = -1, + DIRECTIONALITY_UNDEFINED = -1, + // Strong category DIRECTIONALITY_LEFT_TO_RIGHT = 0, // L DIRECTIONALITY_RIGHT_TO_LEFT = 1, // R @@ -689,15 +690,19 @@ DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10, // B DIRECTIONALITY_SEGMENT_SEPARATOR = 11, // S DIRECTIONALITY_WHITESPACE = 12, // WS - DIRECTIONALITY_OTHER_NEUTRALS = 13, // ON - + DIRECTIONALITY_OTHER_NEUTRALS = 13, // ON + // Explicit Formatting category DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14, // LRE DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15, // LRO DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16, // RLE DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17, // RLO DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18, // PDF + DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 19, // LRI + DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 20, // RLI + DIRECTIONALITY_FIRST_STRONG_ISOLATE = 21, // FSI + DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 22, // PDI - DIRECTIONALITY_CATEGORY_COUNT = 19; // sentinel value + DIRECTIONALITY_CATEGORY_COUNT = 23; // sentinel value // If changes are made to the above bidi category assignments, this // list of bidi category names must be changed to keep their order in synch. 
@@ -722,7 +727,10 @@ {"RLE", "DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING"}, {"RLO", "DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE"}, {"PDF", "DIRECTIONALITY_POP_DIRECTIONAL_FORMAT"}, - + {"LRI", "DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE"}, + {"RLI", "DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE"}, + {"FSI", "DIRECTIONALITY_FIRST_STRONG_ISOLATE"}, + {"PDI", "DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE"}, }; // Unicode specification lines have fields in this order. --- old/jdk/src/java.base/share/classes/java/lang/Character.java 2015-07-13 16:11:40.000000000 +0900 +++ new/jdk/src/java.base/share/classes/java/lang/Character.java 2015-07-13 16:11:39.000000000 +0900 @@ -40,7 +40,7 @@ * a character's category (lowercase letter, digit, etc.) and for converting * characters from uppercase to lowercase and vice versa. * <p> - * Character information is based on the Unicode Standard, version 6.2.0. + * Character information is based on the Unicode Standard, version 7.0.0. * <p> * The methods and data of class {@code Character} are defined by * the information in the <i>UnicodeData</i> file that is part of the @@ -490,6 +490,30 @@ public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; /** + * Weak bidirectional character type "LRI" in the Unicode specification. + * @since 1.9 + */ + public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 19; + + /** + * Weak bidirectional character type "RLI" in the Unicode specification. + * @since 1.9 + */ + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 20; + + /** + * Weak bidirectional character type "FSI" in the Unicode specification. + * @since 1.9 + */ + public static final byte DIRECTIONALITY_FIRST_STRONG_ISOLATE = 21; + + /** + * Weak bidirectional character type "PDI" in the Unicode specification. 
+ * @since 1.9 + */ + public static final byte DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 22; + + /** * The minimum value of a * <a href="http://www.unicode.org/glossary/#high_surrogate_code_unit"> * Unicode high-surrogate code unit</a> @@ -2561,6 +2585,269 @@ "ARABIC MATHEMATICAL ALPHABETIC SYMBOLS", "ARABICMATHEMATICALALPHABETICSYMBOLS"); + /** + * Constant for the "Combining Diacritical Marks Extended" Unicode + * character block. + * @since 1.9 + */ + public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS_EXTENDED = + new UnicodeBlock("COMBINING_DIACRITICAL_MARKS_EXTENDED", + "COMBINING DIACRITICAL MARKS EXTENDED", + "COMBININGDIACRITICALMARKSEXTENDED"); + + /** + * Constant for the "Myanmar Extended-B" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock MYANMAR_EXTENDED_B = + new UnicodeBlock("MYANMAR_EXTENDED_B", + "MYANMAR EXTENDED-B", + "MYANMAREXTENDED-B"); + + /** + * Constant for the "Latin Extended-E" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock LATIN_EXTENDED_E = + new UnicodeBlock("LATIN_EXTENDED_E", + "LATIN EXTENDED-E", + "LATINEXTENDED-E"); + + /** + * Constant for the "Coptic Epact Numbers" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock COPTIC_EPACT_NUMBERS = + new UnicodeBlock("COPTIC_EPACT_NUMBERS", + "COPTIC EPACT NUMBERS", + "COPTICEPACTNUMBERS"); + + /** + * Constant for the "Old Permic" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock OLD_PERMIC = + new UnicodeBlock("OLD_PERMIC", + "OLD PERMIC", + "OLDPERMIC"); + + /** + * Constant for the "Elbasan" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock ELBASAN = + new UnicodeBlock("ELBASAN"); + + /** + * Constant for the "Caucasian Albanian" Unicode character block. 
+ * @since 1.9 + */ + public static final UnicodeBlock CAUCASIAN_ALBANIAN = + new UnicodeBlock("CAUCASIAN_ALBANIAN", + "CAUCASIAN ALBANIAN", + "CAUCASIANALBANIAN"); + + /** + * Constant for the "Linear A" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock LINEAR_A = + new UnicodeBlock("LINEAR_A", + "LINEAR A", + "LINEARA"); + + /** + * Constant for the "Palmyrene" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock PALMYRENE = + new UnicodeBlock("PALMYRENE"); + + /** + * Constant for the "Nabataean" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock NABATAEAN = + new UnicodeBlock("NABATAEAN"); + + /** + * Constant for the "Old North Arabian" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock OLD_NORTH_ARABIAN = + new UnicodeBlock("OLD_NORTH_ARABIAN", + "OLD NORTH ARABIAN", + "OLDNORTHARABIAN"); + + /** + * Constant for the "Manichaean" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock MANICHAEAN = + new UnicodeBlock("MANICHAEAN"); + + /** + * Constant for the "Psalter Pahlavi" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock PSALTER_PAHLAVI = + new UnicodeBlock("PSALTER_PAHLAVI", + "PSALTER PAHLAVI", + "PSALTERPAHLAVI"); + + /** + * Constant for the "Mahajani" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock MAHAJANI = + new UnicodeBlock("MAHAJANI"); + + /** + * Constant for the "Sinhala Archaic Numbers" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock SINHALA_ARCHAIC_NUMBERS = + new UnicodeBlock("SINHALA_ARCHAIC_NUMBERS", + "SINHALA ARCHAIC NUMBERS", + "SINHALAARCHAICNUMBERS"); + + /** + * Constant for the "Khojki" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock KHOJKI = + new UnicodeBlock("KHOJKI"); + + /** + * Constant for the "Khudawadi" Unicode character block. 
+ * @since 1.9 + */ + public static final UnicodeBlock KHUDAWADI = + new UnicodeBlock("KHUDAWADI"); + + /** + * Constant for the "Grantha" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock GRANTHA = + new UnicodeBlock("GRANTHA"); + + /** + * Constant for the "Tirhuta" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock TIRHUTA = + new UnicodeBlock("TIRHUTA"); + + /** + * Constant for the "Siddham" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock SIDDHAM = + new UnicodeBlock("SIDDHAM"); + + /** + * Constant for the "Modi" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock MODI = + new UnicodeBlock("MODI"); + + /** + * Constant for the "Warang Citi" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock WARANG_CITI = + new UnicodeBlock("WARANG_CITI", + "WARANG CITI", + "WARANGCITI"); + + /** + * Constant for the "Pau Cin Hau" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock PAU_CIN_HAU = + new UnicodeBlock("PAU_CIN_HAU", + "PAU CIN HAU", + "PAUCINHAU"); + + /** + * Constant for the "Mro" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock MRO = + new UnicodeBlock("MRO"); + + /** + * Constant for the "Bassa Vah" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock BASSA_VAH = + new UnicodeBlock("BASSA_VAH", + "BASSA VAH", + "BASSAVAH"); + + /** + * Constant for the "Pahawh Hmong" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock PAHAWH_HMONG = + new UnicodeBlock("PAHAWH_HMONG", + "PAHAWH HMONG", + "PAHAWHHMONG"); + + /** + * Constant for the "Duployan" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock DUPLOYAN = + new UnicodeBlock("DUPLOYAN"); + + /** + * Constant for the "Shorthand Format Controls" Unicode character block. 
+ * @since 1.9 + */ + public static final UnicodeBlock SHORTHAND_FORMAT_CONTROLS = + new UnicodeBlock("SHORTHAND_FORMAT_CONTROLS", + "SHORTHAND FORMAT CONTROLS", + "SHORTHANDFORMATCONTROLS"); + + /** + * Constant for the "Mende Kikakui" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock MENDE_KIKAKUI = + new UnicodeBlock("MENDE_KIKAKUI", + "MENDE KIKAKUI", + "MENDEKIKAKUI"); + + /** + * Constant for the "Ornamental Dingbats" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock ORNAMENTAL_DINGBATS = + new UnicodeBlock("ORNAMENTAL_DINGBATS", + "ORNAMENTAL DINGBATS", + "ORNAMENTALDINGBATS"); + + /** + * Constant for the "Geometric Shapes Extended" Unicode character block. + * @since 1.9 + */ + public static final UnicodeBlock GEOMETRIC_SHAPES_EXTENDED = + new UnicodeBlock("GEOMETRIC_SHAPES_EXTENDED", + "GEOMETRIC SHAPES EXTENDED", + "GEOMETRICSHAPESEXTENDED"); + + /** + * Constant for the "Supplemental Arrows-C" Unicode character block. 
+ * @since 1.9 + */ + public static final UnicodeBlock SUPPLEMENTAL_ARROWS_C = + new UnicodeBlock("SUPPLEMENTAL_ARROWS_C", + "SUPPLEMENTAL ARROWS-C", + "SUPPLEMENTALARROWS-C"); + private static final int blockStarts[] = { 0x0000, // 0000..007F; Basic Latin 0x0080, // 0080..00FF; Latin-1 Supplement @@ -2618,7 +2905,7 @@ 0x19E0, // 19E0..19FF; Khmer Symbols 0x1A00, // 1A00..1A1F; Buginese 0x1A20, // 1A20..1AAF; Tai Tham - 0x1AB0, // unassigned + 0x1AB0, // 1AB0..1AFF; Combining Diacritical Marks Extended 0x1B00, // 1B00..1B7F; Balinese 0x1B80, // 1B80..1BBF; Sundanese 0x1BC0, // 1BC0..1BFF; Batak @@ -2699,13 +2986,14 @@ 0xA930, // A930..A95F; Rejang 0xA960, // A960..A97F; Hangul Jamo Extended-A 0xA980, // A980..A9DF; Javanese - 0xA9E0, // unassigned + 0xA9E0, // A9E0..A9FF; Myanmar Extended-B 0xAA00, // AA00..AA5F; Cham 0xAA60, // AA60..AA7F; Myanmar Extended-A 0xAA80, // AA80..AADF; Tai Viet 0xAAE0, // AAE0..AAFF; Meetei Mayek Extensions 0xAB00, // AB00..AB2F; Ethiopic Extended-A - 0xAB30, // unassigned + 0xAB30, // AB30..AB6F; Latin Extended-E + 0xAB70, // unassigned 0xABC0, // ABC0..ABFF; Meetei Mayek 0xAC00, // AC00..D7AF; Hangul Syllables 0xD7B0, // D7B0..D7FF; Hangul Jamo Extended-B @@ -2733,10 +3021,10 @@ 0x10200, // unassigned 0x10280, // 10280..1029F; Lycian 0x102A0, // 102A0..102DF; Carian - 0x102E0, // unassigned + 0x102E0, // 102E0..102FF; Coptic Epact Numbers 0x10300, // 10300..1032F; Old Italic 0x10330, // 10330..1034F; Gothic - 0x10350, // unassigned + 0x10350, // 10350..1037F; Old Permic 0x10380, // 10380..1039F; Ugaritic 0x103A0, // 103A0..103DF; Old Persian 0x103E0, // unassigned @@ -2744,9 +3032,16 @@ 0x10450, // 10450..1047F; Shavian 0x10480, // 10480..104AF; Osmanya 0x104B0, // unassigned + 0x10500, // 10500..1052F; Elbasan + 0x10530, // 10530..1056F; Caucasian Albanian + 0x10570, // unassigned + 0x10600, // 10600..1077F; Linear A + 0x10780, // unassigned 0x10800, // 10800..1083F; Cypriot Syllabary 0x10840, // 10840..1085F; Imperial Aramaic - 
0x10860, // unassigned + 0x10860, // 10860..1087F; Palmyrene + 0x10880, // 10880..108AF; Nabataean + 0x108B0, // unassigned 0x10900, // 10900..1091F; Phoenician 0x10920, // 10920..1093F; Lydian 0x10940, // unassigned @@ -2754,11 +3049,14 @@ 0x109A0, // 109A0..109FF; Meroitic Cursive 0x10A00, // 10A00..10A5F; Kharoshthi 0x10A60, // 10A60..10A7F; Old South Arabian - 0x10A80, // unassigned + 0x10A80, // 10A80..10A9F; Old North Arabian + 0x10AA0, // unassigned + 0x10AC0, // 10AC0..10AFF; Manichaean 0x10B00, // 10B00..10B3F; Avestan 0x10B40, // 10B40..10B5F; Inscriptional Parthian 0x10B60, // 10B60..10B7F; Inscriptional Pahlavi - 0x10B80, // unassigned + 0x10B80, // 10B80..10BAF; Psalter Pahlavi + 0x10BB0, // unassigned 0x10C00, // 10C00..10C4F; Old Turkic 0x10C50, // unassigned 0x10E60, // 10E60..10E7F; Rumi Numeral Symbols @@ -2767,22 +3065,43 @@ 0x11080, // 11080..110CF; Kaithi 0x110D0, // 110D0..110FF; Sora Sompeng 0x11100, // 11100..1114F; Chakma - 0x11150, // unassigned + 0x11150, // 11150..1117F; Mahajani 0x11180, // 11180..111DF; Sharada - 0x111E0, // unassigned + 0x111E0, // 111E0..111FF; Sinhala Archaic Numbers + 0x11200, // 11200..1124F; Khojki + 0x11250, // unassigned + 0x112B0, // 112B0..112FF; Khudawadi + 0x11300, // 11300..1137F; Grantha + 0x11380, // unassigned + 0x11480, // 11480..114DF; Tirhuta + 0x114E0, // unassigned + 0x11580, // 11580..115FF; Siddham + 0x11600, // 11600..1165F; Modi + 0x11660, // unassigned 0x11680, // 11680..116CF; Takri 0x116D0, // unassigned + 0x118A0, // 118A0..118FF; Warang Citi + 0x11900, // unassigned + 0x11AC0, // 11AC0..11AFF; Pau Cin Hau + 0x11B00, // unassigned 0x12000, // 12000..123FF; Cuneiform 0x12400, // 12400..1247F; Cuneiform Numbers and Punctuation 0x12480, // unassigned 0x13000, // 13000..1342F; Egyptian Hieroglyphs 0x13430, // unassigned 0x16800, // 16800..16A3F; Bamum Supplement - 0x16A40, // unassigned + 0x16A40, // 16A40..16A6F; Mro + 0x16A70, // unassigned + 0x16AD0, // 16AD0..16AFF; Bassa Vah + 0x16B00, // 
16B00..16B8F; Pahawh Hmong + 0x16B90, // unassigned 0x16F00, // 16F00..16F9F; Miao 0x16FA0, // unassigned 0x1B000, // 1B000..1B0FF; Kana Supplement 0x1B100, // unassigned + 0x1BC00, // 1BC00..1BC9F; Duployan + 0x1BCA0, // 1BCA0..1BCAF; Shorthand Format Controls + 0x1BCB0, // unassigned 0x1D000, // 1D000..1D0FF; Byzantine Musical Symbols 0x1D100, // 1D100..1D1FF; Musical Symbols 0x1D200, // 1D200..1D24F; Ancient Greek Musical Notation @@ -2792,6 +3111,8 @@ 0x1D380, // unassigned 0x1D400, // 1D400..1D7FF; Mathematical Alphanumeric Symbols 0x1D800, // unassigned + 0x1E800, // 1E800..1E8DF; Mende Kikakui + 0x1E8E0, // unassigned 0x1EE00, // 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols 0x1EF00, // unassigned 0x1F000, // 1F000..1F02F; Mahjong Tiles @@ -2801,10 +3122,12 @@ 0x1F200, // 1F200..1F2FF; Enclosed Ideographic Supplement 0x1F300, // 1F300..1F5FF; Miscellaneous Symbols And Pictographs 0x1F600, // 1F600..1F64F; Emoticons - 0x1F650, // unassigned + 0x1F650, // 1F650..1F67F; Ornamental Dingbats 0x1F680, // 1F680..1F6FF; Transport And Map Symbols 0x1F700, // 1F700..1F77F; Alchemical Symbols - 0x1F780, // unassigned + 0x1F780, // 1F780..1F7FF; Geometric Shapes Extended + 0x1F800, // 1F800..1F8FF; Supplemental Arrows-C + 0x1F900, // unassigned 0x20000, // 20000..2A6DF; CJK Unified Ideographs Extension B 0x2A6E0, // unassigned 0x2A700, // 2A700..2B73F; CJK Unified Ideographs Extension C @@ -2877,7 +3200,7 @@ KHMER_SYMBOLS, BUGINESE, TAI_THAM, - null, + COMBINING_DIACRITICAL_MARKS_EXTENDED, BALINESE, SUNDANESE, BATAK, @@ -2958,12 +3281,13 @@ REJANG, HANGUL_JAMO_EXTENDED_A, JAVANESE, - null, + MYANMAR_EXTENDED_B, CHAM, MYANMAR_EXTENDED_A, TAI_VIET, MEETEI_MAYEK_EXTENSIONS, ETHIOPIC_EXTENDED_A, + LATIN_EXTENDED_E, null, MEETEI_MAYEK, HANGUL_SYLLABLES, @@ -2992,10 +3316,10 @@ null, LYCIAN, CARIAN, - null, + COPTIC_EPACT_NUMBERS, OLD_ITALIC, GOTHIC, - null, + OLD_PERMIC, UGARITIC, OLD_PERSIAN, null, @@ -3003,8 +3327,15 @@ SHAVIAN, OSMANYA, null, + ELBASAN, + 
CAUCASIAN_ALBANIAN, + null, + LINEAR_A, + null, CYPRIOT_SYLLABARY, IMPERIAL_ARAMAIC, + PALMYRENE, + NABATAEAN, null, PHOENICIAN, LYDIAN, @@ -3013,10 +3344,13 @@ MEROITIC_CURSIVE, KHAROSHTHI, OLD_SOUTH_ARABIAN, + OLD_NORTH_ARABIAN, null, + MANICHAEAN, AVESTAN, INSCRIPTIONAL_PARTHIAN, INSCRIPTIONAL_PAHLAVI, + PSALTER_PAHLAVI, null, OLD_TURKIC, null, @@ -3026,22 +3360,43 @@ KAITHI, SORA_SOMPENG, CHAKMA, - null, + MAHAJANI, SHARADA, + SINHALA_ARCHAIC_NUMBERS, + KHOJKI, + null, + KHUDAWADI, + GRANTHA, + null, + TIRHUTA, + null, + SIDDHAM, + MODI, null, TAKRI, null, + WARANG_CITI, + null, + PAU_CIN_HAU, + null, CUNEIFORM, CUNEIFORM_NUMBERS_AND_PUNCTUATION, null, EGYPTIAN_HIEROGLYPHS, null, BAMUM_SUPPLEMENT, + MRO, + null, + BASSA_VAH, + PAHAWH_HMONG, null, MIAO, null, KANA_SUPPLEMENT, null, + DUPLOYAN, + SHORTHAND_FORMAT_CONTROLS, + null, BYZANTINE_MUSICAL_SYMBOLS, MUSICAL_SYMBOLS, ANCIENT_GREEK_MUSICAL_NOTATION, @@ -3051,6 +3406,8 @@ null, MATHEMATICAL_ALPHANUMERIC_SYMBOLS, null, + MENDE_KIKAKUI, + null, ARABIC_MATHEMATICAL_ALPHABETIC_SYMBOLS, null, MAHJONG_TILES, @@ -3060,9 +3417,11 @@ ENCLOSED_IDEOGRAPHIC_SUPPLEMENT, MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS, EMOTICONS, - null, + ORNAMENTAL_DINGBATS, TRANSPORT_AND_MAP_SYMBOLS, ALCHEMICAL_SYMBOLS, + GEOMETRIC_SHAPES_EXTENDED, + SUPPLEMENTAL_ARROWS_C, null, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, null, @@ -3675,40 +4034,185 @@ /** * Unicode script "Meroitic Hieroglyphs". + * @since 1.8 */ MEROITIC_HIEROGLYPHS, /** * Unicode script "Meroitic Cursive". + * @since 1.8 */ MEROITIC_CURSIVE, /** * Unicode script "Sora Sompeng". + * @since 1.8 */ SORA_SOMPENG, /** * Unicode script "Chakma". + * @since 1.8 */ CHAKMA, /** * Unicode script "Sharada". + * @since 1.8 */ SHARADA, /** * Unicode script "Takri". + * @since 1.8 */ TAKRI, /** * Unicode script "Miao". + * @since 1.8 */ MIAO, /** + * Unicode script "Caucasian Albanian". + * @since 1.9 + */ + CAUCASIAN_ALBANIAN, + + /** + * Unicode script "Bassa Vah". 
+ * @since 1.9 + */ + BASSA_VAH, + + /** + * Unicode script "Duployan". + * @since 1.9 + */ + DUPLOYAN, + + /** + * Unicode script "Elbasan". + * @since 1.9 + */ + ELBASAN, + + /** + * Unicode script "Grantha". + * @since 1.9 + */ + GRANTHA, + + /** + * Unicode script "Pahawh Hmong". + * @since 1.9 + */ + PAHAWH_HMONG, + + /** + * Unicode script "Khojki". + * @since 1.9 + */ + KHOJKI, + + /** + * Unicode script "Linear A". + * @since 1.9 + */ + LINEAR_A, + + /** + * Unicode script "Mahajani". + * @since 1.9 + */ + MAHAJANI, + + /** + * Unicode script "Manichaean". + * @since 1.9 + */ + MANICHAEAN, + + /** + * Unicode script "Mende Kikakui". + * @since 1.9 + */ + MENDE_KIKAKUI, + + /** + * Unicode script "Modi". + * @since 1.9 + */ + MODI, + + /** + * Unicode script "Mro". + * @since 1.9 + */ + MRO, + + /** + * Unicode script "Old North Arabian". + * @since 1.9 + */ + OLD_NORTH_ARABIAN, + + /** + * Unicode script "Nabataean". + * @since 1.9 + */ + NABATAEAN, + + /** + * Unicode script "Palmyrene". + * @since 1.9 + */ + PALMYRENE, + + /** + * Unicode script "Pau Cin Hau". + * @since 1.9 + */ + PAU_CIN_HAU, + + /** + * Unicode script "Old Permic". + * @since 1.9 + */ + OLD_PERMIC, + + /** + * Unicode script "Psalter Pahlavi". + * @since 1.9 + */ + PSALTER_PAHLAVI, + + /** + * Unicode script "Siddham". + * @since 1.9 + */ + SIDDHAM, + + /** + * Unicode script "Khudawadi". + * @since 1.9 + */ + KHUDAWADI, + + /** + * Unicode script "Tirhuta". + * @since 1.9 + */ + TIRHUTA, + + /** + * Unicode script "Warang Citi". + * @since 1.9 + */ + WARANG_CITI, + + /** * Unicode script "Unknown". 
*/ UNKNOWN; @@ -3719,14 +4223,14 @@ 0x005B, // 005B..0060; COMMON 0x0061, // 0061..007A; LATIN 0x007B, // 007B..00A9; COMMON - 0x00AA, // 00AA..00AA; LATIN + 0x00AA, // 00AA ; LATIN 0x00AB, // 00AB..00B9; COMMON - 0x00BA, // 00BA..00BA; LATIN + 0x00BA, // 00BA ; LATIN 0x00BB, // 00BB..00BF; COMMON 0x00C0, // 00C0..00D6; LATIN - 0x00D7, // 00D7..00D7; COMMON + 0x00D7, // 00D7 ; COMMON 0x00D8, // 00D8..00F6; LATIN - 0x00F7, // 00F7..00F7; COMMON + 0x00F7, // 00F7 ; COMMON 0x00F8, // 00F8..02B8; LATIN 0x02B9, // 02B9..02DF; COMMON 0x02E0, // 02E0..02E4; LATIN @@ -3735,284 +4239,1178 @@ 0x02EC, // 02EC..02FF; COMMON 0x0300, // 0300..036F; INHERITED 0x0370, // 0370..0373; GREEK - 0x0374, // 0374..0374; COMMON - 0x0375, // 0375..037D; GREEK - 0x037E, // 037E..0383; COMMON - 0x0384, // 0384..0384; GREEK - 0x0385, // 0385..0385; COMMON - 0x0386, // 0386..0386; GREEK - 0x0387, // 0387..0387; COMMON - 0x0388, // 0388..03E1; GREEK + 0x0374, // 0374 ; COMMON + 0x0375, // 0375..0377; GREEK + 0x0378, // 0378..0379; UNKNOWN + 0x037A, // 037A..037D; GREEK + 0x037E, // 037E ; COMMON + 0x037F, // 037F ; GREEK + 0x0380, // 0380..0383; UNKNOWN + 0x0384, // 0384 ; GREEK + 0x0385, // 0385 ; COMMON + 0x0386, // 0386 ; GREEK + 0x0387, // 0387 ; COMMON + 0x0388, // 0388..038A; GREEK + 0x038B, // 038B ; UNKNOWN + 0x038C, // 038C ; GREEK + 0x038D, // 038D ; UNKNOWN + 0x038E, // 038E..03A1; GREEK + 0x03A2, // 03A2 ; UNKNOWN + 0x03A3, // 03A3..03E1; GREEK 0x03E2, // 03E2..03EF; COPTIC 0x03F0, // 03F0..03FF; GREEK 0x0400, // 0400..0484; CYRILLIC 0x0485, // 0485..0486; INHERITED - 0x0487, // 0487..0530; CYRILLIC - 0x0531, // 0531..0588; ARMENIAN - 0x0589, // 0589..0589; COMMON - 0x058A, // 058A..0590; ARMENIAN - 0x0591, // 0591..05FF; HEBREW - 0x0600, // 0600..060B; ARABIC - 0x060C, // 060C..060C; COMMON + 0x0487, // 0487..052F; CYRILLIC + 0x0530, // 0530 ; UNKNOWN + 0x0531, // 0531..0556; ARMENIAN + 0x0557, // 0557..0558; UNKNOWN + 0x0559, // 0559..055F; ARMENIAN + 0x0560, // 0560 ; UNKNOWN + 
0x0561, // 0561..0587; ARMENIAN + 0x0588, // 0588 ; UNKNOWN + 0x0589, // 0589 ; COMMON + 0x058A, // 058A ; ARMENIAN + 0x058B, // 058B..058C; UNKNOWN + 0x058D, // 058D..058F; ARMENIAN + 0x0590, // 0590 ; UNKNOWN + 0x0591, // 0591..05C7; HEBREW + 0x05C8, // 05C8..05CF; UNKNOWN + 0x05D0, // 05D0..05EA; HEBREW + 0x05EB, // 05EB..05EF; UNKNOWN + 0x05F0, // 05F0..05F4; HEBREW + 0x05F5, // 05F5..05FF; UNKNOWN + 0x0600, // 0600..0604; ARABIC + 0x0605, // 0605 ; COMMON + 0x0606, // 0606..060B; ARABIC + 0x060C, // 060C ; COMMON 0x060D, // 060D..061A; ARABIC - 0x061B, // 061B..061D; COMMON - 0x061E, // 061E..061E; ARABIC - 0x061F, // 061F..061F; COMMON + 0x061B, // 061B..061C; COMMON + 0x061D, // 061D ; UNKNOWN + 0x061E, // 061E ; ARABIC + 0x061F, // 061F ; COMMON 0x0620, // 0620..063F; ARABIC - 0x0640, // 0640..0640; COMMON + 0x0640, // 0640 ; COMMON 0x0641, // 0641..064A; ARABIC 0x064B, // 064B..0655; INHERITED 0x0656, // 0656..065F; ARABIC 0x0660, // 0660..0669; COMMON 0x066A, // 066A..066F; ARABIC - 0x0670, // 0670..0670; INHERITED + 0x0670, // 0670 ; INHERITED 0x0671, // 0671..06DC; ARABIC - 0x06DD, // 06DD..06DD; COMMON + 0x06DD, // 06DD ; COMMON 0x06DE, // 06DE..06FF; ARABIC - 0x0700, // 0700..074F; SYRIAC + 0x0700, // 0700..070D; SYRIAC + 0x070E, // 070E ; UNKNOWN + 0x070F, // 070F..074A; SYRIAC + 0x074B, // 074B..074C; UNKNOWN + 0x074D, // 074D..074F; SYRIAC 0x0750, // 0750..077F; ARABIC - 0x0780, // 0780..07BF; THAANA - 0x07C0, // 07C0..07FF; NKO - 0x0800, // 0800..083F; SAMARITAN - 0x0840, // 0840..089F; MANDAIC - 0x08A0, // 08A0..08FF; ARABIC + 0x0780, // 0780..07B1; THAANA + 0x07B2, // 07B2..07BF; UNKNOWN + 0x07C0, // 07C0..07FA; NKO + 0x07FB, // 07FB..07FF; UNKNOWN + 0x0800, // 0800..082D; SAMARITAN + 0x082E, // 082E..082F; UNKNOWN + 0x0830, // 0830..083E; SAMARITAN + 0x083F, // 083F ; UNKNOWN + 0x0840, // 0840..085B; MANDAIC + 0x085C, // 085C..085D; UNKNOWN + 0x085E, // 085E ; MANDAIC + 0x085F, // 085F..089F; UNKNOWN + 0x08A0, // 08A0..08B2; ARABIC + 0x08B3, // 
08B3..08E3; UNKNOWN + 0x08E4, // 08E4..08FF; ARABIC 0x0900, // 0900..0950; DEVANAGARI 0x0951, // 0951..0952; INHERITED 0x0953, // 0953..0963; DEVANAGARI 0x0964, // 0964..0965; COMMON - 0x0966, // 0966..0980; DEVANAGARI - 0x0981, // 0981..0A00; BENGALI - 0x0A01, // 0A01..0A80; GURMUKHI - 0x0A81, // 0A81..0B00; GUJARATI - 0x0B01, // 0B01..0B81; ORIYA - 0x0B82, // 0B82..0C00; TAMIL - 0x0C01, // 0C01..0C81; TELUGU - 0x0C82, // 0C82..0CF0; KANNADA - 0x0D02, // 0D02..0D81; MALAYALAM - 0x0D82, // 0D82..0E00; SINHALA - 0x0E01, // 0E01..0E3E; THAI - 0x0E3F, // 0E3F..0E3F; COMMON - 0x0E40, // 0E40..0E80; THAI - 0x0E81, // 0E81..0EFF; LAO - 0x0F00, // 0F00..0FD4; TIBETAN + 0x0966, // 0966..097F; DEVANAGARI + 0x0980, // 0980..0983; BENGALI + 0x0984, // 0984 ; UNKNOWN + 0x0985, // 0985..098C; BENGALI + 0x098D, // 098D..098E; UNKNOWN + 0x098F, // 098F..0990; BENGALI + 0x0991, // 0991..0992; UNKNOWN + 0x0993, // 0993..09A8; BENGALI + 0x09A9, // 09A9 ; UNKNOWN + 0x09AA, // 09AA..09B0; BENGALI + 0x09B1, // 09B1 ; UNKNOWN + 0x09B2, // 09B2 ; BENGALI + 0x09B3, // 09B3..09B5; UNKNOWN + 0x09B6, // 09B6..09B9; BENGALI + 0x09BA, // 09BA..09BB; UNKNOWN + 0x09BC, // 09BC..09C4; BENGALI + 0x09C5, // 09C5..09C6; UNKNOWN + 0x09C7, // 09C7..09C8; BENGALI + 0x09C9, // 09C9..09CA; UNKNOWN + 0x09CB, // 09CB..09CE; BENGALI + 0x09CF, // 09CF..09D6; UNKNOWN + 0x09D7, // 09D7 ; BENGALI + 0x09D8, // 09D8..09DB; UNKNOWN + 0x09DC, // 09DC..09DD; BENGALI + 0x09DE, // 09DE ; UNKNOWN + 0x09DF, // 09DF..09E3; BENGALI + 0x09E4, // 09E4..09E5; UNKNOWN + 0x09E6, // 09E6..09FB; BENGALI + 0x09FC, // 09FC..0A00; UNKNOWN + 0x0A01, // 0A01..0A03; GURMUKHI + 0x0A04, // 0A04 ; UNKNOWN + 0x0A05, // 0A05..0A0A; GURMUKHI + 0x0A0B, // 0A0B..0A0E; UNKNOWN + 0x0A0F, // 0A0F..0A10; GURMUKHI + 0x0A11, // 0A11..0A12; UNKNOWN + 0x0A13, // 0A13..0A28; GURMUKHI + 0x0A29, // 0A29 ; UNKNOWN + 0x0A2A, // 0A2A..0A30; GURMUKHI + 0x0A31, // 0A31 ; UNKNOWN + 0x0A32, // 0A32..0A33; GURMUKHI + 0x0A34, // 0A34 ; UNKNOWN + 0x0A35, // 
0A35..0A36; GURMUKHI + 0x0A37, // 0A37 ; UNKNOWN + 0x0A38, // 0A38..0A39; GURMUKHI + 0x0A3A, // 0A3A..0A3B; UNKNOWN + 0x0A3C, // 0A3C ; GURMUKHI + 0x0A3D, // 0A3D ; UNKNOWN + 0x0A3E, // 0A3E..0A42; GURMUKHI + 0x0A43, // 0A43..0A46; UNKNOWN + 0x0A47, // 0A47..0A48; GURMUKHI + 0x0A49, // 0A49..0A4A; UNKNOWN + 0x0A4B, // 0A4B..0A4D; GURMUKHI + 0x0A4E, // 0A4E..0A50; UNKNOWN + 0x0A51, // 0A51 ; GURMUKHI + 0x0A52, // 0A52..0A58; UNKNOWN + 0x0A59, // 0A59..0A5C; GURMUKHI + 0x0A5D, // 0A5D ; UNKNOWN + 0x0A5E, // 0A5E ; GURMUKHI + 0x0A5F, // 0A5F..0A65; UNKNOWN + 0x0A66, // 0A66..0A75; GURMUKHI + 0x0A76, // 0A76..0A80; UNKNOWN + 0x0A81, // 0A81..0A83; GUJARATI + 0x0A84, // 0A84 ; UNKNOWN + 0x0A85, // 0A85..0A8D; GUJARATI + 0x0A8E, // 0A8E ; UNKNOWN + 0x0A8F, // 0A8F..0A91; GUJARATI + 0x0A92, // 0A92 ; UNKNOWN + 0x0A93, // 0A93..0AA8; GUJARATI + 0x0AA9, // 0AA9 ; UNKNOWN + 0x0AAA, // 0AAA..0AB0; GUJARATI + 0x0AB1, // 0AB1 ; UNKNOWN + 0x0AB2, // 0AB2..0AB3; GUJARATI + 0x0AB4, // 0AB4 ; UNKNOWN + 0x0AB5, // 0AB5..0AB9; GUJARATI + 0x0ABA, // 0ABA..0ABB; UNKNOWN + 0x0ABC, // 0ABC..0AC5; GUJARATI + 0x0AC6, // 0AC6 ; UNKNOWN + 0x0AC7, // 0AC7..0AC9; GUJARATI + 0x0ACA, // 0ACA ; UNKNOWN + 0x0ACB, // 0ACB..0ACD; GUJARATI + 0x0ACE, // 0ACE..0ACF; UNKNOWN + 0x0AD0, // 0AD0 ; GUJARATI + 0x0AD1, // 0AD1..0ADF; UNKNOWN + 0x0AE0, // 0AE0..0AE3; GUJARATI + 0x0AE4, // 0AE4..0AE5; UNKNOWN + 0x0AE6, // 0AE6..0AF1; GUJARATI + 0x0AF2, // 0AF2..0B00; UNKNOWN + 0x0B01, // 0B01..0B03; ORIYA + 0x0B04, // 0B04 ; UNKNOWN + 0x0B05, // 0B05..0B0C; ORIYA + 0x0B0D, // 0B0D..0B0E; UNKNOWN + 0x0B0F, // 0B0F..0B10; ORIYA + 0x0B11, // 0B11..0B12; UNKNOWN + 0x0B13, // 0B13..0B28; ORIYA + 0x0B29, // 0B29 ; UNKNOWN + 0x0B2A, // 0B2A..0B30; ORIYA + 0x0B31, // 0B31 ; UNKNOWN + 0x0B32, // 0B32..0B33; ORIYA + 0x0B34, // 0B34 ; UNKNOWN + 0x0B35, // 0B35..0B39; ORIYA + 0x0B3A, // 0B3A..0B3B; UNKNOWN + 0x0B3C, // 0B3C..0B44; ORIYA + 0x0B45, // 0B45..0B46; UNKNOWN + 0x0B47, // 0B47..0B48; ORIYA + 0x0B49, // 
0B49..0B4A; UNKNOWN + 0x0B4B, // 0B4B..0B4D; ORIYA + 0x0B4E, // 0B4E..0B55; UNKNOWN + 0x0B56, // 0B56..0B57; ORIYA + 0x0B58, // 0B58..0B5B; UNKNOWN + 0x0B5C, // 0B5C..0B5D; ORIYA + 0x0B5E, // 0B5E ; UNKNOWN + 0x0B5F, // 0B5F..0B63; ORIYA + 0x0B64, // 0B64..0B65; UNKNOWN + 0x0B66, // 0B66..0B77; ORIYA + 0x0B78, // 0B78..0B81; UNKNOWN + 0x0B82, // 0B82..0B83; TAMIL + 0x0B84, // 0B84 ; UNKNOWN + 0x0B85, // 0B85..0B8A; TAMIL + 0x0B8B, // 0B8B..0B8D; UNKNOWN + 0x0B8E, // 0B8E..0B90; TAMIL + 0x0B91, // 0B91 ; UNKNOWN + 0x0B92, // 0B92..0B95; TAMIL + 0x0B96, // 0B96..0B98; UNKNOWN + 0x0B99, // 0B99..0B9A; TAMIL + 0x0B9B, // 0B9B ; UNKNOWN + 0x0B9C, // 0B9C ; TAMIL + 0x0B9D, // 0B9D ; UNKNOWN + 0x0B9E, // 0B9E..0B9F; TAMIL + 0x0BA0, // 0BA0..0BA2; UNKNOWN + 0x0BA3, // 0BA3..0BA4; TAMIL + 0x0BA5, // 0BA5..0BA7; UNKNOWN + 0x0BA8, // 0BA8..0BAA; TAMIL + 0x0BAB, // 0BAB..0BAD; UNKNOWN + 0x0BAE, // 0BAE..0BB9; TAMIL + 0x0BBA, // 0BBA..0BBD; UNKNOWN + 0x0BBE, // 0BBE..0BC2; TAMIL + 0x0BC3, // 0BC3..0BC5; UNKNOWN + 0x0BC6, // 0BC6..0BC8; TAMIL + 0x0BC9, // 0BC9 ; UNKNOWN + 0x0BCA, // 0BCA..0BCD; TAMIL + 0x0BCE, // 0BCE..0BCF; UNKNOWN + 0x0BD0, // 0BD0 ; TAMIL + 0x0BD1, // 0BD1..0BD6; UNKNOWN + 0x0BD7, // 0BD7 ; TAMIL + 0x0BD8, // 0BD8..0BE5; UNKNOWN + 0x0BE6, // 0BE6..0BFA; TAMIL + 0x0BFB, // 0BFB..0BFF; UNKNOWN + 0x0C00, // 0C00..0C03; TELUGU + 0x0C04, // 0C04 ; UNKNOWN + 0x0C05, // 0C05..0C0C; TELUGU + 0x0C0D, // 0C0D ; UNKNOWN + 0x0C0E, // 0C0E..0C10; TELUGU + 0x0C11, // 0C11 ; UNKNOWN + 0x0C12, // 0C12..0C28; TELUGU + 0x0C29, // 0C29 ; UNKNOWN + 0x0C2A, // 0C2A..0C39; TELUGU + 0x0C3A, // 0C3A..0C3C; UNKNOWN + 0x0C3D, // 0C3D..0C44; TELUGU + 0x0C45, // 0C45 ; UNKNOWN + 0x0C46, // 0C46..0C48; TELUGU + 0x0C49, // 0C49 ; UNKNOWN + 0x0C4A, // 0C4A..0C4D; TELUGU + 0x0C4E, // 0C4E..0C54; UNKNOWN + 0x0C55, // 0C55..0C56; TELUGU + 0x0C57, // 0C57 ; UNKNOWN + 0x0C58, // 0C58..0C59; TELUGU + 0x0C5A, // 0C5A..0C5F; UNKNOWN + 0x0C60, // 0C60..0C63; TELUGU + 0x0C64, // 0C64..0C65; UNKNOWN 
+ 0x0C66, // 0C66..0C6F; TELUGU + 0x0C70, // 0C70..0C77; UNKNOWN + 0x0C78, // 0C78..0C7F; TELUGU + 0x0C80, // 0C80 ; UNKNOWN + 0x0C81, // 0C81..0C83; KANNADA + 0x0C84, // 0C84 ; UNKNOWN + 0x0C85, // 0C85..0C8C; KANNADA + 0x0C8D, // 0C8D ; UNKNOWN + 0x0C8E, // 0C8E..0C90; KANNADA + 0x0C91, // 0C91 ; UNKNOWN + 0x0C92, // 0C92..0CA8; KANNADA + 0x0CA9, // 0CA9 ; UNKNOWN + 0x0CAA, // 0CAA..0CB3; KANNADA + 0x0CB4, // 0CB4 ; UNKNOWN + 0x0CB5, // 0CB5..0CB9; KANNADA + 0x0CBA, // 0CBA..0CBB; UNKNOWN + 0x0CBC, // 0CBC..0CC4; KANNADA + 0x0CC5, // 0CC5 ; UNKNOWN + 0x0CC6, // 0CC6..0CC8; KANNADA + 0x0CC9, // 0CC9 ; UNKNOWN + 0x0CCA, // 0CCA..0CCD; KANNADA + 0x0CCE, // 0CCE..0CD4; UNKNOWN + 0x0CD5, // 0CD5..0CD6; KANNADA + 0x0CD7, // 0CD7..0CDD; UNKNOWN + 0x0CDE, // 0CDE ; KANNADA + 0x0CDF, // 0CDF ; UNKNOWN + 0x0CE0, // 0CE0..0CE3; KANNADA + 0x0CE4, // 0CE4..0CE5; UNKNOWN + 0x0CE6, // 0CE6..0CEF; KANNADA + 0x0CF0, // 0CF0 ; UNKNOWN + 0x0CF1, // 0CF1..0CF2; KANNADA + 0x0CF3, // 0CF3..0D00; UNKNOWN + 0x0D01, // 0D01..0D03; MALAYALAM + 0x0D04, // 0D04 ; UNKNOWN + 0x0D05, // 0D05..0D0C; MALAYALAM + 0x0D0D, // 0D0D ; UNKNOWN + 0x0D0E, // 0D0E..0D10; MALAYALAM + 0x0D11, // 0D11 ; UNKNOWN + 0x0D12, // 0D12..0D3A; MALAYALAM + 0x0D3B, // 0D3B..0D3C; UNKNOWN + 0x0D3D, // 0D3D..0D44; MALAYALAM + 0x0D45, // 0D45 ; UNKNOWN + 0x0D46, // 0D46..0D48; MALAYALAM + 0x0D49, // 0D49 ; UNKNOWN + 0x0D4A, // 0D4A..0D4E; MALAYALAM + 0x0D4F, // 0D4F..0D56; UNKNOWN + 0x0D57, // 0D57 ; MALAYALAM + 0x0D58, // 0D58..0D5F; UNKNOWN + 0x0D60, // 0D60..0D63; MALAYALAM + 0x0D64, // 0D64..0D65; UNKNOWN + 0x0D66, // 0D66..0D75; MALAYALAM + 0x0D76, // 0D76..0D78; UNKNOWN + 0x0D79, // 0D79..0D7F; MALAYALAM + 0x0D80, // 0D80..0D81; UNKNOWN + 0x0D82, // 0D82..0D83; SINHALA + 0x0D84, // 0D84 ; UNKNOWN + 0x0D85, // 0D85..0D96; SINHALA + 0x0D97, // 0D97..0D99; UNKNOWN + 0x0D9A, // 0D9A..0DB1; SINHALA + 0x0DB2, // 0DB2 ; UNKNOWN + 0x0DB3, // 0DB3..0DBB; SINHALA + 0x0DBC, // 0DBC ; UNKNOWN + 0x0DBD, // 0DBD ; SINHALA + 
0x0DBE, // 0DBE..0DBF; UNKNOWN + 0x0DC0, // 0DC0..0DC6; SINHALA + 0x0DC7, // 0DC7..0DC9; UNKNOWN + 0x0DCA, // 0DCA ; SINHALA + 0x0DCB, // 0DCB..0DCE; UNKNOWN + 0x0DCF, // 0DCF..0DD4; SINHALA + 0x0DD5, // 0DD5 ; UNKNOWN + 0x0DD6, // 0DD6 ; SINHALA + 0x0DD7, // 0DD7 ; UNKNOWN + 0x0DD8, // 0DD8..0DDF; SINHALA + 0x0DE0, // 0DE0..0DE5; UNKNOWN + 0x0DE6, // 0DE6..0DEF; SINHALA + 0x0DF0, // 0DF0..0DF1; UNKNOWN + 0x0DF2, // 0DF2..0DF4; SINHALA + 0x0DF5, // 0DF5..0E00; UNKNOWN + 0x0E01, // 0E01..0E3A; THAI + 0x0E3B, // 0E3B..0E3E; UNKNOWN + 0x0E3F, // 0E3F ; COMMON + 0x0E40, // 0E40..0E5B; THAI + 0x0E5C, // 0E5C..0E80; UNKNOWN + 0x0E81, // 0E81..0E82; LAO + 0x0E83, // 0E83 ; UNKNOWN + 0x0E84, // 0E84 ; LAO + 0x0E85, // 0E85..0E86; UNKNOWN + 0x0E87, // 0E87..0E88; LAO + 0x0E89, // 0E89 ; UNKNOWN + 0x0E8A, // 0E8A ; LAO + 0x0E8B, // 0E8B..0E8C; UNKNOWN + 0x0E8D, // 0E8D ; LAO + 0x0E8E, // 0E8E..0E93; UNKNOWN + 0x0E94, // 0E94..0E97; LAO + 0x0E98, // 0E98 ; UNKNOWN + 0x0E99, // 0E99..0E9F; LAO + 0x0EA0, // 0EA0 ; UNKNOWN + 0x0EA1, // 0EA1..0EA3; LAO + 0x0EA4, // 0EA4 ; UNKNOWN + 0x0EA5, // 0EA5 ; LAO + 0x0EA6, // 0EA6 ; UNKNOWN + 0x0EA7, // 0EA7 ; LAO + 0x0EA8, // 0EA8..0EA9; UNKNOWN + 0x0EAA, // 0EAA..0EAB; LAO + 0x0EAC, // 0EAC ; UNKNOWN + 0x0EAD, // 0EAD..0EB9; LAO + 0x0EBA, // 0EBA ; UNKNOWN + 0x0EBB, // 0EBB..0EBD; LAO + 0x0EBE, // 0EBE..0EBF; UNKNOWN + 0x0EC0, // 0EC0..0EC4; LAO + 0x0EC5, // 0EC5 ; UNKNOWN + 0x0EC6, // 0EC6 ; LAO + 0x0EC7, // 0EC7 ; UNKNOWN + 0x0EC8, // 0EC8..0ECD; LAO + 0x0ECE, // 0ECE..0ECF; UNKNOWN + 0x0ED0, // 0ED0..0ED9; LAO + 0x0EDA, // 0EDA..0EDB; UNKNOWN + 0x0EDC, // 0EDC..0EDF; LAO + 0x0EE0, // 0EE0..0EFF; UNKNOWN + 0x0F00, // 0F00..0F47; TIBETAN + 0x0F48, // 0F48 ; UNKNOWN + 0x0F49, // 0F49..0F6C; TIBETAN + 0x0F6D, // 0F6D..0F70; UNKNOWN + 0x0F71, // 0F71..0F97; TIBETAN + 0x0F98, // 0F98 ; UNKNOWN + 0x0F99, // 0F99..0FBC; TIBETAN + 0x0FBD, // 0FBD ; UNKNOWN + 0x0FBE, // 0FBE..0FCC; TIBETAN + 0x0FCD, // 0FCD ; UNKNOWN + 0x0FCE, // 0FCE..0FD4; 
TIBETAN 0x0FD5, // 0FD5..0FD8; COMMON - 0x0FD9, // 0FD9..0FFF; TIBETAN + 0x0FD9, // 0FD9..0FDA; TIBETAN + 0x0FDB, // 0FDB..0FFF; UNKNOWN 0x1000, // 1000..109F; MYANMAR - 0x10A0, // 10A0..10FA; GEORGIAN - 0x10FB, // 10FB..10FB; COMMON + 0x10A0, // 10A0..10C5; GEORGIAN + 0x10C6, // 10C6 ; UNKNOWN + 0x10C7, // 10C7 ; GEORGIAN + 0x10C8, // 10C8..10CC; UNKNOWN + 0x10CD, // 10CD ; GEORGIAN + 0x10CE, // 10CE..10CF; UNKNOWN + 0x10D0, // 10D0..10FA; GEORGIAN + 0x10FB, // 10FB ; COMMON 0x10FC, // 10FC..10FF; GEORGIAN 0x1100, // 1100..11FF; HANGUL - 0x1200, // 1200..139F; ETHIOPIC - 0x13A0, // 13A0..13FF; CHEROKEE + 0x1200, // 1200..1248; ETHIOPIC + 0x1249, // 1249 ; UNKNOWN + 0x124A, // 124A..124D; ETHIOPIC + 0x124E, // 124E..124F; UNKNOWN + 0x1250, // 1250..1256; ETHIOPIC + 0x1257, // 1257 ; UNKNOWN + 0x1258, // 1258 ; ETHIOPIC + 0x1259, // 1259 ; UNKNOWN + 0x125A, // 125A..125D; ETHIOPIC + 0x125E, // 125E..125F; UNKNOWN + 0x1260, // 1260..1288; ETHIOPIC + 0x1289, // 1289 ; UNKNOWN + 0x128A, // 128A..128D; ETHIOPIC + 0x128E, // 128E..128F; UNKNOWN + 0x1290, // 1290..12B0; ETHIOPIC + 0x12B1, // 12B1 ; UNKNOWN + 0x12B2, // 12B2..12B5; ETHIOPIC + 0x12B6, // 12B6..12B7; UNKNOWN + 0x12B8, // 12B8..12BE; ETHIOPIC + 0x12BF, // 12BF ; UNKNOWN + 0x12C0, // 12C0 ; ETHIOPIC + 0x12C1, // 12C1 ; UNKNOWN + 0x12C2, // 12C2..12C5; ETHIOPIC + 0x12C6, // 12C6..12C7; UNKNOWN + 0x12C8, // 12C8..12D6; ETHIOPIC + 0x12D7, // 12D7 ; UNKNOWN + 0x12D8, // 12D8..1310; ETHIOPIC + 0x1311, // 1311 ; UNKNOWN + 0x1312, // 1312..1315; ETHIOPIC + 0x1316, // 1316..1317; UNKNOWN + 0x1318, // 1318..135A; ETHIOPIC + 0x135B, // 135B..135C; UNKNOWN + 0x135D, // 135D..137C; ETHIOPIC + 0x137D, // 137D..137F; UNKNOWN + 0x1380, // 1380..1399; ETHIOPIC + 0x139A, // 139A..139F; UNKNOWN + 0x13A0, // 13A0..13F4; CHEROKEE + 0x13F5, // 13F5..13FF; UNKNOWN 0x1400, // 1400..167F; CANADIAN_ABORIGINAL - 0x1680, // 1680..169F; OGHAM + 0x1680, // 1680..169C; OGHAM + 0x169D, // 169D..169F; UNKNOWN 0x16A0, // 16A0..16EA; RUNIC 
0x16EB, // 16EB..16ED; COMMON - 0x16EE, // 16EE..16FF; RUNIC - 0x1700, // 1700..171F; TAGALOG + 0x16EE, // 16EE..16F8; RUNIC + 0x16F9, // 16F9..16FF; UNKNOWN + 0x1700, // 1700..170C; TAGALOG + 0x170D, // 170D ; UNKNOWN + 0x170E, // 170E..1714; TAGALOG + 0x1715, // 1715..171F; UNKNOWN 0x1720, // 1720..1734; HANUNOO - 0x1735, // 1735..173F; COMMON - 0x1740, // 1740..175F; BUHID - 0x1760, // 1760..177F; TAGBANWA - 0x1780, // 1780..17FF; KHMER + 0x1735, // 1735..1736; COMMON + 0x1737, // 1737..173F; UNKNOWN + 0x1740, // 1740..1753; BUHID + 0x1754, // 1754..175F; UNKNOWN + 0x1760, // 1760..176C; TAGBANWA + 0x176D, // 176D ; UNKNOWN + 0x176E, // 176E..1770; TAGBANWA + 0x1771, // 1771 ; UNKNOWN + 0x1772, // 1772..1773; TAGBANWA + 0x1774, // 1774..177F; UNKNOWN + 0x1780, // 1780..17DD; KHMER + 0x17DE, // 17DE..17DF; UNKNOWN + 0x17E0, // 17E0..17E9; KHMER + 0x17EA, // 17EA..17EF; UNKNOWN + 0x17F0, // 17F0..17F9; KHMER + 0x17FA, // 17FA..17FF; UNKNOWN 0x1800, // 1800..1801; MONGOLIAN 0x1802, // 1802..1803; COMMON - 0x1804, // 1804..1804; MONGOLIAN - 0x1805, // 1805..1805; COMMON - 0x1806, // 1806..18AF; MONGOLIAN - 0x18B0, // 18B0..18FF; CANADIAN_ABORIGINAL - 0x1900, // 1900..194F; LIMBU - 0x1950, // 1950..197F; TAI_LE - 0x1980, // 1980..19DF; NEW_TAI_LUE + 0x1804, // 1804 ; MONGOLIAN + 0x1805, // 1805 ; COMMON + 0x1806, // 1806..180E; MONGOLIAN + 0x180F, // 180F ; UNKNOWN + 0x1810, // 1810..1819; MONGOLIAN + 0x181A, // 181A..181F; UNKNOWN + 0x1820, // 1820..1877; MONGOLIAN + 0x1878, // 1878..187F; UNKNOWN + 0x1880, // 1880..18AA; MONGOLIAN + 0x18AB, // 18AB..18AF; UNKNOWN + 0x18B0, // 18B0..18F5; CANADIAN_ABORIGINAL + 0x18F6, // 18F6..18FF; UNKNOWN + 0x1900, // 1900..191E; LIMBU + 0x191F, // 191F ; UNKNOWN + 0x1920, // 1920..192B; LIMBU + 0x192C, // 192C..192F; UNKNOWN + 0x1930, // 1930..193B; LIMBU + 0x193C, // 193C..193F; UNKNOWN + 0x1940, // 1940 ; LIMBU + 0x1941, // 1941..1943; UNKNOWN + 0x1944, // 1944..194F; LIMBU + 0x1950, // 1950..196D; TAI_LE + 0x196E, // 
196E..196F; UNKNOWN + 0x1970, // 1970..1974; TAI_LE + 0x1975, // 1975..197F; UNKNOWN + 0x1980, // 1980..19AB; NEW_TAI_LUE + 0x19AC, // 19AC..19AF; UNKNOWN + 0x19B0, // 19B0..19C9; NEW_TAI_LUE + 0x19CA, // 19CA..19CF; UNKNOWN + 0x19D0, // 19D0..19DA; NEW_TAI_LUE + 0x19DB, // 19DB..19DD; UNKNOWN + 0x19DE, // 19DE..19DF; NEW_TAI_LUE 0x19E0, // 19E0..19FF; KHMER - 0x1A00, // 1A00..1A1F; BUGINESE - 0x1A20, // 1A20..1AFF; TAI_THAM - 0x1B00, // 1B00..1B7F; BALINESE + 0x1A00, // 1A00..1A1B; BUGINESE + 0x1A1C, // 1A1C..1A1D; UNKNOWN + 0x1A1E, // 1A1E..1A1F; BUGINESE + 0x1A20, // 1A20..1A5E; TAI_THAM + 0x1A5F, // 1A5F ; UNKNOWN + 0x1A60, // 1A60..1A7C; TAI_THAM + 0x1A7D, // 1A7D..1A7E; UNKNOWN + 0x1A7F, // 1A7F..1A89; TAI_THAM + 0x1A8A, // 1A8A..1A8F; UNKNOWN + 0x1A90, // 1A90..1A99; TAI_THAM + 0x1A9A, // 1A9A..1A9F; UNKNOWN + 0x1AA0, // 1AA0..1AAD; TAI_THAM + 0x1AAE, // 1AAE..1AAF; UNKNOWN + 0x1AB0, // 1AB0..1ABE; INHERITED + 0x1ABF, // 1ABF..1AFF; UNKNOWN + 0x1B00, // 1B00..1B4B; BALINESE + 0x1B4C, // 1B4C..1B4F; UNKNOWN + 0x1B50, // 1B50..1B7C; BALINESE + 0x1B7D, // 1B7D..1B7F; UNKNOWN 0x1B80, // 1B80..1BBF; SUNDANESE - 0x1BC0, // 1BC0..1BFF; BATAK - 0x1C00, // 1C00..1C4F; LEPCHA - 0x1C50, // 1C50..1CBF; OL_CHIKI - 0x1CC0, // 1CC0..1CCF; SUNDANESE + 0x1BC0, // 1BC0..1BF3; BATAK + 0x1BF4, // 1BF4..1BFB; UNKNOWN + 0x1BFC, // 1BFC..1BFF; BATAK + 0x1C00, // 1C00..1C37; LEPCHA + 0x1C38, // 1C38..1C3A; UNKNOWN + 0x1C3B, // 1C3B..1C49; LEPCHA + 0x1C4A, // 1C4A..1C4C; UNKNOWN + 0x1C4D, // 1C4D..1C4F; LEPCHA + 0x1C50, // 1C50..1C7F; OL_CHIKI + 0x1C80, // 1C80..1CBF; UNKNOWN + 0x1CC0, // 1CC0..1CC7; SUNDANESE + 0x1CC8, // 1CC8..1CCF; UNKNOWN 0x1CD0, // 1CD0..1CD2; INHERITED - 0x1CD3, // 1CD3..1CD3; COMMON + 0x1CD3, // 1CD3 ; COMMON 0x1CD4, // 1CD4..1CE0; INHERITED - 0x1CE1, // 1CE1..1CE1; COMMON + 0x1CE1, // 1CE1 ; COMMON 0x1CE2, // 1CE2..1CE8; INHERITED 0x1CE9, // 1CE9..1CEC; COMMON - 0x1CED, // 1CED..1CED; INHERITED + 0x1CED, // 1CED ; INHERITED 0x1CEE, // 1CEE..1CF3; COMMON - 
0x1CF4, // 1CF4..1CF4; INHERITED - 0x1CF5, // 1CF5..1CFF; COMMON + 0x1CF4, // 1CF4 ; INHERITED + 0x1CF5, // 1CF5..1CF6; COMMON + 0x1CF7, // 1CF7 ; UNKNOWN + 0x1CF8, // 1CF8..1CF9; INHERITED + 0x1CFA, // 1CFA..1CFF; UNKNOWN 0x1D00, // 1D00..1D25; LATIN 0x1D26, // 1D26..1D2A; GREEK - 0x1D2B, // 1D2B..1D2B; CYRILLIC + 0x1D2B, // 1D2B ; CYRILLIC 0x1D2C, // 1D2C..1D5C; LATIN 0x1D5D, // 1D5D..1D61; GREEK 0x1D62, // 1D62..1D65; LATIN 0x1D66, // 1D66..1D6A; GREEK 0x1D6B, // 1D6B..1D77; LATIN - 0x1D78, // 1D78..1D78; CYRILLIC + 0x1D78, // 1D78 ; CYRILLIC 0x1D79, // 1D79..1DBE; LATIN - 0x1DBF, // 1DBF..1DBF; GREEK - 0x1DC0, // 1DC0..1DFF; INHERITED + 0x1DBF, // 1DBF ; GREEK + 0x1DC0, // 1DC0..1DF5; INHERITED + 0x1DF6, // 1DF6..1DFB; UNKNOWN + 0x1DFC, // 1DFC..1DFF; INHERITED 0x1E00, // 1E00..1EFF; LATIN - 0x1F00, // 1F00..1FFF; GREEK + 0x1F00, // 1F00..1F15; GREEK + 0x1F16, // 1F16..1F17; UNKNOWN + 0x1F18, // 1F18..1F1D; GREEK + 0x1F1E, // 1F1E..1F1F; UNKNOWN + 0x1F20, // 1F20..1F45; GREEK + 0x1F46, // 1F46..1F47; UNKNOWN + 0x1F48, // 1F48..1F4D; GREEK + 0x1F4E, // 1F4E..1F4F; UNKNOWN + 0x1F50, // 1F50..1F57; GREEK + 0x1F58, // 1F58 ; UNKNOWN + 0x1F59, // 1F59 ; GREEK + 0x1F5A, // 1F5A ; UNKNOWN + 0x1F5B, // 1F5B ; GREEK + 0x1F5C, // 1F5C ; UNKNOWN + 0x1F5D, // 1F5D ; GREEK + 0x1F5E, // 1F5E ; UNKNOWN + 0x1F5F, // 1F5F..1F7D; GREEK + 0x1F7E, // 1F7E..1F7F; UNKNOWN + 0x1F80, // 1F80..1FB4; GREEK + 0x1FB5, // 1FB5 ; UNKNOWN + 0x1FB6, // 1FB6..1FC4; GREEK + 0x1FC5, // 1FC5 ; UNKNOWN + 0x1FC6, // 1FC6..1FD3; GREEK + 0x1FD4, // 1FD4..1FD5; UNKNOWN + 0x1FD6, // 1FD6..1FDB; GREEK + 0x1FDC, // 1FDC ; UNKNOWN + 0x1FDD, // 1FDD..1FEF; GREEK + 0x1FF0, // 1FF0..1FF1; UNKNOWN + 0x1FF2, // 1FF2..1FF4; GREEK + 0x1FF5, // 1FF5 ; UNKNOWN + 0x1FF6, // 1FF6..1FFE; GREEK + 0x1FFF, // 1FFF ; UNKNOWN 0x2000, // 2000..200B; COMMON 0x200C, // 200C..200D; INHERITED - 0x200E, // 200E..2070; COMMON - 0x2071, // 2071..2073; LATIN + 0x200E, // 200E..2064; COMMON + 0x2065, // 2065 ; UNKNOWN + 0x2066, // 
2066..2070; COMMON + 0x2071, // 2071 ; LATIN + 0x2072, // 2072..2073; UNKNOWN 0x2074, // 2074..207E; COMMON - 0x207F, // 207F..207F; LATIN - 0x2080, // 2080..208F; COMMON - 0x2090, // 2090..209F; LATIN - 0x20A0, // 20A0..20CF; COMMON - 0x20D0, // 20D0..20FF; INHERITED + 0x207F, // 207F ; LATIN + 0x2080, // 2080..208E; COMMON + 0x208F, // 208F ; UNKNOWN + 0x2090, // 2090..209C; LATIN + 0x209D, // 209D..209F; UNKNOWN + 0x20A0, // 20A0..20BD; COMMON + 0x20BE, // 20BE..20CF; UNKNOWN + 0x20D0, // 20D0..20F0; INHERITED + 0x20F1, // 20F1..20FF; UNKNOWN 0x2100, // 2100..2125; COMMON - 0x2126, // 2126..2126; GREEK + 0x2126, // 2126 ; GREEK 0x2127, // 2127..2129; COMMON 0x212A, // 212A..212B; LATIN 0x212C, // 212C..2131; COMMON - 0x2132, // 2132..2132; LATIN + 0x2132, // 2132 ; LATIN 0x2133, // 2133..214D; COMMON - 0x214E, // 214E..214E; LATIN + 0x214E, // 214E ; LATIN 0x214F, // 214F..215F; COMMON 0x2160, // 2160..2188; LATIN - 0x2189, // 2189..27FF; COMMON + 0x2189, // 2189 ; COMMON + 0x218A, // 218A..218F; UNKNOWN + 0x2190, // 2190..23FA; COMMON + 0x23FB, // 23FB..23FF; UNKNOWN + 0x2400, // 2400..2426; COMMON + 0x2427, // 2427..243F; UNKNOWN + 0x2440, // 2440..244A; COMMON + 0x244B, // 244B..245F; UNKNOWN + 0x2460, // 2460..27FF; COMMON 0x2800, // 2800..28FF; BRAILLE - 0x2900, // 2900..2BFF; COMMON - 0x2C00, // 2C00..2C5F; GLAGOLITIC + 0x2900, // 2900..2B73; COMMON + 0x2B74, // 2B74..2B75; UNKNOWN + 0x2B76, // 2B76..2B95; COMMON + 0x2B96, // 2B96..2B97; UNKNOWN + 0x2B98, // 2B98..2BB9; COMMON + 0x2BBA, // 2BBA..2BBC; UNKNOWN + 0x2BBD, // 2BBD..2BC8; COMMON + 0x2BC9, // 2BC9 ; UNKNOWN + 0x2BCA, // 2BCA..2BD1; COMMON + 0x2BD2, // 2BD2..2BFF; UNKNOWN + 0x2C00, // 2C00..2C2E; GLAGOLITIC + 0x2C2F, // 2C2F ; UNKNOWN + 0x2C30, // 2C30..2C5E; GLAGOLITIC + 0x2C5F, // 2C5F ; UNKNOWN 0x2C60, // 2C60..2C7F; LATIN - 0x2C80, // 2C80..2CFF; COPTIC - 0x2D00, // 2D00..2D2F; GEORGIAN - 0x2D30, // 2D30..2D7F; TIFINAGH - 0x2D80, // 2D80..2DDF; ETHIOPIC + 0x2C80, // 2C80..2CF3; COPTIC + 
0x2CF4, // 2CF4..2CF8; UNKNOWN + 0x2CF9, // 2CF9..2CFF; COPTIC + 0x2D00, // 2D00..2D25; GEORGIAN + 0x2D26, // 2D26 ; UNKNOWN + 0x2D27, // 2D27 ; GEORGIAN + 0x2D28, // 2D28..2D2C; UNKNOWN + 0x2D2D, // 2D2D ; GEORGIAN + 0x2D2E, // 2D2E..2D2F; UNKNOWN + 0x2D30, // 2D30..2D67; TIFINAGH + 0x2D68, // 2D68..2D6E; UNKNOWN + 0x2D6F, // 2D6F..2D70; TIFINAGH + 0x2D71, // 2D71..2D7E; UNKNOWN + 0x2D7F, // 2D7F ; TIFINAGH + 0x2D80, // 2D80..2D96; ETHIOPIC + 0x2D97, // 2D97..2D9F; UNKNOWN + 0x2DA0, // 2DA0..2DA6; ETHIOPIC + 0x2DA7, // 2DA7 ; UNKNOWN + 0x2DA8, // 2DA8..2DAE; ETHIOPIC + 0x2DAF, // 2DAF ; UNKNOWN + 0x2DB0, // 2DB0..2DB6; ETHIOPIC + 0x2DB7, // 2DB7 ; UNKNOWN + 0x2DB8, // 2DB8..2DBE; ETHIOPIC + 0x2DBF, // 2DBF ; UNKNOWN + 0x2DC0, // 2DC0..2DC6; ETHIOPIC + 0x2DC7, // 2DC7 ; UNKNOWN + 0x2DC8, // 2DC8..2DCE; ETHIOPIC + 0x2DCF, // 2DCF ; UNKNOWN + 0x2DD0, // 2DD0..2DD6; ETHIOPIC + 0x2DD7, // 2DD7 ; UNKNOWN + 0x2DD8, // 2DD8..2DDE; ETHIOPIC + 0x2DDF, // 2DDF ; UNKNOWN 0x2DE0, // 2DE0..2DFF; CYRILLIC - 0x2E00, // 2E00..2E7F; COMMON - 0x2E80, // 2E80..2FEF; HAN - 0x2FF0, // 2FF0..3004; COMMON - 0x3005, // 3005..3005; HAN - 0x3006, // 3006..3006; COMMON - 0x3007, // 3007..3007; HAN + 0x2E00, // 2E00..2E42; COMMON + 0x2E43, // 2E43..2E7F; UNKNOWN + 0x2E80, // 2E80..2E99; HAN + 0x2E9A, // 2E9A ; UNKNOWN + 0x2E9B, // 2E9B..2EF3; HAN + 0x2EF4, // 2EF4..2EFF; UNKNOWN + 0x2F00, // 2F00..2FD5; HAN + 0x2FD6, // 2FD6..2FEF; UNKNOWN + 0x2FF0, // 2FF0..2FFB; COMMON + 0x2FFC, // 2FFC..2FFF; UNKNOWN + 0x3000, // 3000..3004; COMMON + 0x3005, // 3005 ; HAN + 0x3006, // 3006 ; COMMON + 0x3007, // 3007 ; HAN 0x3008, // 3008..3020; COMMON 0x3021, // 3021..3029; HAN 0x302A, // 302A..302D; INHERITED 0x302E, // 302E..302F; HANGUL 0x3030, // 3030..3037; COMMON 0x3038, // 3038..303B; HAN - 0x303C, // 303C..3040; COMMON - 0x3041, // 3041..3098; HIRAGANA + 0x303C, // 303C..303F; COMMON + 0x3040, // 3040 ; UNKNOWN + 0x3041, // 3041..3096; HIRAGANA + 0x3097, // 3097..3098; UNKNOWN 0x3099, // 
3099..309A; INHERITED 0x309B, // 309B..309C; COMMON 0x309D, // 309D..309F; HIRAGANA - 0x30A0, // 30A0..30A0; COMMON + 0x30A0, // 30A0 ; COMMON 0x30A1, // 30A1..30FA; KATAKANA 0x30FB, // 30FB..30FC; COMMON - 0x30FD, // 30FD..3104; KATAKANA - 0x3105, // 3105..3130; BOPOMOFO - 0x3131, // 3131..318F; HANGUL + 0x30FD, // 30FD..30FF; KATAKANA + 0x3100, // 3100..3104; UNKNOWN + 0x3105, // 3105..312D; BOPOMOFO + 0x312E, // 312E..3130; UNKNOWN + 0x3131, // 3131..318E; HANGUL + 0x318F, // 318F ; UNKNOWN 0x3190, // 3190..319F; COMMON - 0x31A0, // 31A0..31BF; BOPOMOFO - 0x31C0, // 31C0..31EF; COMMON + 0x31A0, // 31A0..31BA; BOPOMOFO + 0x31BB, // 31BB..31BF; UNKNOWN + 0x31C0, // 31C0..31E3; COMMON + 0x31E4, // 31E4..31EF; UNKNOWN 0x31F0, // 31F0..31FF; KATAKANA - 0x3200, // 3200..321F; HANGUL + 0x3200, // 3200..321E; HANGUL + 0x321F, // 321F ; UNKNOWN 0x3220, // 3220..325F; COMMON 0x3260, // 3260..327E; HANGUL 0x327F, // 327F..32CF; COMMON - 0x32D0, // 32D0..3357; KATAKANA + 0x32D0, // 32D0..32FE; KATAKANA + 0x32FF, // 32FF ; UNKNOWN + 0x3300, // 3300..3357; KATAKANA 0x3358, // 3358..33FF; COMMON - 0x3400, // 3400..4DBF; HAN + 0x3400, // 3400..4DB5; HAN + 0x4DB6, // 4DB6..4DBF; UNKNOWN 0x4DC0, // 4DC0..4DFF; COMMON - 0x4E00, // 4E00..9FFF; HAN - 0xA000, // A000..A4CF; YI + 0x4E00, // 4E00..9FCC; HAN + 0x9FCD, // 9FCD..9FFF; UNKNOWN + 0xA000, // A000..A48C; YI + 0xA48D, // A48D..A48F; UNKNOWN + 0xA490, // A490..A4C6; YI + 0xA4C7, // A4C7..A4CF; UNKNOWN 0xA4D0, // A4D0..A4FF; LISU - 0xA500, // A500..A63F; VAI - 0xA640, // A640..A69F; CYRILLIC - 0xA6A0, // A6A0..A6FF; BAMUM + 0xA500, // A500..A62B; VAI + 0xA62C, // A62C..A63F; UNKNOWN + 0xA640, // A640..A69D; CYRILLIC + 0xA69E, // A69E ; UNKNOWN + 0xA69F, // A69F ; CYRILLIC + 0xA6A0, // A6A0..A6F7; BAMUM + 0xA6F8, // A6F8..A6FF; UNKNOWN 0xA700, // A700..A721; COMMON 0xA722, // A722..A787; LATIN 0xA788, // A788..A78A; COMMON - 0xA78B, // A78B..A7FF; LATIN - 0xA800, // A800..A82F; SYLOTI_NAGRI - 0xA830, // A830..A83F; COMMON - 
0xA840, // A840..A87F; PHAGS_PA - 0xA880, // A880..A8DF; SAURASHTRA - 0xA8E0, // A8E0..A8FF; DEVANAGARI - 0xA900, // A900..A92F; KAYAH_LI - 0xA930, // A930..A95F; REJANG - 0xA960, // A960..A97F; HANGUL - 0xA980, // A980..A9FF; JAVANESE - 0xAA00, // AA00..AA5F; CHAM + 0xA78B, // A78B..A78E; LATIN + 0xA78F, // A78F ; UNKNOWN + 0xA790, // A790..A7AD; LATIN + 0xA7AE, // A7AE..A7AF; UNKNOWN + 0xA7B0, // A7B0..A7B1; LATIN + 0xA7B2, // A7B2..A7F6; UNKNOWN + 0xA7F7, // A7F7..A7FF; LATIN + 0xA800, // A800..A82B; SYLOTI_NAGRI + 0xA82C, // A82C..A82F; UNKNOWN + 0xA830, // A830..A839; COMMON + 0xA83A, // A83A..A83F; UNKNOWN + 0xA840, // A840..A877; PHAGS_PA + 0xA878, // A878..A87F; UNKNOWN + 0xA880, // A880..A8C4; SAURASHTRA + 0xA8C5, // A8C5..A8CD; UNKNOWN + 0xA8CE, // A8CE..A8D9; SAURASHTRA + 0xA8DA, // A8DA..A8DF; UNKNOWN + 0xA8E0, // A8E0..A8FB; DEVANAGARI + 0xA8FC, // A8FC..A8FF; UNKNOWN + 0xA900, // A900..A92D; KAYAH_LI + 0xA92E, // A92E ; COMMON + 0xA92F, // A92F ; KAYAH_LI + 0xA930, // A930..A953; REJANG + 0xA954, // A954..A95E; UNKNOWN + 0xA95F, // A95F ; REJANG + 0xA960, // A960..A97C; HANGUL + 0xA97D, // A97D..A97F; UNKNOWN + 0xA980, // A980..A9CD; JAVANESE + 0xA9CE, // A9CE ; UNKNOWN + 0xA9CF, // A9CF ; COMMON + 0xA9D0, // A9D0..A9D9; JAVANESE + 0xA9DA, // A9DA..A9DD; UNKNOWN + 0xA9DE, // A9DE..A9DF; JAVANESE + 0xA9E0, // A9E0..A9FE; MYANMAR + 0xA9FF, // A9FF ; UNKNOWN + 0xAA00, // AA00..AA36; CHAM + 0xAA37, // AA37..AA3F; UNKNOWN + 0xAA40, // AA40..AA4D; CHAM + 0xAA4E, // AA4E..AA4F; UNKNOWN + 0xAA50, // AA50..AA59; CHAM + 0xAA5A, // AA5A..AA5B; UNKNOWN + 0xAA5C, // AA5C..AA5F; CHAM 0xAA60, // AA60..AA7F; MYANMAR - 0xAA80, // AA80..AADF; TAI_VIET - 0xAAE0, // AAE0..AB00; MEETEI_MAYEK - 0xAB01, // AB01..ABBF; ETHIOPIC - 0xABC0, // ABC0..ABFF; MEETEI_MAYEK - 0xAC00, // AC00..D7FB; HANGUL + 0xAA80, // AA80..AAC2; TAI_VIET + 0xAAC3, // AAC3..AADA; UNKNOWN + 0xAADB, // AADB..AADF; TAI_VIET + 0xAAE0, // AAE0..AAF6; MEETEI_MAYEK + 0xAAF7, // AAF7..AB00; UNKNOWN + 0xAB01, 
// AB01..AB06; ETHIOPIC + 0xAB07, // AB07..AB08; UNKNOWN + 0xAB09, // AB09..AB0E; ETHIOPIC + 0xAB0F, // AB0F..AB10; UNKNOWN + 0xAB11, // AB11..AB16; ETHIOPIC + 0xAB17, // AB17..AB1F; UNKNOWN + 0xAB20, // AB20..AB26; ETHIOPIC + 0xAB27, // AB27 ; UNKNOWN + 0xAB28, // AB28..AB2E; ETHIOPIC + 0xAB2F, // AB2F ; UNKNOWN + 0xAB30, // AB30..AB5A; LATIN + 0xAB5B, // AB5B ; COMMON + 0xAB5C, // AB5C..AB5F; LATIN + 0xAB60, // AB60..AB63; UNKNOWN + 0xAB64, // AB64 ; LATIN + 0xAB65, // AB65 ; GREEK + 0xAB66, // AB66..ABBF; UNKNOWN + 0xABC0, // ABC0..ABED; MEETEI_MAYEK + 0xABEE, // ABEE..ABEF; UNKNOWN + 0xABF0, // ABF0..ABF9; MEETEI_MAYEK + 0xABFA, // ABFA..ABFF; UNKNOWN + 0xAC00, // AC00..D7A3; HANGUL + 0xD7A4, // D7A4..D7AF; UNKNOWN + 0xD7B0, // D7B0..D7C6; HANGUL + 0xD7C7, // D7C7..D7CA; UNKNOWN + 0xD7CB, // D7CB..D7FB; HANGUL 0xD7FC, // D7FC..F8FF; UNKNOWN - 0xF900, // F900..FAFF; HAN - 0xFB00, // FB00..FB12; LATIN - 0xFB13, // FB13..FB1C; ARMENIAN - 0xFB1D, // FB1D..FB4F; HEBREW - 0xFB50, // FB50..FD3D; ARABIC - 0xFD3E, // FD3E..FD4F; COMMON - 0xFD50, // FD50..FDFC; ARABIC - 0xFDFD, // FDFD..FDFF; COMMON + 0xF900, // F900..FA6D; HAN + 0xFA6E, // FA6E..FA6F; UNKNOWN + 0xFA70, // FA70..FAD9; HAN + 0xFADA, // FADA..FAFF; UNKNOWN + 0xFB00, // FB00..FB06; LATIN + 0xFB07, // FB07..FB12; UNKNOWN + 0xFB13, // FB13..FB17; ARMENIAN + 0xFB18, // FB18..FB1C; UNKNOWN + 0xFB1D, // FB1D..FB36; HEBREW + 0xFB37, // FB37 ; UNKNOWN + 0xFB38, // FB38..FB3C; HEBREW + 0xFB3D, // FB3D ; UNKNOWN + 0xFB3E, // FB3E ; HEBREW + 0xFB3F, // FB3F ; UNKNOWN + 0xFB40, // FB40..FB41; HEBREW + 0xFB42, // FB42 ; UNKNOWN + 0xFB43, // FB43..FB44; HEBREW + 0xFB45, // FB45 ; UNKNOWN + 0xFB46, // FB46..FB4F; HEBREW + 0xFB50, // FB50..FBC1; ARABIC + 0xFBC2, // FBC2..FBD2; UNKNOWN + 0xFBD3, // FBD3..FD3D; ARABIC + 0xFD3E, // FD3E..FD3F; COMMON + 0xFD40, // FD40..FD4F; UNKNOWN + 0xFD50, // FD50..FD8F; ARABIC + 0xFD90, // FD90..FD91; UNKNOWN + 0xFD92, // FD92..FDC7; ARABIC + 0xFDC8, // FDC8..FDEF; UNKNOWN + 0xFDF0, // 
FDF0..FDFD; ARABIC + 0xFDFE, // FDFE..FDFF; UNKNOWN 0xFE00, // FE00..FE0F; INHERITED - 0xFE10, // FE10..FE1F; COMMON - 0xFE20, // FE20..FE2F; INHERITED - 0xFE30, // FE30..FE6F; COMMON - 0xFE70, // FE70..FEFE; ARABIC - 0xFEFF, // FEFF..FF20; COMMON + 0xFE10, // FE10..FE19; COMMON + 0xFE1A, // FE1A..FE1F; UNKNOWN + 0xFE20, // FE20..FE2D; INHERITED + 0xFE2E, // FE2E..FE2F; UNKNOWN + 0xFE30, // FE30..FE52; COMMON + 0xFE53, // FE53 ; UNKNOWN + 0xFE54, // FE54..FE66; COMMON + 0xFE67, // FE67 ; UNKNOWN + 0xFE68, // FE68..FE6B; COMMON + 0xFE6C, // FE6C..FE6F; UNKNOWN + 0xFE70, // FE70..FE74; ARABIC + 0xFE75, // FE75 ; UNKNOWN + 0xFE76, // FE76..FEFC; ARABIC + 0xFEFD, // FEFD..FEFE; UNKNOWN + 0xFEFF, // FEFF ; COMMON + 0xFF00, // FF00 ; UNKNOWN + 0xFF01, // FF01..FF20; COMMON 0xFF21, // FF21..FF3A; LATIN 0xFF3B, // FF3B..FF40; COMMON 0xFF41, // FF41..FF5A; LATIN 0xFF5B, // FF5B..FF65; COMMON 0xFF66, // FF66..FF6F; KATAKANA - 0xFF70, // FF70..FF70; COMMON + 0xFF70, // FF70 ; COMMON 0xFF71, // FF71..FF9D; KATAKANA 0xFF9E, // FF9E..FF9F; COMMON - 0xFFA0, // FFA0..FFDF; HANGUL - 0xFFE0, // FFE0..FFFF; COMMON - 0x10000, // 10000..100FF; LINEAR_B - 0x10100, // 10100..1013F; COMMON - 0x10140, // 10140..1018F; GREEK - 0x10190, // 10190..101FC; COMMON - 0x101FD, // 101FD..1027F; INHERITED - 0x10280, // 10280..1029F; LYCIAN - 0x102A0, // 102A0..102FF; CARIAN - 0x10300, // 10300..1032F; OLD_ITALIC - 0x10330, // 10330..1037F; GOTHIC - 0x10380, // 10380..1039F; UGARITIC - 0x103A0, // 103A0..103FF; OLD_PERSIAN + 0xFFA0, // FFA0..FFBE; HANGUL + 0xFFBF, // FFBF..FFC1; UNKNOWN + 0xFFC2, // FFC2..FFC7; HANGUL + 0xFFC8, // FFC8..FFC9; UNKNOWN + 0xFFCA, // FFCA..FFCF; HANGUL + 0xFFD0, // FFD0..FFD1; UNKNOWN + 0xFFD2, // FFD2..FFD7; HANGUL + 0xFFD8, // FFD8..FFD9; UNKNOWN + 0xFFDA, // FFDA..FFDC; HANGUL + 0xFFDD, // FFDD..FFDF; UNKNOWN + 0xFFE0, // FFE0..FFE6; COMMON + 0xFFE7, // FFE7 ; UNKNOWN + 0xFFE8, // FFE8..FFEE; COMMON + 0xFFEF, // FFEF..FFF8; UNKNOWN + 0xFFF9, // FFF9..FFFD; COMMON + 
0xFFFE, // FFFE..FFFF; UNKNOWN + 0x10000, // 10000..1000B; LINEAR_B + 0x1000C, // 1000C ; UNKNOWN + 0x1000D, // 1000D..10026; LINEAR_B + 0x10027, // 10027 ; UNKNOWN + 0x10028, // 10028..1003A; LINEAR_B + 0x1003B, // 1003B ; UNKNOWN + 0x1003C, // 1003C..1003D; LINEAR_B + 0x1003E, // 1003E ; UNKNOWN + 0x1003F, // 1003F..1004D; LINEAR_B + 0x1004E, // 1004E..1004F; UNKNOWN + 0x10050, // 10050..1005D; LINEAR_B + 0x1005E, // 1005E..1007F; UNKNOWN + 0x10080, // 10080..100FA; LINEAR_B + 0x100FB, // 100FB..100FF; UNKNOWN + 0x10100, // 10100..10102; COMMON + 0x10103, // 10103..10106; UNKNOWN + 0x10107, // 10107..10133; COMMON + 0x10134, // 10134..10136; UNKNOWN + 0x10137, // 10137..1013F; COMMON + 0x10140, // 10140..1018C; GREEK + 0x1018D, // 1018D..1018F; UNKNOWN + 0x10190, // 10190..1019B; COMMON + 0x1019C, // 1019C..1019F; UNKNOWN + 0x101A0, // 101A0 ; GREEK + 0x101A1, // 101A1..101CF; UNKNOWN + 0x101D0, // 101D0..101FC; COMMON + 0x101FD, // 101FD ; INHERITED + 0x101FE, // 101FE..1027F; UNKNOWN + 0x10280, // 10280..1029C; LYCIAN + 0x1029D, // 1029D..1029F; UNKNOWN + 0x102A0, // 102A0..102D0; CARIAN + 0x102D1, // 102D1..102DF; UNKNOWN + 0x102E0, // 102E0 ; INHERITED + 0x102E1, // 102E1..102FB; COMMON + 0x102FC, // 102FC..102FF; UNKNOWN + 0x10300, // 10300..10323; OLD_ITALIC + 0x10324, // 10324..1032F; UNKNOWN + 0x10330, // 10330..1034A; GOTHIC + 0x1034B, // 1034B..1034F; UNKNOWN + 0x10350, // 10350..1037A; OLD_PERMIC + 0x1037B, // 1037B..1037F; UNKNOWN + 0x10380, // 10380..1039D; UGARITIC + 0x1039E, // 1039E ; UNKNOWN + 0x1039F, // 1039F ; UGARITIC + 0x103A0, // 103A0..103C3; OLD_PERSIAN + 0x103C4, // 103C4..103C7; UNKNOWN + 0x103C8, // 103C8..103D5; OLD_PERSIAN + 0x103D6, // 103D6..103FF; UNKNOWN 0x10400, // 10400..1044F; DESERET 0x10450, // 10450..1047F; SHAVIAN - 0x10480, // 10480..107FF; OSMANYA - 0x10800, // 10800..1083F; CYPRIOT - 0x10840, // 10840..108FF; IMPERIAL_ARAMAIC - 0x10900, // 10900..1091F; PHOENICIAN - 0x10920, // 10920..1097F; LYDIAN + 0x10480, // 
10480..1049D; OSMANYA + 0x1049E, // 1049E..1049F; UNKNOWN + 0x104A0, // 104A0..104A9; OSMANYA + 0x104AA, // 104AA..104FF; UNKNOWN + 0x10500, // 10500..10527; ELBASAN + 0x10528, // 10528..1052F; UNKNOWN + 0x10530, // 10530..10563; CAUCASIAN_ALBANIAN + 0x10564, // 10564..1056E; UNKNOWN + 0x1056F, // 1056F ; CAUCASIAN_ALBANIAN + 0x10570, // 10570..105FF; UNKNOWN + 0x10600, // 10600..10736; LINEAR_A + 0x10737, // 10737..1073F; UNKNOWN + 0x10740, // 10740..10755; LINEAR_A + 0x10756, // 10756..1075F; UNKNOWN + 0x10760, // 10760..10767; LINEAR_A + 0x10768, // 10768..107FF; UNKNOWN + 0x10800, // 10800..10805; CYPRIOT + 0x10806, // 10806..10807; UNKNOWN + 0x10808, // 10808 ; CYPRIOT + 0x10809, // 10809 ; UNKNOWN + 0x1080A, // 1080A..10835; CYPRIOT + 0x10836, // 10836 ; UNKNOWN + 0x10837, // 10837..10838; CYPRIOT + 0x10839, // 10839..1083B; UNKNOWN + 0x1083C, // 1083C ; CYPRIOT + 0x1083D, // 1083D..1083E; UNKNOWN + 0x1083F, // 1083F ; CYPRIOT + 0x10840, // 10840..10855; IMPERIAL_ARAMAIC + 0x10856, // 10856 ; UNKNOWN + 0x10857, // 10857..1085F; IMPERIAL_ARAMAIC + 0x10860, // 10860..1087F; PALMYRENE + 0x10880, // 10880..1089E; NABATAEAN + 0x1089F, // 1089F..108A6; UNKNOWN + 0x108A7, // 108A7..108AF; NABATAEAN + 0x108B0, // 108B0..108FF; UNKNOWN + 0x10900, // 10900..1091B; PHOENICIAN + 0x1091C, // 1091C..1091E; UNKNOWN + 0x1091F, // 1091F ; PHOENICIAN + 0x10920, // 10920..10939; LYDIAN + 0x1093A, // 1093A..1093E; UNKNOWN + 0x1093F, // 1093F ; LYDIAN + 0x10940, // 10940..1097F; UNKNOWN 0x10980, // 10980..1099F; MEROITIC_HIEROGLYPHS - 0x109A0, // 109A0..109FF; MEROITIC_CURSIVE - 0x10A00, // 10A00..10A5F; KHAROSHTHI - 0x10A60, // 10A60..10AFF; OLD_SOUTH_ARABIAN - 0x10B00, // 10B00..10B3F; AVESTAN - 0x10B40, // 10B40..10B5F; INSCRIPTIONAL_PARTHIAN - 0x10B60, // 10B60..10BFF; INSCRIPTIONAL_PAHLAVI - 0x10C00, // 10C00..10E5F; OLD_TURKIC - 0x10E60, // 10E60..10FFF; ARABIC - 0x11000, // 11000..1107F; BRAHMI - 0x11080, // 11080..110CF; KAITHI - 0x110D0, // 110D0..110FF; SORA_SOMPENG - 
0x11100, // 11100..1117F; CHAKMA - 0x11180, // 11180..1167F; SHARADA - 0x11680, // 11680..116CF; TAKRI - 0x12000, // 12000..12FFF; CUNEIFORM - 0x13000, // 13000..167FF; EGYPTIAN_HIEROGLYPHS + 0x109A0, // 109A0..109B7; MEROITIC_CURSIVE + 0x109B8, // 109B8..109BD; UNKNOWN + 0x109BE, // 109BE..109BF; MEROITIC_CURSIVE + 0x109C0, // 109C0..109FF; UNKNOWN + 0x10A00, // 10A00..10A03; KHAROSHTHI + 0x10A04, // 10A04 ; UNKNOWN + 0x10A05, // 10A05..10A06; KHAROSHTHI + 0x10A07, // 10A07..10A0B; UNKNOWN + 0x10A0C, // 10A0C..10A13; KHAROSHTHI + 0x10A14, // 10A14 ; UNKNOWN + 0x10A15, // 10A15..10A17; KHAROSHTHI + 0x10A18, // 10A18 ; UNKNOWN + 0x10A19, // 10A19..10A33; KHAROSHTHI + 0x10A34, // 10A34..10A37; UNKNOWN + 0x10A38, // 10A38..10A3A; KHAROSHTHI + 0x10A3B, // 10A3B..10A3E; UNKNOWN + 0x10A3F, // 10A3F..10A47; KHAROSHTHI + 0x10A48, // 10A48..10A4F; UNKNOWN + 0x10A50, // 10A50..10A58; KHAROSHTHI + 0x10A59, // 10A59..10A5F; UNKNOWN + 0x10A60, // 10A60..10A7F; OLD_SOUTH_ARABIAN + 0x10A80, // 10A80..10A9F; OLD_NORTH_ARABIAN + 0x10AA0, // 10AA0..10ABF; UNKNOWN + 0x10AC0, // 10AC0..10AE6; MANICHAEAN + 0x10AE7, // 10AE7..10AEA; UNKNOWN + 0x10AEB, // 10AEB..10AF6; MANICHAEAN + 0x10AF7, // 10AF7..10AFF; UNKNOWN + 0x10B00, // 10B00..10B35; AVESTAN + 0x10B36, // 10B36..10B38; UNKNOWN + 0x10B39, // 10B39..10B3F; AVESTAN + 0x10B40, // 10B40..10B55; INSCRIPTIONAL_PARTHIAN + 0x10B56, // 10B56..10B57; UNKNOWN + 0x10B58, // 10B58..10B5F; INSCRIPTIONAL_PARTHIAN + 0x10B60, // 10B60..10B72; INSCRIPTIONAL_PAHLAVI + 0x10B73, // 10B73..10B77; UNKNOWN + 0x10B78, // 10B78..10B7F; INSCRIPTIONAL_PAHLAVI + 0x10B80, // 10B80..10B91; PSALTER_PAHLAVI + 0x10B92, // 10B92..10B98; UNKNOWN + 0x10B99, // 10B99..10B9C; PSALTER_PAHLAVI + 0x10B9D, // 10B9D..10BA8; UNKNOWN + 0x10BA9, // 10BA9..10BAF; PSALTER_PAHLAVI + 0x10BB0, // 10BB0..10BFF; UNKNOWN + 0x10C00, // 10C00..10C48; OLD_TURKIC + 0x10C49, // 10C49..10E5F; UNKNOWN + 0x10E60, // 10E60..10E7E; ARABIC + 0x10E7F, // 10E7F..10FFF; UNKNOWN + 0x11000, // 
11000..1104D; BRAHMI + 0x1104E, // 1104E..11051; UNKNOWN + 0x11052, // 11052..1106F; BRAHMI + 0x11070, // 11070..1107E; UNKNOWN + 0x1107F, // 1107F ; BRAHMI + 0x11080, // 11080..110C1; KAITHI + 0x110C2, // 110C2..110CF; UNKNOWN + 0x110D0, // 110D0..110E8; SORA_SOMPENG + 0x110E9, // 110E9..110EF; UNKNOWN + 0x110F0, // 110F0..110F9; SORA_SOMPENG + 0x110FA, // 110FA..110FF; UNKNOWN + 0x11100, // 11100..11134; CHAKMA + 0x11135, // 11135 ; UNKNOWN + 0x11136, // 11136..11143; CHAKMA + 0x11144, // 11144..1114F; UNKNOWN + 0x11150, // 11150..11176; MAHAJANI + 0x11177, // 11177..1117F; UNKNOWN + 0x11180, // 11180..111C8; SHARADA + 0x111C9, // 111C9..111CC; UNKNOWN + 0x111CD, // 111CD ; SHARADA + 0x111CE, // 111CE..111CF; UNKNOWN + 0x111D0, // 111D0..111DA; SHARADA + 0x111DB, // 111DB..111E0; UNKNOWN + 0x111E1, // 111E1..111F4; SINHALA + 0x111F5, // 111F5..111FF; UNKNOWN + 0x11200, // 11200..11211; KHOJKI + 0x11212, // 11212 ; UNKNOWN + 0x11213, // 11213..1123D; KHOJKI + 0x1123E, // 1123E..112AF; UNKNOWN + 0x112B0, // 112B0..112EA; KHUDAWADI + 0x112EB, // 112EB..112EF; UNKNOWN + 0x112F0, // 112F0..112F9; KHUDAWADI + 0x112FA, // 112FA..11300; UNKNOWN + 0x11301, // 11301..11303; GRANTHA + 0x11304, // 11304 ; UNKNOWN + 0x11305, // 11305..1130C; GRANTHA + 0x1130D, // 1130D..1130E; UNKNOWN + 0x1130F, // 1130F..11310; GRANTHA + 0x11311, // 11311..11312; UNKNOWN + 0x11313, // 11313..11328; GRANTHA + 0x11329, // 11329 ; UNKNOWN + 0x1132A, // 1132A..11330; GRANTHA + 0x11331, // 11331 ; UNKNOWN + 0x11332, // 11332..11333; GRANTHA + 0x11334, // 11334 ; UNKNOWN + 0x11335, // 11335..11339; GRANTHA + 0x1133A, // 1133A..1133B; UNKNOWN + 0x1133C, // 1133C..11344; GRANTHA + 0x11345, // 11345..11346; UNKNOWN + 0x11347, // 11347..11348; GRANTHA + 0x11349, // 11349..1134A; UNKNOWN + 0x1134B, // 1134B..1134D; GRANTHA + 0x1134E, // 1134E..11356; UNKNOWN + 0x11357, // 11357 ; GRANTHA + 0x11358, // 11358..1135C; UNKNOWN + 0x1135D, // 1135D..11363; GRANTHA + 0x11364, // 11364..11365; UNKNOWN + 
0x11366, // 11366..1136C; GRANTHA + 0x1136D, // 1136D..1136F; UNKNOWN + 0x11370, // 11370..11374; GRANTHA + 0x11375, // 11375..1147F; UNKNOWN + 0x11480, // 11480..114C7; TIRHUTA + 0x114C8, // 114C8..114CF; UNKNOWN + 0x114D0, // 114D0..114D9; TIRHUTA + 0x114DA, // 114DA..1157F; UNKNOWN + 0x11580, // 11580..115B5; SIDDHAM + 0x115B6, // 115B6..115B7; UNKNOWN + 0x115B8, // 115B8..115C9; SIDDHAM + 0x115CA, // 115CA..115FF; UNKNOWN + 0x11600, // 11600..11644; MODI + 0x11645, // 11645..1164F; UNKNOWN + 0x11650, // 11650..11659; MODI + 0x1165A, // 1165A..1167F; UNKNOWN + 0x11680, // 11680..116B7; TAKRI + 0x116B8, // 116B8..116BF; UNKNOWN + 0x116C0, // 116C0..116C9; TAKRI + 0x116CA, // 116CA..1189F; UNKNOWN + 0x118A0, // 118A0..118F2; WARANG_CITI + 0x118F3, // 118F3..118FE; UNKNOWN + 0x118FF, // 118FF ; WARANG_CITI + 0x11900, // 11900..11ABF; UNKNOWN + 0x11AC0, // 11AC0..11AF8; PAU_CIN_HAU + 0x11AF9, // 11AF9..11FFF; UNKNOWN + 0x12000, // 12000..12398; CUNEIFORM + 0x12399, // 12399..123FF; UNKNOWN + 0x12400, // 12400..1246E; CUNEIFORM + 0x1246F, // 1246F ; UNKNOWN + 0x12470, // 12470..12474; CUNEIFORM + 0x12475, // 12475..12FFF; UNKNOWN + 0x13000, // 13000..1342E; EGYPTIAN_HIEROGLYPHS + 0x1342F, // 1342F..167FF; UNKNOWN 0x16800, // 16800..16A38; BAMUM - 0x16F00, // 16F00..16F9F; MIAO - 0x1B000, // 1B000..1B000; KATAKANA - 0x1B001, // 1B001..1CFFF; HIRAGANA - 0x1D000, // 1D000..1D166; COMMON + 0x16A39, // 16A39..16A3F; UNKNOWN + 0x16A40, // 16A40..16A5E; MRO + 0x16A5F, // 16A5F ; UNKNOWN + 0x16A60, // 16A60..16A69; MRO + 0x16A6A, // 16A6A..16A6D; UNKNOWN + 0x16A6E, // 16A6E..16A6F; MRO + 0x16A70, // 16A70..16ACF; UNKNOWN + 0x16AD0, // 16AD0..16AED; BASSA_VAH + 0x16AEE, // 16AEE..16AEF; UNKNOWN + 0x16AF0, // 16AF0..16AF5; BASSA_VAH + 0x16AF6, // 16AF6..16AFF; UNKNOWN + 0x16B00, // 16B00..16B45; PAHAWH_HMONG + 0x16B46, // 16B46..16B4F; UNKNOWN + 0x16B50, // 16B50..16B59; PAHAWH_HMONG + 0x16B5A, // 16B5A ; UNKNOWN + 0x16B5B, // 16B5B..16B61; PAHAWH_HMONG + 0x16B62, // 16B62 ; 
UNKNOWN + 0x16B63, // 16B63..16B77; PAHAWH_HMONG + 0x16B78, // 16B78..16B7C; UNKNOWN + 0x16B7D, // 16B7D..16B8F; PAHAWH_HMONG + 0x16B90, // 16B90..16EFF; UNKNOWN + 0x16F00, // 16F00..16F44; MIAO + 0x16F45, // 16F45..16F4F; UNKNOWN + 0x16F50, // 16F50..16F7E; MIAO + 0x16F7F, // 16F7F..16F8E; UNKNOWN + 0x16F8F, // 16F8F..16F9F; MIAO + 0x16FA0, // 16FA0..1AFFF; UNKNOWN + 0x1B000, // 1B000 ; KATAKANA + 0x1B001, // 1B001 ; HIRAGANA + 0x1B002, // 1B002..1BBFF; UNKNOWN + 0x1BC00, // 1BC00..1BC6A; DUPLOYAN + 0x1BC6B, // 1BC6B..1BC6F; UNKNOWN + 0x1BC70, // 1BC70..1BC7C; DUPLOYAN + 0x1BC7D, // 1BC7D..1BC7F; UNKNOWN + 0x1BC80, // 1BC80..1BC88; DUPLOYAN + 0x1BC89, // 1BC89..1BC8F; UNKNOWN + 0x1BC90, // 1BC90..1BC99; DUPLOYAN + 0x1BC9A, // 1BC9A..1BC9B; UNKNOWN + 0x1BC9C, // 1BC9C..1BC9F; DUPLOYAN + 0x1BCA0, // 1BCA0..1BCA3; COMMON + 0x1BCA4, // 1BCA4..1CFFF; UNKNOWN + 0x1D000, // 1D000..1D0F5; COMMON + 0x1D0F6, // 1D0F6..1D0FF; UNKNOWN + 0x1D100, // 1D100..1D126; COMMON + 0x1D127, // 1D127..1D128; UNKNOWN + 0x1D129, // 1D129..1D166; COMMON 0x1D167, // 1D167..1D169; INHERITED 0x1D16A, // 1D16A..1D17A; COMMON 0x1D17B, // 1D17B..1D182; INHERITED @@ -4020,354 +5418,1635 @@ 0x1D185, // 1D185..1D18B; INHERITED 0x1D18C, // 1D18C..1D1A9; COMMON 0x1D1AA, // 1D1AA..1D1AD; INHERITED - 0x1D1AE, // 1D1AE..1D1FF; COMMON - 0x1D200, // 1D200..1D2FF; GREEK - 0x1D300, // 1D300..1EDFF; COMMON - 0x1EE00, // 1EE00..1EFFF; ARABIC - 0x1F000, // 1F000..1F1FF; COMMON - 0x1F200, // 1F200..1F200; HIRAGANA - 0x1F201, // 1F210..1FFFF; COMMON - 0x20000, // 20000..E0000; HAN - 0xE0001, // E0001..E00FF; COMMON + 0x1D1AE, // 1D1AE..1D1DD; COMMON + 0x1D1DE, // 1D1DE..1D1FF; UNKNOWN + 0x1D200, // 1D200..1D245; GREEK + 0x1D246, // 1D246..1D2FF; UNKNOWN + 0x1D300, // 1D300..1D356; COMMON + 0x1D357, // 1D357..1D35F; UNKNOWN + 0x1D360, // 1D360..1D371; COMMON + 0x1D372, // 1D372..1D3FF; UNKNOWN + 0x1D400, // 1D400..1D454; COMMON + 0x1D455, // 1D455 ; UNKNOWN + 0x1D456, // 1D456..1D49C; COMMON + 0x1D49D, // 1D49D ; 
UNKNOWN + 0x1D49E, // 1D49E..1D49F; COMMON + 0x1D4A0, // 1D4A0..1D4A1; UNKNOWN + 0x1D4A2, // 1D4A2 ; COMMON + 0x1D4A3, // 1D4A3..1D4A4; UNKNOWN + 0x1D4A5, // 1D4A5..1D4A6; COMMON + 0x1D4A7, // 1D4A7..1D4A8; UNKNOWN + 0x1D4A9, // 1D4A9..1D4AC; COMMON + 0x1D4AD, // 1D4AD ; UNKNOWN + 0x1D4AE, // 1D4AE..1D4B9; COMMON + 0x1D4BA, // 1D4BA ; UNKNOWN + 0x1D4BB, // 1D4BB ; COMMON + 0x1D4BC, // 1D4BC ; UNKNOWN + 0x1D4BD, // 1D4BD..1D4C3; COMMON + 0x1D4C4, // 1D4C4 ; UNKNOWN + 0x1D4C5, // 1D4C5..1D505; COMMON + 0x1D506, // 1D506 ; UNKNOWN + 0x1D507, // 1D507..1D50A; COMMON + 0x1D50B, // 1D50B..1D50C; UNKNOWN + 0x1D50D, // 1D50D..1D514; COMMON + 0x1D515, // 1D515 ; UNKNOWN + 0x1D516, // 1D516..1D51C; COMMON + 0x1D51D, // 1D51D ; UNKNOWN + 0x1D51E, // 1D51E..1D539; COMMON + 0x1D53A, // 1D53A ; UNKNOWN + 0x1D53B, // 1D53B..1D53E; COMMON + 0x1D53F, // 1D53F ; UNKNOWN + 0x1D540, // 1D540..1D544; COMMON + 0x1D545, // 1D545 ; UNKNOWN + 0x1D546, // 1D546 ; COMMON + 0x1D547, // 1D547..1D549; UNKNOWN + 0x1D54A, // 1D54A..1D550; COMMON + 0x1D551, // 1D551 ; UNKNOWN + 0x1D552, // 1D552..1D6A5; COMMON + 0x1D6A6, // 1D6A6..1D6A7; UNKNOWN + 0x1D6A8, // 1D6A8..1D7CB; COMMON + 0x1D7CC, // 1D7CC..1D7CD; UNKNOWN + 0x1D7CE, // 1D7CE..1D7FF; COMMON + 0x1D800, // 1D800..1E7FF; UNKNOWN + 0x1E800, // 1E800..1E8C4; MENDE_KIKAKUI + 0x1E8C5, // 1E8C5..1E8C6; UNKNOWN + 0x1E8C7, // 1E8C7..1E8D6; MENDE_KIKAKUI + 0x1E8D7, // 1E8D7..1EDFF; UNKNOWN + 0x1EE00, // 1EE00..1EE03; ARABIC + 0x1EE04, // 1EE04 ; UNKNOWN + 0x1EE05, // 1EE05..1EE1F; ARABIC + 0x1EE20, // 1EE20 ; UNKNOWN + 0x1EE21, // 1EE21..1EE22; ARABIC + 0x1EE23, // 1EE23 ; UNKNOWN + 0x1EE24, // 1EE24 ; ARABIC + 0x1EE25, // 1EE25..1EE26; UNKNOWN + 0x1EE27, // 1EE27 ; ARABIC + 0x1EE28, // 1EE28 ; UNKNOWN + 0x1EE29, // 1EE29..1EE32; ARABIC + 0x1EE33, // 1EE33 ; UNKNOWN + 0x1EE34, // 1EE34..1EE37; ARABIC + 0x1EE38, // 1EE38 ; UNKNOWN + 0x1EE39, // 1EE39 ; ARABIC + 0x1EE3A, // 1EE3A ; UNKNOWN + 0x1EE3B, // 1EE3B ; ARABIC + 0x1EE3C, // 1EE3C..1EE41; 
UNKNOWN + 0x1EE42, // 1EE42 ; ARABIC + 0x1EE43, // 1EE43..1EE46; UNKNOWN + 0x1EE47, // 1EE47 ; ARABIC + 0x1EE48, // 1EE48 ; UNKNOWN + 0x1EE49, // 1EE49 ; ARABIC + 0x1EE4A, // 1EE4A ; UNKNOWN + 0x1EE4B, // 1EE4B ; ARABIC + 0x1EE4C, // 1EE4C ; UNKNOWN + 0x1EE4D, // 1EE4D..1EE4F; ARABIC + 0x1EE50, // 1EE50 ; UNKNOWN + 0x1EE51, // 1EE51..1EE52; ARABIC + 0x1EE53, // 1EE53 ; UNKNOWN + 0x1EE54, // 1EE54 ; ARABIC + 0x1EE55, // 1EE55..1EE56; UNKNOWN + 0x1EE57, // 1EE57 ; ARABIC + 0x1EE58, // 1EE58 ; UNKNOWN + 0x1EE59, // 1EE59 ; ARABIC + 0x1EE5A, // 1EE5A ; UNKNOWN + 0x1EE5B, // 1EE5B ; ARABIC + 0x1EE5C, // 1EE5C ; UNKNOWN + 0x1EE5D, // 1EE5D ; ARABIC + 0x1EE5E, // 1EE5E ; UNKNOWN + 0x1EE5F, // 1EE5F ; ARABIC + 0x1EE60, // 1EE60 ; UNKNOWN + 0x1EE61, // 1EE61..1EE62; ARABIC + 0x1EE63, // 1EE63 ; UNKNOWN + 0x1EE64, // 1EE64 ; ARABIC + 0x1EE65, // 1EE65..1EE66; UNKNOWN + 0x1EE67, // 1EE67..1EE6A; ARABIC + 0x1EE6B, // 1EE6B ; UNKNOWN + 0x1EE6C, // 1EE6C..1EE72; ARABIC + 0x1EE73, // 1EE73 ; UNKNOWN + 0x1EE74, // 1EE74..1EE77; ARABIC + 0x1EE78, // 1EE78 ; UNKNOWN + 0x1EE79, // 1EE79..1EE7C; ARABIC + 0x1EE7D, // 1EE7D ; UNKNOWN + 0x1EE7E, // 1EE7E ; ARABIC + 0x1EE7F, // 1EE7F ; UNKNOWN + 0x1EE80, // 1EE80..1EE89; ARABIC + 0x1EE8A, // 1EE8A ; UNKNOWN + 0x1EE8B, // 1EE8B..1EE9B; ARABIC + 0x1EE9C, // 1EE9C..1EEA0; UNKNOWN + 0x1EEA1, // 1EEA1..1EEA3; ARABIC + 0x1EEA4, // 1EEA4 ; UNKNOWN + 0x1EEA5, // 1EEA5..1EEA9; ARABIC + 0x1EEAA, // 1EEAA ; UNKNOWN + 0x1EEAB, // 1EEAB..1EEBB; ARABIC + 0x1EEBC, // 1EEBC..1EEEF; UNKNOWN + 0x1EEF0, // 1EEF0..1EEF1; ARABIC + 0x1EEF2, // 1EEF2..1EFFF; UNKNOWN + 0x1F000, // 1F000..1F02B; COMMON + 0x1F02C, // 1F02C..1F02F; UNKNOWN + 0x1F030, // 1F030..1F093; COMMON + 0x1F094, // 1F094..1F09F; UNKNOWN + 0x1F0A0, // 1F0A0..1F0AE; COMMON + 0x1F0AF, // 1F0AF..1F0B0; UNKNOWN + 0x1F0B1, // 1F0B1..1F0BF; COMMON + 0x1F0C0, // 1F0C0 ; UNKNOWN + 0x1F0C1, // 1F0C1..1F0CF; COMMON + 0x1F0D0, // 1F0D0 ; UNKNOWN + 0x1F0D1, // 1F0D1..1F0F5; COMMON + 0x1F0F6, // 
1F0F6..1F0FF; UNKNOWN + 0x1F100, // 1F100..1F10C; COMMON + 0x1F10D, // 1F10D..1F10F; UNKNOWN + 0x1F110, // 1F110..1F12E; COMMON + 0x1F12F, // 1F12F ; UNKNOWN + 0x1F130, // 1F130..1F16B; COMMON + 0x1F16C, // 1F16C..1F16F; UNKNOWN + 0x1F170, // 1F170..1F19A; COMMON + 0x1F19B, // 1F19B..1F1E5; UNKNOWN + 0x1F1E6, // 1F1E6..1F1FF; COMMON + 0x1F200, // 1F200 ; HIRAGANA + 0x1F201, // 1F201..1F202; COMMON + 0x1F203, // 1F203..1F20F; UNKNOWN + 0x1F210, // 1F210..1F23A; COMMON + 0x1F23B, // 1F23B..1F23F; UNKNOWN + 0x1F240, // 1F240..1F248; COMMON + 0x1F249, // 1F249..1F24F; UNKNOWN + 0x1F250, // 1F250..1F251; COMMON + 0x1F252, // 1F252..1F2FF; UNKNOWN + 0x1F300, // 1F300..1F32C; COMMON + 0x1F32D, // 1F32D..1F32F; UNKNOWN + 0x1F330, // 1F330..1F37D; COMMON + 0x1F37E, // 1F37E..1F37F; UNKNOWN + 0x1F380, // 1F380..1F3CE; COMMON + 0x1F3CF, // 1F3CF..1F3D3; UNKNOWN + 0x1F3D4, // 1F3D4..1F3F7; COMMON + 0x1F3F8, // 1F3F8..1F3FF; UNKNOWN + 0x1F400, // 1F400..1F4FE; COMMON + 0x1F4FF, // 1F4FF ; UNKNOWN + 0x1F500, // 1F500..1F54A; COMMON + 0x1F54B, // 1F54B..1F54F; UNKNOWN + 0x1F550, // 1F550..1F579; COMMON + 0x1F57A, // 1F57A ; UNKNOWN + 0x1F57B, // 1F57B..1F5A3; COMMON + 0x1F5A4, // 1F5A4 ; UNKNOWN + 0x1F5A5, // 1F5A5..1F642; COMMON + 0x1F643, // 1F643..1F644; UNKNOWN + 0x1F645, // 1F645..1F6CF; COMMON + 0x1F6D0, // 1F6D0..1F6DF; UNKNOWN + 0x1F6E0, // 1F6E0..1F6EC; COMMON + 0x1F6ED, // 1F6ED..1F6EF; UNKNOWN + 0x1F6F0, // 1F6F0..1F6F3; COMMON + 0x1F6F4, // 1F6F4..1F6FF; UNKNOWN + 0x1F700, // 1F700..1F773; COMMON + 0x1F774, // 1F774..1F77F; UNKNOWN + 0x1F780, // 1F780..1F7D4; COMMON + 0x1F7D5, // 1F7D5..1F7FF; UNKNOWN + 0x1F800, // 1F800..1F80B; COMMON + 0x1F80C, // 1F80C..1F80F; UNKNOWN + 0x1F810, // 1F810..1F847; COMMON + 0x1F848, // 1F848..1F84F; UNKNOWN + 0x1F850, // 1F850..1F859; COMMON + 0x1F85A, // 1F85A..1F85F; UNKNOWN + 0x1F860, // 1F860..1F887; COMMON + 0x1F888, // 1F888..1F88F; UNKNOWN + 0x1F890, // 1F890..1F8AD; COMMON + 0x1F8AE, // 1F8AE..1FFFF; UNKNOWN + 0x20000, // 
20000..2A6D6; HAN + 0x2A6D7, // 2A6D7..2A6FF; UNKNOWN + 0x2A700, // 2A700..2B734; HAN + 0x2B735, // 2B735..2B73F; UNKNOWN + 0x2B740, // 2B740..2B81D; HAN + 0x2B81E, // 2B81E..2F7FF; UNKNOWN + 0x2F800, // 2F800..2FA1D; HAN + 0x2FA1E, // 2FA1E..E0000; UNKNOWN + 0xE0001, // E0001 ; COMMON + 0xE0002, // E0002..E001F; UNKNOWN + 0xE0020, // E0020..E007F; COMMON + 0xE0080, // E0080..E00FF; UNKNOWN 0xE0100, // E0100..E01EF; INHERITED - 0xE01F0 // E01F0..10FFFF; UNKNOWN - + 0xE01F0, // E01F0..10FFFF; UNKNOWN }; private static final UnicodeScript[] scripts = { - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - BOPOMOFO, - COMMON, - INHERITED, - GREEK, - COMMON, - GREEK, - COMMON, - GREEK, - COMMON, - GREEK, - COMMON, - GREEK, - COPTIC, - GREEK, - CYRILLIC, - INHERITED, - CYRILLIC, - ARMENIAN, - COMMON, - ARMENIAN, - HEBREW, - ARABIC, - COMMON, - ARABIC, - COMMON, - ARABIC, - COMMON, - ARABIC, - COMMON, - ARABIC, - INHERITED, - ARABIC, - COMMON, - ARABIC, - INHERITED, - ARABIC, - COMMON, - ARABIC, - SYRIAC, - ARABIC, - THAANA, - NKO, - SAMARITAN, - MANDAIC, - ARABIC, - DEVANAGARI, - INHERITED, - DEVANAGARI, - COMMON, - DEVANAGARI, - BENGALI, - GURMUKHI, - GUJARATI, - ORIYA, - TAMIL, - TELUGU, - KANNADA, - MALAYALAM, - SINHALA, - THAI, - COMMON, - THAI, - LAO, - TIBETAN, - COMMON, - TIBETAN, - MYANMAR, - GEORGIAN, - COMMON, - GEORGIAN, - HANGUL, - ETHIOPIC, - CHEROKEE, - CANADIAN_ABORIGINAL, - OGHAM, - RUNIC, - COMMON, - RUNIC, - TAGALOG, - HANUNOO, - COMMON, - BUHID, - TAGBANWA, - KHMER, - MONGOLIAN, - COMMON, - MONGOLIAN, - COMMON, - MONGOLIAN, - CANADIAN_ABORIGINAL, - LIMBU, - TAI_LE, - NEW_TAI_LUE, - KHMER, - BUGINESE, - TAI_THAM, - BALINESE, - SUNDANESE, - BATAK, - LEPCHA, - OL_CHIKI, - SUNDANESE, - INHERITED, - COMMON, - INHERITED, - COMMON, - INHERITED, - COMMON, - INHERITED, - COMMON, - INHERITED, - COMMON, - LATIN, - GREEK, - CYRILLIC, - LATIN, - GREEK, - 
LATIN, - GREEK, - LATIN, - CYRILLIC, - LATIN, - GREEK, - INHERITED, - LATIN, - GREEK, - COMMON, - INHERITED, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - INHERITED, - COMMON, - GREEK, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - BRAILLE, - COMMON, - GLAGOLITIC, - LATIN, - COPTIC, - GEORGIAN, - TIFINAGH, - ETHIOPIC, - CYRILLIC, - COMMON, - HAN, - COMMON, - HAN, - COMMON, - HAN, - COMMON, - HAN, - INHERITED, - HANGUL, - COMMON, - HAN, - COMMON, - HIRAGANA, - INHERITED, - COMMON, - HIRAGANA, - COMMON, - KATAKANA, - COMMON, - KATAKANA, - BOPOMOFO, - HANGUL, - COMMON, - BOPOMOFO, - COMMON, - KATAKANA, - HANGUL, - COMMON, - HANGUL, - COMMON, - KATAKANA, - COMMON, - HAN, - COMMON, - HAN, - YI, - LISU, - VAI, - CYRILLIC, - BAMUM, - COMMON, - LATIN, - COMMON, - LATIN, - SYLOTI_NAGRI, - COMMON, - PHAGS_PA, - SAURASHTRA, - DEVANAGARI, - KAYAH_LI, - REJANG, - HANGUL, - JAVANESE, - CHAM, - MYANMAR, - TAI_VIET, - MEETEI_MAYEK, - ETHIOPIC, - MEETEI_MAYEK, - HANGUL, - UNKNOWN , - HAN, - LATIN, - ARMENIAN, - HEBREW, - ARABIC, - COMMON, - ARABIC, - COMMON, - INHERITED, - COMMON, - INHERITED, - COMMON, - ARABIC, - COMMON, - LATIN, - COMMON, - LATIN, - COMMON, - KATAKANA, - COMMON, - KATAKANA, - COMMON, - HANGUL, - COMMON, - LINEAR_B, - COMMON, - GREEK, - COMMON, - INHERITED, - LYCIAN, - CARIAN, - OLD_ITALIC, - GOTHIC, - UGARITIC, - OLD_PERSIAN, - DESERET, - SHAVIAN, - OSMANYA, - CYPRIOT, - IMPERIAL_ARAMAIC, - PHOENICIAN, - LYDIAN, - MEROITIC_HIEROGLYPHS, - MEROITIC_CURSIVE, - KHAROSHTHI, - OLD_SOUTH_ARABIAN, - AVESTAN, - INSCRIPTIONAL_PARTHIAN, - INSCRIPTIONAL_PAHLAVI, - OLD_TURKIC, - ARABIC, - BRAHMI, - KAITHI, - SORA_SOMPENG, - CHAKMA, - SHARADA, - TAKRI, - CUNEIFORM, - EGYPTIAN_HIEROGLYPHS, - BAMUM, - MIAO, - KATAKANA, - HIRAGANA, - COMMON, - INHERITED, - COMMON, - INHERITED, - COMMON, - INHERITED, - COMMON, - INHERITED, - COMMON, - GREEK, - COMMON, - ARABIC, - COMMON, - HIRAGANA, - COMMON, - HAN, - COMMON, - 
INHERITED, - UNKNOWN + COMMON, // 0000..0040 + LATIN, // 0041..005A + COMMON, // 005B..0060 + LATIN, // 0061..007A + COMMON, // 007B..00A9 + LATIN, // 00AA + COMMON, // 00AB..00B9 + LATIN, // 00BA + COMMON, // 00BB..00BF + LATIN, // 00C0..00D6 + COMMON, // 00D7 + LATIN, // 00D8..00F6 + COMMON, // 00F7 + LATIN, // 00F8..02B8 + COMMON, // 02B9..02DF + LATIN, // 02E0..02E4 + COMMON, // 02E5..02E9 + BOPOMOFO, // 02EA..02EB + COMMON, // 02EC..02FF + INHERITED, // 0300..036F + GREEK, // 0370..0373 + COMMON, // 0374 + GREEK, // 0375..0377 + UNKNOWN, // 0378..0379 + GREEK, // 037A..037D + COMMON, // 037E + GREEK, // 037F + UNKNOWN, // 0380..0383 + GREEK, // 0384 + COMMON, // 0385 + GREEK, // 0386 + COMMON, // 0387 + GREEK, // 0388..038A + UNKNOWN, // 038B + GREEK, // 038C + UNKNOWN, // 038D + GREEK, // 038E..03A1 + UNKNOWN, // 03A2 + GREEK, // 03A3..03E1 + COPTIC, // 03E2..03EF + GREEK, // 03F0..03FF + CYRILLIC, // 0400..0484 + INHERITED, // 0485..0486 + CYRILLIC, // 0487..052F + UNKNOWN, // 0530 + ARMENIAN, // 0531..0556 + UNKNOWN, // 0557..0558 + ARMENIAN, // 0559..055F + UNKNOWN, // 0560 + ARMENIAN, // 0561..0587 + UNKNOWN, // 0588 + COMMON, // 0589 + ARMENIAN, // 058A + UNKNOWN, // 058B..058C + ARMENIAN, // 058D..058F + UNKNOWN, // 0590 + HEBREW, // 0591..05C7 + UNKNOWN, // 05C8..05CF + HEBREW, // 05D0..05EA + UNKNOWN, // 05EB..05EF + HEBREW, // 05F0..05F4 + UNKNOWN, // 05F5..05FF + ARABIC, // 0600..0604 + COMMON, // 0605 + ARABIC, // 0606..060B + COMMON, // 060C + ARABIC, // 060D..061A + COMMON, // 061B..061C + UNKNOWN, // 061D + ARABIC, // 061E + COMMON, // 061F + ARABIC, // 0620..063F + COMMON, // 0640 + ARABIC, // 0641..064A + INHERITED, // 064B..0655 + ARABIC, // 0656..065F + COMMON, // 0660..0669 + ARABIC, // 066A..066F + INHERITED, // 0670 + ARABIC, // 0671..06DC + COMMON, // 06DD + ARABIC, // 06DE..06FF + SYRIAC, // 0700..070D + UNKNOWN, // 070E + SYRIAC, // 070F..074A + UNKNOWN, // 074B..074C + SYRIAC, // 074D..074F + ARABIC, // 0750..077F + THAANA, // 
0780..07B1 + UNKNOWN, // 07B2..07BF + NKO, // 07C0..07FA + UNKNOWN, // 07FB..07FF + SAMARITAN, // 0800..082D + UNKNOWN, // 082E..082F + SAMARITAN, // 0830..083E + UNKNOWN, // 083F + MANDAIC, // 0840..085B + UNKNOWN, // 085C..085D + MANDAIC, // 085E + UNKNOWN, // 085F..089F + ARABIC, // 08A0..08B2 + UNKNOWN, // 08B3..08E3 + ARABIC, // 08E4..08FF + DEVANAGARI, // 0900..0950 + INHERITED, // 0951..0952 + DEVANAGARI, // 0953..0963 + COMMON, // 0964..0965 + DEVANAGARI, // 0966..097F + BENGALI, // 0980..0983 + UNKNOWN, // 0984 + BENGALI, // 0985..098C + UNKNOWN, // 098D..098E + BENGALI, // 098F..0990 + UNKNOWN, // 0991..0992 + BENGALI, // 0993..09A8 + UNKNOWN, // 09A9 + BENGALI, // 09AA..09B0 + UNKNOWN, // 09B1 + BENGALI, // 09B2 + UNKNOWN, // 09B3..09B5 + BENGALI, // 09B6..09B9 + UNKNOWN, // 09BA..09BB + BENGALI, // 09BC..09C4 + UNKNOWN, // 09C5..09C6 + BENGALI, // 09C7..09C8 + UNKNOWN, // 09C9..09CA + BENGALI, // 09CB..09CE + UNKNOWN, // 09CF..09D6 + BENGALI, // 09D7 + UNKNOWN, // 09D8..09DB + BENGALI, // 09DC..09DD + UNKNOWN, // 09DE + BENGALI, // 09DF..09E3 + UNKNOWN, // 09E4..09E5 + BENGALI, // 09E6..09FB + UNKNOWN, // 09FC..0A00 + GURMUKHI, // 0A01..0A03 + UNKNOWN, // 0A04 + GURMUKHI, // 0A05..0A0A + UNKNOWN, // 0A0B..0A0E + GURMUKHI, // 0A0F..0A10 + UNKNOWN, // 0A11..0A12 + GURMUKHI, // 0A13..0A28 + UNKNOWN, // 0A29 + GURMUKHI, // 0A2A..0A30 + UNKNOWN, // 0A31 + GURMUKHI, // 0A32..0A33 + UNKNOWN, // 0A34 + GURMUKHI, // 0A35..0A36 + UNKNOWN, // 0A37 + GURMUKHI, // 0A38..0A39 + UNKNOWN, // 0A3A..0A3B + GURMUKHI, // 0A3C + UNKNOWN, // 0A3D + GURMUKHI, // 0A3E..0A42 + UNKNOWN, // 0A43..0A46 + GURMUKHI, // 0A47..0A48 + UNKNOWN, // 0A49..0A4A + GURMUKHI, // 0A4B..0A4D + UNKNOWN, // 0A4E..0A50 + GURMUKHI, // 0A51 + UNKNOWN, // 0A52..0A58 + GURMUKHI, // 0A59..0A5C + UNKNOWN, // 0A5D + GURMUKHI, // 0A5E + UNKNOWN, // 0A5F..0A65 + GURMUKHI, // 0A66..0A75 + UNKNOWN, // 0A76..0A80 + GUJARATI, // 0A81..0A83 + UNKNOWN, // 0A84 + GUJARATI, // 0A85..0A8D + UNKNOWN, // 0A8E + 
GUJARATI, // 0A8F..0A91 + UNKNOWN, // 0A92 + GUJARATI, // 0A93..0AA8 + UNKNOWN, // 0AA9 + GUJARATI, // 0AAA..0AB0 + UNKNOWN, // 0AB1 + GUJARATI, // 0AB2..0AB3 + UNKNOWN, // 0AB4 + GUJARATI, // 0AB5..0AB9 + UNKNOWN, // 0ABA..0ABB + GUJARATI, // 0ABC..0AC5 + UNKNOWN, // 0AC6 + GUJARATI, // 0AC7..0AC9 + UNKNOWN, // 0ACA + GUJARATI, // 0ACB..0ACD + UNKNOWN, // 0ACE..0ACF + GUJARATI, // 0AD0 + UNKNOWN, // 0AD1..0ADF + GUJARATI, // 0AE0..0AE3 + UNKNOWN, // 0AE4..0AE5 + GUJARATI, // 0AE6..0AF1 + UNKNOWN, // 0AF2..0B00 + ORIYA, // 0B01..0B03 + UNKNOWN, // 0B04 + ORIYA, // 0B05..0B0C + UNKNOWN, // 0B0D..0B0E + ORIYA, // 0B0F..0B10 + UNKNOWN, // 0B11..0B12 + ORIYA, // 0B13..0B28 + UNKNOWN, // 0B29 + ORIYA, // 0B2A..0B30 + UNKNOWN, // 0B31 + ORIYA, // 0B32..0B33 + UNKNOWN, // 0B34 + ORIYA, // 0B35..0B39 + UNKNOWN, // 0B3A..0B3B + ORIYA, // 0B3C..0B44 + UNKNOWN, // 0B45..0B46 + ORIYA, // 0B47..0B48 + UNKNOWN, // 0B49..0B4A + ORIYA, // 0B4B..0B4D + UNKNOWN, // 0B4E..0B55 + ORIYA, // 0B56..0B57 + UNKNOWN, // 0B58..0B5B + ORIYA, // 0B5C..0B5D + UNKNOWN, // 0B5E + ORIYA, // 0B5F..0B63 + UNKNOWN, // 0B64..0B65 + ORIYA, // 0B66..0B77 + UNKNOWN, // 0B78..0B81 + TAMIL, // 0B82..0B83 + UNKNOWN, // 0B84 + TAMIL, // 0B85..0B8A + UNKNOWN, // 0B8B..0B8D + TAMIL, // 0B8E..0B90 + UNKNOWN, // 0B91 + TAMIL, // 0B92..0B95 + UNKNOWN, // 0B96..0B98 + TAMIL, // 0B99..0B9A + UNKNOWN, // 0B9B + TAMIL, // 0B9C + UNKNOWN, // 0B9D + TAMIL, // 0B9E..0B9F + UNKNOWN, // 0BA0..0BA2 + TAMIL, // 0BA3..0BA4 + UNKNOWN, // 0BA5..0BA7 + TAMIL, // 0BA8..0BAA + UNKNOWN, // 0BAB..0BAD + TAMIL, // 0BAE..0BB9 + UNKNOWN, // 0BBA..0BBD + TAMIL, // 0BBE..0BC2 + UNKNOWN, // 0BC3..0BC5 + TAMIL, // 0BC6..0BC8 + UNKNOWN, // 0BC9 + TAMIL, // 0BCA..0BCD + UNKNOWN, // 0BCE..0BCF + TAMIL, // 0BD0 + UNKNOWN, // 0BD1..0BD6 + TAMIL, // 0BD7 + UNKNOWN, // 0BD8..0BE5 + TAMIL, // 0BE6..0BFA + UNKNOWN, // 0BFB..0BFF + TELUGU, // 0C00..0C03 + UNKNOWN, // 0C04 + TELUGU, // 0C05..0C0C + UNKNOWN, // 0C0D + TELUGU, // 0C0E..0C10 + UNKNOWN, 
// 0C11 + TELUGU, // 0C12..0C28 + UNKNOWN, // 0C29 + TELUGU, // 0C2A..0C39 + UNKNOWN, // 0C3A..0C3C + TELUGU, // 0C3D..0C44 + UNKNOWN, // 0C45 + TELUGU, // 0C46..0C48 + UNKNOWN, // 0C49 + TELUGU, // 0C4A..0C4D + UNKNOWN, // 0C4E..0C54 + TELUGU, // 0C55..0C56 + UNKNOWN, // 0C57 + TELUGU, // 0C58..0C59 + UNKNOWN, // 0C5A..0C5F + TELUGU, // 0C60..0C63 + UNKNOWN, // 0C64..0C65 + TELUGU, // 0C66..0C6F + UNKNOWN, // 0C70..0C77 + TELUGU, // 0C78..0C7F + UNKNOWN, // 0C80 + KANNADA, // 0C81..0C83 + UNKNOWN, // 0C84 + KANNADA, // 0C85..0C8C + UNKNOWN, // 0C8D + KANNADA, // 0C8E..0C90 + UNKNOWN, // 0C91 + KANNADA, // 0C92..0CA8 + UNKNOWN, // 0CA9 + KANNADA, // 0CAA..0CB3 + UNKNOWN, // 0CB4 + KANNADA, // 0CB5..0CB9 + UNKNOWN, // 0CBA..0CBB + KANNADA, // 0CBC..0CC4 + UNKNOWN, // 0CC5 + KANNADA, // 0CC6..0CC8 + UNKNOWN, // 0CC9 + KANNADA, // 0CCA..0CCD + UNKNOWN, // 0CCE..0CD4 + KANNADA, // 0CD5..0CD6 + UNKNOWN, // 0CD7..0CDD + KANNADA, // 0CDE + UNKNOWN, // 0CDF + KANNADA, // 0CE0..0CE3 + UNKNOWN, // 0CE4..0CE5 + KANNADA, // 0CE6..0CEF + UNKNOWN, // 0CF0 + KANNADA, // 0CF1..0CF2 + UNKNOWN, // 0CF3..0D00 + MALAYALAM, // 0D01..0D03 + UNKNOWN, // 0D04 + MALAYALAM, // 0D05..0D0C + UNKNOWN, // 0D0D + MALAYALAM, // 0D0E..0D10 + UNKNOWN, // 0D11 + MALAYALAM, // 0D12..0D3A + UNKNOWN, // 0D3B..0D3C + MALAYALAM, // 0D3D..0D44 + UNKNOWN, // 0D45 + MALAYALAM, // 0D46..0D48 + UNKNOWN, // 0D49 + MALAYALAM, // 0D4A..0D4E + UNKNOWN, // 0D4F..0D56 + MALAYALAM, // 0D57 + UNKNOWN, // 0D58..0D5F + MALAYALAM, // 0D60..0D63 + UNKNOWN, // 0D64..0D65 + MALAYALAM, // 0D66..0D75 + UNKNOWN, // 0D76..0D78 + MALAYALAM, // 0D79..0D7F + UNKNOWN, // 0D80..0D81 + SINHALA, // 0D82..0D83 + UNKNOWN, // 0D84 + SINHALA, // 0D85..0D96 + UNKNOWN, // 0D97..0D99 + SINHALA, // 0D9A..0DB1 + UNKNOWN, // 0DB2 + SINHALA, // 0DB3..0DBB + UNKNOWN, // 0DBC + SINHALA, // 0DBD + UNKNOWN, // 0DBE..0DBF + SINHALA, // 0DC0..0DC6 + UNKNOWN, // 0DC7..0DC9 + SINHALA, // 0DCA + UNKNOWN, // 0DCB..0DCE + SINHALA, // 0DCF..0DD4 + UNKNOWN, 
// 0DD5 + SINHALA, // 0DD6 + UNKNOWN, // 0DD7 + SINHALA, // 0DD8..0DDF + UNKNOWN, // 0DE0..0DE5 + SINHALA, // 0DE6..0DEF + UNKNOWN, // 0DF0..0DF1 + SINHALA, // 0DF2..0DF4 + UNKNOWN, // 0DF5..0E00 + THAI, // 0E01..0E3A + UNKNOWN, // 0E3B..0E3E + COMMON, // 0E3F + THAI, // 0E40..0E5B + UNKNOWN, // 0E5C..0E80 + LAO, // 0E81..0E82 + UNKNOWN, // 0E83 + LAO, // 0E84 + UNKNOWN, // 0E85..0E86 + LAO, // 0E87..0E88 + UNKNOWN, // 0E89 + LAO, // 0E8A + UNKNOWN, // 0E8B..0E8C + LAO, // 0E8D + UNKNOWN, // 0E8E..0E93 + LAO, // 0E94..0E97 + UNKNOWN, // 0E98 + LAO, // 0E99..0E9F + UNKNOWN, // 0EA0 + LAO, // 0EA1..0EA3 + UNKNOWN, // 0EA4 + LAO, // 0EA5 + UNKNOWN, // 0EA6 + LAO, // 0EA7 + UNKNOWN, // 0EA8..0EA9 + LAO, // 0EAA..0EAB + UNKNOWN, // 0EAC + LAO, // 0EAD..0EB9 + UNKNOWN, // 0EBA + LAO, // 0EBB..0EBD + UNKNOWN, // 0EBE..0EBF + LAO, // 0EC0..0EC4 + UNKNOWN, // 0EC5 + LAO, // 0EC6 + UNKNOWN, // 0EC7 + LAO, // 0EC8..0ECD + UNKNOWN, // 0ECE..0ECF + LAO, // 0ED0..0ED9 + UNKNOWN, // 0EDA..0EDB + LAO, // 0EDC..0EDF + UNKNOWN, // 0EE0..0EFF + TIBETAN, // 0F00..0F47 + UNKNOWN, // 0F48 + TIBETAN, // 0F49..0F6C + UNKNOWN, // 0F6D..0F70 + TIBETAN, // 0F71..0F97 + UNKNOWN, // 0F98 + TIBETAN, // 0F99..0FBC + UNKNOWN, // 0FBD + TIBETAN, // 0FBE..0FCC + UNKNOWN, // 0FCD + TIBETAN, // 0FCE..0FD4 + COMMON, // 0FD5..0FD8 + TIBETAN, // 0FD9..0FDA + UNKNOWN, // 0FDB..FFF + MYANMAR, // 1000..109F + GEORGIAN, // 10A0..10C5 + UNKNOWN, // 10C6 + GEORGIAN, // 10C7 + UNKNOWN, // 10C8..10CC + GEORGIAN, // 10CD + UNKNOWN, // 10CE..10CF + GEORGIAN, // 10D0..10FA + COMMON, // 10FB + GEORGIAN, // 10FC..10FF + HANGUL, // 1100..11FF + ETHIOPIC, // 1200..1248 + UNKNOWN, // 1249 + ETHIOPIC, // 124A..124D + UNKNOWN, // 124E..124F + ETHIOPIC, // 1250..1256 + UNKNOWN, // 1257 + ETHIOPIC, // 1258 + UNKNOWN, // 1259 + ETHIOPIC, // 125A..125D + UNKNOWN, // 125E..125F + ETHIOPIC, // 1260..1288 + UNKNOWN, // 1289 + ETHIOPIC, // 128A..128D + UNKNOWN, // 128E..128F + ETHIOPIC, // 1290..12B0 + UNKNOWN, // 12B1 + 
ETHIOPIC, // 12B2..12B5 + UNKNOWN, // 12B6..12B7 + ETHIOPIC, // 12B8..12BE + UNKNOWN, // 12BF + ETHIOPIC, // 12C0 + UNKNOWN, // 12C1 + ETHIOPIC, // 12C2..12C5 + UNKNOWN, // 12C6..12C7 + ETHIOPIC, // 12C8..12D6 + UNKNOWN, // 12D7 + ETHIOPIC, // 12D8..1310 + UNKNOWN, // 1311 + ETHIOPIC, // 1312..1315 + UNKNOWN, // 1316..1317 + ETHIOPIC, // 1318..135A + UNKNOWN, // 135B..135C + ETHIOPIC, // 135D..137C + UNKNOWN, // 137D..137F + ETHIOPIC, // 1380..1399 + UNKNOWN, // 139A..139F + CHEROKEE, // 13A0..13F4 + UNKNOWN, // 13F5..13FF + CANADIAN_ABORIGINAL, // 1400..167F + OGHAM, // 1680..169C + UNKNOWN, // 169D..169F + RUNIC, // 16A0..16EA + COMMON, // 16EB..16ED + RUNIC, // 16EE..16F8 + UNKNOWN, // 16F9..16FF + TAGALOG, // 1700..170C + UNKNOWN, // 170D + TAGALOG, // 170E..1714 + UNKNOWN, // 1715..171F + HANUNOO, // 1720..1734 + COMMON, // 1735..1736 + UNKNOWN, // 1737..173F + BUHID, // 1740..1753 + UNKNOWN, // 1754..175F + TAGBANWA, // 1760..176C + UNKNOWN, // 176D + TAGBANWA, // 176E..1770 + UNKNOWN, // 1771 + TAGBANWA, // 1772..1773 + UNKNOWN, // 1774..177F + KHMER, // 1780..17DD + UNKNOWN, // 17DE..17DF + KHMER, // 17E0..17E9 + UNKNOWN, // 17EA..17EF + KHMER, // 17F0..17F9 + UNKNOWN, // 17FA..17FF + MONGOLIAN, // 1800..1801 + COMMON, // 1802..1803 + MONGOLIAN, // 1804 + COMMON, // 1805 + MONGOLIAN, // 1806..180E + UNKNOWN, // 180F + MONGOLIAN, // 1810..1819 + UNKNOWN, // 181A..181F + MONGOLIAN, // 1820..1877 + UNKNOWN, // 1878..187F + MONGOLIAN, // 1880..18AA + UNKNOWN, // 18AB..18AF + CANADIAN_ABORIGINAL, // 18B0..18F5 + UNKNOWN, // 18F6..18FF + LIMBU, // 1900..191E + UNKNOWN, // 191F + LIMBU, // 1920..192B + UNKNOWN, // 192C..192F + LIMBU, // 1930..193B + UNKNOWN, // 193C..193F + LIMBU, // 1940 + UNKNOWN, // 1941..1943 + LIMBU, // 1944..194F + TAI_LE, // 1950..196D + UNKNOWN, // 196E..196F + TAI_LE, // 1970..1974 + UNKNOWN, // 1975..197F + NEW_TAI_LUE, // 1980..19AB + UNKNOWN, // 19AC..19AF + NEW_TAI_LUE, // 19B0..19C9 + UNKNOWN, // 19CA..19CF + NEW_TAI_LUE, // 
19D0..19DA + UNKNOWN, // 19DB..19DD + NEW_TAI_LUE, // 19DE..19DF + KHMER, // 19E0..19FF + BUGINESE, // 1A00..1A1B + UNKNOWN, // 1A1C..1A1D + BUGINESE, // 1A1E..1A1F + TAI_THAM, // 1A20..1A5E + UNKNOWN, // 1A5F + TAI_THAM, // 1A60..1A7C + UNKNOWN, // 1A7D..1A7E + TAI_THAM, // 1A7F..1A89 + UNKNOWN, // 1A8A..1A8F + TAI_THAM, // 1A90..1A99 + UNKNOWN, // 1A9A..1A9F + TAI_THAM, // 1AA0..1AAD + UNKNOWN, // 1AAE..1AAF + INHERITED, // 1AB0..1ABE + UNKNOWN, // 1ABF..1AFF + BALINESE, // 1B00..1B4B + UNKNOWN, // 1B4C..1B4F + BALINESE, // 1B50..1B7C + UNKNOWN, // 1B7D..1B7F + SUNDANESE, // 1B80..1BBF + BATAK, // 1BC0..1BF3 + UNKNOWN, // 1BF4..1BFB + BATAK, // 1BFC..1BFF + LEPCHA, // 1C00..1C37 + UNKNOWN, // 1C38..1C3A + LEPCHA, // 1C3B..1C49 + UNKNOWN, // 1C4A..1C4C + LEPCHA, // 1C4D..1C4F + OL_CHIKI, // 1C50..1C7F + UNKNOWN, // 1C80..1CBF + SUNDANESE, // 1CC0..1CC7 + UNKNOWN, // 1CC8..1CCF + INHERITED, // 1CD0..1CD2 + COMMON, // 1CD3 + INHERITED, // 1CD4..1CE0 + COMMON, // 1CE1 + INHERITED, // 1CE2..1CE8 + COMMON, // 1CE9..1CEC + INHERITED, // 1CED + COMMON, // 1CEE..1CF3 + INHERITED, // 1CF4 + COMMON, // 1CF5..1CF6 + UNKNOWN, // 1CF7 + INHERITED, // 1CF8..1CF9 + UNKNOWN, // 1CFA..1CFF + LATIN, // 1D00..1D25 + GREEK, // 1D26..1D2A + CYRILLIC, // 1D2B + LATIN, // 1D2C..1D5C + GREEK, // 1D5D..1D61 + LATIN, // 1D62..1D65 + GREEK, // 1D66..1D6A + LATIN, // 1D6B..1D77 + CYRILLIC, // 1D78 + LATIN, // 1D79..1DBE + GREEK, // 1DBF + INHERITED, // 1DC0..1DF5 + UNKNOWN, // 1DF6..1DFB + INHERITED, // 1DFC..1DFF + LATIN, // 1E00..1EFF + GREEK, // 1F00..1F15 + UNKNOWN, // 1F16..1F17 + GREEK, // 1F18..1F1D + UNKNOWN, // 1F1E..1F1F + GREEK, // 1F20..1F45 + UNKNOWN, // 1F46..1F47 + GREEK, // 1F48..1F4D + UNKNOWN, // 1F4E..1F4F + GREEK, // 1F50..1F57 + UNKNOWN, // 1F58 + GREEK, // 1F59 + UNKNOWN, // 1F5A + GREEK, // 1F5B + UNKNOWN, // 1F5C + GREEK, // 1F5D + UNKNOWN, // 1F5E + GREEK, // 1F5F..1F7D + UNKNOWN, // 1F7E..1F7F + GREEK, // 1F80..1FB4 + UNKNOWN, // 1FB5 + GREEK, // 1FB6..1FC4 + 
UNKNOWN, // 1FC5 + GREEK, // 1FC6..1FD3 + UNKNOWN, // 1FD4..1FD5 + GREEK, // 1FD6..1FDB + UNKNOWN, // 1FDC + GREEK, // 1FDD..1FEF + UNKNOWN, // 1FF0..1FF1 + GREEK, // 1FF2..1FF4 + UNKNOWN, // 1FF5 + GREEK, // 1FF6..1FFE + UNKNOWN, // 1FFF + COMMON, // 2000..200B + INHERITED, // 200C..200D + COMMON, // 200E..2064 + UNKNOWN, // 2065 + COMMON, // 2066..2070 + LATIN, // 2071 + UNKNOWN, // 2072..2073 + COMMON, // 2074..207E + LATIN, // 207F + COMMON, // 2080..208E + UNKNOWN, // 208F + LATIN, // 2090..209C + UNKNOWN, // 209D..209F + COMMON, // 20A0..20BD + UNKNOWN, // 20BE..20CF + INHERITED, // 20D0..20F0 + UNKNOWN, // 20F1..20FF + COMMON, // 2100..2125 + GREEK, // 2126 + COMMON, // 2127..2129 + LATIN, // 212A..212B + COMMON, // 212C..2131 + LATIN, // 2132 + COMMON, // 2133..214D + LATIN, // 214E + COMMON, // 214F..215F + LATIN, // 2160..2188 + COMMON, // 2189 + UNKNOWN, // 218A..218F + COMMON, // 2190..23FA + UNKNOWN, // 23FB..23FF + COMMON, // 2400..2426 + UNKNOWN, // 2427..243F + COMMON, // 2440..244A + UNKNOWN, // 244B..245F + COMMON, // 2460..27FF + BRAILLE, // 2800..28FF + COMMON, // 2900..2B73 + UNKNOWN, // 2B74..2B75 + COMMON, // 2B76..2B95 + UNKNOWN, // 2B96..2B97 + COMMON, // 2B98..2BB9 + UNKNOWN, // 2BBA..2BBC + COMMON, // 2BBD..2BC8 + UNKNOWN, // 2BC9 + COMMON, // 2BCA..2BD1 + UNKNOWN, // 2BD2..2BFF + GLAGOLITIC, // 2C00..2C2E + UNKNOWN, // 2C2F + GLAGOLITIC, // 2C30..2C5E + UNKNOWN, // 2C5F + LATIN, // 2C60..2C7F + COPTIC, // 2C80..2CF3 + UNKNOWN, // 2CF4..2CF8 + COPTIC, // 2CF9..2CFF + GEORGIAN, // 2D00..2D25 + UNKNOWN, // 2D26 + GEORGIAN, // 2D27 + UNKNOWN, // 2D28..2D2C + GEORGIAN, // 2D2D + UNKNOWN, // 2D2E..2D2F + TIFINAGH, // 2D30..2D67 + UNKNOWN, // 2D68..2D6E + TIFINAGH, // 2D6F..2D70 + UNKNOWN, // 2D71..2D7E + TIFINAGH, // 2D7F + ETHIOPIC, // 2D80..2D96 + UNKNOWN, // 2D97..2D9F + ETHIOPIC, // 2DA0..2DA6 + UNKNOWN, // 2DA7 + ETHIOPIC, // 2DA8..2DAE + UNKNOWN, // 2DAF + ETHIOPIC, // 2DB0..2DB6 + UNKNOWN, // 2DB7 + ETHIOPIC, // 2DB8..2DBE + UNKNOWN, // 
2DBF + ETHIOPIC, // 2DC0..2DC6 + UNKNOWN, // 2DC7 + ETHIOPIC, // 2DC8..2DCE + UNKNOWN, // 2DCF + ETHIOPIC, // 2DD0..2DD6 + UNKNOWN, // 2DD7 + ETHIOPIC, // 2DD8..2DDE + UNKNOWN, // 2DDF + CYRILLIC, // 2DE0..2DFF + COMMON, // 2E00..2E42 + UNKNOWN, // 2E43..2E7F + HAN, // 2E80..2E99 + UNKNOWN, // 2E9A + HAN, // 2E9B..2EF3 + UNKNOWN, // 2EF4..2EFF + HAN, // 2F00..2FD5 + UNKNOWN, // 2FD6..2FEF + COMMON, // 2FF0..2FFB + UNKNOWN, // 2FFC..2FFF + COMMON, // 3000..3004 + HAN, // 3005 + COMMON, // 3006 + HAN, // 3007 + COMMON, // 3008..3020 + HAN, // 3021..3029 + INHERITED, // 302A..302D + HANGUL, // 302E..302F + COMMON, // 3030..3037 + HAN, // 3038..303B + COMMON, // 303C..303F + UNKNOWN, // 3040 + HIRAGANA, // 3041..3096 + UNKNOWN, // 3097..3098 + INHERITED, // 3099..309A + COMMON, // 309B..309C + HIRAGANA, // 309D..309F + COMMON, // 30A0 + KATAKANA, // 30A1..30FA + COMMON, // 30FB..30FC + KATAKANA, // 30FD..30FF + UNKNOWN, // 3100..3104 + BOPOMOFO, // 3105..312D + UNKNOWN, // 312E..3130 + HANGUL, // 3131..318E + UNKNOWN, // 318F + COMMON, // 3190..319F + BOPOMOFO, // 31A0..31BA + UNKNOWN, // 31BB..31BF + COMMON, // 31C0..31E3 + UNKNOWN, // 31E4..31EF + KATAKANA, // 31F0..31FF + HANGUL, // 3200..321E + UNKNOWN, // 321F + COMMON, // 3220..325F + HANGUL, // 3260..327E + COMMON, // 327F..32CF + KATAKANA, // 32D0..32FE + UNKNOWN, // 32FF + KATAKANA, // 3300..3357 + COMMON, // 3358..33FF + HAN, // 3400..4DB5 + UNKNOWN, // 4DB6..4DBF + COMMON, // 4DC0..4DFF + HAN, // 4E00..9FCC + UNKNOWN, // 9FCD..9FFF + YI, // A000..A48C + UNKNOWN, // A48D..A48F + YI, // A490..A4C6 + UNKNOWN, // A4C7..A4CF + LISU, // A4D0..A4FF + VAI, // A500..A62B + UNKNOWN, // A62C..A63F + CYRILLIC, // A640..A69D + UNKNOWN, // A69E + CYRILLIC, // A69F + BAMUM, // A6A0..A6F7 + UNKNOWN, // A6F8..A6FF + COMMON, // A700..A721 + LATIN, // A722..A787 + COMMON, // A788..A78A + LATIN, // A78B..A78E + UNKNOWN, // A78F + LATIN, // A790..A7AD + UNKNOWN, // A7AE..A7AF + LATIN, // A7B0..A7B1 + UNKNOWN, // A7B2..A7F6 + 
LATIN, // A7F7..A7FF + SYLOTI_NAGRI, // A800..A82B + UNKNOWN, // A82C..A82F + COMMON, // A830..A839 + UNKNOWN, // A83A..A83F + PHAGS_PA, // A840..A877 + UNKNOWN, // A878..A87F + SAURASHTRA, // A880..A8C4 + UNKNOWN, // A8C5..A8CD + SAURASHTRA, // A8CE..A8D9 + UNKNOWN, // A8DA..A8DF + DEVANAGARI, // A8E0..A8FB + UNKNOWN, // A8FC..A8FF + KAYAH_LI, // A900..A92D + COMMON, // A92E + KAYAH_LI, // A92F + REJANG, // A930..A953 + UNKNOWN, // A954..A95E + REJANG, // A95F + HANGUL, // A960..A97C + UNKNOWN, // A97D..A97F + JAVANESE, // A980..A9CD + UNKNOWN, // A9CE + COMMON, // A9CF + JAVANESE, // A9D0..A9D9 + UNKNOWN, // A9DA..A9DD + JAVANESE, // A9DE..A9DF + MYANMAR, // A9E0..A9FE + UNKNOWN, // A9FF + CHAM, // AA00..AA36 + UNKNOWN, // AA37..AA3F + CHAM, // AA40..AA4D + UNKNOWN, // AA4E..AA4F + CHAM, // AA50..AA59 + UNKNOWN, // AA5A..AA5B + CHAM, // AA5C..AA5F + MYANMAR, // AA60..AA7F + TAI_VIET, // AA80..AAC2 + UNKNOWN, // AAC3..AADA + TAI_VIET, // AADB..AADF + MEETEI_MAYEK, // AAE0..AAF6 + UNKNOWN, // AAF7..AB00 + ETHIOPIC, // AB01..AB06 + UNKNOWN, // AB07..AB08 + ETHIOPIC, // AB09..AB0E + UNKNOWN, // AB0F..AB10 + ETHIOPIC, // AB11..AB16 + UNKNOWN, // AB17..AB1F + ETHIOPIC, // AB20..AB26 + UNKNOWN, // AB27 + ETHIOPIC, // AB28..AB2E + UNKNOWN, // AB2F + LATIN, // AB30..AB5A + COMMON, // AB5B + LATIN, // AB5C..AB5F + UNKNOWN, // AB60..AB63 + LATIN, // AB64 + GREEK, // AB65 + UNKNOWN, // AB66..ABBF + MEETEI_MAYEK, // ABC0..ABED + UNKNOWN, // ABEE..ABEF + MEETEI_MAYEK, // ABF0..ABF9 + UNKNOWN, // ABFA..ABFF + HANGUL, // AC00..D7A3 + UNKNOWN, // D7A4..D7AF + HANGUL, // D7B0..D7C6 + UNKNOWN, // D7C7..D7CA + HANGUL, // D7CB..D7FB + UNKNOWN, // D7FC..F8FF + HAN, // F900..FA6D + UNKNOWN, // FA6E..FA6F + HAN, // FA70..FAD9 + UNKNOWN, // FADA..FAFF + LATIN, // FB00..FB06 + UNKNOWN, // FB07..FB12 + ARMENIAN, // FB13..FB17 + UNKNOWN, // FB18..FB1C + HEBREW, // FB1D..FB36 + UNKNOWN, // FB37 + HEBREW, // FB38..FB3C + UNKNOWN, // FB3D + HEBREW, // FB3E + UNKNOWN, // FB3F + HEBREW, // 
FB40..FB41 + UNKNOWN, // FB42 + HEBREW, // FB43..FB44 + UNKNOWN, // FB45 + HEBREW, // FB46..FB4F + ARABIC, // FB50..FBC1 + UNKNOWN, // FBC2..FBD2 + ARABIC, // FBD3..FD3D + COMMON, // FD3E..FD3F + UNKNOWN, // FD40..FD4F + ARABIC, // FD50..FD8F + UNKNOWN, // FD90..FD91 + ARABIC, // FD92..FDC7 + UNKNOWN, // FDC8..FDEF + ARABIC, // FDF0..FDFD + UNKNOWN, // FDFE..FDFF + INHERITED, // FE00..FE0F + COMMON, // FE10..FE19 + UNKNOWN, // FE1A..FE1F + INHERITED, // FE20..FE2D + UNKNOWN, // FE2E..FE2F + COMMON, // FE30..FE52 + UNKNOWN, // FE53 + COMMON, // FE54..FE66 + UNKNOWN, // FE67 + COMMON, // FE68..FE6B + UNKNOWN, // FE6C..FE6F + ARABIC, // FE70..FE74 + UNKNOWN, // FE75 + ARABIC, // FE76..FEFC + UNKNOWN, // FEFD..FEFE + COMMON, // FEFF + UNKNOWN, // FF00 + COMMON, // FF01..FF20 + LATIN, // FF21..FF3A + COMMON, // FF3B..FF40 + LATIN, // FF41..FF5A + COMMON, // FF5B..FF65 + KATAKANA, // FF66..FF6F + COMMON, // FF70 + KATAKANA, // FF71..FF9D + COMMON, // FF9E..FF9F + HANGUL, // FFA0..FFBE + UNKNOWN, // FFBF..FFC1 + HANGUL, // FFC2..FFC7 + UNKNOWN, // FFC8..FFC9 + HANGUL, // FFCA..FFCF + UNKNOWN, // FFD0..FFD1 + HANGUL, // FFD2..FFD7 + UNKNOWN, // FFD8..FFD9 + HANGUL, // FFDA..FFDC + UNKNOWN, // FFDD..FFDF + COMMON, // FFE0..FFE6 + UNKNOWN, // FFE7 + COMMON, // FFE8..FFEE + UNKNOWN, // FFEF..FFF8 + COMMON, // FFF9..FFFD + UNKNOWN, // FFFE..FFFF + LINEAR_B, // 10000..1000B + UNKNOWN, // 1000C + LINEAR_B, // 1000D..10026 + UNKNOWN, // 10027 + LINEAR_B, // 10028..1003A + UNKNOWN, // 1003B + LINEAR_B, // 1003C..1003D + UNKNOWN, // 1003E + LINEAR_B, // 1003F..1004D + UNKNOWN, // 1004E..1004F + LINEAR_B, // 10050..1005D + UNKNOWN, // 1005E..1007F + LINEAR_B, // 10080..100FA + UNKNOWN, // 100FB..100FF + COMMON, // 10100..10102 + UNKNOWN, // 10103..10106 + COMMON, // 10107..10133 + UNKNOWN, // 10134..10136 + COMMON, // 10137..1013F + GREEK, // 10140..1018C + UNKNOWN, // 1018D..1018F + COMMON, // 10190..1019B + UNKNOWN, // 1019C..1019F + GREEK, // 101A0 + UNKNOWN, // 101A1..101CF + 
COMMON, // 101D0..101FC + INHERITED, // 101FD + UNKNOWN, // 101FE..1027F + LYCIAN, // 10280..1029C + UNKNOWN, // 1029D..1029F + CARIAN, // 102A0..102D0 + UNKNOWN, // 102D1..102DF + INHERITED, // 102E0 + COMMON, // 102E1..102FB + UNKNOWN, // 102FC..102FF + OLD_ITALIC, // 10300..10323 + UNKNOWN, // 10324..1032F + GOTHIC, // 10330..1034A + UNKNOWN, // 1034B..1034F + OLD_PERMIC, // 10350..1037A + UNKNOWN, // 1037B..1037F + UGARITIC, // 10380..1039D + UNKNOWN, // 1039E + UGARITIC, // 1039F + OLD_PERSIAN, // 103A0..103C3 + UNKNOWN, // 103C4..103C7 + OLD_PERSIAN, // 103C8..103D5 + UNKNOWN, // 103D6..103FF + DESERET, // 10400..1044F + SHAVIAN, // 10450..1047F + OSMANYA, // 10480..1049D + UNKNOWN, // 1049E..1049F + OSMANYA, // 104A0..104A9 + UNKNOWN, // 104AA..104FF + ELBASAN, // 10500..10527 + UNKNOWN, // 10528..1052F + CAUCASIAN_ALBANIAN, // 10530..10563 + UNKNOWN, // 10564..1056E + CAUCASIAN_ALBANIAN, // 1056F + UNKNOWN, // 10570..105FF + LINEAR_A, // 10600..10736 + UNKNOWN, // 10737..1073F + LINEAR_A, // 10740..10755 + UNKNOWN, // 10756..1075F + LINEAR_A, // 10760..10767 + UNKNOWN, // 10768..107FF + CYPRIOT, // 10800..10805 + UNKNOWN, // 10806..10807 + CYPRIOT, // 10808 + UNKNOWN, // 10809 + CYPRIOT, // 1080A..10835 + UNKNOWN, // 10836 + CYPRIOT, // 10837..10838 + UNKNOWN, // 10839..1083B + CYPRIOT, // 1083C + UNKNOWN, // 1083D..1083E + CYPRIOT, // 1083F + IMPERIAL_ARAMAIC, // 10840..10855 + UNKNOWN, // 10856 + IMPERIAL_ARAMAIC, // 10857..1085F + PALMYRENE, // 10860..1087F + NABATAEAN, // 10880..1089E + UNKNOWN, // 1089F..108A6 + NABATAEAN, // 108A7..108AF + UNKNOWN, // 108B0..108FF + PHOENICIAN, // 10900..1091B + UNKNOWN, // 1091C..1091E + PHOENICIAN, // 1091F + LYDIAN, // 10920..10939 + UNKNOWN, // 1093A..1093E + LYDIAN, // 1093F + UNKNOWN, // 10940..1097F + MEROITIC_HIEROGLYPHS, // 10980..1099F + MEROITIC_CURSIVE, // 109A0..109B7 + UNKNOWN, // 109B8..109BD + MEROITIC_CURSIVE, // 109BE..109BF + UNKNOWN, // 109C0..109FF + KHAROSHTHI, // 10A00..10A03 + UNKNOWN, // 10A04 
+ KHAROSHTHI, // 10A05..10A06 + UNKNOWN, // 10A07..10A0B + KHAROSHTHI, // 10A0C..10A13 + UNKNOWN, // 10A14 + KHAROSHTHI, // 10A15..10A17 + UNKNOWN, // 10A18 + KHAROSHTHI, // 10A19..10A33 + UNKNOWN, // 10A34..10A37 + KHAROSHTHI, // 10A38..10A3A + UNKNOWN, // 10A3B..10A3E + KHAROSHTHI, // 10A3F..10A47 + UNKNOWN, // 10A48..10A4F + KHAROSHTHI, // 10A50..10A58 + UNKNOWN, // 10A59..10A5F + OLD_SOUTH_ARABIAN, // 10A60..10A7F + OLD_NORTH_ARABIAN, // 10A80..10A9F + UNKNOWN, // 10AA0..10ABF + MANICHAEAN, // 10AC0..10AE6 + UNKNOWN, // 10AE7..10AEA + MANICHAEAN, // 10AEB..10AF6 + UNKNOWN, // 10AF7..10AFF + AVESTAN, // 10B00..10B35 + UNKNOWN, // 10B36..10B38 + AVESTAN, // 10B39..10B3F + INSCRIPTIONAL_PARTHIAN, // 10B40..10B55 + UNKNOWN, // 10B56..10B57 + INSCRIPTIONAL_PARTHIAN, // 10B58..10B5F + INSCRIPTIONAL_PAHLAVI, // 10B60..10B72 + UNKNOWN, // 10B73..10B77 + INSCRIPTIONAL_PAHLAVI, // 10B78..10B7F + PSALTER_PAHLAVI, // 10B80..10B91 + UNKNOWN, // 10B92..10B98 + PSALTER_PAHLAVI, // 10B99..10B9C + UNKNOWN, // 10B9D..10BA8 + PSALTER_PAHLAVI, // 10BA9..10BAF + UNKNOWN, // 10BB0..10BFF + OLD_TURKIC, // 10C00..10C48 + UNKNOWN, // 10C49..10E5F + ARABIC, // 10E60..10E7E + UNKNOWN, // 10E7F..10FFF + BRAHMI, // 11000..1104D + UNKNOWN, // 1104E..11051 + BRAHMI, // 11052..1106F + UNKNOWN, // 11070..1107E + BRAHMI, // 1107F + KAITHI, // 11080..110C1 + UNKNOWN, // 110C2..110CF + SORA_SOMPENG, // 110D0..110E8 + UNKNOWN, // 110E9..110EF + SORA_SOMPENG, // 110F0..110F9 + UNKNOWN, // 110FA..110FF + CHAKMA, // 11100..11134 + UNKNOWN, // 11135 + CHAKMA, // 11136..11143 + UNKNOWN, // 11144..1114F + MAHAJANI, // 11150..11176 + UNKNOWN, // 11177..1117F + SHARADA, // 11180..111C8 + UNKNOWN, // 111C9..111CC + SHARADA, // 111CD + UNKNOWN, // 111CE..111CF + SHARADA, // 111D0..111DA + UNKNOWN, // 111DB..111E0 + SINHALA, // 111E1..111F4 + UNKNOWN, // 111F5..111FF + KHOJKI, // 11200..11211 + UNKNOWN, // 11212 + KHOJKI, // 11213..1123D + UNKNOWN, // 1123E..112AF + KHUDAWADI, // 112B0..112EA + UNKNOWN, // 
112EB..112EF + KHUDAWADI, // 112F0..112F9 + UNKNOWN, // 112FA..11300 + GRANTHA, // 11301..11303 + UNKNOWN, // 11304 + GRANTHA, // 11305..1130C + UNKNOWN, // 1130D..1130E + GRANTHA, // 1130F..11310 + UNKNOWN, // 11311..11312 + GRANTHA, // 11313..11328 + UNKNOWN, // 11329 + GRANTHA, // 1132A..11330 + UNKNOWN, // 11331 + GRANTHA, // 11332..11333 + UNKNOWN, // 11334 + GRANTHA, // 11335..11339 + UNKNOWN, // 1133A..1133B + GRANTHA, // 1133C..11344 + UNKNOWN, // 11345..11346 + GRANTHA, // 11347..11348 + UNKNOWN, // 11349..1134A + GRANTHA, // 1134B..1134D + UNKNOWN, // 1134E..11356 + GRANTHA, // 11357 + UNKNOWN, // 11358..1135C + GRANTHA, // 1135D..11363 + UNKNOWN, // 11364..11365 + GRANTHA, // 11366..1136C + UNKNOWN, // 1136D..1136F + GRANTHA, // 11370..11374 + UNKNOWN, // 11375..1147F + TIRHUTA, // 11480..114C7 + UNKNOWN, // 114C8..114CF + TIRHUTA, // 114D0..114D9 + UNKNOWN, // 114DA..1157F + SIDDHAM, // 11580..115B5 + UNKNOWN, // 115B6..115B7 + SIDDHAM, // 115B8..115C9 + UNKNOWN, // 115CA..115FF + MODI, // 11600..11644 + UNKNOWN, // 11645..1164F + MODI, // 11650..11659 + UNKNOWN, // 1165A..1167F + TAKRI, // 11680..116B7 + UNKNOWN, // 116B8..116BF + TAKRI, // 116C0..116C9 + UNKNOWN, // 116CA..1189F + WARANG_CITI, // 118A0..118F2 + UNKNOWN, // 118F3..118FE + WARANG_CITI, // 118FF + UNKNOWN, // 11900..11ABF + PAU_CIN_HAU, // 11AC0..11AF8 + UNKNOWN, // 11AF9..11FFF + CUNEIFORM, // 12000..12398 + UNKNOWN, // 12399..123FF + CUNEIFORM, // 12400..1246E + UNKNOWN, // 1246F + CUNEIFORM, // 12470..12474 + UNKNOWN, // 12475..12FFF + EGYPTIAN_HIEROGLYPHS, // 13000..1342E + UNKNOWN, // 1342F..167FF + BAMUM, // 16800..16A38 + UNKNOWN, // 16A39..16A3F + MRO, // 16A40..16A5E + UNKNOWN, // 16A5F + MRO, // 16A60..16A69 + UNKNOWN, // 16A6A..16A6D + MRO, // 16A6E..16A6F + UNKNOWN, // 16A70..16ACF + BASSA_VAH, // 16AD0..16AED + UNKNOWN, // 16AEE..16AEF + BASSA_VAH, // 16AF0..16AF5 + UNKNOWN, // 16AF6..16AFF + PAHAWH_HMONG, // 16B00..16B45 + UNKNOWN, // 16B46..16B4F + PAHAWH_HMONG, // 
16B50..16B59 + UNKNOWN, // 16B5A + PAHAWH_HMONG, // 16B5B..16B61 + UNKNOWN, // 16B62 + PAHAWH_HMONG, // 16B63..16B77 + UNKNOWN, // 16B78..16B7C + PAHAWH_HMONG, // 16B7D..16B8F + UNKNOWN, // 16B90..16EFF + MIAO, // 16F00..16F44 + UNKNOWN, // 16F45..16F4F + MIAO, // 16F50..16F7E + UNKNOWN, // 16F7F..16F8E + MIAO, // 16F8F..16F9F + UNKNOWN, // 16FA0..1AFFF + KATAKANA, // 1B000 + HIRAGANA, // 1B001 + UNKNOWN, // 1B002..1BBFF + DUPLOYAN, // 1BC00..1BC6A + UNKNOWN, // 1BC6B..1BC6F + DUPLOYAN, // 1BC70..1BC7C + UNKNOWN, // 1BC7D..1BC7F + DUPLOYAN, // 1BC80..1BC88 + UNKNOWN, // 1BC89..1BC8F + DUPLOYAN, // 1BC90..1BC99 + UNKNOWN, // 1BC9A..1BC9B + DUPLOYAN, // 1BC9C..1BC9F + COMMON, // 1BCA0..1BCA3 + UNKNOWN, // 1BCA4..1CFFF + COMMON, // 1D000..1D0F5 + UNKNOWN, // 1D0F6..1D0FF + COMMON, // 1D100..1D126 + UNKNOWN, // 1D127..1D128 + COMMON, // 1D129..1D166 + INHERITED, // 1D167..1D169 + COMMON, // 1D16A..1D17A + INHERITED, // 1D17B..1D182 + COMMON, // 1D183..1D184 + INHERITED, // 1D185..1D18B + COMMON, // 1D18C..1D1A9 + INHERITED, // 1D1AA..1D1AD + COMMON, // 1D1AE..1D1DD + UNKNOWN, // 1D1DE..1D1FF + GREEK, // 1D200..1D245 + UNKNOWN, // 1D246..1D2FF + COMMON, // 1D300..1D356 + UNKNOWN, // 1D357..1D35F + COMMON, // 1D360..1D371 + UNKNOWN, // 1D372..1D3FF + COMMON, // 1D400..1D454 + UNKNOWN, // 1D455 + COMMON, // 1D456..1D49C + UNKNOWN, // 1D49D + COMMON, // 1D49E..1D49F + UNKNOWN, // 1D4A0..1D4A1 + COMMON, // 1D4A2 + UNKNOWN, // 1D4A3..1D4A4 + COMMON, // 1D4A5..1D4A6 + UNKNOWN, // 1D4A7..1D4A8 + COMMON, // 1D4A9..1D4AC + UNKNOWN, // 1D4AD + COMMON, // 1D4AE..1D4B9 + UNKNOWN, // 1D4BA + COMMON, // 1D4BB + UNKNOWN, // 1D4BC + COMMON, // 1D4BD..1D4C3 + UNKNOWN, // 1D4C4 + COMMON, // 1D4C5..1D505 + UNKNOWN, // 1D506 + COMMON, // 1D507..1D50A + UNKNOWN, // 1D50B..1D50C + COMMON, // 1D50D..1D514 + UNKNOWN, // 1D515 + COMMON, // 1D516..1D51C + UNKNOWN, // 1D51D + COMMON, // 1D51E..1D539 + UNKNOWN, // 1D53A + COMMON, // 1D53B..1D53E + UNKNOWN, // 1D53F + COMMON, // 1D540..1D544 + 
UNKNOWN, // 1D545 + COMMON, // 1D546 + UNKNOWN, // 1D547..1D549 + COMMON, // 1D54A..1D550 + UNKNOWN, // 1D551 + COMMON, // 1D552..1D6A5 + UNKNOWN, // 1D6A6..1D6A7 + COMMON, // 1D6A8..1D7CB + UNKNOWN, // 1D7CC..1D7CD + COMMON, // 1D7CE..1D7FF + UNKNOWN, // 1D800..1E7FF + MENDE_KIKAKUI, // 1E800..1E8C4 + UNKNOWN, // 1E8C5..1E8C6 + MENDE_KIKAKUI, // 1E8C7..1E8D6 + UNKNOWN, // 1E8D7..1EDFF + ARABIC, // 1EE00..1EE03 + UNKNOWN, // 1EE04 + ARABIC, // 1EE05..1EE1F + UNKNOWN, // 1EE20 + ARABIC, // 1EE21..1EE22 + UNKNOWN, // 1EE23 + ARABIC, // 1EE24 + UNKNOWN, // 1EE25..1EE26 + ARABIC, // 1EE27 + UNKNOWN, // 1EE28 + ARABIC, // 1EE29..1EE32 + UNKNOWN, // 1EE33 + ARABIC, // 1EE34..1EE37 + UNKNOWN, // 1EE38 + ARABIC, // 1EE39 + UNKNOWN, // 1EE3A + ARABIC, // 1EE3B + UNKNOWN, // 1EE3C..1EE41 + ARABIC, // 1EE42 + UNKNOWN, // 1EE43..1EE46 + ARABIC, // 1EE47 + UNKNOWN, // 1EE48 + ARABIC, // 1EE49 + UNKNOWN, // 1EE4A + ARABIC, // 1EE4B + UNKNOWN, // 1EE4C + ARABIC, // 1EE4D..1EE4F + UNKNOWN, // 1EE50 + ARABIC, // 1EE51..1EE52 + UNKNOWN, // 1EE53 + ARABIC, // 1EE54 + UNKNOWN, // 1EE55..1EE56 + ARABIC, // 1EE57 + UNKNOWN, // 1EE58 + ARABIC, // 1EE59 + UNKNOWN, // 1EE5A + ARABIC, // 1EE5B + UNKNOWN, // 1EE5C + ARABIC, // 1EE5D + UNKNOWN, // 1EE5E + ARABIC, // 1EE5F + UNKNOWN, // 1EE60 + ARABIC, // 1EE61..1EE62 + UNKNOWN, // 1EE63 + ARABIC, // 1EE64 + UNKNOWN, // 1EE65..1EE66 + ARABIC, // 1EE67..1EE6A + UNKNOWN, // 1EE6B + ARABIC, // 1EE6C..1EE72 + UNKNOWN, // 1EE73 + ARABIC, // 1EE74..1EE77 + UNKNOWN, // 1EE78 + ARABIC, // 1EE79..1EE7C + UNKNOWN, // 1EE7D + ARABIC, // 1EE7E + UNKNOWN, // 1EE7F + ARABIC, // 1EE80..1EE89 + UNKNOWN, // 1EE8A + ARABIC, // 1EE8B..1EE9B + UNKNOWN, // 1EE9C..1EEA0 + ARABIC, // 1EEA1..1EEA3 + UNKNOWN, // 1EEA4 + ARABIC, // 1EEA5..1EEA9 + UNKNOWN, // 1EEAA + ARABIC, // 1EEAB..1EEBB + UNKNOWN, // 1EEBC..1EEEF + ARABIC, // 1EEF0..1EEF1 + UNKNOWN, // 1EEF2..1EFFF + COMMON, // 1F000..1F02B + UNKNOWN, // 1F02C..1F02F + COMMON, // 1F030..1F093 + UNKNOWN, // 
1F094..1F09F + COMMON, // 1F0A0..1F0AE + UNKNOWN, // 1F0AF..1F0B0 + COMMON, // 1F0B1..1F0BF + UNKNOWN, // 1F0C0 + COMMON, // 1F0C1..1F0CF + UNKNOWN, // 1F0D0 + COMMON, // 1F0D1..1F0F5 + UNKNOWN, // 1F0F6..1F0FF + COMMON, // 1F100..1F10C + UNKNOWN, // 1F10D..1F10F + COMMON, // 1F110..1F12E + UNKNOWN, // 1F12F + COMMON, // 1F130..1F16B + UNKNOWN, // 1F16C..1F16F + COMMON, // 1F170..1F19A + UNKNOWN, // 1F19B..1F1E5 + COMMON, // 1F1E6..1F1FF + HIRAGANA, // 1F200 + COMMON, // 1F201..1F202 + UNKNOWN, // 1F203..1F20F + COMMON, // 1F210..1F23A + UNKNOWN, // 1F23B..1F23F + COMMON, // 1F240..1F248 + UNKNOWN, // 1F249..1F24F + COMMON, // 1F250..1F251 + UNKNOWN, // 1F252..1F2FF + COMMON, // 1F300..1F32C + UNKNOWN, // 1F32D..1F32F + COMMON, // 1F330..1F37D + UNKNOWN, // 1F37E..1F37F + COMMON, // 1F380..1F3CE + UNKNOWN, // 1F3CF..1F3D3 + COMMON, // 1F3D4..1F3F7 + UNKNOWN, // 1F3F8..1F3FF + COMMON, // 1F400..1F4FE + UNKNOWN, // 1F4FF + COMMON, // 1F500..1F54A + UNKNOWN, // 1F54B..1F54F + COMMON, // 1F550..1F579 + UNKNOWN, // 1F57A + COMMON, // 1F57B..1F5A3 + UNKNOWN, // 1F5A4 + COMMON, // 1F5A5..1F642 + UNKNOWN, // 1F643..1F644 + COMMON, // 1F645..1F6CF + UNKNOWN, // 1F6D0..1F6DF + COMMON, // 1F6E0..1F6EC + UNKNOWN, // 1F6ED..1F6EF + COMMON, // 1F6F0..1F6F3 + UNKNOWN, // 1F6F4..1F6FF + COMMON, // 1F700..1F773 + UNKNOWN, // 1F774..1F77F + COMMON, // 1F780..1F7D4 + UNKNOWN, // 1F7D5..1F7FF + COMMON, // 1F800..1F80B + UNKNOWN, // 1F80C..1F80F + COMMON, // 1F810..1F847 + UNKNOWN, // 1F848..1F84F + COMMON, // 1F850..1F859 + UNKNOWN, // 1F85A..1F85F + COMMON, // 1F860..1F887 + UNKNOWN, // 1F888..1F88F + COMMON, // 1F890..1F8AD + UNKNOWN, // 1F8AE..1FFFF + HAN, // 20000..2A6D6 + UNKNOWN, // 2A6D7..2A6FF + HAN, // 2A700..2B734 + UNKNOWN, // 2B735..2B73F + HAN, // 2B740..2B81D + UNKNOWN, // 2B81E..2F7FF + HAN, // 2F800..2FA1D + UNKNOWN, // 2FA1E..E0000 + COMMON, // E0001 + UNKNOWN, // E0002..E001F + COMMON, // E0020..E007F + UNKNOWN, // E0080..E00FF + INHERITED, // E0100..E01EF + UNKNOWN, 
// E01F0..10FFFF }; private static HashMap<String, Character.UnicodeScript> aliases; static { aliases = new HashMap<>(128); + aliases.put("AGHB", CAUCASIAN_ALBANIAN); aliases.put("ARAB", ARABIC); aliases.put("ARMI", IMPERIAL_ARAMAIC); aliases.put("ARMN", ARMENIAN); aliases.put("AVST", AVESTAN); aliases.put("BALI", BALINESE); aliases.put("BAMU", BAMUM); + aliases.put("BASS", BASSA_VAH); aliases.put("BATK", BATAK); aliases.put("BENG", BENGALI); aliases.put("BOPO", BOPOMOFO); - aliases.put("BRAI", BRAILLE); aliases.put("BRAH", BRAHMI); + aliases.put("BRAI", BRAILLE); aliases.put("BUGI", BUGINESE); aliases.put("BUHD", BUHID); aliases.put("CAKM", CHAKMA); @@ -4380,11 +7059,14 @@ aliases.put("CYRL", CYRILLIC); aliases.put("DEVA", DEVANAGARI); aliases.put("DSRT", DESERET); + aliases.put("DUPL", DUPLOYAN); aliases.put("EGYP", EGYPTIAN_HIEROGLYPHS); + aliases.put("ELBA", ELBASAN); aliases.put("ETHI", ETHIOPIC); aliases.put("GEOR", GEORGIAN); aliases.put("GLAG", GLAGOLITIC); aliases.put("GOTH", GOTHIC); + aliases.put("GRAN", GRANTHA); aliases.put("GREK", GREEK); aliases.put("GUJR", GUJARATI); aliases.put("GURU", GURMUKHI); @@ -4393,6 +7075,7 @@ aliases.put("HANO", HANUNOO); aliases.put("HEBR", HEBREW); aliases.put("HIRA", HIRAGANA); + aliases.put("HMNG", PAHAWH_HMONG); // it appears we don't have the KATAKANA_OR_HIRAGANA //aliases.put("HRKT", KATAKANA_OR_HIRAGANA); aliases.put("ITAL", OLD_ITALIC); @@ -4401,6 +7084,7 @@ aliases.put("KANA", KATAKANA); aliases.put("KHAR", KHAROSHTHI); aliases.put("KHMR", KHMER); + aliases.put("KHOJ", KHOJKI); aliases.put("KNDA", KANNADA); aliases.put("KTHI", KAITHI); aliases.put("LANA", TAI_THAM); @@ -4408,27 +7092,39 @@ aliases.put("LATN", LATIN); aliases.put("LEPC", LEPCHA); aliases.put("LIMB", LIMBU); + aliases.put("LINA", LINEAR_A); aliases.put("LINB", LINEAR_B); aliases.put("LISU", LISU); aliases.put("LYCI", LYCIAN); aliases.put("LYDI", LYDIAN); + aliases.put("MAHJ", MAHAJANI); aliases.put("MAND", MANDAIC); + aliases.put("MANI", 
MANICHAEAN); + aliases.put("MEND", MENDE_KIKAKUI); aliases.put("MERC", MEROITIC_CURSIVE); aliases.put("MERO", MEROITIC_HIEROGLYPHS); aliases.put("MLYM", MALAYALAM); + aliases.put("MODI", MODI); aliases.put("MONG", MONGOLIAN); + aliases.put("MROO", MRO); aliases.put("MTEI", MEETEI_MAYEK); aliases.put("MYMR", MYANMAR); + aliases.put("NARB", OLD_NORTH_ARABIAN); + aliases.put("NBAT", NABATAEAN); aliases.put("NKOO", NKO); aliases.put("OGAM", OGHAM); aliases.put("OLCK", OL_CHIKI); aliases.put("ORKH", OLD_TURKIC); aliases.put("ORYA", ORIYA); aliases.put("OSMA", OSMANYA); + aliases.put("PALM", PALMYRENE); + aliases.put("PAUC", PAU_CIN_HAU); + aliases.put("PERM", OLD_PERMIC); aliases.put("PHAG", PHAGS_PA); - aliases.put("PLRD", MIAO); aliases.put("PHLI", INSCRIPTIONAL_PAHLAVI); + aliases.put("PHLP", PSALTER_PAHLAVI); aliases.put("PHNX", PHOENICIAN); + aliases.put("PLRD", MIAO); aliases.put("PRTI", INSCRIPTIONAL_PARTHIAN); aliases.put("RJNG", REJANG); aliases.put("RUNR", RUNIC); @@ -4437,14 +7133,16 @@ aliases.put("SAUR", SAURASHTRA); aliases.put("SHAW", SHAVIAN); aliases.put("SHRD", SHARADA); + aliases.put("SIDD", SIDDHAM); + aliases.put("SIND", KHUDAWADI); aliases.put("SINH", SINHALA); aliases.put("SORA", SORA_SOMPENG); aliases.put("SUND", SUNDANESE); aliases.put("SYLO", SYLOTI_NAGRI); aliases.put("SYRC", SYRIAC); aliases.put("TAGB", TAGBANWA); - aliases.put("TALE", TAI_LE); aliases.put("TAKR", TAKRI); + aliases.put("TALE", TAI_LE); aliases.put("TALU", NEW_TAI_LUE); aliases.put("TAML", TAMIL); aliases.put("TAVT", TAI_VIET); @@ -4454,8 +7152,10 @@ aliases.put("THAA", THAANA); aliases.put("THAI", THAI); aliases.put("TIBT", TIBETAN); + aliases.put("TIRH", TIRHUTA); aliases.put("UGAR", UGARITIC); aliases.put("VAII", VAI); + aliases.put("WARA", WARANG_CITI); aliases.put("XPEO", OLD_PERSIAN); aliases.put("XSUX", CUNEIFORM); aliases.put("YIII", YI); @@ -6594,8 +9294,9 @@ * * @param ch the character to be converted. 
* @return the numeric value of the character, as a nonnegative {@code int} - * value; -2 if the character has a numeric value that is not a - * nonnegative integer; -1 if the character has no numeric value. + * value; -2 if the character has a numeric value but the value + * can not be represented as a nonnegative {@code int} value; + * -1 if the character has no numeric value. * @see Character#forDigit(int, int) * @see Character#isDigit(char) * @since 1.1 @@ -6627,8 +9328,9 @@ * * @param codePoint the character (Unicode code point) to be converted. * @return the numeric value of the character, as a nonnegative {@code int} - * value; -2 if the character has a numeric value that is not a - * nonnegative integer; -1 if the character has no numeric value. + * value; -2 if the character has a numeric value but the value + * can not be represented as a nonnegative {@code int} value; + * -1 if the character has no numeric value. * @see Character#forDigit(int, int) * @see Character#isDigit(int) * @since 1.5 @@ -6998,6 +9700,10 @@ * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE * @see Character#DIRECTIONALITY_POP_DIRECTIONAL_FORMAT + * @see Character#DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE + * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE + * @see Character#DIRECTIONALITY_FIRST_STRONG_ISOLATE + * @see Character#DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE * @since 1.4 */ public static byte getDirectionality(char ch) { @@ -7035,6 +9741,10 @@ * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE * @see Character#DIRECTIONALITY_POP_DIRECTIONAL_FORMAT DIRECTIONALITY_POP_DIRECTIONAL_FORMAT + * @see Character#DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE + * @see Character#DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE + * @see 
Character#DIRECTIONALITY_FIRST_STRONG_ISOLATE DIRECTIONALITY_FIRST_STRONG_ISOLATE + * @see Character#DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE * @since 1.5 */ public static byte getDirectionality(int codePoint) { --- old/jdk/src/java.base/share/classes/java/text/Bidi.java 2015-07-13 16:11:41.000000000 +0900 +++ new/jdk/src/java.base/share/classes/java/text/Bidi.java 2015-07-13 16:11:41.000000000 +0900 @@ -185,7 +185,7 @@ AttributedString astr = new AttributedString(""); Bidi newBidi = new Bidi(astr.getIterator()); - return bidiBase.setLine(this, bidiBase, newBidi, newBidi.bidiBase,lineStart, lineLimit); + return bidiBase.setLine(this, bidiBase, newBidi, newBidi.bidiBase, lineStart, lineLimit); } /** --- old/jdk/src/java.base/share/classes/java/text/Normalizer.java 2015-07-13 16:11:41.000000000 +0900 +++ new/jdk/src/java.base/share/classes/java/text/Normalizer.java 2015-07-13 16:11:41.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -38,7 +38,6 @@ package java.text; import sun.text.normalizer.NormalizerBase; -import sun.text.normalizer.NormalizerImpl; /** * This class provides the method <code>normalize</code> which transforms Unicode --- old/jdk/src/java.base/share/classes/sun/net/idn/StringPrep.java 2015-07-13 16:11:42.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/net/idn/StringPrep.java 2015-07-13 16:11:42.000000000 +0900 @@ -50,7 +50,6 @@ import sun.text.Normalizer; import sun.text.normalizer.CharTrie; import sun.text.normalizer.Trie; -import sun.text.normalizer.NormalizerImpl; import sun.text.normalizer.VersionInfo; import sun.text.normalizer.UCharacter; import sun.text.normalizer.UCharacterIterator; @@ -227,7 +226,7 @@ checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); - VersionInfo normUniVer = NormalizerImpl.getUnicodeVersion(); + VersionInfo normUniVer = UCharacter.getUnicodeVersion(); if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ @@ -354,7 +353,7 @@ Normalizer.normalize( src.toString(), java.text.Normalizer.Form.NFKC, - Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29)); + Normalizer.UNICODE_3_2)); } /* boolean isLabelSeparator(int ch){ --- old/jdk/src/java.base/share/classes/sun/text/ComposedCharIter.java 2015-07-13 16:11:43.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/ComposedCharIter.java 2015-07-13 16:11:43.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2005, Oracle and/or its 
affiliates. All rights reserved. + * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -43,7 +43,7 @@ private static int decompNum; static { - int maxNum = 2000; //TBD: Unicode 4.0 only has 1926 canoDecomp... + int maxNum = 2100; chars = new int[maxNum]; decomps = new String[maxNum]; decompNum = NormalizerImpl.getDecompose(chars, decomps); --- old/jdk/src/java.base/share/classes/sun/text/Normalizer.java 2015-07-13 16:11:44.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/Normalizer.java 2015-07-13 16:11:43.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -26,7 +26,7 @@ package sun.text; import sun.text.normalizer.NormalizerBase; -import sun.text.normalizer.NormalizerImpl; +import sun.text.normalizer.UCharacter; /** * This Normalizer is for Unicode 3.2 support for IDNA only. @@ -93,6 +93,6 @@ * @return combining class of the given character */ public static final int getCombiningClass(int ch) { - return NormalizerImpl.getCombiningClass(ch); + return UCharacter.getCombiningClass(ch); } } --- old/jdk/src/java.base/share/classes/sun/text/bidi/BidiBase.java 2015-07-13 16:11:44.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/bidi/BidiBase.java 2015-07-13 16:11:44.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -22,17 +22,13 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ +******************************************************************************* +* Copyright (C) 2001-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +*/ /* FOOD FOR THOUGHT: currently the reordering modes are a mixture of * algorithm for direct BiDi, algorithm for inverse Bidi and the bizarre @@ -52,12 +48,10 @@ package sun.text.bidi; -import java.io.IOException; import java.lang.reflect.Array; import java.text.AttributedCharacterIterator; import java.text.Bidi; import java.util.Arrays; -import java.util.MissingResourceException; import sun.misc.JavaAWTFontAccess; import sun.misc.SharedSecrets; import sun.text.normalizer.UBiDiProps; @@ -68,10 +62,9 @@ * * <h2>Bidi algorithm for ICU</h2> * - * This is an implementation of the Unicode Bidirectional algorithm. The + * This is an implementation of the Unicode Bidirectional Algorithm. The * algorithm is defined in the <a - * href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9</a>, - * version 13, also described in The Unicode Standard, Version 4.0 . 
+ * href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9</a>. * <p> * * Note: Libraries that perform a bidirectional algorithm and reorder strings @@ -106,6 +99,7 @@ * <li>{@link #LTR} * <li>{@link #RTL} * <li>{@link #MIXED} + * <li>{@link #NEUTRAL} * </ul> * * <h3>Basic concept: levels</h3> @@ -167,6 +161,7 @@ * * <h3>Basic concept: Reordering Options</h3> * Reordering options can be applied during Bidi text transformations. + * * <p><b>See Also:</b> * <ul> * <li>{@link #setReorderingOptions} @@ -456,19 +451,134 @@ * }</pre> */ +/* + * General implementation notes: + * + * Throughout the implementation, there are comments like (W2) that refer to + * rules of the BiDi algorithm, in this example to the second rule of the + * resolution of weak types. + * + * For handling surrogate pairs, where two UChar's form one "abstract" (or UTF-32) + * character according to UTF-16, the second UChar gets the directional property of + * the entire character assigned, while the first one gets a BN, a boundary + * neutral, type, which is ignored by most of the algorithm according to + * rule (X9) and the implementation suggestions of the BiDi algorithm. + * + * Later, adjustWSLevels() will set the level for each BN to that of the + * following character (UChar), which results in surrogate pairs getting the + * same level on each of their surrogates. + * + * In a UTF-8 implementation, the same thing could be done: the last byte of + * a multi-byte sequence would get the "real" property, while all previous + * bytes of that sequence would get BN. + * + * It is not possible to assign all those parts of a character the same real + * property because this would fail in the resolution of weak types with rules + * that look at immediately surrounding types. + * + * As a related topic, this implementation does not remove Boundary Neutral + * types from the input, but ignores them wherever this is relevant. 
+ * For example, the loop for the resolution of the weak types reads + * types until it finds a non-BN. + * Also, explicit embedding codes are neither changed into BN nor removed. + * They are only treated the same way real BNs are. + * As stated before, adjustWSLevels() takes care of them at the end. + * For the purpose of conformance, the levels of all these codes + * do not matter. + * + * Note that this implementation modifies the dirProps + * after the initial setup, when applying X5c (replace FSI by LRI or RLI), + * X6, N0 (replace paired brackets by L or R). + * + * In this implementation, the resolution of weak types (W1 to W6), + * neutrals (N1 and N2), and the assignment of the resolved level (In) + * are all done in one single loop, in resolveImplicitLevels(). + * Changes of dirProp values are done on the fly, without writing + * them back to the dirProps array. + * + * + * This implementation contains code that allows bypassing steps of the + * algorithm that are not needed on the specific paragraph + * in order to speed up the most common cases considerably, + * like text that is entirely LTR, or RTL text without numbers. + * + * Most of this is done by setting a bit for each directional property + * in a flags variable and later checking for whether there are + * any LTR characters or any RTL characters, or both, whether + * there are any explicit embedding codes, etc. + * + * If the (Xn) steps are performed, then the flags are re-evaluated, + * because they will then not contain the embedding codes any more + * and will be adjusted for override codes, so that subsequently + * more bypassing may be possible than what the initial flags suggested. + * + * If the text is not mixed-directional, then the + * algorithm steps for the weak type resolution are not performed, + * and all levels are set to the paragraph level. + * + * If there are no explicit embedding codes, then the (Xn) steps + * are not performed. 
+ * + * If embedding levels are supplied as a parameter, then all + * explicit embedding codes are ignored, and the (Xn) steps + * are not performed. + * + * White Space types could get the level of the run they belong to, + * and are checked with a test of (flags&MASK_EMBEDDING) to + * consider if the paragraph direction should be considered in + * the flags variable. + * + * If there are no White Space types in the paragraph, then + * (L1) is not necessary in adjustWSLevels(). + */ + public class BidiBase { - class Point { + static class Point { int pos; /* position in text */ int flag; /* flag for LRM/RLM, before/after */ } - class InsertPoints { + static class InsertPoints { int size; int confirmed; Point[] points = new Point[0]; } + static class Opening { + int position; /* position of opening bracket */ + int match; /* matching char or -position of closing bracket */ + int contextPos; /* position of last strong char found before opening */ + short flags; /* bits for L or R/AL found within the pair */ + byte contextDir; /* L or R according to last strong char before opening */ + } + + static class IsoRun { + int contextPos; /* position of char determining context */ + short start; /* index of first opening entry for this run */ + short limit; /* index after last opening entry for this run */ + byte level; /* level of this run */ + byte lastStrong; /* bidi class of last strong char found in this run */ + byte lastBase; /* bidi class of last base char found in this run */ + byte contextDir; /* L or R to use as context for following openings */ + } + + static class BracketData { + Opening[] openings = new Opening[SIMPLE_PARAS_COUNT]; + int isoRunLast; /* index of last used entry */ + /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL + + 1 for index 0, + 1 for before the first isolated sequence */ + IsoRun[] isoRuns = new IsoRun[MAX_EXPLICIT_LEVEL+2]; + boolean isNumbersSpecial; /*reordering mode for NUMBERS_SPECIAL */ + } + + 
static class Isolate { + int startON; + int start1; + short stateImp; + short state; + } + /** Paragraph level setting<p> * * Constant indicating that the base direction depends on the first strong @@ -482,7 +592,7 @@ * is assumed to be visual LTR, and the text after reordering is required * to be the corresponding logical string with appropriate contextual * direction. The direction of the result string will be RTL if either - * the righmost or leftmost strong character of the source text is RTL + * the rightmost or leftmost strong character of the source text is RTL * or Arabic Letter, the direction will be LTR otherwise.<p> * * If reordering option <code>OPTION_INSERT_MARKS</code> is set, an RLM may @@ -493,7 +603,7 @@ * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL * @stable ICU 3.8 */ - public static final byte INTERNAL_LEVEL_DEFAULT_LTR = (byte)0x7e; + public static final byte LEVEL_DEFAULT_LTR = (byte)0x7e; /** Paragraph level setting<p> * @@ -508,7 +618,7 @@ * is assumed to be visual LTR, and the text after reordering is required * to be the corresponding logical string with appropriate contextual * direction. The direction of the result string will be RTL if either - * the righmost or leftmost strong character of the source text is RTL + * the rightmost or leftmost strong character of the source text is RTL * or Arabic Letter, or if the text contains no strong character; * the direction will be LTR otherwise.<p> * @@ -520,21 +630,21 @@ * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL * @stable ICU 3.8 */ - public static final byte INTERNAL_LEVEL_DEFAULT_RTL = (byte)0x7f; + public static final byte LEVEL_DEFAULT_RTL = (byte)0x7f; /** * Maximum explicit embedding level. * (The maximum resolved level can be up to <code>MAX_EXPLICIT_LEVEL+1</code>). * @stable ICU 3.8 */ - public static final byte MAX_EXPLICIT_LEVEL = 61; + public static final byte MAX_EXPLICIT_LEVEL = 125; /** * Bit flag for level input. * Overrides directional properties. 
* @stable ICU 3.8 */ - public static final byte INTERNAL_LEVEL_OVERRIDE = (byte)0x80; + public static final byte LEVEL_OVERRIDE = (byte)0x80; /** * Special value which can be returned by the mapping methods when a @@ -555,13 +665,53 @@ public static final int MAP_NOWHERE = -1; /** + * Left-to-right text. + * <ul> + * <li>As return value for <code>getDirection()</code>, it means + * that the source string contains no right-to-left characters, or + * that the source string is empty and the paragraph level is even. + * <li>As return value for <code>getBaseDirection()</code>, it + * means that the first strong character of the source string has + * a left-to-right direction. + * </ul> + * @stable ICU 3.8 + */ + public static final byte LTR = 0; + + /** + * Right-to-left text. + * <ul> + * <li>As return value for <code>getDirection()</code>, it means + * that the source string contains no left-to-right characters, or + * that the source string is empty and the paragraph level is odd. + * <li>As return value for <code>getBaseDirection()</code>, it + * means that the first strong character of the source string has + * a right-to-left direction. + * </ul> + * @stable ICU 3.8 + */ + public static final byte RTL = 1; + + /** * Mixed-directional text. + * <p>As return value for <code>getDirection()</code>, it means + * that the source string contains both left-to-right and + * right-to-left characters. 
* @stable ICU 3.8 */ public static final byte MIXED = 2; /** * option bit for writeReordered(): + * keep combining characters after their base characters in RTL runs + * + * @see #writeReordered + * @stable ICU 3.8 + */ + public static final short KEEP_BASE_COMBINING = 1; + + /** + * option bit for writeReordered(): * replace characters with the "mirrored" property in RTL runs * by their mirror-image mappings * @@ -570,6 +720,50 @@ */ public static final short DO_MIRRORING = 2; + /** + * option bit for writeReordered(): + * surround the run with LRMs if necessary; + * this is part of the approximate "inverse Bidi" algorithm + * + * <p>This option does not imply corresponding adjustment of the index + * mappings.</p> + * + * @see #setInverse + * @see #writeReordered + * @stable ICU 3.8 + */ + public static final short INSERT_LRM_FOR_NUMERIC = 4; + + /** + * option bit for writeReordered(): + * remove Bidi control characters + * (this does not affect INSERT_LRM_FOR_NUMERIC) + * + * <p>This option does not imply corresponding adjustment of the index + * mappings.</p> + * + * @see #writeReordered + * @see #INSERT_LRM_FOR_NUMERIC + * @stable ICU 3.8 + */ + public static final short REMOVE_BIDI_CONTROLS = 8; + + /** + * option bit for writeReordered(): + * write the output in reverse order + * + * <p>This has the same effect as calling <code>writeReordered()</code> + * first without this option, and then calling + * <code>writeReverse()</code> without mirroring. + * Doing this in the same step is faster and avoids a temporary buffer. + * An example for using this option is output to a character terminal that + * is designed for RTL scripts and stores text in reverse order.</p> + * + * @see #writeReordered + * @stable ICU 3.8 + */ + public static final short OUTPUT_REVERSE = 16; + /** Reordering mode: Regular Logical to Visual Bidi algorithm according to Unicode. 
* @see #setReorderingMode * @stable ICU 3.8 @@ -600,7 +794,7 @@ * @see #setReorderingMode * @stable ICU 3.8 */ - private static final short REORDER_RUNS_ONLY = 3; + static final short REORDER_RUNS_ONLY = 3; /** Reordering mode: Visual to Logical algorithm which handles numbers * like L (same algorithm as selected by <code>setInverse(true)</code>. @@ -608,21 +802,21 @@ * @see #setReorderingMode * @stable ICU 3.8 */ - private static final short REORDER_INVERSE_NUMBERS_AS_L = 4; + static final short REORDER_INVERSE_NUMBERS_AS_L = 4; /** Reordering mode: Visual to Logical algorithm equivalent to the regular * Logical to Visual algorithm. * @see #setReorderingMode * @stable ICU 3.8 */ - private static final short REORDER_INVERSE_LIKE_DIRECT = 5; + static final short REORDER_INVERSE_LIKE_DIRECT = 5; /** Reordering mode: Inverse Bidi (Visual to Logical) algorithm for the * <code>REORDER_NUMBERS_SPECIAL</code> Bidi algorithm. * @see #setReorderingMode * @stable ICU 3.8 */ - private static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6; + static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6; /* Reordering mode values must be ordered so that all the regular logical to * visual modes come first, and all inverse Bidi modes come last. 
@@ -682,7 +876,7 @@ * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL * @stable ICU 3.8 */ - private static final int OPTION_INSERT_MARKS = 1; + static final int OPTION_INSERT_MARKS = 1; /** * Option bit for <code>setReorderingOptions</code>: @@ -704,7 +898,7 @@ * @see #REMOVE_BIDI_CONTROLS * @stable ICU 3.8 */ - private static final int OPTION_REMOVE_CONTROLS = 2; + static final int OPTION_REMOVE_CONTROLS = 2; /** * Option bit for <code>setReorderingOptions</code>: @@ -741,8 +935,7 @@ * part of the text.</p> * * <p>When the <code>OPTION_STREAMING</code> option is used, it is - * recommended to call <code>orderParagraphsLTR()</code> with argument - * <code>orderParagraphsLTR</code> set to <code>true</code> before calling + * recommended to call <code>orderParagraphsLTR(true)</code> before calling * <code>setPara()</code> so that later paragraphs may be concatenated to * previous paragraphs on the right. * </p> @@ -750,7 +943,6 @@ * @see #setReorderingMode * @see #setReorderingOptions * @see #getProcessedLength - * @see #orderParagraphsLTR * @stable ICU 3.8 */ private static final int OPTION_STREAMING = 4; @@ -760,7 +952,7 @@ * is easier with the same names for the Bidi types in the code as there. 
* See UCharacterDirection */ - private static final byte L = 0; + /* private */ static final byte L = 0; private static final byte R = 1; private static final byte EN = 2; private static final byte ES = 3; @@ -779,8 +971,55 @@ private static final byte PDF = 16; private static final byte NSM = 17; private static final byte BN = 18; + private static final byte FSI = 19; + private static final byte LRI = 20; + private static final byte RLI = 21; + private static final byte PDI = 22; + private static final byte ENL = PDI + 1; /* EN after W7 */ + private static final byte ENR = ENL + 1; /* EN not subject to W7 */ - private static final int MASK_R_AL = (1 << R | 1 << AL); + // Number of directional types + private static final int CHAR_DIRECTION_COUNT = 23; + + /** + * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + * Used in UAX #9: Unicode Bidirectional Algorithm + * (http://www.unicode.org/reports/tr9/) + * Returns UCharacter.BidiPairedBracketType values. + * @stable ICU 52 + */ + public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; + + /** + * Bidi Paired Bracket Type constants. + * + * @see UProperty#BIDI_PAIRED_BRACKET_TYPE + * @stable ICU 52 + */ + public static interface BidiPairedBracketType { + /** + * Not a paired bracket. + * @stable ICU 52 + */ + public static final int NONE = 0; + /** + * Open paired bracket. + * @stable ICU 52 + */ + public static final int OPEN = 1; + /** + * Close paired bracket. 
+ * @stable ICU 52 + */ + public static final int CLOSE = 2; + /** + * @stable ICU 52 + */ + public static final int COUNT = 3; + } + + /* number of paras entries allocated initially */ + static final int SIMPLE_PARAS_COUNT = 10; private static final char CR = '\r'; private static final char LF = '\n'; @@ -790,12 +1029,22 @@ static final int RLM_BEFORE = 4; static final int RLM_AFTER = 8; + /* flags for Opening.flags */ + static final byte FOUND_L = (byte)DirPropFlag(L); + static final byte FOUND_R = (byte)DirPropFlag(R); + + /* + * The following bit is used for the directional isolate status. + * Stack entries corresponding to isolate sequences are greater than ISOLATE. + */ + static final int ISOLATE = 0x0100; + /* * reference to parent paragraph object (reference to self if this object is * a paragraph object); set to null in a newly opened object; set to a * real value after a successful execution of setPara or setLine */ - BidiBase paraBidi; + BidiBase paraBidi; final UBiDiProps bdp; @@ -828,6 +1077,15 @@ byte[] dirProps; byte[] levels; + /* are we performing an approximation of the "inverse Bidi" algorithm? */ + boolean isInverse; + + /* are we using the basic algorithm or its variation? */ + int reorderingMode; + + /* bitmask for reordering options */ + int reorderingOptions; + /* must block separators receive level 0? 
*/ boolean orderParagraphsLTR; @@ -855,14 +1113,10 @@ /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */ int trailingWSStart; - /* fields for paragraph handling */ - int paraCount; /* set in getDirProps() */ - int[] parasMemory = new int[1]; - int[] paras; /* limits of paragraphs, filled in - ResolveExplicitLevels() or CheckExplicitLevels() */ - - /* for single paragraph text, we only need a tiny array of paras (no allocation) */ - int[] simpleParas = {0}; + /* fields for paragraph handling, set in getDirProps() */ + int paraCount; + int[] paras_limit = new int[SIMPLE_PARAS_COUNT]; + byte[] paras_level = new byte[SIMPLE_PARAS_COUNT]; /* fields for line reordering */ int runCount; /* ==-1: runs not set up yet */ @@ -872,9 +1126,18 @@ /* for non-mixed text, we only need a tiny array of runs (no allocation) */ BidiRun[] simpleRuns = {new BidiRun()}; + /* fields for managing isolate sequences */ + Isolate[] isolates; + + /* maximum or current nesting depth of isolate sequences */ + /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal + nesting encountered. + Within resolveImplicitLevels(), this is the index of the current isolates + stack entry. */ + int isolateCount; + /* mapping of runs in logical order to visual order */ int[] logicalToVisualRunsMap; - /* flag to indicate that the map has been updated */ boolean isGoodLogicalToVisualRunsMap; @@ -894,23 +1157,8 @@ return (1 << dir); } - /* - * The following bit is ORed to the property of characters in paragraphs - * with contextual RTL direction when paraLevel is contextual. - */ - static final byte CONTEXT_RTL_SHIFT = 6; - static final byte CONTEXT_RTL = (byte)(1<<CONTEXT_RTL_SHIFT); // 0x40 - static byte NoContextRTL(byte dir) - { - return (byte)(dir & ~CONTEXT_RTL); - } - - /* - * The following is a variant of DirProp.DirPropFlag() which ignores the - * CONTEXT_RTL bit. 
- */ - static int DirPropFlagNC(byte dir) { - return (1<<(dir & ~CONTEXT_RTL)); + boolean testDirPropFlagAt(int flag, int index) { + return ((DirPropFlag(dirProps[index]) & flag) != 0); } static final int DirPropFlagMultiRuns = DirPropFlag((byte)31); @@ -923,40 +1171,38 @@ static final int DirPropFlagLR(byte level) { return DirPropFlagLR[level & 1]; } static final int DirPropFlagE(byte level) { return DirPropFlagE[level & 1]; } static final int DirPropFlagO(byte level) { return DirPropFlagO[level & 1]; } + static final byte DirFromStrong(byte strong) { return strong == L ? L : R; } + static final byte NoOverride(byte level) { return (byte)(level & ~LEVEL_OVERRIDE); } - /* - * are there any characters that are LTR? - */ + /* are there any characters that are LTR or RTL? */ static final int MASK_LTR = - DirPropFlag(L)|DirPropFlag(EN)|DirPropFlag(AN)|DirPropFlag(LRE)|DirPropFlag(LRO); + DirPropFlag(L)|DirPropFlag(EN)|DirPropFlag(ENL)|DirPropFlag(ENR)|DirPropFlag(AN)|DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(LRI); + static final int MASK_RTL = DirPropFlag(R)|DirPropFlag(AL)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(RLI); - /* - * are there any characters that are RTL? 
- */ - static final int MASK_RTL = DirPropFlag(R)|DirPropFlag(AL)|DirPropFlag(RLE)|DirPropFlag(RLO); + static final int MASK_R_AL = DirPropFlag(R)|DirPropFlag(AL); /* explicit embedding codes */ - private static final int MASK_LRX = DirPropFlag(LRE)|DirPropFlag(LRO); - private static final int MASK_RLX = DirPropFlag(RLE)|DirPropFlag(RLO); - private static final int MASK_EXPLICIT = MASK_LRX|MASK_RLX|DirPropFlag(PDF); + private static final int MASK_EXPLICIT = DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(PDF); private static final int MASK_BN_EXPLICIT = DirPropFlag(BN)|MASK_EXPLICIT; + /* explicit isolate codes */ + private static final int MASK_ISO = DirPropFlag(LRI)|DirPropFlag(RLI)|DirPropFlag(FSI)|DirPropFlag(PDI); + /* paragraph and segment separators */ private static final int MASK_B_S = DirPropFlag(B)|DirPropFlag(S); /* all types that are counted as White Space or Neutral in some steps */ - static final int MASK_WS = MASK_B_S|DirPropFlag(WS)|MASK_BN_EXPLICIT; - private static final int MASK_N = DirPropFlag(ON)|MASK_WS; + static final int MASK_WS = MASK_B_S|DirPropFlag(WS)|MASK_BN_EXPLICIT|MASK_ISO; /* types that are neutrals or could becomes neutrals in (Wn) */ - private static final int MASK_POSSIBLE_N = DirPropFlag(CS)|DirPropFlag(ES)|DirPropFlag(ET)|MASK_N; + private static final int MASK_POSSIBLE_N = DirPropFlag(ON)|DirPropFlag(CS)|DirPropFlag(ES)|DirPropFlag(ET)|MASK_WS; /* * These types may be changed to "e", * the embedding type (L or R) of the run, * in the Bidi algorithm (N2) */ - static final int MASK_EMBEDDING = DirPropFlag(NSM)|MASK_POSSIBLE_N; + private static final int MASK_EMBEDDING = DirPropFlag(NSM)|MASK_POSSIBLE_N; /* * the dirProp's L and R are defined to 0 and 1 values in UCharacterDirection.java @@ -968,30 +1214,25 @@ private static boolean IsDefaultLevel(byte level) { - return ((level & INTERNAL_LEVEL_DEFAULT_LTR) == INTERNAL_LEVEL_DEFAULT_LTR); - } - - byte GetParaLevelAt(int index) - { - return 
(defaultParaLevel != 0) ? - (byte)(dirProps[index]>>CONTEXT_RTL_SHIFT) : paraLevel; + return ((level & LEVEL_DEFAULT_LTR) == LEVEL_DEFAULT_LTR); } static boolean IsBidiControlChar(int c) { /* check for range 0x200c to 0x200f (ZWNJ, ZWJ, LRM, RLM) or 0x202a to 0x202e (LRE, RLE, PDF, LRO, RLO) */ - return (((c & 0xfffffffc) == 0x200c) || ((c >= 0x202a) && (c <= 0x202e))); + return (((c & 0xfffffffc) == 0x200c) || ((c >= 0x202a) && (c <= 0x202e)) + || ((c >= 0x2066) && (c <= 0x2069))); } - public void verifyValidPara() + void verifyValidPara() { - if (this != this.paraBidi) { - throw new IllegalStateException(""); + if (!(this == this.paraBidi)) { + throw new IllegalStateException(); } } - public void verifyValidParaOrLine() + void verifyValidParaOrLine() { BidiBase para = this.paraBidi; /* verify Para */ @@ -1004,7 +1245,7 @@ } } - public void verifyRange(int index, int start, int limit) + void verifyRange(int index, int start, int limit) { if (index < start || index >= limit) { throw new IllegalArgumentException("Value " + index + @@ -1012,14 +1253,6 @@ } } - public void verifyIndex(int index, int start, int limit) - { - if (index < start || index >= limit) { - throw new ArrayIndexOutOfBoundsException("Index " + index + - " is out of range " + start + " to " + limit); - } - } - /** * Allocate a <code>Bidi</code> object with preallocated memory * for internal structures. 
@@ -1051,7 +1284,7 @@ * @stable ICU 3.8 */ public BidiBase(int maxLength, int maxRunCount) - { + { /* check the argument values */ if (maxLength < 0 || maxRunCount < 0) { throw new IllegalArgumentException(); @@ -1075,12 +1308,7 @@ direction = 0; */ /* get Bidi properties */ - try { - bdp = UBiDiProps.getSingleton(); - } - catch (IOException e) { - throw new MissingResourceException(e.getMessage(), "(BidiProps)", ""); - } + bdp = UBiDiProps.INSTANCE; /* allocate memory for arrays as requested */ if (maxLength > 0) { @@ -1180,18 +1408,68 @@ getLevelsMemory(true, len); } - private void getInitialParasMemory(int len) - { - Object array = getMemory("Paras", parasMemory, Integer.TYPE, true, len); - parasMemory = (int[]) array; - } - private void getInitialRunsMemory(int len) { getRunsMemory(true, len); } -/* perform (P2)..(P3) ------------------------------------------------------- */ + /** + * Is this <code>Bidi</code> object set to perform the inverse Bidi + * algorithm? + * <p>Note: calling this method after setting the reordering mode with + * <code>setReorderingMode</code> will return <code>true</code> if the + * reordering mode was set to + * <code>REORDER_INVERSE_NUMBERS_AS_L</code>, <code>false</code> + * for all other values.</p> + * + * @return <code>true</code> if the <code>Bidi</code> object is set to + * perform the inverse Bidi algorithm by handling numbers as L. 
+ * + * @see #setInverse + * @see #setReorderingMode + * @see #REORDER_INVERSE_NUMBERS_AS_L + * @stable ICU 3.8 + */ + public boolean isInverse() { + return isInverse; + } + + /* perform (P2)..(P3) ------------------------------------------------------- */ + + /* + * Check that there are enough entries in the arrays paras_limit and paras_level + */ + private void checkParaCount() { + int[] saveLimits; + byte[] saveLevels; + int count = paraCount; + if (count <= paras_level.length) + return; + int oldLength = paras_level.length; + saveLimits = paras_limit; + saveLevels = paras_level; + try { + paras_limit = new int[count * 2]; + paras_level = new byte[count * 2]; + } catch (Exception e) { + throw new OutOfMemoryError("Failed to allocate memory for paras"); + } + System.arraycopy(saveLimits, 0, paras_limit, 0, oldLength); + System.arraycopy(saveLevels, 0, paras_level, 0, oldLength); + } + + /* + * Get the directional properties for the text, calculate the flags bit-set, and + * determine the paragraph level if necessary (in paras_level[i]). + * FSI initiators are also resolved and their dirProp replaced with LRI or RLI. + * When encountering an FSI, it is initially replaced with an LRI, which is the + * default. Only if a strong R or AL is found within its scope will the LRI be + * replaced by an RLI. 
+ */ + static final int NOT_SEEKING_STRONG = 0; /* 0: not contextual paraLevel, not after FSI */ + static final int SEEKING_STRONG_FOR_PARA = 1; /* 1: looking for first strong char in para */ + static final int SEEKING_STRONG_FOR_FSI = 2; /* 2: looking for first strong after FSI */ + static final int LOOKING_FOR_PDI = 3; /* 3: found strong after FSI, looking for PDI */ private void getDirProps() { @@ -1199,32 +1477,44 @@ flags = 0; /* collect all directionalities in the text */ int uchar; byte dirProp; - byte paraDirDefault = 0; /* initialize to avoid compiler warnings */ + byte defaultParaLevel = 0; /* initialize to avoid compiler warnings */ boolean isDefaultLevel = IsDefaultLevel(paraLevel); /* for inverse Bidi, the default para level is set to RTL if there is a strong R or AL character at either end of the text */ + boolean isDefaultLevelInverse=isDefaultLevel && + (reorderingMode == REORDER_INVERSE_LIKE_DIRECT || + reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL); lastArabicPos = -1; - controlCount = 0; + int controlCount = 0; + boolean removeBidiControls = (reorderingOptions & OPTION_REMOVE_CONTROLS) != 0; - final int NOT_CONTEXTUAL = 0; /* 0: not contextual paraLevel */ - final int LOOKING_FOR_STRONG = 1; /* 1: looking for first strong char */ - final int FOUND_STRONG_CHAR = 2; /* 2: found first strong char */ - - int state; - int paraStart = 0; /* index of first char in paragraph */ - byte paraDir; /* == CONTEXT_RTL within paragraphs - starting with strong R char */ - byte lastStrongDir=0; /* for default level & inverse Bidi */ - int lastStrongLTR=0; /* for STREAMING option */ + byte state; + byte lastStrong = ON; /* for default level & inverse Bidi */ + /* The following stacks are used to manage isolate sequences. Those + sequences may be nested, but obviously never more deeply than the + maximum explicit embedding level. + lastStack is the index of the last used entry in the stack. A value of -1 + means that there is no open isolate sequence. 
+ lastStack is reset to -1 on paragraph boundaries. */ + /* The following stack contains the position of the initiator of + each open isolate sequence */ + int[] isolateStartStack= new int[MAX_EXPLICIT_LEVEL+1]; + /* The following stack contains the last known state before + encountering the initiator of an isolate sequence */ + byte[] previousStateStack = new byte[MAX_EXPLICIT_LEVEL+1]; + int stackLast=-1; + + if ((reorderingOptions & OPTION_STREAMING) != 0) + length = 0; + defaultParaLevel = (byte)(paraLevel & 1); if (isDefaultLevel) { - paraDirDefault = ((paraLevel & 1) != 0) ? CONTEXT_RTL : 0; - paraDir = paraDirDefault; - lastStrongDir = paraDirDefault; - state = LOOKING_FOR_STRONG; + paras_level[0] = defaultParaLevel; + lastStrong = defaultParaLevel; + state = SEEKING_STRONG_FOR_PARA; } else { - state = NOT_CONTEXTUAL; - paraDir = 0; + paras_level[0] = paraLevel; + state = NOT_SEEKING_STRONG; } /* count paragraphs and determine the paragraph level (P2..P3) */ /* @@ -1236,90 +1526,509 @@ for (i = 0; i < originalLength; /* i is incremented in the loop */) { i0 = i; /* index of first code unit */ uchar = UTF16.charAt(text, 0, originalLength, i); - i += Character.charCount(uchar); + i += UTF16.getCharCount(uchar); i1 = i - 1; /* index of last code unit, gets the directional property */ - dirProp = (byte)bdp.getClass(uchar); - + dirProp = (byte)getCustomizedClass(uchar); flags |= DirPropFlag(dirProp); - dirProps[i1] = (byte)(dirProp | paraDir); + dirProps[i1] = dirProp; if (i1 > i0) { /* set previous code units' properties to BN */ flags |= DirPropFlag(BN); do { - dirProps[--i1] = (byte)(BN | paraDir); + dirProps[--i1] = BN; } while (i1 > i0); } - if (state == LOOKING_FOR_STRONG) { - if (dirProp == L) { - state = FOUND_STRONG_CHAR; - if (paraDir != 0) { - paraDir = 0; - for (i1 = paraStart; i1 < i; i1++) { - dirProps[i1] &= ~CONTEXT_RTL; - } - } - continue; + if (removeBidiControls && IsBidiControlChar(uchar)) { + controlCount++; + } + if (dirProp == L) { + if 
(state == SEEKING_STRONG_FOR_PARA) { + paras_level[paraCount - 1] = 0; + state = NOT_SEEKING_STRONG; } - if (dirProp == R || dirProp == AL) { - state = FOUND_STRONG_CHAR; - if (paraDir == 0) { - paraDir = CONTEXT_RTL; - for (i1 = paraStart; i1 < i; i1++) { - dirProps[i1] |= CONTEXT_RTL; - } + else if (state == SEEKING_STRONG_FOR_FSI) { + if (stackLast <= MAX_EXPLICIT_LEVEL) { + /* no need for next statement, already set by default */ + /* dirProps[isolateStartStack[stackLast]] = LRI; */ + flags |= DirPropFlag(LRI); } - continue; + state = LOOKING_FOR_PDI; } + lastStrong = L; + continue; } - if (dirProp == L) { - lastStrongDir = 0; - lastStrongLTR = i; /* i is index to next character */ + if (dirProp == R || dirProp == AL) { + if (state == SEEKING_STRONG_FOR_PARA) { + paras_level[paraCount - 1] = 1; + state = NOT_SEEKING_STRONG; + } + else if (state == SEEKING_STRONG_FOR_FSI) { + if (stackLast <= MAX_EXPLICIT_LEVEL) { + dirProps[isolateStartStack[stackLast]] = RLI; + flags |= DirPropFlag(RLI); + } + state = LOOKING_FOR_PDI; + } + lastStrong = R; + if (dirProp == AL) + lastArabicPos = i - 1; + continue; } - else if (dirProp == R) { - lastStrongDir = CONTEXT_RTL; + if (dirProp >= FSI && dirProp <= RLI) { /* FSI, LRI or RLI */ + stackLast++; + if (stackLast <= MAX_EXPLICIT_LEVEL) { + isolateStartStack[stackLast] = i - 1; + previousStateStack[stackLast] = state; + } + if (dirProp == FSI) { + dirProps[i-1] = LRI; /* default if no strong char */ + state = SEEKING_STRONG_FOR_FSI; + } + else + state = LOOKING_FOR_PDI; + continue; } - else if (dirProp == AL) { - lastStrongDir = CONTEXT_RTL; - lastArabicPos = i-1; - } - else if (dirProp == B) { - if (i < originalLength) { /* B not last char in text */ - if (!((uchar == (int)CR) && (text[i] == (int)LF))) { - paraCount++; + if (dirProp == PDI) { + if (state == SEEKING_STRONG_FOR_FSI) { + if (stackLast <= MAX_EXPLICIT_LEVEL) { + /* no need for next statement, already set by default */ + /* dirProps[isolateStartStack[stackLast]] 
= LRI; */ + flags |= DirPropFlag(LRI); } + } + if (stackLast >= 0) { + if (stackLast <= MAX_EXPLICIT_LEVEL) + state = previousStateStack[stackLast]; + stackLast--; + } + continue; + } + if (dirProp == B) { + if (i < originalLength && uchar == CR && text[i] == LF) /* do nothing on the CR */ + continue; + paras_limit[paraCount - 1] = i; + if (isDefaultLevelInverse && lastStrong == R) + paras_level[paraCount - 1] = 1; + if ((reorderingOptions & OPTION_STREAMING) != 0) { + /* When streaming, we only process whole paragraphs + thus some updates are only done on paragraph boundaries */ + length = i; /* i is index to next character */ + this.controlCount = controlCount; + } + if (i < originalLength) { /* B not last char in text */ + paraCount++; + checkParaCount(); /* check that there is enough memory for a new para entry */ if (isDefaultLevel) { - state=LOOKING_FOR_STRONG; - paraStart = i; /* i is index to next character */ - paraDir = paraDirDefault; - lastStrongDir = paraDirDefault; + paras_level[paraCount - 1] = defaultParaLevel; + state = SEEKING_STRONG_FOR_PARA; + lastStrong = defaultParaLevel; + } else { + paras_level[paraCount - 1] = paraLevel; + state = NOT_SEEKING_STRONG; } + stackLast = -1; } + continue; } } + /* +Ignore still open isolate sequences with overflow */ + if (stackLast > MAX_EXPLICIT_LEVEL) { + stackLast = MAX_EXPLICIT_LEVEL; + state=SEEKING_STRONG_FOR_FSI; /* to be on the safe side */ + } + /* Resolve direction of still unresolved open FSI sequences */ + while (stackLast >= 0) { + if (state == SEEKING_STRONG_FOR_FSI) { + /* no need for next statement, already set by default */ + /* dirProps[isolateStartStack[stackLast]] = LRI; */ + flags |= DirPropFlag(LRI); + break; + } + state = previousStateStack[stackLast]; + stackLast--; + } + /* When streaming, ignore text after the last paragraph separator */ + if ((reorderingOptions & OPTION_STREAMING) != 0) { + if (length < originalLength) + paraCount--; + } else { + paras_limit[paraCount - 1] = 
originalLength; + this.controlCount = controlCount; + } + /* For inverse bidi, default para direction is RTL if there is + a strong R or AL at either end of the paragraph */ + if (isDefaultLevelInverse && lastStrong == R) { + paras_level[paraCount - 1] = 1; + } if (isDefaultLevel) { - paraLevel = GetParaLevelAt(0); + paraLevel = paras_level[0]; } - - /* The following line does nothing new for contextual paraLevel, but is - needed for absolute paraLevel. */ - flags |= DirPropFlagLR(paraLevel); + /* The following is needed to resolve the text direction for default level + paragraphs containing no strong character */ + for (i = 0; i < paraCount; i++) + flags |= DirPropFlagLR(paras_level[i]); if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) { flags |= DirPropFlag(L); } } + /* determine the paragraph level at position index */ + byte GetParaLevelAt(int pindex) + { + if (defaultParaLevel == 0 || pindex < paras_limit[0]) + return paraLevel; + int i; + for (i = 1; i < paraCount; i++) + if (pindex < paras_limit[i]) + break; + if (i >= paraCount) + i = paraCount - 1; + return paras_level[i]; + } + + /* Functions for handling paired brackets ----------------------------------- */ + + /* In the isoRuns array, the first entry is used for text outside of any + isolate sequence. Higher entries are used for each more deeply nested + isolate sequence. isoRunLast is the index of the last used entry. The + openings array is used to note the data of opening brackets not yet + matched by a closing bracket, or matched but still susceptible to change + level. + Each isoRun entry contains the index of the first and + one-after-last openings entries for pending opening brackets it + contains. The next openings entry to use is the one-after-last of the + most deeply nested isoRun entry. + isoRun entries also contain their current embedding level and the last + encountered strong character, since these will be needed to resolve + the level of paired brackets. 
*/ + + private void bracketInit(BracketData bd) { + bd.isoRunLast = 0; + bd.isoRuns[0] = new IsoRun(); + bd.isoRuns[0].start = 0; + bd.isoRuns[0].limit = 0; + bd.isoRuns[0].level = GetParaLevelAt(0); + bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(GetParaLevelAt(0) & 1); + bd.isoRuns[0].contextPos = 0; + bd.openings = new Opening[SIMPLE_PARAS_COUNT]; + bd.isNumbersSpecial = reorderingMode == REORDER_NUMBERS_SPECIAL || + reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL; + } + + /* paragraph boundary */ + private void bracketProcessB(BracketData bd, byte level) { + bd.isoRunLast = 0; + bd.isoRuns[0].limit = 0; + bd.isoRuns[0].level = level; + bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(level & 1); + bd.isoRuns[0].contextPos = 0; + } + + /* LRE, LRO, RLE, RLO, PDF */ + private void bracketProcessBoundary(BracketData bd, int lastCcPos, + byte contextLevel, byte embeddingLevel) { + IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + if ((DirPropFlag(dirProps[lastCcPos]) & MASK_ISO) != 0) /* after an isolate */ + return; + if (NoOverride(embeddingLevel) > NoOverride(contextLevel)) /* not a PDF */ + contextLevel = embeddingLevel; + pLastIsoRun.limit = pLastIsoRun.start; + pLastIsoRun.level = embeddingLevel; + pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(contextLevel & 1); + pLastIsoRun.contextPos = lastCcPos; + } + + /* LRI or RLI */ + private void bracketProcessLRI_RLI(BracketData bd, byte level) { + IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + short lastLimit; + pLastIsoRun.lastBase = ON; + lastLimit = pLastIsoRun.limit; + bd.isoRunLast++; + pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + if (pLastIsoRun == null) + pLastIsoRun = bd.isoRuns[bd.isoRunLast] = new IsoRun(); + pLastIsoRun.start = pLastIsoRun.limit = lastLimit; + pLastIsoRun.level = level; + pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(level & 1); + 
pLastIsoRun.contextPos = 0; + } + + /* PDI */ + private void bracketProcessPDI(BracketData bd) { + IsoRun pLastIsoRun; + bd.isoRunLast--; + pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + pLastIsoRun.lastBase = ON; + } + + /* newly found opening bracket: create an openings entry */ + private void bracketAddOpening(BracketData bd, char match, int position) { + IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + Opening pOpening; + if (pLastIsoRun.limit >= bd.openings.length) { /* no available new entry */ + Opening[] saveOpenings = bd.openings; + int count; + try { + count = bd.openings.length; + bd.openings = new Opening[count * 2]; + } catch (Exception e) { + throw new OutOfMemoryError("Failed to allocate memory for openings"); + } + System.arraycopy(saveOpenings, 0, bd.openings, 0, count); + } + pOpening = bd.openings[pLastIsoRun.limit]; + if (pOpening == null) + pOpening = bd.openings[pLastIsoRun.limit]= new Opening(); + pOpening.position = position; + pOpening.match = match; + pOpening.contextDir = pLastIsoRun.contextDir; + pOpening.contextPos = pLastIsoRun.contextPos; + pOpening.flags = 0; + pLastIsoRun.limit++; + } + + /* change N0c1 to N0c2 when a preceding bracket is assigned the embedding level */ + private void fixN0c(BracketData bd, int openingIndex, int newPropPosition, byte newProp) { + /* This function calls itself recursively */ + IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + Opening qOpening; + int k, openingPosition, closingPosition; + for (k = openingIndex+1; k < pLastIsoRun.limit; k++) { + qOpening = bd.openings[k]; + if (qOpening.match >= 0) /* not an N0c match */ + continue; + if (newPropPosition < qOpening.contextPos) + break; + if (newPropPosition >= qOpening.position) + continue; + if (newProp == qOpening.contextDir) + break; + openingPosition = qOpening.position; + dirProps[openingPosition] = newProp; + closingPosition = -(qOpening.match); + dirProps[closingPosition] = newProp; + qOpening.match = 0; /* prevent further changes */ + fixN0c(bd, 
k, openingPosition, newProp); + fixN0c(bd, k, closingPosition, newProp); + } + } + + /* process closing bracket; return L or R if N0b or N0c, ON if N0d */ + private byte bracketProcessClosing(BracketData bd, int openIdx, int position) { + IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + Opening pOpening, qOpening; + byte direction; + boolean stable; + byte newProp; + pOpening = bd.openings[openIdx]; + direction = (byte)(pLastIsoRun.level & 1); + stable = true; /* assume stable until proved otherwise */ + + /* The stable flag is set when brackets are paired and their + level is resolved and cannot be changed by what will be + found later in the source string. + An unstable match can occur only when applying N0c, where + the resolved level depends on the preceding context, and + this context may be affected by text occurring later. + Example: RTL paragraph containing: abc[(latin) HEBREW] + When the closing parenthesis is encountered, it appears + that N0c1 must be applied since 'abc' sets an opposite + direction context and both parentheses receive level 2. + However, when the closing square bracket is processed, + N0b applies because of 'HEBREW' being included within the + brackets, thus the square brackets are treated like R and + receive level 1. However, this changes the preceding + context of the opening parenthesis, and it now appears + that N0c2 must be applied to the parentheses rather than + N0c1. 
*/ + + if ((direction == 0 && (pOpening.flags & FOUND_L) > 0) || + (direction == 1 && (pOpening.flags & FOUND_R) > 0)) { /* N0b */ + newProp = direction; + } + else if ((pOpening.flags & (FOUND_L | FOUND_R)) != 0) { /* N0c */ + /* it is stable if there is no preceding text or in + conditions too complicated and not worth checking */ + stable = (openIdx == pLastIsoRun.start); + if (direction != pOpening.contextDir) + newProp = pOpening.contextDir; /* N0c1 */ + else + newProp = direction; /* N0c2 */ + } else { + /* forget this and any brackets nested within this pair */ + pLastIsoRun.limit = (short)openIdx; + return ON; /* N0d */ + } + dirProps[pOpening.position] = newProp; + dirProps[position] = newProp; + /* Update nested N0c pairs that may be affected */ + fixN0c(bd, openIdx, pOpening.position, newProp); + if (stable) { + pLastIsoRun.limit = (short)openIdx; /* forget any brackets nested within this pair */ + /* remove lower located synonyms if any */ + while (pLastIsoRun.limit > pLastIsoRun.start && + bd.openings[pLastIsoRun.limit - 1].position == pOpening.position) + pLastIsoRun.limit--; + } else { + int k; + pOpening.match = -position; + /* neutralize lower located synonyms if any */ + k = openIdx - 1; + while (k >= pLastIsoRun.start && + bd.openings[k].position == pOpening.position) + bd.openings[k--].match = 0; + /* neutralize any unmatched opening between the current pair; + this will also neutralize higher located synonyms if any */ + for (k = openIdx + 1; k < pLastIsoRun.limit; k++) { + qOpening =bd.openings[k]; + if (qOpening.position >= position) + break; + if (qOpening.match > 0) + qOpening.match = 0; + } + } + return newProp; + } + + /* handle strong characters, digits and candidates for closing brackets */ + private void bracketProcessChar(BracketData bd, int position) { + IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; + byte dirProp, newProp; + byte level; + dirProp = dirProps[position]; + if (dirProp == ON) { + char c, match; + int idx; + /* First 
see if it is a matching closing bracket. Hopefully, this is + more efficient than checking if it is a closing bracket at all */ + c = text[position]; + for (idx = pLastIsoRun.limit - 1; idx >= pLastIsoRun.start; idx--) { + if (bd.openings[idx].match != c) + continue; + /* We have a match */ + newProp = bracketProcessClosing(bd, idx, position); + if(newProp == ON) { /* N0d */ + c = 0; /* prevent handling as an opening */ + break; + } + pLastIsoRun.lastBase = ON; + pLastIsoRun.contextDir = newProp; + pLastIsoRun.contextPos = position; + level = levels[position]; + if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */ + short flag; + int i; + newProp = (byte)(level & 1); + pLastIsoRun.lastStrong = newProp; + flag = (short)DirPropFlag(newProp); + for (i = pLastIsoRun.start; i < idx; i++) + bd.openings[i].flags |= flag; + /* matching brackets are not overridden by LRO/RLO */ + levels[position] &= ~LEVEL_OVERRIDE; + } + /* matching brackets are not overridden by LRO/RLO */ + levels[bd.openings[idx].position] &= ~LEVEL_OVERRIDE; + return; + } + /* We get here only if the ON character is not a matching closing + bracket or it is a case of N0d */ + /* Now see if it is an opening bracket */ + if (c != 0) { + match = (char)UCharacter.getBidiPairedBracket(c); /* get the matching char */ + } else { + match = 0; + } + if (match != c && /* has a matching char */ + UCharacter.getIntPropertyValue(c, BIDI_PAIRED_BRACKET_TYPE) == + /* opening bracket */ BidiPairedBracketType.OPEN) { + /* special case: process synonyms + create an opening entry for each synonym */ + if (match == 0x232A) { /* RIGHT-POINTING ANGLE BRACKET */ + bracketAddOpening(bd, (char)0x3009, position); + } + else if (match == 0x3009) { /* RIGHT ANGLE BRACKET */ + bracketAddOpening(bd, (char)0x232A, position); + } + bracketAddOpening(bd, match, position); + } + } + level = levels[position]; + if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */ + newProp = (byte)(level & 1); + if (dirProp != S && dirProp != WS && dirProp != 
ON) + dirProps[position] = newProp; + pLastIsoRun.lastBase = newProp; + pLastIsoRun.lastStrong = newProp; + pLastIsoRun.contextDir = newProp; + pLastIsoRun.contextPos = position; + } + else if (dirProp <= R || dirProp == AL) { + newProp = DirFromStrong(dirProp); + pLastIsoRun.lastBase = dirProp; + pLastIsoRun.lastStrong = dirProp; + pLastIsoRun.contextDir = newProp; + pLastIsoRun.contextPos = position; + } + else if(dirProp == EN) { + pLastIsoRun.lastBase = EN; + if (pLastIsoRun.lastStrong == L) { + newProp = L; /* W7 */ + if (!bd.isNumbersSpecial) + dirProps[position] = ENL; + pLastIsoRun.contextDir = L; + pLastIsoRun.contextPos = position; + } + else { + newProp = R; /* N0 */ + if (pLastIsoRun.lastStrong == AL) + dirProps[position] = AN; /* W2 */ + else + dirProps[position] = ENR; + pLastIsoRun.contextDir = R; + pLastIsoRun.contextPos = position; + } + } + else if (dirProp == AN) { + newProp = R; /* N0 */ + pLastIsoRun.lastBase = AN; + pLastIsoRun.contextDir = R; + pLastIsoRun.contextPos = position; + } + else if (dirProp == NSM) { + /* if the last real char was ON, change NSM to ON so that it + will stay ON even if the last real char is a bracket which + may be changed to L or R */ + newProp = pLastIsoRun.lastBase; + if (newProp == ON) + dirProps[position] = newProp; + } + else { + newProp = dirProp; + pLastIsoRun.lastBase = dirProp; + } + if (newProp <= R || newProp == AL) { + int i; + short flag = (short)DirPropFlag(DirFromStrong(newProp)); + for (i = pLastIsoRun.start; i < pLastIsoRun.limit; i++) + if (position > bd.openings[i].position) + bd.openings[i].flags |= flag; + } + } + /* perform (X1)..(X9) ------------------------------------------------------- */ /* determine if the text is mixed-directional or single-directional */ private byte directionFromFlags() { + /* if the text contains AN and neutrals, then some neutrals may become RTL */ if (!((flags & MASK_RTL) != 0 || ((flags & DirPropFlag(AN)) != 0 && (flags & MASK_POSSIBLE_N) != 0))) { - return 
Bidi.DIRECTION_LEFT_TO_RIGHT; + return LTR; } else if ((flags & MASK_LTR) == 0) { - return Bidi.DIRECTION_RIGHT_TO_LEFT; + return RTL; } else { return MIXED; } @@ -1330,16 +2039,16 @@ * Recalculate the flags to have them reflect the real properties * after taking the explicit embeddings into account. * - * The Bidi algorithm is designed to result in the same behavior whether embedding + * The BiDi algorithm is designed to result in the same behavior whether embedding * levels are externally specified (from "styled text", supposedly the preferred - * method) or set by explicit embedding codes (LRx, RLx, PDF) in the plain text. - * That is why (X9) instructs to remove all explicit codes (and BN). - * However, in a real implementation, this removal of these codes and their index + * method) or set by explicit embedding codes (LRx, RLx, PDF, FSI, PDI) in the plain text. + * That is why (X9) instructs to remove all not-isolate explicit codes (and BN). + * However, in a real implementation, the removal of these codes and their index * positions in the plain text is undesirable since it would result in * reallocated, reindexed text. * Instead, this implementation leaves the codes in there and just ignores them * in the subsequent processing. - * In order to get the same reordering behavior, positions with a BN or an + * In order to get the same reordering behavior, positions with a BN or a not-isolate * explicit embedding code just get the same level assigned as the last "real" * character. * @@ -1351,185 +2060,281 @@ * This limits the scope of the implicit rules in effectively * the same way as the run limits. * - * Instead, this implementation does not modify these codes. + * Instead, this implementation does not modify these codes, except for + * paired brackets whose properties (ON) may be replaced by L or R. 
* On one hand, the paragraph has to be scanned for same-level-runs, but * on the other hand, this saves another loop to reset these codes, * or saves making and modifying a copy of dirProps[]. * * - * Note that (Pn) and (Xn) changed significantly from version 4 of the Bidi algorithm. + * Note that (Pn) and (Xn) changed significantly from version 4 of the BiDi algorithm. * * * Handling the stack of explicit levels (Xn): * - * With the Bidi stack of explicit levels, - * as pushed with each LRE, RLE, LRO, and RLO and popped with each PDF, - * the explicit level must never exceed MAX_EXPLICIT_LEVEL==61. + * With the BiDi stack of explicit levels, as pushed with each + * LRE, RLE, LRO, RLO, LRI, RLI and FSI and popped with each PDF and PDI, + * the explicit level must never exceed MAX_EXPLICIT_LEVEL. * * In order to have a correct push-pop semantics even in the case of overflows, - * there are two overflow counters: - * - countOver60 is incremented with each LRx at level 60 - * - from level 60, one RLx increases the level to 61 - * - countOver61 is incremented with each LRx and RLx at level 61 - * - * Popping levels with PDF must work in the opposite order so that level 61 - * is correct at the correct point. Underflows (too many PDFs) must be checked. + * overflow counters and a valid isolate counter are used as described in UAX#9 + * section 3.3.2 "Explicit Levels and Directions". * * This implementation assumes that MAX_EXPLICIT_LEVEL is odd. 
+ * + * Returns the direction + * */ private byte resolveExplicitLevels() { int i = 0; byte dirProp; byte level = GetParaLevelAt(0); - byte dirct; - int paraIndex = 0; + isolateCount = 0; /* determine if the text is mixed-directional or single-directional */ dirct = directionFromFlags(); - /* we may not need to resolve any explicit levels, but for multiple - paragraphs we want to loop on all chars to set the para boundaries */ - if ((dirct != MIXED) && (paraCount == 1)) { + /* we may not need to resolve any explicit levels */ + if (dirct != MIXED) { /* not mixed directionality: levels don't matter - trailingWSStart will be 0 */ - } else if ((paraCount == 1) && - ((flags & MASK_EXPLICIT) == 0)) { - /* mixed, but all characters are at the same embedding level */ - /* or we are in "inverse Bidi" */ - /* and we don't have contextual multiple paragraphs with some B char */ + return dirct; + } + + if (reorderingMode > REORDER_LAST_LOGICAL_TO_VISUAL) { + /* inverse BiDi: mixed, but all characters are at the same embedding level */ /* set all levels to the paragraph level */ - for (i = 0; i < length; ++i) { - levels[i] = level; + int paraIndex, start, limit; + for (paraIndex = 0; paraIndex < paraCount; paraIndex++) { + if (paraIndex == 0) + start = 0; + else + start = paras_limit[paraIndex - 1]; + limit = paras_limit[paraIndex]; + level = paras_level[paraIndex]; + for (i = start; i < limit; i++) + levels[i] =level; } - } else { - /* continue to perform (Xn) */ - - /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */ - /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */ - byte embeddingLevel = level; - byte newLevel; - byte stackTop = 0; - - byte[] stack = new byte[MAX_EXPLICIT_LEVEL]; /* we never push anything >=MAX_EXPLICIT_LEVEL */ - int countOver60 = 0; - int countOver61 = 0; /* count overflows of explicit levels */ - - /* recalculate the flags */ - flags = 0; - - for (i = 0; i < length; ++i) { - 
dirProp = NoContextRTL(dirProps[i]); - switch(dirProp) { - case LRE: - case LRO: - /* (X3, X5) */ - newLevel = (byte)((embeddingLevel+2) & ~(INTERNAL_LEVEL_OVERRIDE | 1)); /* least greater even level */ - if (newLevel <= MAX_EXPLICIT_LEVEL) { - stack[stackTop] = embeddingLevel; - ++stackTop; - embeddingLevel = newLevel; - if (dirProp == LRO) { - embeddingLevel |= INTERNAL_LEVEL_OVERRIDE; - } - /* we don't need to set LEVEL_OVERRIDE off for LRE - since this has already been done for newLevel which is - the source for embeddingLevel. - */ - } else if ((embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) == MAX_EXPLICIT_LEVEL) { - ++countOver61; - } else /* (embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) == MAX_EXPLICIT_LEVEL-1 */ { - ++countOver60; - } - flags |= DirPropFlag(BN); - break; - case RLE: - case RLO: - /* (X2, X4) */ - newLevel=(byte)(((embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) + 1) | 1); /* least greater odd level */ - if (newLevel<=MAX_EXPLICIT_LEVEL) { - stack[stackTop] = embeddingLevel; - ++stackTop; - embeddingLevel = newLevel; - if (dirProp == RLO) { - embeddingLevel |= INTERNAL_LEVEL_OVERRIDE; - } - /* we don't need to set LEVEL_OVERRIDE off for RLE - since this has already been done for newLevel which is - the source for embeddingLevel. 
- */ - } else { - ++countOver61; - } - flags |= DirPropFlag(BN); - break; - case PDF: - /* (X7) */ - /* handle all the overflow cases first */ - if (countOver61 > 0) { - --countOver61; - } else if (countOver60 > 0 && (embeddingLevel & ~INTERNAL_LEVEL_OVERRIDE) != MAX_EXPLICIT_LEVEL) { - /* handle LRx overflows from level 60 */ - --countOver60; - } else if (stackTop > 0) { - /* this is the pop operation; it also pops level 61 while countOver60>0 */ - --stackTop; - embeddingLevel = stack[stackTop]; - /* } else { (underflow) */ - } - flags |= DirPropFlag(BN); - break; - case B: - stackTop = 0; - countOver60 = 0; - countOver61 = 0; - level = GetParaLevelAt(i); - if ((i + 1) < length) { - embeddingLevel = GetParaLevelAt(i+1); - if (!((text[i] == CR) && (text[i + 1] == LF))) { - paras[paraIndex++] = i+1; - } - } - flags |= DirPropFlag(B); - break; - case BN: - /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */ - /* they will get their levels set correctly in adjustWSLevels() */ - flags |= DirPropFlag(BN); - break; - default: - /* all other types get the "real" level */ - if (level != embeddingLevel) { - level = embeddingLevel; - if ((level & INTERNAL_LEVEL_OVERRIDE) != 0) { - flags |= DirPropFlagO(level) | DirPropFlagMultiRuns; - } else { - flags |= DirPropFlagE(level) | DirPropFlagMultiRuns; + return dirct; /* no bracket matching for inverse BiDi */ + } + if ((flags & (MASK_EXPLICIT | MASK_ISO)) == 0) { + /* no embeddings, set all levels to the paragraph level */ + /* we still have to perform bracket matching */ + int paraIndex, start, limit; + BracketData bracketData = new BracketData(); + bracketInit(bracketData); + for (paraIndex = 0; paraIndex < paraCount; paraIndex++) { + if (paraIndex == 0) + start = 0; + else + start = paras_limit[paraIndex-1]; + limit = paras_limit[paraIndex]; + level = paras_level[paraIndex]; + for (i = start; i < limit; i++) { + levels[i] = level; + dirProp = dirProps[i]; + if (dirProp == BN) + continue; + if (dirProp == B) { + if ((i 
+ 1) < length) { + if (text[i] == CR && text[i + 1] == LF) + continue; /* skip CR when followed by LF */ + bracketProcessB(bracketData, level); } + continue; } - if ((level & INTERNAL_LEVEL_OVERRIDE) == 0) { - flags |= DirPropFlag(dirProp); - } - break; + bracketProcessChar(bracketData, i); } - - /* - * We need to set reasonable levels even on BN codes and - * explicit codes because we will later look at same-level runs (X10). - */ - levels[i] = level; - } - if ((flags & MASK_EMBEDDING) != 0) { - flags |= DirPropFlagLR(paraLevel); - } - if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) { - flags |= DirPropFlag(L); } + return dirct; + } + /* continue to perform (Xn) */ - /* subsequently, ignore the explicit codes and BN (X9) */ + /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */ + /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */ + byte embeddingLevel = level, newLevel; + byte previousLevel = level; /* previous level for regular (not CC) characters */ + int lastCcPos = 0; /* index of last effective LRx,RLx, PDx */ + + /* The following stack remembers the embedding level and the ISOLATE flag of level runs. + stackLast points to its current entry. 
*/ + short[] stack = new short[MAX_EXPLICIT_LEVEL + 2]; /* we never push anything >= MAX_EXPLICIT_LEVEL + but we need one more entry as base */ + int stackLast = 0; + int overflowIsolateCount = 0; + int overflowEmbeddingCount = 0; + int validIsolateCount = 0; + BracketData bracketData = new BracketData(); + bracketInit(bracketData); + stack[0] = level; /* initialize base entry to para level, no override, no isolate */ - /* again, determine if the text is mixed-directional or single-directional */ - dirct = directionFromFlags(); + /* recalculate the flags */ + flags = 0; + + for (i = 0; i < length; i++) { + dirProp = dirProps[i]; + switch (dirProp) { + case LRE: + case RLE: + case LRO: + case RLO: + /* (X2, X3, X4, X5) */ + flags |= DirPropFlag(BN); + levels[i] = previousLevel; + if (dirProp == LRE || dirProp == LRO) { + /* least greater even level */ + newLevel = (byte)((embeddingLevel+2) & ~(LEVEL_OVERRIDE | 1)); + } else { + /* least greater odd level */ + newLevel = (byte)((NoOverride(embeddingLevel) + 1) | 1); + } + if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0 && + overflowEmbeddingCount == 0) { + lastCcPos = i; + embeddingLevel = newLevel; + if (dirProp == LRO || dirProp == RLO) + embeddingLevel |= LEVEL_OVERRIDE; + stackLast++; + stack[stackLast] = embeddingLevel; + /* we don't need to set LEVEL_OVERRIDE off for LRE and RLE + since this has already been done for newLevel which is + the source for embeddingLevel. 
+ */ + } else { + if (overflowIsolateCount == 0) + overflowEmbeddingCount++; + } + break; + case PDF: + /* (X7) */ + flags |= DirPropFlag(BN); + levels[i] = previousLevel; + /* handle all the overflow cases first */ + if (overflowIsolateCount > 0) { + break; + } + if (overflowEmbeddingCount > 0) { + overflowEmbeddingCount--; + break; + } + if (stackLast > 0 && stack[stackLast] < ISOLATE) { /* not an isolate entry */ + lastCcPos = i; + stackLast--; + embeddingLevel = (byte)stack[stackLast]; + } + break; + case LRI: + case RLI: + flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel); + levels[i] = NoOverride(embeddingLevel); + if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) { + bracketProcessBoundary(bracketData, lastCcPos, + previousLevel, embeddingLevel); + flags |= DirPropFlagMultiRuns; + } + previousLevel = embeddingLevel; + /* (X5a, X5b) */ + if (dirProp == LRI) + /* least greater even level */ + newLevel=(byte)((embeddingLevel+2)&~(LEVEL_OVERRIDE|1)); + else + /* least greater odd level */ + newLevel=(byte)((NoOverride(embeddingLevel)+1)|1); + if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0 + && overflowEmbeddingCount == 0) { + flags |= DirPropFlag(dirProp); + lastCcPos = i; + validIsolateCount++; + if (validIsolateCount > isolateCount) + isolateCount = validIsolateCount; + embeddingLevel = newLevel; + /* we can increment stackLast without checking because newLevel + will exceed UBIDI_MAX_EXPLICIT_LEVEL before stackLast overflows */ + stackLast++; + stack[stackLast] = (short)(embeddingLevel + ISOLATE); + bracketProcessLRI_RLI(bracketData, embeddingLevel); + } else { + /* make it WS so that it is handled by adjustWSLevels() */ + dirProps[i] = WS; + overflowIsolateCount++; + } + break; + case PDI: + if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) { + bracketProcessBoundary(bracketData, lastCcPos, + previousLevel, embeddingLevel); + flags |= DirPropFlagMultiRuns; + } + /* (X6a) */ + if (overflowIsolateCount > 0) { + 
overflowIsolateCount--; + /* make it WS so that it is handled by adjustWSLevels() */ + dirProps[i] = WS; + } + else if (validIsolateCount > 0) { + flags |= DirPropFlag(PDI); + lastCcPos = i; + overflowEmbeddingCount = 0; + while (stack[stackLast] < ISOLATE) /* pop embedding entries */ + stackLast--; /* until the last isolate entry */ + stackLast--; /* pop also the last isolate entry */ + validIsolateCount--; + bracketProcessPDI(bracketData); + } else + /* make it WS so that it is handled by adjustWSLevels() */ + dirProps[i] = WS; + embeddingLevel = (byte)(stack[stackLast] & ~ISOLATE); + flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel); + previousLevel = embeddingLevel; + levels[i] = NoOverride(embeddingLevel); + break; + case B: + flags |= DirPropFlag(B); + levels[i] = GetParaLevelAt(i); + if ((i + 1) < length) { + if (text[i] == CR && text[i + 1] == LF) + break; /* skip CR when followed by LF */ + overflowEmbeddingCount = overflowIsolateCount = 0; + validIsolateCount = 0; + stackLast = 0; + previousLevel = embeddingLevel = GetParaLevelAt(i + 1); + stack[0] = embeddingLevel; /* initialize base entry to para level, no override, no isolate */ + bracketProcessB(bracketData, embeddingLevel); + } + break; + case BN: + /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */ + /* they will get their levels set correctly in adjustWSLevels() */ + levels[i] = previousLevel; + flags |= DirPropFlag(BN); + break; + default: + /* all other types are normal characters and get the "real" level */ + if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) { + bracketProcessBoundary(bracketData, lastCcPos, + previousLevel, embeddingLevel); + flags |= DirPropFlagMultiRuns; + if ((embeddingLevel & LEVEL_OVERRIDE) != 0) + flags |= DirPropFlagO(embeddingLevel); + else + flags |= DirPropFlagE(embeddingLevel); + } + previousLevel = embeddingLevel; + levels[i] = embeddingLevel; + bracketProcessChar(bracketData, i); + /* the dirProp may have been changed in 
bracketProcessChar() */ + flags |= DirPropFlag(dirProps[i]); + break; + } + } + if ((flags & MASK_EMBEDDING) != 0) { + flags |= DirPropFlagLR(paraLevel); } + if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) { + flags |= DirPropFlag(L); + } + /* again, determine if the text is mixed-directional or single-directional */ + dirct = directionFromFlags(); return dirct; } @@ -1547,49 +2352,57 @@ private byte checkExplicitLevels() { byte dirProp; int i; + int isolateCount = 0; + this.flags = 0; /* collect all directionalities in the text */ byte level; - int paraIndex = 0; + this.isolateCount = 0; for (i = 0; i < length; ++i) { if (levels[i] == 0) { - levels[i] = paraLevel; + levels[i] = paraLevel; } + + // for backward compatibility if (MAX_EXPLICIT_LEVEL < (levels[i]&0x7f)) { - if ((levels[i] & INTERNAL_LEVEL_OVERRIDE) != 0) { - levels[i] = (byte)(paraLevel|INTERNAL_LEVEL_OVERRIDE); + if ((levels[i] & LEVEL_OVERRIDE) != 0) { + levels[i] = (byte)(paraLevel|LEVEL_OVERRIDE); } else { levels[i] = paraLevel; } } + level = levels[i]; - dirProp = NoContextRTL(dirProps[i]); - if ((level & INTERNAL_LEVEL_OVERRIDE) != 0) { + dirProp = dirProps[i]; + if (dirProp == LRI || dirProp == RLI) { + isolateCount++; + if (isolateCount > this.isolateCount) + this.isolateCount = isolateCount; + } + else if (dirProp == PDI) { + isolateCount--; + } else if (dirProp == B) { + isolateCount = 0; + } + if ((level & LEVEL_OVERRIDE) != 0) { /* keep the override flag in levels[i] but adjust the flags */ - level &= ~INTERNAL_LEVEL_OVERRIDE; /* make the range check below simpler */ + level &= ~LEVEL_OVERRIDE; /* make the range check below simpler */ flags |= DirPropFlagO(level); } else { /* set the flags */ flags |= DirPropFlagE(level) | DirPropFlag(dirProp); } - if ((level < GetParaLevelAt(i) && !((0 == level) && (dirProp == B))) || - (MAX_EXPLICIT_LEVEL <level)) { + (MAX_EXPLICIT_LEVEL < level)) { /* level out of bounds */ throw new IllegalArgumentException("level " + level + - " out of bounds 
at index " + i); - } - if ((dirProp == B) && ((i + 1) < length)) { - if (!((text[i] == CR) && (text[i + 1] == LF))) { - paras[paraIndex++] = i + 1; - } + " out of bounds at " + i); } } - if ((flags&MASK_EMBEDDING) != 0) { + if ((flags & MASK_EMBEDDING) != 0) { flags |= DirPropFlagLR(paraLevel); } - /* determine if the text is mixed-directional or single-directional */ return directionFromFlags(); } @@ -1610,7 +2423,7 @@ /*********************************************************************/ /* Definitions and type for properties state tables */ /*********************************************************************/ - private static final int IMPTABPROPS_COLUMNS = 14; + private static final int IMPTABPROPS_COLUMNS = 16; private static final int IMPTABPROPS_RES = IMPTABPROPS_COLUMNS - 1; private static short GetStateProps(short cell) { return (short)(cell & 0x1f); @@ -1621,8 +2434,8 @@ private static final short groupProp[] = /* dirProp regrouped */ { - /* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN */ - 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10 + /* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN FSI LRI RLI PDI ENL ENR */ + 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10, 4, 4, 4, 4, 13, 14 }; private static final short _L = 0; private static final short _R = 1; @@ -1637,7 +2450,7 @@ /* PROPERTIES STATE TABLE */ /* */ /* In table impTabProps, */ - /* - the ON column regroups ON and WS */ + /* - the ON column regroups ON and WS, FSI, RLI, LRI and PDI */ /* - the BN column regroups BN, LRE, RLE, LRO, RLO, PDF */ /* - the Res column is the reduced property assigned to a run */ /* */ @@ -1668,25 +2481,31 @@ /* */ private static final short impTabProps[][] = { -/* L, R, EN, AN, ON, S, B, ES, ET, CS, BN, NSM, AL, Res */ -/* 0 Init */ { 1, 2, 4, 5, 7, 15, 17, 7, 9, 7, 0, 7, 3, _ON }, -/* 1 L */ { 1, 32+2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 1, 1, 32+3, _L }, -/* 2 R */ { 32+1, 2, 32+4, 32+5, 
32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 2, 2, 32+3, _R }, -/* 3 AL */ { 32+1, 32+2, 32+6, 32+6, 32+8, 32+16, 32+17, 32+8, 32+8, 32+8, 3, 3, 3, _R }, -/* 4 EN */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 64+10, 11, 64+10, 4, 4, 32+3, _EN }, -/* 5 AN */ { 32+1, 32+2, 32+4, 5, 32+7, 32+15, 32+17, 32+7, 32+9, 64+12, 5, 5, 32+3, _AN }, -/* 6 AL:EN/AN */ { 32+1, 32+2, 6, 6, 32+8, 32+16, 32+17, 32+8, 32+8, 64+13, 6, 6, 32+3, _AN }, -/* 7 ON */ { 32+1, 32+2, 32+4, 32+5, 7, 32+15, 32+17, 7, 64+14, 7, 7, 7, 32+3, _ON }, -/* 8 AL:ON */ { 32+1, 32+2, 32+6, 32+6, 8, 32+16, 32+17, 8, 8, 8, 8, 8, 32+3, _ON }, -/* 9 ET */ { 32+1, 32+2, 4, 32+5, 7, 32+15, 32+17, 7, 9, 7, 9, 9, 32+3, _ON }, -/*10 EN+ES/CS */ { 96+1, 96+2, 4, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 10, 128+7, 96+3, _EN }, -/*11 EN+ET */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 32+7, 11, 32+7, 11, 11, 32+3, _EN }, -/*12 AN+CS */ { 96+1, 96+2, 96+4, 5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 12, 128+7, 96+3, _AN }, -/*13 AL:EN/AN+CS */ { 96+1, 96+2, 6, 6, 128+8, 96+16, 96+17, 128+8, 128+8, 128+8, 13, 128+8, 96+3, _AN }, -/*14 ON+ET */ { 32+1, 32+2, 128+4, 32+5, 7, 32+15, 32+17, 7, 14, 7, 14, 14, 32+3, _ON }, -/*15 S */ { 32+1, 32+2, 32+4, 32+5, 32+7, 15, 32+17, 32+7, 32+9, 32+7, 15, 32+7, 32+3, _S }, -/*16 AL:S */ { 32+1, 32+2, 32+6, 32+6, 32+8, 16, 32+17, 32+8, 32+8, 32+8, 16, 32+8, 32+3, _S }, -/*17 B */ { 32+1, 32+2, 32+4, 32+5, 32+7, 32+15, 17, 32+7, 32+9, 32+7, 17, 32+7, 32+3, _B } +/* L, R, EN, AN, ON, S, B, ES, ET, CS, BN, NSM, AL, ENL, ENR, Res */ +/* 0 Init */ { 1, 2, 4, 5, 7, 15, 17, 7, 9, 7, 0, 7, 3, 18, 21, _ON }, +/* 1 L */ { 1, 32+2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 1, 1, 32+3, 32+18, 32+21, _L }, +/* 2 R */ { 32+1, 2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 2, 2, 32+3, 32+18, 32+21, _R }, +/* 3 AL */ { 32+1, 32+2, 32+6, 32+6, 32+8, 32+16, 32+17, 32+8, 32+8, 32+8, 3, 3, 3, 32+18, 32+21, _R }, +/* 4 EN */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 64+10, 11, 64+10, 4, 
4, 32+3, 18, 21, _EN }, +/* 5 AN */ { 32+1, 32+2, 32+4, 5, 32+7, 32+15, 32+17, 32+7, 32+9, 64+12, 5, 5, 32+3, 32+18, 32+21, _AN }, +/* 6 AL:EN/AN */ { 32+1, 32+2, 6, 6, 32+8, 32+16, 32+17, 32+8, 32+8, 64+13, 6, 6, 32+3, 18, 21, _AN }, +/* 7 ON */ { 32+1, 32+2, 32+4, 32+5, 7, 32+15, 32+17, 7, 64+14, 7, 7, 7, 32+3, 32+18, 32+21, _ON }, +/* 8 AL:ON */ { 32+1, 32+2, 32+6, 32+6, 8, 32+16, 32+17, 8, 8, 8, 8, 8, 32+3, 32+18, 32+21, _ON }, +/* 9 ET */ { 32+1, 32+2, 4, 32+5, 7, 32+15, 32+17, 7, 9, 7, 9, 9, 32+3, 18, 21, _ON }, +/*10 EN+ES/CS */ { 96+1, 96+2, 4, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 10, 128+7, 96+3, 18, 21, _EN }, +/*11 EN+ET */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 32+7, 11, 32+7, 11, 11, 32+3, 18, 21, _EN }, +/*12 AN+CS */ { 96+1, 96+2, 96+4, 5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 12, 128+7, 96+3, 96+18, 96+21, _AN }, +/*13 AL:EN/AN+CS */ { 96+1, 96+2, 6, 6, 128+8, 96+16, 96+17, 128+8, 128+8, 128+8, 13, 128+8, 96+3, 18, 21, _AN }, +/*14 ON+ET */ { 32+1, 32+2, 128+4, 32+5, 7, 32+15, 32+17, 7, 14, 7, 14, 14, 32+3,128+18,128+21, _ON }, +/*15 S */ { 32+1, 32+2, 32+4, 32+5, 32+7, 15, 32+17, 32+7, 32+9, 32+7, 15, 32+7, 32+3, 32+18, 32+21, _S }, +/*16 AL:S */ { 32+1, 32+2, 32+6, 32+6, 32+8, 16, 32+17, 32+8, 32+8, 32+8, 16, 32+8, 32+3, 32+18, 32+21, _S }, +/*17 B */ { 32+1, 32+2, 32+4, 32+5, 32+7, 32+15, 17, 32+7, 32+9, 32+7, 17, 32+7, 32+3, 32+18, 32+21, _B }, +/*18 ENL */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 64+19, 20, 64+19, 18, 18, 32+3, 18, 21, _L }, +/*19 ENL+ES/CS */ { 96+1, 96+2, 18, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 19, 128+7, 96+3, 18, 21, _L }, +/*20 ENL+ET */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 32+7, 20, 32+7, 20, 20, 32+3, 18, 21, _L }, +/*21 ENR */ { 32+1, 32+2, 21, 32+5, 32+7, 32+15, 32+17, 64+22, 23, 64+22, 21, 21, 32+3, 18, 21, _AN }, +/*22 ENR+ES/CS */ { 96+1, 96+2, 21, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 22, 128+7, 96+3, 18, 21, _AN }, +/*23 ENR+ET */ { 32+1, 32+2, 21, 32+5, 32+7, 
32+15, 32+17, 32+7, 23, 32+7, 23, 23, 32+3, 18, 21, _AN } }; /*********************************************************************/ @@ -1760,7 +2579,7 @@ /* */ private static final byte impTabL_DEFAULT[][] = /* Even paragraph level */ - /* In this table, conditional sequences receive the higher possible level + /* In this table, conditional sequences receive the lower possible level until proven otherwise. */ { @@ -1769,8 +2588,8 @@ /* 1 : R */ { 0, 1, 3, 3, 0x14, 0x14, 0, 1 }, /* 2 : AN */ { 0, 1, 0, 2, 0x15, 0x15, 0, 2 }, /* 3 : R+EN/AN */ { 0, 1, 3, 3, 0x14, 0x14, 0, 2 }, - /* 4 : R+ON */ { 0x20, 1, 3, 3, 4, 4, 0x20, 1 }, - /* 5 : AN+ON */ { 0x20, 1, 0x20, 2, 5, 5, 0x20, 1 } + /* 4 : R+ON */ { 0, 0x21, 0x33, 0x33, 4, 4, 0, 0 }, + /* 5 : AN+ON */ { 0, 0x21, 0, 0x32, 5, 5, 0, 0 } }; private static final byte impTabR_DEFAULT[][] = /* Odd paragraph level */ @@ -1787,20 +2606,20 @@ /* 5 : L+AN+ON */ { 1, 0, 1, 3, 5, 5, 0, 0 } }; - private static final short[] impAct0 = {0,1,2,3,4,5,6}; + private static final short[] impAct0 = {0,1,2,3,4}; private static final ImpTabPair impTab_DEFAULT = new ImpTabPair( impTabL_DEFAULT, impTabR_DEFAULT, impAct0, impAct0); private static final byte impTabL_NUMBERS_SPECIAL[][] = { /* Even paragraph level */ - /* In this table, conditional sequences receive the higher possible + /* In this table, conditional sequences receive the lower possible level until proven otherwise. 
*/ /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 0, 2, 1, 1, 0, 0, 0, 0 }, - /* 1 : L+EN/AN */ { 0, 2, 1, 1, 0, 0, 0, 2 }, - /* 2 : R */ { 0, 2, 4, 4, 0x13, 0, 0, 1 }, - /* 3 : R+ON */ { 0x20, 2, 4, 4, 3, 3, 0x20, 1 }, + /* 0 : init */ { 0, 2, 0x11, 0x11, 0, 0, 0, 0 }, + /* 1 : L+EN/AN */ { 0, 0x42, 1, 1, 0, 0, 0, 0 }, + /* 2 : R */ { 0, 2, 4, 4, 0x13, 0x13, 0, 1 }, + /* 3 : R+ON */ { 0, 0x22, 0x34, 0x34, 3, 3, 0, 0 }, /* 4 : R+EN/AN */ { 0, 2, 4, 4, 0x13, 0x13, 0, 2 } }; private static final ImpTabPair impTab_NUMBERS_SPECIAL = new ImpTabPair( @@ -1874,7 +2693,7 @@ /* 5 : L+AN+ON */ { 0x21, 0x30, 6, 4, 5, 5, 0x30, 2 }, /* 6 : L+ON+EN */ { 0x21, 0x30, 6, 4, 3, 3, 0x30, 1 } }; - private static final short[] impAct1 = {0,1,11,12}; + private static final short[] impAct1 = {0,1,13,14}; private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT = new ImpTabPair( impTabL_DEFAULT, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1); @@ -1898,15 +2717,16 @@ /* 0 : init */ { 0x13, 0, 1, 1, 0, 0, 0, 0 }, /* 1 : R+EN/AN */ { 0x23, 0, 1, 1, 2, 0x40, 0, 1 }, /* 2 : R+EN/AN+ON */ { 0x23, 0, 1, 1, 2, 0x40, 0, 0 }, - /* 3 : L */ { 3 , 0, 3, 0x36, 0x14, 0x40, 0, 1 }, + /* 3 : L */ { 3, 0, 3, 0x36, 0x14, 0x40, 0, 1 }, /* 4 : L+ON */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 0 }, /* 5 : L+ON+EN */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 1 }, /* 6 : L+AN */ { 0x53, 0x40, 6, 6, 4, 0x40, 0x40, 3 } }; - private static final short impAct2[] = {0,1,7,8,9,10}; + private static final short[] impAct2 = {0,1,2,5,6,7,8}; + private static final short[] impAct3 = {0,1,9,10,11,12}; private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT_WITH_MARKS = new ImpTabPair(impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS, - impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct0, impAct2); + impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3); private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL = new ImpTabPair( impTabL_NUMBERS_SPECIAL, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1); @@ -1923,14 +2743,15 
@@ }; private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS = new ImpTabPair(impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS, - impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct0, impAct2); + impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3); - private class LevState { + private static class LevState { byte[][] impTab; /* level table pointer */ short[] impAct; /* action map array */ int startON; /* start of ON sequence */ int startL2EN; /* start of level 2 sequence */ int lastStrongRTL; /* index of last found R or AL */ + int runStart; /* start position of the run */ short state; /* current state */ byte runLevel; /* run level before implicit solving */ } @@ -1962,6 +2783,22 @@ insertPoints.size++; } + private void setLevelsOutsideIsolates(int start, int limit, byte level) + { + byte dirProp; + int isolateCount = 0, k; + for (k = start; k < limit; k++) { + dirProp = dirProps[k]; + if (dirProp == PDI) + isolateCount--; + if (isolateCount == 0) { + levels[k] = level; + } + if (dirProp == LRI || dirProp == RLI) + isolateCount++; + } + } + /* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */ /* @@ -2003,7 +2840,17 @@ start = levState.startON; break; - case 3: /* L or S after possible relevant EN/AN */ + case 3: /* EN/AN after R+ON */ + level = (byte)(levState.runLevel + 1); + setLevelsOutsideIsolates(levState.startON, start0, level); + break; + + case 4: /* EN/AN before R for NUMBERS_SPECIAL */ + level = (byte)(levState.runLevel + 2); + setLevelsOutsideIsolates(levState.startON, start0, level); + break; + + case 5: /* L or S after possible relevant EN/AN */ /* check if we had EN after R/AL */ if (levState.startL2EN >= 0) { addPoint(levState.startL2EN, LRM_BEFORE); @@ -2039,7 +2886,7 @@ } break; - case 4: /* R/AL after possible relevant EN/AN */ + case 6: /* R/AL after possible relevant EN/AN */ /* just clean up */ if (insertPoints.points.length > 0) /* remove all non confirmed insert points */ @@ -2049,12 +2896,15 
@@ levState.lastStrongRTL = limit - 1; break; - case 5: /* EN/AN after R/AL + possible cont */ + case 7: /* EN/AN after R/AL + possible cont */ /* check for real AN */ - if ((_prop == _AN) && (NoContextRTL(dirProps[start0]) == AN)) { + + if ((_prop == _AN) && (dirProps[start0] == AN) && + (reorderingMode != REORDER_INVERSE_FOR_NUMBERS_SPECIAL)) + { /* real AN */ if (levState.startL2EN == -1) { /* if no relevant EN already found */ - /* just note the righmost digit as a strong RTL */ + /* just note the rightmost digit as a strong RTL */ levState.lastStrongRTL = limit - 1; break; } @@ -2072,12 +2922,12 @@ } break; - case 6: /* note location of latest R/AL */ + case 8: /* note location of latest R/AL */ levState.lastStrongRTL = limit - 1; levState.startON = -1; break; - case 7: /* L after R+ON/EN/AN */ + case 9: /* L after R+ON/EN/AN */ /* include possible adjacent number on the left */ for (k = start0-1; k >= 0 && ((levels[k] & 1) == 0); k--) { } @@ -2088,14 +2938,14 @@ levState.startON = start0; break; - case 8: /* AN after L */ + case 10: /* AN after L */ /* AN numbers between L text on both sides may be trouble. 
*/ /* tentatively bracket with LRMs; will be confirmed if followed by L */ addPoint(start0, LRM_BEFORE); /* add LRM before */ addPoint(start0, LRM_AFTER); /* add LRM after */ break; - case 9: /* R after L+ON/EN/AN */ + case 11: /* R after L+ON/EN/AN */ /* false alert, infirm LRMs around previous AN */ insertPoints.size=insertPoints.confirmed; if (_prop == _S) { /* add RLM before S */ @@ -2104,7 +2954,7 @@ } break; - case 10: /* L after L+ON/AN */ + case 12: /* L after L+ON/AN */ level = (byte)(levState.runLevel + addLevel); for (k=levState.startON; k < start0; k++) { if (levels[k] < level) { @@ -2115,7 +2965,7 @@ levState.startON = start0; break; - case 11: /* L after L+ON+EN/AN/ON */ + case 13: /* L after L+ON+EN/AN/ON */ level = levState.runLevel; for (k = start0-1; k >= levState.startON; k--) { if (levels[k] == level+3) { @@ -2134,7 +2984,7 @@ } break; - case 12: /* R after L+ON+EN/AN/ON */ + case 14: /* R after L+ON+EN/AN/ON */ level = (byte)(levState.runLevel+1); for (k = start0-1; k >= levState.startON; k--) { if (levels[k] > level) { @@ -2149,22 +2999,27 @@ } if ((addLevel) != 0 || (start < start0)) { level = (byte)(levState.runLevel + addLevel); - for (k = start; k < limit; k++) { - levels[k] = level; + if (start >= levState.runStart) { + for (k = start; k < limit; k++) { + levels[k] = level; + } + } else { + setLevelsOutsideIsolates(start, limit, level); } } } private void resolveImplicitLevels(int start, int limit, short sor, short eor) { + byte dirProp; LevState levState = new LevState(); int i, start1, start2; short oldStateImp, stateImp, actionImp; short gprop, resProp, cell; + boolean inverseRTL; short nextStrongProp = R; int nextStrongPos = -1; - /* check for RTL inverse Bidi mode */ /* FOOD FOR THOUGHT: in case of RTL inverse Bidi, it would make sense to * loop on the text characters from end to start. @@ -2172,29 +3027,78 @@ * actions) and different levels state tables (maybe very similar to the * LTR corresponding ones. 
*/ - /* initialize for levels state table */ + inverseRTL=((start<lastArabicPos) && ((GetParaLevelAt(start) & 1)>0) && + (reorderingMode == REORDER_INVERSE_LIKE_DIRECT || + reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL)); + /* initialize for property and levels state table */ levState.startL2EN = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ levState.lastStrongRTL = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ - levState.state = 0; + levState.runStart = start; levState.runLevel = levels[start]; levState.impTab = impTabPair.imptab[levState.runLevel & 1]; levState.impAct = impTabPair.impact[levState.runLevel & 1]; - processPropertySeq(levState, sor, start, start); - /* initialize for property state table */ - if (dirProps[start] == NSM) { - stateImp = (short)(1 + sor); + + /* The isolates[] entries contain enough information to + resume the bidi algorithm in the same state as it was + when it was interrupted by an isolate sequence. */ + if (dirProps[start] == PDI) { + levState.startON = isolates[isolateCount].startON; + start1 = isolates[isolateCount].start1; + stateImp = isolates[isolateCount].stateImp; + levState.state = isolates[isolateCount].state; + isolateCount--; } else { - stateImp = 0; + levState.startON = -1; + start1 = start; + if (dirProps[start] == NSM) + stateImp = (short)(1 + sor); + else + stateImp = 0; + levState.state = 0; + processPropertySeq(levState, sor, start, start); } - start1 = start; - start2 = 0; + start2 = start; /* to make the Java compiler happy */ for (i = start; i <= limit; i++) { if (i >= limit) { + int k; + for (k = limit - 1; + k > start && + (DirPropFlag(dirProps[k]) & MASK_BN_EXPLICIT) != 0; + k--); + dirProp = dirProps[k]; + if (dirProp == LRI || dirProp == RLI) + break; /* no forced closing for sequence ending with LRI/RLI */ gprop = eor; } else { - short prop, prop1; - prop = NoContextRTL(dirProps[i]); + byte prop, prop1; + prop = dirProps[i]; + if (prop == B) + isolateCount = -1; /* current isolates stack 
entry == none */ + if (inverseRTL) { + if (prop == AL) { + /* AL before EN does not make it AN */ + prop = R; + } else if (prop == EN) { + if (nextStrongPos <= i) { + /* look for next strong char (L/R/AL) */ + int j; + nextStrongProp = R; /* set default */ + nextStrongPos = limit; + for (j = i+1; j < limit; j++) { + prop1 = dirProps[j]; + if (prop1 == L || prop1 == R || prop1 == AL) { + nextStrongProp = prop1; + nextStrongPos = j; + break; + } + } + } + if (nextStrongProp == AL) { + prop = AN; + } + } + } gprop = groupProp[prop]; } oldStateImp = stateImp; @@ -2230,8 +3134,24 @@ } } } - /* flush possible pending sequence, e.g. ON */ - processPropertySeq(levState, eor, limit, limit); + + /* look for the last char not a BN or LRE/RLE/LRO/RLO/PDF */ + for (i = limit - 1; + i > start && + (DirPropFlag(dirProps[i]) & MASK_BN_EXPLICIT) != 0; + i--); + dirProp = dirProps[i]; + if ((dirProp == LRI || dirProp == RLI) && limit < length) { + isolateCount++; + if (isolates[isolateCount] == null) + isolates[isolateCount] = new Isolate(); + isolates[isolateCount].stateImp = stateImp; + isolates[isolateCount].state = levState.state; + isolates[isolateCount].start1 = start1; + isolates[isolateCount].startON = levState.startON; + } + else + processPropertySeq(levState, eor, limit, limit); } /* perform (L1) and (X9) ---------------------------------------------------- */ @@ -2250,7 +3170,7 @@ i = trailingWSStart; while (i > 0) { /* reset a sequence of WS/BN before eop and B/S to the paragraph paraLevel */ - while (i > 0 && ((flag = DirPropFlagNC(dirProps[--i])) & MASK_WS) != 0) { + while (i > 0 && ((flag = DirPropFlag(dirProps[--i])) & MASK_WS) != 0) { if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) { levels[i] = 0; } else { @@ -2261,7 +3181,7 @@ /* reset BN to the next character's paraLevel until B/S, which restarts above loop */ /* here, i+1 is guaranteed to be <length */ while (i > 0) { - flag = DirPropFlagNC(dirProps[--i]); + flag = DirPropFlag(dirProps[--i]); if ((flag & 
MASK_BN_EXPLICIT) != 0) { levels[i] = levels[i + 1]; } else if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) { @@ -2276,6 +3196,10 @@ } } + private void setParaSuccess() { + paraBidi = this; /* mark successful setPara */ + } + private int Bidi_Min(int x, int y) { return x < y ? x : y; } @@ -2284,6 +3208,159 @@ return x >= 0 ? x : -x; } + void setParaRunsOnly(char[] parmText, byte parmParaLevel) { + int[] visualMap; + String visualText; + int saveLength, saveTrailingWSStart; + byte[] saveLevels; + byte saveDirection; + int i, j, visualStart, logicalStart, + oldRunCount, runLength, addedRuns, insertRemove, + start, limit, step, indexOddBit, logicalPos, + index, index1; + int saveOptions; + + reorderingMode = REORDER_DEFAULT; + int parmLength = parmText.length; + if (parmLength == 0) { + setPara(parmText, parmParaLevel, null); + reorderingMode = REORDER_RUNS_ONLY; + return; + } + /* obtain memory for mapping table and visual text */ + saveOptions = reorderingOptions; + if ((saveOptions & OPTION_INSERT_MARKS) > 0) { + reorderingOptions &= ~OPTION_INSERT_MARKS; + reorderingOptions |= OPTION_REMOVE_CONTROLS; + } + parmParaLevel &= 1; /* accept only 0 or 1 */ + setPara(parmText, parmParaLevel, null); + /* we cannot access directly levels since it is not yet set if + * direction is not MIXED + */ + saveLevels = new byte[this.length]; + System.arraycopy(getLevels(), 0, saveLevels, 0, this.length); + saveTrailingWSStart = trailingWSStart; + + /* FOOD FOR THOUGHT: instead of writing the visual text, we could use + * the visual map and the dirProps array to drive the second call + * to setPara (but must make provision for possible removal of + * Bidi controls. Alternatively, only use the dirProps array via + * customized classifier callback. 
+ */ + visualText = writeReordered(DO_MIRRORING); + visualMap = getVisualMap(); + this.reorderingOptions = saveOptions; + saveLength = this.length; + saveDirection=this.direction; + + this.reorderingMode = REORDER_INVERSE_LIKE_DIRECT; + parmParaLevel ^= 1; + setPara(visualText, parmParaLevel, null); + BidiLine.getRuns(this); + /* check if some runs must be split, count how many splits */ + addedRuns = 0; + oldRunCount = this.runCount; + visualStart = 0; + for (i = 0; i < oldRunCount; i++, visualStart += runLength) { + runLength = runs[i].limit - visualStart; + if (runLength < 2) { + continue; + } + logicalStart = runs[i].start; + for (j = logicalStart+1; j < logicalStart+runLength; j++) { + index = visualMap[j]; + index1 = visualMap[j-1]; + if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) { + addedRuns++; + } + } + } + if (addedRuns > 0) { + getRunsMemory(oldRunCount + addedRuns); + if (runCount == 1) { + /* because we switch from UBiDi.simpleRuns to UBiDi.runs */ + runsMemory[0] = runs[0]; + } else { + System.arraycopy(runs, 0, runsMemory, 0, runCount); + } + runs = runsMemory; + runCount += addedRuns; + for (i = oldRunCount; i < runCount; i++) { + if (runs[i] == null) { + runs[i] = new BidiRun(0, 0, (byte)0); + } + } + } + /* split runs which are not consecutive in source text */ + int newI; + for (i = oldRunCount-1; i >= 0; i--) { + newI = i + addedRuns; + runLength = i==0 ? 
runs[0].limit : + runs[i].limit - runs[i-1].limit; + logicalStart = runs[i].start; + indexOddBit = runs[i].level & 1; + if (runLength < 2) { + if (addedRuns > 0) { + runs[newI].copyFrom(runs[i]); + } + logicalPos = visualMap[logicalStart]; + runs[newI].start = logicalPos; + runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); + continue; + } + if (indexOddBit > 0) { + start = logicalStart; + limit = logicalStart + runLength - 1; + step = 1; + } else { + start = logicalStart + runLength - 1; + limit = logicalStart; + step = -1; + } + for (j = start; j != limit; j += step) { + index = visualMap[j]; + index1 = visualMap[j+step]; + if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) { + logicalPos = Bidi_Min(visualMap[start], index); + runs[newI].start = logicalPos; + runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); + runs[newI].limit = runs[i].limit; + runs[i].limit -= Bidi_Abs(j - start) + 1; + insertRemove = runs[i].insertRemove & (LRM_AFTER|RLM_AFTER); + runs[newI].insertRemove = insertRemove; + runs[i].insertRemove &= ~insertRemove; + start = j + step; + addedRuns--; + newI--; + } + } + if (addedRuns > 0) { + runs[newI].copyFrom(runs[i]); + } + logicalPos = Bidi_Min(visualMap[start], visualMap[limit]); + runs[newI].start = logicalPos; + runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); + } + + cleanup1: + /* restore initial paraLevel */ + this.paraLevel ^= 1; + cleanup2: + /* restore real text */ + this.text = parmText; + this.length = saveLength; + this.originalLength = parmLength; + this.direction=saveDirection; + this.levels = saveLevels; + this.trailingWSStart = saveTrailingWSStart; + if (runCount > 1) { + this.direction = MIXED; + } + cleanup3: + this.reorderingMode = REORDER_RUNS_ONLY; + } + /** * Perform the Unicode Bidi algorithm. 
It is defined in the * <a href="http://www.unicode.org/unicode/reports/tr9/">Unicode Standard Annex #9</a>, @@ -2386,7 +3463,7 @@ * For example, in pure LTR text with numbers the numbers would get * a resolved level of 2 higher than the surrounding text according to * the algorithm. This implementation may set all resolved levels to - * the same value in such a case.<p> + * the same value in such a case. * * The text can be composed of multiple paragraphs. Occurrence of a block * separator in the text terminates a paragraph, and whatever comes next starts @@ -2421,9 +3498,9 @@ * (same index) character if the level has the * <code>LEVEL_OVERRIDE</code> bit set.<br><br> * Except for that bit, it must be - * {@code paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL}, + * <code>paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL</code>, * with one exception: a level of zero may be specified for a - * paragraph separator even if {@code paraLevel > 0} when multiple + * paragraph separator even if <code>paraLevel>0</code> when multiple * paragraphs are submitted in the same call to <code>setPara()</code>.<br><br> * <strong>Caution: </strong>A reference to this array, not a copy * of the levels, will be stored in the <code>Bidi</code> object; @@ -2444,22 +3521,28 @@ * @see #MAX_EXPLICIT_LEVEL * @stable ICU 3.8 */ - public void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels) + void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels) { /* check the argument values */ - if (paraLevel < INTERNAL_LEVEL_DEFAULT_LTR) { + if (paraLevel < LEVEL_DEFAULT_LTR) { verifyRange(paraLevel, 0, MAX_EXPLICIT_LEVEL + 1); } if (chars == null) { chars = new char[0]; } + /* special treatment for RUNS_ONLY mode */ + if (reorderingMode == REORDER_RUNS_ONLY) { + setParaRunsOnly(chars, paraLevel); + return; + } + /* initialize the Bidi object */ this.paraBidi = null; /* mark unfinished setPara */ this.text = chars; this.length = this.originalLength = this.resultLength = 
text.length; this.paraLevel = paraLevel; - this.direction = Bidi.DIRECTION_LEFT_TO_RIGHT; + this.direction = (byte)(paraLevel & 1); this.paraCount = 1; /* Allocate zero-length arrays instead of setting to null here; then @@ -2475,11 +3558,7 @@ /* * Save the original paraLevel if contextual; otherwise, set to 0. */ - if (IsDefaultLevel(paraLevel)) { - defaultParaLevel = paraLevel; - } else { - defaultParaLevel = 0; - } + defaultParaLevel = IsDefaultLevel(paraLevel) ? paraLevel : 0; if (length == 0) { /* @@ -2491,17 +3570,10 @@ this.paraLevel &= 1; defaultParaLevel = 0; } - if ((this.paraLevel & 1) != 0) { - flags = DirPropFlag(R); - direction = Bidi.DIRECTION_RIGHT_TO_LEFT; - } else { - flags = DirPropFlag(L); - direction = Bidi.DIRECTION_LEFT_TO_RIGHT; - } - + flags = DirPropFlagLR(paraLevel); runCount = 0; paraCount = 0; - paraBidi = this; /* mark successful setPara */ + setParaSuccess(); return; } @@ -2515,21 +3587,9 @@ getDirPropsMemory(length); dirProps = dirPropsMemory; getDirProps(); - /* the processed length may have changed if OPTION_STREAMING is set */ trailingWSStart = length; /* the levels[] will reflect the WS run */ - /* allocate paras memory */ - if (paraCount > 1) { - getInitialParasMemory(paraCount); - paras = parasMemory; - paras[paraCount - 1] = length; - } else { - /* initialize paras for single paragraph */ - paras = simpleParas; - simpleParas[0] = length; - } - /* are explicit levels specified? */ if (embeddingLevels == null) { /* no: determine explicit levels according to the (Xn) rules */ @@ -2542,28 +3602,62 @@ direction = checkExplicitLevels(); } + /* allocate isolate memory */ + if (isolateCount > 0) { + if (isolates == null || isolates.length < isolateCount) + isolates = new Isolate[isolateCount + 3]; /* keep some reserve */ + } + isolateCount = -1; /* current isolates stack entry == none */ + /* * The steps after (X9) in the Bidi algorithm are performed only if * the paragraph text has mixed directionality! 
*/ switch (direction) { - case Bidi.DIRECTION_LEFT_TO_RIGHT: - /* make sure paraLevel is even */ - paraLevel = (byte)((paraLevel + 1) & ~1); - + case LTR: /* all levels are implicitly at paraLevel (important for getLevels()) */ trailingWSStart = 0; break; - case Bidi.DIRECTION_RIGHT_TO_LEFT: - /* make sure paraLevel is odd */ - paraLevel |= 1; - + case RTL: /* all levels are implicitly at paraLevel (important for getLevels()) */ trailingWSStart = 0; break; default: - this.impTabPair = impTab_DEFAULT; - + /* + * Choose the right implicit state table + */ + switch(reorderingMode) { + case REORDER_DEFAULT: + this.impTabPair = impTab_DEFAULT; + break; + case REORDER_NUMBERS_SPECIAL: + this.impTabPair = impTab_NUMBERS_SPECIAL; + break; + case REORDER_GROUP_NUMBERS_WITH_R: + this.impTabPair = impTab_GROUP_NUMBERS_WITH_R; + break; + case REORDER_RUNS_ONLY: + /* we should never get here */ + throw new InternalError("Internal ICU error in setPara"); + /* break; */ + case REORDER_INVERSE_NUMBERS_AS_L: + this.impTabPair = impTab_INVERSE_NUMBERS_AS_L; + break; + case REORDER_INVERSE_LIKE_DIRECT: + if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) { + this.impTabPair = impTab_INVERSE_LIKE_DIRECT_WITH_MARKS; + } else { + this.impTabPair = impTab_INVERSE_LIKE_DIRECT; + } + break; + case REORDER_INVERSE_FOR_NUMBERS_SPECIAL: + if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) { + this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS; + } else { + this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL; + } + break; + } /* * If there are no external levels specified and there * are no significant explicit level codes in the text, @@ -2601,7 +3695,7 @@ /* the values for this run's start are the same as for the previous run's end */ start = limit; level = nextLevel; - if ((start > 0) && (NoContextRTL(dirProps[start - 1]) == B)) { + if ((start > 0) && (dirProps[start - 1] == B)) { /* except if this is a new paragraph, then set sor = para level */ sor = 
GetLRFromLevel(GetParaLevelAt(start)); } else { @@ -2609,7 +3703,9 @@ } /* search for the limit of this run */ - while (++limit < length && levels[limit] == level) {} + while ((++limit < length) && + ((levels[limit] == level) || + ((DirPropFlag(dirProps[limit]) & MASK_BN_EXPLICIT) != 0))) {} /* get the correct level of the next run */ if (limit < length) { @@ -2619,7 +3715,7 @@ } /* determine eor from max(level, nextLevel); sor is last run's eor */ - if ((level & ~INTERNAL_LEVEL_OVERRIDE) < (nextLevel & ~INTERNAL_LEVEL_OVERRIDE)) { + if (NoOverride(level) < NoOverride(nextLevel)) { eor = GetLRFromLevel(nextLevel); } else { eor = GetLRFromLevel(level); @@ -2627,12 +3723,12 @@ /* if the run consists of overridden directional types, then there are no implicit types to be resolved */ - if ((level & INTERNAL_LEVEL_OVERRIDE) == 0) { + if ((level & LEVEL_OVERRIDE) == 0) { resolveImplicitLevels(start, limit, sor, eor); } else { /* remove the LEVEL_OVERRIDE flags */ do { - levels[start++] &= ~INTERNAL_LEVEL_OVERRIDE; + levels[start++] &= ~LEVEL_OVERRIDE; } while (start < limit); } } while (limit < length); @@ -2644,8 +3740,46 @@ break; } - resultLength += insertPoints.size; - paraBidi = this; /* mark successful setPara */ + /* add RLM for inverse Bidi with contextual orientation resolving + * to RTL which would not round-trip otherwise + */ + if ((defaultParaLevel > 0) && + ((reorderingOptions & OPTION_INSERT_MARKS) != 0) && + ((reorderingMode == REORDER_INVERSE_LIKE_DIRECT) || + (reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL))) { + int start, last; + byte level; + byte dirProp; + for (int i = 0; i < paraCount; i++) { + last = paras_limit[i] - 1; + level = paras_level[i]; + if (level == 0) + continue; /* LTR paragraph */ + start = i == 0 ? 
0 : paras_limit[i - 1]; + for (int j = last; j >= start; j--) { + dirProp = dirProps[j]; + if (dirProp == L) { + if (j < last) { + while (dirProps[last] == B) { + last--; + } + } + addPoint(last, RLM_BEFORE); + break; + } + if ((DirPropFlag(dirProp) & MASK_R_AL) != 0) { + break; + } + } + } + } + + if ((reorderingOptions & OPTION_REMOVE_CONTROLS) != 0) { + resultLength -= controlCount; + } else { + resultLength += insertPoints.size; + } + setParaSuccess(); } /** @@ -2682,7 +3816,7 @@ * For example, in pure LTR text with numbers the numbers would get * a resolved level of 2 higher than the surrounding text according to * the algorithm. This implementation may set all resolved levels to - * the same value in such a case. + * the same value in such a case.<p> * * @param paragraph a paragraph of text with optional character and * paragraph attribute information @@ -2693,13 +3827,14 @@ byte paraLvl; char ch = paragraph.first(); Boolean runDirection = - (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION); + (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION); Object shaper = paragraph.getAttribute(TextAttributeConstants.NUMERIC_SHAPING); + if (runDirection == null) { - paraLvl = INTERNAL_LEVEL_DEFAULT_LTR; + paraLvl = LEVEL_DEFAULT_LTR; } else { paraLvl = (runDirection.equals(TextAttributeConstants.RUN_DIRECTION_LTR)) ? 
- (byte)Bidi.DIRECTION_LEFT_TO_RIGHT : (byte)Bidi.DIRECTION_RIGHT_TO_LEFT; + LTR : RTL; } byte[] lvls = null; @@ -2717,7 +3852,7 @@ /* no-op */ } else if (level < 0) { lvls = embeddingLevels; - embeddingLevels[i] = (byte)((0 - level) | INTERNAL_LEVEL_OVERRIDE); + embeddingLevels[i] = (byte)((0 - level) | LEVEL_OVERRIDE); } else { lvls = embeddingLevels; embeddingLevels[i] = level; @@ -2751,7 +3886,7 @@ * @see #setPara * @stable ICU 3.8 */ - private void orderParagraphsLTR(boolean ordarParaLTR) { + public void orderParagraphsLTR(boolean ordarParaLTR) { orderParagraphsLTR = ordarParaLTR; } @@ -2771,7 +3906,7 @@ * @see #MIXED * @stable ICU 3.8 */ - private byte getDirection() + public byte getDirection() { verifyValidParaOrLine(); return direction; @@ -2819,31 +3954,25 @@ } /** - * Get the index of a paragraph, given a position within the text. + * Retrieves the Bidi class for a given code point. + * <p>If a <code>BidiClassifier</code> is defined and returns a value + * other than <code>CLASS_DEFAULT</code>, that value is used; otherwise + * the default class determination mechanism is invoked.</p> * - * @param charIndex is the index of a character within the text, in the - * range <code>[0..getProcessedLength()-1]</code>. + * @param c The code point to get a Bidi class for. * - * @return The index of the paragraph containing the specified position, - * starting from 0. + * @return The Bidi class for the character <code>c</code> that is in effect + * for this <code>Bidi</code> instance. 
* - * @throws IllegalStateException if this call is not preceded by a successful - * call to <code>setPara</code> or <code>setLine</code> - * @throws IllegalArgumentException if charIndex is not within the legal range - * - * @see com.ibm.icu.text.BidiRun - * @see #getProcessedLength * @stable ICU 3.8 */ - public int getParagraphIndex(int charIndex) - { - verifyValidParaOrLine(); - BidiBase bidi = paraBidi; /* get Para object if Line object */ - verifyRange(charIndex, 0, bidi.length); - int paraIndex; - for (paraIndex = 0; charIndex >= bidi.paras[paraIndex]; paraIndex++) { - } - return paraIndex; + public int getCustomizedClass(int c) { + int dir; + + dir = bdp.getClass(c); + if (dir >= CHAR_DIRECTION_COUNT) + dir = ON; + return dir; } /** @@ -2891,7 +4020,7 @@ verifyRange(start, 0, limit); verifyRange(limit, 0, length+1); - return BidiLine.setLine(bidi, this, newBidi, newBidiBase, start, limit); + return BidiLine.setLine(this, newBidi, newBidiBase, start, limit); } /** @@ -2911,9 +4040,11 @@ */ public byte getLevelAt(int charIndex) { + // for backward compatibility if (charIndex < 0 || charIndex >= length) { return (byte)getBaseLevel(); } + verifyValidParaOrLine(); verifyRange(charIndex, 0, length); return BidiLine.getLevelAt(this, charIndex); @@ -2932,7 +4063,7 @@ * call to <code>setPara</code> or <code>setLine</code> * @stable ICU 3.8 */ - private byte[] getLevels() + byte[] getLevels() { verifyValidParaOrLine(); if (length <= 0) { @@ -2963,6 +4094,78 @@ } /** + * + * Get a <code>BidiRun</code> object according to its index. BidiRun methods + * may be used to retrieve the run's logical start, length and level, + * which can be even for an LTR run or odd for an RTL run. + * In an RTL run, the character at the logical start is + * visually on the right of the displayed run. + * The length is the number of characters in the run.<p> + * <code>countRuns()</code> is normally called + * before the runs are retrieved. 
+ * + * <p> + * Example: + * <pre> + * Bidi bidi = new Bidi(); + * String text = "abc 123 DEFG xyz"; + * bidi.setPara(text, Bidi.RTL, null); + * int i, count=bidi.countRuns(), logicalStart, visualIndex=0, length; + * BidiRun run; + * for (i = 0; i < count; ++i) { + * run = bidi.getVisualRun(i); + * logicalStart = run.getStart(); + * length = run.getLength(); + * if (Bidi.LTR == run.getEmbeddingLevel()) { + * do { // LTR + * show_char(text.charAt(logicalStart++), visualIndex++); + * } while (--length > 0); + * } else { + * logicalStart += length; // logicalLimit + * do { // RTL + * show_char(text.charAt(--logicalStart), visualIndex++); + * } while (--length > 0); + * } + * } + * </pre> + * <p> + * Note that in right-to-left runs, code like this places + * second surrogates before first ones (which is generally a bad idea) + * and combining characters before base characters. + * <p> + * Use of <code>{@link #writeReordered}</code>, optionally with the + * <code>{@link #KEEP_BASE_COMBINING}</code> option, can be considered in + * order to avoid these issues. + * + * @param runIndex is the number of the run in visual order, in the + * range <code>[0..countRuns()-1]</code>. + * + * @return a BidiRun object containing the details of the run. The + * directionality of the run is + * <code>LTR==0</code> or <code>RTL==1</code>, + * never <code>MIXED</code>. 
+ * + * @throws IllegalStateException if this call is not preceded by a successful + * call to <code>setPara</code> or <code>setLine</code> + * @throws IllegalArgumentException if <code>runIndex</code> is not in + * the range <code>0<=runIndex<countRuns()</code> + * + * @see #countRuns() + * @see com.ibm.icu.text.BidiRun + * @see com.ibm.icu.text.BidiRun#getStart() + * @see com.ibm.icu.text.BidiRun#getLength() + * @see com.ibm.icu.text.BidiRun#getEmbeddingLevel() + * @stable ICU 3.8 + */ + BidiRun getVisualRun(int runIndex) + { + verifyValidParaOrLine(); + BidiLine.getRuns(this); + verifyRange(runIndex, 0, runCount); + return BidiLine.getVisualRun(this, runIndex); + } + + /** * Get a visual-to-logical index map (array) for the characters in the * <code>Bidi</code> (paragraph or line) object. * <p> @@ -3031,19 +4234,10 @@ * Constant indicating that the base direction depends on the first strong * directional character in the text according to the Unicode Bidirectional * Algorithm. If no strong directional character is present, the base - * direction is left-to-right. - * @stable ICU 3.8 - */ - private static final int INTERNAL_DIRECTION_DEFAULT_LEFT_TO_RIGHT = 0x7e; - - /** - * Constant indicating that the base direction depends on the first strong - * directional character in the text according to the Unicode Bidirectional - * Algorithm. If no strong directional character is present, the base * direction is right-to-left. * @stable ICU 3.8 */ - private static final int INTERMAL_DIRECTION_DEFAULT_RIGHT_TO_LEFT = 0x7f; + public static final int DIRECTION_DEFAULT_RIGHT_TO_LEFT = LEVEL_DEFAULT_RTL; /** * Create Bidi from the given text, embedding, and direction information. 
@@ -3080,27 +4274,27 @@ * @stable ICU 3.8 */ public BidiBase(char[] text, - int textStart, - byte[] embeddings, - int embStart, - int paragraphLength, - int flags) - { + int textStart, + byte[] embeddings, + int embStart, + int paragraphLength, + int flags) + { this(0, 0); byte paraLvl; switch (flags) { case Bidi.DIRECTION_LEFT_TO_RIGHT: default: - paraLvl = Bidi.DIRECTION_LEFT_TO_RIGHT; + paraLvl = LTR; break; case Bidi.DIRECTION_RIGHT_TO_LEFT: - paraLvl = Bidi.DIRECTION_RIGHT_TO_LEFT; + paraLvl = RTL; break; case Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT: - paraLvl = INTERNAL_LEVEL_DEFAULT_LTR; + paraLvl = LEVEL_DEFAULT_LTR; break; case Bidi.DIRECTION_DEFAULT_RIGHT_TO_LEFT: - paraLvl = INTERNAL_LEVEL_DEFAULT_RTL; + paraLvl = LEVEL_DEFAULT_RTL; break; } byte[] paraEmbeddings; @@ -3112,7 +4306,7 @@ for (int i = 0; i < paragraphLength; i++) { lev = embeddings[i + embStart]; if (lev < 0) { - lev = (byte)((- lev) | INTERNAL_LEVEL_OVERRIDE); + lev = (byte)((- lev) | LEVEL_OVERRIDE); } else if (lev == 0) { lev = paraLvl; if (paraLvl > MAX_EXPLICIT_LEVEL) { @@ -3122,13 +4316,10 @@ paraEmbeddings[i] = lev; } } - if (textStart == 0 && embStart == 0 && paragraphLength == text.length) { - setPara(text, paraLvl, paraEmbeddings); - } else { - char[] paraText = new char[paragraphLength]; - System.arraycopy(text, textStart, paraText, 0, paragraphLength); - setPara(paraText, paraLvl, paraEmbeddings); - } + + char[] paraText = new char[paragraphLength]; + System.arraycopy(text, textStart, paraText, 0, paragraphLength); + setPara(paraText, paraLvl, paraEmbeddings); } /** @@ -3148,7 +4339,7 @@ } /** - * Return true if the line is all left-to-right text and the base direction + * Return true if the line is all left-to-right text and the base direction * is left-to-right. 
* * @return true if the line is all left-to-right text and the base direction @@ -3160,7 +4351,7 @@ */ public boolean isLeftToRight() { - return (getDirection() == Bidi.DIRECTION_LEFT_TO_RIGHT && (paraLevel & 1) == 0); + return (getDirection() == LTR && (paraLevel & 1) == 0); } /** @@ -3176,7 +4367,7 @@ */ public boolean isRightToLeft() { - return (getDirection() == Bidi.DIRECTION_RIGHT_TO_LEFT && (paraLevel & 1) == 1); + return (getDirection() == RTL && (paraLevel & 1) == 1); } /** @@ -3191,7 +4382,7 @@ */ public boolean baseIsLeftToRight() { - return (getParaLevel() == Bidi.DIRECTION_LEFT_TO_RIGHT); + return (getParaLevel() == LTR); } /** @@ -3212,8 +4403,8 @@ /** * Compute the logical to visual run mapping */ - private void getLogicalToVisualRunsMap() - { + void getLogicalToVisualRunsMap() + { if (isGoodLogicalToVisualRunsMap) { return; } @@ -3231,9 +4422,8 @@ for (i = 0; i < count; i++) { logicalToVisualRunsMap[i] = (int)(keys[i] & 0x00000000FFFFFFFF); } - keys = null; isGoodLogicalToVisualRunsMap = true; - } + } /** * Return the level of the nth logical run in this line. @@ -3252,9 +4442,12 @@ { verifyValidParaOrLine(); BidiLine.getRuns(this); + + // for backward compatibility if (run < 0 || run >= runCount) { return getParaLevel(); } + getLogicalToVisualRunsMap(); return runs[logicalToVisualRunsMap[run]].level; } @@ -3277,12 +4470,14 @@ { verifyValidParaOrLine(); BidiLine.getRuns(this); + + // for backward compatibility if (runCount == 1) { return 0; } else if (run == runCount) { return length; } - verifyIndex(run, 0, runCount); + getLogicalToVisualRunsMap(); return runs[logicalToVisualRunsMap[run]].start; } @@ -3306,10 +4501,12 @@ { verifyValidParaOrLine(); BidiLine.getRuns(this); + + // for backward compatibility if (runCount == 1) { return length; } - verifyIndex(run, 0, runCount); + getLogicalToVisualRunsMap(); int idx = logicalToVisualRunsMap[run]; int len = idx == 0 ? 
runs[idx].limit : @@ -3336,7 +4533,7 @@ int start, int limit) { - final int RTLMask = (1 << Bidi.DIRECTION_RIGHT_TO_LEFT | + final int RTLMask = (1 << R | 1 << AL | 1 << RLE | 1 << RLO | @@ -3346,6 +4543,7 @@ throw new IllegalArgumentException("Value start " + start + " is out of range 0 to " + limit); } + for (int i = start; i < limit; ++i) { if (Character.isHighSurrogate(text[i]) && i < (limit-1) && Character.isLowSurrogate(text[i+1])) { @@ -3356,6 +4554,7 @@ return true; } } + return false; } @@ -3382,8 +4581,9 @@ int objectStart, int count) { + // for backward compatibility if (0 > levelStart || levels.length <= levelStart) { - throw new IllegalArgumentException("Value levelStart " + + throw new IllegalArgumentException("Value levelStart " + levelStart + " is out of range 0 to " + (levels.length-1)); } @@ -3397,6 +4597,7 @@ levelStart + " is out of range 0 to " + (objects.length - objectStart)); } + byte[] reorderLevels = new byte[count]; System.arraycopy(levels, levelStart, reorderLevels, 0, count); int[] indexMap = reorderVisual(reorderLevels); @@ -3408,6 +4609,74 @@ } /** + * Take a <code>Bidi</code> object containing the reordering + * information for a piece of text (one or more paragraphs) set by + * <code>setPara()</code> or for a line of text set by <code>setLine()</code> + * and return a string containing the reordered text. + * + * <p>The text may have been aliased (only a reference was stored + * without copying the contents), thus it must not have been modified + * since the <code>setPara()</code> call.</p> + * + * This method preserves the integrity of characters with multiple + * code units and (optionally) combining characters. + * Characters in RTL runs can be replaced by mirror-image characters + * in the returned string. Note that "real" mirroring has to be done in a + * rendering engine by glyph selection and that for many "mirrored" + * characters there are no Unicode characters as mirror-image equivalents. 
+ * There are also options to insert or remove Bidi control + * characters; see the descriptions of the return value and the + * <code>options</code> parameter, and of the option bit flags. + * + * @param options A bit set of options for the reordering that control + * how the reordered text is written. + * The options include mirroring the characters on a code + * point basis and inserting LRM characters, which is used + * especially for transforming visually stored text + * to logically stored text (although this is still an + * imperfect implementation of an "inverse Bidi" algorithm + * because it uses the "forward Bidi" algorithm at its core). + * The available options are: + * <code>DO_MIRRORING</code>, + * <code>INSERT_LRM_FOR_NUMERIC</code>, + * <code>KEEP_BASE_COMBINING</code>, + * <code>OUTPUT_REVERSE</code>, + * <code>REMOVE_BIDI_CONTROLS</code>, + * <code>STREAMING</code> + * + * @return The reordered text. + * If the <code>INSERT_LRM_FOR_NUMERIC</code> option is set, then + * the length of the returned string could be as large as + * <code>getLength()+2*countRuns()</code>.<br> + * If the <code>REMOVE_BIDI_CONTROLS</code> option is set, then the + * length of the returned string may be less than + * <code>getLength()</code>.<br> + * If none of these options is set, then the length of the returned + * string will be exactly <code>getProcessedLength()</code>. 
+ * + * @throws IllegalStateException if this call is not preceded by a successful + * call to <code>setPara</code> or <code>setLine</code> + * + * @see #DO_MIRRORING + * @see #INSERT_LRM_FOR_NUMERIC + * @see #KEEP_BASE_COMBINING + * @see #OUTPUT_REVERSE + * @see #REMOVE_BIDI_CONTROLS + * @see #OPTION_STREAMING + * @see #getProcessedLength + * @stable ICU 3.8 + */ + public String writeReordered(int options) + { + verifyValidParaOrLine(); + if (length == 0) { + /* nothing to do */ + return ""; + } + return BidiWriter.writeReordered(this, options); + } + + /** * Display the bidi internal state, used in debugging. */ public String toString() { @@ -3507,4 +4776,5 @@ } } } + } --- old/jdk/src/java.base/share/classes/sun/text/bidi/BidiLine.java 2015-07-13 16:11:45.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/bidi/BidiLine.java 2015-07-13 16:11:45.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,17 +22,13 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. 
* - ******************************************************************************* - */ +******************************************************************************* +* Copyright (C) 2001-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +*/ /* Written by Simon Montagu, Matitiahu Allouche * (ported from C code written by Markus W. Scherer) */ @@ -42,7 +38,7 @@ import java.text.Bidi; import java.util.Arrays; -public final class BidiLine { +final class BidiLine { /* * General remarks about the functions in this file: @@ -122,13 +118,13 @@ level of B chars from 0 to paraLevel in getLevels when orderParagraphsLTR==TRUE */ - if (BidiBase.NoContextRTL(dirProps[start - 1]) == BidiBase.B) { + if (dirProps[start - 1] == BidiBase.B) { bidiBase.trailingWSStart = start; /* currently == bidiBase.length */ return; } /* go backwards across all WS, BN, explicit codes */ while (start > 0 && - (BidiBase.DirPropFlagNC(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) { + (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) { --start; } @@ -140,13 +136,11 @@ bidiBase.trailingWSStart=start; } - public static Bidi setLine(Bidi bidi, BidiBase paraBidi, - Bidi newBidi, BidiBase newBidiBase, - int start, int limit) { + static Bidi setLine(BidiBase paraBidi, + Bidi newBidi, BidiBase lineBidi, + int start, int limit) { int length; - BidiBase lineBidi = newBidiBase; - /* set the values in lineBidi from its paraBidi parent */ /* class members are already initialized to 0 */ // lineBidi.paraBidi = null; /* mark unfinished setLine */ @@ -161,6 +155,8 @@ lineBidi.paraLevel = paraBidi.GetParaLevelAt(start); lineBidi.paraCount = paraBidi.paraCount; lineBidi.runs = new BidiRun[0]; + lineBidi.reorderingMode = paraBidi.reorderingMode; + lineBidi.reorderingOptions = paraBidi.reorderingOptions; if (paraBidi.controlCount > 0) { int j; for (j = start; j < limit; j++) { @@ 
-206,7 +202,7 @@ setTrailingWSStart(lineBidi); trailingWSStart = lineBidi.trailingWSStart; - /* recalculate lineBidi.direction */ + /* recalculate lineBidiBase.direction */ if (trailingWSStart == 0) { /* all levels are at paraLevel */ lineBidi.direction = (byte)(lineBidi.paraLevel & 1); @@ -260,7 +256,8 @@ } } - newBidiBase.paraBidi = paraBidi; /* mark successful setLine */ + lineBidi.paraBidi = paraBidi; /* mark successful setLine */ + return newBidi; } @@ -303,30 +300,19 @@ return bidiBase.levels; } - static BidiRun getLogicalRun(BidiBase bidiBase, int logicalPosition) - { - /* this is done based on runs rather than on levels since levels have - a special interpretation when REORDER_RUNS_ONLY - */ - BidiRun newRun = new BidiRun(), iRun; - getRuns(bidiBase); - int runCount = bidiBase.runCount; - int visualStart = 0, logicalLimit = 0; - iRun = bidiBase.runs[0]; - - for (int i = 0; i < runCount; i++) { - iRun = bidiBase.runs[i]; - logicalLimit = iRun.start + iRun.limit - visualStart; - if ((logicalPosition >= iRun.start) && - (logicalPosition < logicalLimit)) { - break; - } - visualStart = iRun.limit; - } - newRun.start = iRun.start; - newRun.limit = logicalLimit; - newRun.level = iRun.level; - return newRun; + static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) { + int start = bidiBase.runs[runIndex].start; + int limit; + byte level = bidiBase.runs[runIndex].level; + + if (runIndex > 0) { + limit = start + + bidiBase.runs[runIndex].limit - + bidiBase.runs[runIndex - 1].limit; + } else { + limit = start + bidiBase.runs[0].limit; + } + return new BidiRun(start, limit, level); } /* in trivial cases there is only one trivial run; called by getRuns() */ @@ -502,7 +488,7 @@ int length = bidiBase.length, limit; byte[] levels = bidiBase.levels; int i, runCount; - byte level = BidiBase.INTERNAL_LEVEL_DEFAULT_LTR; /* initialize with no valid level */ + byte level = -1; /* initialize with no valid level */ /* * If there are WS characters at the end of the line * and 
the run preceding them has a level different from @@ -651,7 +637,7 @@ maxLevel = 0; for (start = levels.length; start>0; ) { level = levels[--start]; - if (level > BidiBase.MAX_EXPLICIT_LEVEL + 1) { + if (level < 0 || level > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) { return null; } if (level < minLevel) { --- old/jdk/src/java.base/share/classes/sun/text/bidi/BidiRun.java 2015-07-13 16:11:46.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/bidi/BidiRun.java 2015-07-13 16:11:46.000000000 +0900 @@ -55,7 +55,7 @@ * * @see com.ibm.icu.text.Bidi */ -public class BidiRun { +class BidiRun { int start; /* first logical position of the run */ int limit; /* last visual position of the run +1 */ @@ -106,7 +106,7 @@ /** * Get level of run */ - public byte getEmbeddingLevel() + byte getEmbeddingLevel() { return level; } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/CharTrie.java 2015-07-13 16:11:47.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/CharTrie.java 2015-07-13 16:11:46.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,22 +22,18 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. 
* - ******************************************************************************* + ****************************************************************************** + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ****************************************************************************** */ package sun.text.normalizer; -import java.io.InputStream; import java.io.DataInputStream; +import java.io.InputStream; import java.io.IOException; /** @@ -73,120 +69,17 @@ throw new IllegalArgumentException( "Data given does not belong to a char trie."); } - m_friendAgent_ = new FriendAgent(); - } - - /** - * Make a dummy CharTrie. - * A dummy trie is an empty runtime trie, used when a real data trie cannot - * be loaded. - * - * The trie always returns the initialValue, - * or the leadUnitValue for lead surrogate code points. - * The Latin-1 part is always set up to be linear. - * - * @param initialValue the initial value that is set for all code points - * @param leadUnitValue the value for lead surrogate code _units_ that do not - * have associated supplementary data - * @param dataManipulate object which provides methods to parse the char data - */ - public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) { - super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate); - - int dataLength, latin1Length, i, limit; - char block; - - /* calculate the actual size of the dummy trie data */ - - /* max(Latin-1, block 0) */ - dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 
256 : DATA_BLOCK_LENGTH; - if(leadUnitValue!=initialValue) { - dataLength+=DATA_BLOCK_LENGTH; - } - m_data_=new char[dataLength]; - m_dataLength_=dataLength; - - m_initialValue_=(char)initialValue; - - /* fill the index and data arrays */ - - /* indexes are preset to 0 (block 0) */ - - /* Latin-1 data */ - for(i=0; i<latin1Length; ++i) { - m_data_[i]=(char)initialValue; - } - - if(leadUnitValue!=initialValue) { - /* indexes for lead surrogate code units to the block after Latin-1 */ - block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_); - i=0xd800>>INDEX_STAGE_1_SHIFT_; - limit=0xdc00>>INDEX_STAGE_1_SHIFT_; - for(; i<limit; ++i) { - m_index_[i]=block; - } - - /* data for lead surrogate code units */ - limit=latin1Length+DATA_BLOCK_LENGTH; - for(i=latin1Length; i<limit; ++i) { - m_data_[i]=(char)leadUnitValue; - } - } - - m_friendAgent_ = new FriendAgent(); - } - - /** - * Java friend implementation - */ - public class FriendAgent - { - /** - * Gives out the index array of the trie - * @return index array of trie - */ - public char[] getPrivateIndex() - { - return m_index_; - } - /** - * Gives out the data array of the trie - * @return data array of trie - */ - public char[] getPrivateData() - { - return m_data_; - } - /** - * Gives out the data offset in the trie - * @return data offset in the trie - */ - public int getPrivateInitialValue() - { - return m_initialValue_; - } } // public methods -------------------------------------------------- /** - * Java friend implementation - * To store the index and data array into the argument. - * @param friend java friend UCharacterProperty object to store the array + * Gets the value associated with the codepoint. + * If no value is associated with the codepoint, a default value will be + * returned. + * @param ch codepoint + * @return offset to data */ - public void putIndexData(UCharacterProperty friend) - { - friend.setIndexData(m_friendAgent_); - } - - /** - * Gets the value associated with the codepoint. 
- * If no value is associated with the codepoint, a default value will be - * returned. - * @param ch codepoint - * @return offset to data - * @draft 2.1 - */ public final char getCodePointValue(int ch) { int offset; @@ -215,52 +108,12 @@ * This method does not guarantee correct results for trail surrogates. * @param ch lead surrogate character * @return data value - * @draft 2.1 */ public final char getLeadValue(char ch) { return m_data_[getLeadOffset(ch)]; } - /** - * Get the value associated with a pair of surrogates. - * @param lead a lead surrogate - * @param trail a trail surrogate - * @draft 2.1 - */ - public final char getSurrogateValue(char lead, char trail) - { - int offset = getSurrogateOffset(lead, trail); - if (offset > 0) { - return m_data_[offset]; - } - return m_initialValue_; - } - - /** - * <p>Get a value from a folding offset (from the value of a lead surrogate) - * and a trail surrogate.</p> - * <p>If the - * @param leadvalue value associated with the lead surrogate which contains - * the folding offset - * @param trail surrogate - * @return trie data value associated with the trail character - * @draft 2.1 - */ - public final char getTrailValue(int leadvalue, char trail) - { - if (m_dataManipulate_ == null) { - throw new NullPointerException( - "The field DataManipulate in this Trie is null"); - } - int offset = m_dataManipulate_.getFoldingOffset(leadvalue); - if (offset > 0) { - return m_data_[getRawOffset(offset, - (char)(trail & SURROGATE_MASK_))]; - } - return m_initialValue_; - } - // protected methods ----------------------------------------------- /** @@ -309,41 +162,14 @@ return -1; } - /** - * Gets the value at the argument index. - * For use internally in TrieIterator. 
- * @param index value at index will be retrieved - * @return 32 bit value - * @see com.ibm.icu.impl.TrieIterator - * @draft 2.1 - */ - protected final int getValue(int index) - { - return m_data_[index]; - } - - /** - * Gets the default initial value - * @return 32 bit value - * @draft 2.1 - */ - protected final int getInitialValue() - { - return m_initialValue_; - } - // private data members -------------------------------------------- /** - * Default value - */ + * Default value + */ private char m_initialValue_; /** - * Array of char data - */ - private char m_data_[]; - /** - * Agent for friends + * Array of char data */ - private FriendAgent m_friendAgent_; + private char m_data_[]; } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/CharacterIteratorWrapper.java 2015-07-13 16:11:47.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/CharacterIteratorWrapper.java 2015-07-13 16:11:47.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -45,7 +45,7 @@ * @author ram */ -public class CharacterIteratorWrapper extends UCharacterIterator { +class CharacterIteratorWrapper extends UCharacterIterator { private CharacterIterator iterator; @@ -111,7 +111,6 @@ iterator.setIndex(index); } - //// for StringPrep /** * @see UCharacterIterator#getText(char[]) */ --- old/jdk/src/java.base/share/classes/sun/text/normalizer/ICUBinary.java 2015-07-13 16:11:48.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/ICUBinary.java 2015-07-13 16:11:48.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,25 +25,38 @@ /* ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; -import java.io.InputStream; +import java.io.BufferedInputStream; import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.nio.file.FileSystems; import java.util.Arrays; +import java.security.AccessController; +import java.security.PrivilegedAction; + +public final class ICUBinary { + + private static final class IsAcceptable implements Authenticate { + // @Override when we switch to Java 6 + public boolean isDataVersionAcceptable(byte version[]) { + return version[0] == 1; + } + } -public final class ICUBinary -{ // public inner interface ------------------------------------------------ /** @@ -63,53 +76,44 @@ // public methods -------------------------------------------------------- /** - * <p>ICU data header reader method. - * Takes a ICU generated big-endian input stream, parse the ICU standard - * file header and authenticates them. 
- * <p>Header format: - * <ul> - * <li> Header size (char) - * <li> Magic number 1 (byte) - * <li> Magic number 2 (byte) - * <li> Rest of the header size (char) - * <li> Reserved word (char) - * <li> Big endian indicator (byte) - * <li> Character set family indicator (byte) - * <li> Size of a char (byte) for c++ and c use - * <li> Reserved byte (byte) - * <li> Data format identifier (4 bytes), each ICU data has its own - * identifier to distinguish them. [0] major [1] minor - * [2] milli [3] micro - * <li> Data version (4 bytes), the change version of the ICU data - * [0] major [1] minor [2] milli [3] micro - * <li> Unicode version (4 bytes) this ICU is based on. - * </ul> - * - * <p> - * Example of use:<br> - * <pre> - * try { - * FileInputStream input = new FileInputStream(filename); - * If (Utility.readICUDataHeader(input, dataformat, dataversion, - * unicode) { - * System.out.println("Verified file header, this is a ICU data file"); - * } - * } catch (IOException e) { - * System.out.println("This is not a ICU data file"); - * } - * </pre> - * - * @param inputStream input stream that contains the ICU data header - * @param dataFormatIDExpected Data format expected. An array of 4 bytes - * information about the data format. - * E.g. data format ID 1.2.3.4. will became an array of - * {1, 2, 3, 4} - * @param authenticate user defined extra data authentication. This value - * can be null, if no extra authentication is needed. - * @exception IOException thrown if there is a read error or - * when header authentication fails. - * @draft 2.1 - */ + * Loads an ICU binary data file and returns it as a ByteBuffer. + * The buffer contents is normally read-only, but its position etc. can be modified. + * + * @param itemPath Relative ICU data item path, for example "root.res" or "coll/ucadata.icu". + * @return The data as a read-only ByteBuffer. 
+ */ + public static ByteBuffer getRequiredData(String itemPath) { + final Class<ICUBinary> root = ICUBinary.class; + + try (InputStream is = AccessController.doPrivileged(new PrivilegedAction<InputStream>() { + public InputStream run() { + return root.getResourceAsStream(itemPath); + } + })) { + + BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */); + DataInputStream inputStream = new DataInputStream(b); + byte[] bb = new byte[120000]; + int n = inputStream.read(bb); + ByteBuffer bytes = ByteBuffer.wrap(bb, 0, n); + return bytes; + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Same as readHeader(), but returns a VersionInfo rather than a compact int. + */ + public static VersionInfo readHeaderAndDataVersion(ByteBuffer bytes, + int dataFormat, + Authenticate authenticate) + throws IOException { + return getVersionInfoFromCompactInt(readHeader(bytes, dataFormat, authenticate)); + } + + private static final byte BIG_ENDIAN_ = 1; public static final byte[] readHeader(InputStream inputStream, byte dataFormatIDExpected[], Authenticate authenticate) @@ -164,6 +168,80 @@ return unicodeVersion; } + /** + * Reads an ICU data header, checks the data format, and returns the data version. + * + * <p>Assumes that the ByteBuffer position is 0 on input. + * The buffer byte order is set according to the data. + * The buffer position is advanced past the header (including UDataInfo and comment). + * + * <p>See C++ ucmndata.h and unicode/udata.h. 
+ * + * @return dataVersion + * @throws IOException if this is not a valid ICU data item of the expected dataFormat + */ + public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate) + throws IOException { + assert bytes.position() == 0; + byte magic1 = bytes.get(2); + byte magic2 = bytes.get(3); + if (magic1 != MAGIC1 || magic2 != MAGIC2) { + throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_); + } + + byte isBigEndian = bytes.get(8); + byte charsetFamily = bytes.get(9); + byte sizeofUChar = bytes.get(10); + if (isBigEndian < 0 || 1 < isBigEndian || + charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_); + } + bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN); + + int headerSize = bytes.getChar(0); + int sizeofUDataInfo = bytes.getChar(4); + if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4)) { + throw new IOException("Internal Error: Header size error"); + } + // TODO: Change Authenticate to take int major, int minor, int milli, int micro + // to avoid array allocation. 
+ byte[] formatVersion = new byte[] { + bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19) + }; + if (bytes.get(12) != (byte)(dataFormat >> 24) || + bytes.get(13) != (byte)(dataFormat >> 16) || + bytes.get(14) != (byte)(dataFormat >> 8) || + bytes.get(15) != (byte)dataFormat || + (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_ + + String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d", + bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15), + formatVersion[0] & 0xff, formatVersion[1] & 0xff, + formatVersion[2] & 0xff, formatVersion[3] & 0xff)); + } + + bytes.position(headerSize); + return // dataVersion + ((int)bytes.get(20) << 24) | + ((bytes.get(21) & 0xff) << 16) | + ((bytes.get(22) & 0xff) << 8) | + (bytes.get(23) & 0xff); + } + + public static void skipBytes(ByteBuffer bytes, int skipLength) { + if (skipLength > 0) { + bytes.position(bytes.position() + skipLength); + } + } + + /** + * Returns a VersionInfo for the bytes in the compact version integer. 
+ */ + public static VersionInfo getVersionInfoFromCompactInt(int version) { + return VersionInfo.getInstance( + version >>> 24, (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff); + } + // private variables ------------------------------------------------- /** @@ -175,7 +253,6 @@ /** * File format authentication values */ - private static final byte BIG_ENDIAN_ = 1; private static final byte CHAR_SET_ = 0; private static final byte CHAR_SIZE_ = 2; @@ -183,7 +260,7 @@ * Error messages */ private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ = - "ICU data file error: Not an ICU data file"; + "ICUBinary data file error: Magin number authentication failed"; private static final String HEADER_AUTHENTICATION_FAILED_ = - "ICU data file error: Header authentication failed, please check if you have a valid ICU data file"; + "ICUBinary data file error: Header authentication failed"; } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java 2015-07-13 16:11:49.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java 2015-07-13 16:11:49.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,18 +22,13 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. 
This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 2000-2014, International Business Machines Corporation and + * others. All Rights Reserved. ******************************************************************************* */ - package sun.text.normalizer; import java.text.CharacterIterator; @@ -125,8 +120,8 @@ * * normalize(FCD) may be implemented with NFD. * - * For more details on FCD see the collation design document: - * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm + * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): + * http://www.unicode.org/notes/tn5/#FCD * * ICU collation performs either NFD or FCD normalization automatically if * normalization is turned on for the collator object. Beyond collation and @@ -138,26 +133,88 @@ * often do not encode any combining marks by themselves. For conversion to such * character encodings the Unicode text needs to be normalized to NFC. * For more usage examples, see the Unicode Standard Annex. + * + * Note: The Normalizer class also provides API for iterative normalization. + * While the setIndex() and getIndex() refer to indices in the + * underlying Unicode input text, the next() and previous() methods + * iterate through characters in the normalized output. + * This means that there is not necessarily a one-to-one correspondence + * between characters returned by next() and previous() and the indices + * passed to and returned from setIndex() and getIndex(). + * It is for this reason that Normalizer does not implement the CharacterIterator interface. 
+ * * @stable ICU 2.8 */ - +// Original filename in ICU4J: Normalizer.java public final class NormalizerBase implements Cloneable { - //------------------------------------------------------------------------- - // Private data - //------------------------------------------------------------------------- - private char[] buffer = new char[100]; - private int bufferStart = 0; - private int bufferPos = 0; - private int bufferLimit = 0; - // The input text and our position in it private UCharacterIterator text; - private Mode mode = NFC; - private int options = 0; + private Normalizer2 norm2; + private Mode mode; + private int options; + + // The normalization buffer is the result of normalization + // of the source in [currentIndex..nextIndex] . private int currentIndex; private int nextIndex; + // A buffer for holding intermediate results + private StringBuilder buffer; + private int bufferPos; + + // Helper classes to defer loading of normalization data. + private static final class ModeImpl { + private ModeImpl(Normalizer2 n2) { + normalizer2 = n2; + } + private final Normalizer2 normalizer2; + } + + private static final class NFDModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); + } + + private static final class NFKDModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); + } + + private static final class NFCModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); + } + + private static final class NFKCModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); + } + + private static final class Unicode32 { + private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); + } + + private static final class NFD32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), + Unicode32.INSTANCE)); + } + + private 
static final class NFKD32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFC32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFKC32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), + Unicode32.INSTANCE)); + } + /** * Options bit set value to select Unicode 3.2 normalization * (except NormalizationCorrections). @@ -166,6 +223,17 @@ */ public static final int UNICODE_3_2=0x20; + public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; + + /* + * Default option for the latest Unicode normalization. This option is + * provided mainly for testing. + * The value zero means that normalization is done with the fixes for + * - Corrigendum 4 (Five CJK Canonical Mapping Errors) + * - Corrigendum 5 (Normalization Idempotency) + */ + public static final int UNICODE_LATEST = 0x00; + /** * Constant indicating that the end of the iteration has been reached. * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. @@ -175,392 +243,120 @@ /** * Constants for normalization modes. + * <p> + * The Mode class is not intended for public subclassing. + * Only the Mode constants provided by the Normalizer class should be used, + * and any fields or methods should not be called or overridden by users. 
* @stable ICU 2.8 */ - public static class Mode { - private int modeValue; - private Mode(int value) { - modeValue = value; - } - - /** - * This method is used for method dispatch - * @stable ICU 2.6 - */ - protected int normalize(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - UnicodeSet nx) { - int srcLen = (srcLimit - srcStart); - int destLen = (destLimit - destStart); - if( srcLen > destLen ) { - return srcLen; - } - System.arraycopy(src,srcStart,dest,destStart,srcLen); - return srcLen; - } - - /** - * This method is used for method dispatch - * @stable ICU 2.6 - */ - protected int normalize(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - int options) { - return normalize( src, srcStart, srcLimit, - dest,destStart,destLimit, - NormalizerImpl.getNX(options) - ); - } - - /** - * This method is used for method dispatch - * @stable ICU 2.6 - */ - protected String normalize(String src, int options) { - return src; - } - - /** - * This method is used for method dispatch - * @stable ICU 2.8 - */ - protected int getMinC() { - return -1; - } - - /** - * This method is used for method dispatch - * @stable ICU 2.8 - */ - protected int getMask() { - return -1; - } + public static abstract class Mode { /** - * This method is used for method dispatch - * @stable ICU 2.8 + * Sole constructor + * @internal + * @deprecated This API is ICU internal only. */ - protected IsPrevBoundary getPrevBoundary() { - return null; + @Deprecated + protected Mode() { } /** - * This method is used for method dispatch - * @stable ICU 2.8 + * @internal + * @deprecated This API is ICU internal only. 
*/ - protected IsNextBoundary getNextBoundary() { - return null; - } + @Deprecated + protected abstract Normalizer2 getNormalizer2(int options); + } - /** - * This method is used for method dispatch - * @stable ICU 2.6 - */ - protected QuickCheckResult quickCheck(char[] src,int start, int limit, - boolean allowMaybe,UnicodeSet nx) { - if(allowMaybe) { - return MAYBE; - } - return NO; + private static Mode toMode(Normalizer.Form form) { + switch (form) { + case NFC : + return NFC; + case NFD : + return NFD; + case NFKC : + return NFKC; + case NFKD : + return NFKD; } - /** - * This method is used for method dispatch - * @stable ICU 2.8 - */ - protected boolean isNFSkippable(int c) { - return true; - } + throw new IllegalArgumentException("Unexpected normalization form: " + + form); } - /** - * No decomposition/composition. - * @stable ICU 2.8 - */ - public static final Mode NONE = new Mode(1); - - /** - * Canonical decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFD = new NFDMode(2); + private static final class NONEMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } + } private static final class NFDMode extends Mode { - private NFDMode(int value) { - super(value); - } - - protected int normalize(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - UnicodeSet nx) { - int[] trailCC = new int[1]; - return NormalizerImpl.decompose(src, srcStart,srcLimit, - dest, destStart,destLimit, - false, trailCC,nx); - } - - protected String normalize( String src, int options) { - return decompose(src,false,options); - } - - protected int getMinC() { - return NormalizerImpl.MIN_WITH_LEAD_CC; - } - - protected IsPrevBoundary getPrevBoundary() { - return new IsPrevNFDSafe(); - } - - protected IsNextBoundary getNextBoundary() { - return new IsNextNFDSafe(); - } - - protected int getMask() { - return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD); - } - - protected 
QuickCheckResult quickCheck(char[] src,int start, - int limit,boolean allowMaybe, - UnicodeSet nx) { - return NormalizerImpl.quickCheck( - src, start,limit, - NormalizerImpl.getFromIndexesArr( - NormalizerImpl.INDEX_MIN_NFD_NO_MAYBE - ), - NormalizerImpl.QC_NFD, - 0, - allowMaybe, - nx - ); - } - - protected boolean isNFSkippable(int c) { - return NormalizerImpl.isNFSkippable(c,this, - (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFD) - ); + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFD32ModeImpl.INSTANCE.normalizer2 : + NFDModeImpl.INSTANCE.normalizer2; } } - /** - * Compatibility decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFKD = new NFKDMode(3); - private static final class NFKDMode extends Mode { - private NFKDMode(int value) { - super(value); - } - - protected int normalize(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - UnicodeSet nx) { - int[] trailCC = new int[1]; - return NormalizerImpl.decompose(src, srcStart,srcLimit, - dest, destStart,destLimit, - true, trailCC, nx); - } - - protected String normalize( String src, int options) { - return decompose(src,true,options); - } - - protected int getMinC() { - return NormalizerImpl.MIN_WITH_LEAD_CC; - } - - protected IsPrevBoundary getPrevBoundary() { - return new IsPrevNFDSafe(); - } - - protected IsNextBoundary getNextBoundary() { - return new IsNextNFDSafe(); - } - - protected int getMask() { - return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD); + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? 
+ NFKD32ModeImpl.INSTANCE.normalizer2 : + NFKDModeImpl.INSTANCE.normalizer2; } + } - protected QuickCheckResult quickCheck(char[] src,int start, - int limit,boolean allowMaybe, - UnicodeSet nx) { - return NormalizerImpl.quickCheck( - src,start,limit, - NormalizerImpl.getFromIndexesArr( - NormalizerImpl.INDEX_MIN_NFKD_NO_MAYBE - ), - NormalizerImpl.QC_NFKD, - NormalizerImpl.OPTIONS_COMPAT, - allowMaybe, - nx - ); + private static final class NFCMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFC32ModeImpl.INSTANCE.normalizer2 : + NFCModeImpl.INSTANCE.normalizer2; } + } - protected boolean isNFSkippable(int c) { - return NormalizerImpl.isNFSkippable(c, this, - (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKD) - ); + private static final class NFKCMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFKC32ModeImpl.INSTANCE.normalizer2 : + NFKCModeImpl.INSTANCE.normalizer2; } } /** - * Canonical decomposition followed by canonical composition. + * No decomposition/composition. 
* @stable ICU 2.8 */ - public static final Mode NFC = new NFCMode(4); - - private static final class NFCMode extends Mode{ - private NFCMode(int value) { - super(value); - } - protected int normalize(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - UnicodeSet nx) { - return NormalizerImpl.compose( src, srcStart, srcLimit, - dest,destStart,destLimit, - 0, nx); - } - - protected String normalize( String src, int options) { - return compose(src, false, options); - } - - protected int getMinC() { - return NormalizerImpl.getFromIndexesArr( - NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE - ); - } - protected IsPrevBoundary getPrevBoundary() { - return new IsPrevTrueStarter(); - } - protected IsNextBoundary getNextBoundary() { - return new IsNextTrueStarter(); - } - protected int getMask() { - return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFC); - } - protected QuickCheckResult quickCheck(char[] src,int start, - int limit,boolean allowMaybe, - UnicodeSet nx) { - return NormalizerImpl.quickCheck( - src,start,limit, - NormalizerImpl.getFromIndexesArr( - NormalizerImpl.INDEX_MIN_NFC_NO_MAYBE - ), - NormalizerImpl.QC_NFC, - 0, - allowMaybe, - nx - ); - } - protected boolean isNFSkippable(int c) { - return NormalizerImpl.isNFSkippable(c,this, - ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY| - (NormalizerImpl.QC_NFC & NormalizerImpl.QC_ANY_NO) - ) - ); - } - }; + public static final Mode NONE = new NONEMode(); /** - * Compatibility decomposition followed by canonical composition. + * Canonical decomposition. 
* @stable ICU 2.8 */ - public static final Mode NFKC =new NFKCMode(5); - - private static final class NFKCMode extends Mode{ - private NFKCMode(int value) { - super(value); - } - protected int normalize(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - UnicodeSet nx) { - return NormalizerImpl.compose(src, srcStart,srcLimit, - dest, destStart,destLimit, - NormalizerImpl.OPTIONS_COMPAT, nx); - } - - protected String normalize( String src, int options) { - return compose(src, true, options); - } - protected int getMinC() { - return NormalizerImpl.getFromIndexesArr( - NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE - ); - } - protected IsPrevBoundary getPrevBoundary() { - return new IsPrevTrueStarter(); - } - protected IsNextBoundary getNextBoundary() { - return new IsNextTrueStarter(); - } - protected int getMask() { - return (NormalizerImpl.CC_MASK|NormalizerImpl.QC_NFKC); - } - protected QuickCheckResult quickCheck(char[] src,int start, - int limit,boolean allowMaybe, - UnicodeSet nx) { - return NormalizerImpl.quickCheck( - src,start,limit, - NormalizerImpl.getFromIndexesArr( - NormalizerImpl.INDEX_MIN_NFKC_NO_MAYBE - ), - NormalizerImpl.QC_NFKC, - NormalizerImpl.OPTIONS_COMPAT, - allowMaybe, - nx - ); - } - protected boolean isNFSkippable(int c) { - return NormalizerImpl.isNFSkippable(c, this, - ( NormalizerImpl.CC_MASK|NormalizerImpl.COMBINES_ANY| - (NormalizerImpl.QC_NFKC & NormalizerImpl.QC_ANY_NO) - ) - ); - } - }; + public static final Mode NFD = new NFDMode(); /** - * Result values for quickCheck(). - * For details see Unicode Technical Report 15. - * @stable ICU 2.8 - */ - public static final class QuickCheckResult{ - private int resultValue; - private QuickCheckResult(int value) { - resultValue=value; - } - } - /** - * Indicates that string is not in the normalized format + * Compatibility decomposition. 
* @stable ICU 2.8 */ - public static final QuickCheckResult NO = new QuickCheckResult(0); + public static final Mode NFKD = new NFKDMode(); /** - * Indicates that string is in the normalized format + * Canonical decomposition followed by canonical composition. * @stable ICU 2.8 */ - public static final QuickCheckResult YES = new QuickCheckResult(1); + public static final Mode NFC = new NFCMode(); - /** - * Indicates it cannot be determined if string is in the normalized - * format without further thorough checks. - * @stable ICU 2.8 - */ - public static final QuickCheckResult MAYBE = new QuickCheckResult(2); + public static final Mode NFKC =new NFKCMode(); //------------------------------------------------------------------------- - // Constructors + // Iterator constructors //------------------------------------------------------------------------- /** - * Creates a new {@code Normalizer} object for iterating over the + * Creates a new {@code NormalizerBase} object for iterating over the * normalized form of a given string. * <p> * The {@code options} parameter specifies which optional - * {@code Normalizer} features are to be enabled for this object. - * + * {@code NormalizerBase} features are to be enabled for this object. + * <p> * @param str The string to be normalized. The normalization * will start at the beginning of the string. * @@ -576,25 +372,19 @@ this.text = UCharacterIterator.getInstance(str); this.mode = mode; this.options=opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); } - /** - * Creates a new {@code Normalizer} object for iterating over the - * normalized form of the given text. - * - * @param iter The input text to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. 
- */ - public NormalizerBase(CharacterIterator iter, Mode mode) { - this(iter, mode, UNICODE_LATEST); + public NormalizerBase(String str, Mode mode) { + this(str, mode, 0); } + /** - * Creates a new {@code Normalizer} object for iterating over the + * Creates a new {@code NormalizerBase} object for iterating over the * normalized form of the given text. - * + * <p> * @param iter The input text to be normalized. The normalization * will start at the beginning of the string. * @@ -607,15 +397,19 @@ * @stable ICU 2.6 */ public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { - this.text = UCharacterIterator.getInstance( - (CharacterIterator)iter.clone() - ); + this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); this.mode = mode; this.options = opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); + } + + public NormalizerBase(CharacterIterator iter, Mode mode) { + this(iter, mode, 0); } /** - * Clones this {@code Normalizer} object. All properties of this + * Clones this {@code NormalizerBase} object. All properties of this * object are duplicated in the new object, including the cloning of any * {@link CharacterIterator} that was passed in to the constructor * or to {@link #setText(CharacterIterator) setText}. 
@@ -628,11 +422,13 @@ try { NormalizerBase copy = (NormalizerBase) super.clone(); copy.text = (UCharacterIterator) text.clone(); - //clone the internal buffer - if (buffer != null) { - copy.buffer = new char[buffer.length]; - System.arraycopy(buffer,0,copy.buffer,0,buffer.length); - } + copy.mode = mode; + copy.options = options; + copy.norm2 = norm2; + copy.buffer = new StringBuilder(buffer); + copy.bufferPos = bufferPos; + copy.currentIndex = currentIndex; + copy.nextIndex = nextIndex; return copy; } catch (CloneNotSupportedException e) { @@ -640,150 +436,60 @@ } } - //-------------------------------------------------------------------------- - // Static Utility methods - //-------------------------------------------------------------------------- - /** - * Compose a string. - * The string will be composed according to the specified mode. - * @param str The string to compose. - * @param compat If true the string will be composed according to - * NFKC rules and if false will be composed according to - * NFC rules. - * @param options The only recognized option is UNICODE_3_2 - * @return String The composed string + * Normalizes a {@code String} using the given normalization operation. + * <p> + * The {@code options} parameter specifies which optional + * {@code NormalizerBase} features are to be enabled for this operation. + * Currently the only available option is {@link #UNICODE_3_2}. + * If you want the default behavior corresponding to one of the standard + * Unicode Normalization Forms, use 0 for this argument. + * <p> + * @param str the input string to be normalized. + * @param mode the normalization mode + * @param options the optional features to be enabled. 
+ * @return String the normalized string * @stable ICU 2.6 */ - public static String compose(String str, boolean compat, int options) { - - char[] dest, src; - if (options == UNICODE_3_2_0_ORIGINAL) { - String mappedStr = NormalizerImpl.convert(str); - dest = new char[mappedStr.length()*MAX_BUF_SIZE_COMPOSE]; - src = mappedStr.toCharArray(); - } else { - dest = new char[str.length()*MAX_BUF_SIZE_COMPOSE]; - src = str.toCharArray(); - } - int destSize=0; - - UnicodeSet nx = NormalizerImpl.getNX(options); - - /* reset options bits that should only be set here or inside compose() */ - options&=~(NormalizerImpl.OPTIONS_SETS_MASK|NormalizerImpl.OPTIONS_COMPAT|NormalizerImpl.OPTIONS_COMPOSE_CONTIGUOUS); - - if(compat) { - options|=NormalizerImpl.OPTIONS_COMPAT; - } - - for(;;) { - destSize=NormalizerImpl.compose(src,0,src.length, - dest,0,dest.length,options, - nx); - if(destSize<=dest.length) { - return new String(dest,0,destSize); - } else { - dest = new char[destSize]; - } - } + public static String normalize(String str, Mode mode, int options) { + return mode.getNormalizer2(options).normalize(str); } - private static final int MAX_BUF_SIZE_COMPOSE = 2; - private static final int MAX_BUF_SIZE_DECOMPOSE = 3; + public static String normalize(String str, Normalizer.Form form) { + return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); + } - /** - * Decompose a string. - * The string will be decomposed according to the specified mode. - * @param str The string to decompose. - * @param compat If true the string will be decomposed according to NFKD - * rules and if false will be decomposed according to NFD - * rules. - * @return String The decomposed string - * @stable ICU 2.8 - */ - public static String decompose(String str, boolean compat) { - return decompose(str,compat,UNICODE_LATEST); + public static String normalize(String str, Normalizer.Form form, int options) { + return NormalizerBase.normalize(str, toMode(form), options); } /** - * Decompose a string. 
- * The string will be decomposed according to the specified mode. - * @param str The string to decompose. - * @param compat If true the string will be decomposed according to NFKD - * rules and if false will be decomposed according to NFD - * rules. - * @param options The normalization options, ORed together (0 for no options). - * @return String The decomposed string + * Test if a string is in a given normalization form. + * This is semantically equivalent to source.equals(normalize(source, mode)). + * + * Unlike quickCheck(), this function returns a definitive result, + * never a "maybe". + * For NFD, NFKD, and FCD, both functions work exactly the same. + * For NFC and NFKC where quickCheck may return "maybe", this function will + * perform further tests to arrive at a true/false result. + * @param str the input string to be checked to see if it is + * normalized + * @param mode the normalization mode + * @param options Options for use with exclusion set and tailored Normalization + * The only option that is currently recognized is UNICODE_3_2 + * @see #isNormalized * @stable ICU 2.6 */ - public static String decompose(String str, boolean compat, int options) { - - int[] trailCC = new int[1]; - int destSize=0; - UnicodeSet nx = NormalizerImpl.getNX(options); - char[] dest; - - if (options == UNICODE_3_2_0_ORIGINAL) { - String mappedStr = NormalizerImpl.convert(str); - dest = new char[mappedStr.length()*MAX_BUF_SIZE_DECOMPOSE]; - - for(;;) { - destSize=NormalizerImpl.decompose(mappedStr.toCharArray(),0,mappedStr.length(), - dest,0,dest.length, - compat,trailCC, nx); - if(destSize<=dest.length) { - return new String(dest,0,destSize); - } else { - dest = new char[destSize]; - } - } - } else { - dest = new char[str.length()*MAX_BUF_SIZE_DECOMPOSE]; - - for(;;) { - destSize=NormalizerImpl.decompose(str.toCharArray(),0,str.length(), - dest,0,dest.length, - compat,trailCC, nx); - if(destSize<=dest.length) { - return new String(dest,0,destSize); - } else { - dest = new 
char[destSize]; - } - } - } + public static boolean isNormalized(String str, Mode mode, int options) { + return mode.getNormalizer2(options).isNormalized(str); } - /** - * Normalize a string. - * The string will be normalized according to the specified normalization - * mode and options. - * @param src The char array to compose. - * @param srcStart Start index of the source - * @param srcLimit Limit index of the source - * @param dest The char buffer to fill in - * @param destStart Start index of the destination buffer - * @param destLimit End index of the destination buffer - * @param mode The normalization mode; one of Normalizer.NONE, - * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, - * Normalizer.NFKD, Normalizer.DEFAULT - * @param options The normalization options, ORed together (0 for no options). - * @return int The total buffer size needed;if greater than length of - * result, the output was truncated. - * @exception IndexOutOfBoundsException if the target capacity is - * less than the required length - * @stable ICU 2.6 - */ - public static int normalize(char[] src,int srcStart, int srcLimit, - char[] dest,int destStart, int destLimit, - Mode mode, int options) { - int length = mode.normalize(src,srcStart,srcLimit,dest,destStart,destLimit, options); + public static boolean isNormalized(String str, Normalizer.Form form) { + return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); + } - if(length<=(destLimit-destStart)) { - return length; - } else { - throw new IndexOutOfBoundsException(Integer.toString(length)); - } + public static boolean isNormalized(String str, Normalizer.Form form, int options) { + return NormalizerBase.isNormalized(str, toMode(form), options); } //------------------------------------------------------------------------- @@ -796,8 +502,8 @@ * @stable ICU 2.8 */ public int current() { - if(bufferPos<bufferLimit || nextNormalize()) { - return getCodePointAt(bufferPos); + if(bufferPos<buffer.length() || nextNormalize()) { 
+ return buffer.codePointAt(bufferPos); } else { return DONE; } @@ -811,16 +517,15 @@ * @stable ICU 2.8 */ public int next() { - if(bufferPos<bufferLimit || nextNormalize()) { - int c=getCodePointAt(bufferPos); - bufferPos+=(c>0xFFFF) ? 2 : 1; + if(bufferPos<buffer.length() || nextNormalize()) { + int c=buffer.codePointAt(bufferPos); + bufferPos+=Character.charCount(c); return c; } else { return DONE; } } - /** * Return the previous character in the normalized text and decrement * the iteration position by one. If the beginning @@ -830,8 +535,8 @@ */ public int previous() { if(bufferPos>0 || previousNormalize()) { - int c=getCodePointAt(bufferPos-1); - bufferPos-=(c>0xFFFF) ? 2 : 1; + int c=buffer.codePointBefore(bufferPos); + bufferPos-=Character.charCount(c); return c; } else { return DONE; @@ -859,8 +564,8 @@ * @stable ICU 2.8 */ public void setIndexOnly(int index) { - text.setIndex(index); - currentIndex=nextIndex=index; // validates index + text.setIndex(index); // validates index + currentIndex=nextIndex=index; clearBuffer(); } @@ -874,7 +579,7 @@ * necessarily a one-to-one correspondence between characters returned * by {@code next} and {@code previous} and the indices passed to and * returned from {@code setIndex} and {@link #getIndex}. - * + * <p> * @param index the desired index in the input text. * * @return the first normalized character that is the result of iterating @@ -882,11 +587,9 @@ * * @throws IllegalArgumentException if the given index is less than * {@link #getBeginIndex} or greater than {@link #getEndIndex}. - * @return The codepoint as an int - * @deprecated ICU 3.2 + * deprecated ICU 3.2 * @obsolete ICU 3.2 */ - @Deprecated public int setIndex(int index) { setIndexOnly(index); return current(); @@ -895,7 +598,7 @@ /** * Retrieve the index of the start of the input text. This is the begin * index of the {@code CharacterIterator} or the start (i.e. 
0) of the - * {@code String} over which this {@code Normalizer} is iterating + * {@code String} over which this {@code NormalizerBase} is iterating * @deprecated ICU 2.2. Use startIndex() instead. * @return The codepoint as an int * @see #startIndex @@ -908,7 +611,7 @@ /** * Retrieve the index of the end of the input text. This is the end index * of the {@code CharacterIterator} or the length of the {@code String} - * over which this {@code Normalizer} is iterating + * over which this {@code NormalizerBase} is iterating * @deprecated ICU 2.2. Use endIndex() instead. * @return The codepoint as an int * @see #endIndex @@ -934,7 +637,7 @@ * @stable ICU 2.8 */ public int getIndex() { - if(bufferPos<bufferLimit) { + if(bufferPos<buffer.length()) { return currentIndex; } else { return nextIndex; @@ -942,9 +645,9 @@ } /** - * Retrieve the index of the end of the input text. This is the end index + * Retrieve the index of the end of the input text. This is the end index * of the {@code CharacterIterator} or the length of the {@code String} - * over which this {@code Normalizer} is iterating + * over which this {@code NormalizerBase} is iterating * @return The current iteration position * @stable ICU 2.8 */ @@ -953,7 +656,7 @@ } //------------------------------------------------------------------------- - // Property access methods + // Iterator attributes //------------------------------------------------------------------------- /** * Set the normalization mode for this object. @@ -964,18 +667,18 @@ * until the iteration is able to re-sync at the next base character. * It is safest to call {@link #setText setText()}, {@link #first}, * {@link #last}, etc. after calling {@code setMode}. - * - * @param newMode the new mode for this {@code Normalizer}. + * <p> + * @param newMode the new mode for this {@code NormalizerBase}. * The supported modes are: * <ul> - * <li>{@link #COMPOSE} - Unicode canonical decompositiion - * followed by canonical composition. 
- * <li>{@link #COMPOSE_COMPAT} - Unicode compatibility decompositiion - * follwed by canonical composition. - * <li>{@link #DECOMP} - Unicode canonical decomposition - * <li>{@link #DECOMP_COMPAT} - Unicode compatibility decomposition. - * <li>{@link #NO_OP} - Do nothing but return characters - * from the underlying input text. + * <li>{@link #NFC} - Unicode canonical decompositiion + * followed by canonical composition. + * <li>{@link #NFKC} - Unicode compatibility decompositiion + * follwed by canonical composition. + * <li>{@link #NFD} - Unicode canonical decomposition + * <li>{@link #NFKD} - Unicode compatibility decomposition. + * <li>{@link #NONE} - Do nothing but return characters + * from the underlying input text. * </ul> * * @see #getMode @@ -983,9 +686,11 @@ */ public void setMode(Mode newMode) { mode = newMode; + norm2 = mode.getNormalizer2(options); } + /** - * Return the basic operation performed by this {@code Normalizer} + * Return the basic operation performed by this {@code NormalizerBase} * * @see #setMode * @stable ICU 2.8 @@ -995,688 +700,83 @@ } /** - * Set the input text over which this {@code Normalizer} will iterate. + * Set the input text over which this {@code NormalizerBase} will iterate. * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. * @stable ICU 2.8 */ public void setText(String newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { - throw new InternalError("Could not create a new UCharacterIterator"); + throw new IllegalStateException("Could not create a new UCharacterIterator"); } text = newIter; reset(); } /** - * Set the input text over which this {@code Normalizer} will iterate. + * Set the input text over which this {@code NormalizerBase} will iterate. * The iteration position is set to the beginning of the input text. * @param newText The new string to be normalized. 
* @stable ICU 2.8 */ public void setText(CharacterIterator newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); if (newIter == null) { - throw new InternalError("Could not create a new UCharacterIterator"); + throw new IllegalStateException("Could not create a new UCharacterIterator"); } text = newIter; currentIndex=nextIndex=0; clearBuffer(); } - //------------------------------------------------------------------------- - // Private utility methods - //------------------------------------------------------------------------- - - - /* backward iteration --------------------------------------------------- */ - - /* - * read backwards and get norm32 - * return 0 if the character is <minC - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first - * surrogate but read second!) - */ - - private static long getPrevNorm32(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - char[] chars) { - long norm32; - int ch=0; - /* need src.hasPrevious() */ - if((ch=src.previous()) == UCharacterIterator.DONE) { - return 0; - } - chars[0]=(char)ch; - chars[1]=0; - - /* check for a surrogate before getting norm32 to see if we need to - * predecrement further */ - if(chars[0]<minC) { - return 0; - } else if(!UTF16.isSurrogate(chars[0])) { - return NormalizerImpl.getNorm32(chars[0]); - } else if(UTF16.isLeadSurrogate(chars[0]) || (src.getIndex()==0)) { - /* unpaired surrogate */ - chars[1]=(char)src.current(); - return 0; - } else if(UTF16.isLeadSurrogate(chars[1]=(char)src.previous())) { - norm32=NormalizerImpl.getNorm32(chars[1]); - if((norm32&mask)==0) { - /* all surrogate pairs with this lead surrogate have irrelevant - * data */ - return 0; - } else { - /* norm32 must be a surrogate special */ - return NormalizerImpl.getNorm32FromSurrogatePair(norm32,chars[0]); - } - } else { - /* unpaired second surrogate, undo the c2=src.previous() movement */ - src.moveIndex( 1); - return 0; - } - } - - private interface 
IsPrevBoundary{ - public boolean isPrevBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - char[] chars); - } - private static final class IsPrevNFDSafe implements IsPrevBoundary{ - /* - * for NF*D: - * read backwards and check if the lead combining class is 0 - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first - * surrogate but read second!) - */ - public boolean isPrevBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - char[] chars) { - - return NormalizerImpl.isNFDSafe(getPrevNorm32(src, minC, - ccOrQCMask, chars), - ccOrQCMask, - ccOrQCMask& NormalizerImpl.QC_MASK); - } - } - - private static final class IsPrevTrueStarter implements IsPrevBoundary{ - /* - * read backwards and check if the character is (or its decomposition - * begins with) a "true starter" (cc==0 and NF*C_YES) - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first - * surrogate but read second!) - */ - public boolean isPrevBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - char[] chars) { - long norm32; - int/*unsigned*/ decompQCMask; - - decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/ - norm32=getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, chars); - return NormalizerImpl.isTrueStarter(norm32,ccOrQCMask,decompQCMask); - } - } - - private static int findPreviousIterationBoundary(UCharacterIterator src, - IsPrevBoundary obj, - int/*unsigned*/ minC, - int/*mask*/ mask, - char[] buffer, - int[] startIndex) { - char[] chars=new char[2]; - boolean isBoundary; - - /* fill the buffer from the end backwards */ - startIndex[0] = buffer.length; - chars[0]=0; - while(src.getIndex()>0 && chars[0]!=UCharacterIterator.DONE) { - isBoundary=obj.isPrevBoundary(src, minC, mask, chars); - - /* always write this character to the front of the buffer */ - /* make sure there is enough space in the buffer */ - if(startIndex[0] < (chars[1]==0 ? 
1 : 2)) { - - // grow the buffer - char[] newBuf = new char[buffer.length*2]; - /* move the current buffer contents up */ - System.arraycopy(buffer,startIndex[0],newBuf, - newBuf.length-(buffer.length-startIndex[0]), - buffer.length-startIndex[0]); - //adjust the startIndex - startIndex[0]+=newBuf.length-buffer.length; - - buffer=newBuf; - newBuf=null; - - } - - buffer[--startIndex[0]]=chars[0]; - if(chars[1]!=0) { - buffer[--startIndex[0]]=chars[1]; - } - - /* stop if this just-copied character is a boundary */ - if(isBoundary) { - break; - } - } - - /* return the length of the buffer contents */ - return buffer.length-startIndex[0]; - } - - private static int previous(UCharacterIterator src, - char[] dest, int destStart, int destLimit, - Mode mode, - boolean doNormalize, - boolean[] pNeededToNormalize, - int options) { - - IsPrevBoundary isPreviousBoundary; - int destLength, bufferLength; - int/*unsigned*/ mask; - int c,c2; - - char minC; - int destCapacity = destLimit-destStart; - destLength=0; - - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=false; - } - minC = (char)mode.getMinC(); - mask = mode.getMask(); - isPreviousBoundary = mode.getPrevBoundary(); - - if(isPreviousBoundary==null) { - destLength=0; - if((c=src.previous())>=0) { - destLength=1; - if(UTF16.isTrailSurrogate((char)c)) { - c2= src.previous(); - if(c2!= UCharacterIterator.DONE) { - if(UTF16.isLeadSurrogate((char)c2)) { - if(destCapacity>=2) { - dest[1]=(char)c; // trail surrogate - destLength=2; - } - // lead surrogate to be written below - c=c2; - } else { - src.moveIndex(1); - } - } - } - - if(destCapacity>0) { - dest[0]=(char)c; - } - } - return destLength; - } - - char[] buffer = new char[100]; - int[] startIndex= new int[1]; - bufferLength=findPreviousIterationBoundary(src, - isPreviousBoundary, - minC, mask,buffer, - startIndex); - if(bufferLength>0) { - if(doNormalize) { - destLength=NormalizerBase.normalize(buffer,startIndex[0], - startIndex[0]+bufferLength, - dest, 
destStart,destLimit, - mode, options); - - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=destLength!=bufferLength || - Utility.arrayRegionMatches( - buffer,0,dest, - destStart,destLimit - ); - } - } else { - /* just copy the source characters */ - if(destCapacity>0) { - System.arraycopy(buffer,startIndex[0],dest,0, - (bufferLength<destCapacity) ? - bufferLength : destCapacity - ); - } - } - } - - - return destLength; - } - - - - /* forward iteration ---------------------------------------------------- */ - /* - * read forward and check if the character is a next-iteration boundary - * if c2!=0 then (c, c2) is a surrogate pair - */ - private interface IsNextBoundary{ - boolean isNextBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - int[] chars); - } - /* - * read forward and get norm32 - * return 0 if the character is <minC - * if c2!=0 then (c2, c) is a surrogate pair - * always reads complete characters - */ - private static long /*unsigned*/ getNextNorm32(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - int[] chars) { - long norm32; - - /* need src.hasNext() to be true */ - chars[0]=src.next(); - chars[1]=0; - - if(chars[0]<minC) { - return 0; - } - - norm32=NormalizerImpl.getNorm32((char)chars[0]); - if(UTF16.isLeadSurrogate((char)chars[0])) { - if(src.current()!=UCharacterIterator.DONE && - UTF16.isTrailSurrogate((char)(chars[1]=src.current()))) { - src.moveIndex(1); /* skip the c2 surrogate */ - if((norm32&mask)==0) { - /* irrelevant data */ - return 0; - } else { - /* norm32 must be a surrogate special */ - return NormalizerImpl.getNorm32FromSurrogatePair(norm32,(char)chars[1]); - } - } else { - /* unmatched surrogate */ - return 0; - } - } - return norm32; - } - - - /* - * for NF*D: - * read forward and check if the lead combining class is 0 - * if c2!=0 then (c, c2) is a surrogate pair - */ - private static final class IsNextNFDSafe implements IsNextBoundary{ - public boolean 
isNextBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - int[] chars) { - return NormalizerImpl.isNFDSafe(getNextNorm32(src,minC,ccOrQCMask,chars), - ccOrQCMask, ccOrQCMask&NormalizerImpl.QC_MASK); - } - } - - /* - * for NF*C: - * read forward and check if the character is (or its decomposition begins - * with) a "true starter" (cc==0 and NF*C_YES) - * if c2!=0 then (c, c2) is a surrogate pair - */ - private static final class IsNextTrueStarter implements IsNextBoundary{ - public boolean isNextBoundary(UCharacterIterator src, - int/*unsigned*/ minC, - int/*unsigned*/ ccOrQCMask, - int[] chars) { - long norm32; - int/*unsigned*/ decompQCMask; - - decompQCMask=(ccOrQCMask<<2)&0xf; /*decomposition quick check mask*/ - norm32=getNextNorm32(src, minC, ccOrQCMask|decompQCMask, chars); - return NormalizerImpl.isTrueStarter(norm32, ccOrQCMask, decompQCMask); - } - } - - private static int findNextIterationBoundary(UCharacterIterator src, - IsNextBoundary obj, - int/*unsigned*/ minC, - int/*unsigned*/ mask, - char[] buffer) { - if(src.current()==UCharacterIterator.DONE) { - return 0; - } - - /* get one character and ignore its properties */ - int[] chars = new int[2]; - chars[0]=src.next(); - buffer[0]=(char)chars[0]; - int bufferIndex = 1; - - if(UTF16.isLeadSurrogate((char)chars[0])&& - src.current()!=UCharacterIterator.DONE) { - if(UTF16.isTrailSurrogate((char)(chars[1]=src.next()))) { - buffer[bufferIndex++]=(char)chars[1]; - } else { - src.moveIndex(-1); /* back out the non-trail-surrogate */ - } - } - - /* get all following characters until we see a boundary */ - /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff - * is part of the string */ - while( src.current()!=UCharacterIterator.DONE) { - if(obj.isNextBoundary(src, minC, mask, chars)) { - /* back out the latest movement to stop at the boundary */ - src.moveIndex(chars[1]==0 ? -1 : -2); - break; - } else { - if(bufferIndex+(chars[1]==0 ? 
1 : 2)<=buffer.length) { - buffer[bufferIndex++]=(char)chars[0]; - if(chars[1]!=0) { - buffer[bufferIndex++]=(char)chars[1]; - } - } else { - char[] newBuf = new char[buffer.length*2]; - System.arraycopy(buffer,0,newBuf,0,bufferIndex); - buffer = newBuf; - buffer[bufferIndex++]=(char)chars[0]; - if(chars[1]!=0) { - buffer[bufferIndex++]=(char)chars[1]; - } - } - } - } - - /* return the length of the buffer contents */ - return bufferIndex; - } - - private static int next(UCharacterIterator src, - char[] dest, int destStart, int destLimit, - NormalizerBase.Mode mode, - boolean doNormalize, - boolean[] pNeededToNormalize, - int options) { - - IsNextBoundary isNextBoundary; - int /*unsigned*/ mask; - int /*unsigned*/ bufferLength; - int c,c2; - char minC; - int destCapacity = destLimit - destStart; - int destLength = 0; - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=false; - } - - minC = (char)mode.getMinC(); - mask = mode.getMask(); - isNextBoundary = mode.getNextBoundary(); - - if(isNextBoundary==null) { - destLength=0; - c=src.next(); - if(c!=UCharacterIterator.DONE) { - destLength=1; - if(UTF16.isLeadSurrogate((char)c)) { - c2= src.next(); - if(c2!= UCharacterIterator.DONE) { - if(UTF16.isTrailSurrogate((char)c2)) { - if(destCapacity>=2) { - dest[1]=(char)c2; // trail surrogate - destLength=2; - } - // lead surrogate to be written below - } else { - src.moveIndex(-1); - } - } - } - - if(destCapacity>0) { - dest[0]=(char)c; - } - } - return destLength; - } - - char[] buffer=new char[100]; - int[] startIndex = new int[1]; - bufferLength=findNextIterationBoundary(src,isNextBoundary, minC, mask, - buffer); - if(bufferLength>0) { - if(doNormalize) { - destLength=mode.normalize(buffer,startIndex[0],bufferLength, - dest,destStart,destLimit, options); - - if(pNeededToNormalize!=null) { - pNeededToNormalize[0]=destLength!=bufferLength || - Utility.arrayRegionMatches(buffer,startIndex[0], - dest,destStart, - destLength); - } - } else { - /* just copy the source 
characters */ - if(destCapacity>0) { - System.arraycopy(buffer,0,dest,destStart, - Math.min(bufferLength,destCapacity) - ); - } - - - } - } - return destLength; - } - private void clearBuffer() { - bufferLimit=bufferStart=bufferPos=0; + buffer.setLength(0); + bufferPos=0; } private boolean nextNormalize() { - clearBuffer(); currentIndex=nextIndex; text.setIndex(nextIndex); - - bufferLimit=next(text,buffer,bufferStart,buffer.length,mode,true,null,options); - + // Skip at least one character so we make progress. + int c=text.nextCodePoint(); + if(c<0) { + return false; + } + StringBuilder segment=new StringBuilder().appendCodePoint(c); + while((c=text.nextCodePoint())>=0) { + if(norm2.hasBoundaryBefore(c)) { + text.moveCodePointIndex(-1); + break; + } + segment.appendCodePoint(c); + } nextIndex=text.getIndex(); - return (bufferLimit>0); + norm2.normalize(segment, buffer); + return buffer.length()!=0; } private boolean previousNormalize() { - clearBuffer(); nextIndex=currentIndex; text.setIndex(currentIndex); - bufferLimit=previous(text,buffer,bufferStart,buffer.length,mode,true,null,options); - - currentIndex=text.getIndex(); - bufferPos = bufferLimit; - return bufferLimit>0; - } - - private int getCodePointAt(int index) { - if( UTF16.isSurrogate(buffer[index])) { - if(UTF16.isLeadSurrogate(buffer[index])) { - if((index+1)<bufferLimit && - UTF16.isTrailSurrogate(buffer[index+1])) { - return UCharacterProperty.getRawSupplementary( - buffer[index], - buffer[index+1] - ); - } - }else if(UTF16.isTrailSurrogate(buffer[index])) { - if(index>0 && UTF16.isLeadSurrogate(buffer[index-1])) { - return UCharacterProperty.getRawSupplementary( - buffer[index-1], - buffer[index] - ); - } - } - } - return buffer[index]; - - } - - /** - * Internal API - * @internal - */ - public static boolean isNFSkippable(int c, Mode mode) { - return mode.isNFSkippable(c); - } - - // - // Options - // - - /* - * Default option for Unicode 3.2.0 normalization. 
- * Corrigendum 4 was fixed in Unicode 3.2.0 but isn't supported in - * IDNA/StringPrep. - * The public review issue #29 was fixed in Unicode 4.1.0. Corrigendum 5 - * allowed Unicode 3.2 to 4.0.1 to apply the fix for PRI #29, but it isn't - * supported by IDNA/StringPrep as well as Corrigendum 4. - */ - public static final int UNICODE_3_2_0_ORIGINAL = - UNICODE_3_2 | - NormalizerImpl.WITHOUT_CORRIGENDUM4_CORRECTIONS | - NormalizerImpl.BEFORE_PRI_29; - - /* - * Default option for the latest Unicode normalization. This option is - * provided mainly for testing. - * The value zero means that normalization is done with the fixes for - * - Corrigendum 4 (Five CJK Canonical Mapping Errors) - * - Corrigendum 5 (Normalization Idempotency) - */ - public static final int UNICODE_LATEST = 0x00; - - // - // public constructor and methods for java.text.Normalizer and - // sun.text.Normalizer - // - - /** - * Creates a new {@code Normalizer} object for iterating over the - * normalized form of a given string. - * - * @param str The string to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. - */ - public NormalizerBase(String str, Mode mode) { - this(str, mode, UNICODE_LATEST); - } - - /** - * Normalizes a <code>String</code> using the given normalization form. - * - * @param str the input string to be normalized. - * @param form the normalization form - */ - public static String normalize(String str, Normalizer.Form form) { - return normalize(str, form, UNICODE_LATEST); - } - - /** - * Normalizes a <code>String</code> using the given normalization form. - * - * @param str the input string to be normalized. - * @param form the normalization form - * @param options the optional features to be enabled. 
- */ - public static String normalize(String str, Normalizer.Form form, int options) { - int len = str.length(); - boolean asciiOnly = true; - if (len < 80) { - for (int i = 0; i < len; i++) { - if (str.charAt(i) > 127) { - asciiOnly = false; - break; - } + StringBuilder segment=new StringBuilder(); + int c; + while((c=text.previousCodePoint())>=0) { + if(c<=0xffff) { + segment.insert(0, (char)c); + } else { + segment.insert(0, Character.toChars(c)); } - } else { - char[] a = str.toCharArray(); - for (int i = 0; i < len; i++) { - if (a[i] > 127) { - asciiOnly = false; - break; - } + if(norm2.hasBoundaryBefore(c)) { + break; } } - - switch (form) { - case NFC : - return asciiOnly ? str : NFC.normalize(str, options); - case NFD : - return asciiOnly ? str : NFD.normalize(str, options); - case NFKC : - return asciiOnly ? str : NFKC.normalize(str, options); - case NFKD : - return asciiOnly ? str : NFKD.normalize(str, options); - } - - throw new IllegalArgumentException("Unexpected normalization form: " + - form); - } - - /** - * Test if a string is in a given normalization form. - * This is semantically equivalent to source.equals(normalize(source, mode)). - * - * Unlike quickCheck(), this function returns a definitive result, - * never a "maybe". - * For NFD, NFKD, and FCD, both functions work exactly the same. - * For NFC and NFKC where quickCheck may return "maybe", this function will - * perform further tests to arrive at a true/false result. - * @param str the input string to be checked to see if it is normalized - * @param form the normalization form - */ - public static boolean isNormalized(String str, Normalizer.Form form) { - return isNormalized(str, form, UNICODE_LATEST); + currentIndex=text.getIndex(); + norm2.normalize(segment, buffer); + bufferPos=buffer.length(); + return buffer.length()!=0; } - /** - * Test if a string is in a given normalization form. - * This is semantically equivalent to source.equals(normalize(source, mode)). 
- * - * Unlike quickCheck(), this function returns a definitive result, - * never a "maybe". - * For NFD, NFKD, and FCD, both functions work exactly the same. - * For NFC and NFKC where quickCheck may return "maybe", this function will - * perform further tests to arrive at a true/false result. - * @param str the input string to be checked to see if it is normalized - * @param form the normalization form - * @param options the optional features to be enabled. - */ - public static boolean isNormalized(String str, Normalizer.Form form, int options) { - switch (form) { - case NFC: - return (NFC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); - case NFD: - return (NFD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); - case NFKC: - return (NFKC.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); - case NFKD: - return (NFKD.quickCheck(str.toCharArray(),0,str.length(),false,NormalizerImpl.getNX(options))==YES); - } - - throw new IllegalArgumentException("Unexpected normalization form: " + - form); - } } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2015-07-13 16:11:50.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2015-07-13 16:11:49.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,2453 +22,1706 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* - * (C) Copyright IBM Corp. 
and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. ******************************************************************************* */ package sun.text.normalizer; -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.BufferedInputStream; -import java.io.InputStream; +import java.nio.ByteBuffer; +import java.text.Normalizer; -/** - * @author Ram Viswanadha - */ +// Original filename in ICU4J: Normalizer2Impl.java public final class NormalizerImpl { - // Static block for the class to initialize its own self - static final NormalizerImpl IMPL; - - static - { - try - { - IMPL = new NormalizerImpl(); - } - catch (Exception e) - { - throw new RuntimeException(e.getMessage()); - } - } - - static final int UNSIGNED_BYTE_MASK =0xFF; - static final long UNSIGNED_INT_MASK = 0xffffffffL; - /* - * This new implementation of the normalization code loads its data from - * unorm.icu, which is generated with the gennorm tool. - * The format of that file is described at the end of this file. 
- */ - private static final String DATA_FILE_NAME = "/sun/text/resources/unorm.icu"; - - // norm32 value constants - - // quick check flags 0..3 set mean "no" for their forms - public static final int QC_NFC=0x11; /* no|maybe */ - public static final int QC_NFKC=0x22; /* no|maybe */ - public static final int QC_NFD=4; /* no */ - public static final int QC_NFKD=8; /* no */ - - public static final int QC_ANY_NO=0xf; - - /* quick check flags 4..5 mean "maybe" for their forms; - * test flags>=QC_MAYBE - */ - public static final int QC_MAYBE=0x10; - public static final int QC_ANY_MAYBE=0x30; - - public static final int QC_MASK=0x3f; - - private static final int COMBINES_FWD=0x40; - private static final int COMBINES_BACK=0x80; - public static final int COMBINES_ANY=0xc0; - // UnicodeData.txt combining class in bits 15. - private static final int CC_SHIFT=8; - public static final int CC_MASK=0xff00; - // 16 bits for the index to UChars and other extra data - private static final int EXTRA_SHIFT=16; - - /* norm32 value constants using >16 bits */ - private static final long MIN_SPECIAL = 0xfc000000 & UNSIGNED_INT_MASK; - private static final long SURROGATES_TOP = 0xfff00000 & UNSIGNED_INT_MASK; - private static final long MIN_HANGUL = 0xfff00000 & UNSIGNED_INT_MASK; -// private static final long MIN_JAMO_V = 0xfff20000 & UNSIGNED_INT_MASK; - private static final long JAMO_V_TOP = 0xfff30000 & UNSIGNED_INT_MASK; - - - /* indexes[] value names */ - /* number of bytes in normalization trie */ - static final int INDEX_TRIE_SIZE = 0; - /* number of chars in extra data */ - static final int INDEX_CHAR_COUNT = 1; - /* number of uint16_t words for combining data */ - static final int INDEX_COMBINE_DATA_COUNT = 2; - /* first code point with quick check NFC NO/MAYBE */ - public static final int INDEX_MIN_NFC_NO_MAYBE = 6; - /* first code point with quick check NFKC NO/MAYBE */ - public static final int INDEX_MIN_NFKC_NO_MAYBE = 7; - /* first code point with quick check NFD NO/MAYBE 
*/ - public static final int INDEX_MIN_NFD_NO_MAYBE = 8; - /* first code point with quick check NFKD NO/MAYBE */ - public static final int INDEX_MIN_NFKD_NO_MAYBE = 9; - /* number of bytes in FCD trie */ - static final int INDEX_FCD_TRIE_SIZE = 10; - /* number of bytes in the auxiliary trie */ - static final int INDEX_AUX_TRIE_SIZE = 11; - /* changing this requires a new formatVersion */ - static final int INDEX_TOP = 32; - - - /* AUX constants */ - /* value constants for auxTrie */ - private static final int AUX_UNSAFE_SHIFT = 11; - private static final int AUX_COMP_EX_SHIFT = 10; - private static final int AUX_NFC_SKIPPABLE_F_SHIFT = 12; - - private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT; - private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK); - private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK); - private static final int AUX_COMP_EX_MASK = (int)((1<<AUX_COMP_EX_SHIFT) & UNSIGNED_INT_MASK); - private static final long AUX_NFC_SKIP_F_MASK = ((UNSIGNED_INT_MASK&1)<<AUX_NFC_SKIPPABLE_F_SHIFT); - - private static final int MAX_BUFFER_SIZE = 20; - - /*******************************/ - - /* Wrappers for Trie implementations */ - static final class NormTrieImpl implements Trie.DataManipulate{ - static IntTrie normTrie= null; - /** - * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's - * data the index array offset of the indexes for that lead surrogate. 
- * @param property data value for a surrogate from the trie, including - * the folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ - public int getFoldingOffset(int value){ - return BMP_INDEX_LENGTH+ - ((value>>(EXTRA_SHIFT-SURROGATE_BLOCK_BITS))& - (0x3ff<<SURROGATE_BLOCK_BITS)); - } - - } - static final class FCDTrieImpl implements Trie.DataManipulate{ - static CharTrie fcdTrie=null; - /** - * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's - * data the index array offset of the indexes for that lead surrogate. - * @param property data value for a surrogate from the trie, including - * the folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - /* fcdTrie: the folding offset is the lead FCD value itself */ - public int getFoldingOffset(int value){ - return value; - } - } - - static final class AuxTrieImpl implements Trie.DataManipulate{ - static CharTrie auxTrie = null; - /** - * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's - * data the index array offset of the indexes for that lead surrogate. 
- * @param property data value for a surrogate from the trie, including - * the folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */ - public int getFoldingOffset(int value){ - return (value &AUX_FNC_MASK)<<SURROGATE_BLOCK_BITS; - } - } - - /****************************************************/ - - - private static FCDTrieImpl fcdTrieImpl; - private static NormTrieImpl normTrieImpl; - private static AuxTrieImpl auxTrieImpl; - private static int[] indexes; - private static char[] combiningTable; - private static char[] extraData; - - private static boolean isDataLoaded; - private static boolean isFormatVersion_2_1; - private static boolean isFormatVersion_2_2; - private static byte[] unicodeVersion; - - /** - * Default buffer size of datafile - */ - private static final int DATA_BUFFER_SIZE = 25000; - - /** - * FCD check: everything below this code point is known to have a 0 - * lead combining class - */ - public static final int MIN_WITH_LEAD_CC=0x300; - - - /** - * Bit 7 of the length byte for a decomposition string in extra data is - * a flag indicating whether the decomposition string is - * preceded by a 16-bit word with the leading and trailing cc - * of the decomposition (like for A-umlaut); - * if not, then both cc's are zero (like for compatibility ideographs). - */ - private static final int DECOMP_FLAG_LENGTH_HAS_CC=0x80; - /** - * Bits 6..0 of the length byte contain the actual length. - */ - private static final int DECOMP_LENGTH_MASK=0x7f; - - /** Length of the BMP portion of the index (stage 1) array. */ - private static final int BMP_INDEX_LENGTH=0x10000>>Trie.INDEX_STAGE_1_SHIFT_; - /** Number of bits of a trail surrogate that are used in index table - * lookups. 
- */ - private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_; - - - // public utility - public static int getFromIndexesArr(int index){ - return indexes[index]; - } - - // protected constructor --------------------------------------------- - - /** - * Constructor - * @exception thrown when data reading fails or data corrupted - */ - private NormalizerImpl() throws IOException { - //data should be loaded only once - if(!isDataLoaded){ - - // jar access - InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME); - BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE); - NormalizerDataReader reader = new NormalizerDataReader(b); - - // read the indexes - indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP); - - byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]]; - - int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT]; - combiningTable = new char[combiningTableTop]; - - int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT]; - extraData = new char[extraDataTop]; - - byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]]; - byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]]; - - fcdTrieImpl = new FCDTrieImpl(); - normTrieImpl = new NormTrieImpl(); - auxTrieImpl = new AuxTrieImpl(); - - // load the rest of the data data and initialize the data members - reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable); - - NormTrieImpl.normTrie = new IntTrie( new ByteArrayInputStream(normBytes),normTrieImpl ); - FCDTrieImpl.fcdTrie = new CharTrie( new ByteArrayInputStream(fcdBytes),fcdTrieImpl ); - AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl ); - - // we reached here without any exceptions so the data is fully - // loaded set the variable to true - isDataLoaded = true; - - // get the data format version - byte[] formatVersion = reader.getDataFormatVersion(); - - isFormatVersion_2_1 =( formatVersion[0]>2 - || - 
(formatVersion[0]==2 && formatVersion[1]>=1) - ); - isFormatVersion_2_2 =( formatVersion[0]>2 - || - (formatVersion[0]==2 && formatVersion[1]>=2) - ); - unicodeVersion = reader.getUnicodeVersion(); - b.close(); - } - } - - /* ---------------------------------------------------------------------- */ - - /* Korean Hangul and Jamo constants */ - - public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ - public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ - public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ - - public static final int HANGUL_BASE=0xac00; - - public static final int JAMO_L_COUNT=19; - public static final int JAMO_V_COUNT=21; - public static final int JAMO_T_COUNT=28; - public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; - - private static boolean isHangulWithoutJamoT(char c) { - c-=HANGUL_BASE; - return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; - } - - /* norm32 helpers */ - - /* is this a norm32 with a regular index? */ - private static boolean isNorm32Regular(long norm32) { - return norm32<MIN_SPECIAL; - } - - /* is this a norm32 with a special index for a lead surrogate? */ - private static boolean isNorm32LeadSurrogate(long norm32) { - return MIN_SPECIAL<=norm32 && norm32<SURROGATES_TOP; - } - - /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */ - private static boolean isNorm32HangulOrJamo(long norm32) { - return norm32>=MIN_HANGUL; - } - - /* - * Given norm32 for Jamo V or T, - * is this a Jamo V? 
- */ - private static boolean isJamoVTNorm32JamoV(long norm32) { - return norm32<JAMO_V_TOP; - } - - /* data access primitives ----------------------------------------------- */ - - public static long/*unsigned*/ getNorm32(char c) { - return ((UNSIGNED_INT_MASK) & (NormTrieImpl.normTrie.getLeadValue(c))); - } - - public static long/*unsigned*/ getNorm32FromSurrogatePair(long norm32, - char c2) { - /* - * the surrogate index in norm32 stores only the number of the surrogate - * index block see gennorm/store.c/getFoldedNormValue() - */ - return ((UNSIGNED_INT_MASK) & - NormTrieImpl.normTrie.getTrailValue((int)norm32, c2)); - } - ///CLOVER:OFF - private static long getNorm32(int c){ - return (UNSIGNED_INT_MASK&(NormTrieImpl.normTrie.getCodePointValue(c))); - } - - /* - * get a norm32 from text with complete code points - * (like from decompositions) - */ - private static long/*unsigned*/ getNorm32(char[] p,int start, - int/*unsigned*/ mask) { - long/*unsigned*/ norm32= getNorm32(p[start]); - if(((norm32&mask)>0) && isNorm32LeadSurrogate(norm32)) { - /* *p is a lead surrogate, get the real norm32 */ - norm32=getNorm32FromSurrogatePair(norm32, p[start+1]); - } - return norm32; - } - - //// for StringPrep - public static VersionInfo getUnicodeVersion(){ - return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1], - unicodeVersion[2], unicodeVersion[3]); - } - - public static char getFCD16(char c) { - return FCDTrieImpl.fcdTrie.getLeadValue(c); - } - - public static char getFCD16FromSurrogatePair(char fcd16, char c2) { - /* the surrogate index in fcd16 is an absolute offset over the - * start of stage 1 - * */ - return FCDTrieImpl.fcdTrie.getTrailValue(fcd16, c2); - } - public static int getFCD16(int c) { - return FCDTrieImpl.fcdTrie.getCodePointValue(c); - } - - private static int getExtraDataIndex(long norm32) { - return (int)(norm32>>EXTRA_SHIFT); - } - - private static final class DecomposeArgs{ - int /*unsigned byte*/ cc; - int /*unsigned byte*/ trailCC; - 
int length; - } - /** - * - * get the canonical or compatibility decomposition for one character - * - * @return index into the extraData array - */ - private static int/*index*/ decompose(long/*unsigned*/ norm32, - int/*unsigned*/ qcMask, - DecomposeArgs args) { - int p= getExtraDataIndex(norm32); - args.length=extraData[p++]; - - if((norm32&qcMask&QC_NFKD)!=0 && args.length>=0x100) { - /* use compatibility decomposition, skip canonical data */ - p+=((args.length>>7)&1)+(args.length&DECOMP_LENGTH_MASK); - args.length>>=8; - } - - if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { - /* get the lead and trail cc's */ - char bothCCs=extraData[p++]; - args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); - args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; - } else { - /* lead and trail cc's are both 0 */ - args.cc=args.trailCC=0; - } - args.length&=DECOMP_LENGTH_MASK; - return p; - } + public static final class Hangul { + /* Korean Hangul and Jamo constants */ + public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ + public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ + public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ + public static final int HANGUL_BASE=0xac00; + public static final int HANGUL_END=0xd7a3; - /** - * get the canonical decomposition for one character - * @return index into the extraData array - */ - private static int decompose(long/*unsigned*/ norm32, - DecomposeArgs args) { + public static final int JAMO_L_COUNT=19; + public static final int JAMO_V_COUNT=21; + public static final int JAMO_T_COUNT=28; - int p= getExtraDataIndex(norm32); - args.length=extraData[p++]; + public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; + public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; - if((args.length&DECOMP_FLAG_LENGTH_HAS_CC)>0) { - /* get the lead and trail cc's */ - char bothCCs=extraData[p++]; - args.cc=(UNSIGNED_BYTE_MASK) & (bothCCs>>8); - args.trailCC=(UNSIGNED_BYTE_MASK) & bothCCs; - } else { - /* lead 
and trail cc's are both 0 */ - args.cc=args.trailCC=0; + public static boolean isHangul(int c) { + return HANGUL_BASE<=c && c<HANGUL_LIMIT; } - args.length&=DECOMP_LENGTH_MASK; - return p; - } - - - private static final class NextCCArgs{ - char[] source; - int next; - int limit; - char c; - char c2; - } - - /* - * get the combining class of (c, c2)= args.source[args.next++] - * before: args.next<args.limit after: args.next<=args.limit - * if only one code unit is used, then c2==0 - */ - private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { - long /*unsigned*/ norm32; - - args.c=args.source[args.next++]; - - norm32= getNorm32(args.c); - if((norm32 & CC_MASK)==0) { - args.c2=0; - return 0; - } else { - if(!isNorm32LeadSurrogate(norm32)) { - args.c2=0; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(args.next!=args.limit && - UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ - ++args.next; - norm32=getNorm32FromSurrogatePair(norm32, args.c2); - } else { - args.c2=0; - return 0; - } - } - - return (int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); + public static boolean isHangulWithoutJamoT(char c) { + c-=HANGUL_BASE; + return c<HANGUL_COUNT && c%JAMO_T_COUNT==0; } - } - private static final class PrevArgs{ - char[] src; - int start; - int current; - char c; - char c2; - } - - /* - * read backwards and get norm32 - * return 0 if the character is <minC - * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first - * surrogate but read second!) - */ - private static long /*unsigned*/ getPrevNorm32(PrevArgs args, - int/*unsigned*/ minC, - int/*unsigned*/ mask) { - long/*unsigned*/ norm32; - - args.c=args.src[--args.current]; - args.c2=0; - - /* check for a surrogate before getting norm32 to see if we need to - * predecrement further + /** + * Decomposes c, which must be a Hangul syllable, into buffer + * and returns the length of the decomposition (2 or 3). 
*/ - if(args.c<minC) { - return 0; - } else if(!UTF16.isSurrogate(args.c)) { - return getNorm32(args.c); - } else if(UTF16.isLeadSurrogate(args.c)) { - /* unpaired first surrogate */ - return 0; - } else if(args.current!=args.start && - UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { - --args.current; - norm32=getNorm32(args.c2); - - if((norm32&mask)==0) { - /* all surrogate pairs with this lead surrogate have - * only irrelevant data - */ - return 0; - } else { - /* norm32 must be a surrogate special */ - return getNorm32FromSurrogatePair(norm32, args.c); - } - } else { - /* unpaired second surrogate */ - args.c2=0; - return 0; - } - } - - /* - * get the combining class of (c, c2)=*--p - * before: start<p after: start<=p - */ - private static int /*unsigned byte*/ getPrevCC(PrevArgs args) { - - return (int)((UNSIGNED_BYTE_MASK)&(getPrevNorm32(args, MIN_WITH_LEAD_CC, - CC_MASK)>>CC_SHIFT)); - } - - /* - * is this a safe boundary character for NF*D? - * (lead cc==0) - */ - public static boolean isNFDSafe(long/*unsigned*/ norm32, - int/*unsigned*/ccOrQCMask, - int/*unsigned*/ decompQCMask) { - if((norm32&ccOrQCMask)==0) { - return true; /* cc==0 and no decomposition: this is NF*D safe */ - } - - /* inspect its decomposition - maybe a Hangul but not a surrogate here*/ - if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) { - DecomposeArgs args=new DecomposeArgs(); - /* decomposes, get everything from the variable-length extra data */ - decompose(norm32, decompQCMask, args); - return args.cc==0; - } else { - /* no decomposition (or Hangul), test the cc directly */ - return (norm32&CC_MASK)==0; - } - } - - /* - * is this (or does its decomposition begin with) a "true starter"? 
- * (cc==0 and NF*C_YES) - */ - public static boolean isTrueStarter(long/*unsigned*/ norm32, - int/*unsigned*/ ccOrQCMask, - int/*unsigned*/ decompQCMask) { - if((norm32&ccOrQCMask)==0) { - return true; /* this is a true starter (could be Hangul or Jamo L)*/ - } - - /* inspect its decomposition - not a Hangul or a surrogate here */ - if((norm32&decompQCMask)!=0) { - int p; /* index into extra data array */ - DecomposeArgs args=new DecomposeArgs(); - /* decomposes, get everything from the variable-length extra data */ - p=decompose(norm32, decompQCMask, args); - - if(args.cc==0) { - int/*unsigned*/ qcMask=ccOrQCMask&QC_MASK; - - /* does it begin with NFC_YES? */ - if((getNorm32(extraData,p, qcMask)&qcMask)==0) { - /* yes, the decomposition begins with a true starter */ - return true; - } - } - } - return false; - } - - /* reorder UTF-16 in-place ---------------------------------------------- */ - - /** - * simpler, single-character version of mergeOrdered() - - * bubble-insert one single code point into the preceding string - * which is already canonically ordered - * (c, c2) may or may not yet have been inserted at src[current]..src[p] - * - * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 
1 : 2) - * - * before: src[start]..src[current] is already ordered, and - * src[current]..src[p] may or may not hold (c, c2) but - * must be exactly the same length as (c, c2) - * after: src[start]..src[p] is ordered - * - * @return the trailing combining class - */ - private static int/*unsigned byte*/ insertOrdered(char[] source, - int start, - int current, int p, - char c, char c2, - int/*unsigned byte*/ cc) { - int back, preBack; - int r; - int prevCC, trailCC=cc; - - if(start<current && cc!=0) { - // search for the insertion point where cc>=prevCC - preBack=back=current; - PrevArgs prevArgs = new PrevArgs(); - prevArgs.current = current; - prevArgs.start = start; - prevArgs.src = source; - // get the prevCC - prevCC=getPrevCC(prevArgs); - preBack = prevArgs.current; - - if(cc<prevCC) { - // this will be the last code point, so keep its cc - trailCC=prevCC; - back=preBack; - while(start<preBack) { - prevCC=getPrevCC(prevArgs); - preBack=prevArgs.current; - if(cc>=prevCC) { - break; - } - back=preBack; + public static int decompose(int c, Appendable buffer) { + try { + c-=HANGUL_BASE; + int c2=c%JAMO_T_COUNT; + c/=JAMO_T_COUNT; + buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); + buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); + if(c2==0) { + return 2; + } else { + buffer.append((char)(JAMO_T_BASE+c2)); + return 3; } - - - // this is where we are right now with all these indicies: - // [start]..[pPreBack] 0..? 
code points that we can ignore - // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc - // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) - // [current]..[p] 1 code point (c, c2) with cc - - // move the code units in between up - r=p; - do { - source[--r]=source[--current]; - } while(back!=current); + } catch(IOException e) { + throw new InternalError(e); } } - - // insert (c, c2) - source[current]=c; - if(c2!=0) { - source[(current+1)]=c2; - } - - // we know the cc of the last code point - return trailCC; } /** - * merge two UTF-16 string parts together - * to canonically order (order by combining classes) their concatenation - * - * the two strings may already be adjacent, so that the merging is done - * in-place if the two strings are not adjacent, then the buffer holding the - * first one must be large enough - * the second string may or may not be ordered in itself - * - * before: [start]..[current] is already ordered, and - * [next]..[limit] may be ordered in itself, but - * is not in relation to [start..current[ - * after: [start..current+(limit-next)[ is ordered - * - * the algorithm is a simple bubble-sort that takes the characters from - * src[next++] and inserts them in correct combining class order into the - * preceding part of the string - * - * since this function is called much less often than the single-code point - * insertOrdered(), it just uses that for easier maintenance - * - * @return the trailing combining class - */ - private static int /*unsigned byte*/ mergeOrdered(char[] source, - int start, - int current, - char[] data, - int next, - int limit, - boolean isOrdered) { - int r; - int /*unsigned byte*/ cc, trailCC=0; - boolean adjacent; - - adjacent= current==next; - NextCCArgs ncArgs = new NextCCArgs(); - ncArgs.source = data; - ncArgs.next = next; - ncArgs.limit = limit; - - if(start!=current || !isOrdered) { - - while(ncArgs.next<ncArgs.limit) { - cc=getNextCC(ncArgs); - if(cc==0) { - // does not bubble back 
- trailCC=0; - if(adjacent) { - current=ncArgs.next; - } else { - data[current++]=ncArgs.c; - if(ncArgs.c2!=0) { - data[current++]=ncArgs.c2; - } - } - if(isOrdered) { - break; - } else { - start=current; - } - } else { - r=current+(ncArgs.c2==0 ? 1 : 2); - trailCC=insertOrdered(source,start, current, r, - ncArgs.c, ncArgs.c2, cc); - current=r; - } - } - } - - if(ncArgs.next==ncArgs.limit) { - // we know the cc of the last code point - return trailCC; - } else { - if(!adjacent) { - // copy the second string part - do { - source[current++]=data[ncArgs.next++]; - } while(ncArgs.next!=ncArgs.limit); - ncArgs.limit=current; - } - PrevArgs prevArgs = new PrevArgs(); - prevArgs.src = data; - prevArgs.start = start; - prevArgs.current = ncArgs.limit; - return getPrevCC(prevArgs); - } - - } - private static int /*unsigned byte*/ mergeOrdered(char[] source, - int start, - int current, - char[] data, - final int next, - final int limit) { - return mergeOrdered(source,start,current,data,next,limit,true); - } - - public static NormalizerBase.QuickCheckResult quickCheck(char[] src, - int srcStart, - int srcLimit, - int minNoMaybe, - int qcMask, - int options, - boolean allowMaybe, - UnicodeSet nx){ - - int ccOrQCMask; - long norm32; - char c, c2; - char cc, prevCC; - long qcNorm32; - NormalizerBase.QuickCheckResult result; - ComposePartArgs args = new ComposePartArgs(); - char[] buffer ; - int start = srcStart; - - if(!isDataLoaded) { - return NormalizerBase.MAYBE; - } - // initialize - ccOrQCMask=CC_MASK|qcMask; - result=NormalizerBase.YES; - prevCC=0; - - for(;;) { - for(;;) { - if(srcStart==srcLimit) { - return result; - } else if((c=src[srcStart++])>=minNoMaybe && - (( norm32=getNorm32(c)) & ccOrQCMask)!=0) { - break; - } - prevCC=0; - } - - - // check one above-minimum, relevant code unit - if(isNorm32LeadSurrogate(norm32)) { - // c is a lead surrogate, get the real norm32 - if(srcStart!=srcLimit&& UTF16.isTrailSurrogate(c2=src[srcStart])) { - ++srcStart; - 
norm32=getNorm32FromSurrogatePair(norm32,c2); + * Writable buffer that takes care of canonical ordering. + * Its Appendable methods behave like the C++ implementation's + * appendZeroCC() methods. + * <p> + * If dest is a StringBuilder, then the buffer writes directly to it. + * Otherwise, the buffer maintains a StringBuilder for intermediate text segments + * until no further changes are necessary and whole segments are appended. + * append() methods that take combining-class values always write to the StringBuilder. + * Other append() methods flush and append to the Appendable. + */ + public static final class ReorderingBuffer implements Appendable { + public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { + impl=ni; + app=dest; + if (app instanceof StringBuilder) { + appIsStringBuilder=true; + str=(StringBuilder)dest; + // In Java, the constructor subsumes public void init(int destCapacity) + str.ensureCapacity(destCapacity); + reorderStart=0; + if(str.length()==0) { + lastCC=0; } else { - norm32=0; - c2=0; - } - }else{ - c2=0; - } - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - norm32=0; - } - - // check the combining order - cc=(char)((norm32>>CC_SHIFT)&0xFF); - if(cc!=0 && cc<prevCC) { - return NormalizerBase.NO; - } - prevCC=cc; - - // check for "no" or "maybe" quick check flags - qcNorm32 = norm32 & qcMask; - if((qcNorm32& QC_ANY_NO)>=1) { - result= NormalizerBase.NO; - break; - } else if(qcNorm32!=0) { - // "maybe" can only occur for NFC and NFKC - if(allowMaybe){ - result=NormalizerBase.MAYBE; - }else{ - // normalize a section around here to see if it is really - // normalized or not - int prevStarter; - int/*unsigned*/ decompQCMask; - - decompQCMask=(qcMask<<2)&0xf; // decomposition quick check mask - - // find the previous starter - - // set prevStarter to the beginning of the current character - prevStarter=srcStart-1; - if(UTF16.isTrailSurrogate(src[prevStarter])) { - // safe because unpaired surrogates do not 
result - // in "maybe" - --prevStarter; - } - prevStarter=findPreviousStarter(src, start, prevStarter, - ccOrQCMask, decompQCMask, - (char)minNoMaybe); - - // find the next true starter in [src..limit[ - modifies - // src to point to the next starter - srcStart=findNextStarter(src,srcStart, srcLimit, qcMask, - decompQCMask,(char) minNoMaybe); - - //set the args for compose part - args.prevCC = prevCC; - - // decompose and recompose [prevStarter..src[ - buffer = composePart(args,prevStarter,src,srcStart,srcLimit,options,nx); - - // compare the normalized version with the original - if(0!=strCompare(buffer,0,args.length,src,prevStarter,srcStart, false)) { - result=NormalizerBase.NO; // normalization differs - break; + setIterator(); + lastCC=previousCC(); + // Set reorderStart after the last code point with cc<=1 if there is one. + if(lastCC>1) { + while(previousCC()>1) {} } - - // continue after the next starter + reorderStart=codePointLimit; } - } - } - return result; - } - - - //------------------------------------------------------ - // make NFD & NFKD - //------------------------------------------------------ - - public static int decompose(char[] src,int srcStart,int srcLimit, - char[] dest,int destStart,int destLimit, - boolean compat,int[] outTrailCC, - UnicodeSet nx) { - - char[] buffer = new char[3]; - int prevSrc; - long norm32; - int ccOrQCMask, qcMask; - int reorderStartIndex, length; - char c, c2, minNoMaybe; - int/*unsigned byte*/ cc, prevCC, trailCC; - char[] p; - int pStart; - int destIndex = destStart; - int srcIndex = srcStart; - if(!compat) { - minNoMaybe=(char)indexes[INDEX_MIN_NFD_NO_MAYBE]; - qcMask=QC_NFD; - } else { - minNoMaybe=(char)indexes[INDEX_MIN_NFKD_NO_MAYBE]; - qcMask=QC_NFKD; + } else { + appIsStringBuilder=false; + str=new StringBuilder(); + reorderStart=0; + lastCC=0; + } } - /* initialize */ - ccOrQCMask=CC_MASK|qcMask; - reorderStartIndex=0; - prevCC=0; - norm32=0; - c=0; - pStart=0; + public boolean isEmpty() { return 
str.length()==0; } + public int length() { return str.length(); } + public int getLastCC() { return lastCC; } - cc=trailCC=-1;//initialize to bogus value + public StringBuilder getStringBuilder() { return str; } - for(;;) { - /* count code units below the minimum or with irrelevant data for - * the quick check - */ - prevSrc=srcIndex; + public boolean equals(CharSequence s, int start, int limit) { + return UTF16Plus.equal(str, 0, str.length(), s, start, limit); + } - while(srcIndex!=srcLimit &&((c=src[srcIndex])<minNoMaybe || - ((norm32=getNorm32(c))&ccOrQCMask)==0)){ - prevCC=0; - ++srcIndex; - } + // For Hangul composition, replacing the Leading consonant Jamo with the syllable. + public void setLastChar(char c) { + str.setCharAt(str.length()-1, c); + } - /* copy these code units all at once */ - if(srcIndex!=prevSrc) { - length=srcIndex-prevSrc; - if((destIndex+length)<=destLimit) { - System.arraycopy(src,prevSrc,dest,destIndex,length); + public void append(int c, int cc) { + if(lastCC<=cc || cc==0) { + str.appendCodePoint(c); + lastCC=cc; + if(cc<=1) { + reorderStart=str.length(); } - - destIndex+=length; - reorderStartIndex=destIndex; + } else { + insert(c, cc); } + } - /* end of source reached? */ - if(srcIndex==srcLimit) { - break; + // s must be in NFD, otherwise change the implementation. 
+ public void append(CharSequence s, int start, int limit, + int leadCC, int trailCC) { + if(start==limit) { + return; } - - /* c already contains *src and norm32 is set for it, increment src*/ - ++srcIndex; - - /* check one above-minimum, relevant code unit */ - /* - * generally, set p and length to the decomposition string - * in simple cases, p==NULL and (c, c2) will hold the length code - * units to append in all cases, set cc to the lead and trailCC to - * the trail combining class - * - * the following merge-sort of the current character into the - * preceding, canonically ordered result text will use the - * optimized insertOrdered() - * if there is only one single code point to process; - * this is indicated with p==NULL, and (c, c2) is the character to - * insert - * ((c, 0) for a BMP character and (lead surrogate, trail surrogate) - * for a supplementary character) - * otherwise, p[length] is merged in with _mergeOrdered() - */ - if(isNorm32HangulOrJamo(norm32)) { - if(nx_contains(nx, c)) { - c2=0; - p=null; - length=1; - } else { - // Hangul syllable: decompose algorithmically - p=buffer; - pStart=0; - cc=trailCC=0; - - c-=HANGUL_BASE; - - c2=(char)(c%JAMO_T_COUNT); - c/=JAMO_T_COUNT; - if(c2>0) { - buffer[2]=(char)(JAMO_T_BASE+c2); - length=3; - } else { - length=2; - } - - buffer[1]=(char)(JAMO_V_BASE+c%JAMO_V_COUNT); - buffer[0]=(char)(JAMO_L_BASE+c/JAMO_V_COUNT); + if(lastCC<=leadCC || leadCC==0) { + if(trailCC<=1) { + reorderStart=str.length()+(limit-start); + } else if(leadCC<=1) { + reorderStart=str.length()+1; // Ok if not a code point boundary. 
} + str.append(s, start, limit); + lastCC=trailCC; } else { - if(isNorm32Regular(norm32)) { - c2=0; - length=1; - } else { - // c is a lead surrogate, get the real norm32 - if(srcIndex!=srcLimit && - UTF16.isTrailSurrogate(c2=src[srcIndex])) { - ++srcIndex; - length=2; - norm32=getNorm32FromSurrogatePair(norm32, c2); + int c=Character.codePointAt(s, start); + start+=Character.charCount(c); + insert(c, leadCC); // insert first code point + while(start<limit) { + c=Character.codePointAt(s, start); + start+=Character.charCount(c); + if(start<limit) { + // s must be in NFD, otherwise we need to use getCC(). + leadCC=getCCFromYesOrMaybe(impl.getNorm16(c)); } else { - c2=0; - length=1; - norm32=0; + leadCC=trailCC; } + append(c, leadCC); } + } + } - /* get the decomposition and the lead and trail cc's */ - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - cc=trailCC=0; - p=null; - } else if((norm32&qcMask)==0) { - /* c does not decompose */ - cc=trailCC=(int)((UNSIGNED_BYTE_MASK) & (norm32>>CC_SHIFT)); - p=null; - pStart=-1; - } else { - DecomposeArgs arg = new DecomposeArgs(); - /* c decomposes, get everything from the variable-length - * extra data - */ - pStart=decompose(norm32, qcMask, arg); - p=extraData; - length=arg.length; - cc=arg.cc; - trailCC=arg.trailCC; - if(length==1) { - /* fastpath a single code unit from decomposition */ - c=p[pStart]; - c2=0; - p=null; - pStart=-1; - } - } + // The following append() methods work like C++ appendZeroCC(). + // They assume that the cc or trailCC of their input is 0. + // Most of them implement Appendable interface methods. 
+ // @Override when we switch to Java 6 + public ReorderingBuffer append(char c) { + str.append(c); + lastCC=0; + reorderStart=str.length(); + return this; + } + + public void appendZeroCC(int c) { + str.appendCodePoint(c); + lastCC=0; + reorderStart=str.length(); + } + + // @Override when we switch to Java 6 + public ReorderingBuffer append(CharSequence s) { + if(s.length()!=0) { + str.append(s); + lastCC=0; + reorderStart=str.length(); } + return this; + } - /* append the decomposition to the destination buffer, assume - * length>0 - */ - if((destIndex+length)<=destLimit) { - int reorderSplit=destIndex; - if(p==null) { - /* fastpath: single code point */ - if(cc!=0 && cc<prevCC) { - /* (c, c2) is out of order with respect to the preceding - * text - */ - destIndex+=length; - trailCC=insertOrdered(dest,reorderStartIndex, - reorderSplit, destIndex, c, c2, cc); - } else { - /* just append (c, c2) */ - dest[destIndex++]=c; - if(c2!=0) { - dest[destIndex++]=c2; - } - } - } else { - /* general: multiple code points (ordered by themselves) - * from decomposition - */ - if(cc!=0 && cc<prevCC) { - /* the decomposition is out of order with respect to the - * preceding text - */ - destIndex+=length; - trailCC=mergeOrdered(dest,reorderStartIndex, - reorderSplit,p, pStart,pStart+length); - } else { - /* just append the decomposition */ - do { - dest[destIndex++]=p[pStart++]; - } while(--length>0); - } - } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; + // @Override when we switch to Java 6 + public ReorderingBuffer append(CharSequence s, int start, int limit) { + if(start!=limit) { + str.append(s, start, limit); + lastCC=0; + reorderStart=str.length(); } + return this; + } - prevCC=trailCC; - if(prevCC==0) { - reorderStartIndex=destIndex; + /** + * Flushes from the intermediate StringBuilder to the Appendable, + * if they are different objects. + * Used after recomposition. 
+ * Must be called at the end when writing to a non-StringBuilder Appendable. + */ + public void flush() { + if(appIsStringBuilder) { + reorderStart=str.length(); + } else { + try { + app.append(str); + str.setLength(0); + reorderStart=0; + } catch(IOException e) { + throw new InternalError(e); // Avoid declaring "throws IOException". + } + } + lastCC=0; + } + + /** + * Flushes from the intermediate StringBuilder to the Appendable, + * if they are different objects. + * Then appends the new text to the Appendable or StringBuilder. + * Normally used after quick check loops find a non-empty sequence. + */ + public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { + if(appIsStringBuilder) { + str.append(s, start, limit); + reorderStart=str.length(); + } else { + try { + app.append(str).append(s, start, limit); + str.setLength(0); + reorderStart=0; + } catch(IOException e) { + throw new InternalError(e); // Avoid declaring "throws IOException". + } + } + lastCC=0; + return this; + } + + public void remove() { + str.setLength(0); + lastCC=0; + reorderStart=0; + } + + public void removeSuffix(int suffixLength) { + int oldLength=str.length(); + str.delete(oldLength-suffixLength, oldLength); + lastCC=0; + reorderStart=str.length(); + } + + // Inserts c somewhere before the last character. + // Requires 0<cc<lastCC which implies reorderStart<limit. 
+ private void insert(int c, int cc) { + for(setIterator(), skipPrevious(); previousCC()>cc;) {} + // insert c at codePointLimit, after the character with prevCC<=cc + if(c<=0xffff) { + str.insert(codePointLimit, (char)c); + if(cc<=1) { + reorderStart=codePointLimit+1; + } + } else { + str.insert(codePointLimit, Character.toChars(c)); + if(cc<=1) { + reorderStart=codePointLimit+2; + } } } - outTrailCC[0]=prevCC; + private final NormalizerImpl impl; + private final Appendable app; + private final StringBuilder str; + private final boolean appIsStringBuilder; + private int reorderStart; + private int lastCC; - return destIndex - destStart; - } + // private backward iterator + private void setIterator() { codePointStart=str.length(); } + private void skipPrevious() { // Requires 0<codePointStart. + codePointLimit=codePointStart; + codePointStart=str.offsetByCodePoints(codePointStart, -1); + } + private int previousCC() { // Returns 0 if there is no previous character. + codePointLimit=codePointStart; + if(reorderStart>=codePointStart) { + return 0; + } + int c=str.codePointBefore(codePointStart); + codePointStart-=Character.charCount(c); + if(c<MIN_CCC_LCCC_CP) { + return 0; + } + return getCCFromYesOrMaybe(impl.getNorm16(c)); + } - /* make NFC & NFKC ------------------------------------------------------ */ - private static final class NextCombiningArgs{ - char[] source; - int start; - //int limit; - char c; - char c2; - int/*unsigned*/ combiningIndex; - char /*unsigned byte*/ cc; + private int codePointStart, codePointLimit; } - /* get the composition properties of the next character */ - private static int /*unsigned*/ getNextCombining(NextCombiningArgs args, - int limit, - UnicodeSet nx) { - long/*unsigned*/ norm32; - int combineFlags; - /* get properties */ - args.c=args.source[args.start++]; - norm32=getNorm32(args.c); - - /* preset output values for most characters */ - args.c2=0; - args.combiningIndex=0; - args.cc=0; + // TODO: Propose as public API on the 
UTF16 class. + // TODO: Propose widening UTF16 methods that take char to take int. + // TODO: Propose widening UTF16 methods that take String to take CharSequence. + public static final class UTF16Plus { + /** + * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), + * is it a lead surrogate? + * @param c code unit or code point + * @return true or false + */ + public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } - if((norm32&(CC_MASK|COMBINES_ANY))==0) { - return 0; - } else { - if(isNorm32Regular(norm32)) { - /* set cc etc. below */ - } else if(isNorm32HangulOrJamo(norm32)) { - /* a compatibility decomposition contained Jamos */ - args.combiningIndex=(int)((UNSIGNED_INT_MASK)&(0xfff0| - (norm32>>EXTRA_SHIFT))); - return (int)(norm32&COMBINES_ANY); - } else { - /* c is a lead surrogate, get the real norm32 */ - if(args.start!=limit && UTF16.isTrailSurrogate(args.c2= - args.source[args.start])) { - ++args.start; - norm32=getNorm32FromSurrogatePair(norm32, args.c2); - } else { - args.c2=0; - return 0; - } + /** + * Compares two CharSequence subsequences for binary equality. 
+ * @param s1 first sequence + * @param start1 start offset in first sequence + * @param limit1 limit offset in first sequence + * @param s2 second sequence + * @param start2 start offset in second sequence + * @param limit2 limit offset in second sequence + * @return true if s1.subSequence(start1, limit1) contains the same text + * as s2.subSequence(start2, limit2) + */ + public static boolean equal(CharSequence s1, int start1, int limit1, + CharSequence s2, int start2, int limit2) { + if((limit1-start1)!=(limit2-start2)) { + return false; } - - if(nx_contains(nx, args.c, args.c2)) { - return 0; /* excluded: norm32==0 */ + if(s1==s2 && start1==start2) { + return true; + } + while(start1<limit1) { + if(s1.charAt(start1++)!=s2.charAt(start2++)) { + return false; + } } + return true; + } + } - args.cc= (char)((norm32>>CC_SHIFT)&0xff); + public NormalizerImpl() {} - combineFlags=(int)(norm32&COMBINES_ANY); - if(combineFlags!=0) { - int index = getExtraDataIndex(norm32); - args.combiningIndex=index>0 ? 
extraData[(index-1)] :0; + private static final class IsAcceptable implements ICUBinary.Authenticate { + // @Override when we switch to Java 6 + public boolean isDataVersionAcceptable(byte version[]) { + return version[0]==2; + } + } + + private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); + private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" + + public NormalizerImpl load(ByteBuffer bytes) { + try { + dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); + int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 + if(indexesLength<=IX_MIN_MAYBE_YES) { + throw new IOException("Normalizer2 data: not enough indexes"); + } + int[] inIndexes=new int[indexesLength]; + inIndexes[0]=indexesLength*4; + for(int i=1; i<indexesLength; ++i) { + inIndexes[i]=bytes.getInt(); + } + + minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; + minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; + + minYesNo=inIndexes[IX_MIN_YES_NO]; + minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; + minNoNo=inIndexes[IX_MIN_NO_NO]; + limitNoNo=inIndexes[IX_LIMIT_NO_NO]; + minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; + + // Read the normTrie. + int offset=inIndexes[IX_NORM_TRIE_OFFSET]; + int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; + normTrie=Trie2_16.createFromSerialized(bytes); + int trieLength=normTrie.getSerializedLength(); + if(trieLength>(nextOffset-offset)) { + throw new IOException("Normalizer2 data: not enough bytes for normTrie"); + } + ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes + + // Read the composition and mapping data. 
+ offset=nextOffset; + nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; + int numChars=(nextOffset-offset)/2; + char[] chars; + if(numChars!=0) { + chars=new char[numChars]; + for(int i=0; i<numChars; ++i) { + chars[i]=bytes.getChar(); + } + maybeYesCompositions=new String(chars); + extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes); + } + + // smallFCD: new in formatVersion 2 + offset=nextOffset; + smallFCD=new byte[0x100]; + for(int i=0; i<0x100; ++i) { + smallFCD[i]=bytes.get(); + } + + // Build tccc180[]. + // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300. + tccc180=new int[0x180]; + int bits=0; + for(int c=0; c<0x180; bits>>=1) { + if((c&0xff)==0) { + bits=smallFCD[c>>8]; // one byte per 0x100 code points + } + if((bits&1)!=0) { + for(int i=0; i<0x20; ++i, ++c) { + tccc180[c]=getFCD16FromNormData(c)&0xff; + } + } else { + c+=0x20; + } } - return combineFlags; + return this; + } catch(IOException e) { + throw new InternalError(e); } } - /* - * given a composition-result starter (c, c2) - which means its cc==0, - * it combines forward, it has extra data, its norm32!=0, - * it is not a Hangul or Jamo, - * get just its combineFwdIndex - * - * norm32(c) is special if and only if c2!=0 - */ - private static int/*unsigned*/ getCombiningIndexFromStarter(char c,char c2){ - long/*unsigned*/ norm32; + public NormalizerImpl load(String name) { + return load(ICUBinary.getRequiredData(name)); + } - norm32=getNorm32(c); - if(c2!=0) { - norm32=getNorm32FromSurrogatePair(norm32, c2); - } - return extraData[(getExtraDataIndex(norm32)-1)]; + public int getNorm16(int c) { + return normTrie.get(c); } - /* - * Find the recomposition result for - * a forward-combining character - * (specified with a pointer to its part of the combiningTable[]) - * and a backward-combining character - * (specified with its combineBackIndex). - * - * If these two characters combine, then set (value, value2) - * with the code unit(s) of the composition character. 
- * - * Return value: - * 0 do not combine - * 1 combine - * >1 combine, and the composition is a forward-combining starter - * - * See unormimp.h for a description of the composition table format. - */ - private static int/*unsigned*/ combine(char[]table,int tableStart, - int/*unsinged*/ combineBackIndex, - int[] outValues) { - int/*unsigned*/ key; - int value,value2; + public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } - if(outValues.length<2){ - throw new IllegalArgumentException(); + public int getCC(int norm16) { + if(norm16>=MIN_NORMAL_MAYBE_YES) { + return norm16&0xff; } - - /* search in the starter's composition table */ - for(;;) { - key=table[tableStart++]; - if(key>=combineBackIndex) { - break; - } - tableStart+= ((table[tableStart]&0x8000) != 0)? 2 : 1; + if(norm16<minNoNo || limitNoNo<=norm16) { + return 0; } + return getCCFromNoNo(norm16); + } - /* mask off bit 15, the last-entry-in-the-list flag */ - if((key&0x7fff)==combineBackIndex) { - /* found! combine! */ - value=table[tableStart]; - - /* is the composition a starter that combines forward? */ - key=(int)((UNSIGNED_INT_MASK)&((value&0x2000)+1)); + public static int getCCFromYesOrMaybe(int norm16) { + return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0; + } - /* get the composition result code point from the variable-length - * result value - */ - if((value&0x8000) != 0) { - if((value&0x4000) != 0) { - /* surrogate pair composition result */ - value=(int)((UNSIGNED_INT_MASK)&((value&0x3ff)|0xd800)); - value2=table[tableStart+1]; + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + public int getFCD16(int c) { + if(c<0) { + return 0; + } else if(c<0x180) { + return tccc180[c]; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); + } + + /** Returns the FCD data for U+0000<=c<U+0180. 
*/ + public int getFCD16FromBelow180(int c) { return tccc180[c]; } + /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ + public boolean singleLeadMightHaveNonZeroFCD16(int lead) { + // 0<=lead<=0xffff + byte bits=smallFCD[lead>>8]; + if(bits==0) { return false; } + return ((bits>>((lead>>5)&7))&1)!=0; + } + + /** Gets the FCD value from the regular normalization data. */ + public int getFCD16FromNormData(int c) { + // Only loops for 1:1 algorithmic mappings. + for(;;) { + int norm16=getNorm16(c); + if(norm16<=minYesNo) { + // no decomposition or Hangul syllable, all zeros + return 0; + } else if(norm16>=MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16&=0xff; + return norm16|(norm16<<8); + } else if(norm16>=minMaybeYes) { + return 0; + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + if((firstUnit&MAPPING_LENGTH_MASK)==0) { + // A character that is deleted (maps to an empty string) must + // get the worst-case lccc and tccc values because arbitrary + // characters on both sides will become adjacent. + return 0x1ff; } else { - /* BMP composition result U+2000..U+ffff */ - value=table[tableStart+1]; - value2=0; + int fcd16=firstUnit>>8; // tccc + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + fcd16|=extraData.charAt(norm16-1)&0xff00; // lccc + } + return fcd16; } + } + } + } + + /** + * Gets the decomposition for one code point. 
+ * @param c code point + * @return c's decomposition, if it has one; returns null if it does not have a decomposition + */ + public String getDecomposition(int c) { + int decomp=-1; + int norm16; + for(;;) { + if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { + // c does not decompose + } else if(isHangul(norm16)) { + // Hangul syllable: decompose algorithmically + StringBuilder buffer=new StringBuilder(); + Hangul.decompose(c, buffer); + return buffer.toString(); + } else if(isDecompNoAlgorithmic(norm16)) { + decomp=c=mapAlgorithmic(c, norm16); + continue; } else { - /* BMP composition result U+0000..U+1fff */ - value&=0x1fff; - value2=0; - } - outValues[0]=value; - outValues[1]=value2; - return key; - } else { - /* not found */ - return 0; + // c decomposes, get everything from the variable-length extra data + int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK; + return extraData.substring(norm16, norm16+length); + } + if(decomp<0) { + return null; + } else { + return UTF16.valueOf(decomp); + } } } + public static final int MIN_CCC_LCCC_CP=0x300; - private static final class RecomposeArgs{ - char[] source; - int start; - int limit; - } - /* - * recompose the characters in [p..limit[ - * (which is in NFD - decomposed and canonically ordered), - * adjust limit, and return the trailing cc - * - * since for NFKC we may get Jamos in decompositions, we need to - * recompose those too - * - * note that recomposition never lengthens the text: - * any character consists of either one or two code units; - * a composition may contain at most one more code unit than the original - * starter, while the combining mark that is removed has at least one code - * unit - */ - private static char/*unsigned byte*/ recompose(RecomposeArgs args, int options, UnicodeSet nx) { - int remove, q, r; - int /*unsigned*/ combineFlags; - int /*unsigned*/ combineFwdIndex, combineBackIndex; - int /*unsigned*/ result, value=0, value2=0; - int /*unsigned byte*/ prevCC; - boolean 
starterIsSupplementary; - int starter; - int[] outValues = new int[2]; - starter=-1; /* no starter */ - combineFwdIndex=0; /* will not be used until starter!=NULL */ - starterIsSupplementary=false; /* will not be used until starter!=NULL */ - prevCC=0; + public static final int MIN_YES_YES_WITH_CC=0xff01; + public static final int JAMO_VT=0xff00; + public static final int MIN_NORMAL_MAYBE_YES=0xfe00; + public static final int MAX_DELTA=0x40; - NextCombiningArgs ncArg = new NextCombiningArgs(); - ncArg.source = args.source; + // Byte offsets from the start of the data, after the generic header. + public static final int IX_NORM_TRIE_OFFSET=0; + public static final int IX_EXTRA_DATA_OFFSET=1; + public static final int IX_SMALL_FCD_OFFSET=2; - ncArg.cc =0; - ncArg.c2 =0; + // Code point thresholds for quick check codes. + public static final int IX_MIN_DECOMP_NO_CP=8; + public static final int IX_MIN_COMP_NO_MAYBE_CP=9; - for(;;) { - ncArg.start = args.start; - combineFlags=getNextCombining(ncArg,args.limit,nx); - combineBackIndex=ncArg.combiningIndex; - args.start = ncArg.start; - - if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) { - if((combineBackIndex&0x8000)!=0) { - /* c is a Jamo V/T, see if we can compose it with the - * previous character - */ - /* for the PRI #29 fix, check that there is no intervening combining mark */ - if((options&BEFORE_PRI_29)!=0 || prevCC==0) { - remove=-1; /* NULL while no Hangul composition */ - combineFlags=0; - ncArg.c2=args.source[starter]; - if(combineBackIndex==0xfff2) { - /* Jamo V, compose with previous Jamo L and following - * Jamo T - */ - ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE); - if(ncArg.c2<JAMO_L_COUNT) { - remove=args.start-1; - ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+ - (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT); - if(args.start!=args.limit && - (ncArg.c2=(char)(args.source[args.start] - -JAMO_T_BASE))<JAMO_T_COUNT) { - ++args.start; - ncArg.c+=ncArg.c2; - } else { - /* the result is an LV syllable, which is a 
starter (unlike LVT) */ - combineFlags=COMBINES_FWD; - } - if(!nx_contains(nx, ncArg.c)) { - args.source[starter]=ncArg.c; - } else { - /* excluded */ - if(!isHangulWithoutJamoT(ncArg.c)) { - --args.start; /* undo the ++args.start from reading the Jamo T */ - } - /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */ - remove=args.start; - } - } + // Norm16 value thresholds for quick check combinations and types of extra data. + // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. + public static final int IX_MIN_YES_NO=10; + public static final int IX_MIN_NO_NO=11; + public static final int IX_LIMIT_NO_NO=12; + public static final int IX_MIN_MAYBE_YES=13; - /* - * Normally, the following can not occur: - * Since the input is in NFD, there are no Hangul LV syllables that - * a Jamo T could combine with. - * All Jamo Ts are combined above when handling Jamo Vs. - * - * However, before the PRI #29 fix, this can occur due to - * an intervening combining mark between the Hangul LV and the Jamo T. - */ - } else { - /* Jamo T, compose with previous Hangul that does not have a Jamo T */ - if(isHangulWithoutJamoT(ncArg.c2)) { - ncArg.c2+=ncArg.c-JAMO_T_BASE; - if(!nx_contains(nx, ncArg.c2)) { - remove=args.start-1; - args.source[starter]=ncArg.c2; - } - } - } + // Mappings only in [minYesNoMappingsOnly..minNoNo[. 
+ public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; - if(remove!=-1) { - /* remove the Jamo(s) */ - q=remove; - r=args.start; - while(r<args.limit) { - args.source[q++]=args.source[r++]; - } - args.start=remove; - args.limit=q; - } + public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; + public static final int MAPPING_LENGTH_MASK=0x1f; - ncArg.c2=0; /* c2 held *starter temporarily */ + public static final int COMP_1_LAST_TUPLE=0x8000; + public static final int COMP_1_TRIPLE=1; + public static final int COMP_1_TRAIL_LIMIT=0x3400; + public static final int COMP_1_TRAIL_MASK=0x7ffe; + public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit + public static final int COMP_2_TRAIL_SHIFT=6; + public static final int COMP_2_TRAIL_MASK=0xffc0; - if(combineFlags!=0) { - /* - * not starter=NULL because the composition is a Hangul LV syllable - * and might combine once more (but only before the PRI #29 fix) - */ - - /* done? */ - if(args.start==args.limit) { - return (char)prevCC; - } + // higher-level functionality ------------------------------------------ *** - /* the composition is a Hangul LV syllable which is a starter that combines forward */ - combineFwdIndex=0xfff0; + /** + * Decomposes s[src, limit[ and writes the result to dest. + * limit can be NULL if src is NUL-terminated. + * destLengthEstimate is the initial dest buffer capacity and can be -1. 
+ */ + public void decompose(CharSequence s, int src, int limit, StringBuilder dest, + int destLengthEstimate) { + if(destLengthEstimate<0) { + destLengthEstimate=limit-src; + } + dest.setLength(0); + ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); + decompose(s, src, limit, buffer); + } + + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int decompose(CharSequence s, int src, int limit, + ReorderingBuffer buffer) { + int minNoCP=minDecompNoCP; - /* we combined; continue with looking for compositions */ - continue; - } - } + int prevSrc; + int c=0; + int norm16=0; - /* - * now: cc==0 and the combining index does not include - * "forward" -> the rest of the loop body will reset starter - * to NULL; technically, a composed Hangul syllable is a - * starter, but it does not combine forward now that we have - * consumed all eligible Jamos; for Jamo V/T, combineFlags - * does not contain _NORM_COMBINES_FWD - */ + // only for quick check + int prevBoundary=src; + int prevCC=0; - } else if( - /* the starter is not a Hangul LV or Jamo V/T and */ - !((combineFwdIndex&0x8000)!=0) && - /* the combining mark is not blocked and */ - ((options&BEFORE_PRI_29)!=0 ? - (prevCC!=ncArg.cc || prevCC==0) : - (prevCC<ncArg.cc || prevCC==0)) && - /* the starter and the combining mark (c, c2) do combine */ - 0!=(result=combine(combiningTable,combineFwdIndex, - combineBackIndex, outValues)) && - /* the composition result is not excluded */ - !nx_contains(nx, (char)value, (char)value2) + for(;;) { + // count code units below the minimum or with irrelevant data for the quick check + for(prevSrc=src; src!=limit;) { + if( (c=s.charAt(src))<minNoCP || + isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) ) { - value=outValues[0]; - value2=outValues[1]; - /* replace the starter with the composition, remove the - * combining mark - */ - remove= ncArg.c2==0 ? 
args.start-1 : args.start-2; /* index to the combining mark */ - - /* replace the starter with the composition */ - args.source[starter]=(char)value; - if(starterIsSupplementary) { - if(value2!=0) { - /* both are supplementary */ - args.source[starter+1]=(char)value2; - } else { - /* the composition is shorter than the starter, - * move the intermediate characters forward one */ - starterIsSupplementary=false; - q=starter+1; - r=q+1; - while(r<remove) { - args.source[q++]=args.source[r++]; - } - --remove; + ++src; + } else if(!UTF16.isSurrogate((char)c)) { + break; + } else { + char c2; + if(UTF16Plus.isSurrogateLead(c)) { + if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { + c=Character.toCodePoint((char)c, c2); } - } else if(value2!=0) { // for U+1109A, U+1109C, and U+110AB - starterIsSupplementary=true; - args.source[starter+1]=(char)value2; - /* } else { both are on the BMP, nothing more to do */ - } - - /* remove the combining mark by moving the following text - * over it */ - if(remove<args.start) { - q=remove; - r=args.start; - while(r<args.limit) { - args.source[q++]=args.source[r++]; + } else /* trail surrogate */ { + if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { + --src; + c=Character.toCodePoint(c2, (char)c); } - args.start=remove; - args.limit=q; } - - /* keep prevCC because we removed the combining mark */ - - /* done? */ - if(args.start==args.limit) { - return (char)prevCC; - } - - /* is the composition a starter that combines forward? 
*/ - if(result>1) { - combineFwdIndex=getCombiningIndexFromStarter((char)value, - (char)value2); + if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) { + src+=Character.charCount(c); } else { - starter=-1; + break; } - - /* we combined; continue with looking for compositions */ - continue; } } - - /* no combination this time */ - prevCC=ncArg.cc; - if(args.start==args.limit) { - return (char)prevCC; - } - - /* if (c, c2) did not combine, then check if it is a starter */ - if(ncArg.cc==0) { - /* found a new starter; combineFlags==0 if (c, c2) is excluded */ - if((combineFlags&COMBINES_FWD)!=0) { - /* it may combine with something, prepare for it */ - if(ncArg.c2==0) { - starterIsSupplementary=false; - starter=args.start-1; - } else { - starterIsSupplementary=false; - starter=args.start-2; - } - combineFwdIndex=combineBackIndex; + // copy these code units all at once + if(src!=prevSrc) { + if(buffer!=null) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); } else { - /* it will not combine with anything */ - starter=-1; + prevCC=0; + prevBoundary=src; } - } else if((options&OPTIONS_COMPOSE_CONTIGUOUS)!=0) { - /* FCC: no discontiguous compositions; any intervening character blocks */ - starter=-1; - } - } - } - - // find the last true starter between src[start]....src[current] going - // backwards and return its index - private static int findPreviousStarter(char[]src, int srcStart, int current, - int/*unsigned*/ ccOrQCMask, - int/*unsigned*/ decompQCMask, - char minNoMaybe) { - long norm32; - PrevArgs args = new PrevArgs(); - args.src = src; - args.start = srcStart; - args.current = current; - - while(args.start<args.current) { - norm32= getPrevNorm32(args, minNoMaybe, ccOrQCMask|decompQCMask); - if(isTrueStarter(norm32, ccOrQCMask, decompQCMask)) { - break; - } - } - return args.current; - } - - /* find the first true starter in [src..limit[ and return the - * pointer to it - */ - private static int/*index*/ findNextStarter(char[] src,int start,int limit, - 
int/*unsigned*/ qcMask, - int/*unsigned*/ decompQCMask, - char minNoMaybe) { - int p; - long/*unsigned*/ norm32; - int ccOrQCMask; - char c, c2; - - ccOrQCMask=CC_MASK|qcMask; - - DecomposeArgs decompArgs = new DecomposeArgs(); - - for(;;) { - if(start==limit) { - break; /* end of string */ - } - c=src[start]; - if(c<minNoMaybe) { - break; /* catches NUL terminater, too */ } - - norm32=getNorm32(c); - if((norm32&ccOrQCMask)==0) { - break; /* true starter */ + if(src==limit) { + break; } - if(isNorm32LeadSurrogate(norm32)) { - /* c is a lead surrogate, get the real norm32 */ - if((start+1)==limit || - !UTF16.isTrailSurrogate(c2=(src[start+1]))){ - /* unmatched first surrogate: counts as a true starter */ - break; - } - norm32=getNorm32FromSurrogatePair(norm32, c2); - - if((norm32&ccOrQCMask)==0) { - break; /* true starter */ - } + // Check one above-minimum, relevant code point. + src+=Character.charCount(c); + if(buffer!=null) { + decompose(c, norm16, buffer); } else { - c2=0; - } - - /* (c, c2) is not a true starter but its decomposition may be */ - if((norm32&decompQCMask)!=0) { - /* (c, c2) decomposes, get everything from the variable-length - * extra data */ - p=decompose(norm32, decompQCMask, decompArgs); - - /* get the first character's norm32 to check if it is a true - * starter */ - if(decompArgs.cc==0 && (getNorm32(extraData,p, qcMask)&qcMask)==0) { - break; /* true starter */ + if(isDecompYes(norm16)) { + int cc=getCCFromYesOrMaybe(norm16); + if(prevCC<=cc || cc==0) { + prevCC=cc; + if(cc<=1) { + prevBoundary=src; + } + continue; + } } + return prevBoundary; // "no" or cc out of order } - - start+= c2==0 ? 
1 : 2; /* not a true starter, continue */ } - - return start; - } - - - private static final class ComposePartArgs{ - int prevCC; - int length; /* length of decomposed part */ + return src; } - /* decompose and recompose [prevStarter..src[ */ - private static char[] composePart(ComposePartArgs args, - int prevStarter, - char[] src, int start, int limit, - int options, - UnicodeSet nx) { - int recomposeLimit; - boolean compat =((options&OPTIONS_COMPAT)!=0); - - /* decompose [prevStarter..src[ */ - int[] outTrailCC = new int[1]; - char[] buffer = new char[(limit-prevStarter)*MAX_BUFFER_SIZE]; - - for(;;){ - args.length=decompose(src,prevStarter,(start), - buffer,0,buffer.length, - compat,outTrailCC,nx); - if(args.length<=buffer.length){ - break; - }else{ - buffer = new char[args.length]; - } + public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { + int limit=s.length(); + if(limit==0) { + return; } - - /* recompose the decomposition */ - recomposeLimit=args.length; - - if(args.length>=2) { - RecomposeArgs rcArgs = new RecomposeArgs(); - rcArgs.source = buffer; - rcArgs.start = 0; - rcArgs.limit = recomposeLimit; - args.prevCC=recompose(rcArgs, options, nx); - recomposeLimit = rcArgs.limit; - } - - /* return with a pointer to the recomposition and its length */ - args.length=recomposeLimit; - return buffer; - } - - private static boolean composeHangul(char prev, char c, - long/*unsigned*/ norm32, - char[] src,int[] srcIndex, int limit, - boolean compat, - char[] dest,int destIndex, - UnicodeSet nx) { - int start=srcIndex[0]; - if(isJamoVTNorm32JamoV(norm32)) { - /* c is a Jamo V, compose with previous Jamo L and - * following Jamo T */ - prev=(char)(prev-JAMO_L_BASE); - if(prev<JAMO_L_COUNT) { - c=(char)(HANGUL_BASE+(prev*JAMO_V_COUNT+ - (c-JAMO_V_BASE))*JAMO_T_COUNT); - - /* check if the next character is a Jamo T (normal or - * compatibility) */ - if(start!=limit) { - char next, t; - - next=src[start]; - 
if((t=(char)(next-JAMO_T_BASE))<JAMO_T_COUNT) { - /* normal Jamo T */ - ++start; - c+=t; - } else if(compat) { - /* if NFKC, then check for compatibility Jamo T - * (BMP only) */ - norm32=getNorm32(next); - if(isNorm32Regular(norm32) && ((norm32&QC_NFKD)!=0)) { - int p /*index into extra data array*/; - DecomposeArgs dcArgs = new DecomposeArgs(); - p=decompose(norm32, QC_NFKD, dcArgs); - if(dcArgs.length==1 && - (t=(char)(extraData[p]-JAMO_T_BASE)) - <JAMO_T_COUNT) { - /* compatibility Jamo T */ - ++start; - c+=t; - } - } - } - } - if(nx_contains(nx, c)) { - if(!isHangulWithoutJamoT(c)) { - --start; /* undo ++start from reading the Jamo T */ - } - return false; - } - dest[destIndex]=c; - srcIndex[0]=start; - return true; - } - } else if(isHangulWithoutJamoT(prev)) { - /* c is a Jamo T, compose with previous Hangul LV that does not - * contain a Jamo T */ - c=(char)(prev+(c-JAMO_T_BASE)); - if(nx_contains(nx, c)) { - return false; - } - dest[destIndex]=c; - srcIndex[0]=start; - return true; + if(doDecompose) { + decompose(s, 0, limit, buffer); + return; } - return false; - } - /* - public static int compose(char[] src, char[] dest,boolean compat, UnicodeSet nx){ - return compose(src,0,src.length,dest,0,dest.length,compat, nx); + // Just merge the strings at the boundary. 
+ int c=Character.codePointAt(s, 0); + int src=0; + int firstCC, prevCC, cc; + firstCC=prevCC=cc=getCC(getNorm16(c)); + while(cc!=0) { + prevCC=cc; + src+=Character.charCount(c); + if(src>=limit) { + break; + } + c=Character.codePointAt(s, src); + cc=getCC(getNorm16(c)); + }; + buffer.append(s, 0, src, firstCC, prevCC); + buffer.append(s, src, limit); } - */ - public static int compose(char[] src, int srcStart, int srcLimit, - char[] dest,int destStart,int destLimit, - int options,UnicodeSet nx) { - - int prevSrc, prevStarter; - long/*unsigned*/ norm32; - int ccOrQCMask, qcMask; - int reorderStartIndex, length; - char c, c2, minNoMaybe; - int/*unsigned byte*/ cc, prevCC; - int[] ioIndex = new int[1]; - int destIndex = destStart; - int srcIndex = srcStart; - - if((options&OPTIONS_COMPAT)!=0) { - minNoMaybe=(char)indexes[INDEX_MIN_NFKC_NO_MAYBE]; - qcMask=QC_NFKC; - } else { - minNoMaybe=(char)indexes[INDEX_MIN_NFC_NO_MAYBE]; - qcMask=QC_NFC; - } + // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. + // doCompose: normalize + // !doCompose: isNormalized (buffer must be empty and initialized) + public boolean compose(CharSequence s, int src, int limit, + boolean onlyContiguous, + boolean doCompose, + ReorderingBuffer buffer) { + int minNoMaybeCP=minCompNoMaybeCP; /* - * prevStarter points to the last character before the current one - * that is a "true" starter with cc==0 and quick check "yes". - * - * prevStarter will be used instead of looking for a true starter - * while incrementally decomposing [prevStarter..prevSrc[ - * in _composePart(). Having a good prevStarter allows to just decompose - * the entire [prevStarter..prevSrc[. - * - * When _composePart() backs out from prevSrc back to prevStarter, - * then it also backs out destIndex by the same amount. - * Therefore, at all times, the (prevSrc-prevStarter) source units - * must correspond 1:1 to destination units counted with destIndex, - * except for reordering. 
- * This is true for the qc "yes" characters copied in the fast loop, - * and for pure reordering. - * prevStarter must be set forward to src when this is not true: - * In _composePart() and after composing a Hangul syllable. + * prevBoundary points to the last character before the current one + * that has a composition boundary before it with ccc==0 and quick check "yes". + * Keeping track of prevBoundary saves us looking for a composition boundary + * when we find a "no" or "maybe". * - * This mechanism relies on the assumption that the decomposition of a - * true starter also begins with a true starter. gennorm/store.c checks - * for this. + * When we back out from prevSrc back to prevBoundary, + * then we also remove those same characters (which had been simply copied + * or canonically-order-inserted) from the ReorderingBuffer. + * Therefore, at all times, the [prevBoundary..prevSrc[ source units + * must correspond 1:1 to destination units at the end of the destination buffer. */ - prevStarter=srcIndex; - - ccOrQCMask=CC_MASK|qcMask; - /*destIndex=*/reorderStartIndex=0;/* ####TODO#### check this **/ - prevCC=0; + int prevBoundary=src; + int prevSrc; + int c=0; + int norm16=0; - /* avoid compiler warnings */ - norm32=0; - c=0; + // only for isNormalized + int prevCC=0; for(;;) { - /* count code units below the minimum or with irrelevant data for - * the quick check */ - prevSrc=srcIndex; - - while(srcIndex!=srcLimit && ((c=src[srcIndex])<minNoMaybe || - ((norm32=getNorm32(c))&ccOrQCMask)==0)) { - prevCC=0; - ++srcIndex; + // count code units below the minimum or with irrelevant data for the quick check + for(prevSrc=src; src!=limit;) { + if( (c=s.charAt(src))<minNoMaybeCP || + isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) + ) { + ++src; + } else if(!UTF16.isSurrogate((char)c)) { + break; + } else { + char c2; + if(UTF16Plus.isSurrogateLead(c)) { + if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { + 
c=Character.toCodePoint((char)c, c2); + } + } else /* trail surrogate */ { + if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { + --src; + c=Character.toCodePoint(c2, (char)c); + } + } + if(isCompYesAndZeroCC(norm16=getNorm16(c))) { + src+=Character.charCount(c); + } else { + break; + } + } } - - - /* copy these code units all at once */ - if(srcIndex!=prevSrc) { - length=srcIndex-prevSrc; - if((destIndex+length)<=destLimit) { - System.arraycopy(src,prevSrc,dest,destIndex,length); + // copy these code units all at once + if(src!=prevSrc) { + if(src==limit) { + if(doCompose) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); + } + break; } - destIndex+=length; - reorderStartIndex=destIndex; - - /* set prevStarter to the last character in the quick check - * loop */ - prevStarter=srcIndex-1; - if(UTF16.isTrailSurrogate(src[prevStarter]) && - prevSrc<prevStarter && - UTF16.isLeadSurrogate(src[(prevStarter-1)])) { - --prevStarter; + // Set prevBoundary to the last character in the quick check loop. + prevBoundary=src-1; + if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && + Character.isHighSurrogate(s.charAt(prevBoundary-1)) + ) { + --prevBoundary; } - - prevSrc=srcIndex; - } - - /* end of source reached? */ - if(srcIndex==srcLimit) { + if(doCompose) { + // The last "quick check yes" character is excluded from the + // flush-and-append call in case it needs to be modified. + buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); + buffer.append(s, prevBoundary, src); + } else { + prevCC=0; + } + // The start of the current character (c). + prevSrc=src; + } else if(src==limit) { break; } - /* c already contains *src and norm32 is set for it, increment src*/ - ++srcIndex; - + src+=Character.charCount(c); + /* + * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) + * or has ccc!=0. + * Check for Jamo V/T, then for regular characters. 
+ * c is not a Hangul syllable or Jamo L because those have "yes" properties. + */ + if(isJamoVT(norm16) && prevBoundary!=prevSrc) { + char prev=s.charAt(prevSrc-1); + boolean needToDecompose=false; + if(c<Hangul.JAMO_T_BASE) { + // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. + prev-=Hangul.JAMO_L_BASE; + if(prev<Hangul.JAMO_L_COUNT) { + if(!doCompose) { + return false; + } + char syllable=(char) + (Hangul.HANGUL_BASE+ + (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* + Hangul.JAMO_T_COUNT); + char t; + if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { + ++src; + syllable+=t; // The next character was a Jamo T. + prevBoundary=src; + buffer.setLastChar(syllable); + continue; + } + // If we see L+V+x where x!=T then we drop to the slow path, + // decompose and recompose. + // This is to deal with NFKC finding normal L and V but a + // compatibility variant of a T. We need to either fully compose that + // combination here (which would complicate the code and may not work + // with strange custom data) or use the slow path -- or else our replacing + // two input characters (L+V) with one output character (LV syllable) + // would violate the invariant that [prevBoundary..prevSrc[ has the same + // length as what we appended to the buffer since prevBoundary. + needToDecompose=true; + } + } else if(Hangul.isHangulWithoutJamoT(prev)) { + // c is a Jamo Trailing consonant, + // compose with previous Hangul LV that does not contain a Jamo T. + if(!doCompose) { + return false; + } + buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE)); + prevBoundary=src; + continue; + } + if(!needToDecompose) { + // The Jamo V/T did not compose into a Hangul syllable. 
+ if(doCompose) { + buffer.append((char)c); + } else { + prevCC=0; + } + continue; + } + } /* - * source buffer pointers: + * Source buffer pointers: * * all done quick check current char not yet - * "yes" but (c, c2) processed + * "yes" but (c) processed * may combine * forward * [-------------[-------------[-------------[-------------[ * | | | | | - * start prevStarter prevSrc src limit + * orig. src prevBoundary prevSrc src limit * * - * destination buffer pointers and indexes: + * Destination buffer pointers inside the ReorderingBuffer: * * all done might take not filled yet * characters for * reordering * [-------------[-------------[-------------[ * | | | | - * dest reorderStartIndex destIndex destCapacity - */ - - /* check one above-minimum, relevant code unit */ - /* - * norm32 is for c=*(src-1), and the quick check flag is "no" or - * "maybe", and/or cc!=0 - * check for Jamo V/T, then for surrogates and regular characters - * c is not a Hangul syllable or Jamo L because - * they are not marked with no/maybe for NFC & NFKC(and their cc==0) + * start reorderStart limit | + * +remainingCap.+ */ - if(isNorm32HangulOrJamo(norm32)) { - /* - * c is a Jamo V/T: - * try to compose with the previous character, Jamo V also with - * a following Jamo T, and set values here right now in case we - * just continue with the main loop - */ - prevCC=cc=0; - reorderStartIndex=destIndex; - ioIndex[0]=srcIndex; - if( - destIndex>0 && - composeHangul(src[(prevSrc-1)], c, norm32,src, ioIndex, - srcLimit, (options&OPTIONS_COMPAT)!=0, dest, - destIndex<=destLimit ? destIndex-1: 0, - nx) + if(norm16>=MIN_YES_YES_WITH_CC) { + int cc=norm16&0xff; // cc!=0 + if( onlyContiguous && // FCC + (doCompose ? buffer.getLastCC() : prevCC)==0 && + prevBoundary<prevSrc && + // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that + // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) + // passed the quick check "yes && ccc==0" test. 
+ // Check whether the last character was a "yesYes" or a "yesNo". + // If a "yesNo", then we get its trailing ccc from its + // mapping and check for canonical order. + // All other cases are ok. + getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc ) { - srcIndex=ioIndex[0]; - prevStarter=srcIndex; + // Fails FCD test, need to decompose and contiguously recompose. + if(!doCompose) { + return false; + } + } else if(doCompose) { + buffer.append(c, cc); + continue; + } else if(prevCC<=cc) { + prevCC=cc; continue; + } else { + return false; } + } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) { + return false; + } - srcIndex = ioIndex[0]; + /* + * Find appropriate boundaries around this character, + * decompose the source text from between the boundaries, + * and recompose it. + * + * We may need to remove the last few characters from the ReorderingBuffer + * to account for source text that was copied or appended + * but needs to take part in the recomposition. + */ - /* the Jamo V/T did not compose into a Hangul syllable, just - * append to dest */ - c2=0; - length=1; - prevStarter=prevSrc; - } else { - if(isNorm32Regular(norm32)) { - c2=0; - length=1; - } else { - /* c is a lead surrogate, get the real norm32 */ - if(srcIndex!=srcLimit && - UTF16.isTrailSurrogate(c2=src[srcIndex])) { - ++srcIndex; - length=2; - norm32=getNorm32FromSurrogatePair(norm32, c2); - } else { - /* c is an unpaired lead surrogate, nothing to do */ - c2=0; - length=1; - norm32=0; - } + /* + * Find the last composition boundary in [prevBoundary..src[. + * It is either the decomposition of the current character (at prevSrc), + * or prevBoundary. + */ + if(hasCompBoundaryBefore(c, norm16)) { + prevBoundary=prevSrc; + } else if(doCompose) { + buffer.removeSuffix(prevSrc-prevBoundary); + } + + // Find the next composition boundary in [src..limit[ - + // modifies src to point to the next starter. 
+ src=findNextCompBoundary(s, src, limit); + + // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it. + int recomposeStartIndex=buffer.length(); + decomposeShort(s, prevBoundary, src, buffer); + recompose(buffer, recomposeStartIndex, onlyContiguous); + if(!doCompose) { + if(!buffer.equals(s, prevBoundary, src)) { + return false; } - ComposePartArgs args =new ComposePartArgs(); - - /* we are looking at the character (c, c2) at [prevSrc..src[ */ - if(nx_contains(nx, c, c2)) { - /* excluded: norm32==0 */ - cc=0; - } else if((norm32&qcMask)==0) { - cc=(int)((UNSIGNED_BYTE_MASK)&(norm32>>CC_SHIFT)); - } else { - char[] p; - - /* - * find appropriate boundaries around this character, - * decompose the source text from between the boundaries, - * and recompose it - * - * this puts the intermediate text into the side buffer because - * it might be longer than the recomposition end result, - * or the destination buffer may be too short or missing - * - * note that destIndex may be adjusted backwards to account - * for source text that passed the quick check but needed to - * take part in the recomposition - */ - int decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */ - /* - * find the last true starter in [prevStarter..src[ - * it is either the decomposition of the current character (at prevSrc), - * or prevStarter - */ - if(isTrueStarter(norm32, CC_MASK|qcMask, decompQCMask)) { - prevStarter=prevSrc; - } else { - /* adjust destIndex: back out what had been copied with qc "yes" */ - destIndex-=prevSrc-prevStarter; - } + buffer.remove(); + prevCC=0; + } - /* find the next true starter in [src..limit[ */ - srcIndex=findNextStarter(src, srcIndex,srcLimit, qcMask, - decompQCMask, minNoMaybe); - //args.prevStarter = prevStarter; - args.prevCC = prevCC; - //args.destIndex = destIndex; - args.length = length; - p=composePart(args,prevStarter,src,srcIndex,srcLimit,options,nx); + // Move to the next starter. 
We never need to look back before this point again. + prevBoundary=src; + } + return true; + } - if(p==null) { - /* an error occurred (out of memory) */ - break; - } + /** + * Very similar to compose(): Make the same changes in both places if relevant. + * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) + * !doSpan: quickCheck + * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and + * bit 0: set if "maybe"; otherwise, if the span length<s.length() + * then the quick check result is "no" + */ + public int composeQuickCheck(CharSequence s, int src, int limit, + boolean onlyContiguous, boolean doSpan) { + int qcResult=0; + int minNoMaybeCP=minCompNoMaybeCP; - prevCC = args.prevCC; - length = args.length; + /* + * prevBoundary points to the last character before the current one + * that has a composition boundary before it with ccc==0 and quick check "yes". + */ + int prevBoundary=src; + int prevSrc; + int c=0; + int norm16=0; + int prevCC=0; - /* append the recomposed buffer contents to the destination - * buffer */ - if((destIndex+args.length)<=destLimit) { - int i=0; - while(i<args.length) { - dest[destIndex++]=p[i++]; - --length; + for(;;) { + // count code units below the minimum or with irrelevant data for the quick check + for(prevSrc=src;;) { + if(src==limit) { + return (src<<1)|qcResult; // "yes" or "maybe" + } + if( (c=s.charAt(src))<minNoMaybeCP || + isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c)) + ) { + ++src; + } else if(!UTF16.isSurrogate((char)c)) { + break; + } else { + char c2; + if(UTF16Plus.isSurrogateLead(c)) { + if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { + c=Character.toCodePoint((char)c, c2); } + } else /* trail surrogate */ { + if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { + --src; + c=Character.toCodePoint(c2, (char)c); + } + } + if(isCompYesAndZeroCC(norm16=getNorm16(c))) { + src+=Character.charCount(c); } else { - /* buffer overflow */ - /* keep 
incrementing the destIndex for preflighting */ - destIndex+=length; + break; } - - prevStarter=srcIndex; - continue; } } + if(src!=prevSrc) { + // Set prevBoundary to the last character in the quick check loop. + prevBoundary=src-1; + if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary && + Character.isHighSurrogate(s.charAt(prevBoundary-1)) + ) { + --prevBoundary; + } + prevCC=0; + // The start of the current character (c). + prevSrc=src; + } - /* append the single code point (c, c2) to the destination buffer */ - if((destIndex+length)<=destLimit) { - if(cc!=0 && cc<prevCC) { - /* (c, c2) is out of order with respect to the preceding - * text */ - int reorderSplit= destIndex; - destIndex+=length; - prevCC=insertOrdered(dest,reorderStartIndex, reorderSplit, - destIndex, c, c2, cc); - } else { - /* just append (c, c2) */ - dest[destIndex++]=c; - if(c2!=0) { - dest[destIndex++]=c2; - } + src+=Character.charCount(c); + /* + * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. + * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward) + * or has ccc!=0. + */ + if(isMaybeOrNonZeroCC(norm16)) { + int cc=getCCFromYesOrMaybe(norm16); + if( onlyContiguous && // FCC + cc!=0 && + prevCC==0 && + prevBoundary<prevSrc && + // prevCC==0 && prevBoundary<prevSrc tell us that + // [prevBoundary..prevSrc[ (which is exactly one character under these conditions) + // passed the quick check "yes && ccc==0" test. + // Check whether the last character was a "yesYes" or a "yesNo". + // If a "yesNo", then we get its trailing ccc from its + // mapping and check for canonical order. + // All other cases are ok. + getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc + ) { + // Fails FCD test. 
+ } else if(prevCC<=cc || cc==0) { prevCC=cc; + if(norm16<MIN_YES_YES_WITH_CC) { + if(!doSpan) { + qcResult=1; + } else { + return prevBoundary<<1; // spanYes does not care to know it's "maybe" + } + } + continue; } - } else { - /* buffer overflow */ - /* keep incrementing the destIndex for preflighting */ - destIndex+=length; - prevCC=cc; } + return prevBoundary<<1; // "no" } - - return destIndex - destStart; } - public static int getCombiningClass(int c) { - long norm32; - norm32=getNorm32(c); - return (int)((norm32>>CC_SHIFT)&0xFF); - } - - public static boolean isFullCompositionExclusion(int c) { - if(isFormatVersion_2_1) { - int aux =AuxTrieImpl.auxTrie.getCodePointValue(c); - return (aux & AUX_COMP_EX_MASK)!=0; - } else { - return false; + public void composeAndAppend(CharSequence s, + boolean doCompose, + boolean onlyContiguous, + ReorderingBuffer buffer) { + int src=0, limit=s.length(); + if(!buffer.isEmpty()) { + int firstStarterInSrc=findNextCompBoundary(s, 0, limit); + if(0!=firstStarterInSrc) { + int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), + buffer.length()); + StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ + firstStarterInSrc+16); + middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); + buffer.removeSuffix(buffer.length()-lastStarterInDest); + middle.append(s, 0, firstStarterInSrc); + compose(middle, 0, middle.length(), onlyContiguous, true, buffer); + src=firstStarterInSrc; + } } - } - - public static boolean isCanonSafeStart(int c) { - if(isFormatVersion_2_1) { - int aux = AuxTrieImpl.auxTrie.getCodePointValue(c); - return (aux & AUX_UNSAFE_MASK)==0; + if(doCompose) { + compose(s, src, limit, onlyContiguous, true, buffer); } else { - return false; + buffer.append(s, src, limit); } } - /* Is c an NF<mode>-skippable code point? See unormimp.h. 
*/ - public static boolean isNFSkippable(int c, NormalizerBase.Mode mode, long mask) { - long /*unsigned int*/ norm32; - mask = mask & UNSIGNED_INT_MASK; - char aux; - - /* check conditions (a)..(e), see unormimp.h */ - norm32 = getNorm32(c); + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { + // Note: In this function we use buffer->appendZeroCC() because we track + // the lead and trail combining classes here, rather than leaving it to + // the ReorderingBuffer. + // The exception is the call to decomposeShort() which uses the buffer + // in the normal way. + + // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. + // Similar to the prevBoundary in the compose() implementation. + int prevBoundary=src; + int prevSrc; + int c=0; + int prevFCD16=0; + int fcd16=0; - if((norm32&mask)!=0) { - return false; /* fails (a)..(e), not skippable */ - } + for(;;) { + // count code units with lccc==0 + for(prevSrc=src; src!=limit;) { + if((c=s.charAt(src))<MIN_CCC_LCCC_CP) { + prevFCD16=~c; + ++src; + } else if(!singleLeadMightHaveNonZeroFCD16(c)) { + prevFCD16=0; + ++src; + } else { + if(UTF16.isSurrogate((char)c)) { + char c2; + if(UTF16Plus.isSurrogateLead(c)) { + if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) { + c=Character.toCodePoint((char)c, c2); + } + } else /* trail surrogate */ { + if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) { + --src; + c=Character.toCodePoint(c2, (char)c); + } + } + } + if((fcd16=getFCD16FromNormData(c))<=0xff) { + prevFCD16=fcd16; + src+=Character.charCount(c); + } else { + break; + } + } + } + // copy these code units all at once + if(src!=prevSrc) { + if(src==limit) { + if(buffer!=null) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); + } + break; + } + prevBoundary=src; + // We know that the previous character's lccc==0. 
+ if(prevFCD16<0) { + // Fetching the fcd16 value was deferred for this below-U+0300 code point. + int prev=~prevFCD16; + prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev); + if(prevFCD16>1) { + --prevBoundary; + } + } else { + int p=src-1; + if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && + Character.isHighSurrogate(s.charAt(p-1)) + ) { + --p; + // Need to fetch the previous character's FCD value because + // prevFCD16 was just for the trail surrogate code point. + prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); + // Still known to have lccc==0 because its lead surrogate unit had lccc==0. + } + if(prevFCD16>1) { + prevBoundary=p; + } + } + if(buffer!=null) { + // The last lccc==0 character is excluded from the + // flush-and-append call in case it needs to be modified. + buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); + buffer.append(s, prevBoundary, src); + } + // The start of the current character (c). + prevSrc=src; + } else if(src==limit) { + break; + } - if(mode == NormalizerBase.NFD || mode == NormalizerBase.NFKD || mode == NormalizerBase.NONE){ - return true; /* NF*D, passed (a)..(c), is skippable */ + src+=Character.charCount(c); + // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. + // Check for proper order, and decompose locally if necessary. + if((prevFCD16&0xff)<=(fcd16>>8)) { + // proper order: prev tccc <= current lccc + if((fcd16&0xff)<=1) { + prevBoundary=src; + } + if(buffer!=null) { + buffer.appendZeroCC(c); + } + prevFCD16=fcd16; + continue; + } else if(buffer==null) { + return prevBoundary; // quick check "no" + } else { + /* + * Back out the part of the source that we copied or appended + * already but is now going to be decomposed. + * prevSrc is set to after what was copied/appended. + */ + buffer.removeSuffix(prevSrc-prevBoundary); + /* + * Find the part of the source that needs to be decomposed, + * up to the next safe boundary. 
+ */ + src=findNextFCDBoundary(s, src, limit); + /* + * The source text does not fulfill the conditions for FCD. + * Decompose and reorder a limited piece of the text. + */ + decomposeShort(s, prevBoundary, src, buffer); + prevBoundary=src; + prevFCD16=0; + } } - /* check conditions (a)..(e), see unormimp.h */ + return src; + } - /* NF*C/FCC, passed (a)..(e) */ - if((norm32& QC_NFD)==0) { - return true; /* no canonical decomposition, is skippable */ + // Note: hasDecompBoundary() could be implemented as aliases to + // hasFCDBoundaryBefore() and hasFCDBoundaryAfter() + // at the cost of building the FCD trie for a decomposition normalizer. + public boolean hasDecompBoundary(int c, boolean before) { + for(;;) { + if(c<minDecompNoCP) { + return true; + } + int norm16=getNorm16(c); + if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) { + return true; + } else if(norm16>MIN_NORMAL_MAYBE_YES) { + return false; // ccc!=0 + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + if((firstUnit&MAPPING_LENGTH_MASK)==0) { + return false; + } + if(!before) { + // decomp after-boundary: same as hasFCDBoundaryAfter(), + // fcd16<=1 || trailCC==0 + if(firstUnit>0x1ff) { + return false; // trailCC>1 + } + if(firstUnit<=0xff) { + return true; // trailCC==0 + } + // if(trailCC==1) test leadCC==0, same as checking for before-boundary + } + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0; + } } + } - /* check Hangul syllables algorithmically */ - if(isNorm32HangulOrJamo(norm32)) { - /* Jamo passed (a)..(e) above, must be Hangul */ - return !isHangulWithoutJamoT((char)c); /* LVT are skippable, LV are not */ - } + public boolean hasCompBoundaryBefore(int c) { + return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c)); + } + + private boolean 
isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } + private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } + private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } + private boolean isHangul(int norm16) { return norm16==minYesNo; } + private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } + + // UBool isCompYes(uint16_t norm16) const { + // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; + // } + // UBool isCompYesOrMaybe(uint16_t norm16) const { + // return norm16<minNoNo || minMaybeYes<=norm16; + // } + // private boolean hasZeroCCFromDecompYes(int norm16) { + // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; + // } + private boolean isDecompYesAndZeroCC(int norm16) { + return norm16<minYesNo || + norm16==JAMO_VT || + (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); + } - /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */ - /* NF*C, test (f) flag */ - if(!isFormatVersion_2_2) { - return false; /* no (f) data, say not skippable to be safe */ + /** + * A little faster and simpler than isDecompYesAndZeroCC() but does not include + * the MaybeYes which combine-forward and have ccc=0. + * (Standard Unicode 5.2 normalization does not have such characters.) + */ + private boolean isMostDecompYesAndZeroCC(int norm16) { + return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; + } + + private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } + + // For use with isCompYes(). + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. + // static uint8_t getCCFromYes(uint16_t norm16) { + // return norm16>=MIN_YES_YES_WITH_CC ? 
(uint8_t)norm16 : 0; + // } + private int getCCFromNoNo(int norm16) { + if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + return extraData.charAt(norm16-1)&0xff; + } else { + return 0; } - - - aux = AuxTrieImpl.auxTrie.getCodePointValue(c); - return (aux&AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */ - - /* } else { FCC, test fcd<=1 instead of the above } */ } - public static UnicodeSet addPropertyStarts(UnicodeSet set) { + // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() + int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) { int c; - - /* add the start code point of each same-value range of each trie */ - //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set); - TrieIterator normIter = new TrieIterator(NormTrieImpl.normTrie); - RangeValueIterator.Element normResult = new RangeValueIterator.Element(); - - while(normIter.next(normResult)){ - set.add(normResult.start); + if(cpStart==(cpLimit-1)) { + c=s.charAt(cpStart); + } else { + c=Character.codePointAt(s, cpStart); } + int prevNorm16=getNorm16(c); + if(prevNorm16<=minYesNo) { + return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0 + } else { + return extraData.charAt(prevNorm16)>>8; // tccc from yesNo + } + } - //utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); - TrieIterator fcdIter = new TrieIterator(FCDTrieImpl.fcdTrie); - RangeValueIterator.Element fcdResult = new RangeValueIterator.Element(); + // Requires algorithmic-NoNo. + private int mapAlgorithmic(int c, int norm16) { + return c+norm16-(minMaybeYes-MAX_DELTA-1); + } - while(fcdIter.next(fcdResult)){ - set.add(fcdResult.start); - } + // Requires minYesNo<norm16<limitNoNo. 
+ // private int getMapping(int norm16) { return /*extraData+*/norm16; } - if(isFormatVersion_2_1){ - //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set); - TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie); - RangeValueIterator.Element auxResult = new RangeValueIterator.Element(); - while(auxIter.next(auxResult)){ - set.add(auxResult.start); + /** + * @return index into maybeYesCompositions, or -1 + */ + private int getCompositionsListForDecompYes(int norm16) { + if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) { + return -1; + } else { + if((norm16-=minMaybeYes)<0) { + // norm16<minMaybeYes: index into extraData which is a substring at + // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] + // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 + norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list } + return norm16; } - /* add Hangul LV syllables and LV+1 because of skippables */ - for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) { - set.add(c); - set.add(c+1); - } - set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */ - return set; // for chaining } /** - * Internal API, used in UCharacter.getIntPropertyValue(). 
- * @internal - * @param c code point - * @param modeValue numeric value compatible with Mode - * @return numeric value compatible with QuickCheck + * @return index into maybeYesCompositions */ - public static final int quickCheck(int c, int modeValue) { - final int qcMask[/*UNORM_MODE_COUNT*/]={ - 0, 0, QC_NFD, QC_NFKD, QC_NFC, QC_NFKC - }; - - int norm32=(int)getNorm32(c)&qcMask[modeValue]; - - if(norm32==0) { - return 1; // YES - } else if((norm32&QC_ANY_NO)!=0) { - return 0; // NO - } else /* _NORM_QC_ANY_MAYBE */ { - return 2; // MAYBE; + private int getCompositionsListForComposite(int norm16) { + // composite has both mapping & compositions list + int firstUnit=extraData.charAt(norm16); + return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+ // mapping in maybeYesCompositions + 1+ // +1 to skip the first unit with the mapping length + (firstUnit&MAPPING_LENGTH_MASK); // + mapping length + } + + // Decompose a short piece of text which is likely to contain characters that + // fail the quick check loop and/or where the quick check loop's overhead + // is unlikely to be amortized. + // Called by the compose() and makeFCD() implementations. + // Public in Java for collation implementation code. 
+ public void decomposeShort(CharSequence s, int src, int limit, + ReorderingBuffer buffer) { + while(src<limit) { + int c=Character.codePointAt(s, src); + src+=Character.charCount(c); + decompose(c, getNorm16(c), buffer); } } - private static int strCompare(char[] s1, int s1Start, int s1Limit, - char[] s2, int s2Start, int s2Limit, - boolean codePointOrder) { - - int start1, start2, limit1, limit2; - - char c1, c2; - - /* setup for fix-up */ - start1=s1Start; - start2=s2Start; - - int length1, length2; - - length1 = s1Limit - s1Start; - length2 = s2Limit - s2Start; - - int lengthResult; - - if(length1<length2) { - lengthResult=-1; - limit1=start1+length1; - } else if(length1==length2) { - lengthResult=0; - limit1=start1+length1; - } else /* length1>length2 */ { - lengthResult=1; - limit1=start1+length2; - } - - if(s1==s2) { - return lengthResult; - } - + private void decompose(int c, int norm16, + ReorderingBuffer buffer) { + // Only loops for 1:1 algorithmic mappings. for(;;) { - /* check pseudo-limit */ - if(s1Start==limit1) { - return lengthResult; - } - - c1=s1[s1Start]; - c2=s2[s2Start]; - if(c1!=c2) { - break; + // get the decomposition and the lead and trail cc's + if(isDecompYes(norm16)) { + // c does not decompose + buffer.append(c, getCCFromYesOrMaybe(norm16)); + } else if(isHangul(norm16)) { + // Hangul syllable: decompose algorithmically + Hangul.decompose(c, buffer); + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + norm16=getNorm16(c); + continue; + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + int length=firstUnit&MAPPING_LENGTH_MASK; + int leadCC, trailCC; + trailCC=firstUnit>>8; + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + leadCC=extraData.charAt(norm16-1)>>8; + } else { + leadCC=0; + } + ++norm16; // skip over the firstUnit + buffer.append(extraData, norm16, norm16+length, leadCC, trailCC); } - ++s1Start; - ++s2Start; + return; } + } - 
/* setup for fix-up */ - limit1=start1+length1; - limit2=start2+length2; - - - /* if both values are in or above the surrogate range, fix them up */ - if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { - /* subtract 0x2800 from BMP code points to make them smaller than - * supplementary ones */ - if( - ( c1<=0xdbff && (s1Start+1)!=limit1 && - UTF16.isTrailSurrogate(s1[(s1Start+1)]) - ) || - ( UTF16.isTrailSurrogate(c1) && start1!=s1Start && - UTF16.isLeadSurrogate(s1[(s1Start-1)]) - ) - ) { - /* part of a surrogate pair, leave >=d800 */ - } else { - /* BMP code point - may be surrogate code point - make <d800 */ - c1-=0x2800; + /** + * Finds the recomposition result for + * a forward-combining "lead" character, + * specified with a pointer to its compositions list, + * and a backward-combining "trail" character. + * + * <p>If the lead and trail characters combine, then this function returns + * the following "compositeAndFwd" value: + * <pre> + * Bits 21..1 composite character + * Bit 0 set if the composite is a forward-combining starter + * </pre> + * otherwise it returns -1. + * + * <p>The compositions list has (trail, compositeAndFwd) pair entries, + * encoded as either pairs or triples of 16-bit units. + * The last entry has the high bit of its first unit set. + * + * <p>The list is sorted by ascending trail characters (there are no duplicates). + * A linear search is used. + * + * <p>See normalizer2impl.h for a more detailed description + * of the compositions list format. 
+ */ + private static int combine(String compositions, int list, int trail) { + int key1, firstUnit; + if(trail<COMP_1_TRAIL_LIMIT) { + // trail character is 0..33FF + // result entry may have 2 or 3 units + key1=(trail<<1); + while(key1>(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } + if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if((firstUnit&COMP_1_TRIPLE)!=0) { + return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2); + } else { + return compositions.charAt(list+1); + } } - - if( - ( c2<=0xdbff && (s2Start+1)!=limit2 && - UTF16.isTrailSurrogate(s2[(s2Start+1)]) - ) || - ( UTF16.isTrailSurrogate(c2) && start2!=s2Start && - UTF16.isLeadSurrogate(s2[(s2Start-1)]) - ) - ) { - /* part of a surrogate pair, leave >=d800 */ - } else { - /* BMP code point - may be surrogate code point - make <d800 */ - c2-=0x2800; + } else { + // trail character is 3400..10FFFF + // result entry has 3 units + key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); + int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; + int secondUnit; + for(;;) { + if(key1>(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if(key2>(secondUnit=compositions.charAt(list+1))) { + if((firstUnit&COMP_1_LAST_TUPLE)!=0) { + break; + } else { + list+=3; + } + } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { + return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); + } else { + break; + } + } else { + break; + } } } - - /* now c1 and c2 are in UTF-32-compatible order */ - return (int)c1-(int)c2; + return -1; } - - /* - * Status of tailored normalization - * - * This was done initially for investigation on Unicode public review issue 7 - * (http://www.unicode.org/review/). See Jitterbug 2481. 
- * While the UTC at meeting #94 (2003mar) did not take up the issue, this is - * a permanent feature in ICU 2.6 in support of IDNA which requires true - * Unicode 3.2 normalization. - * (NormalizationCorrections are rolled into IDNA mapping tables.) - * - * Tailored normalization as implemented here allows to "normalize less" - * than full Unicode normalization would. - * Based internally on a UnicodeSet of code points that are - * "excluded from normalization", the normalization functions leave those - * code points alone ("inert"). This means that tailored normalization - * still transforms text into a canonically equivalent form. - * It does not add decompositions to code points that do not have any or - * change decomposition results. - * - * Any function that searches for a safe boundary has not been touched, - * which means that these functions will be over-pessimistic when - * exclusions are applied. - * This should not matter because subsequent checks and normalizations - * do apply the exclusions; only a little more of the text may be processed - * than necessary under exclusions. - * - * Normalization exclusions have the following effect on excluded code points c: - * - c is not decomposed - * - c is not a composition target - * - c does not combine forward or backward for composition - * except that this is not implemented for Jamo - * - c is treated as having a combining class of 0 - */ - - /* - * Constants for the bit fields in the options bit set parameter. - * These need not be public. - * A user only needs to know the currently assigned values. - * The number and positions of reserved bits per field can remain private. 
- */ - private static final int OPTIONS_NX_MASK=0x1f; - private static final int OPTIONS_UNICODE_MASK=0xe0; - public static final int OPTIONS_SETS_MASK=0xff; -// private static final int OPTIONS_UNICODE_SHIFT=5; - private static final UnicodeSet[] nxCache = new UnicodeSet[OPTIONS_SETS_MASK+1]; - - /* Constants for options flags for normalization.*/ - - /** - * Options bit 0, do not decompose Hangul syllables. - * @draft ICU 2.6 - */ - private static final int NX_HANGUL = 1; - /** - * Options bit 1, do not decompose CJK compatibility characters. - * @draft ICU 2.6 - */ - private static final int NX_CJK_COMPAT=2; - /** - * Options bit 8, use buggy recomposition described in - * Unicode Public Review Issue #29 - * at http://www.unicode.org/review/resolved-pri.html#pri29 - * - * Used in IDNA implementation according to strict interpretation - * of IDNA definition based on Unicode 3.2 which predates PRI #29. - * - * See ICU4C unormimp.h - * - * @draft ICU 3.2 - */ - public static final int BEFORE_PRI_29=0x100; - - /* - * The following options are used only in some composition functions. - * They use bits 12 and up to preserve lower bits for the available options - * space in unorm_compare() - - * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT. - */ - - /** Options bit 12, for compatibility vs. canonical decomposition. */ - public static final int OPTIONS_COMPAT=0x1000; - /** Options bit 13, no discontiguous composition (FCC vs. NFC). */ - public static final int OPTIONS_COMPOSE_CONTIGUOUS=0x2000; - - /* normalization exclusion sets --------------------------------------------- */ - /* - * Normalization exclusion UnicodeSets are used for tailored normalization; - * see the comment near the beginning of this file. + * Recomposes the buffer text starting at recomposeStartIndex + * (which is in NFD - decomposed and canonically ordered), + * and truncates the buffer contents. 
* - * By specifying one or several sets of code points, - * those code points become inert for normalization. - */ - private static final synchronized UnicodeSet internalGetNXHangul() { - /* internal function, does not check for incoming U_FAILURE */ - - if(nxCache[NX_HANGUL]==null) { - nxCache[NX_HANGUL]=new UnicodeSet(0xac00, 0xd7a3); - } - return nxCache[NX_HANGUL]; - } + * Note that recomposition never lengthens the text: + * Any character consists of either one or two code units; + * a composition may contain at most one more code unit than the original starter, + * while the combining mark that is removed has at least one code unit. + */ + private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, + boolean onlyContiguous) { + StringBuilder sb=buffer.getStringBuilder(); + int p=recomposeStartIndex; + if(p==sb.length()) { + return; + } + + int starter, pRemove; + int compositionsList; + int c, compositeAndFwd; + int norm16; + int cc, prevCC; + boolean starterIsSupplementary; - private static final synchronized UnicodeSet internalGetNXCJKCompat() { - /* internal function, does not check for incoming U_FAILURE */ + // Some of the following variables are not used until we have a forward-combining starter + // and are only initialized now to avoid compiler warnings. + compositionsList=-1; // used as indicator for whether we have a forward-combining starter + starter=-1; + starterIsSupplementary=false; + prevCC=0; - if(nxCache[NX_CJK_COMPAT]==null) { + for(;;) { + c=sb.codePointAt(p); + p+=Character.charCount(c); + norm16=getNorm16(c); + cc=getCCFromYesOrMaybe(norm16); + if( // this character combines backward and + isMaybe(norm16) && + // we have seen a starter that combines forward and + compositionsList>=0 && + // the backward-combining character is not blocked + (prevCC<cc || prevCC==0)) { + if(isJamoVT(norm16)) { + // c is a Jamo V/T, see if we can compose it with the previous character. 
+ if(c<Hangul.JAMO_T_BASE) { + // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. + char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); + if(prev<Hangul.JAMO_L_COUNT) { + pRemove=p-1; + char syllable=(char) + (Hangul.HANGUL_BASE+ + (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* + Hangul.JAMO_T_COUNT); + char t; + if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { + ++p; + syllable+=t; // The next character was a Jamo T. + } + sb.setCharAt(starter, syllable); + // remove the Jamo V/T + sb.delete(pRemove, p); + p=pRemove; + } + } + /* + * No "else" for Jamo T: + * Since the input is in NFD, there are no Hangul LV syllables that + * a Jamo T could combine with. + * All Jamo Ts are combined above when handling Jamo Vs. + */ + if(p==sb.length()) { + break; + } + compositionsList=-1; + continue; + } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { + // The starter and the combining mark (c) do combine. + int composite=compositeAndFwd>>1; + + // Remove the combining mark. + pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark + sb.delete(pRemove, p); + p=pRemove; + // Replace the starter with the composite. + if(starterIsSupplementary) { + if(composite>0xffff) { + // both are supplementary + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); + } else { + sb.setCharAt(starter, (char)composite); + sb.deleteCharAt(starter+1); + // The composite is shorter than the starter, + // move the intermediate characters forward one. 
+ starterIsSupplementary=true; + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); + ++p; + } else { + // both are on the BMP + sb.setCharAt(starter, (char)composite); + } - /* build a set from [CJK Ideographs]&[has canonical decomposition] */ - UnicodeSet set, hasDecomp; + // Keep prevCC because we removed the combining mark. - set=new UnicodeSet("[:Ideographic:]"); + if(p==sb.length()) { + break; + } + // Is the composite a starter that combines forward? + if((compositeAndFwd&1)!=0) { + compositionsList= + getCompositionsListForComposite(getNorm16(composite)); + } else { + compositionsList=-1; + } - /* start with an empty set for [has canonical decomposition] */ - hasDecomp=new UnicodeSet(); + // We combined; continue with looking for compositions. + continue; + } + } - /* iterate over all ideographs and remember which canonically decompose */ - UnicodeSetIterator it = new UnicodeSetIterator(set); - int start, end; - long norm32; + // no combination this time + prevCC=cc; + if(p==sb.length()) { + break; + } - while(it.nextRange() && (it.codepoint != UnicodeSetIterator.IS_STRING)) { - start=it.codepoint; - end=it.codepointEnd; - while(start<=end) { - norm32 = getNorm32(start); - if((norm32 & QC_NFD)>0) { - hasDecomp.add(start); + // If c did not combine, then check if it is a starter. + if(cc==0) { + // Found a new starter. + if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { + // It may combine with something, prepare for it. + if(c<=0xffff) { + starterIsSupplementary=false; + starter=p-1; + } else { + starterIsSupplementary=true; + starter=p-2; } - ++start; } + } else if(onlyContiguous) { + // FCC: no discontiguous compositions; any intervening character blocks. 
+ compositionsList=-1; } - - /* hasDecomp now contains all ideographs that decompose canonically */ - nxCache[NX_CJK_COMPAT]=hasDecomp; - } - - return nxCache[NX_CJK_COMPAT]; + buffer.flush(); } - private static final synchronized UnicodeSet internalGetNXUnicode(int options) { - options &= OPTIONS_UNICODE_MASK; - if(options==0) { - return null; + /** + * Does c have a composition boundary before it? + * True if its decomposition begins with a character that has + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes + * (isCompYesAndZeroCC()) so we need not decompose. + */ + private boolean hasCompBoundaryBefore(int c, int norm16) { + for(;;) { + if(isCompYesAndZeroCC(norm16)) { + return true; + } else if(isMaybeOrNonZeroCC(norm16)) { + return false; + } else if(isDecompNoAlgorithmic(norm16)) { + c=mapAlgorithmic(c, norm16); + norm16=getNorm16(c); + } else { + // c decomposes, get everything from the variable-length extra data + int firstUnit=extraData.charAt(norm16); + if((firstUnit&MAPPING_LENGTH_MASK)==0) { + return false; + } + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) { + return false; // non-zero leadCC + } + return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1))); + } } + } - if(nxCache[options]==null) { - /* build a set with all code points that were not designated by the specified Unicode version */ - UnicodeSet set = new UnicodeSet(); - - switch(options) { - case NormalizerBase.UNICODE_3_2: - set.applyPattern("[:^Age=3.2:]"); + private int findPreviousCompBoundary(CharSequence s, int p) { + while(p>0) { + int c=Character.codePointBefore(s, p); + p-=Character.charCount(c); + if(hasCompBoundaryBefore(c)) { break; - default: - return null; } - - nxCache[options]=set; + // We could also test hasCompBoundaryAfter() and return iter.codePointLimit, + // but that's probably not worth the extra cost. 
} - - return nxCache[options]; + return p; } - /* Get a decomposition exclusion set. The data must be loaded. */ - private static final synchronized UnicodeSet internalGetNX(int options) { - options&=OPTIONS_SETS_MASK; - - if(nxCache[options]==null) { - /* return basic sets */ - if(options==NX_HANGUL) { - return internalGetNXHangul(); - } - if(options==NX_CJK_COMPAT) { - return internalGetNXCJKCompat(); - } - if((options & OPTIONS_UNICODE_MASK)!=0 && (options & OPTIONS_NX_MASK)==0) { - return internalGetNXUnicode(options); - } - - /* build a set from multiple subsets */ - UnicodeSet set; - UnicodeSet other; - - set=new UnicodeSet(); - - - if((options & NX_HANGUL)!=0 && null!=(other=internalGetNXHangul())) { - set.addAll(other); - } - if((options&NX_CJK_COMPAT)!=0 && null!=(other=internalGetNXCJKCompat())) { - set.addAll(other); - } - if((options&OPTIONS_UNICODE_MASK)!=0 && null!=(other=internalGetNXUnicode(options))) { - set.addAll(other); + private int findNextCompBoundary(CharSequence s, int p, int limit) { + while(p<limit) { + int c=Character.codePointAt(s, p); + int norm16=normTrie.get(c); + if(hasCompBoundaryBefore(c, norm16)) { + break; } - - nxCache[options]=set; + p+=Character.charCount(c); } - return nxCache[options]; + return p; } - public static final UnicodeSet getNX(int options) { - if((options&=OPTIONS_SETS_MASK)==0) { - /* incoming failure, or no decomposition exclusions requested */ - return null; - } else { - return internalGetNX(options); + private int findNextFCDBoundary(CharSequence s, int p, int limit) { + while(p<limit) { + int c=Character.codePointAt(s, p); + if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) { + break; + } + p+=Character.charCount(c); } + return p; } - private static final boolean nx_contains(UnicodeSet nx, int c) { - return nx!=null && nx.contains(c); - } - - private static final boolean nx_contains(UnicodeSet nx, char c, char c2) { - return nx!=null && nx.contains(c2==0 ? 
c : UCharacterProperty.getRawSupplementary(c, c2)); - } - -/*****************************************************************************/ - /** * Get the canonical decomposition * sherman for ComposedCharIter */ - public static int getDecompose(int chars[], String decomps[]) { - DecomposeArgs args = new DecomposeArgs(); + Normalizer2 impl = Normalizer2.getNFDInstance(); + int length=0; - long norm32 = 0; + int norm16 = 0; int ch = -1; - int index = 0; int i = 0; while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff @@ -2476,23 +1729,23 @@ //need a better solution/lookup if (ch == 0x30ff) ch = 0xf900; - else if (ch == 0x10000) + else if (ch == 0x115bc) ch = 0x1d15e; else if (ch == 0x1d1c1) ch = 0x2f800; - norm32 = NormalizerImpl.getNorm32(ch); - if((norm32 & QC_NFD)!=0 && i < chars.length) { + String s = impl.getDecomposition(ch); + + if(s != null && i < chars.length) { chars[i] = ch; - index = decompose(norm32, args); - decomps[i++] = new String(extraData,index, args.length); + decomps[i++] = s; } } return i; } //------------------------------------------------------ - // special method for Collation + // special method for Collation (RBTableBuilder.build()) //------------------------------------------------------ private static boolean needSingleQuotation(char c) { return (c >= 0x0009 && c <= 0x000D) || @@ -2503,45 +1756,42 @@ } public static String canonicalDecomposeWithSingleQuotation(String string) { - char[] src = string.toCharArray(); - int srcIndex = 0; - int srcLimit = src.length; - char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 - int destIndex = 0; - int destLimit = dest.length; + Normalizer2 impl = Normalizer2.getNFDInstance(); + char[] src = string.toCharArray(); + int srcIndex = 0; + int srcLimit = src.length; + char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 + int destIndex = 0; + int destLimit = dest.length; - char[] buffer = new char[3]; int prevSrc; - long norm32; - int ccOrQCMask; - int qcMask = QC_NFD; + 
String norm; int reorderStartIndex, length; - char c, c2; - char minNoMaybe = (char)indexes[INDEX_MIN_NFD_NO_MAYBE]; + char c1, c2; + int cp; + int minNoMaybe = 0x00c0; int cc, prevCC, trailCC; char[] p; int pStart; - // initialize - ccOrQCMask = CC_MASK | qcMask; reorderStartIndex = 0; prevCC = 0; - norm32 = 0; - c = 0; + norm = null; + cp = 0; pStart = 0; cc = trailCC = -1; // initialize to bogus value - for(;;) { + c1 = 0; + for (;;) { prevSrc=srcIndex; //quick check (1)less than minNoMaybe (2)no decomp (3)hangual while (srcIndex != srcLimit && - (( c = src[srcIndex]) < minNoMaybe || - ((norm32 = getNorm32(c)) & ccOrQCMask) == 0 || - ( c >= '\uac00' && c <= '\ud7a3'))){ - + ((c1 = src[srcIndex]) < minNoMaybe || + (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || + (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables prevCC = 0; - ++srcIndex; + srcIndex += (cp < 0x10000) ? 1 : 2; } // copy these code units all at once @@ -2556,47 +1806,43 @@ } // end of source reached? - if(srcIndex == srcLimit) { + if (srcIndex == srcLimit) { break; } - // c already contains *src and norm32 is set for it, increment src - ++srcIndex; - if(isNorm32Regular(norm32)) { + // cp already contains *src and norm32 is set for it, increment src + srcIndex += (cp < 0x10000) ? 
1 : 2; + + if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { c2 = 0; length = 1; - } else { - // c is a lead surrogate, get the real norm32 - if(srcIndex != srcLimit && - Character.isLowSurrogate(c2 = src[srcIndex])) { - ++srcIndex; - length = 2; - norm32 = getNorm32FromSurrogatePair(norm32, c2); - } else { - c2 = 0; - length = 1; - norm32 = 0; + + if (Character.isHighSurrogate(c1) + || Character.isLowSurrogate(c1)) { + norm = null; } + } else { + length = 2; + c2 = src[srcIndex-1]; } - // get the decomposition and the lead and trail cc's - if((norm32 & qcMask) == 0) { - // c does not decompose - cc = trailCC = (int)((UNSIGNED_BYTE_MASK) & (norm32 >> CC_SHIFT)); - p = null; - pStart = -1; - } else { - DecomposeArgs arg = new DecomposeArgs(); - // c decomposes, get everything from the variable-length - // extra data - pStart = decompose(norm32, qcMask, arg); - p = extraData; - length = arg.length; - cc = arg.cc; - trailCC = arg.trailCC; - if(length == 1) { + // get the decomposition and the lead and trail cc's + if (norm == null) { + // cp does not decompose + cc = trailCC = UCharacter.getCombiningClass(cp); + p = null; + pStart = -1; + } else { + + pStart = 0; + p = norm.toCharArray(); + length = p.length; + int cpNum = norm.codePointCount(0, length); + cc= UCharacter.getCombiningClass(norm.codePointAt(0)); + trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); + if (length == 1) { // fastpath a single code unit from decomposition - c = p[pStart]; + c1 = p[pStart]; c2 = 0; p = null; pStart = -1; @@ -2610,27 +1856,28 @@ dest = tmpBuf; destLimit = dest.length; } + // append the decomposition to the destination buffer, assume length>0 { int reorderSplit = destIndex; - if(p == null) { + if (p == null) { // fastpath: single code point - if (needSingleQuotation(c)) { + if (needSingleQuotation(c1)) { //if we need single quotation, no need to consider "prevCC" //and it must NOT be a supplementary pair dest[destIndex++] = '\''; - dest[destIndex++] = c; + 
dest[destIndex++] = c1; dest[destIndex++] = '\''; trailCC = 0; } else if(cc != 0 && cc < prevCC) { - // (c, c2) is out of order with respect to the preceding + // (c1, c2) is out of order with respect to the preceding // text destIndex += length; - trailCC = insertOrdered(dest,reorderStartIndex, - reorderSplit, destIndex, c, c2, cc); + trailCC = insertOrdered(dest, reorderStartIndex, + reorderSplit, destIndex, c1, c2, cc); } else { - // just append (c, c2) - dest[destIndex++] = c; + // just append (c1, c2) + dest[destIndex++] = c1; if(c2 != 0) { dest[destIndex++] = c2; } @@ -2646,16 +1893,16 @@ do { dest[destIndex++] = p[pStart++]; } while(--length > 0); - } else - if(cc != 0 && cc < prevCC) { + } else if (cc != 0 && cc < prevCC) { destIndex += length; - trailCC = mergeOrdered(dest,reorderStartIndex, - reorderSplit,p, pStart,pStart+length); + trailCC = mergeOrdered(dest, reorderStartIndex, + reorderSplit, p, pStart, + pStart+length); } else { // just append the decomposition do { dest[destIndex++] = p[pStart++]; - } while(--length > 0); + } while (--length > 0); } } } @@ -2664,73 +1911,245 @@ reorderStartIndex = destIndex; } } + return new String(dest, 0, destIndex); } - //------------------------------------------------------ - // mapping method for IDNA/StringPrep - //------------------------------------------------------ - - /* - * Normalization using NormalizerBase.UNICODE_3_2 option supports Unicode - * 3.2 normalization with Corrigendum 4 corrections. However, normalization - * without the corrections is necessary for IDNA/StringPrep support. - * This method is called when NormalizerBase.UNICODE_3_2_0_ORIGINAL option - * (= sun.text.Normalizer.UNICODE_3_2) is used and normalizes five - * characters in Corrigendum 4 before normalization in order to avoid - * incorrect normalization. 
- * For the Corrigendum 4 issue, refer - * http://www.unicode.org/versions/corrigendum4.html + /** + * simpler, single-character version of mergeOrdered() - + * bubble-insert one single code point into the preceding string + * which is already canonically ordered + * (c, c2) may or may not yet have been inserted at src[current]..src[p] + * + * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2) + * + * before: src[start]..src[current] is already ordered, and + * src[current]..src[p] may or may not hold (c, c2) but + * must be exactly the same length as (c, c2) + * after: src[start]..src[p] is ordered + * + * @return the trailing combining class */ + private static int/*unsigned byte*/ insertOrdered(char[] source, + int start, + int current, int p, + char c1, char c2, + int/*unsigned byte*/ cc) { + int back, preBack; + int r; + int prevCC, trailCC=cc; - /* - * Option used in NormalizerBase.UNICODE_3_2_0_ORIGINAL. - */ - public static final int WITHOUT_CORRIGENDUM4_CORRECTIONS=0x40000; + if (start<current && cc!=0) { + // search for the insertion point where cc>=prevCC + preBack=back=current; - private static final char[][] corrigendum4MappingTable = { - {'\uD844', '\uDF6A'}, // 0x2F868 - {'\u5F33'}, // 0x2F874 - {'\u43AB'}, // 0x2F91F - {'\u7AAE'}, // 0x2F95F - {'\u4D57'}}; // 0x2F9BF + PrevArgs prevArgs = new PrevArgs(); + prevArgs.current = current; + prevArgs.start = start; + prevArgs.src = source; + prevArgs.c1 = c1; + prevArgs.c2 = c2; - /* - * Removing Corrigendum 4 fix - * @return normalized text - */ - public static String convert(String str) { - if (str == null) { - return null; + // get the prevCC + prevCC=getPrevCC(prevArgs); + preBack = prevArgs.current; + + if(cc<prevCC) { + // this will be the last code point, so keep its cc + trailCC=prevCC; + back=preBack; + while(start<preBack) { + prevCC=getPrevCC(prevArgs); + preBack=prevArgs.current; + if(cc>=prevCC) { + break; + } + back=preBack; + } + + // this is where we are right now with all 
these indicies: + // [start]..[pPreBack] 0..? code points that we can ignore + // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc + // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) + // [current]..[p] 1 code point (c, c2) with cc + + // move the code units in between up + r=p; + do { + source[--r]=source[--current]; + } while (back!=current); + } } - int ch = UCharacterIterator.DONE; - StringBuffer dest = new StringBuffer(); - UCharacterIterator iter = UCharacterIterator.getInstance(str); - - while ((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ - switch (ch) { - case 0x2F868: - dest.append(corrigendum4MappingTable[0]); - break; - case 0x2F874: - dest.append(corrigendum4MappingTable[1]); - break; - case 0x2F91F: - dest.append(corrigendum4MappingTable[2]); - break; - case 0x2F95F: - dest.append(corrigendum4MappingTable[3]); - break; - case 0x2F9BF: - dest.append(corrigendum4MappingTable[4]); - break; - default: - UTF16.append(dest,ch); - break; + // insert (c1, c2) + source[current] = c1; + if (c2!=0) { + source[(current+1)] = c2; + } + + // we know the cc of the last code point + return trailCC; + } + + /** + * merge two UTF-16 string parts together + * to canonically order (order by combining classes) their concatenation + * + * the two strings may already be adjacent, so that the merging is done + * in-place if the two strings are not adjacent, then the buffer holding the + * first one must be large enough + * the second string may or may not be ordered in itself + * + * before: [start]..[current] is already ordered, and + * [next]..[limit] may be ordered in itself, but + * is not in relation to [start..current[ + * after: [start..current+(limit-next)[ is ordered + * + * the algorithm is a simple bubble-sort that takes the characters from + * src[next++] and inserts them in correct combining class order into the + * preceding part of the string + * + * since this function is called much less often than the single-code point + 
* insertOrdered(), it just uses that for easier maintenance + * + * @return the trailing combining class + */ + private static int /*unsigned byte*/ mergeOrdered(char[] source, + int start, + int current, + char[] data, + int next, + int limit) { + int r; + int /*unsigned byte*/ cc, trailCC=0; + boolean adjacent; + + adjacent= current==next; + NextCCArgs ncArgs = new NextCCArgs(); + ncArgs.source = data; + ncArgs.next = next; + ncArgs.limit = limit; + + if(start!=current) { + + while(ncArgs.next<ncArgs.limit) { + cc=getNextCC(ncArgs); + if(cc==0) { + // does not bubble back + trailCC=0; + if(adjacent) { + current=ncArgs.next; + } else { + data[current++]=ncArgs.c1; + if(ncArgs.c2!=0) { + data[current++]=ncArgs.c2; + } + } + break; + } else { + r=current+(ncArgs.c2==0 ? 1 : 2); + trailCC=insertOrdered(source,start, current, r, + ncArgs.c1, ncArgs.c2, cc); + current=r; + } + } + } + + if(ncArgs.next==ncArgs.limit) { + // we know the cc of the last code point + return trailCC; + } else { + if(!adjacent) { + // copy the second string part + do { + source[current++]=data[ncArgs.next++]; + } while(ncArgs.next!=ncArgs.limit); + ncArgs.limit=current; + } + PrevArgs prevArgs = new PrevArgs(); + prevArgs.src = data; + prevArgs.start = start; + prevArgs.current = ncArgs.limit; + return getPrevCC(prevArgs); } + + } + + private static final class PrevArgs{ + char[] src; + int start; + int current; + char c1; + char c2; + } + + private static final class NextCCArgs{ + char[] source; + int next; + int limit; + char c1; + char c2; + } + + private static int /*unsigned*/ getPrevCC(PrevArgs args) { + args.c1=args.src[--args.current]; + args.c2=0; + + if (args.c1 < MIN_CCC_LCCC_CP) { + return 0; + } else if (UTF16.isLeadSurrogate(args.c1)) { + /* unpaired first surrogate */ + return 0; + } else if (!UTF16.isTrailSurrogate(args.c1)) { + return UCharacter.getCombiningClass(args.c1); + } else if (args.current!=args.start && + UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { + 
--args.current; + return UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); + } else { + /* unpaired second surrogate */ + args.c2=0; + return 0; } + } + + private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { + args.c1=args.source[args.next++]; + args.c2=0; - return dest.toString(); + if (UTF16.isTrailSurrogate(args.c1)) { + /* unpaired second surrogate */ + return 0; + } else if (!UTF16.isLeadSurrogate(args.c1)) { + return UCharacter.getCombiningClass(args.c1); + } else if (args.next!=args.limit && + UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ + ++args.next; + return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); + } else { + /* unpaired first surrogate */ + args.c2=0; + return 0; + } } + + private VersionInfo dataVersion; + + // Code point thresholds for quick check codes. + private int minDecompNoCP; + private int minCompNoMaybeCP; + + // Norm16 value thresholds for quick check combinations and types of extra data. + private int minYesNo; + private int minYesNoMappingsOnly; + private int minNoNo; + private int limitNoNo; + private int minMaybeYes; + + private Trie2_16 normTrie; + private String maybeYesCompositions; + private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters + private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 + private int[] tccc180; // [0x180] tccc values for U+0000..U+017F + } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/Replaceable.java 2015-07-13 16:11:50.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Replaceable.java 2015-07-13 16:11:50.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -82,7 +82,7 @@ * @author Alan Liu * @stable ICU 2.0 */ -public interface Replaceable { +interface Replaceable { /** * Returns the number of 16-bit code units in the text. * @return number of 16-bit code units in text @@ -99,7 +99,6 @@ */ char charAt(int offset); - //// for StringPrep /** * Copies characters from this object into the destination * character array. The first character to be copied is at index --- old/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableString.java 2015-07-13 16:11:51.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableString.java 2015-07-13 16:11:51.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,13 +25,8 @@ /* ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2009, International Business Machines Corporation and * + * others. All Rights Reserved. 
* ******************************************************************************* */ @@ -51,7 +46,7 @@ * @author Alan Liu * @stable ICU 2.0 */ -public class ReplaceableString implements Replaceable { +class ReplaceableString implements Replaceable { private StringBuffer buf; @@ -64,7 +59,6 @@ buf = new StringBuffer(str); } - //// for StringPrep /** * Construct a new object using <code>buf</code> for internal * storage. The contents of <code>buf</code> at the time of @@ -98,7 +92,6 @@ return buf.charAt(offset); } - //// for StringPrep /** * Copies characters from this object into the destination * character array. The first character to be copied is at index @@ -118,6 +111,8 @@ * @stable ICU 2.0 */ public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) { - Utility.getChars(buf, srcStart, srcLimit, dst, dstStart); + if (srcStart != srcLimit) { + buf.getChars(srcStart, srcLimit, dst, dstStart); + } } } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableUCharacterIterator.java 2015-07-13 16:11:52.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/ReplaceableUCharacterIterator.java 2015-07-13 16:11:52.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -47,7 +47,7 @@ * * What are first, last, and getBeginIndex doing here?!?!?! 
*/ -public class ReplaceableUCharacterIterator extends UCharacterIterator { +class ReplaceableUCharacterIterator extends UCharacterIterator { // public constructor ------------------------------------------------------ @@ -63,7 +63,6 @@ this.currentIndex = 0; } - //// for StringPrep /** * Public constructor * @param buf buffer of text on which the iterator will be based @@ -164,7 +163,6 @@ this.currentIndex = currentIndex; } - //// for StringPrep public int getText(char[] fillIn, int offset){ int length = replaceable.length(); if(offset < 0 || offset + length > fillIn.length){ --- old/jdk/src/java.base/share/classes/sun/text/normalizer/Trie.java 2015-07-13 16:11:53.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Trie.java 2015-07-13 16:11:52.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,16 +22,12 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* + ****************************************************************************** + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ****************************************************************************** */ package sun.text.normalizer; @@ -135,93 +131,62 @@ unserialize(inputStream); } - /** - * Trie constructor - * @param index array to be used for index - * @param options used by the trie - * @param dataManipulate object containing the information to parse the - * trie data - */ - protected Trie(char index[], int options, DataManipulate dataManipulate) - { - m_options_ = options; - if(dataManipulate != null) { - m_dataManipulate_ = dataManipulate; - } else { - m_dataManipulate_ = new DefaultGetFoldingOffset(); - } - m_isLatin1Linear_ = (m_options_ & - HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; - m_index_ = index; - m_dataOffset_ = m_index_.length; - } - // protected data members ------------------------------------------ /** - * Lead surrogate code points' index displacement in the index array. - * <pre>{@code - * 0x10000-0xd800=0x2800 - * 0x2800 >> INDEX_STAGE_1_SHIFT_ - * }</pre> - */ + * Lead surrogate code points' index displacement in the index array. + * <pre>{@code + * 0x10000-0xd800=0x2800 + * 0x2800 >> INDEX_STAGE_1_SHIFT_ + * }</pre> + */ protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; /** - * Shift size for shifting right the input index. 1..9 - */ + * Shift size for shifting right the input index. 1..9 + */ protected static final int INDEX_STAGE_1_SHIFT_ = 5; /** - * Shift size for shifting left the index array values. - * Increases possible data size with 16-bit index values at the cost - * of compactability. - * This requires blocks of stage 2 data to be aligned by - * DATA_GRANULARITY. - * 0..INDEX_STAGE_1_SHIFT - */ + * Shift size for shifting left the index array values. + * Increases possible data size with 16-bit index values at the cost + * of compactability. + * This requires blocks of stage 2 data to be aligned by + * DATA_GRANULARITY. 
+ * 0..INDEX_STAGE_1_SHIFT + */ protected static final int INDEX_STAGE_2_SHIFT_ = 2; /** * Number of data values in a stage 2 (data array) block. */ protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; /** - * Mask for getting the lower bits from the input index. - * DATA_BLOCK_LENGTH - 1. - */ + * Mask for getting the lower bits from the input index. + * DATA_BLOCK_LENGTH - 1. + */ protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; - /** Number of bits of a trail surrogate that are used in index table lookups. */ - protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_; /** - * Number of index (stage 1) entries per lead surrogate. - * Same as number of index entries for 1024 trail surrogates, - * {@code ==0x400>>INDEX_STAGE_1_SHIFT_} - */ - protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS); - /** Length of the BMP portion of the index (stage 1) array. */ - protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_; - /** - * Surrogate mask to use when shifting offset to retrieve supplementary - * values - */ + * Surrogate mask to use when shifting offset to retrieve supplementary + * values + */ protected static final int SURROGATE_MASK_ = 0x3FF; /** - * Index or UTF16 characters - */ + * Index or UTF16 characters + */ protected char m_index_[]; /** - * Internal TrieValue which handles the parsing of the data value. - * This class is to be implemented by the user - */ + * Internal TrieValue which handles the parsing of the data value. + * This class is to be implemented by the user + */ protected DataManipulate m_dataManipulate_; /** - * Start index of the data portion of the trie. CharTrie combines - * index and data into a char array, so this is used to indicate the - * initial offset to the data portion. - * Note this index always points to the initial value. - */ + * Start index of the data portion of the trie. 
CharTrie combines + * index and data into a char array, so this is used to indicate the + * initial offset to the data portion. + * Note this index always points to the initial value. + */ protected int m_dataOffset_; /** - * Length of the data array - */ + * Length of the data array + */ protected int m_dataLength_; // protected methods ----------------------------------------------- @@ -235,19 +200,6 @@ protected abstract int getSurrogateOffset(char lead, char trail); /** - * Gets the value at the argument index - * @param index value at index will be retrieved - * @return 32 bit value - */ - protected abstract int getValue(int index); - - /** - * Gets the default initial value - * @return 32 bit value - */ - protected abstract int getInitialValue(); - - /** * Gets the offset to the data which the index ch after variable offset * points to. * Note for locating a non-supplementary character data offset, calling @@ -297,13 +249,13 @@ } /** - * Internal trie getter from a code point. - * Could be faster(?) but longer with - * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} - * Gets the offset to data which the codepoint points to - * @param ch codepoint - * @return offset to data - */ + * Internal trie getter from a code point. + * Could be faster(?) 
but longer with + * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} + * Gets the offset to data which the codepoint points to + * @param ch codepoint + * @return offset to data + */ protected final int getCodePointOffset(int ch) { // if ((ch >> 16) == 0) slower @@ -321,7 +273,7 @@ return getSurrogateOffset(UTF16.getLeadSurrogate(ch), (char)(ch & SURROGATE_MASK_)); } else { - // return -1 // if there is an error, in this case we return + // return -1 if there is an error, in this case we return return -1; } } @@ -343,15 +295,6 @@ } /** - * Determines if this is a 32 bit trie - * @return true if options specifies this is a 32 bit trie - */ - protected final boolean isIntTrie() - { - return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0; - } - - /** * Determines if this is a 16 bit trie * @return true if this is a 16 bit trie */ @@ -363,8 +306,8 @@ // private data members -------------------------------------------- /** - * Latin 1 option mask - */ + * Latin 1 option mask + */ protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; /** * Constant number to authenticate the byte block @@ -378,28 +321,28 @@ protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; /** - * Flag indicator for Latin quick access data block - */ + * Flag indicator for Latin quick access data block + */ private boolean m_isLatin1Linear_; /** - * <p>Trie options field.</p> - * <p>options bit field:<br> - * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> - * 8 0 = 16-bit data, 1=32-bit data<br> - * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> - * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> - */ + * <p>Trie options field.</p> + * <p>options bit field:<br> + * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> + * 8 0 = 16-bit data, 1=32-bit data<br> + * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> + * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> + */ private int m_options_; // private methods 
--------------------------------------------------- /** - * Authenticates raw data header. - * Checking the header information, signature and options. - * @param signature This contains the options and type of a Trie - * @return true if the header is authenticated valid - */ + * Authenticates raw data header. + * Checking the header information, signature and options. + * @param signature This contains the options and type of a Trie + * @return true if the header is authenticated valid + */ private final boolean checkHeader(int signature) { // check the signature --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UBiDiProps.java 2015-07-13 16:11:53.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UBiDiProps.java 2015-07-13 16:11:53.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,74 +24,71 @@ */ /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * + * Copyright (C) 2004-2014, International Business Machines + * Corporation and others. All Rights Reserved. + * ******************************************************************************* -* file name: UBiDiProps.java -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2005jan16 -* created by: Markus W. 
Scherer -* -* Low-level Unicode bidi/shaping properties access. -* Java port of ubidi_props.h/.c. -*/ + * file name: UBiDiProps.java + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2005jan16 + * created by: Markus W. Scherer + * + * Low-level Unicode bidi/shaping properties access. + * Java port of ubidi_props.h/.c. + */ package sun.text.normalizer; -import java.io.BufferedInputStream; -import java.io.DataInputStream; -import java.io.InputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.MissingResourceException; public final class UBiDiProps { // constructors etc. --------------------------------------------------- *** // port of ubidi_openProps() - public UBiDiProps() throws IOException{ - InputStream is=ICUData.getStream(DATA_FILE_NAME); - BufferedInputStream b=new BufferedInputStream(is, 4096 /* data buffer size */); - readData(b); - b.close(); - is.close(); - + private UBiDiProps() throws IOException{ + ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME); + readData(bytes); } - private void readData(InputStream is) throws IOException { - DataInputStream inputStream=new DataInputStream(is); - + private void readData(ByteBuffer bytes) throws IOException { // read the header - ICUBinary.readHeader(inputStream, FMT, new IsAcceptable()); + ICUBinary.readHeader(bytes, FMT, new IsAcceptable()); // read indexes[] int i, count; - count=inputStream.readInt(); - if(count<IX_INDEX_TOP) { + count=bytes.getInt(); + if(count<IX_TOP) { throw new IOException("indexes[0] too small in "+DATA_FILE_NAME); } indexes=new int[count]; indexes[0]=count; for(i=1; i<count; ++i) { - indexes[i]=inputStream.readInt(); + indexes[i]=bytes.getInt(); } // read the trie - trie=new CharTrie(inputStream, null); + trie=Trie2_16.createFromSerialized(bytes); + int expectedTrieLength=indexes[IX_TRIE_SIZE]; + int trieLength=trie.getSerializedLength(); + if(trieLength>expectedTrieLength) { + throw new 
IOException(DATA_FILE_NAME+": not enough bytes for the trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength); // read mirrors[] count=indexes[IX_MIRROR_LENGTH]; if(count>0) { mirrors=new int[count]; for(i=0; i<count; ++i) { - mirrors[i]=inputStream.readInt(); + mirrors[i]=bytes.getInt(); } } @@ -99,81 +96,172 @@ count=indexes[IX_JG_LIMIT]-indexes[IX_JG_START]; jgArray=new byte[count]; for(i=0; i<count; ++i) { - jgArray[i]=inputStream.readByte(); + jgArray[i]=bytes.get(); + } + + // read jgArray2[] + count=indexes[IX_JG_LIMIT2]-indexes[IX_JG_START2]; + jgArray2=new byte[count]; + for(i=0; i<count; ++i) { + jgArray2[i]=bytes.get(); } } // implement ICUBinary.Authenticate - private final class IsAcceptable implements ICUBinary.Authenticate { + private final static class IsAcceptable implements ICUBinary.Authenticate { public boolean isDataVersionAcceptable(byte version[]) { - return version[0]==1 && - version[2]==Trie.INDEX_STAGE_1_SHIFT_ && version[3]==Trie.INDEX_STAGE_2_SHIFT_; + return version[0]==2; } } - // UBiDiProps singleton - private static UBiDiProps gBdp=null; + // property access functions ------------------------------------------- *** - // port of ubidi_getSingleton() - public static final synchronized UBiDiProps getSingleton() throws IOException { - if(gBdp==null) { - gBdp=new UBiDiProps(); + public final int getClass(int c) { + return getClassFromProps(trie.get(c)); + } + + private final int getMirror(int c, int props) { + int delta=getMirrorDeltaFromProps(props); + if(delta!=ESC_MIRROR_DELTA) { + return c+delta; + } else { + /* look for mirror code point in the mirrors[] table */ + int m; + int i, length; + int c2; + + length=indexes[IX_MIRROR_LENGTH]; + + /* linear search */ + for(i=0; i<length; ++i) { + m=mirrors[i]; + c2=getMirrorCodePoint(m); + if(c==c2) { + /* found c, return its mirror code point using the index in m */ + return getMirrorCodePoint(mirrors[getMirrorIndex(m)]); + } else if(c<c2) 
{ + break; + } + } + + /* c not found, return it itself */ + return c; } - return gBdp; } - // UBiDiProps dummy singleton - private static UBiDiProps gBdpDummy=null; + public final int getMirror(int c) { + int props=trie.get(c); + return getMirror(c, props); + } - private UBiDiProps(boolean makeDummy) { // ignore makeDummy, only creates a unique signature - indexes=new int[IX_TOP]; - indexes[0]=IX_TOP; - trie=new CharTrie(0, 0, null); // dummy trie, always returns 0 + public final int getJoiningType(int c) { + return (trie.get(c)&JT_MASK)>>JT_SHIFT; } - /** - * Get a singleton dummy object, one that works with no real data. - * This can be used when the real data is not available. - * Using the dummy can reduce checks for available data after an initial failure. - * Port of ucase_getDummy(). - */ - public static final synchronized UBiDiProps getDummy() { - if(gBdpDummy==null) { - gBdpDummy=new UBiDiProps(true); + public final int getJoiningGroup(int c) { + int start, limit; + + start=indexes[IX_JG_START]; + limit=indexes[IX_JG_LIMIT]; + if(start<=c && c<limit) { + return (int)jgArray[c-start]&0xff; } - return gBdpDummy; + start=indexes[IX_JG_START2]; + limit=indexes[IX_JG_LIMIT2]; + if(start<=c && c<limit) { + return (int)jgArray2[c-start]&0xff; + } + return UCharacter.JoiningGroup.NO_JOINING_GROUP; } - public final int getClass(int c) { - return getClassFromProps(trie.getCodePointValue(c)); + public final int getPairedBracketType(int c) { + return (trie.get(c)&BPT_MASK)>>BPT_SHIFT; + } + + public final int getPairedBracket(int c) { + int props=trie.get(c); + if((props&BPT_MASK)==0) { + return c; + } else { + return getMirror(c, props); + } } // data members -------------------------------------------------------- *** private int indexes[]; private int mirrors[]; private byte jgArray[]; + private byte jgArray2[]; - private CharTrie trie; + private Trie2_16 trie; // data format constants ----------------------------------------------- *** private static final String 
DATA_FILE_NAME = "/sun/text/resources/ubidi.icu"; /* format "BiDi" */ - private static final byte FMT[]={ 0x42, 0x69, 0x44, 0x69 }; + private static final int FMT=0x42694469; /* indexes into indexes[] */ - private static final int IX_INDEX_TOP=0; + private static final int IX_TRIE_SIZE=2; private static final int IX_MIRROR_LENGTH=3; private static final int IX_JG_START=4; private static final int IX_JG_LIMIT=5; + private static final int IX_JG_START2=6; /* new in format version 2.2, ICU 54 */ + private static final int IX_JG_LIMIT2=7; private static final int IX_TOP=16; + // definitions for 16-bit bidi/shaping properties word ----------------- *** + + /* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */ + private static final int JT_SHIFT=5; /* joining type: 3 bits (7..5) */ + + private static final int BPT_SHIFT=8; /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */ + + private static final int MIRROR_DELTA_SHIFT=13; /* bidi mirroring delta: 3 bits (15..13) */ + private static final int CLASS_MASK= 0x0000001f; + private static final int JT_MASK= 0x000000e0; + private static final int BPT_MASK= 0x00000300; private static final int getClassFromProps(int props) { return props&CLASS_MASK; } + private static final boolean getFlagFromProps(int props, int shift) { + return ((props>>shift)&1)!=0; + } + private static final int getMirrorDeltaFromProps(int props) { + return (short)props>>MIRROR_DELTA_SHIFT; + } + + private static final int ESC_MIRROR_DELTA=-4; + + // definitions for 32-bit mirror table entry --------------------------- *** + + /* the source Unicode code point takes 21 bits (20..0) */ + private static final int MIRROR_INDEX_SHIFT=21; + + private static final int getMirrorCodePoint(int m) { + return m&0x1fffff; + } + private static final int getMirrorIndex(int m) { + return m>>>MIRROR_INDEX_SHIFT; + } + + /* + * public singleton instance + */ + public static final UBiDiProps INSTANCE; + + // This static initializer block must be placed after + // other static 
member initialization + static { + try { + INSTANCE = new UBiDiProps(); + } catch (IOException e) { + throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME,""); + } + } } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacter.java 2015-07-13 16:11:54.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacter.java 2015-07-13 16:11:54.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,40 +22,30 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ -/* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ -package sun.text.normalizer; +/** +******************************************************************************* +* Copyright (C) 1996-2014, International Business Machines Corporation and +* others. All Rights Reserved. 
+******************************************************************************* +*/ -import java.io.IOException; -import java.util.MissingResourceException; +package sun.text.normalizer; /** - * <p> - * The UCharacter class provides extensions to the - * <a href="http://docs.oracle.com/javase/1.5.0/docs/api/java/lang/Character.html"> + * <p>The UCharacter class provides extensions to the + * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html"> * java.lang.Character</a> class. These extensions provide support for * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a> * class, provide support for supplementary characters (those with code * points above U+FFFF). * Each ICU release supports the latest version of Unicode available at that time. - * </p> - * <p> - * Code points are represented in these API using ints. While it would be + * + * <p>Code points are represented in these API using ints. While it would be * more convenient in Java to have a separate primitive datatype for them, * ints suffice in the meantime. - * </p> - * <p> - * To use this class please add the jar file name icu4j.jar to the + * + * <p>To use this class please add the jar file name icu4j.jar to the * class path, since it contains data files which supply the information used * by this file.<br> * E.g. In Windows <br> @@ -64,9 +54,8 @@ * unames.icu from the icu4j source subdirectory * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>. - * </p> - * <p> - * Aside from the additions for UTF-16 support, and the updated Unicode + * + * <p>Aside from the additions for UTF-16 support, and the updated Unicode * properties, the main differences between UCharacter and Character are: * <ul> * <li> UCharacter is not designed to be a char wrapper and does not have @@ -87,8 +76,9 @@ * as having numeric values. This is a semantic change from ICU4J 1.3.1. 
* </ul> * <p> - * Further detail differences can be determined from the program - * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java"> + * Further detail on differences can be determined using the program + * <a href= + * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java"> * com.ibm.icu.dev.test.lang.UCharacterCompare</a> * </p> * <p> @@ -103,8 +93,11 @@ * </p> * <p> * For more information see - * "About the Unicode Character Database" (http://www.unicode.org/ucd/) - * and the ICU User Guide chapter on Properties (http://www.icu-project.org/userguide/properties.html). + * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a> + * (http://www.unicode.org/ucd/) + * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU + * User Guide chapter on Properties</a> + * (http://www.icu-project.org/userguide/properties.html). * </p> * <p> * There are also functions that provide easy migration from C/POSIX functions @@ -128,12 +121,15 @@ * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). 
* </p> - * <pre>{@code + * <p> * API access for C/POSIX character classes is as follows: + * <pre>{@code * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC) * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE) * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE) - * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|(1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|(1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0 + * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)| + * (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)| + * (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0 * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT) * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM) @@ -143,21 +139,22 @@ * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH) * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT) * }</pre> + * </p> * <p> * The C/POSIX character classes are also available in UnicodeSet patterns, * using patterns like [:graph:] or \p{graph}. * </p> - * <p> - * Note: There are several ICU (and Java) whitespace functions. - * Comparison: - * - isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; + * + * There are several ICU (and Java) whitespace functions. 
+ * Comparison:<ul> + * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; * most of general categories "Z" (separators) + most whitespace ISO controls * (including no-break spaces, but excluding IS1..IS4 and ZWSP) - * - isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces - * - isSpaceChar: just Z (including no-break spaces) + * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces + * <li> isSpaceChar: just Z (including no-break spaces)</ul> * </p> * <p> - * This class is not subclassable + * This class is not subclassable. * </p> * @author Syn Wee Quek * @stable ICU 2.1 @@ -168,6 +165,19 @@ { /** + * Joining Group constants. + * @see UProperty#JOINING_GROUP + * @stable ICU 2.4 + */ + public static interface JoiningGroup + { + /** + * @stable ICU 2.4 + */ + public static final int NO_JOINING_GROUP = 0; + } + + /** * Numeric Type constants. * @see UProperty#NUMERIC_TYPE * @stable ICU 2.4 @@ -177,7 +187,61 @@ /** * @stable ICU 2.4 */ + public static final int NONE = 0; + /** + * @stable ICU 2.4 + */ public static final int DECIMAL = 1; + /** + * @stable ICU 2.4 + */ + public static final int DIGIT = 2; + /** + * @stable ICU 2.4 + */ + public static final int NUMERIC = 3; + /** + * @stable ICU 2.4 + */ + public static final int COUNT = 4; + } + + /** + * Hangul Syllable Type constants. 
+ * + * @see UProperty#HANGUL_SYLLABLE_TYPE + * @stable ICU 2.6 + */ + public static interface HangulSyllableType + { + /** + * @stable ICU 2.6 + */ + public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/ + /** + * @stable ICU 2.6 + */ + public static final int LEADING_JAMO = 1; /*[L]*/ + /** + * @stable ICU 2.6 + */ + public static final int VOWEL_JAMO = 2; /*[V]*/ + /** + * @stable ICU 2.6 + */ + public static final int TRAILING_JAMO = 3; /*[T]*/ + /** + * @stable ICU 2.6 + */ + public static final int LV_SYLLABLE = 4; /*[LV]*/ + /** + * @stable ICU 2.6 + */ + public static final int LVT_SYLLABLE = 5; /*[LVT]*/ + /** + * @stable ICU 2.6 + */ + public static final int COUNT = 6; } // public data members ----------------------------------------------- @@ -192,22 +256,15 @@ * The highest Unicode code point value (scalar value) according to the * Unicode Standard. * This is a 21-bit value (21 bits, rounded up).<br> - * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE + * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE * @stable ICU 2.1 */ public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE; - /** - * The minimum value for Supplementary code points - * @stable ICU 2.1 - */ - public static final int SUPPLEMENTARY_MIN_VALUE = - UTF16.SUPPLEMENTARY_MIN_VALUE; - // public methods ---------------------------------------------------- /** - * Retrieves the numeric value of a decimal digit code point. + * Returns the numeric value of a decimal digit code point. * <br>This method observes the semantics of * <code>java.lang.Character.digit()</code>. 
Note that this * will return positive values for code points for which isDigit @@ -231,15 +288,54 @@ */ public static int digit(int ch, int radix) { - // when ch is out of bounds getProperty == 0 - int props = getProperty(ch); - int value; - if (getNumericType(props) == NumericType.DECIMAL) { - value = UCharacterProperty.getUnsignedValue(props); + if (2 <= radix && radix <= 36) { + int value = digit(ch); + if (value < 0) { + // ch is not a decimal digit, try latin letters + value = UCharacterProperty.getEuropeanDigit(ch); + } + return (value < radix) ? value : -1; } else { - value = getEuropeanDigit(ch); + return -1; // invalid radix } - return (0 <= value && value < radix) ? value : -1; + } + + /** + * Returns the numeric value of a decimal digit code point. + * <br>This is a convenience overload of <code>digit(int, int)</code> + * that provides a decimal radix. + * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this + * treated numeric letters and other numbers as digits. This has + * been changed to conform to the java semantics. + * @param ch the code point to query + * @return the numeric value represented by the code point, + * or -1 if the code point is not a decimal digit or if its + * value is too large for a decimal radix + * @stable ICU 2.1 + */ + public static int digit(int ch) + { + return UCharacterProperty.INSTANCE.digit(ch); + } + + /** + * Returns a value indicating a code point's Unicode category. + * Up-to-date Unicode implementation of java.lang.Character.getType() + * except for the above mentioned code points that had their category + * changed.<br> + * Return results are constants from the interface + * <a href=UCharacterCategory.html>UCharacterCategory</a><br> + * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with + * those returned by java.lang.Character.getType. 
UCharacterCategory values + * match the ones used in ICU4C, while java.lang.Character type + * values, though similar, skip the value 17.</p> + * @param ch code point whose type is to be determined + * @return category which is a value of UCharacterCategory + * @stable ICU 2.1 + */ + public static int getType(int ch) + { + return UCharacterProperty.INSTANCE.getType(ch); } /** @@ -254,7 +350,67 @@ */ public static int getDirection(int ch) { - return gBdp.getClass(ch); + return UBiDiProps.INSTANCE.getClass(ch); + } + + /** + * Maps the specified code point to a "mirror-image" code point. + * For code points with the "mirrored" property, implementations sometimes + * need a "poor man's" mapping to another code point such that the default + * glyph may serve as the mirror-image of the default glyph of the + * specified code point.<br> + * This is useful for text conversion to and from codepages with visual + * order, and for displays without glyph selection capabilities. + * @param ch code point whose mirror is to be retrieved + * @return another code point that may serve as a mirror-image substitute, + * or ch itself if there is no such mapping or ch does not have the + * "mirrored" property + * @stable ICU 2.1 + */ + public static int getMirror(int ch) + { + return UBiDiProps.INSTANCE.getMirror(ch); + } + + /** + * Maps the specified character to its paired bracket character. + * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int). + * Otherwise c itself is returned. 
+ * See http://www.unicode.org/reports/tr9/ + * + * @param c the code point to be mapped + * @return the paired bracket code point, + * or c itself if there is no such mapping + * (Bidi_Paired_Bracket_Type=None) + * + * @see UProperty#BIDI_PAIRED_BRACKET + * @see UProperty#BIDI_PAIRED_BRACKET_TYPE + * @see #getMirror(int) + * @stable ICU 52 + */ + public static int getBidiPairedBracket(int c) { + return UBiDiProps.INSTANCE.getPairedBracket(c); + } + + /** + * Returns the combining class of the argument codepoint + * @param ch code point whose combining is to be retrieved + * @return the combining class of the codepoint + * @stable ICU 2.1 + */ + public static int getCombiningClass(int ch) + { + return Normalizer2.getNFDInstance().getCombiningClass(ch); + } + + /** + * Returns the version of Unicode data used. + * @return the unicode version number used + * @stable ICU 2.1 + */ + public static VersionInfo getUnicodeVersion() + { + return UCharacterProperty.INSTANCE.m_unicodeVersion_; } /** @@ -275,7 +431,7 @@ } /** - * <p>Get the "age" of the code point.</p> + * Returns the "age" of the code point.</p> * <p>The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character. @@ -289,143 +445,95 @@ public static VersionInfo getAge(int ch) { if (ch < MIN_VALUE || ch > MAX_VALUE) { - throw new IllegalArgumentException("Codepoint out of bounds"); + throw new IllegalArgumentException("Codepoint out of bounds"); } - return PROPERTY_.getAge(ch); + return UCharacterProperty.INSTANCE.getAge(ch); } - // private variables ------------------------------------------------- - - /** - * Database storing the sets of character property - */ - private static final UCharacterProperty PROPERTY_; /** - * For optimization + * Returns the property value for an Unicode property type of a code point. 
+ * Also returns binary and mask property values.</p> + * <p>Unicode, especially in version 3.2, defines many more properties than + * the original set in UnicodeData.txt.</p> + * <p>The properties APIs are intended to reflect Unicode properties as + * defined in the Unicode Character Database (UCD) and Unicode Technical + * Reports (UTR). For details about the properties see + * http://www.unicode.org/.</p> + * <p>For names of Unicode properties see the UCD file PropertyAliases.txt. + * </p> + * <pre> + * Sample usage: + * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH); + * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC); + * boolean b = (ideo == 1) ? true : false; + * </pre> + * @param ch code point to test. + * @param type UProperty selector constant, identifies which binary + * property to check. Must be + * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or + * UProperty.INT_START <= type < UProperty.INT_LIMIT or + * UProperty.MASK_START <= type < UProperty.MASK_LIMIT. + * @return numeric value that is directly the property value or, + * for enumerated properties, corresponds to the numeric value of + * the enumerated constant of the respective property value + * enumeration type (cast to enum type if necessary). + * Returns 0 or 1 (for false / true) for binary Unicode properties. + * Returns a bit-mask for mask properties. + * Returns 0 if 'type' is out of bounds or if the Unicode version + * does not have data for the property at all, or not for this code + * point. 
+ * @see UProperty + * @see #hasBinaryProperty + * @see #getIntPropertyMinValue + * @see #getIntPropertyMaxValue + * @see #getUnicodeVersion + * @stable ICU 2.4 */ - private static final char[] PROPERTY_TRIE_INDEX_; - private static final char[] PROPERTY_TRIE_DATA_; - private static final int PROPERTY_INITIAL_VALUE_; - - private static final UBiDiProps gBdp; - - // block to initialise character property database - static - { - try - { - PROPERTY_ = UCharacterProperty.getInstance(); - PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_; - PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_; - PROPERTY_INITIAL_VALUE_ = PROPERTY_.m_trieInitialValue_; - } - catch (Exception e) - { - throw new MissingResourceException(e.getMessage(),"",""); - } - - UBiDiProps bdp; - try { - bdp=UBiDiProps.getSingleton(); - } catch(IOException e) { - bdp=UBiDiProps.getDummy(); - } - gBdp=bdp; + // for BiDiBase.java + public static int getIntPropertyValue(int ch, int type) { + return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type); } - /** - * Shift to get numeric type - */ - private static final int NUMERIC_TYPE_SHIFT_ = 5; - /** - * Mask to get numeric type - */ - private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_; - - // private methods --------------------------------------------------- + // private constructor ----------------------------------------------- /** - * Getting the digit values of characters like 'A' - 'Z', normal, - * half-width and full-width. This method assumes that the other digit - * characters are checked by the calling method. - * @param ch character to test - * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise - * its corresponding digit will be returned. 
+ * Private constructor to prevent instantiation */ - private static int getEuropeanDigit(int ch) { - if ((ch > 0x7a && ch < 0xff21) - || ch < 0x41 || (ch > 0x5a && ch < 0x61) - || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { - return -1; - } - if (ch <= 0x7a) { - // ch >= 0x41 or ch < 0x61 - return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); - } - // ch >= 0xff21 - if (ch <= 0xff3a) { - return ch + 10 - 0xff21; - } - // ch >= 0xff41 && ch <= 0xff5a - return ch + 10 - 0xff41; - } + private UCharacter() { } - /** - * Gets the numeric type of the property argument - * @param props 32 bit property - * @return the numeric type - */ - private static int getNumericType(int props) - { - return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_; - } + /* + * Copied from UCharacterEnums.java + */ - /** - * Gets the property value at the index. - * This is optimized. - * Note this is alittle different from CharTrie the index m_trieData_ - * is never negative. - * This is a duplicate of UCharacterProperty.getProperty. For optimization - * purposes, this method calls the trie data directly instead of through - * UCharacterProperty.getProperty. 
- * @param ch code point whose property value is to be retrieved - * @return property value of code point - * @stable ICU 2.6 - */ - private static final int getProperty(int ch) - { - if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE - || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE - && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { - // BMP codepoint 0000..D7FF or DC00..FFFF - try { // using try for ch < 0 is faster than using an if statement - return PROPERTY_TRIE_DATA_[ - (PROPERTY_TRIE_INDEX_[ch >> 5] << 2) - + (ch & 0x1f)]; - } catch (ArrayIndexOutOfBoundsException e) { - return PROPERTY_INITIAL_VALUE_; - } - } - if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - // lead surrogate D800..DBFF - return PROPERTY_TRIE_DATA_[ - (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2) - + (ch & 0x1f)]; - } - // for optimization - if (ch <= UTF16.CODEPOINT_MAX_VALUE) { - // supplementary code point 10000..10FFFF - // look at the construction of supplementary characters - // trail forms the ends of it. - return PROPERTY_.m_trie_.getSurrogateValue( - UTF16.getLeadSurrogate(ch), - (char)(ch & 0x3ff)); - } - // return m_dataOffset_ if there is an error, in this case we return - // the default value: m_initialValue_ - // we cannot assume that m_initialValue_ is at offset 0 - // this is for optimization. 
- return PROPERTY_INITIAL_VALUE_; - } + /** + * Character type Mn + * @stable ICU 2.1 + */ + public static final byte NON_SPACING_MARK = 6; + /** + * Character type Me + * @stable ICU 2.1 + */ + public static final byte ENCLOSING_MARK = 7; + /** + * Character type Mc + * @stable ICU 2.1 + */ + public static final byte COMBINING_SPACING_MARK = 8; + /** + * Character type count + * @stable ICU 2.1 + */ + public static final byte CHAR_CATEGORY_COUNT = 30; + /** + * Directional type R + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT = 1; + /** + * Directional type AL + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_ARABIC = 13; } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterIterator.java 2015-07-13 16:11:55.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterIterator.java 2015-07-13 16:11:55.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2006, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,13 +25,8 @@ /* ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2014, International Business Machines Corporation and * + * others. All Rights Reserved. 
* ******************************************************************************* */ @@ -84,7 +79,6 @@ return new ReplaceableUCharacterIterator(source); } - //// for StringPrep /** * Returns a <code>UCharacterIterator</code> object given a * source StringBuffer. @@ -97,7 +91,7 @@ return new ReplaceableUCharacterIterator(source); } - /** + /** * Returns a <code>UCharacterIterator</code> object given a * CharacterIterator. * @param source a valid CharacterIterator object. @@ -112,21 +106,12 @@ // public methods ---------------------------------------------------------- /** - * Returns the code unit at the current index. If index is out - * of range, returns DONE. Index is not changed. - * @return current code unit - * @stable ICU 2.4 - */ - public abstract int current(); - - /** * Returns the length of the text * @return length of the text * @stable ICU 2.4 */ public abstract int getLength(); - /** * Gets the current index in text. * @return current index in text. @@ -134,7 +119,6 @@ */ public abstract int getIndex(); - /** * Returns the UTF16 code unit at index, and increments to the next * code unit (post-increment semantics). If index is out of @@ -183,6 +167,33 @@ */ public abstract int previous(); + + /** + * Retreat to the start of the previous code point in the text, + * and return it (pre-decrement semantics). If the index is not + * preceded by a valid surrogate pair, the behavior is the same + * as <code>previous()</code>. Otherwise the iterator is + * decremented to the start of the surrogate pair, and the code + * point represented by the pair is returned. + * @return the previous code point in the text, or DONE if the new + * index is before the start of the text. 
+ * @stable ICU 2.4 + */ + public int previousCodePoint(){ + int ch1 = previous(); + if(UTF16.isTrailSurrogate((char)ch1)){ + int ch2 = previous(); + if(UTF16.isLeadSurrogate((char)ch2)){ + return UCharacterProperty.getRawSupplementary((char)ch2, + (char)ch1); + }else if (ch2 != DONE) { + //unmatched trail surrogate so back out + next(); + } + } + return ch1; + } + /** * Sets the index to the specified index in the text. * @param index the index within the text. @@ -192,7 +203,14 @@ */ public abstract void setIndex(int index); - //// for StringPrep + /** + * Sets the current index to the start. + * @stable ICU 2.4 + */ + public void setToStart() { + setIndex(0); + } + /** * Fills the buffer with the underlying text storage of the iterator * If the buffer capacity is not enough a exception is thrown. The capacity @@ -222,20 +240,19 @@ * units. * @param offset the position within the array to start putting the data. * @return the number of code units added to fillIn, as a convenience - * @exception IndexOutOfBounds exception if there is not enough - * room after offset in the array, or if offset {@literal <} 0. + * @exception IndexOutOfBoundsException exception if there is not enough + * room after offset in the array, or if offset < 0. * @stable ICU 2.4 */ public abstract int getText(char[] fillIn, int offset); - //// for StringPrep /** * Convenience override for <code>getText(char[], int)</code> that provides * an offset of 0. * @param fillIn an array of chars to fill with the underlying UTF-16 code * units. * @return the number of code units added to fillIn, as a convenience - * @exception IndexOutOfBounds exception if there is not enough + * @exception IndexOutOfBoundsException exception if there is not enough * room in the array. 
* @stable ICU 2.4 */ @@ -243,7 +260,6 @@ return getText(fillIn, 0); } - //// for StringPrep /** * Convenience method for returning the underlying text storage as a string * @return the underlying text storage in the iterator as a string @@ -256,25 +272,32 @@ } /** - * Moves the current position by the number of code units - * specified, either forward or backward depending on the sign - * of delta (positive or negative respectively). If the resulting - * index would be less than zero, the index is set to zero, and if - * the resulting index would be greater than limit, the index is - * set to limit. - * - * @param delta the number of code units to move the current - * index. - * @return the new index. - * @exception IndexOutOfBoundsException is thrown if an invalid index is + * Moves the current position by the number of code points + * specified, either forward or backward depending on the sign of + * delta (positive or negative respectively). If the current index + * is at a trail surrogate then the first adjustment is by code + * unit, and the remaining adjustments are by code points. If the + * resulting index would be less than zero, the index is set to + * zero, and if the resulting index would be greater than limit, + * the index is set to limit. + * @param delta the number of code points to move the current index. 
+ * @return the new index + * @exception IndexOutOfBoundsException is thrown if an invalid delta is * supplied * @stable ICU 2.4 * */ - public int moveIndex(int delta) { - int x = Math.max(0, Math.min(getIndex() + delta, getLength())); - setIndex(x); - return x; + public int moveCodePointIndex(int delta){ + if(delta>0){ + while(delta>0 && nextCodePoint() != DONE){delta--;} + }else{ + while(delta<0 && previousCodePoint() != DONE){delta++;} + } + if(delta!=0){ + throw new IndexOutOfBoundsException(); + } + + return getIndex(); } /** --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2015-07-13 16:11:55.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,23 +24,21 @@ */ /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. 
******************************************************************************* */ package sun.text.normalizer; -import java.io.BufferedInputStream; -import java.io.InputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; import java.util.MissingResourceException; +import sun.text.normalizer.UCharacter.HangulSyllableType; +import sun.text.normalizer.UCharacter.NumericType; + /** * <p>Internal class used for Unicode character property database.</p> * <p>This classes store binary data read from uprops.icu. @@ -56,134 +54,72 @@ * @since release 2.1, february 1st 2002 */ -public final class UCharacterProperty +final class UCharacterProperty { // public data members ----------------------------------------------- + /* + * public singleton instance + */ + public static final UCharacterProperty INSTANCE; + /** * Trie data */ - public CharTrie m_trie_; - /** - * Optimization - * CharTrie index array - */ - public char[] m_trieIndex_; - /** - * Optimization - * CharTrie data array - */ - public char[] m_trieData_; - /** - * Optimization - * CharTrie data offset - */ - public int m_trieInitialValue_; + public Trie2_16 m_trie_; + /** * Unicode version */ public VersionInfo m_unicodeVersion_; + /** + * Character type mask + */ + public static final int TYPE_MASK = 0x1F; + // uprops.h enum UPropertySource --------------------------------------- *** + /** From uchar.c/uprops.icu main trie */ + public static final int SRC_CHAR=1; /** From uchar.c/uprops.icu properties vectors trie */ public static final int SRC_PROPSVEC=2; - /** One more than the highest UPropertySource (SRC_) constant. 
*/ - public static final int SRC_COUNT=9; + /** From ubidi_props.c/ubidi.icu */ + public static final int SRC_BIDI=5; + /** From normalizer2impl.cpp/nfc.nrm */ + public static final int SRC_NFC=8; + /** From normalizer2impl.cpp/nfkc.nrm */ + public static final int SRC_NFKC=9; // public methods ---------------------------------------------------- /** - * Java friends implementation - */ - public void setIndexData(CharTrie.FriendAgent friendagent) - { - m_trieIndex_ = friendagent.getPrivateIndex(); - m_trieData_ = friendagent.getPrivateData(); - m_trieInitialValue_ = friendagent.getPrivateInitialValue(); - } - - /** - * Gets the property value at the index. - * This is optimized. - * Note this is alittle different from CharTrie the index m_trieData_ - * is never negative. + * Gets the main property value for code point ch. * @param ch code point whose property value is to be retrieved * @return property value of code point */ public final int getProperty(int ch) { - if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE - || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE - && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { - // BMP codepoint 0000..D7FF or DC00..FFFF - // optimized - try { // using try for ch < 0 is faster than using an if statement - return m_trieData_[ - (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } catch (ArrayIndexOutOfBoundsException e) { - return m_trieInitialValue_; - } - } - if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - // lead surrogate D800..DBFF - return m_trieData_[ - (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ - + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] - << Trie.INDEX_STAGE_2_SHIFT_) - + (ch & Trie.INDEX_STAGE_3_MASK_)]; - } - if (ch <= UTF16.CODEPOINT_MAX_VALUE) { - // supplementary code point 10000..10FFFF - // look at the construction of supplementary characters - // trail forms the ends of it. 
- return m_trie_.getSurrogateValue( - UTF16.getLeadSurrogate(ch), - (char)(ch & Trie.SURROGATE_MASK_)); - } - // ch is out of bounds - // return m_dataOffset_ if there is an error, in this case we return - // the default value: m_initialValue_ - // we cannot assume that m_initialValue_ is at offset 0 - // this is for optimization. - return m_trieInitialValue_; - - // this all is an inlined form of return m_trie_.getCodePointValue(ch); - } - - /** - * Getting the unsigned numeric value of a character embedded in the property - * argument - * @param prop the character - * @return unsigned numberic value - */ - public static int getUnsignedValue(int prop) - { - return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_; + return m_trie_.get(ch); } /** * Gets the unicode additional properties. - * C version getUnicodeProperties. + * Java version of C u_getUnicodeProperties(). * @param codepoint codepoint whose additional properties is to be * retrieved - * @param column + * @param column The column index. 
* @return unicode properties */ - public int getAdditional(int codepoint, int column) { - if (column == -1) { - return getProperty(codepoint); - } - if (column < 0 || column >= m_additionalColumnsCount_) { - return 0; - } - return m_additionalVectors_[ - m_additionalTrie_.getCodePointValue(codepoint) + column]; - } + public int getAdditional(int codepoint, int column) { + assert column >= 0; + if (column >= m_additionalColumnsCount_) { + return 0; + } + return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; + } - /** + /** * <p>Get the "age" of the code point.</p> * <p>The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a @@ -203,6 +139,91 @@ version & LAST_NIBBLE_MASK_, 0, 0); } + // int-value and enumerated properties --------------------------------- *** + + public int getType(int c) { + return getProperty(c)&TYPE_MASK; + } + + /* + * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. + * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. + */ + private static final int /* UHangulSyllableType */ gcbToHst[]={ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ + HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ + HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ + HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ + HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ + HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ + /* + * Omit GCB values beyond what we need for hst. + * The code below checks for the array length. 
+ */ + }; + + private class IntProperty { + int column; // SRC_PROPSVEC column, or "source" if mask==0 + int mask; + int shift; + + IntProperty(int column, int mask, int shift) { + this.column=column; + this.mask=mask; + this.shift=shift; + } + + IntProperty(int source) { + this.column=source; + this.mask=0; + } + + int getValue(int c) { + // systematic, directly stored properties + return (getAdditional(c, column)&mask)>>>shift; + } + } + + private class BiDiIntProperty extends IntProperty { + BiDiIntProperty() { + super(SRC_BIDI); + } + } + + private class CombiningClassIntProperty extends IntProperty { + CombiningClassIntProperty(int source) { + super(source); + } + } + + private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties + int which; + int max; + + NormQuickCheckIntProperty(int source, int which, int max) { + super(source); + this.which=which; + this.max=max; + } + } + + private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE + int getValue(int c) { + return UBiDiProps.INSTANCE.getPairedBracketType(c); + } + }; + + public int getIntPropertyValue(int c, int which) { + if (which == BIDI_PAIRED_BRACKET_TYPE) { + return intProp.getValue(c); + } + return 0; // undefined + } + /** * Forms a supplementary code point from the argument character<br> * Note this is for internal use hence no checks for the validity of the @@ -217,42 +238,48 @@ } /** - * Loads the property data and initialize the UCharacterProperty instance. 
- * @throws MissingResourceException when data is missing or data has been corrupted - */ - public static UCharacterProperty getInstance() + * Gets the type mask + * @param type character type + * @return mask + */ + public static final int getMask(int type) { - if(INSTANCE_ == null) { - try { - INSTANCE_ = new UCharacterProperty(); - } - catch (Exception e) { - throw new MissingResourceException(e.getMessage(),"",""); - } - } - return INSTANCE_; + return 1 << type; } /** - * Checks if the argument c is to be treated as a white space in ICU - * rules. Usually ICU rule white spaces are ignored unless quoted. - * Equivalent to test for Pattern_White_Space Unicode property. - * Stable set of characters, won't change. - * See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ - * @param c codepoint to check - * @return true if c is a ICU white space - */ - public static boolean isRuleWhiteSpace(int c) - { - /* "white space" in the sense of ICU rule parsers - This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES. - See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/ - U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029 - Equivalent to test for Pattern_White_Space Unicode property. - */ - return (c >= 0x0009 && c <= 0x2029 && - (c <= 0x000D || c == 0x0020 || c == 0x0085 || - c == 0x200E || c == 0x200F || c >= 0x2028)); + * Returns the digit values of characters like 'A' - 'Z', normal, + * half-width and full-width. This method assumes that the other digit + * characters are checked by the calling method. + * @param ch character to test + * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise + * its corresponding digit will be returned. 
+ */ + public static int getEuropeanDigit(int ch) { + if ((ch > 0x7a && ch < 0xff21) + || ch < 0x41 || (ch > 0x5a && ch < 0x61) + || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { + return -1; + } + if (ch <= 0x7a) { + // ch >= 0x41 or ch < 0x61 + return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); + } + // ch >= 0xff21 + if (ch <= 0xff3a) { + return ch + 10 - 0xff21; + } + // ch >= 0xff41 && ch <= 0xff5a + return ch + 10 - 0xff41; + } + + public int digit(int c) { + int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; + if(value<=9) { + return value; + } else { + return -1; + } } // protected variables ----------------------------------------------- @@ -260,7 +287,7 @@ /** * Extra property trie */ - CharTrie m_additionalTrie_; + Trie2_16 m_additionalTrie_; /** * Extra property vectors, 1st column for age and second for binary * properties. @@ -280,40 +307,24 @@ * 0 */ int m_maxJTGValue_; + /** + * Script_Extensions data + */ + public char[] m_scriptExtensions_; // private variables ------------------------------------------------- - /** - * UnicodeData.txt property object - */ - private static UCharacterProperty INSTANCE_ = null; - /** * Default name of the datafile */ private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu"; /** - * Default buffer size of datafile - */ - private static final int DATA_BUFFER_SIZE_ = 25000; - - /** - * Numeric value shift - */ - private static final int VALUE_SHIFT_ = 8; - - /** - * Mask to be applied after shifting to obtain an unsigned numeric value - */ - private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF; - - /** * Shift value for lead surrogate to form a supplementary character. */ private static final int LEAD_SURROGATE_SHIFT_ = 10; /** - * Offset to add to combined surrogate pair to avoid msking. + * Offset to add to combined surrogate pair to avoid masking. 
*/ private static final int SURROGATE_OFFSET_ = UTF16.SUPPLEMENTARY_MIN_VALUE - @@ -321,7 +332,153 @@ LEAD_SURROGATE_SHIFT_) - UTF16.TRAIL_SURROGATE_MIN_VALUE; - // additional properties ---------------------------------------------- + + // property data constants ------------------------------------------------- + + /** + * Numeric types and values in the main properties words. + */ + private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; + private static final int getNumericTypeValue(int props) { + return props >> NUMERIC_TYPE_VALUE_SHIFT_; + } + + /* constants for the storage form of numeric types and values */ + /** No numeric value. */ + private static final int NTV_NONE_ = 0; + /** Decimal digits: nv=0..9 */ + private static final int NTV_DECIMAL_START_ = 1; + /** Other digits: nv=0..9 */ + private static final int NTV_DIGIT_START_ = 11; + /** Small integers: nv=0..154 */ + private static final int NTV_NUMERIC_START_ = 21; + + private static final int ntvGetType(int ntv) { + return + (ntv==NTV_NONE_) ? NumericType.NONE : + (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : + (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT : + NumericType.NUMERIC; + } + + /* + * Properties in vector word 0 + * Bits + * 31..24 DerivedAge version major/minor one nibble each + * 23..22 3..1: Bits 7..0 = Script_Extensions index + * 3: Script value from Script_Extensions + * 2: Script=Inherited + * 1: Script=Common + * 0: Script=bits 7..0 + * 21..20 reserved + * 19..17 East Asian Width + * 16.. 8 UBlockCode + * 7.. 0 UScriptCode + */ + /** + * Script_Extensions: mask includes Script + */ + public static final int SCRIPT_X_MASK = 0x00c000ff; + //private static final int SCRIPT_X_SHIFT = 22; + /** + * Integer properties mask and shift values for East Asian cell width. + * Equivalent to icu4c UPROPS_EA_MASK + */ + private static final int EAST_ASIAN_MASK_ = 0x000e0000; + /** + * Integer properties mask and shift values for East Asian cell width. 
+ * Equivalent to icu4c UPROPS_EA_SHIFT + */ + private static final int EAST_ASIAN_SHIFT_ = 17; + /** + * Integer properties mask and shift values for blocks. + * Equivalent to icu4c UPROPS_BLOCK_MASK + */ + private static final int BLOCK_MASK_ = 0x0001ff00; + /** + * Integer properties mask and shift values for blocks. + * Equivalent to icu4c UPROPS_BLOCK_SHIFT + */ + private static final int BLOCK_SHIFT_ = 8; + /** + * Integer properties mask and shift values for scripts. + * Equivalent to icu4c UPROPS_SHIFT_MASK + */ + public static final int SCRIPT_MASK_ = 0x000000ff; + + /** + * Additional properties used in internal trie data + */ + /* + * Properties in vector word 1 + * Each bit encodes one binary property. + * The following constants represent the bit number, use 1<<UPROPS_XYZ. + * UPROPS_BINARY_1_TOP<=32! + * + * Keep this list of property enums in sync with + * propListNames[] in icu/source/tools/genprops/props2.c! + * + * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 
+ */ + private static final int WHITE_SPACE_PROPERTY_ = 0; + private static final int DASH_PROPERTY_ = 1; + private static final int HYPHEN_PROPERTY_ = 2; + private static final int QUOTATION_MARK_PROPERTY_ = 3; + private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4; + private static final int MATH_PROPERTY_ = 5; + private static final int HEX_DIGIT_PROPERTY_ = 6; + private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7; + private static final int ALPHABETIC_PROPERTY_ = 8; + private static final int IDEOGRAPHIC_PROPERTY_ = 9; + private static final int DIACRITIC_PROPERTY_ = 10; + private static final int EXTENDER_PROPERTY_ = 11; + private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12; + private static final int GRAPHEME_EXTEND_PROPERTY_ = 13; + private static final int GRAPHEME_LINK_PROPERTY_ = 14; + private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15; + private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16; + private static final int RADICAL_PROPERTY_ = 17; + private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18; + private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19; + private static final int DEPRECATED_PROPERTY_ = 20; + private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21; + private static final int XID_START_PROPERTY_ = 22; + private static final int XID_CONTINUE_PROPERTY_ = 23; + private static final int ID_START_PROPERTY_ = 24; + private static final int ID_CONTINUE_PROPERTY_ = 25; + private static final int GRAPHEME_BASE_PROPERTY_ = 26; + private static final int S_TERM_PROPERTY_ = 27; + private static final int VARIATION_SELECTOR_PROPERTY_ = 28; + private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */ + private static final int PATTERN_WHITE_SPACE = 30; + + /* + * Properties in vector word 2 + * Bits + * 31..26 reserved + * 25..20 Line Break + * 19..15 Sentence Break + * 14..10 Word Break + * 9.. 5 Grapheme Cluster Break + * 4.. 
0 Decomposition Type + */ + private static final int LB_MASK = 0x03f00000; + private static final int LB_SHIFT = 20; + + private static final int SB_MASK = 0x000f8000; + private static final int SB_SHIFT = 15; + + private static final int WB_MASK = 0x00007c00; + private static final int WB_SHIFT = 10; + + private static final int GCB_MASK = 0x000003e0; + private static final int GCB_SHIFT = 5; + + /** + * Integer properties mask for decomposition type. + * Equivalent to icu4c UPROPS_DT_MASK. + */ + private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; /** * First nibble shift @@ -339,31 +496,112 @@ // private constructors -------------------------------------------------- /** - * Constructor - * @exception IOException thrown when data reading fails or data corrupted - */ + * Constructor + * @exception IOException thrown when data reading fails or data corrupted + */ private UCharacterProperty() throws IOException { // jar access - InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_); - BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_); - UCharacterPropertyReader reader = new UCharacterPropertyReader(b); - reader.read(this); - b.close(); + ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); + m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); + // Read or skip the 16 indexes. 
+ int propertyOffset = bytes.getInt(); + /* exceptionOffset = */ bytes.getInt(); + /* caseOffset = */ bytes.getInt(); + int additionalOffset = bytes.getInt(); + int additionalVectorsOffset = bytes.getInt(); + m_additionalColumnsCount_ = bytes.getInt(); + int scriptExtensionsOffset = bytes.getInt(); + int reservedOffset7 = bytes.getInt(); + /* reservedOffset8 = */ bytes.getInt(); + /* dataTopOffset = */ bytes.getInt(); + m_maxBlockScriptValue_ = bytes.getInt(); + m_maxJTGValue_ = bytes.getInt(); + ICUBinary.skipBytes(bytes, (16 - 12) << 2); + + // read the main properties trie + m_trie_ = Trie2_16.createFromSerialized(bytes); + int expectedTrieLength = (propertyOffset - 16) * 4; + int trieLength = m_trie_.getSerializedLength(); + if(trieLength > expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for main trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // skip unused intervening data structures + ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); + + if(m_additionalColumnsCount_ > 0) { + // reads the additional property block + m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); + expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; + trieLength = m_additionalTrie_.getSerializedLength(); + if(trieLength > expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // additional properties + int size = scriptExtensionsOffset - additionalVectorsOffset; + m_additionalVectors_ = new int[size]; + for (int i = 0; i < size; i ++) { + m_additionalVectors_[i] = bytes.getInt(); + } + } - m_trie_.putIndexData(this); + // Script_Extensions + int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; + if(numChars > 0) { + m_scriptExtensions_ = new char[numChars]; + for(int i = 0; i < numChars; 
++i) { + m_scriptExtensions_[i] = bytes.getChar(); + } + } } + private static final class IsAcceptable implements ICUBinary.Authenticate { + // @Override when we switch to Java 6 + public boolean isDataVersionAcceptable(byte version[]) { + return version[0] == 7; + } + } + + private static final int DATA_FORMAT = 0x5550726F; // "UPro" + public void upropsvec_addPropertyStarts(UnicodeSet set) { /* add the start code point of each same-value range of the properties vectors trie */ if(m_additionalColumnsCount_>0) { /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ - TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_); - RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element(); - while(propsVectorsIter.next(propsVectorsResult)){ - set.add(propsVectorsResult.start); + Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); + Trie2.Range range; + while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { + set.add(range.startCodePoint); } } } + // This static initializer block must be placed after + // other static member initialization + static { + try { + INSTANCE = new UCharacterProperty(); + } + catch (IOException e) { + throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); + } + } + + + // Moved from UProperty.java + /** + * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + * Used in UAX #9: Unicode Bidirectional Algorithm + * (http://www.unicode.org/reports/tr9/) + * Returns UCharacter.BidiPairedBracketType values. + * @stable ICU 52 + */ + public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; + } --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UTF16.java 2015-07-13 16:11:56.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UTF16.java 2015-07-13 16:11:56.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,15 +22,10 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ -/* +/** ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. ******************************************************************************* */ @@ -57,21 +52,21 @@ * * // iteration forwards: Changes for UTF-32 * int ch; - * for (int i = 0; i < s.length(); i+=UTF16.getCharCount(ch)) { - * ch = UTF16.charAt(s,i); + * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) { + * ch = UTF16.charAt(s, i); * doSomethingWith(ch); * } * * // iteration backwards: Original - * for (int i = s.length() -1; i >= 0; --i) { + * for (int i = s.length() - 1; i >= 0; --i) { * char ch = s.charAt(i); * doSomethingWith(ch); * } * * // iteration backwards: Changes for UTF-32 * int ch; - * for (int i = s.length() -1; i > 0; i-=UTF16.getCharCount(ch)) { - * ch = UTF16.charAt(s,i); + * for (int i = s.length() - 1; i > 0; i -= UTF16.getCharCount(ch)) { + * ch = UTF16.charAt(s, i); * doSomethingWith(ch); * } * }</pre> @@ -93,7 +88,7 @@ * back if and only if <code>bounds(string, offset16) != TRAIL</code>. 
* </li> * <li> - * <strong>Exceptions:</strong> The error checking will throw an exception + * <strong>Exceptions:</strong> The error checking will throw an exception * if indices are out of bounds. Other than that, all methods will * behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 * values are present. <code>UCharacter.isLegal()</code> can be used to check @@ -106,10 +101,10 @@ * practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5). * </li> * <li> - * <strong>Optimization:</strong> The method implementations may need - * optimization if the compiler doesn't fold static final methods. Since - * surrogate pairs will form an exceeding small percentage of all the text - * in the world, the singleton case should always be optimized for. + * <strong>Optimization:</strong> The method implementations may need + * optimization if the compiler doesn't fold static final methods. Since + * surrogate pairs will form an exceedingly small percentage of all the text + * in the world, the singleton case should always be optimized for.
* </li> * </ul> * @author Mark Davis, with help from Markus Scherer @@ -135,7 +130,7 @@ * The minimum value for Supplementary code points * @stable ICU 2.1 */ - public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; + public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; /** * Lead surrogate minimum value * @stable ICU 2.1 @@ -161,7 +156,41 @@ * @stable ICU 2.1 */ public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; + /** + * Lead surrogate bitmask + */ + private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; + /** + * Trail surrogate bitmask + */ + private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; + /** + * Surrogate bitmask + */ + private static final int SURROGATE_BITMASK = 0xFFFFF800; + /** + * Lead surrogate bits + */ + private static final int LEAD_SURROGATE_BITS = 0xD800; + /** + * Trail surrogate bits + */ + private static final int TRAIL_SURROGATE_BITS = 0xDC00; + /** + * Surrogate bits + */ + private static final int SURROGATE_BITS = 0xD800; + + // constructor -------------------------------------------------------- + // /CLOVER:OFF + /** + * Prevent instance from being created. + */ + private UTF16() { + } + + // /CLOVER:ON // public method ------------------------------------------------------ /** @@ -222,7 +251,7 @@ } /** - * Extract a single UTF-32 value from a substring. + * Extract a single UTF-32 value from a string. * Used when iterating forwards or backwards (with * <code>UTF16.getCharCount()</code>, as well as random access. If a * validity check is required, use @@ -232,19 +261,72 @@ * character will be returned. If a complete supplementary character is * not found the incomplete character will be returned * @param source array of UTF-16 chars - * @param start offset to substring in the source array for analyzing - * @param limit offset to substring in the source array for analyzing - * @param offset16 UTF-16 offset relative to start + * @param offset16 UTF-16 offset to the start of the character. 
* @return UTF-32 value for the UTF-32 value that contains the char at * offset16. The boundaries of that codepoint are the same as in * <code>bounds32()</code>. - * @exception IndexOutOfBoundsException thrown if offset16 is not within - * the range of start and limit. + * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds. * @stable ICU 2.1 */ - public static int charAt(char source[], int start, int limit, - int offset16) - { + public static int charAt(CharSequence source, int offset16) { + char single = source.charAt(offset16); + if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { + return single; + } + return _charAt(source, offset16, single); + } + + private static int _charAt(CharSequence source, int offset16, char single) { + if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { + return single; + } + + // Convert the UTF-16 surrogate pair if necessary. + // For simplicity in usage, and because the frequency of pairs is + // low, look both directions. + + if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { + ++offset16; + if (source.length() != offset16) { + char trail = source.charAt(offset16); + if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE + && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { + return UCharacterProperty.getRawSupplementary(single, trail); + } + } + } else { + --offset16; + if (offset16 >= 0) { + // single is a trail surrogate so + char lead = source.charAt(offset16); + if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE + && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { + return UCharacterProperty.getRawSupplementary(lead, single); + } + } + } + return single; // return unmatched surrogate + } + + /** + * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards + * (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is + * required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal() + * </a></code> + * on the return value. 
If the char retrieved is part of a surrogate pair, its supplementary + * character will be returned. If a complete supplementary character is not found the incomplete + * character will be returned + * + * @param source Array of UTF-16 chars + * @param start Offset to substring in the source array for analyzing + * @param limit Offset to substring in the source array for analyzing + * @param offset16 UTF-16 offset relative to start + * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries + * of that codepoint are the same as in <code>bounds32()</code>. + * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. + * @stable ICU 2.1 + */ + public static int charAt(char source[], int start, int limit, int offset16) { offset16 += start; if (offset16 < start || offset16 >= limit) { throw new ArrayIndexOutOfBoundsException(offset16); @@ -259,7 +341,7 @@ // For simplicity in usage, and because the frequency of pairs is // low, look both directions. if (single <= LEAD_SURROGATE_MAX_VALUE) { - offset16 ++; + offset16++; if (offset16 >= limit) { return single; } @@ -272,7 +354,7 @@ if (offset16 == start) { return single; } - offset16 --; + offset16--; char lead = source[offset16]; if (isLeadSurrogate(lead)) return UCharacterProperty.getRawSupplementary(lead, single); @@ -300,37 +382,34 @@ /** * Determines whether the code value is a surrogate. * @param char16 the input character. - * @return true iff the input character is a surrogate. + * @return true if the input character is a surrogate. * @stable ICU 2.1 */ public static boolean isSurrogate(char char16) { - return LEAD_SURROGATE_MIN_VALUE <= char16 && - char16 <= TRAIL_SURROGATE_MAX_VALUE; + return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; } /** * Determines whether the character is a trail surrogate. * @param char16 the input character. - * @return true iff the input character is a trail surrogate. 
+ * @return true if the input character is a trail surrogate. * @stable ICU 2.1 */ public static boolean isTrailSurrogate(char char16) { - return (TRAIL_SURROGATE_MIN_VALUE <= char16 && - char16 <= TRAIL_SURROGATE_MAX_VALUE); + return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; } /** * Determines whether the character is a lead surrogate. * @param char16 the input character. - * @return true iff the input character is a lead surrogate + * @return true if the input character is a lead surrogate * @stable ICU 2.1 */ public static boolean isLeadSurrogate(char char16) { - return LEAD_SURROGATE_MIN_VALUE <= char16 && - char16 <= LEAD_SURROGATE_MAX_VALUE; + return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; } /** @@ -359,7 +438,7 @@ * <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> * on char32 before calling. * @param char32 the input character. - * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise + * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise * the character itself * @stable ICU 2.1 */ @@ -370,7 +449,7 @@ (char32 & TRAIL_SURROGATE_MASK_)); } - return (char)char32; + return (char) char32; } /** @@ -415,16 +494,15 @@ // Write the UTF-16 values if (char32 >= SUPPLEMENTARY_MIN_VALUE) { - target.append(getLeadSurrogate(char32)); - target.append(getTrailSurrogate(char32)); - } + target.append(getLeadSurrogate(char32)); + target.append(getTrailSurrogate(char32)); + } else { - target.append((char)char32); + target.append((char) char32); } return target; } - //// for StringPrep /** * Shifts offset16 by the argument number of codepoints within a subarray. 
* @param source char array @@ -441,20 +519,20 @@ public static int moveCodePointOffset(char source[], int start, int limit, int offset16, int shift32) { - int size = source.length; - int count; - char ch; - int result = offset16 + start; - if (start<0 || limit<start) { + int size = source.length; + int count; + char ch; + int result = offset16 + start; + if (start < 0 || limit < start) { throw new StringIndexOutOfBoundsException(start); } - if (limit>size) { + if (limit > size) { throw new StringIndexOutOfBoundsException(limit); } - if (offset16<0 || result>limit) { + if (offset16 < 0 || result > limit) { throw new StringIndexOutOfBoundsException(offset16); } - if (shift32 > 0 ) { + if (shift32 > 0) { if (shift32 + result > size) { throw new StringIndexOutOfBoundsException(result); } @@ -462,29 +540,29 @@ while (result < limit && count > 0) { ch = source[result]; - if (isLeadSurrogate(ch) && (result+1 < limit) && - isTrailSurrogate(source[result+1])) { - result ++; + if (isLeadSurrogate(ch) && (result + 1 < limit) && + isTrailSurrogate(source[result + 1])) { + result++; } - count --; - result ++; + count--; + result++; } } else { if (result + shift32 < start) { throw new StringIndexOutOfBoundsException(result); } - for (count=-shift32; count>0; count--) { + for (count = -shift32; count > 0; count--) { result--; - if (result<start) { + if (result < start) { break; } ch = source[result]; - if (isTrailSurrogate(ch) && result>start && isLeadSurrogate(source[result-1])) { + if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { result--; } } } - if (count != 0) { + if (count != 0) { throw new StringIndexOutOfBoundsException(shift32); } result -= start; @@ -501,7 +579,7 @@ /** * Mask to retrieve the significant value from a trail surrogate. 
*/ - private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; + private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; /** * Value that all lead surrogate starts with @@ -509,7 +587,7 @@ private static final int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE - >> LEAD_SURROGATE_SHIFT_); + >> LEAD_SURROGATE_SHIFT_); // private methods ------------------------------------------------------ @@ -527,7 +605,7 @@ private static String toString(int ch) { if (ch < SUPPLEMENTARY_MIN_VALUE) { - return String.valueOf((char)ch); + return String.valueOf((char) ch); } StringBuilder result = new StringBuilder(); --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java 2015-07-13 16:11:57.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java 2015-07-13 16:11:57.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,29 +22,31 @@ * or visit www.oracle.com if you need additional information or have any * questions. */ + /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2015, International Business Machines Corporation and + * others. All Rights Reserved. 
******************************************************************************* */ - package sun.text.normalizer; +import java.io.IOException; import java.text.ParsePosition; -import java.util.Iterator; +import java.util.ArrayList; import java.util.TreeSet; /** - * A mutable set of Unicode characters and multicharacter strings. Objects of this class - * represent <em>character classes</em> used in regular expressions. - * A character specifies a subset of Unicode code points. Legal - * code points are U+0000 to U+10FFFF, inclusive. + * A mutable set of Unicode characters and multicharacter strings. + * Objects of this class represent <em>character classes</em> used + * in regular expressions. A character specifies a subset of Unicode + * code points. Legal code points are U+0000 to U+10FFFF, inclusive. + * + * Note: method freeze() will not only make the set immutable, but + * also makes important methods much higher performance: + * contains(c), containsNone(...), span(...), spanBack(...) etc. + * After the object is frozen, any subsequent call that wants to change + * the object will throw UnsupportedOperationException. * * <p>The UnicodeSet class is not designed to be subclassed. * @@ -118,7 +120,7 @@ * </blockquote> * * Any character may be preceded by a backslash in order to remove any special - * meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are + * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are * ignored, unless they are escaped. * * <p>Property patterns specify a set of characters having a certain @@ -267,18 +269,24 @@ * </tr> * </table> * </blockquote> - * <p>To iterate over contents of UnicodeSet, use UnicodeSetIterator class. 
+ * <p>To iterate over contents of UnicodeSet, the following are available: + * <ul><li>{@link #ranges()} to iterate through the ranges</li> + * <li>{@link #strings()} to iterate through the strings</li> + * <li>{@link #iterator()} to iterate through the entire contents in a single loop. + * That method is, however, not particularly efficient, since it "boxes" each code point into a String. + * </ul> + * All of the above can be used in <b>for</b> loops. + * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops. + * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. * * @author Alan Liu * @stable ICU 2.0 - * @see UnicodeSetIterator */ -@SuppressWarnings("deprecation") -public class UnicodeSet implements UnicodeMatcher { +class UnicodeSet { private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. - // 110000 for codepoints + // 110000 for codepoints /** * Minimum value that can be stored in a UnicodeSet. @@ -299,7 +307,7 @@ // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!! // is not private so that UnicodeSetIterator can get access - TreeSet<String> strings = new TreeSet<>(); + TreeSet<String> strings = new TreeSet<String>(); /** * The pattern representation of this set. This may not be the @@ -310,18 +318,14 @@ * indicating that toPattern() must generate a pattern * representation from the inversion list. */ - private String pat = null; private static final int START_EXTRA = 16; // initial storage. Must be >= 0 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 - /** - * A set of all characters _except_ the second through last characters of - * certain ranges. 
These ranges are ranges of characters whose - * properties are all exactly alike, e.g. CJK Ideographs from - * U+4E00 to U+9FA5. - */ - private static UnicodeSet INCLUSIONS[] = null; + private static UnicodeSet INCLUSION = null; + + private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. + private volatile UnicodeSetStringSpan stringSpan; //---------------------------------------------------------------- // Public API @@ -331,14 +335,22 @@ * Constructs an empty set. * @stable ICU 2.0 */ - public UnicodeSet() { + private UnicodeSet() { list = new int[1 + START_EXTRA]; list[len++] = HIGH; } /** - * Constructs a set containing the given range. - * If {@code end > start} then an empty set is created. + * Constructs a copy of an existing set. + * @stable ICU 2.0 + */ + private UnicodeSet(UnicodeSet other) { + set(other); + } + + /** + * Constructs a set containing the given range. If <code>end > + * start</code> then an empty set is created. * * @param start first character, inclusive, of range * @param end last character, inclusive, of range @@ -359,7 +371,7 @@ */ public UnicodeSet(String pattern) { this(); - applyPattern(pattern, null, null, IGNORE_SPACE); + applyPattern(pattern, null); } /** @@ -368,172 +380,29 @@ * copied to this object * @stable ICU 2.0 */ - @SuppressWarnings("unchecked") // Casting result of clone of a collection public UnicodeSet set(UnicodeSet other) { + checkFrozen(); list = other.list.clone(); len = other.len; - pat = other.pat; - strings = (TreeSet)other.strings.clone(); + strings = new TreeSet<String>(other.strings); return this; } /** - * Modifies this set to represent the set specified by the given pattern. - * See the class description for the syntax of the pattern language. - * Whitespace is ignored. - * @param pattern a string specifying what characters are in the set - * @exception java.lang.IllegalArgumentException if the pattern - * contains a syntax error. 
+ * Returns the number of elements in this set (its cardinality) + * Note than the elements of a set may include both individual + * codepoints and strings. + * + * @return the number of elements in this set (its cardinality). * @stable ICU 2.0 */ - public final UnicodeSet applyPattern(String pattern) { - return applyPattern(pattern, null, null, IGNORE_SPACE); - } - - /** - * Append the <code>toPattern()</code> representation of a - * string to the given <code>StringBuffer</code>. - */ - private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) { - for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) { - _appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable); - } - } - - /** - * Append the <code>toPattern()</code> representation of a - * character to the given <code>StringBuffer</code>. - */ - private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) { - if (escapeUnprintable && Utility.isUnprintable(c)) { - // Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything - // unprintable - if (Utility.escapeUnprintable(buf, c)) { - return; - } - } - // Okay to let ':' pass through - switch (c) { - case '[': // SET_OPEN: - case ']': // SET_CLOSE: - case '-': // HYPHEN: - case '^': // COMPLEMENT: - case '&': // INTERSECTION: - case '\\': //BACKSLASH: - case '{': - case '}': - case '$': - case ':': - buf.append('\\'); - break; - default: - // Escape whitespace - if (UCharacterProperty.isRuleWhiteSpace(c)) { - buf.append('\\'); - } - break; - } - UTF16.append(buf, c); - } - - /** - * Append a string representation of this set to result. This will be - * a cleaned version of the string passed to applyPattern(), if there - * is one. Otherwise it will be generated. 
- */ - private StringBuffer _toPattern(StringBuffer result, - boolean escapeUnprintable) { - if (pat != null) { - int i; - int backslashCount = 0; - for (i=0; i<pat.length(); ) { - int c = UTF16.charAt(pat, i); - i += UTF16.getCharCount(c); - if (escapeUnprintable && Utility.isUnprintable(c)) { - // If the unprintable character is preceded by an odd - // number of backslashes, then it has been escaped. - // Before unescaping it, we delete the final - // backslash. - if ((backslashCount % 2) == 1) { - result.setLength(result.length() - 1); - } - Utility.escapeUnprintable(result, c); - backslashCount = 0; - } else { - UTF16.append(result, c); - if (c == '\\') { - ++backslashCount; - } else { - backslashCount = 0; - } - } - } - return result; - } - - return _generatePattern(result, escapeUnprintable, true); - } - - /** - * Generate and append a string representation of this set to result. - * This does not use this.pat, the cleaned up copy of the string - * passed to applyPattern(). - * @param includeStrings if false, doesn't include the strings. - * @stable ICU 3.8 - */ - public StringBuffer _generatePattern(StringBuffer result, - boolean escapeUnprintable, boolean includeStrings) { - result.append('['); - + public int size() { + int n = 0; int count = getRangeCount(); - - // If the set contains at least 2 intervals and includes both - // MIN_VALUE and MAX_VALUE, then the inverse representation will - // be more economical. 
- if (count > 1 && - getRangeStart(0) == MIN_VALUE && - getRangeEnd(count-1) == MAX_VALUE) { - - // Emit the inverse - result.append('^'); - - for (int i = 1; i < count; ++i) { - int start = getRangeEnd(i-1)+1; - int end = getRangeStart(i)-1; - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append('-'); - } - _appendToPat(result, end, escapeUnprintable); - } - } - } - - // Default; emit the ranges as pairs - else { - for (int i = 0; i < count; ++i) { - int start = getRangeStart(i); - int end = getRangeEnd(i); - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append('-'); - } - _appendToPat(result, end, escapeUnprintable); - } - } + for (int i = 0; i < count; ++i) { + n += getRangeEnd(i) - getRangeStart(i) + 1; } - - if (includeStrings && strings.size() > 0) { - Iterator<String> it = strings.iterator(); - while (it.hasNext()) { - result.append('{'); - _appendToPat(result, it.next(), escapeUnprintable); - result.append('}'); - } - } - return result.append(']'); + return n + strings.size(); } // for internal use, after checkFrozen has been called @@ -559,6 +428,7 @@ * @stable ICU 2.0 */ public final UnicodeSet add(int c) { + checkFrozen(); return add_unchecked(c); } @@ -643,7 +513,6 @@ len += 2; } - pat = null; return this; } @@ -657,11 +526,11 @@ * @return this object, for chaining * @stable ICU 2.0 */ - public final UnicodeSet add(String s) { + public final UnicodeSet add(CharSequence s) { + checkFrozen(); int cp = getSingleCP(s); if (cp < 0) { - strings.add(s); - pat = null; + strings.add(s.toString()); } else { add_unchecked(cp, cp); } @@ -669,11 +538,13 @@ } /** + * Utility for getting code point from single code point CharSequence. + * See the public UTF16.getSingleCodePoint() * @return a code point IF the string consists of a single one. * otherwise returns -1. 
- * @param string to test + * @param s to test */ - private static int getSingleCP(String s) { + private static int getSingleCP(CharSequence s) { if (s.length() < 1) { throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); } @@ -701,6 +572,7 @@ * @stable ICU 2.0 */ public UnicodeSet complement(int start, int end) { + checkFrozen(); if (start < MIN_VALUE || start > MAX_VALUE) { throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); } @@ -710,26 +582,6 @@ if (start <= end) { xor(range(start, end), 2, 0); } - pat = null; - return this; - } - - /** - * This is equivalent to - * <code>complement(MIN_VALUE, MAX_VALUE)</code>. - * @stable ICU 2.0 - */ - public UnicodeSet complement() { - if (list[0] == LOW) { - System.arraycopy(list, 1, list, 0, len-1); - --len; - } else { - ensureCapacity(len+1); - System.arraycopy(list, 0, list, 1, len); - list[0] = LOW; - ++len; - } - pat = null; return this; } @@ -743,6 +595,12 @@ if (c < MIN_VALUE || c > MAX_VALUE) { throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); } + if (bmpSet != null) { + return bmpSet.contains(c); + } + if (stringSpan != null) { + return stringSpan.contains(c); + } /* // Set i to the index of the start item greater than ch @@ -751,7 +609,7 @@ while (true) { if (c < list[++i]) break; } - */ + */ int i = findCodePoint(c); @@ -790,7 +648,7 @@ // invariant: c < list[hi] for (;;) { int i = (lo + hi) >>> 1; - if (i == lo) return hi; + if (i == lo) return hi; if (c < list[i]) { hi = i; } else { @@ -800,22 +658,6 @@ } /** - * Adds all of the elements in the specified set to this set if - * they're not already present. This operation effectively - * modifies this set so that its value is the <i>union</i> of the two - * sets. The behavior of this operation is unspecified if the specified - * collection is modified while the operation is in progress. - * - * @param c set whose elements are to be added to this set. 
- * @stable ICU 2.0 - */ - public UnicodeSet addAll(UnicodeSet c) { - add(c.list, c.len, 0); - strings.addAll(c.strings); - return this; - } - - /** * Retains only the elements in this set that are contained in the * specified set. In other words, removes from this set all of * its elements that are not contained in the specified set. This @@ -826,36 +668,21 @@ * @stable ICU 2.0 */ public UnicodeSet retainAll(UnicodeSet c) { + checkFrozen(); retain(c.list, c.len, 0); strings.retainAll(c.strings); return this; } /** - * Removes from this set all of its elements that are contained in the - * specified set. This operation effectively modifies this - * set so that its value is the <i>asymmetric set difference</i> of - * the two sets. - * - * @param c set that defines which elements will be removed from - * this set. - * @stable ICU 2.0 - */ - public UnicodeSet removeAll(UnicodeSet c) { - retain(c.list, c.len, 2); - strings.removeAll(c.strings); - return this; - } - - /** * Removes all of the elements from this set. This set will be * empty after this call returns. * @stable ICU 2.0 */ public UnicodeSet clear() { + checkFrozen(); list[0] = HIGH; len = 1; - pat = null; strings.clear(); return this; } @@ -923,405 +750,18 @@ * of <code>pattern</code> * @exception java.lang.IllegalArgumentException if the parse fails. */ - UnicodeSet applyPattern(String pattern, - ParsePosition pos, - SymbolTable symbols, - int options) { - - // Need to build the pattern in a temporary string because - // _applyPattern calls add() etc., which set pat to empty. 
- boolean parsePositionWasNull = pos == null; - if (parsePositionWasNull) { - pos = new ParsePosition(0); - } - - StringBuffer rebuiltPat = new StringBuffer(); - RuleCharacterIterator chars = - new RuleCharacterIterator(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, options); - if (chars.inVariable()) { - syntaxError(chars, "Extra chars in variable value"); - } - pat = rebuiltPat.toString(); - if (parsePositionWasNull) { - int i = pos.getIndex(); - - // Skip over trailing whitespace - if ((options & IGNORE_SPACE) != 0) { - i = Utility.skipWhitespace(pattern, i); - } - - if (i != pattern.length()) { - throw new IllegalArgumentException("Parse of \"" + pattern + - "\" failed at " + i); - } - } - return this; - } - - /** - * Parse the pattern from the given RuleCharacterIterator. The - * iterator is advanced over the parsed pattern. - * @param chars iterator over the pattern characters. Upon return - * it will be advanced to the first character after the parsed - * pattern, or the end of the iteration if all characters are - * parsed. - * @param symbols symbol table to use to parse and dereference - * variables, or null if none. - * @param rebuiltPat the pattern that was parsed, rebuilt or - * copied from the input pattern, as appropriate. - * @param options a bit mask of zero or more of the following: - * IGNORE_SPACE, CASE. 
- */ - void applyPattern(RuleCharacterIterator chars, SymbolTable symbols, - StringBuffer rebuiltPat, int options) { - // Syntax characters: [ ] ^ - & { } - - // Recognized special forms for chars, sets: c-c s-s s&s - - int opts = RuleCharacterIterator.PARSE_VARIABLES | - RuleCharacterIterator.PARSE_ESCAPES; - if ((options & IGNORE_SPACE) != 0) { - opts |= RuleCharacterIterator.SKIP_WHITESPACE; - } - - StringBuffer patBuf = new StringBuffer(), buf = null; - boolean usePat = false; - UnicodeSet scratch = null; - Object backup = null; - - // mode: 0=before [, 1=between [...], 2=after ] - // lastItem: 0=none, 1=char, 2=set - int lastItem = 0, lastChar = 0, mode = 0; - char op = 0; - - boolean invert = false; - - clear(); - - while (mode != 2 && !chars.atEnd()) { - if (false) { - // Debugging assertion - if (!((lastItem == 0 && op == 0) || - (lastItem == 1 && (op == 0 || op == '-')) || - (lastItem == 2 && (op == 0 || op == '-' || op == '&')))) { - throw new IllegalArgumentException(); - } - } - - int c = 0; - boolean literal = false; - UnicodeSet nested = null; - - // -------- Check for property pattern - - // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed - int setMode = 0; - if (resemblesPropertyPattern(chars, opts)) { - setMode = 2; - } - - // -------- Parse '[' of opening delimiter OR nested set. - // If there is a nested set, use `setMode' to define how - // the set should be parsed. If the '[' is part of the - // opening delimiter for this pattern, parse special - // strings "[", "[^", "[-", and "[^-". Check for stand-in - // characters representing a nested set in the symbol - // table. 
- - else { - // Prepare to backup if necessary - backup = chars.getPos(backup); - c = chars.next(opts); - literal = chars.isEscaped(); - - if (c == '[' && !literal) { - if (mode == 1) { - chars.setPos(backup); // backup - setMode = 1; - } else { - // Handle opening '[' delimiter - mode = 1; - patBuf.append('['); - backup = chars.getPos(backup); // prepare to backup - c = chars.next(opts); - literal = chars.isEscaped(); - if (c == '^' && !literal) { - invert = true; - patBuf.append('^'); - backup = chars.getPos(backup); // prepare to backup - c = chars.next(opts); - literal = chars.isEscaped(); - } - // Fall through to handle special leading '-'; - // otherwise restart loop for nested [], \p{}, etc. - if (c == '-') { - literal = true; - // Fall through to handle literal '-' below - } else { - chars.setPos(backup); // backup - continue; - } - } - } else if (symbols != null) { - UnicodeMatcher m = symbols.lookupMatcher(c); // may be null - if (m != null) { - try { - nested = (UnicodeSet) m; - setMode = 3; - } catch (ClassCastException e) { - syntaxError(chars, "Syntax error"); - } - } - } - } - - // -------- Handle a nested set. This either is inline in - // the pattern or represented by a stand-in that has - // previously been parsed and was looked up in the symbol - // table. 
- - if (setMode != 0) { - if (lastItem == 1) { - if (op != 0) { - syntaxError(chars, "Char expected after operator"); - } - add_unchecked(lastChar, lastChar); - _appendToPat(patBuf, lastChar, false); - lastItem = op = 0; - } - - if (op == '-' || op == '&') { - patBuf.append(op); - } - - if (nested == null) { - if (scratch == null) scratch = new UnicodeSet(); - nested = scratch; - } - switch (setMode) { - case 1: - nested.applyPattern(chars, symbols, patBuf, options); - break; - case 2: - chars.skipIgnored(opts); - nested.applyPropertyPattern(chars, patBuf, symbols); - break; - case 3: // `nested' already parsed - nested._toPattern(patBuf, false); - break; - } - - usePat = true; - - if (mode == 0) { - // Entire pattern is a category; leave parse loop - set(nested); - mode = 2; - break; - } - - switch (op) { - case '-': - removeAll(nested); - break; - case '&': - retainAll(nested); - break; - case 0: - addAll(nested); - break; - } - - op = 0; - lastItem = 2; - - continue; - } - - if (mode == 0) { - syntaxError(chars, "Missing '['"); - } - - // -------- Parse special (syntax) characters. If the - // current character is not special, or if it is escaped, - // then fall through and handle it below. 
- - if (!literal) { - switch (c) { - case ']': - if (lastItem == 1) { - add_unchecked(lastChar, lastChar); - _appendToPat(patBuf, lastChar, false); - } - // Treat final trailing '-' as a literal - if (op == '-') { - add_unchecked(op, op); - patBuf.append(op); - } else if (op == '&') { - syntaxError(chars, "Trailing '&'"); - } - patBuf.append(']'); - mode = 2; - continue; - case '-': - if (op == 0) { - if (lastItem != 0) { - op = (char) c; - continue; - } else { - // Treat final trailing '-' as a literal - add_unchecked(c, c); - c = chars.next(opts); - literal = chars.isEscaped(); - if (c == ']' && !literal) { - patBuf.append("-]"); - mode = 2; - continue; - } - } - } - syntaxError(chars, "'-' not after char or set"); - break; - case '&': - if (lastItem == 2 && op == 0) { - op = (char) c; - continue; - } - syntaxError(chars, "'&' not after set"); - break; - case '^': - syntaxError(chars, "'^' not after '['"); - break; - case '{': - if (op != 0) { - syntaxError(chars, "Missing operand after operator"); - } - if (lastItem == 1) { - add_unchecked(lastChar, lastChar); - _appendToPat(patBuf, lastChar, false); - } - lastItem = 0; - if (buf == null) { - buf = new StringBuffer(); - } else { - buf.setLength(0); - } - boolean ok = false; - while (!chars.atEnd()) { - c = chars.next(opts); - literal = chars.isEscaped(); - if (c == '}' && !literal) { - ok = true; - break; - } - UTF16.append(buf, c); - } - if (buf.length() < 1 || !ok) { - syntaxError(chars, "Invalid multicharacter string"); - } - // We have new string. Add it to set and continue; - // we don't need to drop through to the further - // processing - add(buf.toString()); - patBuf.append('{'); - _appendToPat(patBuf, buf.toString(), false); - patBuf.append('}'); - continue; - case SymbolTable.SYMBOL_REF: - // symbols nosymbols - // [a-$] error error (ambiguous) - // [a$] anchor anchor - // [a-$x] var "x"* literal '$' - // [a-$.] 
error literal '$' - // *We won't get here in the case of var "x" - backup = chars.getPos(backup); - c = chars.next(opts); - literal = chars.isEscaped(); - boolean anchor = (c == ']' && !literal); - if (symbols == null && !anchor) { - c = SymbolTable.SYMBOL_REF; - chars.setPos(backup); - break; // literal '$' - } - if (anchor && op == 0) { - if (lastItem == 1) { - add_unchecked(lastChar, lastChar); - _appendToPat(patBuf, lastChar, false); - } - add_unchecked(UnicodeMatcher.ETHER); - usePat = true; - patBuf.append(SymbolTable.SYMBOL_REF).append(']'); - mode = 2; - continue; - } - syntaxError(chars, "Unquoted '$'"); - break; - default: - break; - } - } - - // -------- Parse literal characters. This includes both - // escaped chars ("\u4E01") and non-syntax characters - // ("a"). - - switch (lastItem) { - case 0: - lastItem = 1; - lastChar = c; - break; - case 1: - if (op == '-') { - if (lastChar >= c) { - // Don't allow redundant (a-a) or empty (b-a) ranges; - // these are most likely typos. - syntaxError(chars, "Invalid range"); - } - add_unchecked(lastChar, c); - _appendToPat(patBuf, lastChar, false); - patBuf.append(op); - _appendToPat(patBuf, c, false); - lastItem = op = 0; - } else { - add_unchecked(lastChar, lastChar); - _appendToPat(patBuf, lastChar, false); - lastChar = c; - } - break; - case 2: - if (op != 0) { - syntaxError(chars, "Set expected after operator"); - } - lastChar = c; - lastItem = 1; - break; - } - } - - if (mode != 2) { - syntaxError(chars, "Missing ']'"); - } - - chars.skipIgnored(opts); - - if (invert) { - complement(); - } - - // Use the rebuilt pattern (pat) only if necessary. Prefer the - // generated pattern. 
- if (usePat) { - rebuiltPat.append(patBuf.toString()); + private UnicodeSet applyPattern(String pattern, + ParsePosition pos) { + if ("[:age=3.2:]".equals(pattern)) { + checkFrozen(); + VersionInfo version = VersionInfo.getInstance("3.2"); + applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); } else { - _generatePattern(rebuiltPat, false, true); + throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern " + + pattern + ")"); } - } - private static void syntaxError(RuleCharacterIterator chars, String msg) { - throw new IllegalArgumentException("Error: " + msg + " at \"" + - Utility.escape(chars.toString()) + - '"'); + return this; } //---------------------------------------------------------------- @@ -1397,7 +837,6 @@ int[] temp = list; list = buffer; buffer = temp; - pat = null; return this; } @@ -1414,88 +853,87 @@ // change from xor is that we have to check overlapping pairs // polarity bit 1 means a is second, bit 2 means b is. main: - while (true) { - switch (polarity) { - case 0: // both first; take lower if unequal - if (a < b) { // take a - // Back up over overlapping ranges in buffer[] - if (k > 0 && a <= buffer[k-1]) { - // Pick latter end value in buffer[] vs. list[] - a = max(list[i], buffer[--k]); - } else { - // No overlap - buffer[k++] = a; - a = list[i]; - } - i++; // Common if/else code factored out - polarity ^= 1; - } else if (b < a) { // take b - if (k > 0 && b <= buffer[k-1]) { - b = max(other[j], buffer[--k]); - } else { - buffer[k++] = b; - b = other[j]; + while (true) { + switch (polarity) { + case 0: // both first; take lower if unequal + if (a < b) { // take a + // Back up over overlapping ranges in buffer[] + if (k > 0 && a <= buffer[k-1]) { + // Pick latter end value in buffer[] vs. 
list[] + a = max(list[i], buffer[--k]); + } else { + // No overlap + buffer[k++] = a; + a = list[i]; + } + i++; // Common if/else code factored out + polarity ^= 1; + } else if (b < a) { // take b + if (k > 0 && b <= buffer[k-1]) { + b = max(other[j], buffer[--k]); + } else { + buffer[k++] = b; + b = other[j]; + } + j++; + polarity ^= 2; + } else { // a == b, take a, drop b + if (a == HIGH) break main; + // This is symmetrical; it doesn't matter if + // we backtrack with a or b. - liu + if (k > 0 && a <= buffer[k-1]) { + a = max(list[i], buffer[--k]); + } else { + // No overlap + buffer[k++] = a; + a = list[i]; + } + i++; + polarity ^= 1; + b = other[j++]; polarity ^= 2; } - j++; - polarity ^= 2; - } else { // a == b, take a, drop b - if (a == HIGH) break main; - // This is symmetrical; it doesn't matter if - // we backtrack with a or b. - liu - if (k > 0 && a <= buffer[k-1]) { - a = max(list[i], buffer[--k]); - } else { - // No overlap + break; + case 3: // both second; take higher if unequal, and drop other + if (b <= a) { // take a + if (a == HIGH) break main; buffer[k++] = a; - a = list[i]; + } else { // take b + if (b == HIGH) break main; + buffer[k++] = b; } - i++; - polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 3: // both second; take higher if unequal, and drop other - if (b <= a) { // take a - if (a == HIGH) break main; - buffer[k++] = a; - } else { // take b - if (b == HIGH) break main; - buffer[k++] = b; - } - a = list[i++]; polarity ^= 1; // factored common code - b = other[j++]; polarity ^= 2; - break; - case 1: // a second, b first; if b < a, overlap - if (a < b) { // no overlap, take a - buffer[k++] = a; a = list[i++]; polarity ^= 1; - } else if (b < a) { // OVERLAP, drop b - b = other[j++]; polarity ^= 2; - } else { // a == b, drop both! 
- if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 2: // a first, b second; if a < b, overlap - if (b < a) { // no overlap, take b - buffer[k++] = b; b = other[j++]; polarity ^= 2; - } else if (a < b) { // OVERLAP, drop a - a = list[i++]; polarity ^= 1; - } else { // a == b, drop both! - if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; + a = list[i++]; polarity ^= 1; // factored common code b = other[j++]; polarity ^= 2; + break; + case 1: // a second, b first; if b < a, overlap + if (a < b) { // no overlap, take a + buffer[k++] = a; a = list[i++]; polarity ^= 1; + } else if (b < a) { // OVERLAP, drop b + b = other[j++]; polarity ^= 2; + } else { // a == b, drop both! + if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 2: // a first, b second; if a < b, overlap + if (b < a) { // no overlap, take b + buffer[k++] = b; b = other[j++]; polarity ^= 2; + } else if (a < b) { // OVERLAP, drop a + a = list[i++]; polarity ^= 1; + } else { // a == b, drop both! + if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; } - break; } - } buffer[k++] = HIGH; // terminate len = k; // swap list and buffer int[] temp = list; list = buffer; buffer = temp; - pat = null; return this; } @@ -1512,61 +950,60 @@ // change from xor is that we have to check overlapping pairs // polarity bit 1 means a is second, bit 2 means b is. 
main: - while (true) { - switch (polarity) { - case 0: // both first; drop the smaller - if (a < b) { // drop a - a = list[i++]; polarity ^= 1; - } else if (b < a) { // drop b - b = other[j++]; polarity ^= 2; - } else { // a == b, take one, drop other - if (a == HIGH) break main; - buffer[k++] = a; a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 3: // both second; take lower if unequal - if (a < b) { // take a - buffer[k++] = a; a = list[i++]; polarity ^= 1; - } else if (b < a) { // take b - buffer[k++] = b; b = other[j++]; polarity ^= 2; - } else { // a == b, take one, drop other - if (a == HIGH) break main; - buffer[k++] = a; a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 1: // a second, b first; - if (a < b) { // NO OVERLAP, drop a - a = list[i++]; polarity ^= 1; - } else if (b < a) { // OVERLAP, take b - buffer[k++] = b; b = other[j++]; polarity ^= 2; - } else { // a == b, drop both! - if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 2: // a first, b second; if a < b, overlap - if (b < a) { // no overlap, drop b - b = other[j++]; polarity ^= 2; - } else if (a < b) { // OVERLAP, take a - buffer[k++] = a; a = list[i++]; polarity ^= 1; - } else { // a == b, drop both! 
- if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; + while (true) { + switch (polarity) { + case 0: // both first; drop the smaller + if (a < b) { // drop a + a = list[i++]; polarity ^= 1; + } else if (b < a) { // drop b + b = other[j++]; polarity ^= 2; + } else { // a == b, take one, drop other + if (a == HIGH) break main; + buffer[k++] = a; a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 3: // both second; take lower if unequal + if (a < b) { // take a + buffer[k++] = a; a = list[i++]; polarity ^= 1; + } else if (b < a) { // take b + buffer[k++] = b; b = other[j++]; polarity ^= 2; + } else { // a == b, take one, drop other + if (a == HIGH) break main; + buffer[k++] = a; a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 1: // a second, b first; + if (a < b) { // NO OVERLAP, drop a + a = list[i++]; polarity ^= 1; + } else if (b < a) { // OVERLAP, take b + buffer[k++] = b; b = other[j++]; polarity ^= 2; + } else { // a == b, drop both! + if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 2: // a first, b second; if a < b, overlap + if (b < a) { // no overlap, drop b + b = other[j++]; polarity ^= 2; + } else if (a < b) { // OVERLAP, take a + buffer[k++] = a; a = list[i++]; polarity ^= 1; + } else { // a == b, drop both! 
+ if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; } - break; } - } buffer[k++] = HIGH; // terminate len = k; // swap list and buffer int[] temp = list; list = buffer; buffer = temp; - pat = null; return this; } @@ -1582,58 +1019,46 @@ boolean contains(int codePoint); } - // VersionInfo for unassigned characters - static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); + private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); private static class VersionFilter implements Filter { VersionInfo version; - VersionFilter(VersionInfo version) { this.version = version; } - public boolean contains(int ch) { VersionInfo v = UCharacter.getAge(ch); // Reference comparison ok; VersionInfo caches and reuses // unique objects. return v != NO_VERSION && - v.compareTo(version) <= 0; + v.compareTo(version) <= 0; } } private static synchronized UnicodeSet getInclusions(int src) { - if (INCLUSIONS == null) { - INCLUSIONS = new UnicodeSet[UCharacterProperty.SRC_COUNT]; + if (src != UCharacterProperty.SRC_PROPSVEC) { + throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); } - if(INCLUSIONS[src] == null) { + + if (INCLUSION == null) { UnicodeSet incl = new UnicodeSet(); - switch(src) { - case UCharacterProperty.SRC_PROPSVEC: - UCharacterProperty.getInstance().upropsvec_addPropertyStarts(incl); - break; - default: - throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); - } - INCLUSIONS[src] = incl; + UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); + INCLUSION = incl; } - return INCLUSIONS[src]; + return INCLUSION; } /** * Generic filter-based scanning code for UCD property UnicodeSets. 
*/ private UnicodeSet applyFilter(Filter filter, int src) { - // Walk through all Unicode characters, noting the start + // Logically, walk through all Unicode characters, noting the start // and end of each range for which filter.contain(c) is // true. Add each range to a set. // - // To improve performance, use the INCLUSIONS set, which + // To improve performance, use an inclusions set which // encodes information about character ranges that are known - // to have identical properties, such as the CJK Ideographs - // from U+4E00 to U+9FA5. INCLUSIONS contains all characters - // except the first characters of such ranges. - // - // TODO Where possible, instead of scanning over code points, - // use internal property data to initialize UnicodeSets for - // those properties. Scanning code points is slow. + // to have identical properties. + // getInclusions(src) contains exactly the first characters of + // same-value ranges for the given properties "source". clear(); @@ -1668,204 +1093,315 @@ } /** - * Remove leading and trailing rule white space and compress - * internal rule white space to a single space character. + * Is this frozen, according to the Freezable interface? 
* - * @see UCharacterProperty#isRuleWhiteSpace + * @return value + * @stable ICU 3.8 */ - private static String mungeCharName(String source) { - StringBuffer buf = new StringBuffer(); - for (int i=0; i<source.length(); ) { - int ch = UTF16.charAt(source, i); - i += UTF16.getCharCount(ch); - if (UCharacterProperty.isRuleWhiteSpace(ch)) { - if (buf.length() == 0 || - buf.charAt(buf.length() - 1) == ' ') { - continue; - } - ch = ' '; // convert to ' ' - } - UTF16.append(buf, ch); - } - if (buf.length() != 0 && - buf.charAt(buf.length() - 1) == ' ') { - buf.setLength(buf.length() - 1); - } - return buf.toString(); + public boolean isFrozen() { + return (bmpSet != null || stringSpan != null); } /** - * Modifies this set to contain those code points which have the - * given value for the given property. Prior contents of this - * set are lost. - * @param propertyAlias the property alias - * @param valueAlias the value alias - * @param symbols if not null, then symbols are first called to see if a property - * is available. If true, then everything else is skipped. - * @return this set - * @stable ICU 3.2 - */ - public UnicodeSet applyPropertyAlias(String propertyAlias, - String valueAlias, SymbolTable symbols) { - if (valueAlias.length() > 0) { - if (propertyAlias.equals("Age")) { - // Must munge name, since - // VersionInfo.getInstance() does not do - // 'loose' matching. - VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias)); - applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); - return this; + * Freeze this class, according to the Freezable interface. + * + * @return this + * @stable ICU 4.4 + */ + public UnicodeSet freeze() { + if (!isFrozen()) { + // Do most of what compact() does before freezing because + // compact() will not work when the set is frozen. + // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). + + // Delete buffer first to defragment memory less. 
+ buffer = null; + if (list.length > (len + GROW_EXTRA)) { + // Make the capacity equal to len or 1. + // We don't want to realloc of 0 size. + int capacity = (len == 0) ? 1 : len; + int[] oldList = list; + list = new int[capacity]; + for (int i = capacity; i-- > 0;) { + list[i] = oldList[i]; + } + } + + // Optimize contains() and span() and similar functions. + if (!strings.isEmpty()) { + stringSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), UnicodeSetStringSpan.ALL); + } + if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { + // Optimize for code point spans. + // There are no strings, or + // all strings are irrelevant for span() etc. because + // all of each string's code points are contained in this set. + // However, fully contained strings are relevant for spanAndCount(), + // so we create both objects. + bmpSet = new BMPSet(list, len); } } - throw new IllegalArgumentException("Unsupported property: " + propertyAlias); + return this; } /** - * Return true if the given iterator appears to point at a - * property pattern. Regardless of the result, return with the - * iterator unchanged. - * @param chars iterator over the pattern characters. Upon return - * it will be unchanged. - * @param iterOpts RuleCharacterIterator options - */ - private static boolean resemblesPropertyPattern(RuleCharacterIterator chars, - int iterOpts) { - boolean result = false; - iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES; - Object pos = chars.getPos(null); - int c = chars.next(iterOpts); - if (c == '[' || c == '\\') { - int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE); - result = (c == '[') ? (d == ':') : - (d == 'N' || d == 'p' || d == 'P'); - } - chars.setPos(pos); - return result; + * Span a string using this UnicodeSet. + * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. 
+ * @param s The string to be spanned + * @param spanCondition The span condition + * @return the length of the span + * @stable ICU 4.4 + */ + public int span(CharSequence s, SpanCondition spanCondition) { + return span(s, 0, spanCondition); + } + + /** + * Span a string using this UnicodeSet. + * If the start index is less than 0, span will start from 0. + * If the start index is greater than the string length, span returns the string length. + * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @return the string index which ends the span (i.e. exclusive) + * @stable ICU 4.4 + */ + public int span(CharSequence s, int start, SpanCondition spanCondition) { + int end = s.length(); + if (start < 0) { + start = 0; + } else if (start >= end) { + return end; + } + if (bmpSet != null) { + // Frozen set without strings, or no string is relevant for span(). + return bmpSet.span(s, start, spanCondition, null); + } + if (stringSpan != null) { + return stringSpan.span(s, start, spanCondition); + } else if (!strings.isEmpty()) { + int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); + if (strSpan.needsStringSpanUTF16()) { + return strSpan.span(s, start, spanCondition); + } + } + + return spanCodePointsAndCount(s, start, spanCondition, null); + } + + /** + * Same as span() but also counts the smallest number of set elements on any path across the span. + * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param outCount An output-only object (must not be null) for returning the count. 
+ * @return the limit (exclusive end) of the span + */ + public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { + if (outCount == null) { + throw new IllegalArgumentException("outCount must not be null"); + } + int end = s.length(); + if (start < 0) { + start = 0; + } else if (start >= end) { + return end; + } + if (stringSpan != null) { + // We might also have bmpSet != null, + // but fully-contained strings are relevant for counting elements. + return stringSpan.spanAndCount(s, start, spanCondition, outCount); + } else if (bmpSet != null) { + return bmpSet.span(s, start, spanCondition, outCount); + } else if (!strings.isEmpty()) { + int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; + which |= UnicodeSetStringSpan.WITH_COUNT; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); + return strSpan.spanAndCount(s, start, spanCondition, outCount); + } + + return spanCodePointsAndCount(s, start, spanCondition, outCount); + } + + private int spanCodePointsAndCount(CharSequence s, int start, + SpanCondition spanCondition, OutputInt outCount) { + // Pin to 0/1 values. + boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); + + int c; + int next = start; + int length = s.length(); + int count = 0; + do { + c = Character.codePointAt(s, next); + if (spanContained != contains(c)) { + break; + } + ++count; + next += Character.charCount(c); + } while (next < length); + if (outCount != null) { outCount.value = count; } + return next; } /** - * Parse the given property pattern at the given parse position. 
- * @param symbols TODO - */ - private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) { - int pos = ppos.getIndex(); - - // On entry, ppos should point to one of the following locations: - - // Minimum length is 5 characters, e.g. \p{L} - if ((pos+5) > pattern.length()) { - return null; - } - - boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} - boolean isName = false; // true for \N{pat}, o/w false - boolean invert = false; - - // Look for an opening [:, [:^, \p, or \P - if (pattern.regionMatches(pos, "[:", 0, 2)) { - posix = true; - pos = Utility.skipWhitespace(pattern, pos+2); - if (pos < pattern.length() && pattern.charAt(pos) == '^') { - ++pos; - invert = true; - } - } else if (pattern.regionMatches(true, pos, "\\p", 0, 2) || - pattern.regionMatches(pos, "\\N", 0, 2)) { - char c = pattern.charAt(pos+1); - invert = (c == 'P'); - isName = (c == 'N'); - pos = Utility.skipWhitespace(pattern, pos+2); - if (pos == pattern.length() || pattern.charAt(pos++) != '{') { - // Syntax error; "\p" or "\P" not followed by "{" - return null; + * Span a string backwards (from the fromIndex) using this UnicodeSet. + * If the fromIndex is less than 0, spanBack will return 0. + * If fromIndex is greater than the string length, spanBack will start from the string length. + * <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param s The string to be spanned + * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards + * @param spanCondition The span condition + * @return The string index which starts the span (i.e. inclusive). 
+ * @stable ICU 4.4 + */ + public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { + if (fromIndex <= 0) { + return 0; + } + if (fromIndex > s.length()) { + fromIndex = s.length(); + } + if (bmpSet != null) { + // Frozen set without strings, or no string is relevant for spanBack(). + return bmpSet.spanBack(s, fromIndex, spanCondition); + } + if (stringSpan != null) { + return stringSpan.spanBack(s, fromIndex, spanCondition); + } else if (!strings.isEmpty()) { + int which = (spanCondition == SpanCondition.NOT_CONTAINED) + ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList<String>(strings), which); + if (strSpan.needsStringSpanUTF16()) { + return strSpan.spanBack(s, fromIndex, spanCondition); } - } else { - // Open delimiter not seen - return null; - } - - // Look for the matching close delimiter, either :] or } - int close = pattern.indexOf(posix ? ":]" : "}", pos); - if (close < 0) { - // Syntax error; close delimiter missing - return null; - } - - // Look for an '=' sign. If this is present, we will parse a - // medium \p{gc=Cf} or long \p{GeneralCategory=Format} - // pattern. - int equals = pattern.indexOf('=', pos); - String propName, valueName; - if (equals >= 0 && equals < close && !isName) { - // Equals seen; parse medium/long pattern - propName = pattern.substring(pos, equals); - valueName = pattern.substring(equals+1, close); } - else { - // Handle case where no '=' is seen, and \N{} - propName = pattern.substring(pos, close); - valueName = ""; - - // Handle \N{name} - if (isName) { - // This is a little inefficient since it means we have to - // parse "na" back to UProperty.NAME even though we already - // know it's UProperty.NAME. If we refactor the API to - // support args of (int, String) then we can remove - // "na" and make this a little more efficient. 
- valueName = propName; - propName = "na"; + // Pin to 0/1 values. + boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); + + int c; + int prev = fromIndex; + do { + c = Character.codePointBefore(s, prev); + if (spanContained != contains(c)) { + break; } - } - - applyPropertyAlias(propName, valueName, symbols); - - if (invert) { - complement(); - } - - // Move to the limit position after the close delimiter - ppos.setIndex(close + (posix ? 2 : 1)); - - return this; + prev -= Character.charCount(c); + } while (prev > 0); + return prev; } /** - * Parse a property pattern. - * @param chars iterator over the pattern characters. Upon return - * it will be advanced to the first character after the parsed - * pattern, or the end of the iteration if all characters are - * parsed. - * @param rebuiltPat the pattern that was parsed, rebuilt or - * copied from the input pattern, as appropriate. - * @param symbols TODO - */ - private void applyPropertyPattern(RuleCharacterIterator chars, - StringBuffer rebuiltPat, SymbolTable symbols) { - String patStr = chars.lookahead(); - ParsePosition pos = new ParsePosition(0); - applyPropertyPattern(patStr, pos, symbols); - if (pos.getIndex() == 0) { - syntaxError(chars, "Invalid property pattern"); - } - chars.jumpahead(pos.getIndex()); - rebuiltPat.append(patStr, 0, pos.getIndex()); + * Clone a thawed version of this class, according to the Freezable interface. 
+ * @return the clone, not frozen + * @stable ICU 4.4 + */ + public UnicodeSet cloneAsThawed() { + UnicodeSet result = new UnicodeSet(this); + assert !result.isFrozen(); + return result; } - //---------------------------------------------------------------- - // Case folding API - //---------------------------------------------------------------- + // internal function + private void checkFrozen() { + if (isFrozen()) { + throw new UnsupportedOperationException("Attempt to modify frozen object"); + } + } /** - * Bitmask for constructor and applyPattern() indicating that - * white space should be ignored. If set, ignore characters for - * which UCharacterProperty.isRuleWhiteSpace() returns true, - * unless they are quoted or escaped. This may be ORed together - * with other selectors. - * @stable ICU 3.8 + * Argument values for whether span() and similar functions continue while the current character is contained vs. + * not contained in the set. + * <p> + * The functionality is straightforward for sets with only single code points, without strings (which is the common + * case): + * <ul> + * <li>CONTAINED and SIMPLE work the same. + * <li>CONTAINED and SIMPLE are inverses of NOT_CONTAINED. + * <li>span() and spanBack() partition any string the + * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). + * <li>Using a + * complemented (inverted) set and the opposite span conditions yields the same results. + * </ul> + * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in + * the set (for example, whether they overlap with each other) and the string that is processed. For a set with + * strings: + * <ul> + * <li>The complement of the set contains the opposite set of code points, but the same set of strings. + * Therefore, complementing both the set and the span conditions may yield different results. 
+ * <li>When starting spans + * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different + * because a set string may start before the later position. + * <li>span(SIMPLE) may be shorter than + * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which + * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", + * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). + * <li>With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, + * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield + * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. + * </ul> + * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then + * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could + * be used. + * <p> + * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point + * boundaries, never in the middle of a surrogate pair. + * + * @stable ICU 4.4 */ - public static final int IGNORE_SPACE = 1; + public enum SpanCondition { + /** + * Continues a span() while there is no set element at the current position. + * Increments by one code point at a time. + * Stops before the first set element (character or string). + * (For code points only, this is like while contains(current)==false). + * <p> + * When span() returns, the substring between where it started and the position it returned consists only of + * characters that are not in the set, and none of its strings overlap with the span. 
+ * + * @stable ICU 4.4 + */ + NOT_CONTAINED, -} + /** + * Spans the longest substring that is a concatenation of set elements (characters or strings). + * (For characters only, this is like while contains(current)==true). + * <p> + * When span() returns, the substring between where it started and the position it returned consists only of set + * elements (characters or strings) that are in the set. + * <p> + * If a set contains strings, then the span will be the longest substring for which there + * exists at least one non-overlapping concatenation of set elements (characters or strings). + * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. + * (Java/ICU/Perl regex stops at the first match of an OR.) + * + * @stable ICU 4.4 + */ + CONTAINED, + /** + * Continues a span() while there is a set element at the current position. + * Increments by the longest matching element at each position. + * (For characters only, this is like while contains(current)==true). + * <p> + * When span() returns, the substring between where it started and the position it returned consists only of set + * elements (characters or strings) that are in the set. + * <p> + * If a set only contains single characters, then this is the same as CONTAINED. + * <p> + * If a set contains strings, then the span will be the longest substring with a match at each position with the + * longest single set element (character or string). + * <p> + * Use this span condition together with other longest-match algorithms, such as ICU converters + * (ucnv_getUnicodeSet()). + * + * @stable ICU 4.4 + */ + SIMPLE, + } + +} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/Utility.java 2015-07-13 16:11:58.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Utility.java 2015-07-13 16:11:58.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,47 +24,26 @@ */ /* ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * + * Copyright (C) 1996-2011, International Business Machines Corporation and * + * others. All Rights Reserved. * ******************************************************************************* */ package sun.text.normalizer; -public final class Utility { +import java.io.IOException; +import java.util.Locale; - /** - * Convenience utility to compare two Object[]s - * Ought to be in System. - * @param len the length to compare. - * The start indices and start+len must be valid. - */ - public final static boolean arrayRegionMatches(char[] source, int sourceStart, - char[] target, int targetStart, - int len) - { - int sourceEnd = sourceStart + len; - int delta = targetStart - sourceStart; - for (int i = sourceStart; i < sourceEnd; i++) { - if (source[i]!=target[i + delta]) - return false; - } - return true; - } +final class Utility { /** * Convert characters outside the range U+0020 to U+007F to * Unicode escapes, and convert backslash to a double backslash. 
*/ public static final String escape(String s) { - StringBuffer buf = new StringBuffer(); + StringBuilder buf = new StringBuilder(); for (int i=0; i<s.length(); ) { - int c = UTF16.charAt(s, i); + int c = Character.codePointAt(s, i); i += UTF16.getCharCount(c); if (c >= ' ' && c <= 0x007F) { if (c == '\\') { @@ -75,7 +54,7 @@ } else { boolean four = c <= 0xFFFF; buf.append(four ? "\\u" : "\\U"); - hex(c, four ? 4 : 8, buf); + buf.append(hex(c, four ? 4 : 8)); } } return buf.toString(); @@ -124,7 +103,7 @@ } /* Fetch first UChar after '\\' */ - c = UTF16.charAt(s, offset); + c = Character.codePointAt(s, offset); offset += UTF16.getCharCount(c); /* Convert hexadecimal and octal escapes */ @@ -143,7 +122,7 @@ maxDig = 8; } else { maxDig = 2; - } + } break; default: dig = UCharacter.digit(c, 8); @@ -175,7 +154,7 @@ return -1; } ++offset; - } + } if (result < 0 || result >= 0x110000) { return -1; } @@ -184,7 +163,7 @@ // escape or as a literal. If so, join them up into a // supplementary. if (offset < length && - UTF16.isLeadSurrogate((char) result)) { + UTF16.isLeadSurrogate((char) result)) { int ahead = offset+1; c = s.charAt(offset); // [sic] get 16-bit code unit if (c == '\\' && ahead < length) { @@ -194,8 +173,8 @@ } if (UTF16.isTrailSurrogate((char) c)) { offset = ahead; - result = UCharacterProperty.getRawSupplementary( - (char) result, (char) c); + result = UCharacterProperty.getRawSupplementary( + (char) result, (char) c); } } offset16[0] = offset; @@ -226,39 +205,22 @@ } /** - * Convert a integer to size width hex uppercase digits. - * E.g., {@code hex('a', 4, str) => "0041"}. - * Append the output to the given StringBuffer. - * If width is too small to fit, nothing will be appended to output. - */ - public static StringBuffer hex(int ch, int width, StringBuffer output) { - return appendNumber(output, ch, 16, width); - } - - /** - * Convert a integer to size width (minimum) hex uppercase digits. - * E.g., {@code hex('a', 4, str) => "0041"}. 
If the integer requires more - * than width digits, more will be used. - */ - public static String hex(int ch, int width) { - StringBuffer buf = new StringBuffer(); - return appendNumber(buf, ch, 16, width).toString(); - } - - /** - * Skip over a sequence of zero or more white space characters - * at pos. Return the index of the first non-white-space character - * at or after pos, or str.length(), if there is none. + * Supplies a zero-padded hex representation of an integer (without 0x) */ - public static int skipWhitespace(String str, int pos) { - while (pos < str.length()) { - int c = UTF16.charAt(str, pos); - if (!UCharacterProperty.isRuleWhiteSpace(c)) { - break; - } - pos += UTF16.getCharCount(c); + static public String hex(long i, int places) { + if (i == Long.MIN_VALUE) return "-8000000000000000"; + boolean negative = i < 0; + if (negative) { + i = -i; + } + String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH); + if (result.length() < places) { + result = "0000000000000000".substring(result.length(),places) + result; + } + if (negative) { + return '-' + result; } - return pos; + return result; } static final char DIGITS[] = { @@ -269,117 +231,43 @@ }; /** - * Append the digits of a positive integer to the given - * <code>StringBuffer</code> in the given radix. This is - * done recursively since it is easiest to generate the low- - * order digit first, but it must be appended last. - * - * @param result is the <code>StringBuffer</code> to append to - * @param n is the positive integer - * @param radix is the radix, from 2 to 36 inclusive - * @param minDigits is the minimum number of digits to append. 
- */ - private static void recursiveAppendNumber(StringBuffer result, int n, - int radix, int minDigits) - { - int digit = n % radix; - - if (n >= radix || minDigits > 1) { - recursiveAppendNumber(result, n / radix, radix, minDigits - 1); - } - - result.append(DIGITS[digit]); - } - - /** - * Append a number to the given StringBuffer in the given radix. - * Standard digits '0'-'9' are used and letters 'A'-'Z' for - * radices 11 through 36. - * @param result the digits of the number are appended here - * @param n the number to be converted to digits; may be negative. - * If negative, a '-' is prepended to the digits. - * @param radix a radix from 2 to 36 inclusive. - * @param minDigits the minimum number of digits, not including - * any '-', to produce. Values less than 2 have no effect. One - * digit is always emitted regardless of this parameter. - * @return a reference to result - */ - public static StringBuffer appendNumber(StringBuffer result, int n, - int radix, int minDigits) - throws IllegalArgumentException - { - if (radix < 2 || radix > 36) { - throw new IllegalArgumentException("Illegal radix " + radix); - } - - - int abs = n; - - if (n < 0) { - abs = -n; - result.append("-"); - } - - recursiveAppendNumber(result, abs, radix, minDigits); - - return result; - } - - /** * Return true if the character is NOT printable ASCII. The tab, * newline and linefeed characters are considered unprintable. */ public static boolean isUnprintable(int c) { + //0x20 = 32 and 0x7E = 126 return !(c >= 0x20 && c <= 0x7E); } /** - * Escape unprintable characters using {@code <backslash>uxxxx} notation - * for U+0000 to U+FFFF and {@code <backslash>Uxxxxxxxx} for U+10000 and + * Escape unprintable characters using <backslash>uxxxx notation + * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and * above. If the character is printable ASCII, then do nothing * and return FALSE. Otherwise, append the escaped notation and * return TRUE. 
*/ - public static boolean escapeUnprintable(StringBuffer result, int c) { - if (isUnprintable(c)) { - result.append('\\'); - if ((c & ~0xFFFF) != 0) { - result.append('U'); - result.append(DIGITS[0xF&(c>>28)]); - result.append(DIGITS[0xF&(c>>24)]); - result.append(DIGITS[0xF&(c>>20)]); - result.append(DIGITS[0xF&(c>>16)]); - } else { - result.append('u'); + public static <T extends Appendable> boolean escapeUnprintable(T result, int c) { + try { + if (isUnprintable(c)) { + result.append('\\'); + if ((c & ~0xFFFF) != 0) { + result.append('U'); + result.append(DIGITS[0xF&(c>>28)]); + result.append(DIGITS[0xF&(c>>24)]); + result.append(DIGITS[0xF&(c>>20)]); + result.append(DIGITS[0xF&(c>>16)]); + } else { + result.append('u'); + } + result.append(DIGITS[0xF&(c>>12)]); + result.append(DIGITS[0xF&(c>>8)]); + result.append(DIGITS[0xF&(c>>4)]); + result.append(DIGITS[0xF&c]); + return true; } - result.append(DIGITS[0xF&(c>>12)]); - result.append(DIGITS[0xF&(c>>8)]); - result.append(DIGITS[0xF&(c>>4)]); - result.append(DIGITS[0xF&c]); - return true; - } - return false; - } - - /** - * Similar to StringBuffer.getChars, version 1.3. - * Since JDK 1.2 implements StringBuffer.getChars differently, this method - * is here to provide consistent results. - * To be removed after JDK 1.2 ceased to be the reference platform. 
- * @param src source string buffer - * @param srcBegin offset to the start of the src to retrieve from - * @param srcEnd offset to the end of the src to retrieve from - * @param dst char array to store the retrieved chars - * @param dstBegin offset to the start of the destination char array to - * store the retrieved chars - */ - public static void getChars(StringBuffer src, int srcBegin, int srcEnd, - char dst[], int dstBegin) - { - if (srcBegin == srcEnd) { - return; + return false; + } catch (IOException e) { + throw new IllegalArgumentException(e); } - src.getChars(srcBegin, srcEnd, dst, dstBegin); } - } Binary files old/jdk/src/java.base/share/classes/sun/text/resources/ubidi.icu and new/jdk/src/java.base/share/classes/sun/text/resources/ubidi.icu differ Binary files old/jdk/src/java.base/share/classes/sun/text/resources/uprops.icu and new/jdk/src/java.base/share/classes/sun/text/resources/uprops.icu differ --- old/jdk/src/java.desktop/share/classes/java/awt/font/NumericShaper.java 2015-07-13 16:12:00.000000000 +0900 +++ new/jdk/src/java.desktop/share/classes/java/awt/font/NumericShaper.java 2015-07-13 16:12:00.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -318,7 +318,17 @@ /** * The Meetei Mayek range with the Meetei Mayek digits. */ - MEETEI_MAYEK ('\uabf0', '\uabc0', '\uac00'); + MEETEI_MAYEK ('\uabf0', '\uabc0', '\uac00'), + /** + * The Sinhala range with the Sinhala digits. + * @since 1.9 + */ + SINHALA ('\u0de6', '\u0d80', '\u0e00'), + /** + * The Myanmar Extended-B range with the Myanmar Tai Laing digits. 
+ * @since 1.9 + */ + MYANMAR_TAI_LAING ('\ua9f0', '\ua9e0', '\uaa00'); private static int toRangeIndex(Range script) { int index = script.ordinal(); @@ -624,15 +634,25 @@ 0x02e5, 0x02ee, 0x02ef, 0x0370, 0x0374, 0x0376, - 0x037e, 0x0386, + 0x0378, 0x037a, + 0x037e, 0x037f, + 0x0380, 0x0386, 0x0387, 0x0388, + 0x038b, 0x038c, + 0x038d, 0x038e, + 0x03a2, 0x03a3, 0x03f6, 0x03f7, 0x0483, 0x048a, - 0x058a, 0x05be, + 0x0530, 0x0531, + 0x0557, 0x0559, + 0x0560, 0x0561, + 0x0588, 0x0589, + 0x058a, 0x0590, + 0x0591, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c3, 0x05c4, 0x05c6, - 0x05c7, 0x05d0, + 0x05c7, 0x05c8, 0x0600, 0x0608, 0x0609, 0x060b, 0x060c, 0x060d, @@ -643,15 +663,15 @@ 0x06e7, 0x06ee, 0x06f0, 0x06fa, 0x0711, 0x0712, - 0x0730, 0x074d, + 0x0730, 0x074b, 0x07a6, 0x07b1, 0x07eb, 0x07f4, 0x07f6, 0x07fa, 0x0816, 0x081a, 0x081b, 0x0824, 0x0825, 0x0828, - 0x0829, 0x0830, - 0x0859, 0x085e, + 0x0829, 0x082e, + 0x0859, 0x085c, 0x08e4, 0x0903, 0x093a, 0x093b, 0x093c, 0x093d, @@ -660,57 +680,161 @@ 0x0951, 0x0958, 0x0962, 0x0964, 0x0981, 0x0982, - 0x09bc, 0x09bd, + 0x0984, 0x0985, + 0x098d, 0x098f, + 0x0991, 0x0993, + 0x09a9, 0x09aa, + 0x09b1, 0x09b2, + 0x09b3, 0x09b6, + 0x09ba, 0x09bd, 0x09c1, 0x09c7, + 0x09c9, 0x09cb, 0x09cd, 0x09ce, + 0x09cf, 0x09d7, + 0x09d8, 0x09dc, + 0x09de, 0x09df, 0x09e2, 0x09e6, 0x09f2, 0x09f4, 0x09fb, 0x0a03, - 0x0a3c, 0x0a3e, + 0x0a04, 0x0a05, + 0x0a0b, 0x0a0f, + 0x0a11, 0x0a13, + 0x0a29, 0x0a2a, + 0x0a31, 0x0a32, + 0x0a34, 0x0a35, + 0x0a37, 0x0a38, + 0x0a3a, 0x0a3e, 0x0a41, 0x0a59, + 0x0a5d, 0x0a5e, + 0x0a5f, 0x0a66, 0x0a70, 0x0a72, 0x0a75, 0x0a83, - 0x0abc, 0x0abd, + 0x0a84, 0x0a85, + 0x0a8e, 0x0a8f, + 0x0a92, 0x0a93, + 0x0aa9, 0x0aaa, + 0x0ab1, 0x0ab2, + 0x0ab4, 0x0ab5, + 0x0aba, 0x0abd, 0x0ac1, 0x0ac9, + 0x0aca, 0x0acb, 0x0acd, 0x0ad0, + 0x0ad1, 0x0ae0, 0x0ae2, 0x0ae6, 0x0af1, 0x0b02, - 0x0b3c, 0x0b3d, + 0x0b04, 0x0b05, + 0x0b0d, 0x0b0f, + 0x0b11, 0x0b13, + 0x0b29, 0x0b2a, + 0x0b31, 0x0b32, + 0x0b34, 0x0b35, + 0x0b3a, 0x0b3d, 0x0b3f, 0x0b40, 0x0b41, 
0x0b47, + 0x0b49, 0x0b4b, 0x0b4d, 0x0b57, + 0x0b58, 0x0b5c, + 0x0b5e, 0x0b5f, 0x0b62, 0x0b66, - 0x0b82, 0x0b83, + 0x0b78, 0x0b83, + 0x0b84, 0x0b85, + 0x0b8b, 0x0b8e, + 0x0b91, 0x0b92, + 0x0b96, 0x0b99, + 0x0b9b, 0x0b9c, + 0x0b9d, 0x0b9e, + 0x0ba0, 0x0ba3, + 0x0ba5, 0x0ba8, + 0x0bab, 0x0bae, + 0x0bba, 0x0bbe, 0x0bc0, 0x0bc1, + 0x0bc3, 0x0bc6, + 0x0bc9, 0x0bca, 0x0bcd, 0x0bd0, + 0x0bd1, 0x0bd7, + 0x0bd8, 0x0be6, 0x0bf3, 0x0c01, + 0x0c04, 0x0c05, + 0x0c0d, 0x0c0e, + 0x0c11, 0x0c12, + 0x0c29, 0x0c2a, + 0x0c3a, 0x0c3d, 0x0c3e, 0x0c41, - 0x0c46, 0x0c58, + 0x0c45, 0x0c58, + 0x0c5a, 0x0c60, 0x0c62, 0x0c66, - 0x0c78, 0x0c7f, - 0x0cbc, 0x0cbd, + 0x0c70, 0x0c7f, + 0x0c80, 0x0c82, + 0x0c84, 0x0c85, + 0x0c8d, 0x0c8e, + 0x0c91, 0x0c92, + 0x0ca9, 0x0caa, + 0x0cb4, 0x0cb5, + 0x0cba, 0x0cbd, + 0x0cc5, 0x0cc6, + 0x0cc9, 0x0cca, 0x0ccc, 0x0cd5, + 0x0cd7, 0x0cde, + 0x0cdf, 0x0ce0, 0x0ce2, 0x0ce6, + 0x0cf0, 0x0cf1, + 0x0cf3, 0x0d02, + 0x0d04, 0x0d05, + 0x0d0d, 0x0d0e, + 0x0d11, 0x0d12, + 0x0d3b, 0x0d3d, 0x0d41, 0x0d46, + 0x0d49, 0x0d4a, 0x0d4d, 0x0d4e, + 0x0d4f, 0x0d57, + 0x0d58, 0x0d60, 0x0d62, 0x0d66, - 0x0dca, 0x0dcf, + 0x0d76, 0x0d79, + 0x0d80, 0x0d82, + 0x0d84, 0x0d85, + 0x0d97, 0x0d9a, + 0x0db2, 0x0db3, + 0x0dbc, 0x0dbd, + 0x0dbe, 0x0dc0, + 0x0dc7, 0x0dcf, 0x0dd2, 0x0dd8, + 0x0de0, 0x0de6, + 0x0df0, 0x0df2, + 0x0df5, 0x0e01, 0x0e31, 0x0e32, 0x0e34, 0x0e40, 0x0e47, 0x0e4f, + 0x0e5c, 0x0e81, + 0x0e83, 0x0e84, + 0x0e85, 0x0e87, + 0x0e89, 0x0e8a, + 0x0e8b, 0x0e8d, + 0x0e8e, 0x0e94, + 0x0e98, 0x0e99, + 0x0ea0, 0x0ea1, + 0x0ea4, 0x0ea5, + 0x0ea6, 0x0ea7, + 0x0ea8, 0x0eaa, + 0x0eac, 0x0ead, 0x0eb1, 0x0eb2, 0x0eb4, 0x0ebd, - 0x0ec8, 0x0ed0, + 0x0ebe, 0x0ec0, + 0x0ec5, 0x0ec6, + 0x0ec7, 0x0ed0, + 0x0eda, 0x0edc, + 0x0ee0, 0x0f00, 0x0f18, 0x0f1a, 0x0f35, 0x0f36, 0x0f37, 0x0f38, 0x0f39, 0x0f3e, - 0x0f71, 0x0f7f, + 0x0f48, 0x0f49, + 0x0f6d, 0x0f7f, 0x0f80, 0x0f85, 0x0f86, 0x0f88, 0x0f8d, 0x0fbe, 0x0fc6, 0x0fc7, + 0x0fcd, 0x0fce, + 0x0fdb, 0x1000, 0x102d, 0x1031, 0x1032, 0x1038, 0x1039, 
0x103b, @@ -722,66 +846,119 @@ 0x1085, 0x1087, 0x108d, 0x108e, 0x109d, 0x109e, - 0x135d, 0x1360, + 0x10c6, 0x10c7, + 0x10c8, 0x10cd, + 0x10ce, 0x10d0, + 0x1249, 0x124a, + 0x124e, 0x1250, + 0x1257, 0x1258, + 0x1259, 0x125a, + 0x125e, 0x1260, + 0x1289, 0x128a, + 0x128e, 0x1290, + 0x12b1, 0x12b2, + 0x12b6, 0x12b8, + 0x12bf, 0x12c0, + 0x12c1, 0x12c2, + 0x12c6, 0x12c8, + 0x12d7, 0x12d8, + 0x1311, 0x1312, + 0x1316, 0x1318, + 0x135b, 0x1360, + 0x137d, 0x1380, 0x1390, 0x13a0, - 0x1400, 0x1401, + 0x13f5, 0x1401, 0x1680, 0x1681, 0x169b, 0x16a0, + 0x16f9, 0x1700, + 0x170d, 0x170e, 0x1712, 0x1720, 0x1732, 0x1735, + 0x1737, 0x1740, 0x1752, 0x1760, - 0x1772, 0x1780, + 0x176d, 0x176e, + 0x1771, 0x1780, 0x17b4, 0x17b6, 0x17b7, 0x17be, 0x17c6, 0x17c7, 0x17c9, 0x17d4, 0x17db, 0x17dc, 0x17dd, 0x17e0, - 0x17f0, 0x1810, + 0x17ea, 0x1810, + 0x181a, 0x1820, + 0x1878, 0x1880, 0x18a9, 0x18aa, - 0x1920, 0x1923, + 0x18ab, 0x18b0, + 0x18f6, 0x1900, + 0x191f, 0x1923, 0x1927, 0x1929, + 0x192c, 0x1930, 0x1932, 0x1933, 0x1939, 0x1946, - 0x19de, 0x1a00, + 0x196e, 0x1970, + 0x1975, 0x1980, + 0x19ac, 0x19b0, + 0x19ca, 0x19d0, + 0x19db, 0x1a00, 0x1a17, 0x1a19, + 0x1a1b, 0x1a1e, 0x1a56, 0x1a57, 0x1a58, 0x1a61, 0x1a62, 0x1a63, 0x1a65, 0x1a6d, 0x1a73, 0x1a80, - 0x1b00, 0x1b04, + 0x1a8a, 0x1a90, + 0x1a9a, 0x1aa0, + 0x1aae, 0x1b04, 0x1b34, 0x1b35, 0x1b36, 0x1b3b, 0x1b3c, 0x1b3d, 0x1b42, 0x1b43, + 0x1b4c, 0x1b50, 0x1b6b, 0x1b74, - 0x1b80, 0x1b82, + 0x1b7d, 0x1b82, 0x1ba2, 0x1ba6, 0x1ba8, 0x1baa, - 0x1bab, 0x1bac, + 0x1bab, 0x1bae, 0x1be6, 0x1be7, 0x1be8, 0x1bea, 0x1bed, 0x1bee, 0x1bef, 0x1bf2, + 0x1bf4, 0x1bfc, 0x1c2c, 0x1c34, 0x1c36, 0x1c3b, - 0x1cd0, 0x1cd3, + 0x1c4a, 0x1c4d, + 0x1c80, 0x1cc0, + 0x1cc8, 0x1cd3, 0x1cd4, 0x1ce1, 0x1ce2, 0x1ce9, 0x1ced, 0x1cee, 0x1cf4, 0x1cf5, + 0x1cf7, 0x1d00, 0x1dc0, 0x1e00, + 0x1f16, 0x1f18, + 0x1f1e, 0x1f20, + 0x1f46, 0x1f48, + 0x1f4e, 0x1f50, + 0x1f58, 0x1f59, + 0x1f5a, 0x1f5b, + 0x1f5c, 0x1f5d, + 0x1f5e, 0x1f5f, + 0x1f7e, 0x1f80, + 0x1fb5, 0x1fb6, 0x1fbd, 0x1fbe, 
0x1fbf, 0x1fc2, + 0x1fc5, 0x1fc6, 0x1fcd, 0x1fd0, - 0x1fdd, 0x1fe0, + 0x1fd4, 0x1fd6, + 0x1fdc, 0x1fe0, 0x1fed, 0x1ff2, + 0x1ff5, 0x1ff6, 0x1ffd, 0x200e, 0x2010, 0x2071, - 0x2074, 0x207f, + 0x2072, 0x207f, 0x2080, 0x2090, - 0x20a0, 0x2102, + 0x209d, 0x2102, 0x2103, 0x2107, 0x2108, 0x210a, 0x2114, 0x2115, @@ -801,35 +978,59 @@ 0x24ea, 0x26ac, 0x26ad, 0x2800, 0x2900, 0x2c00, + 0x2c2f, 0x2c30, + 0x2c5f, 0x2c60, 0x2ce5, 0x2ceb, 0x2cef, 0x2cf2, - 0x2cf9, 0x2d00, - 0x2d7f, 0x2d80, - 0x2de0, 0x3005, + 0x2cf4, 0x2d00, + 0x2d26, 0x2d27, + 0x2d28, 0x2d2d, + 0x2d2e, 0x2d30, + 0x2d68, 0x2d6f, + 0x2d71, 0x2d80, + 0x2d97, 0x2da0, + 0x2da7, 0x2da8, + 0x2daf, 0x2db0, + 0x2db7, 0x2db8, + 0x2dbf, 0x2dc0, + 0x2dc7, 0x2dc8, + 0x2dcf, 0x2dd0, + 0x2dd7, 0x2dd8, + 0x2ddf, 0x3005, 0x3008, 0x3021, - 0x302a, 0x3031, + 0x302a, 0x302e, + 0x3030, 0x3031, 0x3036, 0x3038, 0x303d, 0x3041, - 0x3099, 0x309d, + 0x3097, 0x309d, 0x30a0, 0x30a1, 0x30fb, 0x30fc, - 0x31c0, 0x31f0, + 0x3100, 0x3105, + 0x312e, 0x3131, + 0x318f, 0x3190, + 0x31bb, 0x31f0, 0x321d, 0x3220, 0x3250, 0x3260, 0x327c, 0x327f, 0x32b1, 0x32c0, 0x32cc, 0x32d0, + 0x32ff, 0x3300, 0x3377, 0x337b, 0x33de, 0x33e0, 0x33ff, 0x3400, - 0x4dc0, 0x4e00, - 0xa490, 0xa4d0, + 0x4db6, 0x4e00, + 0x9fcd, 0xa000, + 0xa48d, 0xa4d0, 0xa60d, 0xa610, + 0xa62c, 0xa640, 0xa66f, 0xa680, - 0xa69f, 0xa6a0, + 0xa69e, 0xa6a0, 0xa6f0, 0xa6f2, - 0xa700, 0xa722, + 0xa6f8, 0xa722, 0xa788, 0xa789, + 0xa78f, 0xa790, + 0xa7ae, 0xa7b0, + 0xa7b2, 0xa7f7, 0xa802, 0xa803, 0xa806, 0xa807, 0xa80b, 0xa80c, @@ -838,77 +1039,241 @@ 0xa838, 0xa840, 0xa874, 0xa880, 0xa8c4, 0xa8ce, - 0xa8e0, 0xa8f2, + 0xa8da, 0xa8f2, + 0xa8fc, 0xa900, 0xa926, 0xa92e, 0xa947, 0xa952, - 0xa980, 0xa983, + 0xa954, 0xa95f, + 0xa97d, 0xa983, 0xa9b3, 0xa9b4, 0xa9b6, 0xa9ba, 0xa9bc, 0xa9bd, + 0xa9ce, 0xa9cf, + 0xa9da, 0xa9de, + 0xa9e5, 0xa9e6, + 0xa9ff, 0xaa00, 0xaa29, 0xaa2f, 0xaa31, 0xaa33, 0xaa35, 0xaa40, 0xaa43, 0xaa44, 0xaa4c, 0xaa4d, + 0xaa4e, 0xaa50, + 0xaa5a, 0xaa5c, + 0xaa7c, 0xaa7d, 0xaab0, 
0xaab1, 0xaab2, 0xaab5, 0xaab7, 0xaab9, 0xaabe, 0xaac0, 0xaac1, 0xaac2, + 0xaac3, 0xaadb, 0xaaec, 0xaaee, 0xaaf6, 0xab01, + 0xab07, 0xab09, + 0xab0f, 0xab11, + 0xab17, 0xab20, + 0xab27, 0xab28, + 0xab2f, 0xab30, + 0xab60, 0xab64, + 0xab66, 0xabc0, 0xabe5, 0xabe6, 0xabe8, 0xabe9, 0xabed, 0xabf0, + 0xabfa, 0xac00, + 0xd7a4, 0xd7b0, + 0xd7c7, 0xd7cb, + 0xd7fc, 0xe000, + 0xfa6e, 0xfa70, + 0xfada, 0xfb00, + 0xfb07, 0xfb13, + 0xfb18, 0xfb1d, 0xfb1e, 0xfb1f, 0xfb29, 0xfb2a, - 0xfd3e, 0xfd50, - 0xfdfd, 0xfe70, + 0xfd3e, 0xfd40, + 0xfdd0, 0xfdf0, + 0xfdfd, 0xfdfe, + 0xfe00, 0xfe70, 0xfeff, 0xff21, 0xff3b, 0xff41, 0xff5b, 0xff66, - 0xffe0, 0x10000, + 0xffbf, 0xffc2, + 0xffc8, 0xffca, + 0xffd0, 0xffd2, + 0xffd8, 0xffda, + 0xffdd, 0x10000, + 0x1000c, 0x1000d, + 0x10027, 0x10028, + 0x1003b, 0x1003c, + 0x1003e, 0x1003f, + 0x1004e, 0x10050, + 0x1005e, 0x10080, + 0x100fb, 0x10100, 0x10101, 0x10102, + 0x10103, 0x10107, + 0x10134, 0x10137, 0x10140, 0x101d0, 0x101fd, 0x10280, + 0x1029d, 0x102a0, + 0x102d1, 0x10300, + 0x10324, 0x10330, + 0x1034b, 0x10350, + 0x10376, 0x10380, + 0x1039e, 0x1039f, + 0x103c4, 0x103c8, + 0x103d6, 0x10400, + 0x1049e, 0x104a0, + 0x104aa, 0x10500, + 0x10528, 0x10530, + 0x10564, 0x1056f, + 0x10570, 0x10600, + 0x10737, 0x10740, + 0x10756, 0x10760, + 0x10768, 0x10800, 0x1091f, 0x10920, - 0x10a01, 0x10a10, - 0x10a38, 0x10a40, + 0x10a01, 0x10a04, + 0x10a05, 0x10a07, + 0x10a0c, 0x10a10, + 0x10a38, 0x10a3b, + 0x10a3f, 0x10a40, + 0x10ae5, 0x10ae7, 0x10b39, 0x10b40, - 0x10e60, 0x11000, + 0x10e60, 0x10e7f, 0x11001, 0x11002, 0x11038, 0x11047, - 0x11052, 0x11066, - 0x11080, 0x11082, + 0x1104e, 0x11066, + 0x11070, 0x11082, 0x110b3, 0x110b7, 0x110b9, 0x110bb, - 0x11100, 0x11103, + 0x110c2, 0x110d0, + 0x110e9, 0x110f0, + 0x110fa, 0x11103, 0x11127, 0x1112c, 0x1112d, 0x11136, - 0x11180, 0x11182, + 0x11144, 0x11150, + 0x11173, 0x11174, + 0x11177, 0x11182, 0x111b6, 0x111bf, + 0x111c9, 0x111cd, + 0x111ce, 0x111d0, + 0x111db, 0x111e1, + 0x111f5, 0x11200, + 0x11212, 0x11213, + 
0x1122f, 0x11232, + 0x11234, 0x11235, + 0x11236, 0x11238, + 0x1123e, 0x112b0, + 0x112df, 0x112e0, + 0x112e3, 0x112f0, + 0x112fa, 0x11302, + 0x11304, 0x11305, + 0x1130d, 0x1130f, + 0x11311, 0x11313, + 0x11329, 0x1132a, + 0x11331, 0x11332, + 0x11334, 0x11335, + 0x1133a, 0x1133d, + 0x11340, 0x11341, + 0x11345, 0x11347, + 0x11349, 0x1134b, + 0x1134e, 0x11357, + 0x11358, 0x1135d, + 0x11364, 0x11480, + 0x114b3, 0x114b9, + 0x114ba, 0x114bb, + 0x114bf, 0x114c1, + 0x114c2, 0x114c4, + 0x114c8, 0x114d0, + 0x114da, 0x11580, + 0x115b2, 0x115b8, + 0x115bc, 0x115be, + 0x115bf, 0x115c1, + 0x115ca, 0x11600, + 0x11633, 0x1163b, + 0x1163d, 0x1163e, + 0x1163f, 0x11641, + 0x11645, 0x11650, + 0x1165a, 0x11680, 0x116ab, 0x116ac, 0x116ad, 0x116ae, 0x116b0, 0x116b6, 0x116b7, 0x116c0, - 0x16f8f, 0x16f93, + 0x116ca, 0x118a0, + 0x118f3, 0x118ff, + 0x11900, 0x11ac0, + 0x11af9, 0x12000, + 0x12399, 0x12400, + 0x1246f, 0x12470, + 0x12475, 0x13000, + 0x1342f, 0x16800, + 0x16a39, 0x16a40, + 0x16a5f, 0x16a60, + 0x16a6a, 0x16a6e, + 0x16a70, 0x16ad0, + 0x16aee, 0x16af5, + 0x16af6, 0x16b00, + 0x16b30, 0x16b37, + 0x16b46, 0x16b50, + 0x16b5a, 0x16b5b, + 0x16b62, 0x16b63, + 0x16b78, 0x16b7d, + 0x16b90, 0x16f00, + 0x16f45, 0x16f50, + 0x16f7f, 0x16f93, + 0x16fa0, 0x1b000, + 0x1b002, 0x1bc00, + 0x1bc6b, 0x1bc70, + 0x1bc7d, 0x1bc80, + 0x1bc89, 0x1bc90, + 0x1bc9a, 0x1bc9c, + 0x1bc9d, 0x1bc9f, + 0x1bca0, 0x1d000, + 0x1d0f6, 0x1d100, + 0x1d127, 0x1d129, 0x1d167, 0x1d16a, 0x1d173, 0x1d183, 0x1d185, 0x1d18c, 0x1d1aa, 0x1d1ae, - 0x1d200, 0x1d360, + 0x1d1de, 0x1d360, + 0x1d372, 0x1d400, + 0x1d455, 0x1d456, + 0x1d49d, 0x1d49e, + 0x1d4a0, 0x1d4a2, + 0x1d4a3, 0x1d4a5, + 0x1d4a7, 0x1d4a9, + 0x1d4ad, 0x1d4ae, + 0x1d4ba, 0x1d4bb, + 0x1d4bc, 0x1d4bd, + 0x1d4c4, 0x1d4c5, + 0x1d506, 0x1d507, + 0x1d50b, 0x1d50d, + 0x1d515, 0x1d516, + 0x1d51d, 0x1d51e, + 0x1d53a, 0x1d53b, + 0x1d53f, 0x1d540, + 0x1d545, 0x1d546, + 0x1d547, 0x1d54a, + 0x1d551, 0x1d552, + 0x1d6a6, 0x1d6a8, 0x1d6db, 0x1d6dc, 0x1d715, 0x1d716, 0x1d74f, 0x1d750, 
0x1d789, 0x1d78a, 0x1d7c3, 0x1d7c4, - 0x1d7ce, 0x1ee00, - 0x1eef0, 0x1f110, + 0x1d7cc, 0x1e800, + 0x1e8d0, 0x1e8d7, + 0x1eef0, 0x1eef2, + 0x1f000, 0x1f110, + 0x1f12f, 0x1f130, 0x1f16a, 0x1f170, - 0x1f300, 0x1f48c, - 0x1f48d, 0x1f524, - 0x1f525, 0x20000, - 0xe0001, 0xf0000, + 0x1f19b, 0x1f1e6, + 0x1f203, 0x1f210, + 0x1f23b, 0x1f240, + 0x1f249, 0x1f250, + 0x1f252, 0x20000, + 0x2a6d7, 0x2a700, + 0x2b735, 0x2b740, + 0x2b81e, 0x2f800, + 0x2fa1e, 0xf0000, + 0xffffe, 0x100000, 0x10fffe, 0x10ffff // sentinel }; --- old/jdk/test/java/awt/font/NumericShaper/ShapingTest.java 2015-07-13 16:12:01.000000000 +0900 +++ new/jdk/test/java/awt/font/NumericShaper/ShapingTest.java 2015-07-13 16:12:01.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,7 +23,7 @@ /* * @test - * @bug 6842557 6943963 6959267 + * @bug 6842557 6943963 6959267 8032446 * @summary confirm that shaping works as expected. (Mainly for new characters which were added in Unicode 5 and 6) * used where appropriate. 
*/ @@ -40,6 +40,7 @@ test6842557(); test6943963(); test6903266(); + test8032446(); if (err) { throw new RuntimeException("shape() returned unexpected value."); @@ -138,6 +139,18 @@ checkResult("Range.MEETEI_MAYEK", ns, given, expected); } + private static void test8032446() { + NumericShaper ns = getContextualShaper(EnumSet.of(Range.SINHALA)); + String given = "\u0d85 012"; + String expected = "\u0d85 \u0de6\u0de7\u0de8"; + checkResult("Range.SINHALA", ns, given, expected); + + ns = getContextualShaper(EnumSet.of(Range.MYANMAR_TAI_LAING)); + given = "\ua9e2 012"; + expected = "\ua9e2 \ua9f0\ua9f1\ua9f2"; + checkResult("Range.MYANMAR_TAI_LAING", ns, given, expected); + } + private static void checkResult(String ranges, NumericShaper ns, String given, String expected) { char[] text = given.toCharArray(); --- old/jdk/test/java/lang/Character/CheckProp.java 2015-07-13 16:12:02.000000000 +0900 +++ new/jdk/test/java/lang/Character/CheckProp.java 2015-07-13 16:12:01.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,7 +24,7 @@ /** * @test - * @bug 7037261 7070436 7198195 + * @bug 7037261 7070436 7198195 8032446 * @summary Check j.l.Character.isLowerCase/isUppercase/isAlphabetic/isIdeographic */ --- old/jdk/test/java/lang/Character/CheckScript.java 2015-07-13 16:12:02.000000000 +0900 +++ new/jdk/test/java/lang/Character/CheckScript.java 2015-07-13 16:12:02.000000000 +0900 @@ -1,6 +1,5 @@ - /* - * Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -24,7 +23,7 @@ /** * @test - * @bug 6945564 6959267 7033561 7070436 7198195 + * @bug 6945564 6959267 7033561 7070436 7198195 8032446 * @summary Check that the j.l.Character.UnicodeScript */ --- old/jdk/test/java/lang/Character/PropList.txt 2015-07-13 16:12:03.000000000 +0900 +++ new/jdk/test/java/lang/Character/PropList.txt 2015-07-13 16:12:03.000000000 +0900 @@ -1,8 +1,8 @@ -# PropList-6.2.0.txt -# Date: 2012-05-23, 20:34:59 GMT [MD] +# PropList-7.0.0.txt +# Date: 2014-02-19, 15:51:26 GMT [MD] # # Unicode Character Database -# Copyright (c) 1991-2012 Unicode, Inc. +# Copyright (c) 1991-2014 Unicode, Inc. # For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see http://www.unicode.org/reports/tr44/ @@ -13,7 +13,6 @@ 0085 ; White_Space # Cc <control-0085> 00A0 ; White_Space # Zs NO-BREAK SPACE 1680 ; White_Space # Zs OGHAM SPACE MARK -180E ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR 2000..200A ; White_Space # Zs [11] EN QUAD..HAIR SPACE 2028 ; White_Space # Zl LINE SEPARATOR 2029 ; White_Space # Zp PARAGRAPH SEPARATOR @@ -21,14 +20,16 @@ 205F ; White_Space # Zs MEDIUM MATHEMATICAL SPACE 3000 ; White_Space # Zs IDEOGRAPHIC SPACE -# Total code points: 26 +# Total code points: 25 # ================================================ +061C ; Bidi_Control # Cf ARABIC LETTER MARK 200E..200F ; Bidi_Control # Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK 202A..202E ; Bidi_Control # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE +2066..2069 ; Bidi_Control # Cf [4] LEFT-TO-RIGHT ISOLATE..POP DIRECTIONAL ISOLATE -# Total code points: 7 +# Total code points: 12 # ================================================ @@ -51,6 +52,7 @@ 2E17 ; Dash # Pd DOUBLE OBLIQUE HYPHEN 2E1A ; Dash # Pd HYPHEN WITH DIAERESIS 2E3A..2E3B ; Dash # Pd [2] TWO-EM DASH..THREE-EM DASH +2E40 ; Dash # Pd DOUBLE HYPHEN 301C ; Dash # Pd WAVE DASH 3030 ; Dash # Pd WAVY DASH 30A0 ; Dash # Pd 
KATAKANA-HIRAGANA DOUBLE HYPHEN @@ -59,7 +61,7 @@ FE63 ; Dash # Pd SMALL HYPHEN-MINUS FF0D ; Dash # Pd FULLWIDTH HYPHEN-MINUS -# Total code points: 27 +# Total code points: 28 # ================================================ @@ -91,6 +93,7 @@ 201F ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK 2039 ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK 203A ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +2E42 ; Quotation_Mark # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 300C ; Quotation_Mark # Ps LEFT CORNER BRACKET 300D ; Quotation_Mark # Pe RIGHT CORNER BRACKET 300E ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET @@ -106,7 +109,7 @@ FF62 ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET FF63 ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET -# Total code points: 29 +# Total code points: 30 # ================================================ @@ -136,6 +139,7 @@ 1361..1368 ; Terminal_Punctuation # Po [8] ETHIOPIC WORDSPACE..ETHIOPIC PARAGRAPH SEPARATOR 166D..166E ; Terminal_Punctuation # Po [2] CANADIAN SYLLABICS CHI SIGN..CANADIAN SYLLABICS FULL STOP 16EB..16ED ; Terminal_Punctuation # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION +1735..1736 ; Terminal_Punctuation # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION 17D4..17D6 ; Terminal_Punctuation # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH 17DA ; Terminal_Punctuation # Po KHMER SIGN KOOMUUT 1802..1805 ; Terminal_Punctuation # Po [4] MONGOLIAN COMMA..MONGOLIAN FOUR DOTS @@ -149,6 +153,8 @@ 203C..203D ; Terminal_Punctuation # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG 2047..2049 ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK 2E2E ; Terminal_Punctuation # Po REVERSED QUESTION MARK +2E3C ; Terminal_Punctuation # Po STENOGRAPHIC FULL STOP +2E41 ; Terminal_Punctuation # Po REVERSED COMMA 3001..3002 ; Terminal_Punctuation # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP A4FE..A4FF ; Terminal_Punctuation 
# Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP A60D..A60F ; Terminal_Punctuation # Po [3] VAI COMMA..VAI QUESTION MARK @@ -174,14 +180,27 @@ 103D0 ; Terminal_Punctuation # Po OLD PERSIAN WORD DIVIDER 10857 ; Terminal_Punctuation # Po IMPERIAL ARAMAIC SECTION SIGN 1091F ; Terminal_Punctuation # Po PHOENICIAN WORD SEPARATOR +10A56..10A57 ; Terminal_Punctuation # Po [2] KHAROSHTHI PUNCTUATION DANDA..KHAROSHTHI PUNCTUATION DOUBLE DANDA +10AF0..10AF5 ; Terminal_Punctuation # Po [6] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION TWO DOTS 10B3A..10B3F ; Terminal_Punctuation # Po [6] TINY TWO DOTS OVER ONE DOT PUNCTUATION..LARGE ONE RING OVER TWO RINGS PUNCTUATION +10B99..10B9C ; Terminal_Punctuation # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT 11047..1104D ; Terminal_Punctuation # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 110BE..110C1 ; Terminal_Punctuation # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 11141..11143 ; Terminal_Punctuation # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK 111C5..111C6 ; Terminal_Punctuation # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA -12470..12473 ; Terminal_Punctuation # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON +111CD ; Terminal_Punctuation # Po SHARADA SUTRA MARK +11238..1123C ; Terminal_Punctuation # Po [5] KHOJKI DANDA..KHOJKI DOUBLE SECTION MARK +115C2..115C5 ; Terminal_Punctuation # Po [4] SIDDHAM DANDA..SIDDHAM SEPARATOR BAR +115C9 ; Terminal_Punctuation # Po SIDDHAM END OF TEXT MARK +11641..11642 ; Terminal_Punctuation # Po [2] MODI DANDA..MODI DOUBLE DANDA +12470..12474 ; Terminal_Punctuation # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON +16A6E..16A6F ; Terminal_Punctuation # Po [2] MRO DANDA..MRO DOUBLE DANDA +16AF5 ; Terminal_Punctuation # Po BASSA VAH FULL STOP +16B37..16B39 ; Terminal_Punctuation # Po [3] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN 
CIM CHEEM +16B44 ; Terminal_Punctuation # Po PAHAWH HMONG SIGN XAUS +1BC9F ; Terminal_Punctuation # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP -# Total code points: 176 +# Total code points: 214 # ================================================ @@ -230,6 +249,10 @@ 21D5..21DB ; Other_Math # So [7] UP DOWN DOUBLE ARROW..RIGHTWARDS TRIPLE ARROW 21DD ; Other_Math # So RIGHTWARDS SQUIGGLE ARROW 21E4..21E5 ; Other_Math # So [2] LEFTWARDS ARROW TO BAR..RIGHTWARDS ARROW TO BAR +2308 ; Other_Math # Ps LEFT CEILING +2309 ; Other_Math # Pe RIGHT CEILING +230A ; Other_Math # Ps LEFT FLOOR +230B ; Other_Math # Pe RIGHT FLOOR 23B4..23B5 ; Other_Math # So [2] TOP SQUARE BRACKET..BOTTOM SQUARE BRACKET 23B7 ; Other_Math # So RADICAL SYMBOL BOTTOM 23D0 ; Other_Math # So VERTICAL LINE EXTENSION @@ -358,7 +381,7 @@ 1EEA5..1EEA9 ; Other_Math # Lo [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH 1EEAB..1EEBB ; Other_Math # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN -# Total code points: 1358 +# Total code points: 1362 # ================================================ @@ -403,8 +426,7 @@ 0825..0827 ; Other_Alphabetic # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U 0829..082C ; Other_Alphabetic # Mn [4] SAMARITAN VOWEL SIGN LONG I..SAMARITAN VOWEL SIGN SUKUN 08E4..08E9 ; Other_Alphabetic # Mn [6] ARABIC CURLY FATHA..ARABIC CURLY KASRATAN -08F0..08FE ; Other_Alphabetic # Mn [15] ARABIC OPEN FATHATAN..ARABIC DAMMA WITH DOT -0900..0902 ; Other_Alphabetic # Mn [3] DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANAGARI SIGN ANUSVARA +08F0..0902 ; Other_Alphabetic # Mn [19] ARABIC OPEN FATHATAN..DEVANAGARI SIGN ANUSVARA 0903 ; Other_Alphabetic # Mc DEVANAGARI SIGN VISARGA 093A ; Other_Alphabetic # Mn DEVANAGARI VOWEL SIGN OE 093B ; Other_Alphabetic # Mc DEVANAGARI VOWEL SIGN OOE @@ -457,6 +479,7 @@ 0BC6..0BC8 ; Other_Alphabetic # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI 0BCA..0BCC ; Other_Alphabetic # Mc 
[3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU 0BD7 ; Other_Alphabetic # Mc TAMIL AU LENGTH MARK +0C00 ; Other_Alphabetic # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; Other_Alphabetic # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C3E..0C40 ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; Other_Alphabetic # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR @@ -464,6 +487,7 @@ 0C4A..0C4C ; Other_Alphabetic # Mn [3] TELUGU VOWEL SIGN O..TELUGU VOWEL SIGN AU 0C55..0C56 ; Other_Alphabetic # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK 0C62..0C63 ; Other_Alphabetic # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL +0C81 ; Other_Alphabetic # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; Other_Alphabetic # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0CBE ; Other_Alphabetic # Mc KANNADA VOWEL SIGN AA 0CBF ; Other_Alphabetic # Mn KANNADA VOWEL SIGN I @@ -474,6 +498,7 @@ 0CCC ; Other_Alphabetic # Mn KANNADA VOWEL SIGN AU 0CD5..0CD6 ; Other_Alphabetic # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK 0CE2..0CE3 ; Other_Alphabetic # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL +0D01 ; Other_Alphabetic # Mn MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; Other_Alphabetic # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D3E..0D40 ; Other_Alphabetic # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II 0D41..0D44 ; Other_Alphabetic # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR @@ -538,7 +563,8 @@ 19B0..19C0 ; Other_Alphabetic # Mc [17] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE VOWEL SIGN IY 19C8..19C9 ; Other_Alphabetic # Mc [2] NEW TAI LUE TONE MARK-1..NEW TAI LUE TONE MARK-2 1A17..1A18 ; Other_Alphabetic # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U -1A19..1A1B ; Other_Alphabetic # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE +1A19..1A1A ; Other_Alphabetic # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O +1A1B ; 
Other_Alphabetic # Mn BUGINESE VOWEL SIGN AE 1A55 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN MEDIAL RA 1A56 ; Other_Alphabetic # Mn TAI THAM CONSONANT SIGN MEDIAL LA 1A57 ; Other_Alphabetic # Mc TAI THAM CONSONANT SIGN LA TANG LAI @@ -564,7 +590,7 @@ 1BA2..1BA5 ; Other_Alphabetic # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU 1BA6..1BA7 ; Other_Alphabetic # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; Other_Alphabetic # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG -1BAC..1BAD ; Other_Alphabetic # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA +1BAC..1BAD ; Other_Alphabetic # Mn [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BE7 ; Other_Alphabetic # Mc BATAK VOWEL SIGN E 1BE8..1BE9 ; Other_Alphabetic # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE 1BEA..1BEC ; Other_Alphabetic # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O @@ -575,6 +601,7 @@ 1C2C..1C33 ; Other_Alphabetic # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T 1C34..1C35 ; Other_Alphabetic # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG 1CF2..1CF3 ; Other_Alphabetic # Mc [2] VEDIC SIGN ARDHAVISARGA..VEDIC SIGN ROTATED ARDHAVISARGA +1DE7..1DF4 ; Other_Alphabetic # Mn [14] COMBINING LATIN SMALL LETTER ALPHA..COMBINING LATIN SMALL LETTER U WITH DIAERESIS 24B6..24E9 ; Other_Alphabetic # So [52] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN SMALL LETTER Z 2DE0..2DFF ; Other_Alphabetic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS A674..A67B ; Other_Alphabetic # Mn [8] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC LETTER OMEGA @@ -616,6 +643,7 @@ ABE8 ; Other_Alphabetic # Mn MEETEI MAYEK VOWEL SIGN UNAP ABE9..ABEA ; Other_Alphabetic # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG FB1E ; Other_Alphabetic # Mn HEBREW POINT JUDEO-SPANISH VARIKA 
+10376..1037A ; Other_Alphabetic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII 10A01..10A03 ; Other_Alphabetic # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R 10A05..10A06 ; Other_Alphabetic # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O 10A0C..10A0F ; Other_Alphabetic # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA @@ -636,14 +664,54 @@ 111B3..111B5 ; Other_Alphabetic # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II 111B6..111BE ; Other_Alphabetic # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O 111BF ; Other_Alphabetic # Mc SHARADA VOWEL SIGN AU +1122C..1122E ; Other_Alphabetic # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II +1122F..11231 ; Other_Alphabetic # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI +11232..11233 ; Other_Alphabetic # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU +11234 ; Other_Alphabetic # Mn KHOJKI SIGN ANUSVARA +11237 ; Other_Alphabetic # Mn KHOJKI SIGN SHADDA +112DF ; Other_Alphabetic # Mn KHUDAWADI SIGN ANUSVARA +112E0..112E2 ; Other_Alphabetic # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II +112E3..112E8 ; Other_Alphabetic # Mn [6] KHUDAWADI VOWEL SIGN U..KHUDAWADI VOWEL SIGN AU +11301 ; Other_Alphabetic # Mn GRANTHA SIGN CANDRABINDU +11302..11303 ; Other_Alphabetic # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA +1133E..1133F ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I +11340 ; Other_Alphabetic # Mn GRANTHA VOWEL SIGN II +11341..11344 ; Other_Alphabetic # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR +11347..11348 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI +1134B..1134C ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN OO..GRANTHA VOWEL SIGN AU +11357 ; Other_Alphabetic # Mc GRANTHA AU LENGTH MARK +11362..11363 ; Other_Alphabetic # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL +114B0..114B2 ; Other_Alphabetic # Mc [3] TIRHUTA VOWEL SIGN 
AA..TIRHUTA VOWEL SIGN II +114B3..114B8 ; Other_Alphabetic # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL +114B9 ; Other_Alphabetic # Mc TIRHUTA VOWEL SIGN E +114BA ; Other_Alphabetic # Mn TIRHUTA VOWEL SIGN SHORT E +114BB..114BE ; Other_Alphabetic # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU +114BF..114C0 ; Other_Alphabetic # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA +114C1 ; Other_Alphabetic # Mc TIRHUTA SIGN VISARGA +115AF..115B1 ; Other_Alphabetic # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II +115B2..115B5 ; Other_Alphabetic # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR +115B8..115BB ; Other_Alphabetic # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU +115BC..115BD ; Other_Alphabetic # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA +115BE ; Other_Alphabetic # Mc SIDDHAM SIGN VISARGA +11630..11632 ; Other_Alphabetic # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II +11633..1163A ; Other_Alphabetic # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI +1163B..1163C ; Other_Alphabetic # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU +1163D ; Other_Alphabetic # Mn MODI SIGN ANUSVARA +1163E ; Other_Alphabetic # Mc MODI SIGN VISARGA +11640 ; Other_Alphabetic # Mn MODI SIGN ARDHACANDRA 116AB ; Other_Alphabetic # Mn TAKRI SIGN ANUSVARA 116AC ; Other_Alphabetic # Mc TAKRI SIGN VISARGA 116AD ; Other_Alphabetic # Mn TAKRI VOWEL SIGN AA 116AE..116AF ; Other_Alphabetic # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II 116B0..116B5 ; Other_Alphabetic # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU +16B30..16B36 ; Other_Alphabetic # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM 16F51..16F7E ; Other_Alphabetic # Mc [46] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN NG +1BC9E ; Other_Alphabetic # Mn DUPLOYAN DOUBLE MARK +1F130..1F149 ; Other_Alphabetic # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z +1F150..1F169 ; Other_Alphabetic # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER 
A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z +1F170..1F189 ; Other_Alphabetic # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 922 +# Total code points: 1116 # ================================================ @@ -746,6 +814,7 @@ 1939..193B ; Diacritic # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I 1A75..1A7C ; Diacritic # Mn [8] TAI THAM SIGN TONE-1..TAI THAM SIGN KHUEN-LUE KARAN 1A7F ; Diacritic # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT +1AB0..1ABD ; Diacritic # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW 1B34 ; Diacritic # Mn BALINESE SIGN REREKAN 1B44 ; Diacritic # Mc BALINESE ADEG ADEG 1B6B..1B73 ; Diacritic # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG @@ -760,8 +829,10 @@ 1CE2..1CE8 ; Diacritic # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Diacritic # Mn VEDIC SIGN TIRYAK 1CF4 ; Diacritic # Mn VEDIC TONE CANDRA ABOVE +1CF8..1CF9 ; Diacritic # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE 1D2C..1D6A ; Diacritic # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI 1DC4..1DCF ; Diacritic # Mn [12] COMBINING MACRON-ACUTE..COMBINING ZIGZAG BELOW +1DF5 ; Diacritic # Mn COMBINING UP TACK ABOVE 1DFD..1DFF ; Diacritic # Mn [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 1FBD ; Diacritic # Sk GREEK KORONIS 1FBF..1FC1 ; Diacritic # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI @@ -779,6 +850,7 @@ A66F ; Diacritic # Mn COMBINING CYRILLIC VZMET A67C..A67D ; Diacritic # Mn [2] COMBINING CYRILLIC KAVYKA..COMBINING CYRILLIC PAYEROK A67F ; Diacritic # Lm CYRILLIC PAYEROK +A69C..A69D ; Diacritic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A6F0..A6F1 ; Diacritic # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS A717..A71F ; Diacritic # Lm [9] MODIFIER LETTER DOT VERTICAL 
BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK A720..A721 ; Diacritic # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE @@ -791,26 +863,45 @@ A953 ; Diacritic # Mc REJANG VIRAMA A9B3 ; Diacritic # Mn JAVANESE SIGN CECAK TELU A9C0 ; Diacritic # Mc JAVANESE PANGKON +A9E5 ; Diacritic # Mn MYANMAR SIGN SHAN SAW AA7B ; Diacritic # Mc MYANMAR SIGN PAO KAREN TONE +AA7C ; Diacritic # Mn MYANMAR SIGN TAI LAING TONE-2 +AA7D ; Diacritic # Mc MYANMAR SIGN TAI LAING TONE-5 AABF ; Diacritic # Mn TAI VIET TONE MAI EK AAC0 ; Diacritic # Lo TAI VIET TONE MAI NUENG AAC1 ; Diacritic # Mn TAI VIET TONE MAI THO AAC2 ; Diacritic # Lo TAI VIET TONE MAI SONG AAF6 ; Diacritic # Mn MEETEI MAYEK VIRAMA +AB5B ; Diacritic # Sk MODIFIER BREVE WITH INVERTED BREVE +AB5C..AB5F ; Diacritic # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK ABEC ; Diacritic # Mc MEETEI MAYEK LUM IYEK ABED ; Diacritic # Mn MEETEI MAYEK APUN IYEK FB1E ; Diacritic # Mn HEBREW POINT JUDEO-SPANISH VARIKA -FE20..FE26 ; Diacritic # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON +FE20..FE2D ; Diacritic # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW FF3E ; Diacritic # Sk FULLWIDTH CIRCUMFLEX ACCENT FF40 ; Diacritic # Sk FULLWIDTH GRAVE ACCENT FF70 ; Diacritic # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK FF9E..FF9F ; Diacritic # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK FFE3 ; Diacritic # Sk FULLWIDTH MACRON +102E0 ; Diacritic # Mn COPTIC EPACT THOUSANDS MARK +10AE5..10AE6 ; Diacritic # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW 110B9..110BA ; Diacritic # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA 11133..11134 ; Diacritic # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA +11173 ; Diacritic # Mn MAHAJANI SIGN NUKTA 111C0 ; Diacritic # Mc SHARADA SIGN VIRAMA +11235 ; Diacritic # Mc KHOJKI SIGN VIRAMA +11236 ; Diacritic # Mn KHOJKI SIGN NUKTA 
+112E9..112EA ; Diacritic # Mn [2] KHUDAWADI SIGN NUKTA..KHUDAWADI SIGN VIRAMA +1133C ; Diacritic # Mn GRANTHA SIGN NUKTA +1134D ; Diacritic # Mc GRANTHA SIGN VIRAMA +11366..1136C ; Diacritic # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX +11370..11374 ; Diacritic # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA +114C2..114C3 ; Diacritic # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA +115BF..115C0 ; Diacritic # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA +1163F ; Diacritic # Mn MODI SIGN VIRAMA 116B6 ; Diacritic # Mc TAKRI SIGN VIRAMA 116B7 ; Diacritic # Mn TAKRI SIGN NUKTA +16AF0..16AF4 ; Diacritic # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE 16F8F..16F92 ; Diacritic # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW 16F93..16F9F ; Diacritic # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 1D167..1D169 ; Diacritic # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 @@ -818,8 +909,9 @@ 1D17B..1D182 ; Diacritic # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Diacritic # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Diacritic # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO +1E8D0..1E8D6 ; Diacritic # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS -# Total code points: 693 +# Total code points: 766 # ================================================ @@ -841,12 +933,16 @@ A015 ; Extender # Lm YI SYLLABLE WU A60C ; Extender # Lm VAI SYLLABLE LENGTHENER A9CF ; Extender # Lm JAVANESE PANGRANGKEP +A9E6 ; Extender # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION AA70 ; Extender # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AADD ; Extender # Lm TAI VIET SYMBOL SAM AAF3..AAF4 ; Extender # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK FF70 ; Extender # Lm HALFWIDTH KATAKANA-HIRAGANA 
PROLONGED SOUND MARK +1135D ; Extender # Lo GRANTHA SIGN PLUTA +115C6..115C8 ; Extender # Po [3] SIDDHAM REPETITION MARK-1..SIDDHAM REPETITION MARK-3 +16B42..16B43 ; Extender # Lm [2] PAHAWH HMONG SIGN VOS NRUA..PAHAWH HMONG SIGN IB YAM -# Total code points: 31 +# Total code points: 38 # ================================================ @@ -866,17 +962,22 @@ 2170..217F ; Other_Lowercase # Nl [16] SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND 24D0..24E9 ; Other_Lowercase # So [26] CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z 2C7C..2C7D ; Other_Lowercase # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V +A69C..A69D ; Other_Lowercase # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A770 ; Other_Lowercase # Lm MODIFIER LETTER US A7F8..A7F9 ; Other_Lowercase # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE +AB5C..AB5F ; Other_Lowercase # Lm [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK -# Total code points: 183 +# Total code points: 189 # ================================================ 2160..216F ; Other_Uppercase # Nl [16] ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND 24B6..24CF ; Other_Uppercase # So [26] CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z +1F130..1F149 ; Other_Uppercase # So [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z +1F150..1F169 ; Other_Uppercase # So [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z +1F170..1F189 ; Other_Uppercase # So [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z -# Total code points: 42 +# Total code points: 120 # ================================================ @@ -918,10 +1019,15 @@ 200C..200D ; Other_Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 302E..302F ; Other_Grapheme_Extend # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK FF9E..FF9F ; 
Other_Grapheme_Extend # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +1133E ; Other_Grapheme_Extend # Mc GRANTHA VOWEL SIGN AA +11357 ; Other_Grapheme_Extend # Mc GRANTHA AU LENGTH MARK +114B0 ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN AA +114BD ; Other_Grapheme_Extend # Mc TIRHUTA VOWEL SIGN SHORT O +115AF ; Other_Grapheme_Extend # Mc SIDDHAM VOWEL SIGN AA 1D165 ; Other_Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM 1D16E..1D172 ; Other_Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5 -# Total code points: 25 +# Total code points: 30 # ================================================ @@ -966,7 +1072,7 @@ 034F ; Other_Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER 115F..1160 ; Other_Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER 17B4..17B5 ; Other_Default_Ignorable_Code_Point # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA -2065..2069 ; Other_Default_Ignorable_Code_Point # Cn [5] <reserved-2065>..<reserved-2069> +2065 ; Other_Default_Ignorable_Code_Point # Cn <reserved-2065> 3164 ; Other_Default_Ignorable_Code_Point # Lo HANGUL FILLER FFA0 ; Other_Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER FFF0..FFF8 ; Other_Default_Ignorable_Code_Point # Cn [9] <reserved-FFF0>..<reserved-FFF8> @@ -975,7 +1081,7 @@ E0080..E00FF ; Other_Default_Ignorable_Code_Point # Cn [128] <reserved-E0080>..<reserved-E00FF> E01F0..E0FFF ; Other_Default_Ignorable_Code_Point # Cn [3600] <reserved-E01F0>..<reserved-E0FFF> -# Total code points: 3780 +# Total code points: 3776 # ================================================ @@ -1060,8 +1166,6 @@ 0021 ; STerm # Po EXCLAMATION MARK 002E ; STerm # Po FULL STOP 003F ; STerm # Po QUESTION MARK -055C ; STerm # Po ARMENIAN EXCLAMATION MARK -055E ; STerm # Po ARMENIAN QUESTION MARK 0589 ; STerm # Po ARMENIAN FULL STOP 061F ; STerm # Po ARABIC QUESTION MARK 06D4 ; STerm # Po ARABIC FULL STOP 
@@ -1084,6 +1188,7 @@ 203C..203D ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG 2047..2049 ; STerm # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK 2E2E ; STerm # Po REVERSED QUESTION MARK +2E3C ; STerm # Po STENOGRAPHIC FULL STOP 3002 ; STerm # Po IDEOGRAPHIC FULL STOP A4FF ; STerm # Po LISU PUNCTUATION FULL STOP A60E..A60F ; STerm # Po [2] VAI FULL STOP..VAI QUESTION MARK @@ -1107,8 +1212,19 @@ 110BE..110C1 ; STerm # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA 11141..11143 ; STerm # Po [3] CHAKMA DANDA..CHAKMA QUESTION MARK 111C5..111C6 ; STerm # Po [2] SHARADA DANDA..SHARADA DOUBLE DANDA +111CD ; STerm # Po SHARADA SUTRA MARK +11238..11239 ; STerm # Po [2] KHOJKI DANDA..KHOJKI DOUBLE DANDA +1123B..1123C ; STerm # Po [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK +115C2..115C3 ; STerm # Po [2] SIDDHAM DANDA..SIDDHAM DOUBLE DANDA +115C9 ; STerm # Po SIDDHAM END OF TEXT MARK +11641..11642 ; STerm # Po [2] MODI DANDA..MODI DOUBLE DANDA +16A6E..16A6F ; STerm # Po [2] MRO DANDA..MRO DOUBLE DANDA +16AF5 ; STerm # Po BASSA VAH FULL STOP +16B37..16B38 ; STerm # Po [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB +16B44 ; STerm # Po PAHAWH HMONG SIGN XAUS +1BC9F ; STerm # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP -# Total code points: 83 +# Total code points: 99 # ================================================ @@ -1210,7 +1326,10 @@ 21D5..21F3 ; Pattern_Syntax # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 21F4..22FF ; Pattern_Syntax # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP 2300..2307 ; Pattern_Syntax # So [8] DIAMETER SIGN..WAVY LINE -2308..230B ; Pattern_Syntax # Sm [4] LEFT CEILING..RIGHT FLOOR +2308 ; Pattern_Syntax # Ps LEFT CEILING +2309 ; Pattern_Syntax # Pe RIGHT CEILING +230A ; Pattern_Syntax # Ps LEFT FLOOR +230B ; Pattern_Syntax # Pe RIGHT FLOOR 230C..231F ; Pattern_Syntax # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2320..2321 ; Pattern_Syntax # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF 
INTEGRAL 2322..2328 ; Pattern_Syntax # So [7] FROWN..KEYBOARD @@ -1222,8 +1341,8 @@ 239B..23B3 ; Pattern_Syntax # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23DB ; Pattern_Syntax # So [40] TOP SQUARE BRACKET..FUSE 23DC..23E1 ; Pattern_Syntax # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET -23E2..23F3 ; Pattern_Syntax # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND -23F4..23FF ; Pattern_Syntax # Cn [12] <reserved-23F4>..<reserved-23FF> +23E2..23FA ; Pattern_Syntax # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD +23FB..23FF ; Pattern_Syntax # Cn [5] <reserved-23FB>..<reserved-23FF> 2400..2426 ; Pattern_Syntax # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO 2427..243F ; Pattern_Syntax # Cn [25] <reserved-2427>..<reserved-243F> 2440..244A ; Pattern_Syntax # So [11] OCR HOOK..OCR DOUBLE BACKSLASH @@ -1236,9 +1355,7 @@ 25F8..25FF ; Pattern_Syntax # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2600..266E ; Pattern_Syntax # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 266F ; Pattern_Syntax # Sm MUSIC SHARP SIGN -2670..26FF ; Pattern_Syntax # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE -2700 ; Pattern_Syntax # Cn <reserved-2700> -2701..2767 ; Pattern_Syntax # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET +2670..2767 ; Pattern_Syntax # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET 2768 ; Pattern_Syntax # Ps MEDIUM LEFT PARENTHESIS ORNAMENT 2769 ; Pattern_Syntax # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT 276A ; Pattern_Syntax # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT @@ -1306,9 +1423,16 @@ 2B30..2B44 ; Pattern_Syntax # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B45..2B46 ; Pattern_Syntax # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Pattern_Syntax # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR -2B4D..2B4F ; Pattern_Syntax # Cn [3] 
<reserved-2B4D>..<reserved-2B4F> -2B50..2B59 ; Pattern_Syntax # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE -2B5A..2BFF ; Pattern_Syntax # Cn [166] <reserved-2B5A>..<reserved-2BFF> +2B4D..2B73 ; Pattern_Syntax # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR +2B74..2B75 ; Pattern_Syntax # Cn [2] <reserved-2B74>..<reserved-2B75> +2B76..2B95 ; Pattern_Syntax # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW +2B96..2B97 ; Pattern_Syntax # Cn [2] <reserved-2B96>..<reserved-2B97> +2B98..2BB9 ; Pattern_Syntax # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX +2BBA..2BBC ; Pattern_Syntax # Cn [3] <reserved-2BBA>..<reserved-2BBC> +2BBD..2BC8 ; Pattern_Syntax # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED +2BC9 ; Pattern_Syntax # Cn <reserved-2BC9> +2BCA..2BD1 ; Pattern_Syntax # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN +2BD2..2BFF ; Pattern_Syntax # Cn [46] <reserved-2BD2>..<reserved-2BFF> 2E00..2E01 ; Pattern_Syntax # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Pattern_Syntax # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Pattern_Syntax # Pf RIGHT SUBSTITUTION BRACKET @@ -1342,7 +1466,11 @@ 2E2F ; Pattern_Syntax # Lm VERTICAL TILDE 2E30..2E39 ; Pattern_Syntax # Po [10] RING POINT..TOP HALF SECTION SIGN 2E3A..2E3B ; Pattern_Syntax # Pd [2] TWO-EM DASH..THREE-EM DASH -2E3C..2E7F ; Pattern_Syntax # Cn [68] <reserved-2E3C>..<reserved-2E7F> +2E3C..2E3F ; Pattern_Syntax # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM +2E40 ; Pattern_Syntax # Pd DOUBLE HYPHEN +2E41 ; Pattern_Syntax # Po REVERSED COMMA +2E42 ; Pattern_Syntax # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK +2E43..2E7F ; Pattern_Syntax # Cn [61] <reserved-2E43>..<reserved-2E7F> 3001..3003 ; Pattern_Syntax # Po [3] IDEOGRAPHIC COMMA..DITTO MARK 3008 ; Pattern_Syntax # Ps LEFT ANGLE BRACKET 3009 ; Pattern_Syntax # Pe RIGHT ANGLE BRACKET @@ 
-1368,8 +1496,8 @@ 301E..301F ; Pattern_Syntax # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK 3020 ; Pattern_Syntax # So POSTAL MARK FACE 3030 ; Pattern_Syntax # Pd WAVY DASH -FD3E ; Pattern_Syntax # Ps ORNATE LEFT PARENTHESIS -FD3F ; Pattern_Syntax # Pe ORNATE RIGHT PARENTHESIS +FD3E ; Pattern_Syntax # Pe ORNATE LEFT PARENTHESIS +FD3F ; Pattern_Syntax # Ps ORNATE RIGHT PARENTHESIS FE45..FE46 ; Pattern_Syntax # Po [2] SESAME DOT..WHITE SESAME DOT # Total code points: 2760 --- old/jdk/test/java/lang/Character/PropertyValueAliases.txt 2015-07-13 16:12:04.000000000 +0900 +++ new/jdk/test/java/lang/Character/PropertyValueAliases.txt 2015-07-13 16:12:04.000000000 +0900 @@ -1,8 +1,8 @@ -# PropertyValueAliases-6.2.0.txt -# Date: 2012-08-14, 16:05:11 GMT [MD] +# PropertyValueAliases-7.0.0.txt +# Date: 2014-05-14, 23:55:16 GMT [MD] # # Unicode Character Database -# Copyright (c) 1991-2012 Unicode, Inc. +# Copyright (c) 1991-2014 Unicode, Inc. # For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see http://www.unicode.org/reports/tr44/ # @@ -32,13 +32,14 @@ # # Loose matching should be applied to all property names and property values, with # the exception of String Property values. With loose matching of property names and -# values, the case distinctions, whitespace, and '_' are ignored. For Numeric Property -# values, numeric equivalencies are applied: thus "01.00" is equivalent to "1". +# values, the case distinctions, whitespace, hyphens, and '_' are ignored. +# For Numeric Property values, numeric equivalence is applied: thus "01.00" +# is equivalent to "1". # # NOTE: Property value names are NOT unique across properties. For example: # # AL means Arabic Letter for the Bidi_Class property, and -# AL means Above_Left for the Combining_Class property, and +# AL means Above_Left for the Canonical_Combining_Class property, and # AL means Alphabetic for the Line_Break property. 
# # In addition, some property names may be the same as some property value names. @@ -74,6 +75,8 @@ age; 6.0 ; V6_0 age; 6.1 ; V6_1 age; 6.2 ; V6_2 +age; 6.3 ; V6_3 +age; 7.0 ; V7_0 age; NA ; Unassigned # Alphabetic (Alpha) @@ -91,14 +94,18 @@ bc ; EN ; European_Number bc ; ES ; European_Separator bc ; ET ; European_Terminator +bc ; FSI ; First_Strong_Isolate bc ; L ; Left_To_Right bc ; LRE ; Left_To_Right_Embedding +bc ; LRI ; Left_To_Right_Isolate bc ; LRO ; Left_To_Right_Override bc ; NSM ; Nonspacing_Mark bc ; ON ; Other_Neutral bc ; PDF ; Pop_Directional_Format +bc ; PDI ; Pop_Directional_Isolate bc ; R ; Right_To_Left bc ; RLE ; Right_To_Left_Embedding +bc ; RLI ; Right_To_Left_Isolate bc ; RLO ; Right_To_Left_Override bc ; S ; Segment_Separator bc ; WS ; White_Space @@ -117,6 +124,17 @@ # @missing: 0000..10FFFF; Bidi_Mirroring_Glyph; <none> +# Bidi_Paired_Bracket (bpb) + +# @missing: 0000..10FFFF; Bidi_Paired_Bracket; <none> + +# Bidi_Paired_Bracket_Type (bpt) + +bpt; c ; Close +bpt; n ; None +bpt; o ; Open +# @missing: 0000..10FFFF; Bidi_Paired_Bracket_Type; n + # Block (blk) blk; Aegean_Numbers ; Aegean_Numbers @@ -138,6 +156,7 @@ blk; Balinese ; Balinese blk; Bamum ; Bamum blk; Bamum_Sup ; Bamum_Supplement +blk; Bassa_Vah ; Bassa_Vah blk; Batak ; Batak blk; Bengali ; Bengali blk; Block_Elements ; Block_Elements @@ -150,6 +169,7 @@ blk; Buhid ; Buhid blk; Byzantine_Music ; Byzantine_Musical_Symbols blk; Carian ; Carian +blk; Caucasian_Albanian ; Caucasian_Albanian blk; Chakma ; Chakma blk; Cham ; Cham blk; Cherokee ; Cherokee @@ -168,6 +188,7 @@ blk; Compat_Jamo ; Hangul_Compatibility_Jamo blk; Control_Pictures ; Control_Pictures blk; Coptic ; Coptic +blk; Coptic_Epact_Numbers ; Coptic_Epact_Numbers blk; Counting_Rod ; Counting_Rod_Numerals blk; Cuneiform ; Cuneiform blk; Cuneiform_Numbers ; Cuneiform_Numbers_And_Punctuation @@ -181,11 +202,14 @@ blk; Devanagari ; Devanagari blk; Devanagari_Ext ; Devanagari_Extended blk; Diacriticals ; 
Combining_Diacritical_Marks +blk; Diacriticals_Ext ; Combining_Diacritical_Marks_Extended blk; Diacriticals_For_Symbols ; Combining_Diacritical_Marks_For_Symbols; Combining_Marks_For_Symbols blk; Diacriticals_Sup ; Combining_Diacritical_Marks_Supplement blk; Dingbats ; Dingbats blk; Domino ; Domino_Tiles +blk; Duployan ; Duployan blk; Egyptian_Hieroglyphs ; Egyptian_Hieroglyphs +blk; Elbasan ; Elbasan blk; Emoticons ; Emoticons blk; Enclosed_Alphanum ; Enclosed_Alphanumerics blk; Enclosed_Alphanum_Sup ; Enclosed_Alphanumeric_Supplement @@ -196,10 +220,12 @@ blk; Ethiopic_Ext_A ; Ethiopic_Extended_A blk; Ethiopic_Sup ; Ethiopic_Supplement blk; Geometric_Shapes ; Geometric_Shapes +blk; Geometric_Shapes_Ext ; Geometric_Shapes_Extended blk; Georgian ; Georgian blk; Georgian_Sup ; Georgian_Supplement blk; Glagolitic ; Glagolitic blk; Gothic ; Gothic +blk; Grantha ; Grantha blk; Greek ; Greek_And_Coptic blk; Greek_Ext ; Greek_Extended blk; Gujarati ; Gujarati @@ -233,6 +259,8 @@ blk; Kharoshthi ; Kharoshthi blk; Khmer ; Khmer blk; Khmer_Symbols ; Khmer_Symbols +blk; Khojki ; Khojki +blk; Khudawadi ; Khudawadi blk; Lao ; Lao blk; Latin_1_Sup ; Latin_1_Supplement ; Latin_1 blk; Latin_Ext_A ; Latin_Extended_A @@ -240,22 +268,27 @@ blk; Latin_Ext_B ; Latin_Extended_B blk; Latin_Ext_C ; Latin_Extended_C blk; Latin_Ext_D ; Latin_Extended_D +blk; Latin_Ext_E ; Latin_Extended_E blk; Lepcha ; Lepcha blk; Letterlike_Symbols ; Letterlike_Symbols blk; Limbu ; Limbu +blk; Linear_A ; Linear_A blk; Linear_B_Ideograms ; Linear_B_Ideograms blk; Linear_B_Syllabary ; Linear_B_Syllabary blk; Lisu ; Lisu blk; Low_Surrogates ; Low_Surrogates blk; Lycian ; Lycian blk; Lydian ; Lydian +blk; Mahajani ; Mahajani blk; Mahjong ; Mahjong_Tiles blk; Malayalam ; Malayalam blk; Mandaic ; Mandaic +blk; Manichaean ; Manichaean blk; Math_Alphanum ; Mathematical_Alphanumeric_Symbols blk; Math_Operators ; Mathematical_Operators blk; Meetei_Mayek ; Meetei_Mayek blk; Meetei_Mayek_Ext ; Meetei_Mayek_Extensions 
+blk; Mende_Kikakui ; Mende_Kikakui blk; Meroitic_Cursive ; Meroitic_Cursive blk; Meroitic_Hieroglyphs ; Meroitic_Hieroglyphs blk; Miao ; Miao @@ -265,12 +298,16 @@ blk; Misc_Pictographs ; Miscellaneous_Symbols_And_Pictographs blk; Misc_Symbols ; Miscellaneous_Symbols blk; Misc_Technical ; Miscellaneous_Technical +blk; Modi ; Modi blk; Modifier_Letters ; Spacing_Modifier_Letters blk; Modifier_Tone_Letters ; Modifier_Tone_Letters blk; Mongolian ; Mongolian +blk; Mro ; Mro blk; Music ; Musical_Symbols blk; Myanmar ; Myanmar blk; Myanmar_Ext_A ; Myanmar_Extended_A +blk; Myanmar_Ext_B ; Myanmar_Extended_B +blk; Nabataean ; Nabataean blk; NB ; No_Block blk; New_Tai_Lue ; New_Tai_Lue blk; NKo ; NKo @@ -279,17 +316,24 @@ blk; Ogham ; Ogham blk; Ol_Chiki ; Ol_Chiki blk; Old_Italic ; Old_Italic +blk; Old_North_Arabian ; Old_North_Arabian +blk; Old_Permic ; Old_Permic blk; Old_Persian ; Old_Persian blk; Old_South_Arabian ; Old_South_Arabian blk; Old_Turkic ; Old_Turkic blk; Oriya ; Oriya +blk; Ornamental_Dingbats ; Ornamental_Dingbats blk; Osmanya ; Osmanya +blk; Pahawh_Hmong ; Pahawh_Hmong +blk; Palmyrene ; Palmyrene +blk; Pau_Cin_Hau ; Pau_Cin_Hau blk; Phags_Pa ; Phags_Pa blk; Phaistos ; Phaistos_Disc blk; Phoenician ; Phoenician blk; Phonetic_Ext ; Phonetic_Extensions blk; Phonetic_Ext_Sup ; Phonetic_Extensions_Supplement blk; Playing_Cards ; Playing_Cards +blk; Psalter_Pahlavi ; Psalter_Pahlavi blk; PUA ; Private_Use_Area ; Private_Use blk; Punctuation ; General_Punctuation blk; Rejang ; Rejang @@ -299,7 +343,10 @@ blk; Saurashtra ; Saurashtra blk; Sharada ; Sharada blk; Shavian ; Shavian +blk; Shorthand_Format_Controls ; Shorthand_Format_Controls +blk; Siddham ; Siddham blk; Sinhala ; Sinhala +blk; Sinhala_Archaic_Numbers ; Sinhala_Archaic_Numbers blk; Small_Forms ; Small_Form_Variants blk; Sora_Sompeng ; Sora_Sompeng blk; Specials ; Specials @@ -307,6 +354,7 @@ blk; Sundanese_Sup ; Sundanese_Supplement blk; Sup_Arrows_A ; Supplemental_Arrows_A blk; Sup_Arrows_B ; 
Supplemental_Arrows_B +blk; Sup_Arrows_C ; Supplemental_Arrows_C blk; Sup_Math_Operators ; Supplemental_Mathematical_Operators blk; Sup_PUA_A ; Supplementary_Private_Use_Area_A blk; Sup_PUA_B ; Supplementary_Private_Use_Area_B @@ -328,6 +376,7 @@ blk; Thai ; Thai blk; Tibetan ; Tibetan blk; Tifinagh ; Tifinagh +blk; Tirhuta ; Tirhuta blk; Transport_And_Map ; Transport_And_Map_Symbols blk; UCAS ; Unified_Canadian_Aboriginal_Syllabics; Canadian_Syllabics blk; UCAS_Ext ; Unified_Canadian_Aboriginal_Syllabics_Extended @@ -337,6 +386,7 @@ blk; Vertical_Forms ; Vertical_Forms blk; VS ; Variation_Selectors blk; VS_Sup ; Variation_Selectors_Supplement +blk; Warang_Citi ; Warang_Citi blk; Yi_Radicals ; Yi_Radicals blk; Yi_Syllables ; Yi_Syllables blk; Yijing ; Yijing_Hexagram_Symbols @@ -578,6 +628,7 @@ gc ; Zl ; Line_Separator gc ; Zp ; Paragraph_Separator gc ; Zs ; Space_Separator +# @missing: 0000..10FFFF; General_Category; Unassigned # Grapheme_Base (Gr_Base) @@ -662,7 +713,6 @@ InMC; Bottom ; Bottom InMC; Bottom_And_Right ; Bottom_And_Right -InMC; Invisible ; Invisible InMC; Left ; Left InMC; Left_And_Right ; Left_And_Right InMC; NA ; NA @@ -680,17 +730,27 @@ InSC; Avagraha ; Avagraha InSC; Bindu ; Bindu +InSC; Brahmi_Joining_Number ; Brahmi_Joining_Number +InSC; Cantillation_Mark ; Cantillation_Mark InSC; Consonant ; Consonant InSC; Consonant_Dead ; Consonant_Dead InSC; Consonant_Final ; Consonant_Final InSC; Consonant_Head_Letter ; Consonant_Head_Letter InSC; Consonant_Medial ; Consonant_Medial InSC; Consonant_Placeholder ; Consonant_Placeholder -InSC; Consonant_Repha ; Consonant_Repha +InSC; Consonant_Preceding_Repha ; Consonant_Preceding_Repha InSC; Consonant_Subjoined ; Consonant_Subjoined +InSC; Consonant_Succeeding_Repha ; Consonant_Succeeding_Repha +InSC; Gemination_Mark ; Gemination_Mark +InSC; Invisible_Stacker ; Invisible_Stacker +InSC; Joiner ; Joiner InSC; Modifying_Letter ; Modifying_Letter +InSC; Non_Joiner ; Non_Joiner InSC; Nukta ; Nukta +InSC; Number 
; Number +InSC; Number_Joiner ; Number_Joiner InSC; Other ; Other +InSC; Pure_Killer ; Pure_Killer InSC; Register_Shifter ; Register_Shifter InSC; Tone_Letter ; Tone_Letter InSC; Tone_Mark ; Tone_Mark @@ -702,7 +762,6 @@ # Jamo_Short_Name (JSN) -# @missing: 0000..10FFFF; Jamo_Short_Name; <none> JSN; A ; A JSN; AE ; AE JSN; B ; B @@ -755,6 +814,7 @@ JSN; YI ; YI JSN; YO ; YO JSN; YU ; YU +# @missing: 0000..10FFFF; Jamo_Short_Name; <none> # Join_Control (Join_C) @@ -789,6 +849,33 @@ jg ; Knotted_Heh ; Knotted_Heh jg ; Lam ; Lam jg ; Lamadh ; Lamadh +jg ; Manichaean_Aleph ; Manichaean_Aleph +jg ; Manichaean_Ayin ; Manichaean_Ayin +jg ; Manichaean_Beth ; Manichaean_Beth +jg ; Manichaean_Daleth ; Manichaean_Daleth +jg ; Manichaean_Dhamedh ; Manichaean_Dhamedh +jg ; Manichaean_Five ; Manichaean_Five +jg ; Manichaean_Gimel ; Manichaean_Gimel +jg ; Manichaean_Heth ; Manichaean_Heth +jg ; Manichaean_Hundred ; Manichaean_Hundred +jg ; Manichaean_Kaph ; Manichaean_Kaph +jg ; Manichaean_Lamedh ; Manichaean_Lamedh +jg ; Manichaean_Mem ; Manichaean_Mem +jg ; Manichaean_Nun ; Manichaean_Nun +jg ; Manichaean_One ; Manichaean_One +jg ; Manichaean_Pe ; Manichaean_Pe +jg ; Manichaean_Qoph ; Manichaean_Qoph +jg ; Manichaean_Resh ; Manichaean_Resh +jg ; Manichaean_Sadhe ; Manichaean_Sadhe +jg ; Manichaean_Samekh ; Manichaean_Samekh +jg ; Manichaean_Taw ; Manichaean_Taw +jg ; Manichaean_Ten ; Manichaean_Ten +jg ; Manichaean_Teth ; Manichaean_Teth +jg ; Manichaean_Thamedh ; Manichaean_Thamedh +jg ; Manichaean_Twenty ; Manichaean_Twenty +jg ; Manichaean_Waw ; Manichaean_Waw +jg ; Manichaean_Yodh ; Manichaean_Yodh +jg ; Manichaean_Zayin ; Manichaean_Zayin jg ; Meem ; Meem jg ; Mim ; Mim jg ; No_Joining_Group ; No_Joining_Group @@ -806,6 +893,7 @@ jg ; Seen ; Seen jg ; Semkath ; Semkath jg ; Shin ; Shin +jg ; Straight_Waw ; Straight_Waw jg ; Swash_Kaf ; Swash_Kaf jg ; Syriac_Waw ; Syriac_Waw jg ; Tah ; Tah @@ -884,6 +972,10 @@ Lower; N ; No ; F ; False Lower; Y ; Yes ; T ; True +# 
Lowercase_Mapping (lc) + +# @missing: 0000..10FFFF; Lowercase_Mapping; <code point> + # Math (Math) Math; N ; No ; F ; False @@ -1006,12 +1098,14 @@ # Script (sc) +sc ; Aghb ; Caucasian_Albanian sc ; Arab ; Arabic sc ; Armi ; Imperial_Aramaic sc ; Armn ; Armenian sc ; Avst ; Avestan sc ; Bali ; Balinese sc ; Bamu ; Bamum +sc ; Bass ; Bassa_Vah sc ; Batk ; Batak sc ; Beng ; Bengali sc ; Bopo ; Bopomofo @@ -1029,11 +1123,14 @@ sc ; Cyrl ; Cyrillic sc ; Deva ; Devanagari sc ; Dsrt ; Deseret +sc ; Dupl ; Duployan sc ; Egyp ; Egyptian_Hieroglyphs +sc ; Elba ; Elbasan sc ; Ethi ; Ethiopic sc ; Geor ; Georgian sc ; Glag ; Glagolitic sc ; Goth ; Gothic +sc ; Gran ; Grantha sc ; Grek ; Greek sc ; Gujr ; Gujarati sc ; Guru ; Gurmukhi @@ -1042,6 +1139,7 @@ sc ; Hano ; Hanunoo sc ; Hebr ; Hebrew sc ; Hira ; Hiragana +sc ; Hmng ; Pahawh_Hmong sc ; Hrkt ; Katakana_Or_Hiragana sc ; Ital ; Old_Italic sc ; Java ; Javanese @@ -1049,6 +1147,7 @@ sc ; Kana ; Katakana sc ; Khar ; Kharoshthi sc ; Khmr ; Khmer +sc ; Khoj ; Khojki sc ; Knda ; Kannada sc ; Kthi ; Kaithi sc ; Lana ; Tai_Tham @@ -1056,25 +1155,37 @@ sc ; Latn ; Latin sc ; Lepc ; Lepcha sc ; Limb ; Limbu +sc ; Lina ; Linear_A sc ; Linb ; Linear_B sc ; Lisu ; Lisu sc ; Lyci ; Lycian sc ; Lydi ; Lydian +sc ; Mahj ; Mahajani sc ; Mand ; Mandaic +sc ; Mani ; Manichaean +sc ; Mend ; Mende_Kikakui sc ; Merc ; Meroitic_Cursive sc ; Mero ; Meroitic_Hieroglyphs sc ; Mlym ; Malayalam +sc ; Modi ; Modi sc ; Mong ; Mongolian +sc ; Mroo ; Mro sc ; Mtei ; Meetei_Mayek sc ; Mymr ; Myanmar +sc ; Narb ; Old_North_Arabian +sc ; Nbat ; Nabataean sc ; Nkoo ; Nko sc ; Ogam ; Ogham sc ; Olck ; Ol_Chiki sc ; Orkh ; Old_Turkic sc ; Orya ; Oriya sc ; Osma ; Osmanya +sc ; Palm ; Palmyrene +sc ; Pauc ; Pau_Cin_Hau +sc ; Perm ; Old_Permic sc ; Phag ; Phags_Pa sc ; Phli ; Inscriptional_Pahlavi +sc ; Phlp ; Psalter_Pahlavi sc ; Phnx ; Phoenician sc ; Plrd ; Miao sc ; Prti ; Inscriptional_Parthian @@ -1085,6 +1196,8 @@ sc ; Saur ; Saurashtra sc ; Shaw ; 
Shavian sc ; Shrd ; Sharada +sc ; Sidd ; Siddham +sc ; Sind ; Khudawadi sc ; Sinh ; Sinhala sc ; Sora ; Sora_Sompeng sc ; Sund ; Sundanese @@ -1102,8 +1215,10 @@ sc ; Thaa ; Thaana sc ; Thai ; Thai sc ; Tibt ; Tibetan +sc ; Tirh ; Tirhuta sc ; Ugar ; Ugaritic sc ; Vaii ; Vai +sc ; Wara ; Warang_Citi sc ; Xpeo ; Old_Persian sc ; Xsux ; Cuneiform sc ; Yiii ; Yi @@ -1159,6 +1274,10 @@ Term; N ; No ; F ; False Term; Y ; Yes ; T ; True +# Titlecase_Mapping (tc) + +# @missing: 0000..10FFFF; Titlecase_Mapping; <code point> + # Unicode_1_Name (na1) # @missing: 0000..10FFFF; Unicode_1_Name; <none> @@ -1173,6 +1292,10 @@ Upper; N ; No ; F ; False Upper; Y ; Yes ; T ; True +# Uppercase_Mapping (uc) + +# @missing: 0000..10FFFF; Uppercase_Mapping; <code point> + # Variation_Selector (VS) VS ; N ; No ; F ; False @@ -1186,9 +1309,11 @@ # Word_Break (WB) WB ; CR ; CR +WB ; DQ ; Double_Quote WB ; EX ; ExtendNumLet WB ; Extend ; Extend WB ; FO ; Format +WB ; HL ; Hebrew_Letter WB ; KA ; Katakana WB ; LE ; ALetter WB ; LF ; LF @@ -1198,6 +1323,7 @@ WB ; NL ; Newline WB ; NU ; Numeric WB ; RI ; Regional_Indicator +WB ; SQ ; Single_Quote WB ; XX ; Other # XID_Continue (XIDC) --- old/jdk/test/java/lang/Character/Scripts.txt 2015-07-13 16:12:05.000000000 +0900 +++ new/jdk/test/java/lang/Character/Scripts.txt 2015-07-13 16:12:04.000000000 +0900 @@ -1,8 +1,8 @@ -# Scripts-6.2.0.txt -# Date: 2012-06-04, 17:21:29 GMT [MD] +# Scripts-7.0.0.txt +# Date: 2014-05-15, 00:11:35 GMT [MD] # # Unicode Character Database -# Copyright (c) 1991-2012 Unicode, Inc. +# Copyright (c) 1991-2014 Unicode, Inc. 
# For terms of use, see http://www.unicode.org/terms_of_use.html # For documentation, see http://www.unicode.org/reports/tr44/ @@ -83,8 +83,10 @@ 0385 ; Common # Sk GREEK DIALYTIKA TONOS 0387 ; Common # Po GREEK ANO TELEIA 0589 ; Common # Po ARMENIAN FULL STOP +0605 ; Common # Cf ARABIC NUMBER MARK ABOVE 060C ; Common # Po ARABIC COMMA 061B ; Common # Po ARABIC SEMICOLON +061C ; Common # Cf ARABIC LETTER MARK 061F ; Common # Po ARABIC QUESTION MARK 0640 ; Common # Lm ARABIC TATWEEL 0660..0669 ; Common # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE @@ -136,7 +138,7 @@ 2055..205E ; Common # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS 205F ; Common # Zs MEDIUM MATHEMATICAL SPACE 2060..2064 ; Common # Cf [5] WORD JOINER..INVISIBLE PLUS -206A..206F ; Common # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES +2066..206F ; Common # Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES 2070 ; Common # No SUPERSCRIPT ZERO 2074..2079 ; Common # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE 207A..207C ; Common # Sm [3] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT EQUALS SIGN @@ -146,7 +148,7 @@ 208A..208C ; Common # Sm [3] SUBSCRIPT PLUS SIGN..SUBSCRIPT EQUALS SIGN 208D ; Common # Ps SUBSCRIPT LEFT PARENTHESIS 208E ; Common # Pe SUBSCRIPT RIGHT PARENTHESIS -20A0..20BA ; Common # Sc [27] EURO-CURRENCY SIGN..TURKISH LIRA SIGN +20A0..20BD ; Common # Sc [30] EURO-CURRENCY SIGN..RUBLE SIGN 2100..2101 ; Common # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT 2102 ; Common # L& DOUBLE-STRUCK CAPITAL C 2103..2106 ; Common # So [4] DEGREE CELSIUS..CADA UNA @@ -200,7 +202,10 @@ 21D5..21F3 ; Common # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW 21F4..22FF ; Common # Sm [268] RIGHT ARROW WITH SMALL CIRCLE..Z NOTATION BAG MEMBERSHIP 2300..2307 ; Common # So [8] DIAMETER SIGN..WAVY LINE -2308..230B ; Common # Sm [4] LEFT CEILING..RIGHT FLOOR +2308 ; Common # Ps LEFT CEILING +2309 ; Common # Pe RIGHT CEILING +230A ; Common # Ps LEFT FLOOR +230B ; Common # Pe RIGHT FLOOR 
230C..231F ; Common # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER 2320..2321 ; Common # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL 2322..2328 ; Common # So [7] FROWN..KEYBOARD @@ -212,7 +217,7 @@ 239B..23B3 ; Common # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM 23B4..23DB ; Common # So [40] TOP SQUARE BRACKET..FUSE 23DC..23E1 ; Common # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET -23E2..23F3 ; Common # So [18] WHITE TRAPEZIUM..HOURGLASS WITH FLOWING SAND +23E2..23FA ; Common # So [25] WHITE TRAPEZIUM..BLACK CIRCLE FOR RECORD 2400..2426 ; Common # So [39] SYMBOL FOR NULL..SYMBOL FOR SUBSTITUTE FORM TWO 2440..244A ; Common # So [11] OCR HOOK..OCR DOUBLE BACKSLASH 2460..249B ; Common # No [60] CIRCLED DIGIT ONE..NUMBER TWENTY FULL STOP @@ -226,8 +231,7 @@ 25F8..25FF ; Common # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE 2600..266E ; Common # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN 266F ; Common # Sm MUSIC SHARP SIGN -2670..26FF ; Common # So [144] WEST SYRIAC CROSS..WHITE FLAG WITH HORIZONTAL MIDDLE BLACK STRIPE -2701..2767 ; Common # So [103] UPPER BLADE SCISSORS..ROTATED FLORAL HEART BULLET +2670..2767 ; Common # So [248] WEST SYRIAC CROSS..ROTATED FLORAL HEART BULLET 2768 ; Common # Ps MEDIUM LEFT PARENTHESIS ORNAMENT 2769 ; Common # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT 276A ; Common # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT @@ -295,7 +299,11 @@ 2B30..2B44 ; Common # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET 2B45..2B46 ; Common # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW 2B47..2B4C ; Common # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR -2B50..2B59 ; Common # So [10] WHITE MEDIUM STAR..HEAVY CIRCLED SALTIRE +2B4D..2B73 ; Common # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR +2B76..2B95 ; Common # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW 
+2B98..2BB9 ; Common # So [34] THREE-D TOP-LIGHTED LEFTWARDS EQUILATERAL ARROWHEAD..UP ARROWHEAD IN A RECTANGLE BOX +2BBD..2BC8 ; Common # So [12] BALLOT BOX WITH LIGHT X..BLACK MEDIUM RIGHT-POINTING TRIANGLE CENTRED +2BCA..2BD1 ; Common # So [8] TOP HALF BLACK CIRCLE..UNCERTAINTY SIGN 2E00..2E01 ; Common # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER 2E02 ; Common # Pi LEFT SUBSTITUTION BRACKET 2E03 ; Common # Pf RIGHT SUBSTITUTION BRACKET @@ -329,6 +337,10 @@ 2E2F ; Common # Lm VERTICAL TILDE 2E30..2E39 ; Common # Po [10] RING POINT..TOP HALF SECTION SIGN 2E3A..2E3B ; Common # Pd [2] TWO-EM DASH..THREE-EM DASH +2E3C..2E3F ; Common # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM +2E40 ; Common # Pd DOUBLE HYPHEN +2E41 ; Common # Po REVERSED COMMA +2E42 ; Common # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK 2FF0..2FFB ; Common # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID 3000 ; Common # Zs IDEOGRAPHIC SPACE 3001..3003 ; Common # Po [3] IDEOGRAPHIC COMMA..DITTO MARK @@ -392,9 +404,11 @@ A836..A837 ; Common # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK A838 ; Common # Sc NORTH INDIC RUPEE MARK A839 ; Common # So NORTH INDIC QUANTITY MARK -FD3E ; Common # Ps ORNATE LEFT PARENTHESIS -FD3F ; Common # Pe ORNATE RIGHT PARENTHESIS -FDFD ; Common # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM +A92E ; Common # Po KAYAH LI SIGN CWI +A9CF ; Common # Lm JAVANESE PANGRANGKEP +AB5B ; Common # Sk MODIFIER BREVE WITH INVERTED BREVE +FD3E ; Common # Pe ORNATE LEFT PARENTHESIS +FD3F ; Common # Ps ORNATE RIGHT PARENTHESIS FE10..FE16 ; Common # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK FE17 ; Common # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET FE18 ; Common # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET @@ -487,6 +501,8 @@ 10137..1013F ; Common # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE 
THIRD SUBUNIT 10190..1019B ; Common # So [12] ROMAN SEXTANS SIGN..ROMAN CENTURIAL SIGN 101D0..101FC ; Common # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND +102E1..102FB ; Common # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED +1BCA0..1BCA3 ; Common # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP 1D000..1D0F5 ; Common # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO 1D100..1D126 ; Common # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 1D129..1D164 ; Common # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE @@ -543,10 +559,10 @@ 1F000..1F02B ; Common # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK 1F030..1F093 ; Common # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06 1F0A0..1F0AE ; Common # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES -1F0B1..1F0BE ; Common # So [14] PLAYING CARD ACE OF HEARTS..PLAYING CARD KING OF HEARTS +1F0B1..1F0BF ; Common # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER 1F0C1..1F0CF ; Common # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER -1F0D1..1F0DF ; Common # So [15] PLAYING CARD ACE OF CLUBS..PLAYING CARD WHITE JOKER -1F100..1F10A ; Common # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA +1F0D1..1F0F5 ; Common # So [37] PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21 +1F100..1F10C ; Common # No [13] DIGIT ZERO FULL STOP..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO 1F110..1F12E ; Common # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ 1F130..1F16B ; Common # So [60] SQUARED LATIN CAPITAL LETTER A..RAISED MD SIGN 1F170..1F19A ; Common # So [43] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VS @@ -555,28 +571,29 @@ 1F210..1F23A ; Common # So [43] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-55B6 1F240..1F248 ; Common # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE 
SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 1F250..1F251 ; Common # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT -1F300..1F320 ; Common # So [33] CYCLONE..SHOOTING STAR -1F330..1F335 ; Common # So [6] CHESTNUT..CACTUS -1F337..1F37C ; Common # So [70] TULIP..BABY BOTTLE -1F380..1F393 ; Common # So [20] RIBBON..GRADUATION CAP -1F3A0..1F3C4 ; Common # So [37] CAROUSEL HORSE..SURFER -1F3C6..1F3CA ; Common # So [5] TROPHY..SWIMMER -1F3E0..1F3F0 ; Common # So [17] HOUSE BUILDING..EUROPEAN CASTLE -1F400..1F43E ; Common # So [63] RAT..PAW PRINTS -1F440 ; Common # So EYES -1F442..1F4F7 ; Common # So [182] EAR..CAMERA -1F4F9..1F4FC ; Common # So [4] VIDEO CAMERA..VIDEOCASSETTE -1F500..1F53D ; Common # So [62] TWISTED RIGHTWARDS ARROWS..DOWN-POINTING SMALL RED TRIANGLE -1F540..1F543 ; Common # So [4] CIRCLED CROSS POMMEE..NOTCHED LEFT SEMICIRCLE WITH THREE DOTS -1F550..1F567 ; Common # So [24] CLOCK FACE ONE OCLOCK..CLOCK FACE TWELVE-THIRTY -1F5FB..1F640 ; Common # So [70] MOUNT FUJI..WEARY CAT FACE -1F645..1F64F ; Common # So [11] FACE WITH NO GOOD GESTURE..PERSON WITH FOLDED HANDS -1F680..1F6C5 ; Common # So [70] ROCKET..LEFT LUGGAGE +1F300..1F32C ; Common # So [45] CYCLONE..WIND BLOWING FACE +1F330..1F37D ; Common # So [78] CHESTNUT..FORK AND KNIFE WITH PLATE +1F380..1F3CE ; Common # So [79] RIBBON..RACING CAR +1F3D4..1F3F7 ; Common # So [36] SNOW CAPPED MOUNTAIN..LABEL +1F400..1F4FE ; Common # So [255] RAT..PORTABLE STEREO +1F500..1F54A ; Common # So [75] TWISTED RIGHTWARDS ARROWS..DOVE OF PEACE +1F550..1F579 ; Common # So [42] CLOCK FACE ONE OCLOCK..JOYSTICK +1F57B..1F5A3 ; Common # So [41] LEFT HAND TELEPHONE RECEIVER..BLACK DOWN POINTING BACKHAND INDEX +1F5A5..1F642 ; Common # So [158] DESKTOP COMPUTER..SLIGHTLY SMILING FACE +1F645..1F6CF ; Common # So [139] FACE WITH NO GOOD GESTURE..BED +1F6E0..1F6EC ; Common # So [13] HAMMER AND WRENCH..AIRPLANE ARRIVING +1F6F0..1F6F3 ; Common # So [4] SATELLITE..PASSENGER SHIP 1F700..1F773 ; Common # So [116] 
ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE +1F780..1F7D4 ; Common # So [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR +1F800..1F80B ; Common # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD +1F810..1F847 ; Common # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW +1F850..1F859 ; Common # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW +1F860..1F887 ; Common # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW +1F890..1F8AD ; Common # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS E0001 ; Common # Cf LANGUAGE TAG E0020..E007F ; Common # Cf [96] TAG SPACE..CANCEL TAG -# Total code points: 6413 +# Total code points: 7129 # ================================================ @@ -618,16 +635,20 @@ A770 ; Latin # Lm MODIFIER LETTER US A771..A787 ; Latin # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T A78B..A78E ; Latin # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL LETTER L WITH RETROFLEX HOOK AND BELT -A790..A793 ; Latin # L& [4] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER C WITH BAR -A7A0..A7AA ; Latin # L& [11] LATIN CAPITAL LETTER G WITH OBLIQUE STROKE..LATIN CAPITAL LETTER H WITH HOOK +A790..A7AD ; Latin # L& [30] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN CAPITAL LETTER L WITH BELT +A7B0..A7B1 ; Latin # L& [2] LATIN CAPITAL LETTER TURNED K..LATIN CAPITAL LETTER TURNED T +A7F7 ; Latin # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I A7F8..A7F9 ; Latin # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE A7FA ; Latin # L& LATIN LETTER SMALL CAPITAL TURNED M A7FB..A7FF ; Latin # Lo [5] LATIN EPIGRAPHIC LETTER REVERSED F..LATIN EPIGRAPHIC LETTER ARCHAIC M +AB30..AB5A ; Latin # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG +AB5C..AB5F ; Latin # Lm [4] 
MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK +AB64 ; Latin # L& LATIN SMALL LETTER INVERTED ALPHA FB00..FB06 ; Latin # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST FF21..FF3A ; Latin # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z FF41..FF5A ; Latin # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z -# Total code points: 1272 +# Total code points: 1338 # ================================================ @@ -636,6 +657,7 @@ 0376..0377 ; Greek # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA 037A ; Greek # Lm GREEK YPOGEGRAMMENI 037B..037D ; Greek # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL +037F ; Greek # L& GREEK CAPITAL LETTER YOT 0384 ; Greek # Sk GREEK TONOS 0386 ; Greek # L& GREEK CAPITAL LETTER ALPHA WITH TONOS 0388..038A ; Greek # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS @@ -675,15 +697,18 @@ 1FF6..1FFC ; Greek # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI 1FFD..1FFE ; Greek # Sk [2] GREEK OXIA..GREEK DASIA 2126 ; Greek # L& OHM SIGN +AB65 ; Greek # L& GREEK LETTER SMALL CAPITAL OMEGA 10140..10174 ; Greek # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 10175..10178 ; Greek # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN 10179..10189 ; Greek # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN -1018A ; Greek # No GREEK ZERO SIGN +1018A..1018B ; Greek # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN +1018C ; Greek # So GREEK SINUSOID SIGN +101A0 ; Greek # So GREEK SYMBOL TAU RHO 1D200..1D241 ; Greek # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 1D242..1D244 ; Greek # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME 1D245 ; Greek # So GREEK MUSICAL LEIMMA -# Total code points: 511 +# Total code 
points: 516 # ================================================ @@ -692,7 +717,7 @@ 0483..0484 ; Cyrillic # Mn [2] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC PALATALIZATION 0487 ; Cyrillic # Mn COMBINING CYRILLIC POKRYTIE 0488..0489 ; Cyrillic # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN -048A..0527 ; Cyrillic # L& [158] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER SHHA WITH DESCENDER +048A..052F ; Cyrillic # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER 1D2B ; Cyrillic # L& CYRILLIC LETTER SMALL CAPITAL EL 1D78 ; Cyrillic # Lm MODIFIER LETTER CYRILLIC EN 2DE0..2DFF ; Cyrillic # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS @@ -704,10 +729,11 @@ A674..A67D ; Cyrillic # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK A67E ; Cyrillic # Po CYRILLIC KAVYKA A67F ; Cyrillic # Lm CYRILLIC PAYEROK -A680..A697 ; Cyrillic # L& [24] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER SHWE +A680..A69B ; Cyrillic # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O +A69C..A69D ; Cyrillic # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN A69F ; Cyrillic # Mn COMBINING CYRILLIC LETTER IOTIFIED E -# Total code points: 417 +# Total code points: 431 # ================================================ @@ -716,10 +742,11 @@ 055A..055F ; Armenian # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK 0561..0587 ; Armenian # L& [39] ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LIGATURE ECH YIWN 058A ; Armenian # Pd ARMENIAN HYPHEN +058D..058E ; Armenian # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN 058F ; Armenian # Sc ARMENIAN DRAM SIGN FB13..FB17 ; Armenian # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH -# Total code points: 91 +# Total code points: 93 # ================================================ @@ -779,9 
+806,8 @@ 06FD..06FE ; Arabic # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN 06FF ; Arabic # Lo ARABIC LETTER HEH WITH INVERTED V 0750..077F ; Arabic # Lo [48] ARABIC LETTER BEH WITH THREE DOTS HORIZONTALLY BELOW..ARABIC LETTER KAF WITH TWO DOTS ABOVE -08A0 ; Arabic # Lo ARABIC LETTER BEH WITH SMALL V BELOW -08A2..08AC ; Arabic # Lo [11] ARABIC LETTER JEEM WITH TWO DOTS ABOVE..ARABIC LETTER ROHINGYA YEH -08E4..08FE ; Arabic # Mn [27] ARABIC CURLY FATHA..ARABIC DAMMA WITH DOT +08A0..08B2 ; Arabic # Lo [19] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER ZAIN WITH INVERTED V ABOVE +08E4..08FF ; Arabic # Mn [28] ARABIC CURLY FATHA..ARABIC MARK SIDEWAYS NOON GHUNNA FB50..FBB1 ; Arabic # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM FBB2..FBC1 ; Arabic # Sk [16] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL SMALL TAH BELOW FBD3..FD3D ; Arabic # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM @@ -789,6 +815,7 @@ FD92..FDC7 ; Arabic # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM FDF0..FDFB ; Arabic # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU FDFC ; Arabic # Sc RIAL SIGN +FDFD ; Arabic # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM FE70..FE74 ; Arabic # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM FE76..FEFC ; Arabic # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM 10E60..10E7E ; Arabic # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS @@ -827,7 +854,7 @@ 1EEAB..1EEBB ; Arabic # Lo [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN 1EEF0..1EEF1 ; Arabic # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL -# Total code points: 1235 +# Total code points: 1244 # 
================================================ @@ -870,17 +897,17 @@ 0966..096F ; Devanagari # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE 0970 ; Devanagari # Po DEVANAGARI ABBREVIATION SIGN 0971 ; Devanagari # Lm DEVANAGARI SIGN HIGH SPACING DOT -0972..0977 ; Devanagari # Lo [6] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER UUE -0979..097F ; Devanagari # Lo [7] DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA +0972..097F ; Devanagari # Lo [14] DEVANAGARI LETTER CANDRA A..DEVANAGARI LETTER BBA A8E0..A8F1 ; Devanagari # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA A8F2..A8F7 ; Devanagari # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA A8F8..A8FA ; Devanagari # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI CARET A8FB ; Devanagari # Lo DEVANAGARI HEADSTROKE -# Total code points: 151 +# Total code points: 152 # ================================================ +0980 ; Bengali # Lo BENGALI ANJI 0981 ; Bengali # Mn BENGALI SIGN CANDRABINDU 0982..0983 ; Bengali # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA 0985..098C ; Bengali # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L @@ -908,7 +935,7 @@ 09FA ; Bengali # So BENGALI ISSHAR 09FB ; Bengali # Sc BENGALI GANDA MARK -# Total code points: 92 +# Total code points: 93 # ================================================ @@ -1025,12 +1052,12 @@ # ================================================ +0C00 ; Telugu # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE 0C01..0C03 ; Telugu # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA 0C05..0C0C ; Telugu # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L 0C0E..0C10 ; Telugu # Lo [3] TELUGU LETTER E..TELUGU LETTER AI 0C12..0C28 ; Telugu # Lo [23] TELUGU LETTER O..TELUGU LETTER NA -0C2A..0C33 ; Telugu # Lo [10] TELUGU LETTER PA..TELUGU LETTER LLA -0C35..0C39 ; Telugu # Lo [5] TELUGU LETTER VA..TELUGU LETTER HA +0C2A..0C39 ; Telugu # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA 0C3D ; Telugu # Lo TELUGU SIGN 
AVAGRAHA 0C3E..0C40 ; Telugu # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II 0C41..0C44 ; Telugu # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR @@ -1044,10 +1071,11 @@ 0C78..0C7E ; Telugu # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR 0C7F ; Telugu # So TELUGU SIGN TUUMU -# Total code points: 93 +# Total code points: 95 # ================================================ +0C81 ; Kannada # Mn KANNADA SIGN CANDRABINDU 0C82..0C83 ; Kannada # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA 0C85..0C8C ; Kannada # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L 0C8E..0C90 ; Kannada # Lo [3] KANNADA LETTER E..KANNADA LETTER AI @@ -1070,10 +1098,11 @@ 0CE6..0CEF ; Kannada # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE 0CF1..0CF2 ; Kannada # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA -# Total code points: 86 +# Total code points: 87 # ================================================ +0D01 ; Malayalam # Mn MALAYALAM SIGN CANDRABINDU 0D02..0D03 ; Malayalam # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA 0D05..0D0C ; Malayalam # Lo [8] MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC L 0D0E..0D10 ; Malayalam # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI @@ -1093,7 +1122,7 @@ 0D79 ; Malayalam # So MALAYALAM DATE MARK 0D7A..0D7F ; Malayalam # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K -# Total code points: 98 +# Total code points: 99 # ================================================ @@ -1108,10 +1137,12 @@ 0DD2..0DD4 ; Sinhala # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA 0DD6 ; Sinhala # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA 0DD8..0DDF ; Sinhala # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA +0DE6..0DEF ; Sinhala # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE 0DF2..0DF3 ; Sinhala # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA 0DF4 ; Sinhala # Po 
SINHALA PUNCTUATION KUNDDALIYA +111E1..111F4 ; Sinhala # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND -# Total code points: 80 +# Total code points: 110 # ================================================ @@ -1234,14 +1265,23 @@ 109A..109C ; Myanmar # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A 109D ; Myanmar # Mn MYANMAR VOWEL SIGN AITON AI 109E..109F ; Myanmar # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION +A9E0..A9E4 ; Myanmar # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA +A9E5 ; Myanmar # Mn MYANMAR SIGN SHAN SAW +A9E6 ; Myanmar # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION +A9E7..A9EF ; Myanmar # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA +A9F0..A9F9 ; Myanmar # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE +A9FA..A9FE ; Myanmar # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA AA60..AA6F ; Myanmar # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA AA70 ; Myanmar # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION AA71..AA76 ; Myanmar # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM AA77..AA79 ; Myanmar # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO AA7A ; Myanmar # Lo MYANMAR LETTER AITON RA AA7B ; Myanmar # Mc MYANMAR SIGN PAO KAREN TONE +AA7C ; Myanmar # Mn MYANMAR SIGN TAI LAING TONE-2 +AA7D ; Myanmar # Mc MYANMAR SIGN TAI LAING TONE-5 +AA7E..AA7F ; Myanmar # Lo [2] MYANMAR LETTER SHWE PALAUNG CHA..MYANMAR LETTER SHWE PALAUNG SHA -# Total code points: 188 +# Total code points: 223 # ================================================ @@ -1345,8 +1385,9 @@ 16A0..16EA ; Runic # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X 16EE..16F0 ; Runic # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL +16F1..16F8 ; Runic # Lo [8] RUNIC LETTER K..RUNIC LETTER FRANKS CASKET AESC -# Total code points: 78 +# Total code points: 86 # ================================================ @@ -1377,7 
+1418,7 @@ 1806 ; Mongolian # Pd MONGOLIAN TODO SOFT HYPHEN 1807..180A ; Mongolian # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU 180B..180D ; Mongolian # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE -180E ; Mongolian # Zs MONGOLIAN VOWEL SEPARATOR +180E ; Mongolian # Cf MONGOLIAN VOWEL SEPARATOR 1810..1819 ; Mongolian # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE 1820..1842 ; Mongolian # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI 1843 ; Mongolian # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN @@ -1452,10 +1493,10 @@ # ================================================ -10300..1031E ; Old_Italic # Lo [31] OLD ITALIC LETTER A..OLD ITALIC LETTER UU +10300..1031F ; Old_Italic # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS 10320..10323 ; Old_Italic # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY -# Total code points: 35 +# Total code points: 36 # ================================================ @@ -1479,12 +1520,15 @@ 064B..0655 ; Inherited # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW 0670 ; Inherited # Mn ARABIC LETTER SUPERSCRIPT ALEF 0951..0952 ; Inherited # Mn [2] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI STRESS SIGN ANUDATTA +1AB0..1ABD ; Inherited # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW +1ABE ; Inherited # Me COMBINING PARENTHESES OVERLAY 1CD0..1CD2 ; Inherited # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA 1CD4..1CE0 ; Inherited # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA 1CE2..1CE8 ; Inherited # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL 1CED ; Inherited # Mn VEDIC SIGN TIRYAK 1CF4 ; Inherited # Mn VEDIC TONE CANDRA ABOVE -1DC0..1DE6 ; Inherited # Mn [39] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER Z +1CF8..1CF9 ; Inherited # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE +1DC0..1DF5 ; Inherited # Mn [54] COMBINING DOTTED GRAVE 
ACCENT..COMBINING UP TACK ABOVE 1DFC..1DFF ; Inherited # Mn [4] COMBINING DOUBLE INVERTED BREVE BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW 200C..200D ; Inherited # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER 20D0..20DC ; Inherited # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE @@ -1495,15 +1539,16 @@ 302A..302D ; Inherited # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK 3099..309A ; Inherited # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK FE00..FE0F ; Inherited # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 -FE20..FE26 ; Inherited # Mn [7] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON +FE20..FE2D ; Inherited # Mn [14] COMBINING LIGATURE LEFT HALF..COMBINING CONJOINING MACRON BELOW 101FD ; Inherited # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE +102E0 ; Inherited # Mn COPTIC EPACT THOUSANDS MARK 1D167..1D169 ; Inherited # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 1D17B..1D182 ; Inherited # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE 1D185..1D18B ; Inherited # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE 1D1AA..1D1AD ; Inherited # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO E0100..E01EF ; Inherited # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 -# Total code points: 523 +# Total code points: 563 # ================================================ @@ -1537,7 +1582,7 @@ # ================================================ -1900..191C ; Limbu # Lo [29] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER HA +1900..191E ; Limbu # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA 1920..1922 ; Limbu # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U 1923..1926 ; Limbu # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU 1927..1928 ; Limbu # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O @@ -1550,7 +1595,7 @@ 
1944..1945 ; Limbu # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK 1946..194F ; Limbu # Nd [10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE -# Total code points: 66 +# Total code points: 68 # ================================================ @@ -1612,7 +1657,8 @@ 1A00..1A16 ; Buginese # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA 1A17..1A18 ; Buginese # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U -1A19..1A1B ; Buginese # Mc [3] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN AE +1A19..1A1A ; Buginese # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O +1A1B ; Buginese # Mn BUGINESE VOWEL SIGN AE 1A1E..1A1F ; Buginese # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION # Total code points: 30 @@ -1724,11 +1770,11 @@ # ================================================ -12000..1236E ; Cuneiform # Lo [879] CUNEIFORM SIGN A..CUNEIFORM SIGN ZUM -12400..12462 ; Cuneiform # Nl [99] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN OLD ASSYRIAN ONE QUARTER -12470..12473 ; Cuneiform # Po [4] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL TRICOLON +12000..12398 ; Cuneiform # Lo [921] CUNEIFORM SIGN A..CUNEIFORM SIGN UM TIMES ME +12400..1246E ; Cuneiform # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM +12470..12474 ; Cuneiform # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON -# Total code points: 982 +# Total code points: 1037 # ================================================ @@ -1767,8 +1813,7 @@ 1BA6..1BA7 ; Sundanese # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG 1BA8..1BA9 ; Sundanese # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG 1BAA ; Sundanese # Mc SUNDANESE SIGN PAMAAEH -1BAB ; Sundanese # Mn SUNDANESE SIGN VIRAMA -1BAC..1BAD ; Sundanese # Mc [2] SUNDANESE CONSONANT SIGN PASANGAN MA..SUNDANESE CONSONANT SIGN PASANGAN WA +1BAB..1BAD ; Sundanese # Mn [3] SUNDANESE SIGN 
VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA 1BAE..1BAF ; Sundanese # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA 1BB0..1BB9 ; Sundanese # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE 1BBA..1BBF ; Sundanese # Lo [6] SUNDANESE AVAGRAHA..SUNDANESE LETTER FINAL M @@ -1825,9 +1870,9 @@ A900..A909 ; Kayah_Li # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE A90A..A925 ; Kayah_Li # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO A926..A92D ; Kayah_Li # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU -A92E..A92F ; Kayah_Li # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA +A92F ; Kayah_Li # Po KAYAH LI SIGN SHYA -# Total code points: 48 +# Total code points: 47 # ================================================ @@ -1974,11 +2019,10 @@ A9BC ; Javanese # Mn JAVANESE VOWEL SIGN PEPET A9BD..A9C0 ; Javanese # Mc [4] JAVANESE CONSONANT SIGN KERET..JAVANESE PANGKON A9C1..A9CD ; Javanese # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH -A9CF ; Javanese # Lm JAVANESE PANGRANGKEP A9D0..A9D9 ; Javanese # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE A9DE..A9DF ; Javanese # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN -# Total code points: 91 +# Total code points: 90 # ================================================ @@ -2080,8 +2124,9 @@ 11047..1104D ; Brahmi # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS 11052..11065 ; Brahmi # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND 11066..1106F ; Brahmi # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE +1107F ; Brahmi # Mn BRAHMI NUMBER JOINER -# Total code points: 108 +# Total code points: 109 # ================================================ @@ -2136,9 +2181,11 @@ 111BF..111C0 ; Sharada # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA 111C1..111C4 ; Sharada # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM 111C5..111C8 ; Sharada # Po [4] SHARADA DANDA..SHARADA SEPARATOR +111CD ; Sharada # Po SHARADA SUTRA MARK 111D0..111D9 ; Sharada # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE +111DA ; 
Sharada # Lo SHARADA EKAM -# Total code points: 83 +# Total code points: 85 # ================================================ @@ -2161,4 +2208,244 @@ # Total code points: 66 +# ================================================ + +10530..10563 ; Caucasian_Albanian # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW +1056F ; Caucasian_Albanian # Po CAUCASIAN ALBANIAN CITATION MARK + +# Total code points: 53 + +# ================================================ + +16AD0..16AED ; Bassa_Vah # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I +16AF0..16AF4 ; Bassa_Vah # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE +16AF5 ; Bassa_Vah # Po BASSA VAH FULL STOP + +# Total code points: 36 + +# ================================================ + +1BC00..1BC6A ; Duployan # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M +1BC70..1BC7C ; Duployan # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK +1BC80..1BC88 ; Duployan # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL +1BC90..1BC99 ; Duployan # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW +1BC9C ; Duployan # So DUPLOYAN SIGN O WITH CROSS +1BC9D..1BC9E ; Duployan # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK +1BC9F ; Duployan # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP + +# Total code points: 143 + +# ================================================ + +10500..10527 ; Elbasan # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE + +# Total code points: 40 + +# ================================================ + +11301 ; Grantha # Mn GRANTHA SIGN CANDRABINDU +11302..11303 ; Grantha # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA +11305..1130C ; Grantha # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L +1130F..11310 ; Grantha # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI +11313..11328 ; Grantha # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA +1132A..11330 ; Grantha # Lo [7] GRANTHA LETTER PA..GRANTHA 
LETTER RA +11332..11333 ; Grantha # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA +11335..11339 ; Grantha # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA +1133C ; Grantha # Mn GRANTHA SIGN NUKTA +1133D ; Grantha # Lo GRANTHA SIGN AVAGRAHA +1133E..1133F ; Grantha # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I +11340 ; Grantha # Mn GRANTHA VOWEL SIGN II +11341..11344 ; Grantha # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR +11347..11348 ; Grantha # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI +1134B..1134D ; Grantha # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA +11357 ; Grantha # Mc GRANTHA AU LENGTH MARK +1135D..11361 ; Grantha # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL +11362..11363 ; Grantha # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL +11366..1136C ; Grantha # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX +11370..11374 ; Grantha # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA + +# Total code points: 83 + +# ================================================ + +16B00..16B2F ; Pahawh_Hmong # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU +16B30..16B36 ; Pahawh_Hmong # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM +16B37..16B3B ; Pahawh_Hmong # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM +16B3C..16B3F ; Pahawh_Hmong # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB +16B40..16B43 ; Pahawh_Hmong # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM +16B44 ; Pahawh_Hmong # Po PAHAWH HMONG SIGN XAUS +16B45 ; Pahawh_Hmong # So PAHAWH HMONG SIGN CIM TSOV ROG +16B50..16B59 ; Pahawh_Hmong # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE +16B5B..16B61 ; Pahawh_Hmong # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS +16B63..16B77 ; Pahawh_Hmong # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS +16B7D..16B8F ; Pahawh_Hmong # Lo [19] PAHAWH HMONG CLAN SIGN 
TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ + +# Total code points: 127 + +# ================================================ + +11200..11211 ; Khojki # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA +11213..1122B ; Khojki # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA +1122C..1122E ; Khojki # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II +1122F..11231 ; Khojki # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI +11232..11233 ; Khojki # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU +11234 ; Khojki # Mn KHOJKI SIGN ANUSVARA +11235 ; Khojki # Mc KHOJKI SIGN VIRAMA +11236..11237 ; Khojki # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA +11238..1123D ; Khojki # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN + +# Total code points: 61 + +# ================================================ + +10600..10736 ; Linear_A # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 +10740..10755 ; Linear_A # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE +10760..10767 ; Linear_A # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 + +# Total code points: 341 + +# ================================================ + +11150..11172 ; Mahajani # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA +11173 ; Mahajani # Mn MAHAJANI SIGN NUKTA +11174..11175 ; Mahajani # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK +11176 ; Mahajani # Lo MAHAJANI LIGATURE SHRI + +# Total code points: 39 + +# ================================================ + +10AC0..10AC7 ; Manichaean # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW +10AC8 ; Manichaean # So MANICHAEAN SIGN UD +10AC9..10AE4 ; Manichaean # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW +10AE5..10AE6 ; Manichaean # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW +10AEB..10AEF ; Manichaean # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED +10AF0..10AF6 ; Manichaean # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER + +# Total code points: 51 + +# 
================================================ + +1E800..1E8C4 ; Mende_Kikakui # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON +1E8C7..1E8CF ; Mende_Kikakui # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE +1E8D0..1E8D6 ; Mende_Kikakui # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS + +# Total code points: 213 + +# ================================================ + +11600..1162F ; Modi # Lo [48] MODI LETTER A..MODI LETTER LLA +11630..11632 ; Modi # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II +11633..1163A ; Modi # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI +1163B..1163C ; Modi # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU +1163D ; Modi # Mn MODI SIGN ANUSVARA +1163E ; Modi # Mc MODI SIGN VISARGA +1163F..11640 ; Modi # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA +11641..11643 ; Modi # Po [3] MODI DANDA..MODI ABBREVIATION SIGN +11644 ; Modi # Lo MODI SIGN HUVA +11650..11659 ; Modi # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE + +# Total code points: 79 + +# ================================================ + +16A40..16A5E ; Mro # Lo [31] MRO LETTER TA..MRO LETTER TEK +16A60..16A69 ; Mro # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE +16A6E..16A6F ; Mro # Po [2] MRO DANDA..MRO DOUBLE DANDA + +# Total code points: 43 + +# ================================================ + +10A80..10A9C ; Old_North_Arabian # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH +10A9D..10A9F ; Old_North_Arabian # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY + +# Total code points: 32 + +# ================================================ + +10880..1089E ; Nabataean # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW +108A7..108AF ; Nabataean # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED + +# Total code points: 40 + +# ================================================ + +10860..10876 ; Palmyrene # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW 
+10877..10878 ; Palmyrene # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON +10879..1087F ; Palmyrene # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY + +# Total code points: 32 + +# ================================================ + +11AC0..11AF8 ; Pau_Cin_Hau # Lo [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL + +# Total code points: 57 + +# ================================================ + +10350..10375 ; Old_Permic # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA +10376..1037A ; Old_Permic # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII + +# Total code points: 43 + +# ================================================ + +10B80..10B91 ; Psalter_Pahlavi # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW +10B99..10B9C ; Psalter_Pahlavi # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT +10BA9..10BAF ; Psalter_Pahlavi # No [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED + +# Total code points: 29 + +# ================================================ + +11580..115AE ; Siddham # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA +115AF..115B1 ; Siddham # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II +115B2..115B5 ; Siddham # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR +115B8..115BB ; Siddham # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU +115BC..115BD ; Siddham # Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA +115BE ; Siddham # Mc SIDDHAM SIGN VISARGA +115BF..115C0 ; Siddham # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA +115C1..115C9 ; Siddham # Po [9] SIDDHAM SIGN SIDDHAM..SIDDHAM END OF TEXT MARK + +# Total code points: 72 + +# ================================================ + +112B0..112DE ; Khudawadi # Lo [47] KHUDAWADI LETTER A..KHUDAWADI LETTER HA +112DF ; Khudawadi # Mn KHUDAWADI SIGN ANUSVARA +112E0..112E2 ; Khudawadi # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II +112E3..112EA ; Khudawadi 
# Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA +112F0..112F9 ; Khudawadi # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE + +# Total code points: 69 + +# ================================================ + +11480..114AF ; Tirhuta # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA +114B0..114B2 ; Tirhuta # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II +114B3..114B8 ; Tirhuta # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL +114B9 ; Tirhuta # Mc TIRHUTA VOWEL SIGN E +114BA ; Tirhuta # Mn TIRHUTA VOWEL SIGN SHORT E +114BB..114BE ; Tirhuta # Mc [4] TIRHUTA VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU +114BF..114C0 ; Tirhuta # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA +114C1 ; Tirhuta # Mc TIRHUTA SIGN VISARGA +114C2..114C3 ; Tirhuta # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA +114C4..114C5 ; Tirhuta # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG +114C6 ; Tirhuta # Po TIRHUTA ABBREVIATION SIGN +114C7 ; Tirhuta # Lo TIRHUTA OM +114D0..114D9 ; Tirhuta # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE + +# Total code points: 82 + +# ================================================ + +118A0..118DF ; Warang_Citi # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO +118E0..118E9 ; Warang_Citi # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE +118EA..118F2 ; Warang_Citi # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY +118FF ; Warang_Citi # Lo WARANG CITI OM + +# Total code points: 84 + # EOF --- old/jdk/test/java/text/Bidi/BidiConformance.java 2015-07-13 16:12:05.000000000 +0900 +++ new/jdk/test/java/text/Bidi/BidiConformance.java 2015-07-13 16:12:05.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,7 +23,7 @@ /* * @test - * @bug 6850113 + * @bug 6850113 8032446 * @summary confirm the behavior of new Bidi implementation. (Backward compatibility) */ @@ -40,6 +40,8 @@ private static boolean verbose = false; private static boolean abort = false; + private static final byte MAX_EXPLICIT_LEVEL = 125; + public static void main(String[] args) { for (int i = 0; i < args.length; i++) { String arg = args[i]; @@ -368,15 +370,15 @@ AttributedString astr = new AttributedString(paragraph); astr.addAttribute(TextAttribute.RUN_DIRECTION, TextAttribute.RUN_DIRECTION_RTL); - astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-61), + astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-MAX_EXPLICIT_LEVEL), start, limit); try { bidi = new Bidi(astr.getIterator()); for (int i = start; i < limit; i++) { - if (bidi.getLevelAt(i) != 61) { + if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) { errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" + i + ") should not be " + bidi.getLevelAt(i) + - " but 60 when BIDI_EMBEDDING is -61."); + " but MAX_EXPLICIT_LEVEL-1 when BIDI_EMBEDDING is -MAX_EXPLICIT_LEVEL."); } } } @@ -387,14 +389,14 @@ astr = new AttributedString(paragraph); astr.addAttribute(TextAttribute.RUN_DIRECTION, TextAttribute.RUN_DIRECTION_RTL); - astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-62), + astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(-(MAX_EXPLICIT_LEVEL+1)), start, limit); try { bidi = new Bidi(astr.getIterator()); for (int i = start; i < limit; i++) { if (bidi.getLevelAt(i) != 1) { errorHandling("Bidi(AttributedCharacterIterator).getLevelAt() " + - "should be 1 when BIDI_EMBEDDING is -62."); + "should be 1 when BIDI_EMBEDDING is -(MAX_EXPLICIT_LEVEL+1)."); } } } @@ -405,14 +407,14 @@ astr = new AttributedString(paragraph); astr.addAttribute(TextAttribute.RUN_DIRECTION, TextAttribute.RUN_DIRECTION_RTL); - 
astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(60), + astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL-1), start, limit); try { bidi = new Bidi(astr.getIterator()); for (int i = start; i < limit; i++) { - if (bidi.getLevelAt(i) != 61) { + if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) { errorHandling("Bidi(AttributedCharacterIterator).getLevelAt() " + - "should be 61 when BIDI_EMBEDDING is 60."); + "should be MAX_EXPLICIT_LEVEL when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL-1."); } } } @@ -423,15 +425,15 @@ astr = new AttributedString(paragraph); astr.addAttribute(TextAttribute.RUN_DIRECTION, TextAttribute.RUN_DIRECTION_RTL); - astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(61), + astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL), start, limit); try { bidi = new Bidi(astr.getIterator()); for (int i = start; i < limit; i++) { - if (bidi.getLevelAt(i) != 61) { + if (bidi.getLevelAt(i) != MAX_EXPLICIT_LEVEL) { errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" + i + ") should not be " + bidi.getLevelAt(i) + - " but 61 when BIDI_EMBEDDING is 61."); + " but MAX_EXPLICIT_LEVEL when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL."); } } } @@ -442,15 +444,15 @@ astr = new AttributedString(paragraph); astr.addAttribute(TextAttribute.RUN_DIRECTION, TextAttribute.RUN_DIRECTION_RTL); - astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(62), + astr.addAttribute(TextAttribute.BIDI_EMBEDDING, new Integer(MAX_EXPLICIT_LEVEL+1), start, limit); try { bidi = new Bidi(astr.getIterator()); for (int i = start; i < limit; i++) { if (bidi.getLevelAt(i) != 1) { - errorHandling("Bidi(AttributedCharacterIterator).getLevelAt()" + - " should not be " + bidi.getLevelAt(i) + - " but 1 when BIDI_EMBEDDING is 62."); + errorHandling("Bidi(AttributedCharacterIterator).getLevelAt(" + + i + ") should not be " + bidi.getLevelAt(i) + + " but 1 when BIDI_EMBEDDING is MAX_EXPLICIT_LEVEL+1."); } } } @@ -536,8 +538,8 
@@ } byte[] actualLevels = new byte[text.length]; - byte[] validEmbeddings1 = {0, -61, -60, -2, -1}; - byte[] expectedLevels1 = {0, 61, 60, 2, 1}; + byte[] validEmbeddings1 = {0, -MAX_EXPLICIT_LEVEL, -(MAX_EXPLICIT_LEVEL-1), -2, -1}; + byte[] expectedLevels1 = {0, MAX_EXPLICIT_LEVEL, MAX_EXPLICIT_LEVEL-1, 2, 1}; try { bidi = new Bidi(text, 0, validEmbeddings1, 0, 5, Bidi.DIRECTION_LEFT_TO_RIGHT); @@ -553,11 +555,11 @@ } catch (Exception e) { errorHandling("Bidi(char[], ...) should not throw an exception " + - "when embeddings is valid(-61)."); + "when embeddings is valid(-MAX_EXPLICIT_LEVEL)."); } - byte[] validEmbeddings2 = {0, 61, 60, 2, 1}; - byte[] expectedLevels2 = {0, 62, 60, 2, 2}; + byte[] validEmbeddings2 = {0, MAX_EXPLICIT_LEVEL, MAX_EXPLICIT_LEVEL-1, 2, 1}; + byte[] expectedLevels2 = {0, MAX_EXPLICIT_LEVEL+1, MAX_EXPLICIT_LEVEL-1, 2, 2}; try { bidi = new Bidi(text, 0, validEmbeddings2, 0, 5, Bidi.DIRECTION_LEFT_TO_RIGHT); @@ -573,35 +575,35 @@ } catch (Exception e) { errorHandling("Bidi(char[], ...) should not throw an exception " + - "when embeddings is valid(61)."); + "when embeddings is valid(MAX_EXPLICIT_LEVEL)."); } - byte[] invalidEmbeddings1 = {0, -62, 0, 0, 0}; + byte[] invalidEmbeddings1 = {0, -(MAX_EXPLICIT_LEVEL+1), 0, 0, 0}; try { bidi = new Bidi(text, 0, invalidEmbeddings1, 0, 5, Bidi.DIRECTION_LEFT_TO_RIGHT); if (bidi.getLevelAt(1) != 0) { errorHandling("Bidi(char[], ...).getLevelAt(1) should be 0 " + - "when embeddings[1] is -62."); + "when embeddings[1] is -(MAX_EXPLICIT_LEVEL+1)."); } } catch (Exception e) { errorHandling("Bidi(char[], ...) 
should not throw an exception " + - "even when embeddings includes -62."); + "even when embeddings includes -(MAX_EXPLICIT_LEVEL+1)."); } - byte[] invalidEmbeddings2 = {0, 62, 0, 0, 0}; + byte[] invalidEmbeddings2 = {0, MAX_EXPLICIT_LEVEL+1, 0, 0, 0}; try { bidi = new Bidi(text, 0, invalidEmbeddings2, 0, 5, Bidi.DIRECTION_LEFT_TO_RIGHT); if (bidi.getLevelAt(1) != 0) { errorHandling("Bidi(char[], ...).getLevelAt(1) should be 0 " + - "when embeddings[1] is 62."); + "when embeddings[1] is MAX_EXPLICIT_LEVEL+1."); } } catch (Exception e) { errorHandling("Bidi(char[], ...) should not throw an exception " + - "even when embeddings includes 62."); + "even when embeddings includes MAX_EXPLICIT_LEVEL+1."); } try { @@ -1595,6 +1597,10 @@ private static final char PDF = '\u202C'; private static final char LRO = '\u202D'; private static final char RLO = '\u202E'; + private static final char LRI = '\u2066'; + private static final char RLI = '\u2067'; + private static final char FSI = '\u2068'; + private static final char PDI = '\u2069'; /* * 0x05D0-0x05EA: [R] Hewbrew letters (Strong) @@ -2002,8 +2008,8 @@ /* For Text #18 */ {" ABC (" + ArabicABC + " " + Arabic123 + ") 123.", - "0000001111222112220", "0000001111222112220", - "0000001111222112220", "1222111111222112221"}, + "0000001111222002220", "0000001111222002220", + "0000001111222002220", "1222111111222112221"}, /* For Text #19 */ {" " + HebrewABC + " (ABC 123) " + NKo123 + ".", @@ -2028,6 +2034,90 @@ PDF, "22222221111111111111110", "22222221111111111111110", "22222221111111111111110", "44444443333333333333331"}, + + /* For Text #23 */ + {" ABC (" + Arabic123 + " " + ArabicABC + ") 123.", + "0000002221111002220", "0000002221111002220", + "0000002221111002220", "1222112221111112221"}, + + /* For Text #24 */ + {" 123 (" + ArabicABC + " " + Arabic123 + ") ABC.", + "1222111111222112221", "1222111111222112221", + "0000001111222000000", "1222111111222112221"}, + + /* For Text #25 */ + {" 123 (" + Arabic123 + " " + ArabicABC + ") 
ABC.", + "1222112221111112221", "1222112221111112221", + "0000002221111000000", "1222112221111112221"}, + + /* For Text #26 */ + {" " + ArabicABC + " (ABC 123) " + Arabic123 + ".", + "1111112222222112221", "1111112222222112221", + "0111000000000002220", "1111112222222112221"}, + + /* For Text #27 */ + {" " + ArabicABC + " (123 ABC) " + Arabic123 + ".", + "1111112221222112221", "1111112221222112221", + "0111002220000002220", "1111112221222112221"}, + + /* For Text #28 */ + {" " + Arabic123 + " (ABC 123) " + ArabicABC + ".", + "0222000000000001110", "0222000000000001110", + "0222000000000001110", "1222112222222111111"}, + + /* For Text #29 */ + {" " + Arabic123 + " (123 ABC) " + ArabicABC + ".", + "0222000000000001110", "0222000000000001110", + "0222000000000001110", "1222112221222111111"}, + + /* For Text #30 */ + {RLI + "ABC " + ArabicABC + " " + ArabicABC + "." + PDI, + "02221111111110", "14443333333331", + "02221111111110", "14443333333331"}, + + /* For Text #31 */ + {"ABC abc \"" + RLI + "IJK " + ArabicABC + " " + ArabicABC + PDI + + ".\" \"" + RLI + ArabicABC + " " + ArabicABC + PDI + ",\" xyz XYZ.", + "0000000000222111111110000001111111000000000000", + "0000000000222111111110000001111111000000000000", + "0000000000222111111110000001111111000000000000", + "2222222222444333333332222223333333222222222221"}, + + /* For Text #32 */ + {ArabicABC + " " + ArabicABC + " '" + LRI + "abc def \"" + RLI + + "xyz " + ArabicABC + " " + ArabicABC + PDI + "\"" + PDI + "'?", + "111111111122222222224443333333322111", + "111111111122222222224443333333322111", + "111111100022222222224443333333322000", + "111111111122222222224443333333322111"}, + + /* For Text #33 */ + {FSI + Arabic123 + " ABC " + ArabicABC + " " + ArabicABC + "." + PDI, + "044422222333333320", "144422222333333321", + "044422222333333320", "144422222333333321"}, + + /* For Text #34 */ + {FSI + "123 ABC " + ArabicABC + " " + ArabicABC + "." 
+ PDI, + "022222222333333320", "122222222333333321", + "022222222333333320", "122222222333333321"}, + + /* For Text #35 */ + {FSI + "123 " + ArabicABC + " ABC " + ArabicABC + "." + PDI, + "022211111222111110", "144433333444333331", + "022211111222111110", "144433333444333331"}, + + /* For Text #36 */ + {FSI + Arabic123 + " " + ArabicABC + " ABC " + ArabicABC + "." + PDI, + "022211111222111110", "144433333444333331", + "022211111222111110", "144433333444333331"}, + + /* For Text #37 */ + {FSI + Arabic123 + " 123." + PDI, + "0444222220", "1444222221", "0444222220", "1444222221"}, + + /* For Text #38 */ + {FSI + "123 " + Arabic123 + "." + PDI, + "0222244420", "1222244421", "0222244420", "1222244421"}, }; /* Golden data for baseIsLeftToRight() results */ @@ -2060,10 +2150,32 @@ {true, true, true, false}, {false, false, true, false}, - /* For Text #20 - $22 */ + /* For Text #20 - $24 */ + {true, true, true, false}, + {true, true, true, false}, {true, true, true, false}, {true, true, true, false}, + {false, false, true, false}, + + /* For Text #25 - $29 */ + {false, false, true, false}, + {false, false, true, false}, + {false, false, true, false}, {true, true, true, false}, + {true, true, true, false}, + + /* For Text #30 - $34 */ + {true, false, true, false}, + {true, true, true, false}, + {false, false, true, false}, + {true, false, true, false}, + {true , false, true, false}, + + /* For Text #35 - $38 */ + {true, false, true, false}, + {true, false, true, false}, + {true, false, true, false}, + {true, false, true, false}, }; /* Golden data for isLeftToRight() & isRightToLeft() results */ @@ -2097,7 +2209,29 @@ {{false, false, false, false}, {false, false, false, false}}, {{false, false, false, false}, {false, false, false, false}}, - /* For Text #20 - $22 */ + /* For Text #20 - $24 */ + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, 
false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + + /* For Text #25 - $29 */ + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + + /* For Text #30 - $34 */ + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + {{false, false, false, false}, {false, false, false, false}}, + + /* For Text #35 - $38 */ + {{false, false, false, false}, {false, false, false, false}}, {{false, false, false, false}, {false, false, false, false}}, {{false, false, false, false}, {false, false, false, false}}, {{false, false, false, false}, {false, false, false, false}}, @@ -2113,8 +2247,13 @@ true, true, true, true, true, true, true, true, true, true, - /* For Text #20 - $22 */ - true, true, true, + /* For Text #20 - $29 */ + true, true, true, true, true, + true, true, true, true, true, + + /* For Text #30 - $38 */ + true, true, true, true, true, + true, true, true, true, }; /* --------------------------------------------------------------------- */ --- old/jdk/test/sun/net/idn/NFS4StringPrep.java 2015-07-13 16:12:06.000000000 +0900 +++ new/jdk/test/sun/net/idn/NFS4StringPrep.java 2015-07-13 16:12:06.000000000 +0900 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* * This code is free software; you can redistribute it and/or modify it @@ -32,7 +32,6 @@ import java.io.UnsupportedEncodingException; import java.text.ParseException; -import sun.text.normalizer.ICUData; import sun.net.idn.StringPrep; import sun.text.normalizer.UCharacterIterator; --- old/jdk/src/java.base/share/classes/sun/text/normalizer/ICUData.java 2015-07-13 16:12:07.000000000 +0900 +++ /dev/null 2015-07-13 16:12:07.000000000 +0900 @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 
1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.InputStream; -import java.net.URL; -import java.security.AccessController; -import java.security.PrivilegedAction; -import java.util.MissingResourceException; - -/** - * Provides access to ICU data files as InputStreams. Implements security checking. - */ -public final class ICUData { - - private static InputStream getStream(final Class<ICUData> root, final String resourceName, boolean required) { - InputStream i = null; - - if (System.getSecurityManager() != null) { - i = AccessController.doPrivileged(new PrivilegedAction<InputStream>() { - public InputStream run() { - return root.getResourceAsStream(resourceName); - } - }); - } else { - i = root.getResourceAsStream(resourceName); - } - - if (i == null && required) { - throw new MissingResourceException("could not locate data", root.getPackage().getName(), resourceName); - } - return i; - } - - /* - * Convenience override that calls getStream(ICUData.class, resourceName, false); - */ - public static InputStream getStream(String resourceName) { - return getStream(ICUData.class, resourceName, false); - } - - /* - * Convenience method that calls getStream(ICUData.class, resourceName, true). 
- */ - public static InputStream getRequiredStream(String resourceName) { - return getStream(ICUData.class, resourceName, true); - } -} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/IntTrie.java 2015-07-13 16:12:08.000000000 +0900 +++ /dev/null 2015-07-13 16:12:08.000000000 +0900 @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. 
This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.InputStream; -import java.io.DataInputStream; -import java.io.IOException; -import java.util.Arrays; - -/** - * Trie implementation which stores data in int, 32 bits. - * @author synwee - * @see com.ibm.icu.impl.Trie - * @since release 2.1, Jan 01 2002 - */ -public class IntTrie extends Trie -{ - // public constructors --------------------------------------------- - - /** - * <p>Creates a new Trie with the settings for the trie data.</p> - * <p>Unserialize the 32-bit-aligned input stream and use the data for the - * trie.</p> - * @param inputStream file input stream to a ICU data file, containing - * the trie - * @param datamanipulate object which provides methods to parse the char - * data - * @throws IOException thrown when data reading fails - * @draft 2.1 - */ - public IntTrie(InputStream inputStream, DataManipulate datamanipulate) - throws IOException - { - super(inputStream, datamanipulate); - if (!isIntTrie()) { - throw new IllegalArgumentException( - "Data given does not belong to a int trie."); - } - } - - // public methods -------------------------------------------------- - - /** - * Gets the value associated with the codepoint. - * If no value is associated with the codepoint, a default value will be - * returned. - * @param ch codepoint - * @return offset to data - * @draft 2.1 - */ - public final int getCodePointValue(int ch) - { - int offset = getCodePointOffset(ch); - return (offset >= 0) ? m_data_[offset] : m_initialValue_; - } - - /** - * Gets the value to the data which this lead surrogate character points - * to. - * Returned data may contain folding offset information for the next - * trailing surrogate character. - * This method does not guarantee correct results for trail surrogates. 
- * @param ch lead surrogate character - * @return data value - * @draft 2.1 - */ - public final int getLeadValue(char ch) - { - return m_data_[getLeadOffset(ch)]; - } - - /** - * Get a value from a folding offset (from the value of a lead surrogate) - * and a trail surrogate. - * @param leadvalue the value of a lead surrogate that contains the - * folding offset - * @param trail surrogate - * @return trie data value associated with the trail character - * @draft 2.1 - */ - public final int getTrailValue(int leadvalue, char trail) - { - if (m_dataManipulate_ == null) { - throw new NullPointerException( - "The field DataManipulate in this Trie is null"); - } - int offset = m_dataManipulate_.getFoldingOffset(leadvalue); - if (offset > 0) { - return m_data_[getRawOffset(offset, - (char)(trail & SURROGATE_MASK_))]; - } - return m_initialValue_; - } - - // protected methods ----------------------------------------------- - - /** - * <p>Parses the input stream and stores its trie content into a index and - * data array</p> - * @param inputStream data input stream containing trie data - * @exception IOException thrown when data reading fails - */ - protected final void unserialize(InputStream inputStream) - throws IOException - { - super.unserialize(inputStream); - // one used for initial value - m_data_ = new int[m_dataLength_]; - DataInputStream input = new DataInputStream(inputStream); - for (int i = 0; i < m_dataLength_; i ++) { - m_data_[i] = input.readInt(); - } - m_initialValue_ = m_data_[0]; - } - - /** - * Gets the offset to the data which the surrogate pair points to. 
- * @param lead lead surrogate - * @param trail trailing surrogate - * @return offset to data - * @draft 2.1 - */ - protected final int getSurrogateOffset(char lead, char trail) - { - if (m_dataManipulate_ == null) { - throw new NullPointerException( - "The field DataManipulate in this Trie is null"); - } - // get fold position for the next trail surrogate - int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead)); - - // get the real data from the folded lead/trail units - if (offset > 0) { - return getRawOffset(offset, (char)(trail & SURROGATE_MASK_)); - } - - // return -1 if there is an error, in this case we return the default - // value: m_initialValue_ - return -1; - } - - /** - * Gets the value at the argument index. - * For use internally in TrieIterator - * @param index value at index will be retrieved - * @return 32 bit value - * @see com.ibm.icu.impl.TrieIterator - * @draft 2.1 - */ - protected final int getValue(int index) - { - return m_data_[index]; - } - - /** - * Gets the default initial value - * @return 32 bit value - * @draft 2.1 - */ - protected final int getInitialValue() - { - return m_initialValue_; - } - - // package private methods ----------------------------------------- - - /** - * Internal constructor for builder use - * @param index the index array to be slotted into this trie - * @param data the data array to be slotted into this trie - * @param initialvalue the initial value for this trie - * @param options trie options to use - * @param datamanipulate folding implementation - */ - IntTrie(char index[], int data[], int initialvalue, int options, - DataManipulate datamanipulate) - { - super(index, options, datamanipulate); - m_data_ = data; - m_dataLength_ = m_data_.length; - m_initialValue_ = initialvalue; - } - - // private data members -------------------------------------------- - - /** - * Default value - */ - private int m_initialValue_; - /** - * Array of char data - */ - private int m_data_[]; -} --- 
old/jdk/src/java.base/share/classes/sun/text/normalizer/NormalizerDataReader.java 2015-07-13 16:12:08.000000000 +0900 +++ /dev/null 2015-07-13 16:12:08.000000000 +0900 @@ -1,389 +0,0 @@ -/* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. 
* - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.DataInputStream; -import java.io.InputStream; -import java.io.IOException; - -/** - * @author Ram Viswanadha - */ - - /* - * Description of the format of unorm.icu version 2.1. - * - * Main change from version 1 to version 2: - * Use of new, common Trie instead of normalization-specific tries. - * Change to version 2.1: add third/auxiliary trie with associated data. - * - * For more details of how to use the data structures see the code - * in unorm.cpp (runtime normalization code) and - * in gennorm.c and gennorm/store.c (build-time data generation). - * - * For the serialized format of Trie see Trie.c/TrieHeader. - * - * - Overall partition - * - * unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c. - * After that there are the following structures: - * - * char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file - * - * Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE] - * - * char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT] - * extraData[0] contains the number of units for - * FC_NFKC_Closure (formatVersion>=2.1) - * - * char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT] - * combiningTableTop may include one 16-bit padding unit - * to make sure that fcdTrie is 32-bit-aligned - * - * Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE] - * - * Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE] - * - * - * The indexes array contains lengths and sizes of the following arrays and structures - * as well as the following values: - * indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop - * -- one more than the highest combining index computed for forward-only-combining characters - * indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop - * -- number of combining indexes computed for both-ways-combining 
characters - * indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop - * -- number of combining indexes computed for backward-only-combining characters - * - * indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD }) - * -- first code point with a quick check NF* value of NO/MAYBE - * - * - * - Tries - * - * The main structures are two Trie tables ("compact arrays"), - * each with one index array and one data array. - * See Trie.h and Trie.c. - * - * - * - Tries in unorm.icu - * - * The first trie (normTrie above) - * provides data for the NF* quick checks and normalization. - * The second trie (fcdTrie above) provides data just for FCD checks. - * - * - * - norm32 data words from the first trie - * - * The norm32Table contains one 32-bit word "norm32" per code point. - * It contains the following bit fields: - * 31..16 extra data index, EXTRA_SHIFT is used to shift this field down - * if this index is <EXTRA_INDEX_TOP then it is an index into - * extraData[] where variable-length normalization data for this - * code point is found - * if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP - * then this is a norm32 for a leading surrogate, and the index - * value is used together with the following trailing surrogate - * code unit in the second trie access - * if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP - * then this is a norm32 for a "special" character, - * i.e., the character is a Hangul syllable or a Jamo - * see EXTRA_HANGUL etc. - * generally, instead of extracting this index from the norm32 and - * comparing it with the above constants, - * the normalization code compares the entire norm32 value - * with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc. 
- * - * 15..8 combining class (cc) according to UnicodeData.txt - * - * 7..6 COMBINES_ANY flags, used in composition to see if a character - * combines with any following or preceding character(s) - * at all - * 7 COMBINES_BACK - * 6 COMBINES_FWD - * - * 5..0 quick check flags, set for "no" or "maybe", with separate flags for - * each normalization form - * the higher bits are "maybe" flags; for NF*D there are no such flags - * the lower bits are "no" flags for all forms, in the same order - * as the "maybe" flags, - * which is (MSB to LSB): NFKD NFD NFKC NFC - * 5..4 QC_ANY_MAYBE - * 3..0 QC_ANY_NO - * see further related constants - * - * - * - Extra data per code point - * - * "Extra data" is referenced by the index in norm32. - * It is variable-length data. It is only present, and only those parts - * of it are, as needed for a given character. - * The norm32 extra data index is added to the beginning of extraData[] - * to get to a vector of 16-bit words with data at the following offsets: - * - * [-1] Combining index for composition. - * Stored only if norm32&COMBINES_ANY . - * [0] Lengths of the canonical and compatibility decomposition strings. - * Stored only if there are decompositions, i.e., - * if norm32&(QC_NFD|QC_NFKD) - * High byte: length of NFKD, or 0 if none - * Low byte: length of NFD, or 0 if none - * Each length byte also has another flag: - * Bit 7 of a length byte is set if there are non-zero - * combining classes (cc's) associated with the respective - * decomposition. If this flag is set, then the decomposition - * is preceded by a 16-bit word that contains the - * leading and trailing cc's. - * Bits 6..0 of a length byte are the length of the - * decomposition string, not counting the cc word. - * [1..n] NFD - * [n+1..] NFKD - * - * Each of the two decompositions consists of up to two parts: - * - The 16-bit words with the leading and trailing cc's. - * This is only stored if bit 7 of the corresponding length byte - * is set. 
In this case, at least one of the cc's is not zero. - * High byte: leading cc==cc of the first code point in the decomposition string - * Low byte: trailing cc==cc of the last code point in the decomposition string - * - The decomposition string in UTF-16, with length code units. - * - * - * - Combining indexes and combiningTable[] - * - * Combining indexes are stored at the [-1] offset of the extra data - * if the character combines forward or backward with any other characters. - * They are used for (re)composition in NF*C. - * Values of combining indexes are arranged according to whether a character - * combines forward, backward, or both ways: - * forward-only < both ways < backward-only - * - * The index values for forward-only and both-ways combining characters - * are indexes into the combiningTable[]. - * The index values for backward-only combining characters are simply - * incremented from the preceding index values to be unique. - * - * In the combiningTable[], a variable-length list - * of variable-length (back-index, code point) pair entries is stored - * for each forward-combining character. - * - * These back-indexes are the combining indexes of both-ways or backward-only - * combining characters that the forward-combining character combines with. - * - * Each list is sorted in ascending order of back-indexes. - * Each list is terminated with the last back-index having bit 15 set. - * - * Each pair (back-index, code point) takes up either 2 or 3 - * 16-bit words. - * The first word of a list entry is the back-index, with its bit 15 set if - * this is the last pair in the list. 
- * - * The second word contains flags in bits 15..13 that determine - * if there is a third word and how the combined character is encoded: - * 15 set if there is a third word in this list entry - * 14 set if the result is a supplementary character - * 13 set if the result itself combines forward - * - * According to these bits 15..14 of the second word, - * the result character is encoded as follows: - * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of - * the second word. - * 10 The result is 0x2000..0xffff and stored in the third word. - * Bits 12..0 of the second word are not used. - * 11 The result is a supplementary character. - * Bits 9..0 of the leading surrogate are in bits 9..0 of - * the second word. - * Add 0xd800 to these bits to get the complete surrogate. - * Bits 12..10 of the second word are not used. - * The trailing surrogate is stored in the third word. - * - * - * - FCD trie - * - * The FCD trie is very simple. - * It is a folded trie with 16-bit data words. - * In each word, the high byte contains the leading cc of the character, - * and the low byte contains the trailing cc of the character. - * These cc's are the cc's of the first and last code points in the - * canonical decomposition of the character. - * - * Since all 16 bits are used for cc's, lead surrogates must be tested - * by checking the code unit instead of the trie data. - * This is done only if the 16-bit data word is not zero. - * If the code unit is a leading surrogate and the data word is not zero, - * then instead of cc's it contains the offset for the second trie lookup. - * - * - * - Auxiliary trie and data - * - * - * The auxiliary 16-bit trie contains data for additional properties. - * Bits - * 15..13 reserved - * 12 not NFC_Skippable (f) (formatVersion>=2.2) - * 11 flag: not a safe starter for canonical closure - * 10 composition exclusion - * 9.. 
0 index into extraData[] to FC_NFKC_Closure string - * (not for lead surrogate), - * or lead surrogate offset (for lead surrogate, if 9..0 not zero) - * - * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable: - * (used in NormalizerTransliterator) - * - * A skippable character is - * a) unassigned, or ALL of the following: - * b) of combining class 0. - * c) not decomposed by this normalization form. - * AND if NFC or NFKC, - * d) can never compose with a previous character. - * e) can never compose with a following character. - * f) can never change if another character is added. - * Example: a-breve might satisfy all but f, but if you - * add an ogonek it changes to a-ogonek + breve - * - * a)..e) must be tested from norm32. - * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built - * into the auxiliary trie. - * The same bit is used for NFC and NFKC; (c) differs for them. - * As usual, we build the "not skippable" flags so that unassigned - * code points get a 0 bit. - * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well. - * Test Hangul LV syllables entirely in code. - * - * - * - FC_NFKC_Closure strings in extraData[] - * - * Strings are either stored as a single code unit or as the length - * followed by that many units. 
- * - */ -final class NormalizerDataReader implements ICUBinary.Authenticate { - - /** - * <p>Protected constructor.</p> - * @param inputStream ICU uprop.dat file input stream - * @exception IOException throw if data file fails authentication - * @draft 2.1 - */ - protected NormalizerDataReader(InputStream inputStream) - throws IOException{ - - unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this); - dataInputStream = new DataInputStream(inputStream); - } - - // protected methods ------------------------------------------------- - - protected int[] readIndexes(int length)throws IOException{ - int[] indexes = new int[length]; - //Read the indexes - for (int i = 0; i <length ; i++) { - indexes[i] = dataInputStream.readInt(); - } - return indexes; - } - /** - * <p>Reads unorm.icu, parse it into blocks of data to be stored in - * NormalizerImpl.</P - * @param normBytes - * @param fcdBytes - * @param auxBytes - * @param extraData - * @param combiningTable - * @exception thrown when data reading fails - * @draft 2.1 - */ - protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes, - char[] extraData, char[] combiningTable) - throws IOException{ - - //Read the bytes that make up the normTrie - dataInputStream.readFully(normBytes); - - //normTrieStream= new ByteArrayInputStream(normBytes); - - //Read the extra data - for(int i=0;i<extraData.length;i++){ - extraData[i]=dataInputStream.readChar(); - } - - //Read the combining class table - for(int i=0; i<combiningTable.length; i++){ - combiningTable[i]=dataInputStream.readChar(); - } - - //Read the fcdTrie - dataInputStream.readFully(fcdBytes); - - - //Read the AuxTrie - dataInputStream.readFully(auxBytes); - } - - public byte[] getDataFormatVersion(){ - return DATA_FORMAT_VERSION; - } - - public boolean isDataVersionAcceptable(byte version[]) - { - return version[0] == DATA_FORMAT_VERSION[0] - && version[2] == DATA_FORMAT_VERSION[2] - && version[3] == DATA_FORMAT_VERSION[3]; - } - - public 
byte[] getUnicodeVersion(){ - return unicodeVersion; - } - // private data members ------------------------------------------------- - - - /** - * ICU data file input stream - */ - private DataInputStream dataInputStream; - - private byte[] unicodeVersion; - - /** - * File format version that this class understands. - * No guarantees are made if a older version is used - * see store.c of gennorm for more information and values - */ - private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F, - (byte)0x72, (byte)0x6D}; - private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2, - (byte)0x5, (byte)0x2}; - -} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/RangeValueIterator.java 2015-07-13 16:12:08.000000000 +0900 +++ /dev/null 2015-07-13 16:12:08.000000000 +0900 @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -/** - * <p>Interface for enabling iteration over sets of - * {@code <int index, int value>}, - * where index is the sorted integer index in ascending order and value, its - * associated integer value. - * <p>The result for each iteration is the consecutive range of - * {@code <int index, int value>} with the same value. Result is represented by - * {@code <start, limit, value>} where - * <ul> - * <li> start is the starting integer of the result range - * <li> limit is 1 after the maximum integer that follows start, such that - * all integers between start and (limit - 1), inclusive, have the same - * associated integer value. - * <li> value is the integer value that all integers from start to (limit - 1) - * share in common. - * </ul> - * <p> - * Hence value(start) = value(start + 1) = .... = value(start + n) = .... = - * value(limit - 1). However value(start -1) != value(start) and - * value(limit) != value(start). - * - * <p>Most implementations will be created by factory methods, such as the - * character type iterator in UCharacter.getTypeIterator. See example below. 
- * - * Example of use:<br> - * <pre> - * RangeValueIterator iterator = UCharacter.getTypeIterator(); - * RangeValueIterator.Element result = new RangeValueIterator.Element(); - * while (iterator.next(result)) { - * System.out.println("Codepoint \\u" + - * Integer.toHexString(result.start) + - * " to codepoint \\u" + - * Integer.toHexString(result.limit - 1) + - * " has the character type " + result.value); - * } - * </pre> - * @author synwee - * @stable ICU 2.6 - */ -public interface RangeValueIterator -{ - // public inner class --------------------------------------------- - - /** - * Return result wrapper for com.ibm.icu.util.RangeValueIterator. - * Stores the start and limit of the continous result range and the - * common value all integers between [start, limit - 1] has. - * @stable ICU 2.6 - */ - public class Element - { - // public data member --------------------------------------------- - - /** - * Starting integer of the continuous result range that has the same - * value - * @stable ICU 2.6 - */ - public int start; - /** - * (End + 1) integer of continuous result range that has the same - * value - * @stable ICU 2.6 - */ - public int limit; - /** - * Gets the common value of the continous result range - * @stable ICU 2.6 - */ - public int value; - - // public constructor -------------------------------------------- - - /** - * Empty default constructor to make javadoc happy - * @stable ICU 2.4 - */ - public Element() - { - } - } - - // public methods ------------------------------------------------- - - /** - * <p>Gets the next maximal result range with a common value and returns - * true if we are not at the end of the iteration, false otherwise.</p> - * <p>If the return boolean is a false, the contents of elements will not - * be updated.</p> - * @param element for storing the result range and value - * @return true if we are not at the end of the iteration, false otherwise. 
- * @see Element - * @stable ICU 2.6 - */ - public boolean next(Element element); - - /** - * Resets the iterator to the beginning of the iteration. - * @stable ICU 2.6 - */ - public void reset(); -} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/RuleCharacterIterator.java 2015-07-13 16:12:09.000000000 +0900 +++ /dev/null 2015-07-13 16:12:09.000000000 +0900 @@ -1,371 +0,0 @@ -/* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. 
This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -/* - ********************************************************************** - * Author: Alan Liu - * Created: September 23 2003 - * Since: ICU 2.8 - ********************************************************************** - */ - -package sun.text.normalizer; - -import java.text.ParsePosition; - -/** - * An iterator that returns 32-bit code points. This class is deliberately - * <em>not</em> related to any of the JDK or ICU4J character iterator classes - * in order to minimize complexity. - * @author Alan Liu - * @since ICU 2.8 - */ -@SuppressWarnings("deprecation") -public class RuleCharacterIterator { - - // TODO: Ideas for later. (Do not implement if not needed, lest the - // code coverage numbers go down due to unused methods.) - // 1. Add a copy constructor, equals() method, clone() method. - // 2. Rather than return DONE, throw an exception if the end - // is reached -- this is an alternate usage model, probably not useful. - // 3. Return isEscaped from next(). If this happens, - // don't keep an isEscaped member variable. - - /** - * Text being iterated. - */ - private String text; - - /** - * Position of iterator. - */ - private ParsePosition pos; - - /** - * Symbol table used to parse and dereference variables. May be null. - */ - private SymbolTable sym; - - /** - * Current variable expansion, or null if none. - */ - private char[] buf; - - /** - * Position within buf[]. Meaningless if buf == null. - */ - private int bufPos; - - /** - * Flag indicating whether the last character was parsed from an escape. - */ - private boolean isEscaped; - - /** - * Value returned when there are no more characters to iterate. - */ - public static final int DONE = -1; - - /** - * Bitmask option to enable parsing of variable names. 
- * If {@code (options & PARSE_VARIABLES) != 0}, - * then an embedded variable will be expanded to - * its value. Variables are parsed using the SymbolTable API. - */ - public static final int PARSE_VARIABLES = 1; - - /** - * Bitmask option to enable parsing of escape sequences. - * If {@code (options & PARSE_ESCAPES) != 0}, - * then an embedded escape sequence will be expanded - * to its value. Escapes are parsed using Utility.unescapeAt(). - */ - public static final int PARSE_ESCAPES = 2; - - /** - * Bitmask option to enable skipping of whitespace. - * If {@code (options & SKIP_WHITESPACE) != 0}, - * then whitespace characters will be silently - * skipped, as if they were not present in the input. Whitespace - * characters are defined by UCharacterProperty.isRuleWhiteSpace(). - */ - public static final int SKIP_WHITESPACE = 4; - - /** - * Constructs an iterator over the given text, starting at the given - * position. - * @param text the text to be iterated - * @param sym the symbol table, or null if there is none. If sym is null, - * then variables will not be deferenced, even if the PARSE_VARIABLES - * option is set. - * @param pos upon input, the index of the next character to return. If a - * variable has been dereferenced, then pos will <em>not</em> increment as - * characters of the variable value are iterated. - */ - public RuleCharacterIterator(String text, SymbolTable sym, - ParsePosition pos) { - if (text == null || pos.getIndex() > text.length()) { - throw new IllegalArgumentException(); - } - this.text = text; - this.sym = sym; - this.pos = pos; - buf = null; - } - - /** - * Returns true if this iterator has no more characters to return. - */ - public boolean atEnd() { - return buf == null && pos.getIndex() == text.length(); - } - - /** - * Returns the next character using the given options, or DONE if there - * are no more characters, and advance the position to the next - * character. 
- * @param options one or more of the following options, bitwise-OR-ed - * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE. - * @return the current 32-bit code point, or DONE - */ - public int next(int options) { - int c = DONE; - isEscaped = false; - - for (;;) { - c = _current(); - _advance(UTF16.getCharCount(c)); - - if (c == SymbolTable.SYMBOL_REF && buf == null && - (options & PARSE_VARIABLES) != 0 && sym != null) { - String name = sym.parseReference(text, pos, text.length()); - // If name == null there was an isolated SYMBOL_REF; - // return it. Caller must be prepared for this. - if (name == null) { - break; - } - bufPos = 0; - buf = sym.lookup(name); - if (buf == null) { - throw new IllegalArgumentException( - "Undefined variable: " + name); - } - // Handle empty variable value - if (buf.length == 0) { - buf = null; - } - continue; - } - - if ((options & SKIP_WHITESPACE) != 0 && - UCharacterProperty.isRuleWhiteSpace(c)) { - continue; - } - - if (c == '\\' && (options & PARSE_ESCAPES) != 0) { - int offset[] = new int[] { 0 }; - c = Utility.unescapeAt(lookahead(), offset); - jumpahead(offset[0]); - isEscaped = true; - if (c < 0) { - throw new IllegalArgumentException("Invalid escape"); - } - } - - break; - } - - return c; - } - - /** - * Returns true if the last character returned by next() was - * escaped. This will only be the case if the option passed in to - * next() included PARSE_ESCAPED and the next character was an - * escape sequence. - */ - public boolean isEscaped() { - return isEscaped; - } - - /** - * Returns true if this iterator is currently within a variable expansion. - */ - public boolean inVariable() { - return buf != null; - } - - /** - * Returns an object which, when later passed to setPos(), will - * restore this iterator's position. 
Usage idiom: - * - * RuleCharacterIterator iterator = ...; - * Object pos = iterator.getPos(null); // allocate position object - * for (;;) { - * pos = iterator.getPos(pos); // reuse position object - * int c = iterator.next(...); - * ... - * } - * iterator.setPos(pos); - * - * @param p a position object previously returned by getPos(), - * or null. If not null, it will be updated and returned. If - * null, a new position object will be allocated and returned. - * @return a position object which may be passed to setPos(), - * either `p,' or if `p' == null, a newly-allocated object - */ - public Object getPos(Object p) { - if (p == null) { - return new Object[] {buf, new int[] {pos.getIndex(), bufPos}}; - } - Object[] a = (Object[]) p; - a[0] = buf; - int[] v = (int[]) a[1]; - v[0] = pos.getIndex(); - v[1] = bufPos; - return p; - } - - /** - * Restores this iterator to the position it had when getPos() - * returned the given object. - * @param p a position object previously returned by getPos() - */ - public void setPos(Object p) { - Object[] a = (Object[]) p; - buf = (char[]) a[0]; - int[] v = (int[]) a[1]; - pos.setIndex(v[0]); - bufPos = v[1]; - } - - /** - * Skips ahead past any ignored characters, as indicated by the given - * options. This is useful in conjunction with the lookahead() method. - * - * Currently, this only has an effect for SKIP_WHITESPACE. - * @param options one or more of the following options, bitwise-OR-ed - * together: PARSE_VARIABLES, PARSE_ESCAPES, SKIP_WHITESPACE. - */ - public void skipIgnored(int options) { - if ((options & SKIP_WHITESPACE) != 0) { - for (;;) { - int a = _current(); - if (!UCharacterProperty.isRuleWhiteSpace(a)) break; - _advance(UTF16.getCharCount(a)); - } - } - } - - /** - * Returns a string containing the remainder of the characters to be - * returned by this iterator, without any option processing. 
If the - * iterator is currently within a variable expansion, this will only - * extend to the end of the variable expansion. This method is provided - * so that iterators may interoperate with string-based APIs. The typical - * sequence of calls is to call skipIgnored(), then call lookahead(), then - * parse the string returned by lookahead(), then call jumpahead() to - * resynchronize the iterator. - * @return a string containing the characters to be returned by future - * calls to next() - */ - public String lookahead() { - if (buf != null) { - return new String(buf, bufPos, buf.length - bufPos); - } else { - return text.substring(pos.getIndex()); - } - } - - /** - * Advances the position by the given number of 16-bit code units. - * This is useful in conjunction with the lookahead() method. - * @param count the number of 16-bit code units to jump over - */ - public void jumpahead(int count) { - if (count < 0) { - throw new IllegalArgumentException(); - } - if (buf != null) { - bufPos += count; - if (bufPos > buf.length) { - throw new IllegalArgumentException(); - } - if (bufPos == buf.length) { - buf = null; - } - } else { - int i = pos.getIndex() + count; - pos.setIndex(i); - if (i > text.length()) { - throw new IllegalArgumentException(); - } - } - } - - /** - * Returns the current 32-bit code point without parsing escapes, parsing - * variables, or skipping whitespace. - * @return the current 32-bit code point - */ - private int _current() { - if (buf != null) { - return UTF16.charAt(buf, 0, buf.length, bufPos); - } else { - int i = pos.getIndex(); - return (i < text.length()) ? UTF16.charAt(text, i) : DONE; - } - } - - /** - * Advances the position by the given amount. 
- * @param count the number of 16-bit code units to advance past - */ - private void _advance(int count) { - if (buf != null) { - bufPos += count; - if (bufPos == buf.length) { - buf = null; - } - } else { - pos.setIndex(pos.getIndex() + count); - if (pos.getIndex() > text.length()) { - pos.setIndex(text.length()); - } - } - } -} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/SymbolTable.java 2015-07-13 16:12:09.000000000 +0900 +++ /dev/null 2015-07-13 16:12:09.000000000 +0900 @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 
1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.text.ParsePosition; - -/** - * An interface that defines both lookup protocol and parsing of - * symbolic names. - * - * <p>A symbol table maintains two kinds of mappings. The first is - * between symbolic names and their values. For example, if the - * variable with the name "start" is set to the value "alpha" - * (perhaps, though not necessarily, through an expression such as - * "$start=alpha"), then the call lookup("start") will return the - * char[] array ['a', 'l', 'p', 'h', 'a']. - * - * <p>The second kind of mapping is between character values and - * UnicodeMatcher objects. This is used by RuleBasedTransliterator, - * which uses characters in the private use area to represent objects - * such as UnicodeSets. If U+E015 is mapped to the UnicodeSet [a-z], - * then lookupMatcher(0xE015) will return the UnicodeSet [a-z]. - * - * <p>Finally, a symbol table defines parsing behavior for symbolic - * names. All symbolic names start with the SYMBOL_REF character. - * When a parser encounters this character, it calls parseReference() - * with the position immediately following the SYMBOL_REF. The symbol - * table parses the name, if there is one, and returns it. - * - * @draft ICU 2.8 - * @deprecated This is a draft API and might change in a future release of ICU. - */ -@Deprecated -public interface SymbolTable { - - /** - * The character preceding a symbol reference name. 
- * @draft ICU 2.8 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - static final char SYMBOL_REF = '$'; - - /** - * Lookup the characters associated with this string and return it. - * Return {@code null} if no such name exists. The resultant - * array may have length zero. - * @param s the symbolic name to lookup - * @return a char array containing the name's value, or null if - * there is no mapping for s. - * @draft ICU 2.8 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - char[] lookup(String s); - - /** - * Lookup the UnicodeMatcher associated with the given character, and - * return it. Return {@code null} if not found. - * @param ch a 32-bit code point from 0 to 0x10FFFF inclusive. - * @return the UnicodeMatcher object represented by the given - * character, or null if there is no mapping for ch. - * @draft ICU 2.8 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - UnicodeMatcher lookupMatcher(int ch); - - /** - * Parse a symbol reference name from the given string, starting - * at the given position. If no valid symbol reference name is - * found, return null and leave pos unchanged. That is, if the - * character at pos cannot start a name, or if pos is at or after - * text.length(), then return null. This indicates an isolated - * SYMBOL_REF character. - * @param text the text to parse for the name - * @param pos on entry, the index of the first character to parse. - * This is the character following the SYMBOL_REF character. On - * exit, the index after the last parsed character. If the parse - * failed, pos is unchanged on exit. - * @param limit the index after the last character to be parsed. - * @return the parsed name, or null if there is no valid symbolic - * name at the given position. - * @draft ICU 2.8 - * @deprecated This is a draft API and might change in a future release of ICU. 
- */ - @Deprecated - String parseReference(String text, ParsePosition pos, int limit); -} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/TrieIterator.java 2015-07-13 16:12:10.000000000 +0900 +++ /dev/null 2015-07-13 16:12:10.000000000 +0900 @@ -1,547 +0,0 @@ -/* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. 
This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -/** - * Class enabling iteration of the values in a Trie. - * <p>Result of each iteration contains the interval of codepoints that have - * the same value type and the value type itself. - * <p>The comparison of each codepoint value is done via extract(), which the - * default implementation is to return the value as it is. - * <p>Method extract() can be overwritten to perform manipulations on - * codepoint values in order to perform specialized comparison. - * <p>TrieIterator is designed to be a generic iterator for the CharTrie - * and the IntTrie, hence to accommodate both types of data, the return - * result will be in terms of int (32 bit) values. - * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use. - * <p>Notes for porting utrie_enum from icu4c to icu4j:<br> - * Internally, icu4c's utrie_enum performs all iterations in its body. In Java - * sense, the caller will have to pass a object with a callback function - * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, - * uint32_t value) into utrie_enum. utrie_enum will then find ranges of - * codepoints with the same value as determined by - * UTrieEnumValue(const void *context, uint32_t value). for each range, - * utrie_enum calls the callback function to perform a task. In this way, - * icu4c performs the iteration within utrie_enum. - * To follow the JDK model, icu4j is slightly different from icu4c. - * Instead of requesting the caller to implement an object for a callback. - * The caller will have to implement a subclass of TrieIterator, fleshing out - * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j, - * the caller will have to code his own iteration and flesh out the task - * (equivalent to UTrieEnumRange) to be performed in the iteration loop. 
- * - * <p>There are basically 3 usage scenarios for porting: - * <p>1) UTrieEnumValue is the only implemented callback then just implement a - * subclass of TrieIterator and override the extract(int) method. The - * extract(int) method is analogus to UTrieEnumValue callback. - * - * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement - * a subclass of TrieIterator, override the extract method and iterate, e.g.<br> - * {@code utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange, - * set);}<br> - * In Java:<br> - * <pre> - * class TrieIteratorImpl extends TrieIterator{ - * public TrieIteratorImpl(Trie data){ - * super(data); - * } - * public int extract(int value){ - * // port the implementation of _enumPropertyStartsValue here - * } - * } - * .... - * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie); - * while(fcdIter.next(result)) { - * // port the implementation of _enumPropertyStartsRange - * } - * </pre> - * - * <p>3) UTrieEnumRange is the only implemented callback then just implement - * the while loop, when utrie_enum is called - * <pre>{@code - * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); - * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie); - * while(fcdIter.next(result)){ - * set.add(result.start); - * } - * }</pre> - * - * @author synwee - * @see com.ibm.icu.impl.Trie - * @see com.ibm.icu.lang.UCharacterTypeIterator - * @since release 2.1, Jan 17 2002 - */ -public class TrieIterator implements RangeValueIterator -{ - - // public constructor --------------------------------------------- - - /** - * TrieEnumeration constructor - * @param trie to be used - * @exception IllegalArgumentException throw when argument is null. 
- */ - public TrieIterator(Trie trie) - { - if (trie == null) { - throw new IllegalArgumentException( - "Argument trie cannot be null"); - } - m_trie_ = trie; - // synwee: check that extract belongs to the child class - m_initialValue_ = extract(m_trie_.getInitialValue()); - reset(); - } - - // public methods ------------------------------------------------- - - /** - * <p>Returns true if we are not at the end of the iteration, false - * otherwise.</p> - * <p>The next set of codepoints with the same value type will be - * calculated during this call and returned in the arguement element.</p> - * @param element return result - * @return true if we are not at the end of the iteration, false otherwise. - * @exception NoSuchElementException - if no more elements exist. - * @see com.ibm.icu.util.RangeValueIterator.Element - */ - public final boolean next(Element element) - { - if (m_nextCodepoint_ > UCharacter.MAX_VALUE) { - return false; - } - if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE && - calculateNextBMPElement(element)) { - return true; - } - calculateNextSupplementaryElement(element); - return true; - } - - /** - * Resets the iterator to the beginning of the iteration - */ - public final void reset() - { - m_currentCodepoint_ = 0; - m_nextCodepoint_ = 0; - m_nextIndex_ = 0; - m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_; - if (m_nextBlock_ == 0) { - m_nextValue_ = m_initialValue_; - } - else { - m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_)); - } - m_nextBlockIndex_ = 0; - m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_; - } - - // protected methods ---------------------------------------------- - - /** - * Called by next() to extracts a 32 bit value from a trie value - * used for comparison. - * This method is to be overwritten if special manipulation is to be done - * to retrieve a relevant comparison. - * The default function is to return the value as it is. 
- * @param value a value from the trie - * @return extracted value - */ - protected int extract(int value) - { - return value; - } - - // private methods ------------------------------------------------ - - /** - * Set the result values - * @param element return result object - * @param start codepoint of range - * @param limit (end + 1) codepoint of range - * @param value common value of range - */ - private final void setResult(Element element, int start, int limit, - int value) - { - element.start = start; - element.limit = limit; - element.value = value; - } - - /** - * Finding the next element. - * This method is called just before returning the result of - * next(). - * We always store the next element before it is requested. - * In the case that we have to continue calculations into the - * supplementary planes, a false will be returned. - * @param element return result object - * @return true if the next range is found, false if we have to proceed to - * the supplementary range. 
- */ - private final boolean calculateNextBMPElement(Element element) - { - int currentBlock = m_nextBlock_; - int currentValue = m_nextValue_; - m_currentCodepoint_ = m_nextCodepoint_; - m_nextCodepoint_ ++; - m_nextBlockIndex_ ++; - if (!checkBlockDetail(currentValue)) { - setResult(element, m_currentCodepoint_, m_nextCodepoint_, - currentValue); - return true; - } - // synwee check that next block index == 0 here - // enumerate BMP - the main loop enumerates data blocks - while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) { - m_nextIndex_ ++; - // because of the way the character is split to form the index - // the lead surrogate and trail surrogate can not be in the - // mid of a block - if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) { - // skip lead surrogate code units, - // go to lead surrogate codepoints - m_nextIndex_ = BMP_INDEX_LENGTH_; - } - else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) { - // go back to regular BMP code points - m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_; - } - - m_nextBlockIndex_ = 0; - if (!checkBlock(currentBlock, currentValue)) { - setResult(element, m_currentCodepoint_, m_nextCodepoint_, - currentValue); - return true; - } - } - m_nextCodepoint_ --; // step one back since this value has not been - m_nextBlockIndex_ --; // retrieved yet. - return false; - } - - /** - * Finds the next supplementary element. - * For each entry in the trie, the value to be delivered is passed through - * extract(). - * We always store the next element before it is requested. - * Called after calculateNextBMP() completes its round of BMP characters. - * There is a slight difference in the usage of m_currentCodepoint_ - * here as compared to calculateNextBMP(). 
Though both represents the - * lower bound of the next element, in calculateNextBMP() it gets set - * at the start of any loop, where-else, in calculateNextSupplementary() - * since m_currentCodepoint_ already contains the lower bound of the - * next element (passed down from calculateNextBMP()), we keep it till - * the end before resetting it to the new value. - * Note, if there are no more iterations, it will never get to here. - * Blocked out by next(). - * @param element return result object - */ - private final void calculateNextSupplementaryElement(Element element) - { - int currentValue = m_nextValue_; - int currentBlock = m_nextBlock_; - m_nextCodepoint_ ++; - m_nextBlockIndex_ ++; - - if (UTF16.getTrailSurrogate(m_nextCodepoint_) - != UTF16.TRAIL_SURROGATE_MIN_VALUE) { - // this piece is only called when we are in the middle of a lead - // surrogate block - if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) { - setResult(element, m_currentCodepoint_, m_nextCodepoint_, - currentValue); - m_currentCodepoint_ = m_nextCodepoint_; - return; - } - // we have cleared one block - m_nextIndex_ ++; - m_nextTrailIndexOffset_ ++; - if (!checkTrailBlock(currentBlock, currentValue)) { - setResult(element, m_currentCodepoint_, m_nextCodepoint_, - currentValue); - m_currentCodepoint_ = m_nextCodepoint_; - return; - } - } - int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); - // enumerate supplementary code points - while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) { - // lead surrogate access - int leadBlock = - m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << - Trie.INDEX_STAGE_2_SHIFT_; - if (leadBlock == m_trie_.m_dataOffset_) { - // no entries for a whole block of lead surrogates - if (currentValue != m_initialValue_) { - m_nextValue_ = m_initialValue_; - m_nextBlock_ = 0; - m_nextBlockIndex_ = 0; - setResult(element, m_currentCodepoint_, m_nextCodepoint_, - currentValue); - m_currentCodepoint_ = m_nextCodepoint_; - return; - } - - nextLead 
+= DATA_BLOCK_LENGTH_; - // number of total affected supplementary codepoints in one - // block - // this is not a simple addition of - // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider - // that we might have moved some of the codepoints - m_nextCodepoint_ = UCharacterProperty.getRawSupplementary( - (char)nextLead, - (char)UTF16.TRAIL_SURROGATE_MIN_VALUE); - continue; - } - if (m_trie_.m_dataManipulate_ == null) { - throw new NullPointerException( - "The field DataManipulate in this Trie is null"); - } - // enumerate trail surrogates for this lead surrogate - m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( - m_trie_.getValue(leadBlock + - (nextLead & Trie.INDEX_STAGE_3_MASK_))); - if (m_nextIndex_ <= 0) { - // no data for this lead surrogate - if (currentValue != m_initialValue_) { - m_nextValue_ = m_initialValue_; - m_nextBlock_ = 0; - m_nextBlockIndex_ = 0; - setResult(element, m_currentCodepoint_, m_nextCodepoint_, - currentValue); - m_currentCodepoint_ = m_nextCodepoint_; - return; - } - m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_; - } else { - m_nextTrailIndexOffset_ = 0; - if (!checkTrailBlock(currentBlock, currentValue)) { - setResult(element, m_currentCodepoint_, m_nextCodepoint_, - currentValue); - m_currentCodepoint_ = m_nextCodepoint_; - return; - } - } - nextLead ++; - } - - // deliver last range - setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1, - currentValue); - } - - /** - * Internal block value calculations - * Performs calculations on a data block to find codepoints in m_nextBlock_ - * after the index m_nextBlockIndex_ that has the same value. - * Note m_*_ variables at this point is the next codepoint whose value - * has not been calculated. - * But when returned with false, it will be the last codepoint whose - * value has been calculated. 
- * @param currentValue the value which other codepoints are tested against - * @return true if the whole block has the same value as currentValue or if - * the whole block has been calculated, false otherwise. - */ - private final boolean checkBlockDetail(int currentValue) - { - while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) { - m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ + - m_nextBlockIndex_)); - if (m_nextValue_ != currentValue) { - return false; - } - ++ m_nextBlockIndex_; - ++ m_nextCodepoint_; - } - return true; - } - - /** - * Internal block value calculations - * Performs calculations on a data block to find codepoints in m_nextBlock_ - * that has the same value. - * Will call checkBlockDetail() if highlevel check fails. - * Note m_*_ variables at this point is the next codepoint whose value - * has not been calculated. - * @param currentBlock the initial block containing all currentValue - * @param currentValue the value which other codepoints are tested against - * @return true if the whole block has the same value as currentValue or if - * the whole block has been calculated, false otherwise. 
- */ - private final boolean checkBlock(int currentBlock, int currentValue) - { - m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << - Trie.INDEX_STAGE_2_SHIFT_; - if (m_nextBlock_ == currentBlock && - (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) { - // the block is the same as the previous one, filled with - // currentValue - m_nextCodepoint_ += DATA_BLOCK_LENGTH_; - } - else if (m_nextBlock_ == 0) { - // this is the all-initial-value block - if (currentValue != m_initialValue_) { - m_nextValue_ = m_initialValue_; - m_nextBlockIndex_ = 0; - return false; - } - m_nextCodepoint_ += DATA_BLOCK_LENGTH_; - } - else { - if (!checkBlockDetail(currentValue)) { - return false; - } - } - return true; - } - - /** - * Internal block value calculations - * Performs calculations on multiple data blocks for a set of trail - * surrogates to find codepoints in m_nextBlock_ that has the same value. - * Will call checkBlock() for internal block checks. - * Note m_*_ variables at this point is the next codepoint whose value - * has not been calculated. - * @param currentBlock the initial block containing all currentValue - * @param currentValue the value which other codepoints are tested against - * @return true if the whole block has the same value as currentValue or if - * the whole block has been calculated, false otherwise. - */ - private final boolean checkTrailBlock(int currentBlock, - int currentValue) - { - // enumerate code points for this lead surrogate - while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) - { - // if we ever reach here, we are at the start of a new block - m_nextBlockIndex_ = 0; - // copy of most of the body of the BMP loop - if (!checkBlock(currentBlock, currentValue)) { - return false; - } - m_nextTrailIndexOffset_ ++; - m_nextIndex_ ++; - } - return true; - } - - /** - * Checks if we are beginning at the start of a initial block. 
- * If we are then the rest of the codepoints in this initial block - * has the same values. - * We increment m_nextCodepoint_ and relevant data members if so. - * This is used only in for the supplementary codepoints because - * the offset to the trail indexes could be 0. - * @return true if we are at the start of a initial block. - */ - private final boolean checkNullNextTrailIndex() - { - if (m_nextIndex_ <= 0) { - m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1; - int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); - int leadBlock = - m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << - Trie.INDEX_STAGE_2_SHIFT_; - if (m_trie_.m_dataManipulate_ == null) { - throw new NullPointerException( - "The field DataManipulate in this Trie is null"); - } - m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( - m_trie_.getValue(leadBlock + - (nextLead & Trie.INDEX_STAGE_3_MASK_))); - m_nextIndex_ --; - m_nextBlockIndex_ = DATA_BLOCK_LENGTH_; - return true; - } - return false; - } - - // private data members -------------------------------------------- - - /** - * Size of the stage 1 BMP indexes - */ - private static final int BMP_INDEX_LENGTH_ = - 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_; - /** - * Lead surrogate minimum value - */ - private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800; - /** - * Trail surrogate minimum value - */ - private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00; - /** - * Number of trail surrogate - */ - private static final int TRAIL_SURROGATE_COUNT_ = 0x400; - /** - * Number of stage 1 indexes for supplementary calculations that maps to - * each lead surrogate character. - * See second pass into getRawOffset for the trail surrogate character. - * 10 for significant number of bits for trail surrogates, 5 for what we - * discard during shifting. - */ - private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = - 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_); - /** - * Number of data values in a stage 2 (data array) block. 
- */ - private static final int DATA_BLOCK_LENGTH_ = - 1 << Trie.INDEX_STAGE_1_SHIFT_; - /** - * Trie instance - */ - private Trie m_trie_; - /** - * Initial value for trie values - */ - private int m_initialValue_; - /** - * Next element results and data. - */ - private int m_currentCodepoint_; - private int m_nextCodepoint_; - private int m_nextValue_; - private int m_nextIndex_; - private int m_nextBlock_; - private int m_nextBlockIndex_; - private int m_nextTrailIndexOffset_; -} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UCharacterPropertyReader.java 2015-07-13 16:12:10.000000000 +0900 +++ /dev/null 2015-07-13 16:12:10.000000000 +0900 @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- */ -/* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.DataInputStream; -import java.io.InputStream; -import java.io.IOException; - -/** -* <p>Internal reader class for ICU data file uprops.icu containing -* Unicode codepoint data.</p> -* <p>This class simply reads uprops.icu, authenticates that it is a valid -* ICU data file and split its contents up into blocks of data for use in -* <a href=UCharacterProperty.html>com.ibm.icu.impl.UCharacterProperty</a>. 
-* </p> -* <p>uprops.icu which is in big-endian format is jared together with this -* package.</p> -* -* Unicode character properties file format see -* (ICU4C)/source/tools/genprops/store.c -* -* @author Syn Wee Quek -* @since release 2.1, February 1st 2002 -*/ -final class UCharacterPropertyReader implements ICUBinary.Authenticate -{ - // public methods ---------------------------------------------------- - - public boolean isDataVersionAcceptable(byte version[]) - { - return version[0] == DATA_FORMAT_VERSION_[0] - && version[2] == DATA_FORMAT_VERSION_[2] - && version[3] == DATA_FORMAT_VERSION_[3]; - } - - // protected constructor --------------------------------------------- - - /** - * <p>Protected constructor.</p> - * @param inputStream ICU uprop.dat file input stream - * @exception IOException throw if data file fails authentication - */ - protected UCharacterPropertyReader(InputStream inputStream) - throws IOException - { - m_unicodeVersion_ = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, - this); - m_dataInputStream_ = new DataInputStream(inputStream); - } - - // protected methods ------------------------------------------------- - - /** - * <p>Reads uprops.icu, parse it into blocks of data to be stored in - * UCharacterProperty.</P - * @param ucharppty UCharacterProperty instance - * @exception IOException thrown when data reading fails - */ - protected void read(UCharacterProperty ucharppty) throws IOException - { - // read the indexes - int count = INDEX_SIZE_; - m_propertyOffset_ = m_dataInputStream_.readInt(); - count --; - m_exceptionOffset_ = m_dataInputStream_.readInt(); - count --; - m_caseOffset_ = m_dataInputStream_.readInt(); - count --; - m_additionalOffset_ = m_dataInputStream_.readInt(); - count --; - m_additionalVectorsOffset_ = m_dataInputStream_.readInt(); - count --; - m_additionalColumnsCount_ = m_dataInputStream_.readInt(); - count --; - m_reservedOffset_ = m_dataInputStream_.readInt(); - count --; - m_dataInputStream_.skipBytes(3 
<< 2); - count -= 3; - ucharppty.m_maxBlockScriptValue_ = m_dataInputStream_.readInt(); - count --; // 10 - ucharppty.m_maxJTGValue_ = m_dataInputStream_.readInt(); - count --; // 11 - m_dataInputStream_.skipBytes(count << 2); - - // read the trie index block - // m_props_index_ in terms of ints - ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, null); - - // skip the 32 bit properties block - int size = m_exceptionOffset_ - m_propertyOffset_; - m_dataInputStream_.skipBytes(size * 4); - - // reads the 32 bit exceptions block - size = m_caseOffset_ - m_exceptionOffset_; - m_dataInputStream_.skipBytes(size * 4); - - // reads the 32 bit case block - size = (m_additionalOffset_ - m_caseOffset_) << 1; - m_dataInputStream_.skipBytes(size * 2); - - if(m_additionalColumnsCount_ > 0) { - // reads the additional property block - ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_, null); - - // additional properties - size = m_reservedOffset_ - m_additionalVectorsOffset_; - ucharppty.m_additionalVectors_ = new int[size]; - for (int i = 0; i < size; i ++) { - ucharppty.m_additionalVectors_[i] = m_dataInputStream_.readInt(); - } - } - - m_dataInputStream_.close(); - ucharppty.m_additionalColumnsCount_ = m_additionalColumnsCount_; - ucharppty.m_unicodeVersion_ = VersionInfo.getInstance( - (int)m_unicodeVersion_[0], (int)m_unicodeVersion_[1], - (int)m_unicodeVersion_[2], (int)m_unicodeVersion_[3]); - } - - // private variables ------------------------------------------------- - - /** - * Index size - */ - private static final int INDEX_SIZE_ = 16; - - /** - * ICU data file input stream - */ - private DataInputStream m_dataInputStream_; - - /** - * Offset information in the indexes. 
- */ - private int m_propertyOffset_; - private int m_exceptionOffset_; - private int m_caseOffset_; - private int m_additionalOffset_; - private int m_additionalVectorsOffset_; - private int m_additionalColumnsCount_; - private int m_reservedOffset_; - private byte m_unicodeVersion_[]; - - /** - * Data format "UPro". - */ - private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x50, - (byte)0x72, (byte)0x6F}; - /** - * Format version; this code works with all versions with the same major - * version number and the same Trie bit distribution. - */ - private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x5, (byte)0, - (byte)Trie.INDEX_STAGE_1_SHIFT_, - (byte)Trie.INDEX_STAGE_2_SHIFT_}; -} --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeMatcher.java 2015-07-13 16:12:10.000000000 +0900 +++ /dev/null 2015-07-13 16:12:10.000000000 +0900 @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -/** - * <code>UnicodeMatcher</code> defines a protocol for objects that can - * match a range of characters in a Replaceable string. - * @stable ICU 2.0 - */ -public interface UnicodeMatcher { - - /** - * The character at index {@code i}, where - * {@code i < contextStart || i >= contextLimit}, - * is ETHER. This allows explicit matching by rules and UnicodeSets - * of text outside the context. In traditional terms, this allows anchoring - * at the start and/or end. - * @stable ICU 2.0 - */ - static final char ETHER = '\uFFFF'; - -} - -//eof --- old/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSetIterator.java 2015-07-13 16:12:11.000000000 +0900 +++ /dev/null 2015-07-13 16:12:11.000000000 +0900 @@ -1,219 +0,0 @@ -/* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. 
Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.util.Iterator; - -/** - * UnicodeSetIterator iterates over the contents of a UnicodeSet. It - * iterates over either code points or code point ranges. After all - * code points or ranges have been returned, it returns the - * multicharacter strings of the UnicodSet, if any. 
- * - * <p>To iterate over code points, use a loop like this: - * <pre> - * UnicodeSetIterator it(set); - * while (set.next()) { - * if (set.codepoint != UnicodeSetIterator::IS_STRING) { - * processCodepoint(set.codepoint); - * } else { - * processString(set.string); - * } - * } - * </pre> - * - * <p>To iterate over code point ranges, use a loop like this: - * <pre> - * UnicodeSetIterator it(set); - * while (set.nextRange()) { - * if (set.codepoint != UnicodeSetIterator::IS_STRING) { - * processCodepointRange(set.codepoint, set.codepointEnd); - * } else { - * processString(set.string); - * } - * } - * </pre> - * @author M. Davis - * @stable ICU 2.0 - */ -public class UnicodeSetIterator { - - /** - * Value of {@code codepoint} if the iterator points to a string. - * If {@code codepoint == IS_STRING}, then examine - * {@code string} for the current iteration result. - * @stable ICU 2.0 - */ - public static int IS_STRING = -1; - - /** - * Current code point, or the special value {@code IS_STRING}, if - * the iterator points to a string. - * @stable ICU 2.0 - */ - public int codepoint; - - /** - * When iterating over ranges using {@code nextRange()}, - * {@code codepointEnd} contains the inclusive end of the - * iteration range, if {@code codepoint != IS_STRING}. If - * iterating over code points using {@code next()}, or if - * {@code codepoint == IS_STRING}, then the value of - * {@code codepointEnd} is undefined. - * @stable ICU 2.0 - */ - public int codepointEnd; - - /** - * If {@code codepoint == IS_STRING}, then {@code string} points - * to the current string. If {@code codepoint != IS_STRING}, the - * value of {@code string} is undefined. - * @stable ICU 2.0 - */ - public String string; - - /** - * Create an iterator over the given set. - * @param set set to iterate over - * @stable ICU 2.0 - */ - public UnicodeSetIterator(UnicodeSet set) { - reset(set); - } - - /** - * Returns the next element in the set, either a code point range - * or a string. 
If there are no more elements in the set, return - * false. If {@code codepoint == IS_STRING}, the value is a - * string in the {@code string} field. Otherwise the value is a - * range of one or more code points from {@code codepoint} to - * {@code codepointeEnd} inclusive. - * - * <p>The order of iteration is all code points ranges in sorted - * order, followed by all strings sorted order. Ranges are - * disjoint and non-contiguous. {@code string} is undefined - * unless {@code codepoint == IS_STRING}. Do not mix calls to - * {@code next()} and {@code nextRange()} without calling - * {@code reset()} between them. The results of doing so are - * undefined. - * - * @return true if there was another element in the set and this - * object contains the element. - * @stable ICU 2.0 - */ - public boolean nextRange() { - if (nextElement <= endElement) { - codepointEnd = endElement; - codepoint = nextElement; - nextElement = endElement+1; - return true; - } - if (range < endRange) { - loadRange(++range); - codepointEnd = endElement; - codepoint = nextElement; - nextElement = endElement+1; - return true; - } - - // stringIterator == null iff there are no string elements remaining - - if (stringIterator == null) return false; - codepoint = IS_STRING; // signal that value is actually a string - string = stringIterator.next(); - if (!stringIterator.hasNext()) stringIterator = null; - return true; - } - - /** - * Sets this iterator to visit the elements of the given set and - * resets it to the start of that set. The iterator is valid only - * so long as {@code set} is valid. - * @param uset the set to iterate over. - * @stable ICU 2.0 - */ - public void reset(UnicodeSet uset) { - set = uset; - reset(); - } - - /** - * Resets this iterator to the start of the set. 
- * @stable ICU 2.0 - */ - public void reset() { - endRange = set.getRangeCount() - 1; - range = 0; - endElement = -1; - nextElement = 0; - if (endRange >= 0) { - loadRange(range); - } - stringIterator = null; - if (set.strings != null) { - stringIterator = set.strings.iterator(); - if (!stringIterator.hasNext()) stringIterator = null; - } - } - - // ======================= PRIVATES =========================== - - private UnicodeSet set; - private int endRange = 0; - private int range = 0; - /** - * @internal - */ - protected int endElement; - /** - * @internal - */ - protected int nextElement; - private Iterator<String> stringIterator = null; - - /** - * Invariant: stringIterator is null when there are no (more) strings remaining - */ - - /** - * @internal - */ - protected void loadRange(int aRange) { - nextElement = set.getRangeStart(aRange); - endElement = set.getRangeEnd(aRange); - } -} Binary files old/jdk/src/java.base/share/classes/sun/text/resources/unorm.icu and /dev/null differ --- /dev/null 2015-07-13 16:12:12.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/bidi/BidiWriter.java 2015-07-13 16:12:11.000000000 +0900 @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* +******************************************************************************* +* Copyright (C) 2001-2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +*/ +/* Written by Simon Montagu, Matitiahu Allouche + * (ported from C code written by Markus W. Scherer) + */ + +package sun.text.bidi; + +import sun.text.normalizer.UCharacter; +import sun.text.normalizer.UTF16; + +final class BidiWriter { + + /** Bidi control code points */ + static final char LRM_CHAR = 0x200e; + static final char RLM_CHAR = 0x200f; + static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT | + 1 << UCharacter.RIGHT_TO_LEFT_ARABIC); + + private static boolean IsCombining(int type) { + return ((1<<type & + (1<<UCharacter.NON_SPACING_MARK | + 1<<UCharacter.COMBINING_SPACING_MARK | + 1<<UCharacter.ENCLOSING_MARK)) != 0); + } + + /* + * When we have OUTPUT_REVERSE set on writeReordered(), then we + * semantically write RTL runs in reverse and later reverse them again. + * Instead, we actually write them in forward order to begin with. + * However, if the RTL run was to be mirrored, we need to mirror here now + * since the implicit second reversal must not do it. + * It looks strange to do mirroring in LTR output, but it is only because + * we are writing RTL output in reverse. 
+ */ + private static String doWriteForward(String src, int options) { + /* optimize for several combinations of options */ + switch(options&(BidiBase.REMOVE_BIDI_CONTROLS|BidiBase.DO_MIRRORING)) { + case 0: { + /* simply return the LTR run */ + return src; + } + case BidiBase.DO_MIRRORING: { + StringBuffer dest = new StringBuffer(src.length()); + + /* do mirroring */ + int i=0; + int c; + + do { + c = UTF16.charAt(src, i); + i += UTF16.getCharCount(c); + UTF16.append(dest, UCharacter.getMirror(c)); + } while(i < src.length()); + return dest.toString(); + } + case BidiBase.REMOVE_BIDI_CONTROLS: { + StringBuilder dest = new StringBuilder(src.length()); + + /* copy the LTR run and remove any Bidi control characters */ + int i = 0; + char c; + do { + c = src.charAt(i++); + if(!BidiBase.IsBidiControlChar(c)) { + dest.append(c); + } + } while(i < src.length()); + return dest.toString(); + } + default: { + StringBuffer dest = new StringBuffer(src.length()); + + /* remove Bidi control characters and do mirroring */ + int i = 0; + int c; + do { + c = UTF16.charAt(src, i); + i += UTF16.getCharCount(c); + if(!BidiBase.IsBidiControlChar(c)) { + UTF16.append(dest, UCharacter.getMirror(c)); + } + } while(i < src.length()); + return dest.toString(); + } + } /* end of switch */ + } + + private static String doWriteForward(char[] text, int start, int limit, + int options) { + return doWriteForward(new String(text, start, limit - start), options); + } + + static String writeReverse(String src, int options) { + /* + * RTL run - + * + * RTL runs need to be copied to the destination in reverse order + * of code points, not code units, to keep Unicode characters intact. + * + * The general strategy for this is to read the source text + * in backward order, collect all code units for a code point + * (and optionally following combining characters, see below), + * and copy all these code units in ascending order + * to the destination for this run. 
+ * + * Several options request whether combining characters + * should be kept after their base characters, + * whether Bidi control characters should be removed, and + * whether characters should be replaced by their mirror-image + * equivalent Unicode characters. + */ + StringBuffer dest = new StringBuffer(src.length()); + + /* optimize for several combinations of options */ + switch (options & + (BidiBase.REMOVE_BIDI_CONTROLS | + BidiBase.DO_MIRRORING | + BidiBase.KEEP_BASE_COMBINING)) { + + case 0: + /* + * With none of the "complicated" options set, the destination + * run will have the same length as the source run, + * and there is no mirroring and no keeping combining characters + * with their base characters. + * + * XXX: or dest = UTF16.reverse(new StringBuffer(src)); + */ + + int srcLength = src.length(); + + /* preserve character integrity */ + do { + /* i is always after the last code unit known to need to be kept + * in this segment */ + int i = srcLength; + + /* collect code units for one base character */ + srcLength -= UTF16.getCharCount(UTF16.charAt(src, + srcLength - 1)); + + /* copy this base character */ + dest.append(src.substring(srcLength, i)); + } while(srcLength > 0); + break; + + case BidiBase.KEEP_BASE_COMBINING: + /* + * Here, too, the destination + * run will have the same length as the source run, + * and there is no mirroring. + * We do need to keep combining characters with their base + * characters. 
+ */ + srcLength = src.length(); + + /* preserve character integrity */ + do { + /* i is always after the last code unit known to need to be kept + * in this segment */ + int c; + int i = srcLength; + + /* collect code units and modifier letters for one base + * character */ + do { + c = UTF16.charAt(src, srcLength - 1); + srcLength -= UTF16.getCharCount(c); + } while(srcLength > 0 && IsCombining(UCharacter.getType(c))); + + /* copy this "user character" */ + dest.append(src.substring(srcLength, i)); + } while(srcLength > 0); + break; + + default: + /* + * With several "complicated" options set, this is the most + * general and the slowest copying of an RTL run. + * We will do mirroring, remove Bidi controls, and + * keep combining characters with their base characters + * as requested. + */ + srcLength = src.length(); + + /* preserve character integrity */ + do { + /* i is always after the last code unit known to need to be kept + * in this segment */ + int i = srcLength; + + /* collect code units for one base character */ + int c = UTF16.charAt(src, srcLength - 1); + srcLength -= UTF16.getCharCount(c); + if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) { + /* collect modifier letters for this base character */ + while(srcLength > 0 && IsCombining(UCharacter.getType(c))) { + c = UTF16.charAt(src, srcLength - 1); + srcLength -= UTF16.getCharCount(c); + } + } + + if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 && + BidiBase.IsBidiControlChar(c)) { + /* do not copy this Bidi control character */ + continue; + } + + /* copy this "user character" */ + int j = srcLength; + if((options & BidiBase.DO_MIRRORING) != 0) { + /* mirror only the base character */ + c = UCharacter.getMirror(c); + UTF16.append(dest, c); + j += UTF16.getCharCount(c); + } + dest.append(src.substring(j, i)); + } while(srcLength > 0); + break; + } /* end of switch */ + + return dest.toString(); + } + + static String doWriteReverse(char[] text, int start, int limit, int options) { + return 
writeReverse(new String(text, start, limit - start), options); + } + + static String writeReordered(BidiBase bidi, int options) { + int run, runCount; + StringBuilder dest; + char[] text = bidi.text; + runCount = bidi.countRuns(); + + /* + * Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the + * reordering mode (checked below) is appropriate. + */ + if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) { + options |= BidiBase.INSERT_LRM_FOR_NUMERIC; + options &= ~BidiBase.REMOVE_BIDI_CONTROLS; + } + /* + * Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS + * and cancels BidiBase.INSERT_LRM_FOR_NUMERIC. + */ + if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) { + options |= BidiBase.REMOVE_BIDI_CONTROLS; + options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC; + } + /* + * If we do not perform the "inverse Bidi" algorithm, then we + * don't need to insert any LRMs, and don't need to test for it. + */ + if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L) && + (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT) && + (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL) && + (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) { + options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC; + } + dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ? + bidi.length * 2 : bidi.length); + /* + * Iterate through all visual runs and copy the run text segments to + * the destination, according to the options. + * + * The tests for where to insert LRMs ignore the fact that there may be + * BN codes or non-BMP code points at the beginning and end of a run; + * they may insert LRMs unnecessarily but the tests are faster this way + * (this would have to be improved for UTF-8). 
+ */ + if ((options & BidiBase.OUTPUT_REVERSE) == 0) { + /* forward output */ + if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) { + /* do not insert Bidi controls */ + for (run = 0; run < runCount; ++run) { + BidiRun bidiRun = bidi.getVisualRun(run); + if (bidiRun.isEvenRun()) { + dest.append(doWriteForward(text, bidiRun.start, + bidiRun.limit, + options & ~BidiBase.DO_MIRRORING)); + } else { + dest.append(doWriteReverse(text, bidiRun.start, + bidiRun.limit, options)); + } + } + } else { + /* insert Bidi controls for "inverse Bidi" */ + byte[] dirProps = bidi.dirProps; + char uc; + int markFlag; + + for (run = 0; run < runCount; ++run) { + BidiRun bidiRun = bidi.getVisualRun(run); + markFlag=0; + /* check if something relevant in insertPoints */ + markFlag = bidi.runs[run].insertRemove; + if (markFlag < 0) { /* bidi controls count */ + markFlag = 0; + } + if (bidiRun.isEvenRun()) { + if (bidi.isInverse() && + dirProps[bidiRun.start] != BidiBase.L) { + markFlag |= BidiBase.LRM_BEFORE; + } + if ((markFlag & BidiBase.LRM_BEFORE) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + dest.append(doWriteForward(text, + bidiRun.start, bidiRun.limit, + options & ~BidiBase.DO_MIRRORING)); + + if (bidi.isInverse() && + dirProps[bidiRun.limit - 1] != BidiBase.L) { + markFlag |= BidiBase.LRM_AFTER; + } + if ((markFlag & BidiBase.LRM_AFTER) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_AFTER) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + } else { /* RTL run */ + if (bidi.isInverse() && + !bidi.testDirPropFlagAt(MASK_R_AL, + bidiRun.limit - 1)) { + markFlag |= BidiBase.RLM_BEFORE; + } + if ((markFlag & BidiBase.LRM_BEFORE) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + 
dest.append(doWriteReverse(text, bidiRun.start, + bidiRun.limit, options)); + + if(bidi.isInverse() && + (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) { + markFlag |= BidiBase.RLM_AFTER; + } + if ((markFlag & BidiBase.LRM_AFTER) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_AFTER) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + } + } + } + } else { + /* reverse output */ + if((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) { + /* do not insert Bidi controls */ + for(run = runCount; --run >= 0; ) { + BidiRun bidiRun = bidi.getVisualRun(run); + if (bidiRun.isEvenRun()) { + dest.append(doWriteReverse(text, + bidiRun.start, bidiRun.limit, + options & ~BidiBase.DO_MIRRORING)); + } else { + dest.append(doWriteForward(text, bidiRun.start, + bidiRun.limit, options)); + } + } + } else { + /* insert Bidi controls for "inverse Bidi" */ + + byte[] dirProps = bidi.dirProps; + + for (run = runCount; --run >= 0; ) { + /* reverse output */ + BidiRun bidiRun = bidi.getVisualRun(run); + if (bidiRun.isEvenRun()) { + if (dirProps[bidiRun.limit - 1] != BidiBase.L) { + dest.append(LRM_CHAR); + } + + dest.append(doWriteReverse(text, bidiRun.start, + bidiRun.limit, options & ~BidiBase.DO_MIRRORING)); + + if (dirProps[bidiRun.start] != BidiBase.L) { + dest.append(LRM_CHAR); + } + } else { + if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) { + dest.append(RLM_CHAR); + } + + dest.append(doWriteForward(text, bidiRun.start, + bidiRun.limit, options)); + + if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) { + dest.append(RLM_CHAR); + } + } + } + } + } + + return dest.toString(); + } +} --- /dev/null 2015-07-13 16:12:12.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/BMPSet.java 2015-07-13 16:12:12.000000000 +0900 @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. + * + ****************************************************************************** + */ + +package sun.text.normalizer; + +import sun.text.normalizer.UnicodeSet.SpanCondition; + +/** + * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. + * + * Latin-1: Look up bytes. + * 2-byte characters: Bits organized vertically. + * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. + * Supplementary characters: Call contains() on the parent set. + */ +final class BMPSet { + + /** + * One boolean ('true' or 'false') per Latin-1 character. 
+ */ + private boolean[] latin1Contains; + + /** + * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points + * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} + * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) + * + * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at + * runtime. + */ + private int[] table7FF; + + /** + * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks + * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} + * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit + * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed + * and set.contains(c) must be called. + * + * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster + * validity checking at runtime. + */ + private int[] bmpBlockBits; + + /** + * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, + * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are + * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. + */ + private int[] list4kStarts; + + /** + * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for + * supplementary code points. The list is terminated with list[listLength-1]=0x110000. 
+ */ + private final int[] list; + private final int listLength; // length used; list may be longer to minimize reallocs + + public BMPSet(final int[] parentList, int parentListLength) { + list = parentList; + listLength = parentListLength; + latin1Contains = new boolean[0x100]; + table7FF = new int[64]; + bmpBlockBits = new int[64]; + list4kStarts = new int[18]; + + /* + * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the + * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of + * indexes is for finding supplementary code points. + */ + list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); + int i; + for (i = 1; i <= 0x10; ++i) { + list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); + } + list4kStarts[0x11] = listLength - 1; + + initBits(); + } + + public boolean contains(int c) { + if (c <= 0xff) { + return (latin1Contains[c]); + } else if (c <= 0x7ff) { + return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); + } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + return (0 != twoBits); + } else { + // Look up the code point in its 4k block of code points. + return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); + } + } else if (c <= 0x10ffff) { + // surrogate or supplementary code point + return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); + } else { + // Out-of-range code points get false, consistent with long-standing + // behavior of UnicodeSet.contains(c). + return false; + } + } + + /** + * Span the initial substring for which each character c has spanCondition==contains(c). It must be + * spanCondition==0 or 1. 
+ * + * @param start The start index + * @param outCount If not null: Receives the number of code points in the span. + * @return the limit (exclusive end) of the span + * + * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for + * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points + * as usual in ICU. + */ + public final int span(CharSequence s, int start, SpanCondition spanCondition, + OutputInt outCount) { + char c, c2; + int i = start; + int limit = s.length(); + int numSupplementary = 0; + if (SpanCondition.NOT_CONTAINED != spanCondition) { + // span + while (i < limit) { + c = s.charAt(i); + if (c <= 0xff) { + if (!latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { + break; + } + } else if (c < 0xd800 || + c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits == 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. 
+ if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c, c2); + if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + ++numSupplementary; + ++i; + } + ++i; + } + } else { + // span not + while (i < limit) { + c = s.charAt(i); + if (c <= 0xff) { + if (latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { + break; + } + } else if (c < 0xd800 || + c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits != 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. + if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c, c2); + if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + ++numSupplementary; + ++i; + } + ++i; + } + } + if (outCount != null) { + int spanLength = i - start; + outCount.value = spanLength - numSupplementary; // number of code points + } + return i; + } + + /** + * Symmetrical with span(). + * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= + * limit and spanCondition==0 or 1. + * + * @return The string index which starts the span (i.e. inclusive). 
+ */ + public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { + char c, c2; + + if (SpanCondition.NOT_CONTAINED != spanCondition) { + // span + for (;;) { + c = s.charAt(--limit); + if (c <= 0xff) { + if (!latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { + break; + } + } else if (c < 0xd800 || + c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits == 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. + if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c2, c); + if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + --limit; + } + if (0 == limit) { + return 0; + } + } + } else { + // span not + for (;;) { + c = s.charAt(--limit); + if (c <= 0xff) { + if (latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { + break; + } + } else if (c < 0xd800 || + c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits != 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. 
+ if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c2, c); + if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + --limit; + } + if (0 == limit) { + return 0; + } + } + } + return limit + 1; + } + + /** + * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800 + */ + private static void set32x64Bits(int[] table, int start, int limit) { + assert (64 == table.length); + int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. + int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. + + // Set one bit indicating an all-one block. + int bits = 1 << lead; + if ((start + 1) == limit) { // Single-character shortcut. + table[trail] |= bits; + return; + } + + int limitLead = limit >> 6; + int limitTrail = limit & 0x3f; + + if (lead == limitLead) { + // Partial vertical bit column. + while (trail < limitTrail) { + table[trail++] |= bits; + } + } else { + // Partial vertical bit column, + // followed by a bit rectangle, + // followed by another partial vertical bit column. + if (trail > 0) { + do { + table[trail++] |= bits; + } while (trail < 64); + ++lead; + } + if (lead < limitLead) { + bits = ~((1 << lead) - 1); + if (limitLead < 0x20) { + bits &= (1 << limitLead) - 1; + } + for (trail = 0; trail < 64; ++trail) { + table[trail] |= bits; + } + } + // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. + // In that case, bits=1<<limitLead == 1<<0 == 1 + // (because Java << uses only the lower 5 bits of the shift operand) + // but the bits value is not used because trail<limitTrail is already false. + bits = 1 << limitLead; + for (trail = 0; trail < limitTrail; ++trail) { + table[trail] |= bits; + } + } + } + + private void initBits() { + int start, limit; + int listIndex = 0; + + // Set latin1Contains[]. 
+ do { + start = list[listIndex++]; + if (listIndex < listLength) { + limit = list[listIndex++]; + } else { + limit = 0x110000; + } + if (start >= 0x100) { + break; + } + do { + latin1Contains[start++] = true; + } while (start < limit && start < 0x100); + } while (limit <= 0x100); + + // Set table7FF[]. + while (start < 0x800) { + set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); + if (limit > 0x800) { + start = 0x800; + break; + } + + start = list[listIndex++]; + if (listIndex < listLength) { + limit = list[listIndex++]; + } else { + limit = 0x110000; + } + } + + // Set bmpBlockBits[]. + int minStart = 0x800; + while (start < 0x10000) { + if (limit > 0x10000) { + limit = 0x10000; + } + + if (start < minStart) { + start = minStart; + } + if (start < limit) { // Else: Another range entirely in a known mixed-value block. + if (0 != (start & 0x3f)) { + // Mixed-value block of 64 code points. + start >>= 6; + bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); + start = (start + 1) << 6; // Round up to the next block boundary. + minStart = start; // Ignore further ranges in this block. + } + if (start < limit) { + if (start < (limit & ~0x3f)) { + // Multiple all-ones blocks of 64 code points each. + set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); + } + + if (0 != (limit & 0x3f)) { + // Mixed-value block of 64 code points. + limit >>= 6; + bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); + limit = (limit + 1) << 6; // Round up to the next block boundary. + minStart = limit; // Ignore further ranges in this block. + } + } + } + + if (limit == 0x10000) { + break; + } + + start = list[listIndex++]; + if (listIndex < listLength) { + limit = list[listIndex++]; + } else { + limit = 0x110000; + } + } + } + + /** + * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code + * points in a certain range. 
+ * + * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and + * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1. + * + * @param c + * a character in a subrange of MIN_VALUE..MAX_VALUE + * @param lo + * The lowest index to be returned. + * @param hi + * The highest index to be returned. + * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i] + */ + private int findCodePoint(int c, int lo, int hi) { + /* Examples: + findCodePoint(c) + set list[] c=0 1 3 4 7 8 + === ============== =========== + [] [110000] 0 0 0 0 0 0 + [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 + [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 + [:Any:] [0, 110000] 1 1 1 1 1 1 + */ + + // Return the smallest i such that c < list[i]. Assume + // list[len - 1] == HIGH and that c is legal (0..HIGH-1). + if (c < list[lo]) + return lo; + // High runner test. c is often after the last range, so an + // initial check for this condition pays off. + if (lo >= hi || c >= list[hi - 1]) + return hi; + // invariant: c >= list[lo] + // invariant: c < list[hi] + for (;;) { + int i = (lo + hi) >>> 1; + if (i == lo) { + break; // Found! + } else if (c < list[i]) { + hi = i; + } else { + lo = i; + } + } + return hi; + } + + private final boolean containsSlow(int c, int lo, int hi) { + return (0 != (findCodePoint(c, lo, hi) & 1)); + } +} + --- /dev/null 2015-07-13 16:12:13.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/FilteredNormalizer2.java 2015-07-13 16:12:13.000000000 +0900 @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* +******************************************************************************* +* Copyright (C) 2009-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +*/ +package sun.text.normalizer; + +import java.io.IOException; + +/** + * Normalization filtered by a UnicodeSet. + * Normalizes portions of the text contained in the filter set and leaves + * portions not contained in the filter set unchanged. + * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE). + * Not-in-the-filter text is treated as "is normalized" and "quick check yes". + * This class implements all of (and only) the Normalizer2 API. + * An instance of this class is unmodifiable/immutable. + * @stable ICU 4.4 + * @author Markus W. Scherer + */ +class FilteredNormalizer2 extends Normalizer2 { + + /** + * Constructs a filtered normalizer wrapping any Normalizer2 instance + * and a filter set. + * Both are aliased and must not be modified or deleted while this object + * is used. 
+ * The filter set should be frozen; otherwise the performance will suffer greatly. + * @param n2 wrapped Normalizer2 instance + * @param filterSet UnicodeSet which determines the characters to be normalized + * @stable ICU 4.4 + */ + public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) { + norm2=n2; + set=filterSet; + } + + /** + * {@inheritDoc} + * @stable ICU 4.4 + */ + @Override + public StringBuilder normalize(CharSequence src, StringBuilder dest) { + if(dest==src) { + throw new IllegalArgumentException(); + } + dest.setLength(0); + normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); + return dest; + } + + /** + * {@inheritDoc} + * @stable ICU 4.6 + */ + @Override + public Appendable normalize(CharSequence src, Appendable dest) { + if(dest==src) { + throw new IllegalArgumentException(); + } + return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); + } + + /** + * {@inheritDoc} + * @stable ICU 4.4 + */ + @Override + public StringBuilder normalizeSecondAndAppend( + StringBuilder first, CharSequence second) { + return normalizeSecondAndAppend(first, second, true); + } + + /** + * {@inheritDoc} + * @stable ICU 4.4 + */ + @Override + public StringBuilder append(StringBuilder first, CharSequence second) { + return normalizeSecondAndAppend(first, second, false); + } + + /** + * {@inheritDoc} + * @stable ICU 4.6 + */ + @Override + public String getDecomposition(int c) { + return set.contains(c) ? norm2.getDecomposition(c) : null; + } + + /** + * {@inheritDoc} + * @stable ICU 49 + */ + @Override + public int getCombiningClass(int c) { + return set.contains(c) ? 
norm2.getCombiningClass(c) : 0; + } + + /** + * {@inheritDoc} + * @stable ICU 4.4 + */ + @Override + public boolean isNormalized(CharSequence s) { + UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; + for(int prevSpanLimit=0; prevSpanLimit<s.length();) { + int spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { + spanCondition=UnicodeSet.SpanCondition.SIMPLE; + } else { + if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) { + return false; + } + spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return true; + } + + /** + * {@inheritDoc} + * @stable ICU 4.4 + */ + @Override + public int spanQuickCheckYes(CharSequence s) { + UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; + for(int prevSpanLimit=0; prevSpanLimit<s.length();) { + int spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { + spanCondition=UnicodeSet.SpanCondition.SIMPLE; + } else { + int yesLimit= + prevSpanLimit+ + norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit)); + if(yesLimit<spanLimit) { + return yesLimit; + } + spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return s.length(); + } + + /** + * {@inheritDoc} + * @stable ICU 4.4 + */ + @Override + public boolean hasBoundaryBefore(int c) { + return !set.contains(c) || norm2.hasBoundaryBefore(c); + } + + // Internal: No argument checking, and appends to dest. + // Pass as input spanCondition the one that is likely to yield a non-zero + // span length at the start of src. + // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, + // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src + // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after + // an in-filter prefix. 
+ private Appendable normalize(CharSequence src, Appendable dest, + UnicodeSet.SpanCondition spanCondition) { + // Don't throw away destination buffer between iterations. + StringBuilder tempDest=new StringBuilder(); + try { + for(int prevSpanLimit=0; prevSpanLimit<src.length();) { + int spanLimit=set.span(src, prevSpanLimit, spanCondition); + int spanLength=spanLimit-prevSpanLimit; + if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) { + if(spanLength!=0) { + dest.append(src, prevSpanLimit, spanLimit); + } + spanCondition=UnicodeSet.SpanCondition.SIMPLE; + } else { + if(spanLength!=0) { + // Not norm2.normalizeSecondAndAppend() because we do not want + // to modify the non-filter part of dest. + dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest)); + } + spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + } catch(IOException e) { + throw new InternalError(e.toString(), e); + } + return dest; + } + + private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second, + boolean doNormalize) { + if(first==second) { + throw new IllegalArgumentException(); + } + if(first.length()==0) { + if(doNormalize) { + return normalize(second, first); + } else { + return first.append(second); + } + } + // merge the in-filter suffix of the first string with the in-filter prefix of the second + int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE); + if(prefixLimit!=0) { + CharSequence prefix=second.subSequence(0, prefixLimit); + int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE); + if(suffixStart==0) { + if(doNormalize) { + norm2.normalizeSecondAndAppend(first, prefix); + } else { + norm2.append(first, prefix); + } + } else { + StringBuilder middle=new StringBuilder( + first.subSequence(suffixStart, first.length())); + if(doNormalize) { + norm2.normalizeSecondAndAppend(middle, prefix); + } else { + norm2.append(middle, prefix); + } + 
first.delete(suffixStart, 0x7fffffff).append(middle); + } + } + if(prefixLimit<second.length()) { + CharSequence rest=second.subSequence(prefixLimit, second.length()); + if(doNormalize) { + normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED); + } else { + first.append(rest); + } + } + return first; + } + + private Normalizer2 norm2; + private UnicodeSet set; +}; --- /dev/null 2015-07-13 16:12:14.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Norm2AllModes.java 2015-07-13 16:12:13.000000000 +0900 @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ + +package sun.text.normalizer; + +import java.io.IOException; + +final class Norm2AllModes { + // Public API dispatch via Normalizer2 subclasses -------------------------- *** + + // Normalizer2 implementation for the old UNORM_NONE. + public static final class NoopNormalizer2 extends Normalizer2 { + @Override + public StringBuilder normalize(CharSequence src, StringBuilder dest) { + if(dest!=src) { + dest.setLength(0); + return dest.append(src); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public Appendable normalize(CharSequence src, Appendable dest) { + if(dest!=src) { + try { + return dest.append(src); + } catch(IOException e) { + throw new InternalError(e.toString(), e); + } + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) { + if(first!=second) { + return first.append(second); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public StringBuilder append(StringBuilder first, CharSequence second) { + if(first!=second) { + return first.append(second); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public String getDecomposition(int c) { + return null; + } + + // No need to override the default getRawDecomposition(). + @Override + public boolean isNormalized(CharSequence s) { return true; } + + @Override + public int spanQuickCheckYes(CharSequence s) { return s.length(); } + + @Override + public boolean hasBoundaryBefore(int c) { return true; } + } + + // Intermediate class: + // Has NormalizerImpl and does boilerplate argument checking and setup. 
+    public static abstract class Normalizer2WithImpl extends Normalizer2 {
+        /** The implementation object that the methods of this class delegate to. */
+        public final NormalizerImpl impl;
+
+        /**
+         * Creates a normalizer backed by the given implementation.
+         * @param ni implementation object stored in {@link #impl}
+         */
+        public Normalizer2WithImpl(NormalizerImpl ni) {
+            this.impl = ni;
+        }
+
+        // normalize
+
+        /**
+         * Clears dest, then writes the normalized form of src into it.
+         * @param src source text
+         * @param dest destination builder; its previous contents are discarded
+         * @return dest
+         * @throws IllegalArgumentException if src and dest are the same object
+         */
+        @Override
+        public StringBuilder normalize(CharSequence src, StringBuilder dest) {
+            if (src == dest) {
+                throw new IllegalArgumentException();
+            }
+            dest.setLength(0);
+            final NormalizerImpl.ReorderingBuffer target =
+                new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
+            normalize(src, target);
+            return dest;
+        }
+
+        /**
+         * Appends the normalized form of src to dest and flushes the
+         * intermediate buffer before returning.
+         * @param src source text
+         * @param dest destination Appendable
+         * @return dest
+         * @throws IllegalArgumentException if src and dest are the same object
+         */
+        @Override
+        public Appendable normalize(CharSequence src, Appendable dest) {
+            if (src == dest) {
+                throw new IllegalArgumentException();
+            }
+            final NormalizerImpl.ReorderingBuffer target =
+                new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
+            normalize(src, target);
+            target.flush();
+            return dest;
+        }
+
+        /**
+         * Mode-specific normalization of src into the given buffer.
+         * @param src source text
+         * @param buffer buffer that receives the output
+         */
+        protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer);
+
+        // normalize and append
+
+        /** Same as the three-argument overload with doNormalize set to true. */
+        @Override
+        public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
+            final boolean doNormalize = true;
+            return normalizeSecondAndAppend(first, second, doNormalize);
+        }
+
+        /** Same as the three-argument overload with doNormalize set to false. */
+        @Override
+        public StringBuilder append(StringBuilder first, CharSequence second) {
+            final boolean doNormalize = false;
+            return normalizeSecondAndAppend(first, second, doNormalize);
+        }
+
+        /**
+         * Checks the arguments, wraps first in a buffer sized for both strings,
+         * and hands the work to {@code normalizeAndAppend}.
+         * @param first builder that is appended to
+         * @param second text to append
+         * @param doNormalize passed through unchanged to {@code normalizeAndAppend}
+         * @return first
+         * @throws IllegalArgumentException if first and second are the same object
+         */
+        public StringBuilder normalizeSecondAndAppend(
+                StringBuilder first, CharSequence second, boolean doNormalize) {
+            if (second == first) {
+                throw new IllegalArgumentException();
+            }
+            final int capacity = first.length() + second.length();
+            normalizeAndAppend(
+                second, doNormalize,
+                new NormalizerImpl.ReorderingBuffer(impl, first, capacity));
+            return first;
+        }
+
+        /**
+         * Mode-specific append of src to the text already held by the buffer.
+         * @param src text to append
+         * @param doNormalize flag forwarded from the public append methods
+         * @param buffer buffer wrapping the first string
+         */
+        protected abstract void normalizeAndAppend(
+                CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer);
+
+        /** Delegates to {@code impl.getDecomposition(c)}. */
+        @Override
+        public String getDecomposition(int c) {
+            return impl.getDecomposition(c);
+        }
+
+        /** Returns {@code impl.getCC} applied to the norm16 value that impl reports for c. */
+        @Override
+        public int getCombiningClass(int c) {
+            return impl.getCC(impl.getNorm16(c));
+        }
+
+        // quick checks
+
+        /**
+         * Reports s as normalized when the span returned by
+         * {@code spanQuickCheckYes} covers the whole string.
+         */
+        @Override
+        public boolean isNormalized(CharSequence s) {
+            final int length = s.length();
+            return length == spanQuickCheckYes(s);
+        }
+    }
+ + public static final class DecomposeNormalizer2 extends Normalizer2WithImpl { + public DecomposeNormalizer2(NormalizerImpl ni) { + super(ni); + } + + @Override + protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { + impl.decompose(src, 0, src.length(), buffer); + } + + @Override + protected void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { + impl.decomposeAndAppend(src, doNormalize, buffer); + } + + @Override + public int spanQuickCheckYes(CharSequence s) { + return impl.decompose(s, 0, s.length(), null); + } + + @Override + public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundary(c, true); } + } + + public static final class ComposeNormalizer2 extends Normalizer2WithImpl { + public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) { + super(ni); + onlyContiguous=fcc; + } + + @Override + protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { + impl.compose(src, 0, src.length(), onlyContiguous, true, buffer); + } + + @Override + protected void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { + impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer); + } + + @Override + public boolean isNormalized(CharSequence s) { + // 5: small destCapacity for substring normalization + return impl.compose(s, 0, s.length(), + onlyContiguous, false, + new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5)); + } + + @Override + public int spanQuickCheckYes(CharSequence s) { + return impl.composeQuickCheck(s, 0, s.length(), onlyContiguous, true)>>>1; + } + + @Override + public boolean hasBoundaryBefore(int c) { return impl.hasCompBoundaryBefore(c); } + + private final boolean onlyContiguous; + } + + // instance cache ---------------------------------------------------------- *** + + private Norm2AllModes(NormalizerImpl ni) { + impl=ni; + comp=new ComposeNormalizer2(ni, false); 
+ decomp=new DecomposeNormalizer2(ni); + } + + public final NormalizerImpl impl; + public final ComposeNormalizer2 comp; + public final DecomposeNormalizer2 decomp; + + private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) { + if(singleton.exception!=null) { + throw singleton.exception; + } + return singleton.allModes; + } + + public static Norm2AllModes getNFCInstance() { + return getInstanceFromSingleton(NFCSingleton.INSTANCE); + } + + public static Norm2AllModes getNFKCInstance() { + return getInstanceFromSingleton(NFKCSingleton.INSTANCE); + } + + public static final NoopNormalizer2 NOOP_NORMALIZER2=new NoopNormalizer2(); + + private static final class Norm2AllModesSingleton { + private Norm2AllModesSingleton(String name) { + try { + String DATA_FILE_NAME = "/sun/text/resources/" + name + ".icu"; + NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME); + allModes=new Norm2AllModes(impl); + } catch (RuntimeException e) { + exception=e; + } + } + + private Norm2AllModes allModes; + private RuntimeException exception; + } + + private static final class NFCSingleton { + private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfc"); + } + + private static final class NFKCSingleton { + private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfkc"); + } +} --- /dev/null 2015-07-13 16:12:14.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Normalizer2.java 2015-07-13 16:12:14.000000000 +0900 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +package sun.text.normalizer; + +/** + * Unicode normalization functionality for standard Unicode normalization or + * for using custom mapping tables. + * All instances of this class are unmodifiable/immutable. + * The Normalizer2 class is not intended for public subclassing. + * <p> + * The primary functions are to produce a normalized string and to detect whether + * a string is already normalized. + * The most commonly used normalization forms are those defined in + * http://www.unicode.org/unicode/reports/tr15/ + * However, this API supports additional normalization forms for specialized purposes. + * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) + * and can be used in implementations of UTS #46. 
+ * <p> + * Not only are the standard compose and decompose modes supplied, + * but additional modes are provided as documented in the Mode enum. + * <p> + * Some of the functions in this class identify normalization boundaries. + * At a normalization boundary, the portions of the string + * before it and starting from it do not interact and can be handled independently. + * <p> + * The spanQuickCheckYes() stops at a normalization boundary. + * When the goal is a normalized string, then the text before the boundary + * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). + * <p> + * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether + * a character is guaranteed to be at a normalization boundary, + * regardless of context. + * This is used for moving from one normalization boundary to the next + * or preceding boundary, and for performing iterative normalization. + * <p> + * Iterative normalization is useful when only a small portion of a + * longer string needs to be processed. + * For example, in ICU, iterative normalization is used by the NormalizationTransliterator + * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() + * (to process only the substring for which sort key bytes are computed). + * <p> + * The set of normalization boundaries returned by these functions may not be + * complete: There may be more boundaries that could be returned. + * Different functions may return different boundaries. + * @stable ICU 4.4 + * @author Markus W. Scherer + */ +abstract class Normalizer2 { + + /** + * Returns a Normalizer2 instance for Unicode NFC normalization. + * Same as getInstance(null, "nfc", Mode.COMPOSE). + * Returns an unmodifiable singleton instance. 
+ * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFCInstance() { + return Norm2AllModes.getNFCInstance().comp; + } + + /** + * Returns a Normalizer2 instance for Unicode NFD normalization. + * Same as getInstance(null, "nfc", Mode.DECOMPOSE). + * Returns an unmodifiable singleton instance. + * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFDInstance() { + return Norm2AllModes.getNFCInstance().decomp; + } + + /** + * Returns a Normalizer2 instance for Unicode NFKC normalization. + * Same as getInstance(null, "nfkc", Mode.COMPOSE). + * Returns an unmodifiable singleton instance. + * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFKCInstance() { + return Norm2AllModes.getNFKCInstance().comp; + } + + /** + * Returns a Normalizer2 instance for Unicode NFKD normalization. + * Same as getInstance(null, "nfkc", Mode.DECOMPOSE). + * Returns an unmodifiable singleton instance. + * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFKDInstance() { + return Norm2AllModes.getNFKCInstance().decomp; + } + + /** + * Returns the normalized form of the source string. + * @param src source string + * @return normalized src + * @stable ICU 4.4 + */ + public String normalize(CharSequence src) { + if(src instanceof String) { + // Fastpath: Do not construct a new String if the src is a String + // and is already normalized. 
+ int spanLength=spanQuickCheckYes(src); + if(spanLength==src.length()) { + return (String)src; + } + StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength); + return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString(); + } + return normalize(src, new StringBuilder(src.length())).toString(); + } + + /** + * Writes the normalized form of the source string to the destination string + * (replacing its contents) and returns the destination string. + * The source and destination strings must be different objects. + * @param src source string + * @param dest destination string; its contents is replaced with normalized src + * @return dest + * @stable ICU 4.4 + */ + public abstract StringBuilder normalize(CharSequence src, StringBuilder dest); + + /** + * Writes the normalized form of the source string to the destination Appendable + * and returns the destination Appendable. + * The source and destination strings must be different objects. + * + * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}. + * + * @param src source string + * @param dest destination Appendable; gets normalized src appended + * @return dest + * @stable ICU 4.6 + */ + public abstract Appendable normalize(CharSequence src, Appendable dest); + + /** + * Appends the normalized form of the second string to the first string + * (merging them at the boundary) and returns the first string. + * The result is normalized if the first string was normalized. + * The first and second strings must be different objects. + * @param first string, should be normalized + * @param second string, will be normalized + * @return first + * @stable ICU 4.4 + */ + public abstract StringBuilder normalizeSecondAndAppend( + StringBuilder first, CharSequence second); + + /** + * Appends the second string to the first string + * (merging them at the boundary) and returns the first string. 
+ * The result is normalized if both the strings were normalized. + * The first and second strings must be different objects. + * @param first string, should be normalized + * @param second string, should be normalized + * @return first + * @stable ICU 4.4 + */ + public abstract StringBuilder append(StringBuilder first, CharSequence second); + + /** + * Gets the decomposition mapping of c. + * Roughly equivalent to normalizing the String form of c + * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function + * returns null if c does not have a decomposition mapping in this instance's data. + * This function is independent of the mode of the Normalizer2. + * @param c code point + * @return c's decomposition mapping, if any; otherwise null + * @stable ICU 4.6 + */ + public abstract String getDecomposition(int c); + + /** + * Gets the combining class of c. + * The default implementation returns 0 + * but all standard implementations return the Unicode Canonical_Combining_Class value. + * @param c code point + * @return c's combining class + * @stable ICU 49 + */ + public int getCombiningClass(int c) { return 0; } + + /** + * Tests if the string is normalized. + * Internally, in cases where the quickCheck() method would return "maybe" + * (which is only possible for the two COMPOSE modes) this method + * resolves to "yes" or "no" to provide a definitive result, + * at the cost of doing more work in those cases. + * @param s input string + * @return true if s is normalized + * @stable ICU 4.4 + */ + public abstract boolean isNormalized(CharSequence s); + + /** + * Returns the end of the normalized substring of the input string. + * In other words, with <code>end=spanQuickCheckYes(s);</code> + * the substring <code>s.subSequence(0, end)</code> + * will pass the quick check with a "yes" result. 
+ * <p> + * The returned end index is usually one or more characters before the + * "no" or "maybe" character: The end index is at a normalization boundary. + * (See the class documentation for more about normalization boundaries.) + * <p> + * When the goal is a normalized string and most input strings are expected + * to be normalized already, then call this method, + * and if it returns a prefix shorter than the input string, + * copy that prefix and use normalizeSecondAndAppend() for the remainder. + * @param s input string + * @return "yes" span end index + * @stable ICU 4.4 + */ + public abstract int spanQuickCheckYes(CharSequence s); + + /** + * Tests if the character always has a normalization boundary before it, + * regardless of context. + * If true, then the character does not normalization-interact with + * preceding characters. + * In other words, a string containing this character can be normalized + * by processing portions before this character and starting from this + * character independently. + * This is used for iterative normalization. See the class documentation for details. + * @param c character to test + * @return true if c has a normalization boundary before it + * @stable ICU 4.4 + */ + public abstract boolean hasBoundaryBefore(int c); + + /** + * Sole constructor. (For invocation by subclass constructors, + * typically implicit.) + * @internal + * deprecated This API is ICU internal only. + */ + protected Normalizer2() { + } +} --- /dev/null 2015-07-13 16:12:15.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/OutputInt.java 2015-07-13 16:12:15.000000000 +0900 @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
/**
 * Minimal mutable holder for a single {@code int}, used where a method needs
 * to hand back an extra integer result through a parameter.
 * Serves the same purpose as an {@code Output<Integer>} would, but keeps the
 * value primitive so no boxing takes place.
 *
 * @internal but could become public
 * deprecated This API is ICU internal only.
 */
class OutputInt {

    /**
     * The carried value; read and written directly by callers.
     * Starts out as {@code 0}, the Java default for an {@code int} field.
     *
     * @internal
     * deprecated This API is ICU internal only.
     */
    public int value;
}
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package sun.text.normalizer; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * This is the interface and common implementation of a Unicode Trie2. + * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) + * to 16- or 32-bit integer values. It works best when there are ranges of + * characters with the same value, which is generally the case with Unicode + * character properties. + * + * This is the second common version of a Unicode trie (hence the name Trie2). 
+ * + */ +abstract class Trie2 implements Iterable<Trie2.Range> { + + /** + * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). + * + * Reads from the current position and leaves the buffer after the end of the trie. + * + * The serialized format is identical between ICU4C and ICU4J, so this function + * will work with serialized Trie2s from either. + * + * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending + * on the width of the data. + * + * To obtain the width of the Trie2, check the actual class type of the returned Trie2. + * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will + * return only Tries of their specific type/size. + * + * The serialized Trie2 on the stream may be in either little or big endian byte order. + * This allows using serialized Tries from ICU4C without needing to consider the + * byte order of the system that created them. + * + * @param bytes a byte buffer to the serialized form of a UTrie2. + * @return An unserialized Trie2, ready for use. + * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. + * @throws IOException if a read error occurs in the buffer. + * + */ + public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { + // From ICU4C utrie2_impl.h + // * Trie2 data structure in serialized form: + // * + // * UTrie2Header header; + // * uint16_t index[header.index2Length]; + // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] + // * @internal + // */ + // typedef struct UTrie2Header { + // /** "Tri2" in big-endian US-ASCII (0x54726932) */ + // uint32_t signature; + + // /** + // * options bit field: + // * 15.. 4 reserved (0) + // * 3.. 
0 UTrie2ValueBits valueBits + // */ + // uint16_t options; + // + // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ + // uint16_t indexLength; + // + // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ + // uint16_t shiftedDataLength; + // + // /** Null index and data blocks, not shifted. */ + // uint16_t index2NullOffset, dataNullOffset; + // + // /** + // * First code point of the single-value range ending with U+10ffff, + // * rounded up and then shifted right by UTRIE2_SHIFT_1. + // */ + // uint16_t shiftedHighStart; + // } UTrie2Header; + + ByteOrder outerByteOrder = bytes.order(); + try { + UTrie2Header header = new UTrie2Header(); + + /* check the signature */ + header.signature = bytes.getInt(); + switch (header.signature) { + case 0x54726932: + // The buffer is already set to the trie data byte order. + break; + case 0x32697254: + // Temporarily reverse the byte order. + boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; + bytes.order(isBigEndian ? 
ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); + header.signature = 0x54726932; + break; + default: + throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); + } + + header.options = bytes.getChar(); + header.indexLength = bytes.getChar(); + header.shiftedDataLength = bytes.getChar(); + header.index2NullOffset = bytes.getChar(); + header.dataNullOffset = bytes.getChar(); + header.shiftedHighStart = bytes.getChar(); + + if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) { + throw new IllegalArgumentException("UTrie2 serialized format error."); + } + + Trie2 This; + This = new Trie2_16(); + This.header = header; + + /* get the length values and offsets */ + This.indexLength = header.indexLength; + This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; + This.index2NullOffset = header.index2NullOffset; + This.dataNullOffset = header.dataNullOffset; + This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; + This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; + This.highValueIndex += This.indexLength; + + // Allocate the Trie2 index array. If the data width is 16 bits, the array also + // includes the space for the data. + + int indexArraySize = This.indexLength; + indexArraySize += This.dataLength; + This.index = new char[indexArraySize]; + + /* Read in the index */ + int i; + for (i=0; i<This.indexLength; i++) { + This.index[i] = bytes.getChar(); + } + + /* Read in the data. 16 bit data goes in the same array as the index. + * 32 bit data goes in its own separate data array. + */ + This.data16 = This.indexLength; + for (i=0; i<This.dataLength; i++) { + This.index[This.data16 + i] = bytes.getChar(); + } + + This.data32 = null; + This.initialValue = This.index[This.dataNullOffset]; + This.errorValue = This.index[This.data16+UTRIE2_BAD_UTF8_DATA_OFFSET]; + + return This; + } finally { + bytes.order(outerByteOrder); + } + } + + /** + * Get the value for a code point as stored in the Trie2. 
+ * + * @param codePoint the code point + * @return the value + */ + abstract public int get(int codePoint); + + /** + * Get the trie value for a UTF-16 code unit. + * + * A Trie2 stores two distinct values for input in the lead surrogate + * range, one for lead surrogates, which is the value that will be + * returned by this function, and a second value that is returned + * by Trie2.get(). + * + * For code units outside of the lead surrogate range, this function + * returns the same result as Trie2.get(). + * + * This function, together with the alternate value for lead surrogates, + * makes possible very efficient processing of UTF-16 strings without + * first converting surrogate pairs to their corresponding 32 bit code point + * values. + * + * At build-time, enumerate the contents of the Trie2 to see if there + * is non-trivial (non-initialValue) data for any of the supplementary + * code points associated with a lead surrogate. + * If so, then set a special (application-specific) value for the + * lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit(). + * + * At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial + * data and the code unit is a lead surrogate, then check if a trail surrogate + * follows. If so, assemble the supplementary code point and look up its value + * with Trie2.get(); otherwise reset the lead + * surrogate's value or do a code point lookup for it. + * + * If there is only trivial data for lead and trail surrogates, then processing + * can often skip them. For example, in normalization or case mapping + * all characters that do not have any mappings are simply copied as is. + * + * @param c the code point or lead surrogate value. + * @return the value + */ + abstract public int getFromU16SingleLead(char c); + + /** + * When iterating over the contents of a Trie2, Elements of this type are produced. + * The iterator will return one item for each contiguous range of codepoints having the same value. 
+ * + * When iterating, the same Trie2EnumRange object will be reused and returned for each range. + * If you need to retain complete iteration results, clone each returned Trie2EnumRange, + * or save the range in some other way, before advancing to the next iteration step. + */ + public static class Range { + public int startCodePoint; + public int endCodePoint; // Inclusive. + public int value; + public boolean leadSurrogate; + + public boolean equals(Object other) { + if (other == null || !(other.getClass().equals(getClass()))) { + return false; + } + Range tother = (Range)other; + return this.startCodePoint == tother.startCodePoint && + this.endCodePoint == tother.endCodePoint && + this.value == tother.value && + this.leadSurrogate == tother.leadSurrogate; + } + + public int hashCode() { + int h = initHash(); + h = hashUChar32(h, startCodePoint); + h = hashUChar32(h, endCodePoint); + h = hashInt(h, value); + h = hashByte(h, leadSurrogate? 1: 0); + return h; + } + } + + /** + * Create an iterator over the value ranges in this Trie2. + * Values from the Trie2 are not remapped or filtered, but are returned as they + * are stored in the Trie2. + * + * @return an Iterator + */ + public Iterator<Range> iterator() { + return iterator(defaultValueMapper); + } + + private static ValueMapper defaultValueMapper = new ValueMapper() { + public int map(int in) { + return in; + } + }; + + /** + * Create an iterator over the value ranges from this Trie2. + * Values from the Trie2 are passed through a caller-supplied remapping function, + * and it is the remapped values that determine the ranges that + * will be produced by the iterator. + * + * + * @param mapper provides a function to remap values obtained from the Trie2. 
+ * @return an Iterator + */ + public Iterator<Range> iterator(ValueMapper mapper) { + return new Trie2Iterator(mapper); + } + + /** + * When iterating over the contents of a Trie2, an instance of TrieValueMapper may + * be used to remap the values from the Trie2. The remapped values will be used + * both in determining the ranges of codepoints and as the value to be returned + * for each range. + * + * Example of use, with an anonymous subclass of TrieValueMapper: + * + * + * ValueMapper m = new ValueMapper() { + * int map(int in) {return in & 0x1f;}; + * } + * for (Iterator<Trie2EnumRange> iter = trie.iterator(m); i.hasNext(); ) { + * Trie2EnumRange r = i.next(); + * ... // Do something with the range r. + * } + * + */ + public interface ValueMapper { + public int map(int originalVal); + } + + //-------------------------------------------------------------------------------- + // + // Below this point are internal implementation items. No further public API. + // + //-------------------------------------------------------------------------------- + + /** + * Trie2 data structure in serialized form: + * + * UTrie2Header header; + * uint16_t index[header.index2Length]; + * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] + * + * For Java, this is read from the stream into an instance of UTrie2Header. + * (The C version just places a struct over the raw serialized data.) + * + * @internal + */ + static class UTrie2Header { + /** "Tri2" in big-endian US-ASCII (0x54726932) */ + int signature; + + /** + * options bit field (uint16_t): + * 15.. 4 reserved (0) + * 3.. 0 UTrie2ValueBits valueBits + */ + int options; + + /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ + int indexLength; + + /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ + int shiftedDataLength; + + /** Null index and data blocks, not shifted. 
(uint16_t) */ + int index2NullOffset, dataNullOffset; + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) + */ + int shiftedHighStart; + } + + // + // Data members of UTrie2. + // + UTrie2Header header; + char index[]; // Index array. Includes data for 16 bit Tries. + int data16; // Offset to data portion of the index array, if 16 bit data. + // zero if 32 bit data. + int data32[]; // NULL if 16b data is used via index + + int indexLength; + int dataLength; + int index2NullOffset; // 0xffff if there is no dedicated index-2 null block + int initialValue; + + /** Value returned for out-of-range code points and illegal UTF-8. */ + int errorValue; + + /* Start of the last range which ends at U+10ffff, and its value. */ + int highStart; + int highValueIndex; + + int dataNullOffset; + + /** + * Trie2 constants, defining shift widths, index array lengths, etc. + * + * These are needed for the runtime macros but users can treat these as + * implementation details and skip to the actual public API further below. + */ + + static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; + + + /** Shift size for getting the index-1 table offset. */ + static final int UTRIE2_SHIFT_1=6+5; + + /** Shift size for getting the index-2 table offset. */ + static final int UTRIE2_SHIFT_2=5; + + /** + * Difference between the two shift sizes, + * for getting an index-1 offset from an index-2 offset. 6=11-5 + */ + static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; + + /** + * Number of index-1 entries for the BMP. 32=0x20 + * This part of the index-1 table is omitted from the serialized form. + */ + static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; + + /** Number of entries in an index-2 block. 64=0x40 */ + static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2; + + /** Mask for getting the lower bits for the in-index-2-block offset. 
*/ + static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1; + + /** Number of entries in a data block. 32=0x20 */ + static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2; + + /** Mask for getting the lower bits for the in-data-block offset. */ + static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1; + + /** + * Shift size for shifting left the index array values. + * Increases possible data size with 16-bit index values at the cost + * of compactability. + * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY. + */ + static final int UTRIE2_INDEX_SHIFT=2; + + /** The alignment size of a data block. Also the granularity for compaction. */ + static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT; + + /** + * The part of the index-2 table for U+D800..U+DBFF stores values for + * lead surrogate code _units_ not code _points_. + * Values for lead surrogate code _points_ are indexed with this portion of the table. + * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) + */ + static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; + static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; + + /** Count the lengths of both BMP pieces. 2080=0x820 */ + static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; + + /** + * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. + * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. + */ + static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; + static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ + + /** + * The index-1 table, only used for supplementary code points, at offset 2112=0x840. + * Variable length, for code points up to highStart, where the last single-value range starts. + * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. 
+ * (For 0x100000 supplementary code points U+10000..U+10ffff.) + * + * The part of the index-2 table for supplementary code points starts + * after this index-1 table. + * + * Both the index-1 table and the following part of the index-2 table + * are omitted completely if there is only BMP data. + */ + static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; + + /** + * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. + * Used with linear access for single bytes 0..0xbf for simple error handling. + * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. + */ + static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; + + /** + * Implementation class for an iterator over a Trie2. + * + * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, + * then returns the special alternate values for the lead surrogates + * + * @internal + */ + class Trie2Iterator implements Iterator<Range> { + + // The normal constructor that configures the iterator to cover the complete + // contents of the Trie2 + Trie2Iterator(ValueMapper vm) { + mapper = vm; + nextStart = 0; + limitCP = 0x110000; + doLeadSurrogates = true; + } + + /** + * The main next() function for Trie2 iterators + * + */ + public Range next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + if (nextStart >= limitCP) { + // Switch over from iterating normal code point values to + // doing the alternate lead-surrogate values. + doingCodePoints = false; + nextStart = 0xd800; + } + int endOfRange = 0; + int val = 0; + int mappedVal = 0; + + if (doingCodePoints) { + // Iteration over code point values. + val = get(nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEnd(nextStart, limitCP, val); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. 
+ for (;;) { + if (endOfRange >= limitCP-1) { + break; + } + val = get(endOfRange+1); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEnd(endOfRange+1, limitCP, val); + } + } else { + // Iteration over the alternate lead surrogate values. + val = getFromU16SingleLead((char)nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEndLS((char)nextStart); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. + for (;;) { + if (endOfRange >= 0xdbff) { + break; + } + val = getFromU16SingleLead((char)(endOfRange+1)); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEndLS((char)(endOfRange+1)); + } + } + returnValue.startCodePoint = nextStart; + returnValue.endCodePoint = endOfRange; + returnValue.value = mappedVal; + returnValue.leadSurrogate = !doingCodePoints; + nextStart = endOfRange+1; + return returnValue; + } + + /** + * + */ + public boolean hasNext() { + return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; + } + + private int rangeEndLS(char startingLS) { + if (startingLS >= 0xdbff) { + return 0xdbff; + } + + int c; + int val = getFromU16SingleLead(startingLS); + for (c = startingLS+1; c <= 0x0dbff; c++) { + if (getFromU16SingleLead((char)c) != val) { + break; + } + } + return c-1; + } + + // + // Iteration State Variables + // + private ValueMapper mapper; + private Range returnValue = new Range(); + // The starting code point for the next range to be returned. + private int nextStart; + // The upper limit for the last normal range to be returned. Normally 0x110000, but + // may be lower when iterating over the code points for a single lead surrogate. + private int limitCP; + + // True while iterating over the the Trie2 values for code points. + // False while iterating over the alternate values for lead surrogates. 
+ private boolean doingCodePoints = true; + + // True if the iterator should iterate the special values for lead surrogates in + // addition to the normal values for code points. + private boolean doLeadSurrogates = true; + } + + /** + * Find the last character in a contiguous range of characters with the + * same Trie2 value as the input character. + * + * @param c The character to begin with. + * @return The last contiguous character with the same value. + */ + int rangeEnd(int start, int limitp, int val) { + int c; + int limit = Math.min(highStart, limitp); + + for (c = start+1; c < limit; c++) { + if (get(c) != val) { + break; + } + } + if (c >= highStart) { + c = limitp; + } + return c - 1; + } + + + // + // Hashing implementation functions. FNV hash. Respected public domain algorithm. + // + private static int initHash() { + return 0x811c9DC5; // unsigned 2166136261 + } + + private static int hashByte(int h, int b) { + h = h * 16777619; + h = h ^ b; + return h; + } + + private static int hashUChar32(int h, int c) { + h = Trie2.hashByte(h, c & 255); + h = Trie2.hashByte(h, (c>>8) & 255); + h = Trie2.hashByte(h, c>>16); + return h; + } + + private static int hashInt(int h, int i) { + h = Trie2.hashByte(h, i & 255); + h = Trie2.hashByte(h, (i>>8) & 255); + h = Trie2.hashByte(h, (i>>16) & 255); + h = Trie2.hashByte(h, (i>>24) & 255); + return h; + } + +} --- /dev/null 2015-07-13 16:12:16.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/Trie2_16.java 2015-07-13 16:12:16.000000000 +0900 @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package sun.text.normalizer; + +import java.io.IOException; +import java.nio.ByteBuffer; + + +/** + * @author aheninger + * + * A read-only Trie2, holding 16 bit data values. + * + * A Trie2 is a highly optimized data structure for mapping from Unicode + * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value. + * + * See class Trie2 for descriptions of the API for accessing the contents of a trie. + * + * The fundamental data access methods are declared final in this class, with + * the intent that applications might gain a little extra performance, when compared + * with calling the same methods via the abstract UTrie2 base class. + */ +public final class Trie2_16 extends Trie2 { + + /** + * Internal constructor, not for general use. 
+ */ + Trie2_16() { + } + + + /** + * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). + * The serialized format is identical between ICU4C and ICU4J, so this function + * will work with serialized Trie2s from either. + * + * The serialized Trie2 in the bytes may be in either little or big endian byte order. + * This allows using serialized Tries from ICU4C without needing to consider the + * byte order of the system that created them. + * + * @param bytes a byte buffer to the serialized form of a UTrie2. + * @return An unserialized Trie2_16, ready for use. + * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2. + * @throws IOException if a read error occurs in the buffer. + * @throws ClassCastException if the bytes contain a serialized Trie2_32 + */ + public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException { + return (Trie2_16) Trie2.createFromSerialized(bytes); + } + + /** + * Get the value for a code point as stored in the Trie2. + * + * @param codePoint the code point + * @return the value + */ + @Override + public final int get(int codePoint) { + int value; + int ix; + + if (codePoint >= 0) { + if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) { + // Ordinary BMP code point, excluding leading surrogates. + // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index. + // 16 bit data is stored in the index array itself. + ix = index[codePoint >> UTRIE2_SHIFT_2]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + if (codePoint <= 0xffff) { + // Lead Surrogate Code Point. A Separate index section is stored for + // lead surrogate code units and code points. + // The main index has the code unit data. + // For this function, we need the code point data. 
+ // Note: this expression could be refactored for slightly improved efficiency, but + // surrogate code points will be so rare in practice that it's not worth it. + ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + if (codePoint < highStart) { + // Supplemental code point, use two-level lookup. + ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1); + ix = index[ix]; + ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK; + ix = index[ix]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + if (codePoint <= 0x10ffff) { + value = index[highValueIndex]; + return value; + } + } + + // Fall through. The code point is outside of the legal range of 0..0x10ffff. + return errorValue; + } + + + /** + * Get a Trie2 value for a UTF-16 code unit. + * + * This function returns the same value as get() if the input + * character is outside of the lead surrogate range + * + * There are two values stored in a Trie2 for inputs in the lead + * surrogate range. This function returns the alternate value, + * while Trie2.get() returns the main value. + * + * @param codeUnit a 16 bit code unit or lead surrogate value. + * @return the value + */ + @Override + public int getFromU16SingleLead(char codeUnit) { + int value; + int ix; + + // Because the input is a 16 bit char, we can skip the tests for it being in + // the BMP range. It is. 
+ ix = index[codeUnit >> UTRIE2_SHIFT_2]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + + /** + * @return the number of bytes of the serialized trie + */ + public int getSerializedLength() { + return 16+(header.indexLength+dataLength)*2; + } +} --- /dev/null 2015-07-13 16:12:17.000000000 +0900 +++ new/jdk/src/java.base/share/classes/sun/text/normalizer/UnicodeSetStringSpan.java 2015-07-13 16:12:17.000000000 +0900 @@ -0,0 +1,1165 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. 
+ *
+ ******************************************************************************
+ */
+
+package sun.text.normalizer;
+
+import java.util.ArrayList;
+
+import sun.text.normalizer.UnicodeSet.SpanCondition;
+
+/*
+ * Implement span() etc. for a set with strings.
+ * Avoid recursion because of its exponential complexity.
+ * Instead, try multiple paths at once and track them with an OffsetList.
+ */
+class UnicodeSetStringSpan {
+
+    /*
+     * Which span() variant will be used? The object is either built for one variant and used once,
+     * or built for all and may be used many times.
+     *
+     * The values below are bit flags that are OR-ed together into the "which" constructor argument.
+     */
+    public static final int WITH_COUNT = 0x40; // spanAndCount() may be called
+    public static final int FWD = 0x20; // forward spans
+    public static final int BACK = 0x10; // backward spans
+    // public static final int UTF16 = 8;
+    public static final int CONTAINED = 2; // span while contained
+    public static final int NOT_CONTAINED = 1; // span while not contained
+
+    // Requests every variant; the constructor compares "which" against this exact value.
+    public static final int ALL = 0x7f;
+
+    // Ready-made single-variant combinations of direction and condition.
+    public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED;
+    public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED;
+    public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED;
+    public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
+
+    /**
+     * Special spanLength short values (a short is used because Java has no unsigned byte type).
+     * All code points in the string are contained in the parent set.
+     */
+    static final short ALL_CP_CONTAINED = 0xff;
+
+    /** The spanLength is >=0xfe. */
+    static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
+
+    /** Set for span(). Same as parent but without strings. */
+    private UnicodeSet spanSet;
+
+    /**
+     * Set for span(not contained).
+     * Same as spanSet, plus characters that start or end strings.
+     */
+    private UnicodeSet spanNotSet;
+
+    /** The strings of the parent set. */
+    private ArrayList<String> strings;
+
+    /** The lengths of span(), spanBack() etc. for each string. */
+    private short[] spanLengths;
+
+    /** Maximum lengths of relevant strings.
*/
+    private int maxLength16;
+
+    /** Are there strings that are not fully contained in the code point set? */
+    private boolean someRelevant;
+
+    /** Set up for all variants of span()? */
+    private boolean all;
+
+    /** Span helper */
+    private OffsetList offsets;
+
+    /**
+     * Constructs for all variants of span(), or only for any one variant.
+     * Initializes as little as possible, for single use.
+     *
+     * @param set        the parent set; its code points are copied into spanSet via retainAll()
+     * @param setStrings the strings of the parent set; the list is stored by reference, not copied
+     * @param which      bit combination of WITH_COUNT, FWD, BACK, CONTAINED and NOT_CONTAINED
+     *                   selecting the span() variant(s) to prepare for, or ALL for every variant
+     */
+    public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
+        spanSet = new UnicodeSet(0, 0x10ffff);
+        // TODO: With Java 6, just take the parent set's strings as is,
+        // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
+        // Then iterate via the first() and higher() methods.
+        // (We do not want to create multiple Iterator objects in each span().)
+        // See ICU ticket #7454.
+        strings = setStrings;
+        all = (which == ALL);
+        spanSet.retainAll(set);
+        if (0 != (which & NOT_CONTAINED)) {
+            // Default to the same sets.
+            // addToSpanNotSet() will create a separate set if necessary.
+            spanNotSet = spanSet;
+        }
+        offsets = new OffsetList();
+
+        // Determine if the strings even need to be taken into account at all for span() etc.
+        // If any string is relevant, then all strings need to be used for
+        // span(longest match) but only the relevant ones for span(while contained).
+        // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
+        // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
+        // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
+        // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
+        int stringsLength = strings.size();
+
+        // First pass: only find out whether any string is relevant and how long the longest one is.
+        int i, spanLength;
+        someRelevant = false;
+        for (i = 0; i < stringsLength; ++i) {
+            String string = strings.get(i);
+            int length16 = string.length();
+            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
+            if (spanLength < length16) { // Relevant string.
+                someRelevant = true;
+            }
+            if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
+                maxLength16 = length16;
+            }
+        }
+        // Nothing more to set up unless a string is relevant or counting was requested.
+        // Note: spanLengths is not allocated when returning here.
+        if (!someRelevant && (which & WITH_COUNT) == 0) {
+            return;
+        }
+
+        // Freeze after checking for the need to use strings at all because freezing
+        // a set takes some time and memory which are wasted if there are no relevant strings.
+        if (all) {
+            spanSet.freeze();
+        }
+
+        int spanBackLengthsOffset;
+
+        // Allocate a block of meta data.
+        int allocSize;
+        if (all) {
+            // 2 sets of span lengths
+            allocSize = stringsLength * (2);
+        } else {
+            allocSize = stringsLength; // One set of span lengths.
+        }
+        spanLengths = new short[allocSize];
+
+        if (all) {
+            // Store span lengths for all span() variants.
+            // Forward lengths occupy [0, stringsLength), backward lengths follow them.
+            spanBackLengthsOffset = stringsLength;
+        } else {
+            // Store span lengths for only one span() variant.
+            spanBackLengthsOffset = 0;
+        }
+
+        // Set the meta data and spanNotSet and write the UTF-8 strings.
+
+        // Second pass: record per-string span lengths and extend spanNotSet.
+        for (i = 0; i < stringsLength; ++i) {
+            String string = strings.get(i);
+            int length16 = string.length();
+            spanLength = spanSet.span(string, SpanCondition.CONTAINED);
+            if (spanLength < length16) { // Relevant string.
+                if (true /* 0 != (which & UTF16) */) {
+                    if (0 != (which & CONTAINED)) {
+                        if (0 != (which & FWD)) {
+                            spanLengths[i] = makeSpanLengthByte(spanLength);
+                        }
+                        if (0 != (which & BACK)) {
+                            // Length of the trailing part of the string that spanSet covers.
+                            spanLength = length16
+                                    - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
+                            spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
+                        }
+                    } else /* not CONTAINED, not all, but NOT_CONTAINED */{
+                        // Only store a relevant/irrelevant flag.
+                        spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0;
+                    }
+                }
+                if (0 != (which & NOT_CONTAINED)) {
+                    // Add string start and end code points to the spanNotSet so that
+                    // a span(while not contained) stops before any string.
+                    int c;
+                    if (0 != (which & FWD)) {
+                        c = string.codePointAt(0);
+                        addToSpanNotSet(c);
+                    }
+                    if (0 != (which & BACK)) {
+                        c = string.codePointBefore(length16);
+                        addToSpanNotSet(c);
+                    }
+                }
+            } else { // Irrelevant string.
+                if (all) {
+                    spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
+                } else {
+                    // All spanXYZLengths pointers contain the same address.
+                    spanLengths[i] = ALL_CP_CONTAINED;
+                }
+            }
+        }
+
+        // Finish.
+        if (all) {
+            spanNotSet.freeze();
+        }
+    }
+
+    /**
+     * Do the strings need to be checked in span() etc.?
+     *
+     * @return true if strings need to be checked (call span() here),
+     * false if not (use a BMPSet for best performance).
+     */
+    public boolean needsStringSpanUTF16() {
+        return someRelevant;
+    }
+
+    /**
+     * For fast UnicodeSet::contains(c).
+     *
+     * @param c the code point to look up
+     * @return true if spanSet (the code points of the parent set, without strings) contains c
+     */
+    public boolean contains(int c) {
+        return spanSet.contains(c);
+    }
+
+    /**
+     * Adds a starting or ending string character to the spanNotSet
+     * so that a character span ends before any string.
+     *
+     * @param c the first or last code point of a set string
+     */
+    private void addToSpanNotSet(int c) {
+        // While spanNotSet is missing or still shares the spanSet instance,
+        // it must be replaced by a separate copy before it can be modified.
+        if (spanNotSet == null || spanNotSet == spanSet) {
+            if (spanSet.contains(c)) {
+                return; // Nothing to do.
+            }
+            spanNotSet = spanSet.cloneAsThawed();
+        }
+        spanNotSet.add(c);
+    }
+
+    /*
+     * Note: In span() when spanLength==0
+     * (after a string match, or at the beginning after an empty code point span)
+     * and in spanNot() and spanNotUTF8(),
+     * string matching could use a binary search because all string matches are done
+     * from the same start index.
+     *
+     * For UTF-8, this would require a comparison function that returns UTF-16 order.
+     *
+     * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
+     * with strings have very few very short strings. For cases with many strings, it might be better to use a different
+     * API and implementation with a DFA (state machine).
+ */ + + /* + * Algorithm for span(SpanCondition.CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Either recursively span for each code point or string match, or recursively span + * for all but the shortest one and iteratively continue the span with the shortest local match. + * + Remember the longest recursive span (the farthest end point). + * + If there is no match at the current position, + * neither for the code point there nor for any set string, + * then stop and return the longest recursive span length. + * + * Optimized implementation: + * + * (We assume that most sets will have very few very short strings. + * A span using a string-less set is extremely fast.) + * + * Create and cache a spanSet which contains all of the single code points of the original set + * but none of its strings. + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the recursive algorithm would have tried to match them at every position. + * ~ Set strings that entirely consist of set-contained code points + * are irrelevant for span(SpanCondition.CONTAINED) + * because the recursive algorithm would continue after them anyway and + * find the longest recursive match from their end. + * ~ Rather than recursing, note each end point of a set string match. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched after spanSet.span(), + * then pop the shortest string match end point and continue the loop, + * trying to match all set strings from there. 
+ * + If at least one more set string matched after a previous string match, then test if the + * code point after the previous string match is also contained in the set. + * Continue the loop with the shortest end point of + * either this code point or a matching set string. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + * + * By noting each end point of a set string match, the function visits each string position at most once and + * finishes in linear time. + * + * The recursive algorithm may visit the same string position many times + * if multiple paths lead to it and finishes in exponential time. + */ + + /* + * Algorithm for span(SIMPLE) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Continue from the farthest match position and ignore all others. + * + If there is no match at the current position, then stop and return the current position. + * + * Optimized implementation: + * + * (Same assumption and spanSet as above.) + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the standard algorithm would have tried to match them earlier. + * ~ Set strings that entirely consist of set-contained code points + * must be matched with a full overlap because the longest-match algorithm + * would hide set string matches that end earlier. 
+     * Such set strings need not be matched earlier inside the code point span
+     * because the standard algorithm would then have
+     * continued after the set string match anyway.
+     * ~ Remember the longest set string match (farthest end point)
+     * from the earliest starting point.
+     * + If no set string matched after spanSet.span(),
+     * then return with where the spanSet.span() ended.
+     * + If at least one set string matched,
+     * then continue the loop after the longest match from the earliest position.
+     * + If no more set string matched after a previous string match,
+     * then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
+     * Stop if spanLength==0, otherwise continue the loop.
+     */
+    /**
+     * Spans a string.
+     *
+     * @param s The string to be spanned
+     * @param start The start index that the span begins
+     * @param spanCondition The span condition
+     * @return the limit (exclusive end) of the span
+     */
+    public int span(CharSequence s, int start, SpanCondition spanCondition) {
+        if (spanCondition == SpanCondition.NOT_CONTAINED) {
+            return spanNot(s, start, null);
+        }
+        int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
+        // The code point span alone reached the end of s, so no string matching is needed.
+        if (spanLimit == s.length()) {
+            return spanLimit;
+        }
+        return spanWithStrings(s, start, spanLimit, spanCondition);
+    }
+
+    /**
+     * Synchronized method for complicated spans using the offsets.
+     * Avoids synchronization for simple cases.
+     *
+     * @param s The string to be spanned
+     * @param start The start index that the span begins
+     * @param spanLimit = spanSet.span(s, start, CONTAINED)
+     * @param spanCondition The span condition; CONTAINED uses the offset list,
+     *                      any other value takes the SIMPLE (longest match) branch
+     * @return the limit (exclusive end) of the span
+     */
+    private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
+            SpanCondition spanCondition) {
+        // Consider strings; they may overlap with the span.
+        int initSize = 0;
+        if (spanCondition == SpanCondition.CONTAINED) {
+            // Use offset list to try all possibilities.
+            initSize = maxLength16;
+        }
+        offsets.setMaxLength(initSize);
+        int length = s.length();
+        // pos is the current absolute index in s; rest is the number of code units after pos.
+        int pos = spanLimit, rest = length - spanLimit;
+        // Length of the code point span that ends at pos; 0 after a string match.
+        int spanLength = spanLimit - start;
+        int i, stringsLength = strings.size();
+        for (;;) {
+            if (spanCondition == SpanCondition.CONTAINED) {
+                for (i = 0; i < stringsLength; ++i) {
+                    int overlap = spanLengths[i];
+                    if (overlap == ALL_CP_CONTAINED) {
+                        continue; // Irrelevant string.
+                    }
+                    String string = strings.get(i);
+
+                    int length16 = string.length();
+
+                    // Try to match this string at pos-overlap..pos.
+                    if (overlap >= LONG_SPAN) {
+                        overlap = length16;
+                        // While contained: No point matching fully inside the code point span.
+                        // Length of the string minus the last code point.
+                        overlap = string.offsetByCodePoints(overlap, -1);
+                    }
+                    if (overlap > spanLength) {
+                        overlap = spanLength;
+                    }
+                    int inc = length16 - overlap; // Keep overlap+inc==length16.
+                    for (;;) {
+                        if (inc > rest) {
+                            break;
+                        }
+                        // Try to match if the increment is not listed already.
+                        if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
+                            if (inc == rest) {
+                                return length; // Reached the end of the string.
+                            }
+                            offsets.addOffset(inc);
+                        }
+                        if (overlap == 0) {
+                            break;
+                        }
+                        --overlap;
+                        ++inc;
+                    }
+                }
+            } else /* SIMPLE */{
+                int maxInc = 0, maxOverlap = 0;
+                for (i = 0; i < stringsLength; ++i) {
+                    int overlap = spanLengths[i];
+                    // For longest match, we do need to try to match even an all-contained string
+                    // to find the match from the earliest start.
+
+                    String string = strings.get(i);
+
+                    int length16 = string.length();
+
+                    // Try to match this string at pos-overlap..pos.
+                    if (overlap >= LONG_SPAN) {
+                        overlap = length16;
+                        // Longest match: Need to match fully inside the code point span
+                        // to find the match from the earliest start.
+                    }
+                    if (overlap > spanLength) {
+                        overlap = spanLength;
+                    }
+                    int inc = length16 - overlap; // Keep overlap+inc==length16.
+                    for (;;) {
+                        // Also ends the loop once overlap has dropped below the best overlap so far
+                        // (including below zero, since maxOverlap is never negative).
+                        if (inc > rest || overlap < maxOverlap) {
+                            break;
+                        }
+                        // Try to match if the string is longer or starts earlier.
+                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
+                                && matches16CPB(s, pos - overlap, length, string, length16)) {
+                            maxInc = inc; // Longest match from earliest start.
+                            maxOverlap = overlap;
+                            break;
+                        }
+                        --overlap;
+                        ++inc;
+                    }
+                }
+
+                if (maxInc != 0 || maxOverlap != 0) {
+                    // Longest-match algorithm, and there was a string match.
+                    // Simply continue after it.
+                    pos += maxInc;
+                    rest -= maxInc;
+                    if (rest == 0) {
+                        return length; // Reached the end of the string.
+                    }
+                    spanLength = 0; // Match strings from after a string match.
+                    continue;
+                }
+            }
+            // Finished trying to match all strings at pos.
+
+            if (spanLength != 0 || pos == 0) {
+                // The position is after an unlimited code point span (spanLength!=0),
+                // not after a string match.
+                // The only position where spanLength==0 after a span is pos==0.
+                // Otherwise, an unlimited code point span is only tried again when no
+                // strings match, and if such a non-initial span fails we stop.
+                if (offsets.isEmpty()) {
+                    return pos; // No strings matched after a span.
+                }
+                // Match strings from after the next string match.
+            } else {
+                // The position is after a string match (or a single code point).
+                if (offsets.isEmpty()) {
+                    // No more strings matched after a previous string match.
+                    // Try another code point span from after the last string match.
+                    spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
+                    spanLength = spanLimit - pos;
+                    if (spanLength == rest || // Reached the end of the string, or
+                            spanLength == 0 // neither strings nor span progressed.
+                    ) {
+                        return spanLimit;
+                    }
+                    pos += spanLength;
+                    rest -= spanLength;
+                    continue; // spanLength>0: Match strings from after a span.
+                } else {
+                    // Try to match only one code point from after a string match if some
+                    // string matched beyond it, so that we try all possible positions
+                    // and don't overshoot.
+                    spanLength = spanOne(spanSet, s, pos, rest);
+                    if (spanLength > 0) {
+                        if (spanLength == rest) {
+                            return length; // Reached the end of the string.
+                        }
+                        // Match strings after this code point.
+                        // There cannot be any increments below it because UnicodeSet strings
+                        // contain multiple code points.
+                        pos += spanLength;
+                        rest -= spanLength;
+                        offsets.shift(spanLength);
+                        spanLength = 0;
+                        continue; // Match strings from after a single code point.
+                    }
+                    // Match strings from after the next string match.
+                }
+            }
+            // Advance to the nearest recorded string match end and try again from there.
+            int minOffset = offsets.popMinimum(null);
+            pos += minOffset;
+            rest -= minOffset;
+            spanLength = 0; // Match strings from after a string match.
+        }
+    }
+
+    /**
+     * Spans a string and counts the smallest number of set elements on any path across the span.
+     *
+     * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
+     *
+     * <p>If the set does not have any fully-contained strings, then we could optimize this
+     * like span(), but such sets are likely rare, and this is at least still linear.
+     *
+     * @param s The string to be spanned
+     * @param start The start index that the span begins
+     * @param spanCondition The span condition
+     * @param outCount Receives the count in its value field
+     * @return the limit (exclusive end) of the span
+     */
+    public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
+            OutputInt outCount) {
+        if (spanCondition == SpanCondition.NOT_CONTAINED) {
+            return spanNot(s, start, outCount);
+        }
+        // Consider strings; they may overlap with the span,
+        // and they may result in a smaller count than with just code points.
+        if (spanCondition == SpanCondition.CONTAINED) {
+            return spanContainedAndCount(s, start, outCount);
+        }
+        // SIMPLE (not synchronized, does not use offsets)
+        int stringsLength = strings.size();
+        int length = s.length();
+        int pos = start;
+        int rest = length - start;
+        int count = 0;
+        while (rest != 0) {
+            // Try to match the next code point.
+            int cpLength = spanOne(spanSet, s, pos, rest);
+            // spanOne() returns a negative length when the code point is not in the set.
+            int maxInc = (cpLength > 0) ? cpLength : 0;
+            // Try to match all of the strings.
+            for (int i = 0; i < stringsLength; ++i) {
+                String string = strings.get(i);
+                int length16 = string.length();
+                if (maxInc < length16 && length16 <= rest &&
+                        matches16CPB(s, pos, length, string, length16)) {
+                    maxInc = length16;
+                }
+            }
+            // We are done if there is no match beyond pos.
+            if (maxInc == 0) {
+                outCount.value = count;
+                return pos;
+            }
+            // Continue from the longest match.
+            ++count;
+            pos += maxInc;
+            rest -= maxInc;
+        }
+        outCount.value = count;
+        return pos;
+    }
+
+    /**
+     * Counting variant of span(CONTAINED): at each position records the end offset of the
+     * next code point and of every matching string in the offset list, each with the
+     * element count of the current path plus one, then continues from the nearest offset.
+     *
+     * @param s The string to be spanned
+     * @param start The start index that the span begins
+     * @param outCount Receives the count in its value field
+     * @return the limit (exclusive end) of the span
+     */
+    private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
+        // Use offset list to try all possibilities.
+        offsets.setMaxLength(maxLength16);
+        int stringsLength = strings.size();
+        int length = s.length();
+        int pos = start;
+        int rest = length - start;
+        int count = 0;
+        while (rest != 0) {
+            // Try to match the next code point.
+            int cpLength = spanOne(spanSet, s, pos, rest);
+            if (cpLength > 0) {
+                offsets.addOffsetAndCount(cpLength, count + 1);
+            }
+            // Try to match all of the strings.
+            for (int i = 0; i < stringsLength; ++i) {
+                String string = strings.get(i);
+                int length16 = string.length();
+                // Note: If the strings were sorted by length, then we could also
+                // avoid trying to match if there is already a match of the same length.
+                if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
+                        matches16CPB(s, pos, length, string, length16)) {
+                    offsets.addOffsetAndCount(length16, count + 1);
+                }
+            }
+            // We are done if there is no match beyond pos.
+            if (offsets.isEmpty()) {
+                outCount.value = count;
+                return pos;
+            }
+            // Continue from the nearest match.
+            // popMinimum() also stores the count for that offset into outCount.
+            int minOffset = offsets.popMinimum(outCount);
+            count = outCount.value;
+            pos += minOffset;
+            rest -= minOffset;
+        }
+        outCount.value = count;
+        return pos;
+    }
+
+    /**
+     * Span a string backwards.
+     *
+     * @param s The string to be spanned
+     * @param length The index in s at which the backward span begins (exclusive end of the span)
+     * @param spanCondition The span condition
+     * @return The string index which starts the span (i.e. inclusive).
+     */
+    public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
+        if (spanCondition == SpanCondition.NOT_CONTAINED) {
+            return spanNotBack(s, length);
+        }
+        int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
+        if (pos == 0) {
+            return 0;
+        }
+        int spanLength = length - pos;
+
+        // Consider strings; they may overlap with the span.
+        int initSize = 0;
+        if (spanCondition == SpanCondition.CONTAINED) {
+            // Use offset list to try all possibilities.
+            initSize = maxLength16;
+        }
+        offsets.setMaxLength(initSize);
+        int i, stringsLength = strings.size();
+        // When built for all variants, the backward span lengths are stored after the forward ones.
+        int spanBackLengthsOffset = 0;
+        if (all) {
+            spanBackLengthsOffset = stringsLength;
+        }
+        for (;;) {
+            if (spanCondition == SpanCondition.CONTAINED) {
+                for (i = 0; i < stringsLength; ++i) {
+                    int overlap = spanLengths[spanBackLengthsOffset + i];
+                    if (overlap == ALL_CP_CONTAINED) {
+                        continue; // Irrelevant string.
+                    }
+                    String string = strings.get(i);
+
+                    int length16 = string.length();
+
+                    // Try to match this string at pos-(length16-overlap)..pos-length16.
+                    if (overlap >= LONG_SPAN) {
+                        overlap = length16;
+                        // While contained: No point matching fully inside the code point span.
+                        int len1 = 0;
+                        len1 = string.offsetByCodePoints(0, 1);
+                        overlap -= len1; // Length of the string minus the first code point.
+                    }
+                    if (overlap > spanLength) {
+                        overlap = spanLength;
+                    }
+                    int dec = length16 - overlap; // Keep dec+overlap==length16.
+                    for (;;) {
+                        if (dec > pos) {
+                            break;
+                        }
+                        // Try to match if the decrement is not listed already.
+                        if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
+                            if (dec == pos) {
+                                return 0; // Reached the start of the string.
+                            }
+                            offsets.addOffset(dec);
+                        }
+                        if (overlap == 0) {
+                            break;
+                        }
+                        --overlap;
+                        ++dec;
+                    }
+                }
+            } else /* SIMPLE */{
+                int maxDec = 0, maxOverlap = 0;
+                for (i = 0; i < stringsLength; ++i) {
+                    int overlap = spanLengths[spanBackLengthsOffset + i];
+                    // For longest match, we do need to try to match even an all-contained string
+                    // to find the match from the latest end.
+
+                    String string = strings.get(i);
+
+                    int length16 = string.length();
+
+                    // Try to match this string at pos-(length16-overlap)..pos-length16.
+                    if (overlap >= LONG_SPAN) {
+                        overlap = length16;
+                        // Longest match: Need to match fully inside the code point span
+                        // to find the match from the latest end.
+                    }
+                    if (overlap > spanLength) {
+                        overlap = spanLength;
+                    }
+                    int dec = length16 - overlap; // Keep dec+overlap==length16.
+                    for (;;) {
+                        if (dec > pos || overlap < maxOverlap) {
+                            break;
+                        }
+                        // Try to match if the string is longer or ends later.
+                        if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
+                                && matches16CPB(s, pos - dec, length, string, length16)) {
+                            maxDec = dec; // Longest match from latest end.
+                            maxOverlap = overlap;
+                            break;
+                        }
+                        --overlap;
+                        ++dec;
+                    }
+                }
+
+                if (maxDec != 0 || maxOverlap != 0) {
+                    // Longest-match algorithm, and there was a string match.
+                    // Simply continue before it.
+                    pos -= maxDec;
+                    if (pos == 0) {
+                        return 0; // Reached the start of the string.
+                    }
+                    spanLength = 0; // Match strings from before a string match.
+                    continue;
+                }
+            }
+            // Finished trying to match all strings at pos.
+
+            if (spanLength != 0 || pos == length) {
+                // The position is before an unlimited code point span (spanLength!=0),
+                // not before a string match.
+                // The only position where spanLength==0 before a span is pos==length.
+                // Otherwise, an unlimited code point span is only tried again when no
+                // strings match, and if such a non-initial span fails we stop.
+                if (offsets.isEmpty()) {
+                    return pos; // No strings matched before a span.
+                }
+                // Match strings from before the next string match.
+            } else {
+                // The position is before a string match (or a single code point).
+                if (offsets.isEmpty()) {
+                    // No more strings matched before a previous string match.
+                    // Try another code point span from before the last string match.
+                    int oldPos = pos;
+                    pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
+                    spanLength = oldPos - pos;
+                    if (pos == 0 || // Reached the start of the string, or
+                            spanLength == 0 // neither strings nor span progressed.
+                    ) {
+                        return pos;
+                    }
+                    continue; // spanLength>0: Match strings from before a span.
+                } else {
+                    // Try to match only one code point from before a string match if some
+                    // string matched beyond it, so that we try all possible positions
+                    // and don't overshoot.
+                    spanLength = spanOneBack(spanSet, s, pos);
+                    if (spanLength > 0) {
+                        if (spanLength == pos) {
+                            return 0; // Reached the start of the string.
+                        }
+                        // Match strings before this code point.
+                        // There cannot be any decrements below it because UnicodeSet strings
+                        // contain multiple code points.
+                        pos -= spanLength;
+                        offsets.shift(spanLength);
+                        spanLength = 0;
+                        continue; // Match strings from before a single code point.
+                    }
+                    // Match strings from before the next string match.
+                }
+            }
+            // Step back to the nearest recorded string match start and try again from there.
+            pos -= offsets.popMinimum(null);
+            spanLength = 0; // Match strings from before a string match.
+        }
+    }
+
+    /**
+     * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
+     *
+     * Theoretical algorithm:
+     * - Iterate through the string, and at each code point boundary:
+     * + If the code point there is in the set, then return with the current position.
+ * + If a set string matches at the current position, then return with the current position. + * + * Optimized implementation: + * + * (Same assumption as for span() above.) + * + * Create and cache a spanNotSet which contains + * all of the single code points of the original set but none of its strings. + * For each set string add its initial code point to the spanNotSet. + * (Also add its final code point for spanNotBack().) + * + * - Loop: + * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). + * + If the current code point is in the original set, then return the current position. + * + If any set string matches at the current position, then return the current position. + * + If there is no match at the current position, neither for the code point + * there nor for any set string, then skip this code point and continue the loop. + * This happens for set-string-initial code points that were added to spanNotSet + * when there is not actually a match for such a set string. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param outCount If not null: Receives the number of code points across the span. + * @return the limit (exclusive end) of the span + */ + private int spanNot(CharSequence s, int start, OutputInt outCount) { + int length = s.length(); + int pos = start, rest = length - start; + int stringsLength = strings.size(); + int count = 0; + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + int spanLimit; + if (outCount == null) { + spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); + } else { + spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); + outCount.value = count = count + outCount.value; + } + if (spanLimit == length) { + return length; // Reached the end of the string. 
+ } + pos = spanLimit; + rest = length - spanLimit; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (int i = 0; i < stringsLength; ++i) { + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos -= cpLength; + rest += cpLength; + ++count; + } while (rest != 0); + if (outCount != null) { + outCount.value = count; + } + return length; // Reached the end of the string. + } + + private int spanNotBack(CharSequence s, int length) { + int pos = length; + int i, stringsLength = strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); + if (pos == 0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOneBack(spanSet, s, pos); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (i = 0; i < stringsLength; ++i) { + // Use spanLengths rather than a spanLengths pointer because + // it is easier and we only need to know whether the string is irrelevant + // which is the same in either array. + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. 
+ } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos += cpLength; + } while (pos != 0); + return 0; // Reached the start of the string. + } + + static short makeSpanLengthByte(int spanLength) { + // 0xfe==UnicodeSetStringSpan::LONG_SPAN + return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; + } + + // Compare strings without any argument checks. Requires length>0. + private static boolean matches16(CharSequence s, int start, final String t, int length) { + int end = start + length; + while (length-- > 0) { + if (s.charAt(--end) != t.charAt(length)) { + return false; + } + } + return true; + } + + /** + * Compare 16-bit Unicode strings (which may be malformed UTF-16) + * at code point boundaries. + * That is, each edge of a match must not be in the middle of a surrogate pair. + * @param s The string to match in. + * @param start The start index of s. + * @param limit The limit of the subsequence of s being spanned. + * @param t The substring to be matched in s. + * @param tlength The length of t. + */ + static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { + return matches16(s, start, t, tlength) + && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && + Character.isLowSurrogate(s.charAt(start))) + && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && + Character.isLowSurrogate(s.charAt(start + tlength))); + } + + /** + * Does the set contain the next code point? + * If so, return its length; otherwise return its negative length. 
+     */
+    static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
+        char c = s.charAt(start);
+        if (c >= 0xd800 && c <= 0xdbff && length >= 2) { // c is in the lead (high) surrogate range
+            char c2 = s.charAt(start + 1);
+            if (UTF16.isTrailSurrogate(c2)) {
+                int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
+                return set.contains(supplementary) ? 2 : -2; // surrogate pair: two code units
+            }
+        }
+        return set.contains(c) ? 1 : -1; // single code unit, which may be an unpaired surrogate
+    }
+
+    static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { // tests the code point that ends at index length
+        char c = s.charAt(length - 1);
+        if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { // c is in the trail (low) surrogate range
+            char c2 = s.charAt(length - 2);
+            if (UTF16.isLeadSurrogate(c2)) {
+                int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
+                return set.contains(supplementary) ? 2 : -2; // surrogate pair: two code units
+            }
+        }
+        return set.contains(c) ? 1 : -1; // single code unit, which may be an unpaired surrogate
+    }
+
+    /**
+     * Helper class for UnicodeSetStringSpan.
+     *
+     * <p>List of offsets from the current position from where to try matching
+     * a code point or a string.
+     * Stores offsets rather than indexes to simplify the code and use the same list
+     * for both increments (in span()) and decrements (in spanBack()).
+     *
+     * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
+     * are relatively dense, that is,
+     * there are normally no gaps of hundreds or thousands of offset values.
+     *
+     * <p>This class optionally also tracks the minimum non-negative count for each position,
+     * intended to count the smallest number of elements of any path leading to that position.
+     *
+     * <p>The implementation uses a circular buffer of count integers,
+     * each indicating whether the corresponding offset is in the list,
+     * and its path element count.
+     * This avoids inserting into a sorted list of offsets (or absolute indexes)
+     * and physically moving part of the list.
+     *
+     * <p>Note: In principle, the caller should setMaxLength() to
+     * the maximum of the max string length and U16_LENGTH/U8_LENGTH
+     * to account for "long" single code points.
+     *
+     * <p>Note: An earlier version did not track counts and stored only byte flags.
+     * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
+     * the list could be stored as bit flags in a single integer.
+     * Rather than handling a circular buffer with a start list index,
+     * the integer would simply be shifted when lower offsets are removed.
+     * UnicodeSet does not have a limit on the lengths of strings.
+     */
+    private static final class OffsetList {
+        private int[] list; // circular buffer: 0 means the offset is absent, non-zero is its count
+        private int length; // number of offsets currently stored
+        private int start; // buffer index that stands for offset 0
+
+        public OffsetList() {
+            list = new int[16]; // default size
+        }
+
+        public void setMaxLength(int maxLength) {
+            if (maxLength > list.length) { // the buffer only ever grows
+                list = new int[maxLength];
+            }
+            clear();
+        }
+
+        public void clear() {
+            for (int i = list.length; i-- > 0;) {
+                list[i] = 0;
+            }
+            start = length = 0;
+        }
+
+        public boolean isEmpty() {
+            return (length == 0);
+        }
+
+        /**
+         * Reduces all stored offsets by delta, used when the current position moves by delta.
+         * There must not be any offsets lower than delta.
+         * If there is an offset equal to delta, it is removed.
+         *
+         * @param delta [1..maxLength]
+         */
+        public void shift(int delta) {
+            int i = start + delta;
+            if (i >= list.length) { // wrap around the end of the buffer
+                i -= list.length;
+            }
+            if (list[i] != 0) {
+                list[i] = 0;
+                --length;
+            }
+            start = i;
+        }
+
+        /**
+         * Adds an offset. The list must not contain it yet.
+         * @param offset [1..maxLength]
+         */
+        public void addOffset(int offset) {
+            int i = start + offset;
+            if (i >= list.length) {
+                i -= list.length;
+            }
+            assert list[i] == 0;
+            list[i] = 1; // any non-zero value marks the offset as present
+            ++length;
+        }
+
+        /**
+         * Adds an offset and updates its count.
+         * The list may already contain the offset.
+         * @param offset [1..maxLength]
+         */
+        public void addOffsetAndCount(int offset, int count) {
+            assert count > 0;
+            int i = start + offset;
+            if (i >= list.length) {
+                i -= list.length;
+            }
+            if (list[i] == 0) {
+                list[i] = count;
+                ++length;
+            } else if (count < list[i]) { // keep the smaller of the stored and the new count
+                list[i] = count;
+            }
+        }
+
+        /** Tells whether the given offset is currently stored (its buffer entry is non-zero).
+         * @param offset [1..maxLength]
+         */
+        public boolean containsOffset(int offset) {
+            int i = start + offset;
+            if (i >= list.length) {
+                i -= list.length;
+            }
+            return list[i] != 0;
+        }
+
+        /** Tells whether the given offset is stored with a count that is no greater than the given count.
+         * @param offset [1..maxLength]
+         */
+        public boolean hasCountAtOffset(int offset, int count) {
+            int i = start + offset;
+            if (i >= list.length) {
+                i -= list.length;
+            }
+            int oldCount = list[i];
+            return oldCount != 0 && oldCount <= count;
+        }
+
+        /**
+         * Finds the lowest stored offset from a non-empty list, removes it,
+         * and reduces all other offsets by this minimum.
+         * @return min=[1..maxLength]
+         */
+        public int popMinimum(OutputInt outCount) {
+            // Look for the next offset in list[start+1..list.length-1].
+            int i = start, result;
+            while (++i < list.length) {
+                int count = list[i];
+                if (count != 0) {
+                    list[i] = 0;
+                    --length;
+                    result = i - start;
+                    start = i;
+                    if (outCount != null) { outCount.value = count; }
+                    return result;
+                }
+            }
+            // i==list.length
+
+            // Wrap around and look for the next offset in list[0..start].
+            // Since the list is not empty, there will be one.
+            result = list.length - start;
+            i = 0;
+            int count;
+            while ((count = list[i]) == 0) {
+                ++i;
+            }
+            list[i] = 0;
+            --length;
+            start = i;
+            if (outCount != null) { outCount.value = count; }
+            return result + i;
+        }
+    }
+}
Binary files /dev/null and new/jdk/src/java.base/share/classes/sun/text/resources/nfc.icu differ
Binary files /dev/null and new/jdk/src/java.base/share/classes/sun/text/resources/nfkc.icu differ
Binary files /dev/null and new/jdk/src/java.base/share/classes/sun/text/resources/nfkc_cf.icu differ
--- /dev/null 2015-07-13 16:12:20.000000000 +0900
+++ new/jdk/test/java/text/BreakIterator/Bug8032446.java 2015-07-13 16:12:19.000000000 +0900
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ * @test
+ * @bug 8032446
+ * @summary Confirm that BreakIterator works as expected with new characters in Unicode 7.
+ */
+
+import java.text.*;
+import java.util.*;
+
+public class Bug8032446 {
+
+    public static void main(String[] args) {
+        boolean err = false; // NOTE(review): assigned here but never read anywhere in main
+
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0x10860; i <= 0x10876; i++) { // Palmyrene Letters
+            sb.append(Character.toChars(i));
+        }
+        sb.append(" ");
+        for (int i = 0x10879; i <= 0x1087D; i++) { // Palmyrene Numbers
+            sb.append(Character.toChars(i));
+        }
+        String s = sb.toString();
+
+        BreakIterator bi = BreakIterator.getWordInstance(Locale.ROOT);
+        bi.setText(s);
+        bi.first();
+
+        if (bi.next() != s.indexOf(' ')) { // the first boundary after the start must fall at the index of the space
+            throw new RuntimeException("Unexpected word breaking.");
+        }
+    }
+
+}