--- old/src/java.base/share/classes/java/net/IDN.java 2020-01-10 15:57:02.000000000 -0800 +++ new/src/java.base/share/classes/java/net/IDN.java 2020-01-10 15:57:02.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,9 +29,9 @@ import java.security.AccessController; import java.security.PrivilegedAction; -import sun.net.idn.StringPrep; -import sun.net.idn.Punycode; -import sun.text.normalizer.UCharacterIterator; +import jdk.internal.icu.impl.Punycode; +import jdk.internal.icu.text.StringPrep; +import jdk.internal.icu.text.UCharacterIterator; /** * Provides methods to convert internationalized domain names (IDNs) between @@ -226,7 +226,7 @@ InputStream stream = null; try { - final String IDN_PROFILE = "uidna.spp"; + final String IDN_PROFILE = "/sun/net/idn/uidna.spp"; if (System.getSecurityManager() != null) { stream = AccessController.doPrivileged(new PrivilegedAction<>() { public InputStream run() { --- old/src/java.base/share/classes/java/text/Bidi.java 2020-01-10 15:57:03.000000000 -0800 +++ new/src/java.base/share/classes/java/text/Bidi.java 2020-01-10 15:57:03.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -35,7 +35,7 @@ package java.text; -import sun.text.bidi.BidiBase; +import jdk.internal.icu.text.BidiBase; /** * This class implements the Unicode Bidirectional Algorithm. 
--- old/src/java.base/share/classes/java/text/CollationElementIterator.java 2020-01-10 15:57:04.000000000 -0800 +++ new/src/java.base/share/classes/java/text/CollationElementIterator.java 2020-01-10 15:57:04.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -41,7 +41,7 @@ import java.lang.Character; import java.util.Vector; import sun.text.CollatorUtilities; -import sun.text.normalizer.NormalizerBase; +import jdk.internal.icu.text.NormalizerBase; /** * The {@code CollationElementIterator} class is used as an iterator --- old/src/java.base/share/classes/java/text/Normalizer.java 2020-01-10 15:57:05.000000000 -0800 +++ new/src/java.base/share/classes/java/text/Normalizer.java 2020-01-10 15:57:05.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -37,7 +37,7 @@ package java.text; -import sun.text.normalizer.NormalizerBase; +import jdk.internal.icu.text.NormalizerBase; /** * This class provides the method {@code normalize} which transforms Unicode --- old/src/java.base/share/classes/java/text/RBTableBuilder.java 2020-01-10 15:57:06.000000000 -0800 +++ new/src/java.base/share/classes/java/text/RBTableBuilder.java 2020-01-10 15:57:06.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -42,8 +42,7 @@ import sun.text.UCompactIntArray; import sun.text.IntHashtable; import sun.text.ComposedCharIter; -import sun.text.CollatorUtilities; -import sun.text.normalizer.NormalizerImpl; +import jdk.internal.icu.impl.NormalizerImpl; /** * This class contains all the code to parse a RuleBasedCollator pattern --- old/src/java.base/share/classes/sun/text/CollatorUtilities.java 2020-01-10 15:57:08.000000000 -0800 +++ new/src/java.base/share/classes/sun/text/CollatorUtilities.java 2020-01-10 15:57:07.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2005, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,7 +25,7 @@ package sun.text; -import sun.text.normalizer.NormalizerBase; +import jdk.internal.icu.text.NormalizerBase; public class CollatorUtilities { --- old/src/java.base/share/classes/sun/text/ComposedCharIter.java 2020-01-10 15:57:09.000000000 -0800 +++ new/src/java.base/share/classes/sun/text/ComposedCharIter.java 2020-01-10 15:57:09.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -25,8 +25,8 @@ package sun.text; -import sun.text.normalizer.NormalizerBase; -import sun.text.normalizer.NormalizerImpl; +import jdk.internal.icu.impl.NormalizerImpl; +import jdk.internal.icu.text.NormalizerBase; public final class ComposedCharIter { /** --- old/src/java.base/share/classes/sun/text/Normalizer.java 2020-01-10 15:57:10.000000000 -0800 +++ new/src/java.base/share/classes/sun/text/Normalizer.java 2020-01-10 15:57:10.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -25,8 +25,8 @@ package sun.text; -import sun.text.normalizer.NormalizerBase; -import sun.text.normalizer.UCharacter; +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.text.NormalizerBase; /** * This Normalizer is for Unicode 3.2 support for IDNA only. --- old/test/jdk/java/text/Bidi/Bug6850113.java 2020-01-10 15:57:11.000000000 -0800 +++ new/test/jdk/java/text/Bidi/Bug6850113.java 2020-01-10 15:57:11.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,14 +22,14 @@ */ /* * @test - * @bug 6850113 + * @bug 6850113 8174270 * @summary Verify the return value of digit() for some digits. 
- * @modules java.base/sun.text.normalizer + * @modules java.base/jdk.internal.icu.lang * @compile -XDignore.symbol.file=true Bug6850113.java * @run main Bug6850113 */ -import sun.text.normalizer.UCharacter; +import jdk.internal.icu.lang.UCharacter; public class Bug6850113 { --- old/test/jdk/java/text/Bidi/Bug7051769.java 2020-01-10 15:57:12.000000000 -0800 +++ new/test/jdk/java/text/Bidi/Bug7051769.java 2020-01-10 15:57:12.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,7 +23,7 @@ /* * @test - * @bug 7051769 8038092 + * @bug 7051769 8038092 8174270 * @summary verify that Bidi.toString() returns the corect result. * The second run is intended to test lazy SharedSectets init for 8038092 * @modules java.desktop @@ -40,7 +40,7 @@ if (System.getProperty("preloadBidi", "").equals("true")) { // Make sure the SharedSecret is lazily initialized correctly try { - Class.forName("sun.text.bidi.BidiBase"); + Class.forName("jdk.internal.icu.text.BidiBase"); System.out.println("BidiBase class has been pre-loaded."); } catch (ClassNotFoundException e) { System.out.println("BidiBase class could not be pre-loaded."); @@ -68,7 +68,7 @@ TextAttribute.RUN_DIRECTION_RTL); String text = "\u0623\u0643\u062a\u0648\u0628\u0631 10"; - String expected = "sun.text.bidi.BidiBase[dir: 2 baselevel: 1 length: 9 runs: [1 1 1 1 1 1 1 2 2] text: [0x623 0x643 0x62a 0x648 0x628 0x631 0x20 0x661 0x660]]"; + String expected = "jdk.internal.icu.text.BidiBase[dir: 2 baselevel: 1 length: 9 runs: [1 1 1 1 1 1 1 2 2] text: [0x623 0x643 0x62a 0x648 0x628 0x631 0x20 0x661 0x660]]"; AttributedString as = new AttributedString(text, attrNS); AttributedCharacterIterator itr = as.getIterator(); --- 
old/test/jdk/java/text/Normalizer/ConformanceTest.java 2020-01-10 15:57:14.000000000 -0800 +++ new/test/jdk/java/text/Normalizer/ConformanceTest.java 2020-01-10 15:57:13.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -22,10 +22,10 @@ */ /* * @test - * @bug 4221795 6565620 6959267 7070436 7198195 8032446 8221431 + * @bug 4221795 6565620 6959267 7070436 7198195 8032446 8174270 8221431 * @summary Confirm Normalizer's fundamental behavior * @library /lib/testlibrary/java/lang - * @modules java.base/sun.text java.base/sun.text.normalizer + * @modules java.base/sun.text java.base/jdk.internal.icu.text * @compile -XDignore.symbol.file ConformanceTest.java * @run main/timeout=3000 ConformanceTest */ @@ -34,14 +34,12 @@ import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; -import java.lang.reflect.Method; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.util.BitSet; import java.util.StringTokenizer; -import sun.text.normalizer.NormalizerBase; -import sun.text.normalizer.NormalizerImpl; +import jdk.internal.icu.text.NormalizerBase; /* * Conformance test for java.text.Normalizer and sun.text.Normalizer. --- old/test/jdk/java/text/Normalizer/ICUBasicTest.java 2020-01-10 15:57:15.000000000 -0800 +++ new/test/jdk/java/text/Normalizer/ICUBasicTest.java 2020-01-10 15:57:15.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -22,10 +22,10 @@ */ /* * @test - * @bug 4221795 8032446 + * @bug 4221795 8032446 8174270 * @summary Confirm Normalizer's fundamental behavior. Imported from ICU4J 3.2's * src/com/ibm/icu/dev/test and modified. - * @modules java.base/sun.text java.base/sun.text.normalizer + * @modules java.base/sun.text java.base/jdk.internal.icu.text * @library /java/text/testlib * @compile -XDignore.symbol.file ICUBasicTest.java * @run main/timeout=30 ICUBasicTest @@ -39,11 +39,9 @@ */ import sun.text.Normalizer; -import sun.text.normalizer.NormalizerBase; -import sun.text.normalizer.NormalizerImpl; +import jdk.internal.icu.text.NormalizerBase; import static java.text.Normalizer.Form.*; -import static sun.text.normalizer.NormalizerBase.Mode.*; public class ICUBasicTest extends IntlTest { --- old/test/jdk/java/text/Normalizer/NormalizerAPITest.java 2020-01-10 15:57:16.000000000 -0800 +++ new/test/jdk/java/text/Normalizer/NormalizerAPITest.java 2020-01-10 15:57:16.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,9 +23,9 @@ /* * @test - * @bug 4221795 + * @bug 4221795 8174270 * @summary Confirm Normalizer's fundamental behavior - * @modules java.base/sun.text java.base/sun.text.normalizer + * @modules java.base/sun.text java.base/jdk.internal.icu.text * @library /java/text/testlib * @compile -XDignore.symbol.file NormalizerAPITest.java * @run main/timeout=30 NormalizerAPITest @@ -64,8 +64,8 @@ static final int[] options = { 0x00, sun.text.Normalizer.UNICODE_3_2, - sun.text.normalizer.NormalizerBase.UNICODE_3_2, - sun.text.normalizer.NormalizerBase.UNICODE_LATEST, + jdk.internal.icu.text.NormalizerBase.UNICODE_3_2, + jdk.internal.icu.text.NormalizerBase.UNICODE_LATEST, }; static final String nonNullStr = "testdata"; @@ -319,7 +319,7 @@ in.getClass().getSimpleName() + ") failed."); } out = sun.text.Normalizer.normalize(in, NFD, - sun.text.normalizer.NormalizerBase.UNICODE_LATEST); + jdk.internal.icu.text.NormalizerBase.UNICODE_LATEST); if (!out.equals(expected.toString())) { errln("sun.text.Normalizer.normalize(" + in.getClass().getSimpleName() + ") failed."); @@ -330,7 +330,7 @@ in.getClass().getSimpleName() + ") failed."); } if (!sun.text.Normalizer.isNormalized(expected, NFD, - sun.text.normalizer.NormalizerBase.UNICODE_LATEST)) { + jdk.internal.icu.text.NormalizerBase.UNICODE_LATEST)) { errln("sun.text.Normalizer.isNormalize(" + in.getClass().getSimpleName() + ") failed."); } --- old/test/jdk/java/text/Normalizer/ThreadSafeTest.java 2020-01-10 15:57:17.000000000 -0800 +++ new/test/jdk/java/text/Normalizer/ThreadSafeTest.java 2020-01-10 15:57:17.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,10 +23,10 @@ /* * @test - * @bug 4221795 8032446 - * @summary Confirm that java.text.Normalizer and sun.text.Normalize are + * @bug 4221795 8032446 8174270 + * @summary Confirm that java.text.Normalizer and sun.text.Normalizer are * thread-safe. - * @modules java.base/sun.text java.base/sun.text.normalizer + * @modules java.base/sun.text java.base/jdk.internal.icu.text * @compile -XDignore.symbol.file ThreadSafeTest.java * @run main/othervm -esa ThreadSafeTest 5 10 */ @@ -115,7 +115,7 @@ sun.text.Normalizer.UNICODE_3_2); testJavaNormalize(2, java.text.Normalizer.Form.NFKD); testSunNormalize(3, java.text.Normalizer.Form.NFC, - sun.text.normalizer.NormalizerBase.UNICODE_LATEST); + jdk.internal.icu.text.NormalizerBase.UNICODE_LATEST); testJavaNormalize(4, java.text.Normalizer.Form.NFD); testIsNormalized(0, java.text.Normalizer.Form.NFKC); --- old/test/jdk/sun/net/idn/NFS4StringPrep.java 2020-01-10 15:57:18.000000000 -0800 +++ new/test/jdk/sun/net/idn/NFS4StringPrep.java 2020-01-10 15:57:18.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -32,8 +32,8 @@ import java.io.UnsupportedEncodingException; import java.text.ParseException; -import sun.net.idn.StringPrep; -import sun.text.normalizer.UCharacterIterator; +import jdk.internal.icu.text.StringPrep; +import jdk.internal.icu.text.UCharacterIterator; /** * @author ram --- old/test/jdk/sun/net/idn/PunycodeTest.java 2020-01-10 15:57:20.000000000 -0800 +++ new/test/jdk/sun/net/idn/PunycodeTest.java 2020-01-10 15:57:20.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2006, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,17 +23,16 @@ /* * @test - * @summary Unit test for sun.net.idn.Punycode - * @bug 4737170 - * @modules java.base/sun.net.idn + * @summary Unit test for jdk.internal.icu.impl.Punycode + * @bug 4737170 8174270 + * @modules java.base/jdk.internal.icu.impl * @compile -XDignore.symbol.file PunycodeTest.java * @run main/othervm -ea PunycodeTest * @author Edward Wang */ import java.util.Scanner; -import java.text.ParseException; -import sun.net.idn.Punycode; +import jdk.internal.icu.impl.Punycode; /** * unit test for Punycode that is also originated from the sample code --- old/test/jdk/sun/net/idn/TestStringPrep.java 2020-01-10 15:57:21.000000000 -0800 +++ new/test/jdk/sun/net/idn/TestStringPrep.java 2020-01-10 15:57:21.000000000 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,10 +23,9 @@ /* * @test - * @summary Unit test for sun.net.idn.Punycode - * @bug 4737170 8060097 - * @modules java.base/sun.net.idn:+open - * java.base/sun.text.normalizer + * @summary Unit test for jdk.internal.icu.text.StringPrep + * @bug 4737170 8060097 8174270 + * @modules java.base/jdk.internal.icu.text * @library . 
* @compile -XDignore.symbol.file TestStringPrep.java NFS4StringPrep.java * TestData.java @@ -44,8 +43,8 @@ import java.io.InputStream; import java.util.Locale; -import sun.net.idn.StringPrep; -import sun.text.normalizer.UCharacterIterator; +import jdk.internal.icu.text.StringPrep; +import jdk.internal.icu.text.UCharacterIterator; public class TestStringPrep { public static void main(String[] args) throws Exception { --- old/src/java.base/share/classes/sun/text/normalizer/BMPSet.java 2020-01-10 15:57:23.000000000 -0800 +++ /dev/null 2020-01-10 15:57:23.000000000 -0800 @@ -1,526 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- */ - -/* - ****************************************************************************** - * - * Copyright (C) 2009-2014, International Business Machines - * Corporation and others. All Rights Reserved. - * - ****************************************************************************** - */ - -package sun.text.normalizer; - -import sun.text.normalizer.UnicodeSet.SpanCondition; - -/** - * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. - * - * Latin-1: Look up bytes. - * 2-byte characters: Bits organized vertically. - * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. - * Supplementary characters: Call contains() on the parent set. - */ -final class BMPSet { - - /** - * One boolean ('true' or 'false') per Latin-1 character. - */ - private boolean[] latin1Contains; - - /** - * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points - * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} - * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) - * - * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at - * runtime. - */ - private int[] table7FF; - - /** - * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks - * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} - * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit - * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed - * and set.contains(c) must be called. - * - * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster - * validity checking at runtime. 
- */ - private int[] bmpBlockBits; - - /** - * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, - * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are - * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. - */ - private int[] list4kStarts; - - /** - * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for - * supplementary code points. The list is terminated with list[listLength-1]=0x110000. - */ - private final int[] list; - private final int listLength; // length used; list may be longer to minimize reallocs - - public BMPSet(final int[] parentList, int parentListLength) { - list = parentList; - listLength = parentListLength; - latin1Contains = new boolean[0x100]; - table7FF = new int[64]; - bmpBlockBits = new int[64]; - list4kStarts = new int[18]; - - /* - * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the - * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of - * indexes is for finding supplementary code points. - */ - list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); - int i; - for (i = 1; i <= 0x10; ++i) { - list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); - } - list4kStarts[0x11] = listLength - 1; - - initBits(); - } - - public boolean contains(int c) { - if (c <= 0xff) { - return (latin1Contains[c]); - } else if (c <= 0x7ff) { - return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); - } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { - int lead = c >> 12; - int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; - if (twoBits <= 1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. 
- return (0 != twoBits); - } else { - // Look up the code point in its 4k block of code points. - return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); - } - } else if (c <= 0x10ffff) { - // surrogate or supplementary code point - return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); - } else { - // Out-of-range code points get false, consistent with long-standing - // behavior of UnicodeSet.contains(c). - return false; - } - } - - /** - * Span the initial substring for which each character c has spanCondition==contains(c). It must be - * spanCondition==0 or 1. - * - * @param start The start index - * @param outCount If not null: Receives the number of code points in the span. - * @return the limit (exclusive end) of the span - * - * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for - * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points - * as usual in ICU. - */ - public final int span(CharSequence s, int start, SpanCondition spanCondition, - OutputInt outCount) { - char c, c2; - int i = start; - int limit = s.length(); - int numSupplementary = 0; - if (SpanCondition.NOT_CONTAINED != spanCondition) { - // span - while (i < limit) { - c = s.charAt(i); - if (c <= 0xff) { - if (!latin1Contains[c]) { - break; - } - } else if (c <= 0x7ff) { - if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { - break; - } - } else if (c < 0xd800 || - c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { - int lead = c >> 12; - int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; - if (twoBits <= 1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if (twoBits == 0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. 
- if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { - break; - } - } - } else { - // surrogate pair - int supplementary = UCharacterProperty.getRawSupplementary(c, c2); - if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - ++numSupplementary; - ++i; - } - ++i; - } - } else { - // span not - while (i < limit) { - c = s.charAt(i); - if (c <= 0xff) { - if (latin1Contains[c]) { - break; - } - } else if (c <= 0x7ff) { - if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { - break; - } - } else if (c < 0xd800 || - c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { - int lead = c >> 12; - int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; - if (twoBits <= 1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if (twoBits != 0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. - if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { - break; - } - } - } else { - // surrogate pair - int supplementary = UCharacterProperty.getRawSupplementary(c, c2); - if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - ++numSupplementary; - ++i; - } - ++i; - } - } - if (outCount != null) { - int spanLength = i - start; - outCount.value = spanLength - numSupplementary; // number of code points - } - return i; - } - - /** - * Symmetrical with span(). - * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= - * limit and spanCondition==0 or 1. - * - * @return The string index which starts the span (i.e. inclusive). 
- */ - public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { - char c, c2; - - if (SpanCondition.NOT_CONTAINED != spanCondition) { - // span - for (;;) { - c = s.charAt(--limit); - if (c <= 0xff) { - if (!latin1Contains[c]) { - break; - } - } else if (c <= 0x7ff) { - if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { - break; - } - } else if (c < 0xd800 || - c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { - int lead = c >> 12; - int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; - if (twoBits <= 1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if (twoBits == 0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. - if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { - break; - } - } - } else { - // surrogate pair - int supplementary = UCharacterProperty.getRawSupplementary(c2, c); - if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - --limit; - } - if (0 == limit) { - return 0; - } - } - } else { - // span not - for (;;) { - c = s.charAt(--limit); - if (c <= 0xff) { - if (latin1Contains[c]) { - break; - } - } else if (c <= 0x7ff) { - if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { - break; - } - } else if (c < 0xd800 || - c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { - int lead = c >> 12; - int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; - if (twoBits <= 1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if (twoBits != 0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. 
- if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { - break; - } - } - } else { - // surrogate pair - int supplementary = UCharacterProperty.getRawSupplementary(c2, c); - if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - --limit; - } - if (0 == limit) { - return 0; - } - } - } - return limit + 1; - } - - /** - * Set bits in a bit rectangle in "vertical" bit organization. start> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. - int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. - - // Set one bit indicating an all-one block. - int bits = 1 << lead; - if ((start + 1) == limit) { // Single-character shortcut. - table[trail] |= bits; - return; - } - - int limitLead = limit >> 6; - int limitTrail = limit & 0x3f; - - if (lead == limitLead) { - // Partial vertical bit column. - while (trail < limitTrail) { - table[trail++] |= bits; - } - } else { - // Partial vertical bit column, - // followed by a bit rectangle, - // followed by another partial vertical bit column. - if (trail > 0) { - do { - table[trail++] |= bits; - } while (trail < 64); - ++lead; - } - if (lead < limitLead) { - bits = ~((1 << lead) - 1); - if (limitLead < 0x20) { - bits &= (1 << limitLead) - 1; - } - for (trail = 0; trail < 64; ++trail) { - table[trail] |= bits; - } - } - // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. - // In that case, bits=1<= 0x100) { - break; - } - do { - latin1Contains[start++] = true; - } while (start < limit && start < 0x100); - } while (limit <= 0x100); - - // Set table7FF[]. - while (start < 0x800) { - set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); - if (limit > 0x800) { - start = 0x800; - break; - } - - start = list[listIndex++]; - if (listIndex < listLength) { - limit = list[listIndex++]; - } else { - limit = 0x110000; - } - } - - // Set bmpBlockBits[]. 
- int minStart = 0x800; - while (start < 0x10000) { - if (limit > 0x10000) { - limit = 0x10000; - } - - if (start < minStart) { - start = minStart; - } - if (start < limit) { // Else: Another range entirely in a known mixed-value block. - if (0 != (start & 0x3f)) { - // Mixed-value block of 64 code points. - start >>= 6; - bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); - start = (start + 1) << 6; // Round up to the next block boundary. - minStart = start; // Ignore further ranges in this block. - } - if (start < limit) { - if (start < (limit & ~0x3f)) { - // Multiple all-ones blocks of 64 code points each. - set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); - } - - if (0 != (limit & 0x3f)) { - // Mixed-value block of 64 code points. - limit >>= 6; - bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); - limit = (limit + 1) << 6; // Round up to the next block boundary. - minStart = limit; // Ignore further ranges in this block. - } - } - } - - if (limit == 0x10000) { - break; - } - - start = list[listIndex++]; - if (listIndex < listLength) { - limit = list[listIndex++]; - } else { - limit = 0x110000; - } - } - } - - /** - * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code - * points in a certain range. - * - * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and - * hi=findCodePoint(end) with 0<=lo<=hi= hi || c >= list[hi - 1]) - return hi; - // invariant: c >= list[lo] - // invariant: c < list[hi] - for (;;) { - int i = (lo + hi) >>> 1; - if (i == lo) { - break; // Found! 
- } else if (c < list[i]) { - hi = i; - } else { - lo = i; - } - } - return hi; - } - - private final boolean containsSlow(int c, int lo, int hi) { - return (0 != (findCodePoint(c, lo, hi) & 1)); - } -} - --- /dev/null 2020-01-10 15:57:23.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/BMPSet.java 2020-01-10 15:57:22.000000000 -0800 @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + ****************************************************************************** + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.text.UnicodeSet.SpanCondition; +import jdk.internal.icu.util.OutputInt; + +/** + * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. + * + * Latin-1: Look up bytes. + * 2-byte characters: Bits organized vertically. + * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. + * Supplementary characters: Call contains() on the parent set. + */ +public final class BMPSet { + + /** + * One boolean ('true' or 'false') per Latin-1 character. + */ + private boolean[] latin1Contains; + + /** + * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points + * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} + * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) + * + * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at + * runtime. + */ + private int[] table7FF; + + /** + * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks + * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} + * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit + * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed + * and set.contains(c) must be called. + * + * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster + * validity checking at runtime. + */ + private int[] bmpBlockBits; + + /** + * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, + * U+2000, .., U+F000, U+10000). 
U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are + * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. + */ + private int[] list4kStarts; + + /** + * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for + * supplementary code points. The list is terminated with list[listLength-1]=0x110000. + */ + private final int[] list; + private final int listLength; // length used; list may be longer to minimize reallocs + + public BMPSet(final int[] parentList, int parentListLength) { + list = parentList; + listLength = parentListLength; + latin1Contains = new boolean[0x100]; + table7FF = new int[64]; + bmpBlockBits = new int[64]; + list4kStarts = new int[18]; + + /* + * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the + * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of + * indexes is for finding supplementary code points. + */ + list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); + int i; + for (i = 1; i <= 0x10; ++i) { + list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); + } + list4kStarts[0x11] = listLength - 1; + + initBits(); + } + + public boolean contains(int c) { + if (c <= 0xff) { + return (latin1Contains[c]); + } else if (c <= 0x7ff) { + return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); + } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + return (0 != twoBits); + } else { + // Look up the code point in its 4k block of code points. 
+ return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); + } + } else if (c <= 0x10ffff) { + // surrogate or supplementary code point + return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); + } else { + // Out-of-range code points get false, consistent with long-standing + // behavior of UnicodeSet.contains(c). + return false; + } + } + + /** + * Span the initial substring for which each character c has spanCondition==contains(c). It must be + * spanCondition==0 or 1. + * + * @param start The start index + * @param outCount If not null: Receives the number of code points in the span. + * @return the limit (exclusive end) of the span + * + * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for + * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points + * as usual in ICU. + */ + public final int span(CharSequence s, int start, SpanCondition spanCondition, + OutputInt outCount) { + char c, c2; + int i = start; + int limit = s.length(); + int numSupplementary = 0; + if (SpanCondition.NOT_CONTAINED != spanCondition) { + // span + while (i < limit) { + c = s.charAt(i); + if (c <= 0xff) { + if (!latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { + break; + } + } else if (c < 0xd800 || + c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits == 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. 
+ if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c, c2); + if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + ++numSupplementary; + ++i; + } + ++i; + } + } else { + // span not + while (i < limit) { + c = s.charAt(i); + if (c <= 0xff) { + if (latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { + break; + } + } else if (c < 0xd800 || + c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits != 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. + if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c, c2); + if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + ++numSupplementary; + ++i; + } + ++i; + } + } + if (outCount != null) { + int spanLength = i - start; + outCount.value = spanLength - numSupplementary; // number of code points + } + return i; + } + + /** + * Symmetrical with span(). + * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= + * limit and spanCondition==0 or 1. + * + * @return The string index which starts the span (i.e. inclusive). 
+ */ + public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { + char c, c2; + + if (SpanCondition.NOT_CONTAINED != spanCondition) { + // span + for (;;) { + c = s.charAt(--limit); + if (c <= 0xff) { + if (!latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { + break; + } + } else if (c < 0xd800 || + c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits == 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. + if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c2, c); + if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + --limit; + } + if (0 == limit) { + return 0; + } + } + } else { + // span not + for (;;) { + c = s.charAt(--limit); + if (c <= 0xff) { + if (latin1Contains[c]) { + break; + } + } else if (c <= 0x7ff) { + if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { + break; + } + } else if (c < 0xd800 || + c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { + int lead = c >> 12; + int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; + if (twoBits <= 1) { + // All 64 code points with the same bits 15..6 + // are either in the set or not. + if (twoBits != 0) { + break; + } + } else { + // Look up the code point in its 4k block of code points. 
+ if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { + break; + } + } + } else { + // surrogate pair + int supplementary = UCharacterProperty.getRawSupplementary(c2, c); + if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { + break; + } + --limit; + } + if (0 == limit) { + return 0; + } + } + } + return limit + 1; + } + + /** + * Set bits in a bit rectangle in "vertical" bit organization. start> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. + int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. + + // Set one bit indicating an all-one block. + int bits = 1 << lead; + if ((start + 1) == limit) { // Single-character shortcut. + table[trail] |= bits; + return; + } + + int limitLead = limit >> 6; + int limitTrail = limit & 0x3f; + + if (lead == limitLead) { + // Partial vertical bit column. + while (trail < limitTrail) { + table[trail++] |= bits; + } + } else { + // Partial vertical bit column, + // followed by a bit rectangle, + // followed by another partial vertical bit column. + if (trail > 0) { + do { + table[trail++] |= bits; + } while (trail < 64); + ++lead; + } + if (lead < limitLead) { + bits = ~((1 << lead) - 1); + if (limitLead < 0x20) { + bits &= (1 << limitLead) - 1; + } + for (trail = 0; trail < 64; ++trail) { + table[trail] |= bits; + } + } + // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. + // In that case, bits=1<= 0x100) { + break; + } + do { + latin1Contains[start++] = true; + } while (start < limit && start < 0x100); + } while (limit <= 0x100); + + // Set table7FF[]. + while (start < 0x800) { + set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); + if (limit > 0x800) { + start = 0x800; + break; + } + + start = list[listIndex++]; + if (listIndex < listLength) { + limit = list[listIndex++]; + } else { + limit = 0x110000; + } + } + + // Set bmpBlockBits[]. 
+ int minStart = 0x800; + while (start < 0x10000) { + if (limit > 0x10000) { + limit = 0x10000; + } + + if (start < minStart) { + start = minStart; + } + if (start < limit) { // Else: Another range entirely in a known mixed-value block. + if (0 != (start & 0x3f)) { + // Mixed-value block of 64 code points. + start >>= 6; + bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); + start = (start + 1) << 6; // Round up to the next block boundary. + minStart = start; // Ignore further ranges in this block. + } + if (start < limit) { + if (start < (limit & ~0x3f)) { + // Multiple all-ones blocks of 64 code points each. + set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); + } + + if (0 != (limit & 0x3f)) { + // Mixed-value block of 64 code points. + limit >>= 6; + bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); + limit = (limit + 1) << 6; // Round up to the next block boundary. + minStart = limit; // Ignore further ranges in this block. + } + } + } + + if (limit == 0x10000) { + break; + } + + start = list[listIndex++]; + if (listIndex < listLength) { + limit = list[listIndex++]; + } else { + limit = 0x110000; + } + } + } + + /** + * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code + * points in a certain range. + * + * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and + * hi=findCodePoint(end) with 0<=lo<=hi= hi || c >= list[hi - 1]) + return hi; + // invariant: c >= list[lo] + // invariant: c < list[hi] + for (;;) { + int i = (lo + hi) >>> 1; + if (i == lo) { + break; // Found! 
+ } else if (c < list[i]) { + hi = i; + } else { + lo = i; + } + } + return hi; + } + + private final boolean containsSlow(int c, int lo, int hi) { + return (0 != (findCodePoint(c, lo, hi) & 1)); + } +} + --- old/src/java.base/share/classes/sun/text/normalizer/CharTrie.java 2020-01-10 15:57:24.000000000 -0800 +++ /dev/null 2020-01-10 15:57:24.000000000 -0800 @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ****************************************************************************** - * Copyright (C) 1996-2014, International Business Machines Corporation and - * others. All Rights Reserved. 
- ****************************************************************************** - */ - -package sun.text.normalizer; - -import java.io.DataInputStream; -import java.io.InputStream; -import java.io.IOException; - -/** - * Trie implementation which stores data in char, 16 bits. - * @author synwee - * @see com.ibm.icu.impl.Trie - * @since release 2.1, Jan 01 2002 - */ - - // note that i need to handle the block calculations later, since chartrie - // in icu4c uses the same index array. -public class CharTrie extends Trie -{ - // public constructors --------------------------------------------- - - /** - *

Creates a new Trie with the settings for the trie data.

- *

Unserialize the 32-bit-aligned input stream and use the data for the - * trie.

- * @param inputStream file input stream to a ICU data file, containing - * the trie - * @param dataManipulate object which provides methods to parse the char - * data - * @throws IOException thrown when data reading fails - * @draft 2.1 - */ - public CharTrie(InputStream inputStream, - DataManipulate dataManipulate) throws IOException - { - super(inputStream, dataManipulate); - - if (!isCharTrie()) { - throw new IllegalArgumentException( - "Data given does not belong to a char trie."); - } - } - - // public methods -------------------------------------------------- - - /** - * Gets the value associated with the codepoint. - * If no value is associated with the codepoint, a default value will be - * returned. - * @param ch codepoint - * @return offset to data - */ - public final char getCodePointValue(int ch) - { - int offset; - - // fastpath for U+0000..U+D7FF - if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { - // copy of getRawOffset() - offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_) - + (ch & INDEX_STAGE_3_MASK_); - return m_data_[offset]; - } - - // handle U+D800..U+10FFFF - offset = getCodePointOffset(ch); - - // return -1 if there is an error, in this case we return the default - // value: m_initialValue_ - return (offset >= 0) ? m_data_[offset] : m_initialValue_; - } - - /** - * Gets the value to the data which this lead surrogate character points - * to. - * Returned data may contain folding offset information for the next - * trailing surrogate character. - * This method does not guarantee correct results for trail surrogates. - * @param ch lead surrogate character - * @return data value - */ - public final char getLeadValue(char ch) - { - return m_data_[getLeadOffset(ch)]; - } - - // protected methods ----------------------------------------------- - - /** - *

Parses the input stream and stores its trie content into a index and - * data array

- * @param inputStream data input stream containing trie data - * @exception IOException thrown when data reading fails - */ - protected final void unserialize(InputStream inputStream) - throws IOException - { - DataInputStream input = new DataInputStream(inputStream); - int indexDataLength = m_dataOffset_ + m_dataLength_; - m_index_ = new char[indexDataLength]; - for (int i = 0; i < indexDataLength; i ++) { - m_index_[i] = input.readChar(); - } - m_data_ = m_index_; - m_initialValue_ = m_data_[m_dataOffset_]; - } - - /** - * Gets the offset to the data which the surrogate pair points to. - * @param lead lead surrogate - * @param trail trailing surrogate - * @return offset to data - * @draft 2.1 - */ - protected final int getSurrogateOffset(char lead, char trail) - { - if (m_dataManipulate_ == null) { - throw new NullPointerException( - "The field DataManipulate in this Trie is null"); - } - - // get fold position for the next trail surrogate - int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead)); - - // get the real data from the folded lead/trail units - if (offset > 0) { - return getRawOffset(offset, (char)(trail & SURROGATE_MASK_)); - } - - // return -1 if there is an error, in this case we return the default - // value: m_initialValue_ - return -1; - } - - // private data members -------------------------------------------- - - /** - * Default value - */ - private char m_initialValue_; - /** - * Array of char data - */ - private char m_data_[]; -} --- /dev/null 2020-01-10 15:57:24.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/CharTrie.java 2020-01-10 15:57:24.000000000 -0800 @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ****************************************************************************** + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.text.UTF16; + +import java.io.DataInputStream; +import java.io.InputStream; +import java.io.IOException; + +/** + * Trie implementation which stores data in char, 16 bits. + * @author synwee + * @see com.ibm.icu.impl.Trie + * @since release 2.1, Jan 01 2002 + */ + + // note that i need to handle the block calculations later, since chartrie + // in icu4c uses the same index array. +public class CharTrie extends Trie +{ + // public constructors --------------------------------------------- + + /** + *

Creates a new Trie with the settings for the trie data.

+ *

Unserialize the 32-bit-aligned input stream and use the data for the + * trie.

+ * @param inputStream file input stream to a ICU data file, containing + * the trie + * @param dataManipulate object which provides methods to parse the char + * data + * @throws IOException thrown when data reading fails + * @draft 2.1 + */ + public CharTrie(InputStream inputStream, + DataManipulate dataManipulate) throws IOException + { + super(inputStream, dataManipulate); + + if (!isCharTrie()) { + throw new IllegalArgumentException( + "Data given does not belong to a char trie."); + } + } + + // public methods -------------------------------------------------- + + /** + * Gets the value associated with the codepoint. + * If no value is associated with the codepoint, a default value will be + * returned. + * @param ch codepoint + * @return offset to data + */ + public final char getCodePointValue(int ch) + { + int offset; + + // fastpath for U+0000..U+D7FF + if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { + // copy of getRawOffset() + offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_) + + (ch & INDEX_STAGE_3_MASK_); + return m_data_[offset]; + } + + // handle U+D800..U+10FFFF + offset = getCodePointOffset(ch); + + // return -1 if there is an error, in this case we return the default + // value: m_initialValue_ + return (offset >= 0) ? m_data_[offset] : m_initialValue_; + } + + /** + * Gets the value to the data which this lead surrogate character points + * to. + * Returned data may contain folding offset information for the next + * trailing surrogate character. + * This method does not guarantee correct results for trail surrogates. + * @param ch lead surrogate character + * @return data value + */ + public final char getLeadValue(char ch) + { + return m_data_[getLeadOffset(ch)]; + } + + // protected methods ----------------------------------------------- + + /** + *

Parses the input stream and stores its trie content into a index and + * data array

+ * @param inputStream data input stream containing trie data + * @exception IOException thrown when data reading fails + */ + protected final void unserialize(InputStream inputStream) + throws IOException + { + DataInputStream input = new DataInputStream(inputStream); + int indexDataLength = m_dataOffset_ + m_dataLength_; + m_index_ = new char[indexDataLength]; + for (int i = 0; i < indexDataLength; i ++) { + m_index_[i] = input.readChar(); + } + m_data_ = m_index_; + m_initialValue_ = m_data_[m_dataOffset_]; + } + + /** + * Gets the offset to the data which the surrogate pair points to. + * @param lead lead surrogate + * @param trail trailing surrogate + * @return offset to data + * @draft 2.1 + */ + protected final int getSurrogateOffset(char lead, char trail) + { + if (m_dataManipulate_ == null) { + throw new NullPointerException( + "The field DataManipulate in this Trie is null"); + } + + // get fold position for the next trail surrogate + int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead)); + + // get the real data from the folded lead/trail units + if (offset > 0) { + return getRawOffset(offset, (char)(trail & SURROGATE_MASK_)); + } + + // return -1 if there is an error, in this case we return the default + // value: m_initialValue_ + return -1; + } + + // private data members -------------------------------------------- + + /** + * Default value + */ + private char m_initialValue_; + /** + * Array of char data + */ + private char m_data_[]; +} --- old/src/java.base/share/classes/sun/text/normalizer/CharacterIteratorWrapper.java 2020-01-10 15:57:26.000000000 -0800 +++ /dev/null 2020-01-10 15:57:26.000000000 -0800 @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. 
* - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.text.CharacterIterator; - -/** - * This class is a wrapper around CharacterIterator and implements the - * UCharacterIterator protocol - * @author ram - */ - -class CharacterIteratorWrapper extends UCharacterIterator { - - private CharacterIterator iterator; - - public CharacterIteratorWrapper(CharacterIterator iter){ - if(iter==null){ - throw new IllegalArgumentException(); - } - iterator = iter; - } - - /** - * @see UCharacterIterator#current() - */ - public int current() { - int c = iterator.current(); - if(c==CharacterIterator.DONE){ - return DONE; - } - return c; - } - - /** - * @see UCharacterIterator#getLength() - */ - public int getLength() { - return (iterator.getEndIndex() - iterator.getBeginIndex()); - } - - /** - * @see UCharacterIterator#getIndex() - */ - public int getIndex() { - return iterator.getIndex(); - } - - /** - * @see UCharacterIterator#next() - */ - public int next() { - int i = iterator.current(); - iterator.next(); - if(i==CharacterIterator.DONE){ - return DONE; - } - return i; - } - - /** - * @see UCharacterIterator#previous() - */ - public int previous() { - int i = iterator.previous(); - if(i==CharacterIterator.DONE){ - return DONE; - } - return i; - } - - /** - * @see UCharacterIterator#setIndex(int) - */ - public void setIndex(int index) { - iterator.setIndex(index); - } - - /** - * @see UCharacterIterator#getText(char[]) - */ - public int getText(char[] fillIn, int offset){ - int length =iterator.getEndIndex() - iterator.getBeginIndex(); - int currentIndex = iterator.getIndex(); - if(offset < 0 || offset + length > fillIn.length){ - throw new IndexOutOfBoundsException(Integer.toString(length)); - } - - for (char ch = iterator.first(); ch != CharacterIterator.DONE; ch = iterator.next()) { - fillIn[offset++] = ch; - } - iterator.setIndex(currentIndex); - - return length; - } - - /** - * Creates a 
clone of this iterator. Clones the underlying character iterator. - * @see UCharacterIterator#clone() - */ - public Object clone(){ - try { - CharacterIteratorWrapper result = (CharacterIteratorWrapper) super.clone(); - result.iterator = (CharacterIterator)this.iterator.clone(); - return result; - } catch (CloneNotSupportedException e) { - return null; // only invoked if bad underlying character iterator - } - } -} --- /dev/null 2020-01-10 15:57:26.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/CharacterIteratorWrapper.java 2020-01-10 15:57:25.000000000 -0800 @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * (C) Copyright IBM Corp. 
1996-2005 - All Rights Reserved * + * * + * The original version of this source code and documentation is copyrighted * + * and owned by IBM, These materials are provided under terms of a License * + * Agreement between IBM and Sun. This technology is protected by multiple * + * US and International patents. This notice and attribution to IBM may not * + * to removed. * + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.text.CharacterIterator; + +import jdk.internal.icu.text.UCharacterIterator; + +/** + * This class is a wrapper around CharacterIterator and implements the + * UCharacterIterator protocol + * @author ram + */ + +public class CharacterIteratorWrapper extends UCharacterIterator { + + private CharacterIterator iterator; + + public CharacterIteratorWrapper(CharacterIterator iter){ + if(iter==null){ + throw new IllegalArgumentException(); + } + iterator = iter; + } + + /** + * @see UCharacterIterator#current() + */ + public int current() { + int c = iterator.current(); + if(c==CharacterIterator.DONE){ + return DONE; + } + return c; + } + + /** + * @see UCharacterIterator#getLength() + */ + public int getLength() { + return (iterator.getEndIndex() - iterator.getBeginIndex()); + } + + /** + * @see UCharacterIterator#getIndex() + */ + public int getIndex() { + return iterator.getIndex(); + } + + /** + * @see UCharacterIterator#next() + */ + public int next() { + int i = iterator.current(); + iterator.next(); + if(i==CharacterIterator.DONE){ + return DONE; + } + return i; + } + + /** + * @see UCharacterIterator#previous() + */ + public int previous() { + int i = iterator.previous(); + if(i==CharacterIterator.DONE){ + return DONE; + } + return i; + } + + /** + * @see UCharacterIterator#setIndex(int) + */ + public void setIndex(int index) { + iterator.setIndex(index); + } + + /** + * @see UCharacterIterator#getText(char[]) + */ + public int getText(char[] fillIn, int offset){ + 
int length =iterator.getEndIndex() - iterator.getBeginIndex(); + int currentIndex = iterator.getIndex(); + if(offset < 0 || offset + length > fillIn.length){ + throw new IndexOutOfBoundsException(Integer.toString(length)); + } + + for (char ch = iterator.first(); ch != CharacterIterator.DONE; ch = iterator.next()) { + fillIn[offset++] = ch; + } + iterator.setIndex(currentIndex); + + return length; + } + + /** + * Creates a clone of this iterator. Clones the underlying character iterator. + * @see UCharacterIterator#clone() + */ + public Object clone(){ + try { + CharacterIteratorWrapper result = (CharacterIteratorWrapper) super.clone(); + result.iterator = (CharacterIterator)this.iterator.clone(); + return result; + } catch (CloneNotSupportedException e) { + return null; // only invoked if bad underlying character iterator + } + } +} --- old/src/java.base/share/classes/sun/text/normalizer/ICUBinary.java 2020-01-10 15:57:27.000000000 -0800 +++ /dev/null 2020-01-10 15:57:27.000000000 -0800 @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). 
- * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 1996-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.BufferedInputStream; -import java.io.DataInputStream; -import java.io.InputStream; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; -import java.security.AccessController; -import java.security.PrivilegedAction; - -public final class ICUBinary { - - private static final class IsAcceptable implements Authenticate { - @Override - public boolean isDataVersionAcceptable(byte version[]) { - return version[0] == 1; - } - } - - // public inner interface ------------------------------------------------ - - /** - * Special interface for data authentication - */ - public static interface Authenticate - { - /** - * Method used in ICUBinary.readHeader() to provide data format - * authentication. - * @param version version of the current data - * @return true if dataformat is an acceptable version, false otherwise - */ - public boolean isDataVersionAcceptable(byte version[]); - } - - // public methods -------------------------------------------------------- - - /** - * Loads an ICU binary data file and returns it as a ByteBuffer. - * The buffer contents is normally read-only, but its position etc. can be modified. 
- * - * @param itemPath Relative ICU data item path, for example "root.res" or "coll/ucadata.icu". - * @return The data as a read-only ByteBuffer. - */ - public static ByteBuffer getRequiredData(String itemPath) { - final Class root = ICUBinary.class; - - try (InputStream is = AccessController.doPrivileged(new PrivilegedAction() { - public InputStream run() { - return root.getResourceAsStream(itemPath); - } - })) { - - // is.available() may return 0, or 1, or the total number of bytes in the stream, - // or some other number. - // Do not try to use is.available() == 0 to find the end of the stream! - byte[] bytes; - int avail = is.available(); - if (avail > 32) { - // There are more bytes available than just the ICU data header length. - // With luck, it is the total number of bytes. - bytes = new byte[avail]; - } else { - bytes = new byte[128]; // empty .res files are even smaller - } - // Call is.read(...) until one returns a negative value. - int length = 0; - for(;;) { - if (length < bytes.length) { - int numRead = is.read(bytes, length, bytes.length - length); - if (numRead < 0) { - break; // end of stream - } - length += numRead; - } else { - // See if we are at the end of the stream before we grow the array. - int nextByte = is.read(); - if (nextByte < 0) { - break; - } - int capacity = 2 * bytes.length; - if (capacity < 128) { - capacity = 128; - } else if (capacity < 0x4000) { - capacity *= 2; // Grow faster until we reach 16kB. - } - bytes = Arrays.copyOf(bytes, capacity); - bytes[length++] = (byte) nextByte; - } - } - return ByteBuffer.wrap(bytes, 0, length); - } - catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - /** - * Same as readHeader(), but returns a VersionInfo rather than a compact int. 
- */ - public static VersionInfo readHeaderAndDataVersion(ByteBuffer bytes, - int dataFormat, - Authenticate authenticate) - throws IOException { - return getVersionInfoFromCompactInt(readHeader(bytes, dataFormat, authenticate)); - } - - private static final byte BIG_ENDIAN_ = 1; - public static final byte[] readHeader(InputStream inputStream, - byte dataFormatIDExpected[], - Authenticate authenticate) - throws IOException - { - DataInputStream input = new DataInputStream(inputStream); - char headersize = input.readChar(); - int readcount = 2; - //reading the header format - byte magic1 = input.readByte(); - readcount ++; - byte magic2 = input.readByte(); - readcount ++; - if (magic1 != MAGIC1 || magic2 != MAGIC2) { - throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_); - } - - input.readChar(); // reading size - readcount += 2; - input.readChar(); // reading reserved word - readcount += 2; - byte bigendian = input.readByte(); - readcount ++; - byte charset = input.readByte(); - readcount ++; - byte charsize = input.readByte(); - readcount ++; - input.readByte(); // reading reserved byte - readcount ++; - - byte dataFormatID[] = new byte[4]; - input.readFully(dataFormatID); - readcount += 4; - byte dataVersion[] = new byte[4]; - input.readFully(dataVersion); - readcount += 4; - byte unicodeVersion[] = new byte[4]; - input.readFully(unicodeVersion); - readcount += 4; - if (headersize < readcount) { - throw new IOException("Internal Error: Header size error"); - } - input.skipBytes(headersize - readcount); - - if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_ - || charsize != CHAR_SIZE_ - || !Arrays.equals(dataFormatIDExpected, dataFormatID) - || (authenticate != null - && !authenticate.isDataVersionAcceptable(dataVersion))) { - throw new IOException(HEADER_AUTHENTICATION_FAILED_); - } - return unicodeVersion; - } - - /** - * Reads an ICU data header, checks the data format, and returns the data version. - * - *

Assumes that the ByteBuffer position is 0 on input. - * The buffer byte order is set according to the data. - * The buffer position is advanced past the header (including UDataInfo and comment). - * - *

See C++ ucmndata.h and unicode/udata.h. - * - * @return dataVersion - * @throws IOException if this is not a valid ICU data item of the expected dataFormat - */ - public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate) - throws IOException { - assert bytes.position() == 0; - byte magic1 = bytes.get(2); - byte magic2 = bytes.get(3); - if (magic1 != MAGIC1 || magic2 != MAGIC2) { - throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_); - } - - byte isBigEndian = bytes.get(8); - byte charsetFamily = bytes.get(9); - byte sizeofUChar = bytes.get(10); - if (isBigEndian < 0 || 1 < isBigEndian || - charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) { - throw new IOException(HEADER_AUTHENTICATION_FAILED_); - } - bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN); - - int headerSize = bytes.getChar(0); - int sizeofUDataInfo = bytes.getChar(4); - if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4)) { - throw new IOException("Internal Error: Header size error"); - } - // TODO: Change Authenticate to take int major, int minor, int milli, int micro - // to avoid array allocation. 
- byte[] formatVersion = new byte[] { - bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19) - }; - if (bytes.get(12) != (byte)(dataFormat >> 24) || - bytes.get(13) != (byte)(dataFormat >> 16) || - bytes.get(14) != (byte)(dataFormat >> 8) || - bytes.get(15) != (byte)dataFormat || - (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) { - throw new IOException(HEADER_AUTHENTICATION_FAILED_ + - String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d", - bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15), - formatVersion[0] & 0xff, formatVersion[1] & 0xff, - formatVersion[2] & 0xff, formatVersion[3] & 0xff)); - } - - bytes.position(headerSize); - return // dataVersion - ((int)bytes.get(20) << 24) | - ((bytes.get(21) & 0xff) << 16) | - ((bytes.get(22) & 0xff) << 8) | - (bytes.get(23) & 0xff); - } - - public static void skipBytes(ByteBuffer bytes, int skipLength) { - if (skipLength > 0) { - bytes.position(bytes.position() + skipLength); - } - } - - public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) { - byte[] dest = new byte[length]; - bytes.get(dest); - if (additionalSkipLength > 0) { - skipBytes(bytes, additionalSkipLength); - } - return dest; - } - - public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) { - CharSequence cs = bytes.asCharBuffer(); - String s = cs.subSequence(0, length).toString(); - skipBytes(bytes, length * 2 + additionalSkipLength); - return s; - } - - public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) { - char[] dest = new char[length]; - bytes.asCharBuffer().get(dest); - skipBytes(bytes, length * 2 + additionalSkipLength); - return dest; - } - - public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) { - int[] dest = new int[length]; - bytes.asIntBuffer().get(dest); - skipBytes(bytes, length * 4 + additionalSkipLength); - return dest; - } - - /** - * Returns a 
VersionInfo for the bytes in the compact version integer. - */ - public static VersionInfo getVersionInfoFromCompactInt(int version) { - return VersionInfo.getInstance( - version >>> 24, (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff); - } - - // private variables ------------------------------------------------- - - /** - * Magic numbers to authenticate the data file - */ - private static final byte MAGIC1 = (byte)0xda; - private static final byte MAGIC2 = (byte)0x27; - - /** - * File format authentication values - */ - private static final byte CHAR_SET_ = 0; - private static final byte CHAR_SIZE_ = 2; - - /** - * Error messages - */ - private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ = - "ICUBinary data file error: Magic number authentication failed"; - private static final String HEADER_AUTHENTICATION_FAILED_ = - "ICUBinary data file error: Header authentication failed"; -} --- /dev/null 2020-01-10 15:57:27.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/ICUBinary.java 2020-01-10 15:57:27.000000000 -0800 @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.DataInputStream; +import java.io.InputStream; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.security.AccessController; +import java.security.PrivilegedAction; + +import jdk.internal.icu.util.VersionInfo; + +public final class ICUBinary { + + private static final class IsAcceptable implements Authenticate { + @Override + public boolean isDataVersionAcceptable(byte version[]) { + return version[0] == 1; + } + } + + // public inner interface ------------------------------------------------ + + /** + * Special interface for data authentication + */ + public static interface Authenticate + { + /** + * Method used in ICUBinary.readHeader() to provide data format + * authentication. + * @param version version of the current data + * @return true if dataformat is an acceptable version, false otherwise + */ + public boolean isDataVersionAcceptable(byte version[]); + } + + // public methods -------------------------------------------------------- + + /** + * Loads an ICU binary data file and returns it as a ByteBuffer. + * The buffer contents is normally read-only, but its position etc. can be modified. 
+ * + * @param itemPath Relative ICU data item path, for example "root.res" or "coll/ucadata.icu". + * @return The data as a read-only ByteBuffer. + */ + public static ByteBuffer getRequiredData(String itemPath) { + final Class root = ICUBinary.class; + + try (InputStream is = AccessController.doPrivileged(new PrivilegedAction() { + public InputStream run() { + return root.getResourceAsStream(itemPath); + } + })) { + + // is.available() may return 0, or 1, or the total number of bytes in the stream, + // or some other number. + // Do not try to use is.available() == 0 to find the end of the stream! + byte[] bytes; + int avail = is.available(); + if (avail > 32) { + // There are more bytes available than just the ICU data header length. + // With luck, it is the total number of bytes. + bytes = new byte[avail]; + } else { + bytes = new byte[128]; // empty .res files are even smaller + } + // Call is.read(...) until one returns a negative value. + int length = 0; + for(;;) { + if (length < bytes.length) { + int numRead = is.read(bytes, length, bytes.length - length); + if (numRead < 0) { + break; // end of stream + } + length += numRead; + } else { + // See if we are at the end of the stream before we grow the array. + int nextByte = is.read(); + if (nextByte < 0) { + break; + } + int capacity = 2 * bytes.length; + if (capacity < 128) { + capacity = 128; + } else if (capacity < 0x4000) { + capacity *= 2; // Grow faster until we reach 16kB. + } + bytes = Arrays.copyOf(bytes, capacity); + bytes[length++] = (byte) nextByte; + } + } + return ByteBuffer.wrap(bytes, 0, length); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Same as readHeader(), but returns a VersionInfo rather than a compact int. 
+ */ + public static VersionInfo readHeaderAndDataVersion(ByteBuffer bytes, + int dataFormat, + Authenticate authenticate) + throws IOException { + return getVersionInfoFromCompactInt(readHeader(bytes, dataFormat, authenticate)); + } + + private static final byte BIG_ENDIAN_ = 1; + public static final byte[] readHeader(InputStream inputStream, + byte dataFormatIDExpected[], + Authenticate authenticate) + throws IOException + { + DataInputStream input = new DataInputStream(inputStream); + char headersize = input.readChar(); + int readcount = 2; + //reading the header format + byte magic1 = input.readByte(); + readcount ++; + byte magic2 = input.readByte(); + readcount ++; + if (magic1 != MAGIC1 || magic2 != MAGIC2) { + throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_); + } + + input.readChar(); // reading size + readcount += 2; + input.readChar(); // reading reserved word + readcount += 2; + byte bigendian = input.readByte(); + readcount ++; + byte charset = input.readByte(); + readcount ++; + byte charsize = input.readByte(); + readcount ++; + input.readByte(); // reading reserved byte + readcount ++; + + byte dataFormatID[] = new byte[4]; + input.readFully(dataFormatID); + readcount += 4; + byte dataVersion[] = new byte[4]; + input.readFully(dataVersion); + readcount += 4; + byte unicodeVersion[] = new byte[4]; + input.readFully(unicodeVersion); + readcount += 4; + if (headersize < readcount) { + throw new IOException("Internal Error: Header size error"); + } + input.skipBytes(headersize - readcount); + + if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_ + || charsize != CHAR_SIZE_ + || !Arrays.equals(dataFormatIDExpected, dataFormatID) + || (authenticate != null + && !authenticate.isDataVersionAcceptable(dataVersion))) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_); + } + return unicodeVersion; + } + + /** + * Reads an ICU data header, checks the data format, and returns the data version. + * + *

Assumes that the ByteBuffer position is 0 on input. + * The buffer byte order is set according to the data. + * The buffer position is advanced past the header (including UDataInfo and comment). + * + *

See C++ ucmndata.h and unicode/udata.h. + * + * @return dataVersion + * @throws IOException if this is not a valid ICU data item of the expected dataFormat + */ + public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate) + throws IOException { + assert bytes.position() == 0; + byte magic1 = bytes.get(2); + byte magic2 = bytes.get(3); + if (magic1 != MAGIC1 || magic2 != MAGIC2) { + throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_); + } + + byte isBigEndian = bytes.get(8); + byte charsetFamily = bytes.get(9); + byte sizeofUChar = bytes.get(10); + if (isBigEndian < 0 || 1 < isBigEndian || + charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_); + } + bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN); + + int headerSize = bytes.getChar(0); + int sizeofUDataInfo = bytes.getChar(4); + if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4)) { + throw new IOException("Internal Error: Header size error"); + } + // TODO: Change Authenticate to take int major, int minor, int milli, int micro + // to avoid array allocation. 
+ byte[] formatVersion = new byte[] { + bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19) + }; + if (bytes.get(12) != (byte)(dataFormat >> 24) || + bytes.get(13) != (byte)(dataFormat >> 16) || + bytes.get(14) != (byte)(dataFormat >> 8) || + bytes.get(15) != (byte)dataFormat || + (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_ + + String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d", + bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15), + formatVersion[0] & 0xff, formatVersion[1] & 0xff, + formatVersion[2] & 0xff, formatVersion[3] & 0xff)); + } + + bytes.position(headerSize); + return // dataVersion + ((int)bytes.get(20) << 24) | + ((bytes.get(21) & 0xff) << 16) | + ((bytes.get(22) & 0xff) << 8) | + (bytes.get(23) & 0xff); + } + + public static void skipBytes(ByteBuffer bytes, int skipLength) { + if (skipLength > 0) { + bytes.position(bytes.position() + skipLength); + } + } + + public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) { + byte[] dest = new byte[length]; + bytes.get(dest); + if (additionalSkipLength > 0) { + skipBytes(bytes, additionalSkipLength); + } + return dest; + } + + public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) { + CharSequence cs = bytes.asCharBuffer(); + String s = cs.subSequence(0, length).toString(); + skipBytes(bytes, length * 2 + additionalSkipLength); + return s; + } + + public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) { + char[] dest = new char[length]; + bytes.asCharBuffer().get(dest); + skipBytes(bytes, length * 2 + additionalSkipLength); + return dest; + } + + public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) { + int[] dest = new int[length]; + bytes.asIntBuffer().get(dest); + skipBytes(bytes, length * 4 + additionalSkipLength); + return dest; + } + + /** + * Returns a 
VersionInfo for the bytes in the compact version integer. + */ + public static VersionInfo getVersionInfoFromCompactInt(int version) { + return VersionInfo.getInstance( + version >>> 24, (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff); + } + + // private variables ------------------------------------------------- + + /** + * Magic numbers to authenticate the data file + */ + private static final byte MAGIC1 = (byte)0xda; + private static final byte MAGIC2 = (byte)0x27; + + /** + * File format authentication values + */ + private static final byte CHAR_SET_ = 0; + private static final byte CHAR_SIZE_ = 2; + + /** + * Error messages + */ + private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ = + "ICUBinary data file error: Magic number authentication failed"; + private static final String HEADER_AUTHENTICATION_FAILED_ = + "ICUBinary data file error: Header authentication failed"; +} --- old/src/java.base/share/classes/sun/text/normalizer/Norm2AllModes.java 2020-01-10 15:57:29.000000000 -0800 +++ /dev/null 2020-01-10 15:57:29.000000000 -0800 @@ -1,287 +0,0 @@ -/* - * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). 
- * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2009-2014, International Business Machines - * Corporation and others. All Rights Reserved. - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.IOException; - -final class Norm2AllModes { - // Public API dispatch via Normalizer2 subclasses -------------------------- *** - - // Normalizer2 implementation for the old UNORM_NONE. - public static final class NoopNormalizer2 extends Normalizer2 { - @Override - public StringBuilder normalize(CharSequence src, StringBuilder dest) { - if(dest!=src) { - dest.setLength(0); - return dest.append(src); - } else { - throw new IllegalArgumentException(); - } - } - - @Override - public Appendable normalize(CharSequence src, Appendable dest) { - if(dest!=src) { - try { - return dest.append(src); - } catch(IOException e) { - throw new InternalError(e.toString(), e); - } - } else { - throw new IllegalArgumentException(); - } - } - - @Override - public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) { - if(first!=second) { - return first.append(second); - } else { - throw new IllegalArgumentException(); - } - } - - @Override - public StringBuilder append(StringBuilder first, CharSequence second) { - if(first!=second) { - return first.append(second); - } else { - throw new IllegalArgumentException(); - } - } - - @Override - public String getDecomposition(int c) { - return null; - } - - // No need to override the default 
getRawDecomposition(). - @Override - public boolean isNormalized(CharSequence s) { return true; } - - @Override - public int spanQuickCheckYes(CharSequence s) { return s.length(); } - - @Override - public boolean hasBoundaryBefore(int c) { return true; } - } - - // Intermediate class: - // Has NormalizerImpl and does boilerplate argument checking and setup. - public abstract static class Normalizer2WithImpl extends Normalizer2 { - public Normalizer2WithImpl(NormalizerImpl ni) { - impl=ni; - } - - // normalize - @Override - public StringBuilder normalize(CharSequence src, StringBuilder dest) { - if(dest==src) { - throw new IllegalArgumentException(); - } - dest.setLength(0); - normalize(src, new NormalizerImpl.ReorderingBuffer(impl, dest, src.length())); - return dest; - } - - @Override - public Appendable normalize(CharSequence src, Appendable dest) { - if(dest==src) { - throw new IllegalArgumentException(); - } - NormalizerImpl.ReorderingBuffer buffer= - new NormalizerImpl.ReorderingBuffer(impl, dest, src.length()); - normalize(src, buffer); - buffer.flush(); - return dest; - } - - protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer); - - // normalize and append - @Override - public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) { - return normalizeSecondAndAppend(first, second, true); - } - - @Override - public StringBuilder append(StringBuilder first, CharSequence second) { - return normalizeSecondAndAppend(first, second, false); - } - - public StringBuilder normalizeSecondAndAppend( - StringBuilder first, CharSequence second, boolean doNormalize) { - if(first==second) { - throw new IllegalArgumentException(); - } - normalizeAndAppend( - second, doNormalize, - new NormalizerImpl.ReorderingBuffer(impl, first, first.length()+second.length())); - return first; - } - - protected abstract void normalizeAndAppend( - CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer); - 
- @Override - public String getDecomposition(int c) { - return impl.getDecomposition(c); - } - - @Override - public int getCombiningClass(int c) { - return impl.getCC(impl.getNorm16(c)); - } - - // quick checks - @Override - public boolean isNormalized(CharSequence s) { - return s.length()==spanQuickCheckYes(s); - } - - public final NormalizerImpl impl; - } - - public static final class DecomposeNormalizer2 extends Normalizer2WithImpl { - public DecomposeNormalizer2(NormalizerImpl ni) { - super(ni); - } - - @Override - protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { - impl.decompose(src, 0, src.length(), buffer); - } - - @Override - protected void normalizeAndAppend( - CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { - impl.decomposeAndAppend(src, doNormalize, buffer); - } - - @Override - public int spanQuickCheckYes(CharSequence s) { - return impl.decompose(s, 0, s.length(), null); - } - - @Override - public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundaryBefore(c); } - } - - public static final class ComposeNormalizer2 extends Normalizer2WithImpl { - public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) { - super(ni); - onlyContiguous=fcc; - } - - @Override - protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { - impl.compose(src, 0, src.length(), onlyContiguous, true, buffer); - } - - @Override - protected void normalizeAndAppend( - CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { - impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer); - } - - @Override - public boolean isNormalized(CharSequence s) { - // 5: small destCapacity for substring normalization - return impl.compose(s, 0, s.length(), - onlyContiguous, false, - new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5)); - } - - @Override - public int spanQuickCheckYes(CharSequence s) { - return impl.composeQuickCheck(s, 0, s.length(), 
onlyContiguous, true)>>>1; - } - - @Override - public boolean hasBoundaryBefore(int c) { return impl.hasCompBoundaryBefore(c); } - - private final boolean onlyContiguous; - } - - // instance cache ---------------------------------------------------------- *** - - private Norm2AllModes(NormalizerImpl ni) { - impl=ni; - comp=new ComposeNormalizer2(ni, false); - decomp=new DecomposeNormalizer2(ni); - } - - public final NormalizerImpl impl; - public final ComposeNormalizer2 comp; - public final DecomposeNormalizer2 decomp; - - private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) { - if(singleton.exception!=null) { - throw singleton.exception; - } - return singleton.allModes; - } - - public static Norm2AllModes getNFCInstance() { - return getInstanceFromSingleton(NFCSingleton.INSTANCE); - } - - public static Norm2AllModes getNFKCInstance() { - return getInstanceFromSingleton(NFKCSingleton.INSTANCE); - } - - public static final NoopNormalizer2 NOOP_NORMALIZER2=new NoopNormalizer2(); - - private static final class Norm2AllModesSingleton { - private Norm2AllModesSingleton(String name) { - try { - String DATA_FILE_NAME = "/sun/text/resources/" + name + ".nrm"; - NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME); - allModes=new Norm2AllModes(impl); - } catch (RuntimeException e) { - exception=e; - } - } - - private Norm2AllModes allModes; - private RuntimeException exception; - } - - private static final class NFCSingleton { - private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfc"); - } - - private static final class NFKCSingleton { - private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfkc"); - } -} --- /dev/null 2020-01-10 15:57:29.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/Norm2AllModes.java 2020-01-10 15:57:29.000000000 -0800 @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; + +import jdk.internal.icu.text.Normalizer2; +import jdk.internal.icu.util.VersionInfo; + +public final class Norm2AllModes { + // Public API dispatch via Normalizer2 subclasses -------------------------- *** + + // Normalizer2 implementation for the old UNORM_NONE. 
+ public static final class NoopNormalizer2 extends Normalizer2 { + @Override + public StringBuilder normalize(CharSequence src, StringBuilder dest) { + if(dest!=src) { + dest.setLength(0); + return dest.append(src); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public Appendable normalize(CharSequence src, Appendable dest) { + if(dest!=src) { + try { + return dest.append(src); + } catch(IOException e) { + throw new InternalError(e.toString(), e); + } + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) { + if(first!=second) { + return first.append(second); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public StringBuilder append(StringBuilder first, CharSequence second) { + if(first!=second) { + return first.append(second); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public String getDecomposition(int c) { + return null; + } + + // No need to override the default getRawDecomposition(). + @Override + public boolean isNormalized(CharSequence s) { return true; } + + @Override + public int spanQuickCheckYes(CharSequence s) { return s.length(); } + + @Override + public boolean hasBoundaryBefore(int c) { return true; } + } + + // Intermediate class: + // Has NormalizerImpl and does boilerplate argument checking and setup. 
+ public abstract static class Normalizer2WithImpl extends Normalizer2 { + public Normalizer2WithImpl(NormalizerImpl ni) { + impl=ni; + } + + // normalize + @Override + public StringBuilder normalize(CharSequence src, StringBuilder dest) { + if(dest==src) { + throw new IllegalArgumentException(); + } + dest.setLength(0); + normalize(src, new NormalizerImpl.ReorderingBuffer(impl, dest, src.length())); + return dest; + } + + @Override + public Appendable normalize(CharSequence src, Appendable dest) { + if(dest==src) { + throw new IllegalArgumentException(); + } + NormalizerImpl.ReorderingBuffer buffer= + new NormalizerImpl.ReorderingBuffer(impl, dest, src.length()); + normalize(src, buffer); + buffer.flush(); + return dest; + } + + protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer); + + // normalize and append + @Override + public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) { + return normalizeSecondAndAppend(first, second, true); + } + + @Override + public StringBuilder append(StringBuilder first, CharSequence second) { + return normalizeSecondAndAppend(first, second, false); + } + + public StringBuilder normalizeSecondAndAppend( + StringBuilder first, CharSequence second, boolean doNormalize) { + if(first==second) { + throw new IllegalArgumentException(); + } + normalizeAndAppend( + second, doNormalize, + new NormalizerImpl.ReorderingBuffer(impl, first, first.length()+second.length())); + return first; + } + + protected abstract void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer); + + @Override + public String getDecomposition(int c) { + return impl.getDecomposition(c); + } + + @Override + public int getCombiningClass(int c) { + return impl.getCC(impl.getNorm16(c)); + } + + // quick checks + @Override + public boolean isNormalized(CharSequence s) { + return s.length()==spanQuickCheckYes(s); + } + + public final NormalizerImpl impl; + } 
+ + public static final class DecomposeNormalizer2 extends Normalizer2WithImpl { + public DecomposeNormalizer2(NormalizerImpl ni) { + super(ni); + } + + @Override + protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { + impl.decompose(src, 0, src.length(), buffer); + } + + @Override + protected void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { + impl.decomposeAndAppend(src, doNormalize, buffer); + } + + @Override + public int spanQuickCheckYes(CharSequence s) { + return impl.decompose(s, 0, s.length(), null); + } + + @Override + public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundaryBefore(c); } + } + + public static final class ComposeNormalizer2 extends Normalizer2WithImpl { + public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) { + super(ni); + onlyContiguous=fcc; + } + + @Override + protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { + impl.compose(src, 0, src.length(), onlyContiguous, true, buffer); + } + + @Override + protected void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { + impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer); + } + + @Override + public boolean isNormalized(CharSequence s) { + // 5: small destCapacity for substring normalization + return impl.compose(s, 0, s.length(), + onlyContiguous, false, + new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5)); + } + + @Override + public int spanQuickCheckYes(CharSequence s) { + return impl.composeQuickCheck(s, 0, s.length(), onlyContiguous, true)>>>1; + } + + @Override + public boolean hasBoundaryBefore(int c) { return impl.hasCompBoundaryBefore(c); } + + private final boolean onlyContiguous; + } + + // instance cache ---------------------------------------------------------- *** + + private Norm2AllModes(NormalizerImpl ni) { + impl=ni; + comp=new ComposeNormalizer2(ni, false); 
+ decomp=new DecomposeNormalizer2(ni); + } + + public final NormalizerImpl impl; + public final ComposeNormalizer2 comp; + public final DecomposeNormalizer2 decomp; + + private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) { + if(singleton.exception!=null) { + throw singleton.exception; + } + return singleton.allModes; + } + + public static Norm2AllModes getNFCInstance() { + return getInstanceFromSingleton(NFCSingleton.INSTANCE); + } + + public static Norm2AllModes getNFKCInstance() { + return getInstanceFromSingleton(NFKCSingleton.INSTANCE); + } + + public static final NoopNormalizer2 NOOP_NORMALIZER2=new NoopNormalizer2(); + + private static final class Norm2AllModesSingleton { + private Norm2AllModesSingleton(String name) { + try { + @SuppressWarnings("deprecation") + String DATA_FILE_NAME = "/jdk/internal/icu/impl/data/icudt" + + VersionInfo.ICU_DATA_VERSION_PATH + "/" + name + ".nrm"; + NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME); + allModes=new Norm2AllModes(impl); + } catch (RuntimeException e) { + exception=e; + } + } + + private Norm2AllModes allModes; + private RuntimeException exception; + } + + private static final class NFCSingleton { + private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfc"); + } + + private static final class NFKCSingleton { + private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfkc"); + } +} --- old/src/java.base/share/classes/sun/text/normalizer/NormalizerImpl.java 2020-01-10 15:57:31.000000000 -0800 +++ /dev/null 2020-01-10 15:57:31.000000000 -0800 @@ -1,2188 +0,0 @@ -/* - * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. 
Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2009-2014, International Business Machines - * Corporation and others. All Rights Reserved. 
- ******************************************************************************* - */ -package sun.text.normalizer; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.text.Normalizer; - -// Original filename in ICU4J: Normalizer2Impl.java -public final class NormalizerImpl { - public static final class Hangul { - /* Korean Hangul and Jamo constants */ - public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ - public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ - public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ - - public static final int HANGUL_BASE=0xac00; - public static final int HANGUL_END=0xd7a3; - - public static final int JAMO_L_COUNT=19; - public static final int JAMO_V_COUNT=21; - public static final int JAMO_T_COUNT=28; - - public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; - public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; - - public static boolean isHangul(int c) { - return HANGUL_BASE<=c && c - * If dest is a StringBuilder, then the buffer writes directly to it. - * Otherwise, the buffer maintains a StringBuilder for intermediate text segments - * until no further changes are necessary and whole segments are appended. - * append() methods that take combining-class values always write to the StringBuilder. - * Other append() methods flush and append to the Appendable. - */ - public static final class ReorderingBuffer implements Appendable { - public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { - impl=ni; - app=dest; - if (app instanceof StringBuilder) { - appIsStringBuilder=true; - str=(StringBuilder)dest; - // In Java, the constructor subsumes public void init(int destCapacity) - str.ensureCapacity(destCapacity); - reorderStart=0; - if(str.length()==0) { - lastCC=0; - } else { - setIterator(); - lastCC=previousCC(); - // Set reorderStart after the last code point with cc<=1 if there is one. 
- if(lastCC>1) { - while(previousCC()>1) {} - } - reorderStart=codePointLimit; - } - } else { - appIsStringBuilder=false; - str=new StringBuilder(); - reorderStart=0; - lastCC=0; - } - } - - public boolean isEmpty() { return str.length()==0; } - public int length() { return str.length(); } - public int getLastCC() { return lastCC; } - - public StringBuilder getStringBuilder() { return str; } - - public boolean equals(CharSequence s, int start, int limit) { - return UTF16Plus.equal(str, 0, str.length(), s, start, limit); - } - - public void append(int c, int cc) { - if(lastCC<=cc || cc==0) { - str.appendCodePoint(c); - lastCC=cc; - if(cc<=1) { - reorderStart=str.length(); - } - } else { - insert(c, cc); - } - } - public void append(CharSequence s, int start, int limit, boolean isNFD, - int leadCC, int trailCC) { - if(start==limit) { - return; - } - if(lastCC<=leadCC || leadCC==0) { - if(trailCC<=1) { - reorderStart=str.length()+(limit-start); - } else if(leadCC<=1) { - reorderStart=str.length()+1; // Ok if not a code point boundary. 
- } - str.append(s, start, limit); - lastCC=trailCC; - } else { - int c=Character.codePointAt(s, start); - start+=Character.charCount(c); - insert(c, leadCC); // insert first code point - while(startcc;) {} - // insert c at codePointLimit, after the character with prevCC<=cc - if(c<=0xffff) { - str.insert(codePointLimit, (char)c); - if(cc<=1) { - reorderStart=codePointLimit+1; - } - } else { - str.insert(codePointLimit, Character.toChars(c)); - if(cc<=1) { - reorderStart=codePointLimit+2; - } - } - } - - private final NormalizerImpl impl; - private final Appendable app; - private final StringBuilder str; - private final boolean appIsStringBuilder; - private int reorderStart; - private int lastCC; - - // private backward iterator - private void setIterator() { codePointStart=str.length(); } - private void skipPrevious() { // Requires 0=codePointStart) { - return 0; - } - int c=str.codePointBefore(codePointStart); - codePointStart-=Character.charCount(c); - return impl.getCCFromYesOrMaybeCP(c); - } - private int codePointStart, codePointLimit; - } - - // TODO: Propose as public API on the UTF16 class. - // TODO: Propose widening UTF16 methods that take char to take int. - // TODO: Propose widening UTF16 methods that take String to take CharSequence. - public static final class UTF16Plus { - /** - * Is this code point a lead surrogate (U+d800..U+dbff)? - * @param c code unit or code point - * @return true or false - */ - public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } - /** - * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), - * is it a lead surrogate? - * @param c code unit or code point - * @return true or false - */ - public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } - - /** - * Compares two CharSequence subsequences for binary equality. 
- * @param s1 first sequence - * @param start1 start offset in first sequence - * @param limit1 limit offset in first sequence - * @param s2 second sequence - * @param start2 start offset in second sequence - * @param limit2 limit offset in second sequence - * @return true if s1.subSequence(start1, limit1) contains the same text - * as s2.subSequence(start2, limit2) - */ - public static boolean equal(CharSequence s1, int start1, int limit1, - CharSequence s2, int start2, int limit2) { - if((limit1-start1)!=(limit2-start2)) { - return false; - } - if(s1==s2 && start1==start2) { - return true; - } - while(start1>DELTA_SHIFT)-MAX_DELTA-1; - - // Read the normTrie. - int offset=inIndexes[IX_NORM_TRIE_OFFSET]; - int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; - int triePosition = bytes.position(); - normTrie = CodePointTrie.Fast16.fromBinary(bytes); - int trieLength = bytes.position() - triePosition; - if(trieLength>(nextOffset-offset)) { - throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); - } - ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes - - // Read the composition and mapping data. - offset=nextOffset; - nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; - int numChars=(nextOffset-offset)/2; - if(numChars!=0) { - maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); - extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); - } - - // smallFCD: new in formatVersion 2 - offset=nextOffset; - smallFCD=new byte[0x100]; - bytes.get(smallFCD); - - return this; - } catch(IOException e) { - throw new InternalError(e); - } - } - public NormalizerImpl load(String name) { - return load(ICUBinary.getRequiredData(name)); - } - - // The trie stores values for lead surrogate code *units*. - // Surrogate code *points* are inert. - public int getNorm16(int c) { - return UTF16Plus.isLeadSurrogate(c) ? 
INERT : normTrie.get(c); - } - public int getRawNorm16(int c) { return normTrie.get(c); } - public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16=MIN_NORMAL_MAYBE_YES) { - return getCCFromNormalYesOrMaybe(norm16); - } - if(norm16> OFFSET_SHIFT) & 0xff; - } - public static int getCCFromYesOrMaybe(int norm16) { - return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; - } - public int getCCFromYesOrMaybeCP(int c) { - if (c < minCompNoMaybeCP) { return 0; } - return getCCFromYesOrMaybe(getNorm16(c)); - } - - /** - * Returns the FCD data for code point c. - * @param c A Unicode code point. - * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. - */ - public int getFCD16(int c) { - if(c>8]; - if(bits==0) { return false; } - return ((bits>>((lead>>5)&7))&1)!=0; - } - - /** Gets the FCD value from the regular normalization data. */ - public int getFCD16FromNormData(int c) { - int norm16=getNorm16(c); - if (norm16 >= limitNoNo) { - if(norm16>=MIN_NORMAL_MAYBE_YES) { - // combining mark - norm16=getCCFromNormalYesOrMaybe(norm16); - return norm16|(norm16<<8); - } else if(norm16>=minMaybeYes) { - return 0; - } else { // isDecompNoAlgorithmic(norm16) - int deltaTrailCC = norm16 & DELTA_TCCC_MASK; - if (deltaTrailCC <= DELTA_TCCC_1) { - return deltaTrailCC >> OFFSET_SHIFT; - } - // Maps to an isCompYesAndZeroCC. - c=mapAlgorithmic(c, norm16); - norm16=getRawNorm16(c); - } - } - if(norm16<=minYesNo || isHangulLVT(norm16)) { - // no decomposition or Hangul syllable, all zeros - return 0; - } - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - int fcd16=firstUnit>>8; // tccc - if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { - fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc - } - return fcd16; - } - - /** - * Gets the decomposition for one code point. 
- * @param c code point - * @return c's decomposition, if it has one; returns null if it does not have a decomposition - */ - public String getDecomposition(int c) { - int norm16; - if(c>OFFSET_SHIFT; - int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; - return extraData.substring(mapping, mapping+length); - } - - // Fixed norm16 values. - public static final int MIN_YES_YES_WITH_CC=0xfe02; - public static final int JAMO_VT=0xfe00; - public static final int MIN_NORMAL_MAYBE_YES=0xfc00; - public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE - public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE - - // norm16 bit 0 is comp-boundary-after. - public static final int HAS_COMP_BOUNDARY_AFTER=1; - public static final int OFFSET_SHIFT=1; - - // For algorithmic one-way mappings, norm16 bits 2..1 indicate the - // tccc (0, 1, >1) for quick FCC boundary-after tests. - public static final int DELTA_TCCC_0=0; - public static final int DELTA_TCCC_1=2; - public static final int DELTA_TCCC_GT_1=4; - public static final int DELTA_TCCC_MASK=6; - public static final int DELTA_SHIFT=3; - - public static final int MAX_DELTA=0x40; - - // Byte offsets from the start of the data, after the generic header. - public static final int IX_NORM_TRIE_OFFSET=0; - public static final int IX_EXTRA_DATA_OFFSET=1; - public static final int IX_SMALL_FCD_OFFSET=2; - public static final int IX_RESERVED3_OFFSET=3; - public static final int IX_TOTAL_SIZE=7; - public static final int MIN_CCC_LCCC_CP=0x300; - // Code point thresholds for quick check codes. - public static final int IX_MIN_DECOMP_NO_CP=8; - public static final int IX_MIN_COMP_NO_MAYBE_CP=9; - - // Norm16 value thresholds for quick check combinations and types of extra data. - - /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ - public static final int IX_MIN_YES_NO=10; - /** Mappings are comp-normalized. 
*/ - public static final int IX_MIN_NO_NO=11; - public static final int IX_LIMIT_NO_NO=12; - public static final int IX_MIN_MAYBE_YES=13; - - /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ - public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; - /** Mappings are not comp-normalized but have a comp boundary before. */ - public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; - /** Mappings do not have a comp boundary before. */ - public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; - /** Mappings to the empty string. */ - public static final int IX_MIN_NO_NO_EMPTY=17; - - public static final int IX_MIN_LCCC_CP=18; - public static final int IX_COUNT=20; - - public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; - public static final int MAPPING_HAS_RAW_MAPPING=0x40; - // unused bit 0x20; - public static final int MAPPING_LENGTH_MASK=0x1f; - - public static final int COMP_1_LAST_TUPLE=0x8000; - public static final int COMP_1_TRIPLE=1; - public static final int COMP_1_TRAIL_LIMIT=0x3400; - public static final int COMP_1_TRAIL_MASK=0x7ffe; - public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit - public static final int COMP_2_TRAIL_SHIFT=6; - public static final int COMP_2_TRAIL_MASK=0xffc0; - - // higher-level functionality ------------------------------------------ *** - - /** - * Decomposes s[src, limit[ and writes the result to dest. - * limit can be NULL if src is NUL-terminated. - * destLengthEstimate is the initial dest buffer capacity and can be -1. 
- */ - public void decompose(CharSequence s, int src, int limit, StringBuilder dest, - int destLengthEstimate) { - if(destLengthEstimate<0) { - destLengthEstimate=limit-src; - } - dest.setLength(0); - ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); - decompose(s, src, limit, buffer); - } - - // Dual functionality: - // buffer!=NULL: normalize - // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes - public int decompose(CharSequence s, int src, int limit, - ReorderingBuffer buffer) { - int minNoCP=minDecompNoCP; - - int prevSrc; - int c=0; - int norm16=0; - - // only for quick check - int prevBoundary=src; - int prevCC=0; - - for(;;) { - // count code units below the minimum or with irrelevant data for the quick check - for(prevSrc=src; src!=limit;) { - if( (c=s.charAt(src))=limit) { - break; - } - c=Character.codePointAt(s, src); - cc=getCC(getNorm16(c)); - }; - buffer.append(s, 0, src, false, firstCC, prevCC); - buffer.append(s, src, limit); - } - - // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. - // doCompose: normalize - // !doCompose: isNormalized (buffer must be empty and initialized) - public boolean compose(CharSequence s, int src, int limit, - boolean onlyContiguous, - boolean doCompose, - ReorderingBuffer buffer) { - int prevBoundary=src; - int minNoMaybeCP=minCompNoMaybeCP; - - for (;;) { - // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, - // or with (compYes && ccc==0) properties. - int prevSrc; - int c = 0; - int norm16 = 0; - for (;;) { - if (src == limit) { - if (prevBoundary != limit && doCompose) { - buffer.append(s, prevBoundary, limit); - } - return true; - } - if( (c=s.charAt(src))=minNoNo. - // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) - // or a "yesYes" with ccc!=0. - // It is not a Hangul syllable or Jamo L because those have "yes" properties. 
- - // Medium-fast path: Handle cases that do not require full decomposition and recomposition. - if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes - if (!doCompose) { - return false; - } - // Fast path for mapping a character that is immediately surrounded by boundaries. - // In this case, we need not decompose around the current character. - if (isDecompNoAlgorithmic(norm16)) { - // Maps to a single isCompYesAndZeroCC character - // which also implies hasCompBoundaryBefore. - if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || - hasCompBoundaryBefore(s, src, limit)) { - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - buffer.append(mapAlgorithmic(c, norm16), 0); - prevBoundary = src; - continue; - } - } else if (norm16 < minNoNoCompBoundaryBefore) { - // The mapping is comp-normalized which also implies hasCompBoundaryBefore. - if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || - hasCompBoundaryBefore(s, src, limit)) { - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - int mapping = norm16 >> OFFSET_SHIFT; - int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; - buffer.append(extraData, mapping, mapping + length); - prevBoundary = src; - continue; - } - } else if (norm16 >= minNoNoEmpty) { - // The current character maps to nothing. - // Simply omit it from the output if there is a boundary before _or_ after it. - // The character itself implies no boundaries. - if (hasCompBoundaryBefore(s, src, limit) || - hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - prevBoundary = src; - continue; - } - } - // Other "noNo" type, or need to examine more text around this character: - // Fall through to the slow path. 
- } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { - char prev=s.charAt(prevSrc-1); - if(c= 0) { - int syllable = Hangul.HANGUL_BASE + - (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * - Hangul.JAMO_T_COUNT + t; - --prevSrc; // Replace the Jamo L as well. - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - buffer.append((char)syllable); - prevBoundary = src; - continue; - } - // If we see L+V+x where x!=T then we drop to the slow path, - // decompose and recompose. - // This is to deal with NFKC finding normal L and V but a - // compatibility variant of a T. - // We need to either fully compose that combination here - // (which would complicate the code and may not work with strange custom data) - // or use the slow path. - } - } else if (Hangul.isHangulLV(prev)) { - // The current character is a Jamo Trailing consonant, - // compose with previous Hangul LV that does not contain a Jamo T. - if (!doCompose) { - return false; - } - int syllable = prev + c - Hangul.JAMO_T_BASE; - --prevSrc; // Replace the Hangul LV as well. - if (prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - buffer.append((char)syllable); - prevBoundary = src; - continue; - } - // No matching context, or may need to decompose surrounding text first: - // Fall through to the slow path. - } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC - // One or more combining marks that do not combine-back: - // Check for canonical order, copy unchanged if ok and - // if followed by a character with a boundary-before. - int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 - if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { - // Fails FCD test, need to decompose and contiguously recompose. - if (!doCompose) { - return false; - } - } else { - // If !onlyContiguous (not FCC), then we ignore the tccc of - // the previous character which passed the quick check "yes && ccc==0" test. 
- int n16; - for (;;) { - if (src == limit) { - if (doCompose) { - buffer.append(s, prevBoundary, limit); - } - return true; - } - int prevCC = cc; - c = Character.codePointAt(s, src); - n16 = normTrie.get(c); - if (n16 >= MIN_YES_YES_WITH_CC) { - cc = getCCFromNormalYesOrMaybe(n16); - if (prevCC > cc) { - if (!doCompose) { - return false; - } - break; - } - } else { - break; - } - src += Character.charCount(c); - } - // p is after the last in-order combining mark. - // If there is a boundary here, then we continue with no change. - if (norm16HasCompBoundaryBefore(n16)) { - if (isCompYesAndZeroCC(n16)) { - src += Character.charCount(c); - } - continue; - } - // Use the slow path. There is no boundary in [prevSrc, src[. - } - } - - // Slow path: Find the nearest boundaries around the current character, - // decompose and recompose. - if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { - c = Character.codePointBefore(s, prevSrc); - norm16 = normTrie.get(c); - if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { - prevSrc -= Character.charCount(c); - } - } - if (doCompose && prevBoundary != prevSrc) { - buffer.append(s, prevBoundary, prevSrc); - } - int recomposeStartIndex=buffer.length(); - // We know there is not a boundary here. - decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, - buffer); - // Decompose until the next boundary. - src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, - buffer); - recompose(buffer, recomposeStartIndex, onlyContiguous); - if(!doCompose) { - if(!buffer.equals(s, prevSrc, src)) { - return false; - } - buffer.remove(); - } - prevBoundary=src; - } - } - - /** - * Very similar to compose(): Make the same changes in both places if relevant. 
- * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) - * !doSpan: quickCheck - * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and - * bit 0: set if "maybe"; otherwise, if the span length<s.length() - * then the quick check result is "no" - */ - public int composeQuickCheck(CharSequence s, int src, int limit, - boolean onlyContiguous, boolean doSpan) { - int qcResult=0; - int prevBoundary=src; - int minNoMaybeCP=minCompNoMaybeCP; - - for(;;) { - // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, - // or with (compYes && ccc==0) properties. - int prevSrc; - int c = 0; - int norm16 = 0; - for (;;) { - if(src==limit) { - return (src<<1)|qcResult; // "yes" or "maybe" - } - if( (c=s.charAt(src))=minNoNo. - // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) - // or a "yesYes" with ccc!=0. - // It is not a Hangul syllable or Jamo L because those have "yes" properties. - - int prevNorm16 = INERT; - if (prevBoundary != prevSrc) { - prevBoundary = prevSrc; - if (!norm16HasCompBoundaryBefore(norm16)) { - c = Character.codePointBefore(s, prevSrc); - int n16 = getNorm16(c); - if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { - prevBoundary -= Character.charCount(c); - prevNorm16 = n16; - } - } - } - - if(isMaybeOrNonZeroCC(norm16)) { - int cc=getCCFromYesOrMaybe(norm16); - if (onlyContiguous /* FCC */ && cc != 0 && - getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { - // The [prevBoundary..prevSrc[ character - // passed the quick check "yes && ccc==0" test - // but is out of canonical order with the current combining mark. - } else { - // If !onlyContiguous (not FCC), then we ignore the tccc of - // the previous character which passed the quick check "yes && ccc==0" test. 
- for (;;) { - if (norm16 < MIN_YES_YES_WITH_CC) { - if (!doSpan) { - qcResult = 1; - } else { - return prevBoundary << 1; // spanYes does not care to know it's "maybe" - } - } - if (src == limit) { - return (src<<1) | qcResult; // "yes" or "maybe" - } - int prevCC = cc; - c = Character.codePointAt(s, src); - norm16 = getNorm16(c); - if (isMaybeOrNonZeroCC(norm16)) { - cc = getCCFromYesOrMaybe(norm16); - if (!(prevCC <= cc || cc == 0)) { - break; - } - } else { - break; - } - src += Character.charCount(c); - } - // src is after the last in-order combining mark. - if (isCompYesAndZeroCC(norm16)) { - prevBoundary = src; - src += Character.charCount(c); - continue; - } - } - } - return prevBoundary<<1; // "no" - } - } - public void composeAndAppend(CharSequence s, - boolean doCompose, - boolean onlyContiguous, - ReorderingBuffer buffer) { - int src=0, limit=s.length(); - if(!buffer.isEmpty()) { - int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); - if(0!=firstStarterInSrc) { - int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), - buffer.length(), onlyContiguous); - StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ - firstStarterInSrc+16); - middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); - buffer.removeSuffix(buffer.length()-lastStarterInDest); - middle.append(s, 0, firstStarterInSrc); - compose(middle, 0, middle.length(), onlyContiguous, true, buffer); - src=firstStarterInSrc; - } - } - if(doCompose) { - compose(s, src, limit, onlyContiguous, true, buffer); - } else { - buffer.append(s, src, limit); - } - } - // Dual functionality: - // buffer!=NULL: normalize - // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes - public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { - // Note: In this function we use buffer->appendZeroCC() because we track - // the lead and trail combining classes here, rather than leaving it to - // the 
ReorderingBuffer. - // The exception is the call to decomposeShort() which uses the buffer - // in the normal way. - - // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. - // Similar to the prevBoundary in the compose() implementation. - int prevBoundary=src; - int prevSrc; - int c=0; - int prevFCD16=0; - int fcd16=0; - - for(;;) { - // count code units with lccc==0 - for(prevSrc=src; src!=limit;) { - if((c=s.charAt(src))1) { - --prevBoundary; - } - } - } else { - int p=src-1; - if( Character.isLowSurrogate(s.charAt(p)) && prevSrc

1) { - prevBoundary=p; - } - } - if(buffer!=null) { - // The last lccc==0 character is excluded from the - // flush-and-append call in case it needs to be modified. - buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); - buffer.append(s, prevBoundary, src); - } - // The start of the current character (c). - prevSrc=src; - } else if(src==limit) { - break; - } - - src+=Character.charCount(c); - // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. - // Check for proper order, and decompose locally if necessary. - if((prevFCD16&0xff)<=(fcd16>>8)) { - // proper order: prev tccc <= current lccc - if((fcd16&0xff)<=1) { - prevBoundary=src; - } - if(buffer!=null) { - buffer.appendZeroCC(c); - } - prevFCD16=fcd16; - continue; - } else if(buffer==null) { - return prevBoundary; // quick check "no" - } else { - /* - * Back out the part of the source that we copied or appended - * already but is now going to be decomposed. - * prevSrc is set to after what was copied/appended. - */ - buffer.removeSuffix(prevSrc-prevBoundary); - /* - * Find the part of the source that needs to be decomposed, - * up to the next safe boundary. - */ - src=findNextFCDBoundary(s, src, limit); - /* - * The source text does not fulfill the conditions for FCD. - * Decompose and reorder a limited piece of the text. 
- */ - decomposeShort(s, prevBoundary, src, false, false, buffer); - prevBoundary=src; - prevFCD16=0; - } - } - return src; - } - - public boolean hasDecompBoundaryBefore(int c) { - return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || - norm16HasDecompBoundaryBefore(getNorm16(c)); - } - public boolean norm16HasDecompBoundaryBefore(int norm16) { - if (norm16 < minNoNoCompNoMaybeCC) { - return true; - } - if (norm16 >= limitNoNo) { - return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; - } - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - // true if leadCC==0 (hasFCDBoundaryBefore()) - return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; - } - public boolean hasDecompBoundaryAfter(int c) { - if (c < minDecompNoCP) { - return true; - } - if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { - return true; - } - return norm16HasDecompBoundaryAfter(getNorm16(c)); - } - public boolean norm16HasDecompBoundaryAfter(int norm16) { - if(norm16 <= minYesNo || isHangulLVT(norm16)) { - return true; - } - if (norm16 >= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { - return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; - } - // Maps to an isCompYesAndZeroCC. 
- return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; - } - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - // decomp after-boundary: same as hasFCDBoundaryAfter(), - // fcd16<=1 || trailCC==0 - if(firstUnit>0x1ff) { - return false; // trailCC>1 - } - if(firstUnit<=0xff) { - return true; // trailCC==0 - } - // if(trailCC==1) test leadCC==0, same as checking for before-boundary - // true if leadCC==0 (hasFCDBoundaryBefore()) - return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; - } - public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } - - public boolean hasCompBoundaryBefore(int c) { - return c=minMaybeYes; } - private static boolean isInert(int norm16) { return norm16==INERT; } - private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } - private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } - private boolean isHangulLV(int norm16) { return norm16==minYesNo; } - private boolean isHangulLVT(int norm16) { - return norm16==hangulLVT(); - } - private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } - - // For use with isCompYes(). - // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. - // static uint8_t getCCFromYes(uint16_t norm16) { - // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; - // } - private int getCCFromNoNo(int norm16) { - int mapping=norm16>>OFFSET_SHIFT; - if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { - return extraData.charAt(mapping-1)&0xff; - } else { - return 0; - } - } - int getTrailCCFromCompYesAndZeroCC(int norm16) { - if(norm16<=minYesNo) { - return 0; // yesYes and Hangul LV have ccc=tccc=0 - } else { - // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 
- return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo - } - } - - // Requires algorithmic-NoNo. - private int mapAlgorithmic(int c, int norm16) { - return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; - } - - // Requires minYesNo>OFFSET_SHIFT); } - - /** - * @return index into maybeYesCompositions, or -1 - */ - private int getCompositionsListForDecompYes(int norm16) { - if(norm16>OFFSET_SHIFT; - } - } - /** - * @return index into maybeYesCompositions - */ - private int getCompositionsListForComposite(int norm16) { - // A composite has both mapping & compositions list. - int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; - int firstUnit=maybeYesCompositions.charAt(list); - return list+ // mapping in maybeYesCompositions - 1+ // +1 to skip the first unit with the mapping length - (firstUnit&MAPPING_LENGTH_MASK); // + mapping length - } - - // Decompose a short piece of text which is likely to contain characters that - // fail the quick check loop and/or where the quick check loop's overhead - // is unlikely to be amortized. - // Called by the compose() and makeFCD() implementations. - // Public in Java for collation implementation code. - private int decomposeShort( - CharSequence s, int src, int limit, - boolean stopAtCompBoundary, boolean onlyContiguous, - ReorderingBuffer buffer) { - while(src= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { - buffer.append(c, getCCFromYesOrMaybe(norm16)); - return; - } - // Maps to an isCompYesAndZeroCC. 
- c=mapAlgorithmic(c, norm16); - norm16=getRawNorm16(c); - } - if (norm16 < minYesNo) { - // c does not decompose - buffer.append(c, 0); - } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { - // Hangul syllable: decompose algorithmically - Hangul.decompose(c, buffer); - } else { - // c decomposes, get everything from the variable-length extra data - int mapping=norm16>>OFFSET_SHIFT; - int firstUnit=extraData.charAt(mapping); - int length=firstUnit&MAPPING_LENGTH_MASK; - int leadCC, trailCC; - trailCC=firstUnit>>8; - if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { - leadCC=extraData.charAt(mapping-1)>>8; - } else { - leadCC=0; - } - ++mapping; // skip over the firstUnit - buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); - } - } - - /** - * Finds the recomposition result for - * a forward-combining "lead" character, - * specified with a pointer to its compositions list, - * and a backward-combining "trail" character. - * - *

If the lead and trail characters combine, then this function returns - * the following "compositeAndFwd" value: - *

-     * Bits 21..1  composite character
-     * Bit      0  set if the composite is a forward-combining starter
-     * 
- * otherwise it returns -1. - * - *

The compositions list has (trail, compositeAndFwd) pair entries, - * encoded as either pairs or triples of 16-bit units. - * The last entry has the high bit of its first unit set. - * - *

The list is sorted by ascending trail characters (there are no duplicates). - * A linear search is used. - * - *

See normalizer2impl.h for a more detailed description - * of the compositions list format. - */ - private static int combine(String compositions, int list, int trail) { - int key1, firstUnit; - if(trail(firstUnit=compositions.charAt(list))) { - list+=2+(firstUnit&COMP_1_TRIPLE); - } - if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { - if((firstUnit&COMP_1_TRIPLE)!=0) { - return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); - } else { - return compositions.charAt(list+1); - } - } - } else { - // trail character is 3400..10FFFF - // result entry has 3 units - key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); - int key2=(trail<(firstUnit=compositions.charAt(list))) { - list+=2+(firstUnit&COMP_1_TRIPLE); - } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { - if(key2>(secondUnit=compositions.charAt(list+1))) { - if((firstUnit&COMP_1_LAST_TUPLE)!=0) { - break; - } else { - list+=3; - } - } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { - return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); - } else { - break; - } - } else { - break; - } - } - } - return -1; - } - - /* - * Recomposes the buffer text starting at recomposeStartIndex - * (which is in NFD - decomposed and canonically ordered), - * and truncates the buffer contents. - * - * Note that recomposition never lengthens the text: - * Any character consists of either one or two code units; - * a composition may contain at most one more code unit than the original starter, - * while the combining mark that is removed has at least one code unit. 
- */ - private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, - boolean onlyContiguous) { - StringBuilder sb=buffer.getStringBuilder(); - int p=recomposeStartIndex; - if(p==sb.length()) { - return; - } - - int starter, pRemove; - int compositionsList; - int c, compositeAndFwd; - int norm16; - int cc, prevCC; - boolean starterIsSupplementary; - - // Some of the following variables are not used until we have a forward-combining starter - // and are only initialized now to avoid compiler warnings. - compositionsList=-1; // used as indicator for whether we have a forward-combining starter - starter=-1; - starterIsSupplementary=false; - prevCC=0; - - for(;;) { - c=sb.codePointAt(p); - p+=Character.charCount(c); - norm16=getNorm16(c); - cc=getCCFromYesOrMaybe(norm16); - if( // this character combines backward and - isMaybe(norm16) && - // we have seen a starter that combines forward and - compositionsList>=0 && - // the backward-combining character is not blocked - (prevCC=0) { - // The starter and the combining mark (c) do combine. - int composite=compositeAndFwd>>1; - - // Remove the combining mark. - pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark - sb.delete(pRemove, p); - p=pRemove; - // Replace the starter with the composite. - if(starterIsSupplementary) { - if(composite>0xffff) { - // both are supplementary - sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); - sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); - } else { - sb.setCharAt(starter, (char)c); - sb.deleteCharAt(starter+1); - // The composite is shorter than the starter, - // move the intermediate characters forward one. - starterIsSupplementary=false; - --p; - } - } else if(composite>0xffff) { - // The composite is longer than the starter, - // move the intermediate characters back one. 
- starterIsSupplementary=true; - sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); - sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); - ++p; - } else { - // both are on the BMP - sb.setCharAt(starter, (char)composite); - } - - // Keep prevCC because we removed the combining mark. - - if(p==sb.length()) { - break; - } - // Is the composite a starter that combines forward? - if((compositeAndFwd&1)!=0) { - compositionsList= - getCompositionsListForComposite(getRawNorm16(composite)); - } else { - compositionsList=-1; - } - - // We combined; continue with looking for compositions. - continue; - } - } - - // no combination this time - prevCC=cc; - if(p==sb.length()) { - break; - } - - // If c did not combine, then check if it is a starter. - if(cc==0) { - // Found a new starter. - if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { - // It may combine with something, prepare for it. - if(c<=0xffff) { - starterIsSupplementary=false; - starter=p-1; - } else { - starterIsSupplementary=true; - starter=p-2; - } - } - } else if(onlyContiguous) { - // FCC: no discontiguous compositions; any intervening character blocks. - compositionsList=-1; - } - } - buffer.flush(); - } - - /** - * Does c have a composition boundary before it? - * True if its decomposition begins with a character that has - * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). - * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes - * (isCompYesAndZeroCC()) so we need not decompose. 
- */ - private boolean hasCompBoundaryBefore(int c, int norm16) { - return c> OFFSET_SHIFT) <= 0x1ff); - } - - private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { - while(p>0) { - int c=Character.codePointBefore(s, p); - int norm16 = getNorm16(c); - if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { - break; - } - p-=Character.charCount(c); - if(hasCompBoundaryBefore(c, norm16)) { - break; - } - } - return p; - } - private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { - while(p= 0x0009 && c <= 0x000D) || - (c >= 0x0020 && c <= 0x002F) || - (c >= 0x003A && c <= 0x0040) || - (c >= 0x005B && c <= 0x0060) || - (c >= 0x007B && c <= 0x007E); - } - - public static String canonicalDecomposeWithSingleQuotation(String string) { - Normalizer2 impl = Normalizer2.getNFDInstance(); - char[] src = string.toCharArray(); - int srcIndex = 0; - int srcLimit = src.length; - char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 - int destIndex = 0; - int destLimit = dest.length; - - int prevSrc; - String norm; - int reorderStartIndex, length; - char c1, c2; - int cp; - int minNoMaybe = 0x00c0; - int cc, prevCC, trailCC; - char[] p; - int pStart; - - // initialize - reorderStartIndex = 0; - prevCC = 0; - norm = null; - cp = 0; - pStart = 0; - - cc = trailCC = -1; // initialize to bogus value - c1 = 0; - for (;;) { - prevSrc=srcIndex; - //quick check (1)less than minNoMaybe (2)no decomp (3)hangual - while (srcIndex != srcLimit && - ((c1 = src[srcIndex]) < minNoMaybe || - (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || - (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables - prevCC = 0; - srcIndex += (cp < 0x10000) ? 
1 : 2; - } - - // copy these code units all at once - if (srcIndex != prevSrc) { - length = srcIndex - prevSrc; - if ((destIndex + length) <= destLimit) { - System.arraycopy(src,prevSrc,dest,destIndex,length); - } - - destIndex += length; - reorderStartIndex = destIndex; - } - - // end of source reached? - if (srcIndex == srcLimit) { - break; - } - - // cp already contains *src and norm32 is set for it, increment src - srcIndex += (cp < 0x10000) ? 1 : 2; - - if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { - c2 = 0; - length = 1; - - if (Character.isHighSurrogate(c1) - || Character.isLowSurrogate(c1)) { - norm = null; - } - } else { - length = 2; - c2 = src[srcIndex-1]; - } - - // get the decomposition and the lead and trail cc's - if (norm == null) { - // cp does not decompose - cc = trailCC = UCharacter.getCombiningClass(cp); - p = null; - pStart = -1; - } else { - - pStart = 0; - p = norm.toCharArray(); - length = p.length; - int cpNum = norm.codePointCount(0, length); - cc= UCharacter.getCombiningClass(norm.codePointAt(0)); - trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); - if (length == 1) { - // fastpath a single code unit from decomposition - c1 = p[pStart]; - c2 = 0; - p = null; - pStart = -1; - } - } - - if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations - // buffer overflow - char[] tmpBuf = new char[destLimit * 2]; - System.arraycopy(dest, 0, tmpBuf, 0, destIndex); - dest = tmpBuf; - destLimit = dest.length; - } - - // append the decomposition to the destination buffer, assume length>0 - { - int reorderSplit = destIndex; - if (p == null) { - // fastpath: single code point - if (needSingleQuotation(c1)) { - //if we need single quotation, no need to consider "prevCC" - //and it must NOT be a supplementary pair - dest[destIndex++] = '\''; - dest[destIndex++] = c1; - dest[destIndex++] = '\''; - trailCC = 0; - } else if(cc != 0 && cc < prevCC) { - // (c1, c2) is out of order with respect to the preceding - // text - 
destIndex += length; - trailCC = insertOrdered(dest, reorderStartIndex, - reorderSplit, destIndex, c1, c2, cc); - } else { - // just append (c1, c2) - dest[destIndex++] = c1; - if(c2 != 0) { - dest[destIndex++] = c2; - } - } - } else { - // general: multiple code points (ordered by themselves) - // from decomposition - if (needSingleQuotation(p[pStart])) { - dest[destIndex++] = '\''; - dest[destIndex++] = p[pStart++]; - dest[destIndex++] = '\''; - length--; - do { - dest[destIndex++] = p[pStart++]; - } while(--length > 0); - } else if (cc != 0 && cc < prevCC) { - destIndex += length; - trailCC = mergeOrdered(dest, reorderStartIndex, - reorderSplit, p, pStart, - pStart+length); - } else { - // just append the decomposition - do { - dest[destIndex++] = p[pStart++]; - } while (--length > 0); - } - } - } - prevCC = trailCC; - if(prevCC == 0) { - reorderStartIndex = destIndex; - } - } - - return new String(dest, 0, destIndex); - } - - /** - * simpler, single-character version of mergeOrdered() - - * bubble-insert one single code point into the preceding string - * which is already canonically ordered - * (c, c2) may or may not yet have been inserted at src[current]..src[p] - * - * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 
1 : 2) - * - * before: src[start]..src[current] is already ordered, and - * src[current]..src[p] may or may not hold (c, c2) but - * must be exactly the same length as (c, c2) - * after: src[start]..src[p] is ordered - * - * @return the trailing combining class - */ - private static int/*unsigned byte*/ insertOrdered(char[] source, - int start, - int current, int p, - char c1, char c2, - int/*unsigned byte*/ cc) { - int back, preBack; - int r; - int prevCC, trailCC=cc; - - if (start=prevCC - preBack=back=current; - - PrevArgs prevArgs = new PrevArgs(); - prevArgs.current = current; - prevArgs.start = start; - prevArgs.src = source; - prevArgs.c1 = c1; - prevArgs.c2 = c2; - - // get the prevCC - prevCC=getPrevCC(prevArgs); - preBack = prevArgs.current; - - if(cc=prevCC) { - break; - } - back=preBack; - } - - // this is where we are right now with all these indicies: - // [start]..[pPreBack] 0..? code points that we can ignore - // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc - // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) - // [current]..[p] 1 code point (c, c2) with cc - - // move the code units in between up - r=p; - do { - source[--r]=source[--current]; - } while (back!=current); - } - } - - // insert (c1, c2) - source[current] = c1; - if (c2!=0) { - source[(current+1)] = c2; - } - - // we know the cc of the last code point - return trailCC; - } - /** - * merge two UTF-16 string parts together - * to canonically order (order by combining classes) their concatenation - * - * the two strings may already be adjacent, so that the merging is done - * in-place if the two strings are not adjacent, then the buffer holding the - * first one must be large enough - * the second string may or may not be ordered in itself - * - * before: [start]..[current] is already ordered, and - * [next]..[limit] may be ordered in itself, but - * is not in relation to [start..current[ - * after: [start..current+(limit-next)[ is ordered - * - * the 
algorithm is a simple bubble-sort that takes the characters from - * src[next++] and inserts them in correct combining class order into the - * preceding part of the string - * - * since this function is called much less often than the single-code point - * insertOrdered(), it just uses that for easier maintenance - * - * @return the trailing combining class - */ - private static int /*unsigned byte*/ mergeOrdered(char[] source, - int start, - int current, - char[] data, - int next, - int limit) { - int r; - int /*unsigned byte*/ cc, trailCC=0; - boolean adjacent; - - adjacent= current==next; - NextCCArgs ncArgs = new NextCCArgs(); - ncArgs.source = data; - ncArgs.next = next; - ncArgs.limit = limit; - - if(start!=current) { - - while(ncArgs.next + * If dest is a StringBuilder, then the buffer writes directly to it. + * Otherwise, the buffer maintains a StringBuilder for intermediate text segments + * until no further changes are necessary and whole segments are appended. + * append() methods that take combining-class values always write to the StringBuilder. + * Other append() methods flush and append to the Appendable. + */ + public static final class ReorderingBuffer implements Appendable { + public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { + impl=ni; + app=dest; + if (app instanceof StringBuilder) { + appIsStringBuilder=true; + str=(StringBuilder)dest; + // In Java, the constructor subsumes public void init(int destCapacity) + str.ensureCapacity(destCapacity); + reorderStart=0; + if(str.length()==0) { + lastCC=0; + } else { + setIterator(); + lastCC=previousCC(); + // Set reorderStart after the last code point with cc<=1 if there is one. 
+ if(lastCC>1) { + while(previousCC()>1) {} + } + reorderStart=codePointLimit; + } + } else { + appIsStringBuilder=false; + str=new StringBuilder(); + reorderStart=0; + lastCC=0; + } + } + + public boolean isEmpty() { return str.length()==0; } + public int length() { return str.length(); } + public int getLastCC() { return lastCC; } + + public StringBuilder getStringBuilder() { return str; } + + public boolean equals(CharSequence s, int start, int limit) { + return UTF16Plus.equal(str, 0, str.length(), s, start, limit); + } + + public void append(int c, int cc) { + if(lastCC<=cc || cc==0) { + str.appendCodePoint(c); + lastCC=cc; + if(cc<=1) { + reorderStart=str.length(); + } + } else { + insert(c, cc); + } + } + public void append(CharSequence s, int start, int limit, boolean isNFD, + int leadCC, int trailCC) { + if(start==limit) { + return; + } + if(lastCC<=leadCC || leadCC==0) { + if(trailCC<=1) { + reorderStart=str.length()+(limit-start); + } else if(leadCC<=1) { + reorderStart=str.length()+1; // Ok if not a code point boundary. 
+ } + str.append(s, start, limit); + lastCC=trailCC; + } else { + int c=Character.codePointAt(s, start); + start+=Character.charCount(c); + insert(c, leadCC); // insert first code point + while(startcc;) {} + // insert c at codePointLimit, after the character with prevCC<=cc + if(c<=0xffff) { + str.insert(codePointLimit, (char)c); + if(cc<=1) { + reorderStart=codePointLimit+1; + } + } else { + str.insert(codePointLimit, Character.toChars(c)); + if(cc<=1) { + reorderStart=codePointLimit+2; + } + } + } + + private final NormalizerImpl impl; + private final Appendable app; + private final StringBuilder str; + private final boolean appIsStringBuilder; + private int reorderStart; + private int lastCC; + + // private backward iterator + private void setIterator() { codePointStart=str.length(); } + private void skipPrevious() { // Requires 0=codePointStart) { + return 0; + } + int c=str.codePointBefore(codePointStart); + codePointStart-=Character.charCount(c); + return impl.getCCFromYesOrMaybeCP(c); + } + private int codePointStart, codePointLimit; + } + + // TODO: Propose as public API on the UTF16 class. + // TODO: Propose widening UTF16 methods that take char to take int. + // TODO: Propose widening UTF16 methods that take String to take CharSequence. + public static final class UTF16Plus { + /** + * Is this code point a lead surrogate (U+d800..U+dbff)? + * @param c code unit or code point + * @return true or false + */ + public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } + /** + * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), + * is it a lead surrogate? + * @param c code unit or code point + * @return true or false + */ + public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } + + /** + * Compares two CharSequence subsequences for binary equality. 
+ * @param s1 first sequence + * @param start1 start offset in first sequence + * @param limit1 limit offset in first sequence + * @param s2 second sequence + * @param start2 start offset in second sequence + * @param limit2 limit offset in second sequence + * @return true if s1.subSequence(start1, limit1) contains the same text + * as s2.subSequence(start2, limit2) + */ + public static boolean equal(CharSequence s1, int start1, int limit1, + CharSequence s2, int start2, int limit2) { + if((limit1-start1)!=(limit2-start2)) { + return false; + } + if(s1==s2 && start1==start2) { + return true; + } + while(start1>DELTA_SHIFT)-MAX_DELTA-1; + + // Read the normTrie. + int offset=inIndexes[IX_NORM_TRIE_OFFSET]; + int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; + int triePosition = bytes.position(); + normTrie = CodePointTrie.Fast16.fromBinary(bytes); + int trieLength = bytes.position() - triePosition; + if(trieLength>(nextOffset-offset)) { + throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); + } + ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes + + // Read the composition and mapping data. + offset=nextOffset; + nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; + int numChars=(nextOffset-offset)/2; + if(numChars!=0) { + maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); + extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); + } + + // smallFCD: new in formatVersion 2 + offset=nextOffset; + smallFCD=new byte[0x100]; + bytes.get(smallFCD); + + return this; + } catch(IOException e) { + throw new InternalError(e); + } + } + public NormalizerImpl load(String name) { + return load(ICUBinary.getRequiredData(name)); + } + + // The trie stores values for lead surrogate code *units*. + // Surrogate code *points* are inert. + public int getNorm16(int c) { + return UTF16Plus.isLeadSurrogate(c) ? 
INERT : normTrie.get(c); + } + public int getRawNorm16(int c) { return normTrie.get(c); } + public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16=MIN_NORMAL_MAYBE_YES) { + return getCCFromNormalYesOrMaybe(norm16); + } + if(norm16> OFFSET_SHIFT) & 0xff; + } + public static int getCCFromYesOrMaybe(int norm16) { + return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; + } + public int getCCFromYesOrMaybeCP(int c) { + if (c < minCompNoMaybeCP) { return 0; } + return getCCFromYesOrMaybe(getNorm16(c)); + } + + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + public int getFCD16(int c) { + if(c>8]; + if(bits==0) { return false; } + return ((bits>>((lead>>5)&7))&1)!=0; + } + + /** Gets the FCD value from the regular normalization data. */ + public int getFCD16FromNormData(int c) { + int norm16=getNorm16(c); + if (norm16 >= limitNoNo) { + if(norm16>=MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16=getCCFromNormalYesOrMaybe(norm16); + return norm16|(norm16<<8); + } else if(norm16>=minMaybeYes) { + return 0; + } else { // isDecompNoAlgorithmic(norm16) + int deltaTrailCC = norm16 & DELTA_TCCC_MASK; + if (deltaTrailCC <= DELTA_TCCC_1) { + return deltaTrailCC >> OFFSET_SHIFT; + } + // Maps to an isCompYesAndZeroCC. + c=mapAlgorithmic(c, norm16); + norm16=getRawNorm16(c); + } + } + if(norm16<=minYesNo || isHangulLVT(norm16)) { + // no decomposition or Hangul syllable, all zeros + return 0; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + int fcd16=firstUnit>>8; // tccc + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc + } + return fcd16; + } + + /** + * Gets the decomposition for one code point. 
+ * @param c code point + * @return c's decomposition, if it has one; returns null if it does not have a decomposition + */ + public String getDecomposition(int c) { + int norm16; + if(c>OFFSET_SHIFT; + int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; + return extraData.substring(mapping, mapping+length); + } + + // Fixed norm16 values. + public static final int MIN_YES_YES_WITH_CC=0xfe02; + public static final int JAMO_VT=0xfe00; + public static final int MIN_NORMAL_MAYBE_YES=0xfc00; + public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE + public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE + + // norm16 bit 0 is comp-boundary-after. + public static final int HAS_COMP_BOUNDARY_AFTER=1; + public static final int OFFSET_SHIFT=1; + + // For algorithmic one-way mappings, norm16 bits 2..1 indicate the + // tccc (0, 1, >1) for quick FCC boundary-after tests. + public static final int DELTA_TCCC_0=0; + public static final int DELTA_TCCC_1=2; + public static final int DELTA_TCCC_GT_1=4; + public static final int DELTA_TCCC_MASK=6; + public static final int DELTA_SHIFT=3; + + public static final int MAX_DELTA=0x40; + + // Byte offsets from the start of the data, after the generic header. + public static final int IX_NORM_TRIE_OFFSET=0; + public static final int IX_EXTRA_DATA_OFFSET=1; + public static final int IX_SMALL_FCD_OFFSET=2; + public static final int IX_RESERVED3_OFFSET=3; + public static final int IX_TOTAL_SIZE=7; + public static final int MIN_CCC_LCCC_CP=0x300; + // Code point thresholds for quick check codes. + public static final int IX_MIN_DECOMP_NO_CP=8; + public static final int IX_MIN_COMP_NO_MAYBE_CP=9; + + // Norm16 value thresholds for quick check combinations and types of extra data. + + /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ + public static final int IX_MIN_YES_NO=10; + /** Mappings are comp-normalized. 
*/ + public static final int IX_MIN_NO_NO=11; + public static final int IX_LIMIT_NO_NO=12; + public static final int IX_MIN_MAYBE_YES=13; + + /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ + public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; + /** Mappings are not comp-normalized but have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; + /** Mappings do not have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; + /** Mappings to the empty string. */ + public static final int IX_MIN_NO_NO_EMPTY=17; + + public static final int IX_MIN_LCCC_CP=18; + public static final int IX_COUNT=20; + + public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; + public static final int MAPPING_HAS_RAW_MAPPING=0x40; + // unused bit 0x20; + public static final int MAPPING_LENGTH_MASK=0x1f; + + public static final int COMP_1_LAST_TUPLE=0x8000; + public static final int COMP_1_TRIPLE=1; + public static final int COMP_1_TRAIL_LIMIT=0x3400; + public static final int COMP_1_TRAIL_MASK=0x7ffe; + public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit + public static final int COMP_2_TRAIL_SHIFT=6; + public static final int COMP_2_TRAIL_MASK=0xffc0; + + // higher-level functionality ------------------------------------------ *** + + /** + * Decomposes s[src, limit[ and writes the result to dest. + * limit can be NULL if src is NUL-terminated. + * destLengthEstimate is the initial dest buffer capacity and can be -1. 
+ */ + public void decompose(CharSequence s, int src, int limit, StringBuilder dest, + int destLengthEstimate) { + if(destLengthEstimate<0) { + destLengthEstimate=limit-src; + } + dest.setLength(0); + ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); + decompose(s, src, limit, buffer); + } + + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int decompose(CharSequence s, int src, int limit, + ReorderingBuffer buffer) { + int minNoCP=minDecompNoCP; + + int prevSrc; + int c=0; + int norm16=0; + + // only for quick check + int prevBoundary=src; + int prevCC=0; + + for(;;) { + // count code units below the minimum or with irrelevant data for the quick check + for(prevSrc=src; src!=limit;) { + if( (c=s.charAt(src))=limit) { + break; + } + c=Character.codePointAt(s, src); + cc=getCC(getNorm16(c)); + }; + buffer.append(s, 0, src, false, firstCC, prevCC); + buffer.append(s, src, limit); + } + + // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. + // doCompose: normalize + // !doCompose: isNormalized (buffer must be empty and initialized) + public boolean compose(CharSequence s, int src, int limit, + boolean onlyContiguous, + boolean doCompose, + ReorderingBuffer buffer) { + int prevBoundary=src; + int minNoMaybeCP=minCompNoMaybeCP; + + for (;;) { + // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, + // or with (compYes && ccc==0) properties. + int prevSrc; + int c = 0; + int norm16 = 0; + for (;;) { + if (src == limit) { + if (prevBoundary != limit && doCompose) { + buffer.append(s, prevBoundary, limit); + } + return true; + } + if( (c=s.charAt(src))=minNoNo. + // The current character is either a "noNo" (has a mapping) + // or a "maybeYes" (combines backward) + // or a "yesYes" with ccc!=0. + // It is not a Hangul syllable or Jamo L because those have "yes" properties. 
+ + // Medium-fast path: Handle cases that do not require full decomposition and recomposition. + if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes + if (!doCompose) { + return false; + } + // Fast path for mapping a character that is immediately surrounded by boundaries. + // In this case, we need not decompose around the current character. + if (isDecompNoAlgorithmic(norm16)) { + // Maps to a single isCompYesAndZeroCC character + // which also implies hasCompBoundaryBefore. + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || + hasCompBoundaryBefore(s, src, limit)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append(mapAlgorithmic(c, norm16), 0); + prevBoundary = src; + continue; + } + } else if (norm16 < minNoNoCompBoundaryBefore) { + // The mapping is comp-normalized which also implies hasCompBoundaryBefore. + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || + hasCompBoundaryBefore(s, src, limit)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + int mapping = norm16 >> OFFSET_SHIFT; + int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; + buffer.append(extraData, mapping, mapping + length); + prevBoundary = src; + continue; + } + } else if (norm16 >= minNoNoEmpty) { + // The current character maps to nothing. + // Simply omit it from the output if there is a boundary before _or_ after it. + // The character itself implies no boundaries. + if (hasCompBoundaryBefore(s, src, limit) || + hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + prevBoundary = src; + continue; + } + } + // Other "noNo" type, or need to examine more text around this character: + // Fall through to the slow path. 
+ } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { + char prev=s.charAt(prevSrc-1); + if(c= 0) { + int syllable = Hangul.HANGUL_BASE + + (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * + Hangul.JAMO_T_COUNT + t; + --prevSrc; // Replace the Jamo L as well. + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append((char)syllable); + prevBoundary = src; + continue; + } + // If we see L+V+x where x!=T then we drop to the slow path, + // decompose and recompose. + // This is to deal with NFKC finding normal L and V but a + // compatibility variant of a T. + // We need to either fully compose that combination here + // (which would complicate the code and may not work with strange custom data) + // or use the slow path. + } + } else if (Hangul.isHangulLV(prev)) { + // The current character is a Jamo Trailing consonant, + // compose with previous Hangul LV that does not contain a Jamo T. + if (!doCompose) { + return false; + } + int syllable = prev + c - Hangul.JAMO_T_BASE; + --prevSrc; // Replace the Hangul LV as well. + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append((char)syllable); + prevBoundary = src; + continue; + } + // No matching context, or may need to decompose surrounding text first: + // Fall through to the slow path. + } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC + // One or more combining marks that do not combine-back: + // Check for canonical order, copy unchanged if ok and + // if followed by a character with a boundary-before. + int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 + if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { + // Fails FCD test, need to decompose and contiguously recompose. + if (!doCompose) { + return false; + } + } else { + // If !onlyContiguous (not FCC), then we ignore the tccc of + // the previous character which passed the quick check "yes && ccc==0" test. 
+ int n16; + for (;;) { + if (src == limit) { + if (doCompose) { + buffer.append(s, prevBoundary, limit); + } + return true; + } + int prevCC = cc; + c = Character.codePointAt(s, src); + n16 = normTrie.get(c); + if (n16 >= MIN_YES_YES_WITH_CC) { + cc = getCCFromNormalYesOrMaybe(n16); + if (prevCC > cc) { + if (!doCompose) { + return false; + } + break; + } + } else { + break; + } + src += Character.charCount(c); + } + // p is after the last in-order combining mark. + // If there is a boundary here, then we continue with no change. + if (norm16HasCompBoundaryBefore(n16)) { + if (isCompYesAndZeroCC(n16)) { + src += Character.charCount(c); + } + continue; + } + // Use the slow path. There is no boundary in [prevSrc, src[. + } + } + + // Slow path: Find the nearest boundaries around the current character, + // decompose and recompose. + if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { + c = Character.codePointBefore(s, prevSrc); + norm16 = normTrie.get(c); + if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + prevSrc -= Character.charCount(c); + } + } + if (doCompose && prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + int recomposeStartIndex=buffer.length(); + // We know there is not a boundary here. + decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, + buffer); + // Decompose until the next boundary. + src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, + buffer); + recompose(buffer, recomposeStartIndex, onlyContiguous); + if(!doCompose) { + if(!buffer.equals(s, prevSrc, src)) { + return false; + } + buffer.remove(); + } + prevBoundary=src; + } + } + + /** + * Very similar to compose(): Make the same changes in both places if relevant. 
+ * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) + * !doSpan: quickCheck + * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and + * bit 0: set if "maybe"; otherwise, if the span length<s.length() + * then the quick check result is "no" + */ + public int composeQuickCheck(CharSequence s, int src, int limit, + boolean onlyContiguous, boolean doSpan) { + int qcResult=0; + int prevBoundary=src; + int minNoMaybeCP=minCompNoMaybeCP; + + for(;;) { + // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, + // or with (compYes && ccc==0) properties. + int prevSrc; + int c = 0; + int norm16 = 0; + for (;;) { + if(src==limit) { + return (src<<1)|qcResult; // "yes" or "maybe" + } + if( (c=s.charAt(src))=minNoNo. + // The current character is either a "noNo" (has a mapping) + // or a "maybeYes" (combines backward) + // or a "yesYes" with ccc!=0. + // It is not a Hangul syllable or Jamo L because those have "yes" properties. + + int prevNorm16 = INERT; + if (prevBoundary != prevSrc) { + prevBoundary = prevSrc; + if (!norm16HasCompBoundaryBefore(norm16)) { + c = Character.codePointBefore(s, prevSrc); + int n16 = getNorm16(c); + if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { + prevBoundary -= Character.charCount(c); + prevNorm16 = n16; + } + } + } + + if(isMaybeOrNonZeroCC(norm16)) { + int cc=getCCFromYesOrMaybe(norm16); + if (onlyContiguous /* FCC */ && cc != 0 && + getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { + // The [prevBoundary..prevSrc[ character + // passed the quick check "yes && ccc==0" test + // but is out of canonical order with the current combining mark. + } else { + // If !onlyContiguous (not FCC), then we ignore the tccc of + // the previous character which passed the quick check "yes && ccc==0" test. 
+ for (;;) { + if (norm16 < MIN_YES_YES_WITH_CC) { + if (!doSpan) { + qcResult = 1; + } else { + return prevBoundary << 1; // spanYes does not care to know it's "maybe" + } + } + if (src == limit) { + return (src<<1) | qcResult; // "yes" or "maybe" + } + int prevCC = cc; + c = Character.codePointAt(s, src); + norm16 = getNorm16(c); + if (isMaybeOrNonZeroCC(norm16)) { + cc = getCCFromYesOrMaybe(norm16); + if (!(prevCC <= cc || cc == 0)) { + break; + } + } else { + break; + } + src += Character.charCount(c); + } + // src is after the last in-order combining mark. + if (isCompYesAndZeroCC(norm16)) { + prevBoundary = src; + src += Character.charCount(c); + continue; + } + } + } + return prevBoundary<<1; // "no" + } + } + public void composeAndAppend(CharSequence s, + boolean doCompose, + boolean onlyContiguous, + ReorderingBuffer buffer) { + int src=0, limit=s.length(); + if(!buffer.isEmpty()) { + int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); + if(0!=firstStarterInSrc) { + int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), + buffer.length(), onlyContiguous); + StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ + firstStarterInSrc+16); + middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); + buffer.removeSuffix(buffer.length()-lastStarterInDest); + middle.append(s, 0, firstStarterInSrc); + compose(middle, 0, middle.length(), onlyContiguous, true, buffer); + src=firstStarterInSrc; + } + } + if(doCompose) { + compose(s, src, limit, onlyContiguous, true, buffer); + } else { + buffer.append(s, src, limit); + } + } + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { + // Note: In this function we use buffer->appendZeroCC() because we track + // the lead and trail combining classes here, rather than leaving it to + // the 
ReorderingBuffer. + // The exception is the call to decomposeShort() which uses the buffer + // in the normal way. + + // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. + // Similar to the prevBoundary in the compose() implementation. + int prevBoundary=src; + int prevSrc; + int c=0; + int prevFCD16=0; + int fcd16=0; + + for(;;) { + // count code units with lccc==0 + for(prevSrc=src; src!=limit;) { + if((c=s.charAt(src))1) { + --prevBoundary; + } + } + } else { + int p=src-1; + if( Character.isLowSurrogate(s.charAt(p)) && prevSrc

1) { + prevBoundary=p; + } + } + if(buffer!=null) { + // The last lccc==0 character is excluded from the + // flush-and-append call in case it needs to be modified. + buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); + buffer.append(s, prevBoundary, src); + } + // The start of the current character (c). + prevSrc=src; + } else if(src==limit) { + break; + } + + src+=Character.charCount(c); + // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. + // Check for proper order, and decompose locally if necessary. + if((prevFCD16&0xff)<=(fcd16>>8)) { + // proper order: prev tccc <= current lccc + if((fcd16&0xff)<=1) { + prevBoundary=src; + } + if(buffer!=null) { + buffer.appendZeroCC(c); + } + prevFCD16=fcd16; + continue; + } else if(buffer==null) { + return prevBoundary; // quick check "no" + } else { + /* + * Back out the part of the source that we copied or appended + * already but is now going to be decomposed. + * prevSrc is set to after what was copied/appended. + */ + buffer.removeSuffix(prevSrc-prevBoundary); + /* + * Find the part of the source that needs to be decomposed, + * up to the next safe boundary. + */ + src=findNextFCDBoundary(s, src, limit); + /* + * The source text does not fulfill the conditions for FCD. + * Decompose and reorder a limited piece of the text. 
+ */ + decomposeShort(s, prevBoundary, src, false, false, buffer); + prevBoundary=src; + prevFCD16=0; + } + } + return src; + } + + public boolean hasDecompBoundaryBefore(int c) { + return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || + norm16HasDecompBoundaryBefore(getNorm16(c)); + } + public boolean norm16HasDecompBoundaryBefore(int norm16) { + if (norm16 < minNoNoCompNoMaybeCC) { + return true; + } + if (norm16 >= limitNoNo) { + return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; + } + public boolean hasDecompBoundaryAfter(int c) { + if (c < minDecompNoCP) { + return true; + } + if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { + return true; + } + return norm16HasDecompBoundaryAfter(getNorm16(c)); + } + public boolean norm16HasDecompBoundaryAfter(int norm16) { + if(norm16 <= minYesNo || isHangulLVT(norm16)) { + return true; + } + if (norm16 >= limitNoNo) { + if (isMaybeOrNonZeroCC(norm16)) { + return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } + // Maps to an isCompYesAndZeroCC. 
+ return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + // decomp after-boundary: same as hasFCDBoundaryAfter(), + // fcd16<=1 || trailCC==0 + if(firstUnit>0x1ff) { + return false; // trailCC>1 + } + if(firstUnit<=0xff) { + return true; // trailCC==0 + } + // if(trailCC==1) test leadCC==0, same as checking for before-boundary + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; + } + public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } + + public boolean hasCompBoundaryBefore(int c) { + return c=minMaybeYes; } + private static boolean isInert(int norm16) { return norm16==INERT; } + private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } + private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } + private boolean isHangulLV(int norm16) { return norm16==minYesNo; } + private boolean isHangulLVT(int norm16) { + return norm16==hangulLVT(); + } + private boolean isCompYesAndZeroCC(int norm16) { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } + + // For use with isCompYes(). + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. + // static uint8_t getCCFromYes(uint16_t norm16) { + // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; + // } + private int getCCFromNoNo(int norm16) { + int mapping=norm16>>OFFSET_SHIFT; + if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + return extraData.charAt(mapping-1)&0xff; + } else { + return 0; + } + } + int getTrailCCFromCompYesAndZeroCC(int norm16) { + if(norm16<=minYesNo) { + return 0; // yesYes and Hangul LV have ccc=tccc=0 + } else { + // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 
+ return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo + } + } + + // Requires algorithmic-NoNo. + private int mapAlgorithmic(int c, int norm16) { + return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; + } + + // Requires minYesNo>OFFSET_SHIFT); } + + /** + * @return index into maybeYesCompositions, or -1 + */ + private int getCompositionsListForDecompYes(int norm16) { + if(norm16>OFFSET_SHIFT; + } + } + /** + * @return index into maybeYesCompositions + */ + private int getCompositionsListForComposite(int norm16) { + // A composite has both mapping & compositions list. + int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; + int firstUnit=maybeYesCompositions.charAt(list); + return list+ // mapping in maybeYesCompositions + 1+ // +1 to skip the first unit with the mapping length + (firstUnit&MAPPING_LENGTH_MASK); // + mapping length + } + + // Decompose a short piece of text which is likely to contain characters that + // fail the quick check loop and/or where the quick check loop's overhead + // is unlikely to be amortized. + // Called by the compose() and makeFCD() implementations. + // Public in Java for collation implementation code. + private int decomposeShort( + CharSequence s, int src, int limit, + boolean stopAtCompBoundary, boolean onlyContiguous, + ReorderingBuffer buffer) { + while(src= limitNoNo) { + if (isMaybeOrNonZeroCC(norm16)) { + buffer.append(c, getCCFromYesOrMaybe(norm16)); + return; + } + // Maps to an isCompYesAndZeroCC. 
+ c=mapAlgorithmic(c, norm16); + norm16=getRawNorm16(c); + } + if (norm16 < minYesNo) { + // c does not decompose + buffer.append(c, 0); + } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { + // Hangul syllable: decompose algorithmically + Hangul.decompose(c, buffer); + } else { + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + int length=firstUnit&MAPPING_LENGTH_MASK; + int leadCC, trailCC; + trailCC=firstUnit>>8; + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + leadCC=extraData.charAt(mapping-1)>>8; + } else { + leadCC=0; + } + ++mapping; // skip over the firstUnit + buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); + } + } + + /** + * Finds the recomposition result for + * a forward-combining "lead" character, + * specified with a pointer to its compositions list, + * and a backward-combining "trail" character. + * + *

<p>If the lead and trail characters combine, then this function returns + * the following "compositeAndFwd" value: + * <pre>
+     * Bits 21..1  composite character
+     * Bit      0  set if the composite is a forward-combining starter
+     * </pre>
+ * otherwise it returns -1. + * + *

<p>The compositions list has (trail, compositeAndFwd) pair entries, + * encoded as either pairs or triples of 16-bit units. + * The last entry has the high bit of its first unit set. + * + *

<p>The list is sorted by ascending trail characters (there are no duplicates). + * A linear search is used. + * + *

See normalizer2impl.h for a more detailed description + * of the compositions list format. + */ + private static int combine(String compositions, int list, int trail) { + int key1, firstUnit; + if(trail(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } + if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if((firstUnit&COMP_1_TRIPLE)!=0) { + return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); + } else { + return compositions.charAt(list+1); + } + } + } else { + // trail character is 3400..10FFFF + // result entry has 3 units + key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); + int key2=(trail<(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if(key2>(secondUnit=compositions.charAt(list+1))) { + if((firstUnit&COMP_1_LAST_TUPLE)!=0) { + break; + } else { + list+=3; + } + } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { + return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); + } else { + break; + } + } else { + break; + } + } + } + return -1; + } + + /* + * Recomposes the buffer text starting at recomposeStartIndex + * (which is in NFD - decomposed and canonically ordered), + * and truncates the buffer contents. + * + * Note that recomposition never lengthens the text: + * Any character consists of either one or two code units; + * a composition may contain at most one more code unit than the original starter, + * while the combining mark that is removed has at least one code unit. 
+ */ + private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, + boolean onlyContiguous) { + StringBuilder sb=buffer.getStringBuilder(); + int p=recomposeStartIndex; + if(p==sb.length()) { + return; + } + + int starter, pRemove; + int compositionsList; + int c, compositeAndFwd; + int norm16; + int cc, prevCC; + boolean starterIsSupplementary; + + // Some of the following variables are not used until we have a forward-combining starter + // and are only initialized now to avoid compiler warnings. + compositionsList=-1; // used as indicator for whether we have a forward-combining starter + starter=-1; + starterIsSupplementary=false; + prevCC=0; + + for(;;) { + c=sb.codePointAt(p); + p+=Character.charCount(c); + norm16=getNorm16(c); + cc=getCCFromYesOrMaybe(norm16); + if( // this character combines backward and + isMaybe(norm16) && + // we have seen a starter that combines forward and + compositionsList>=0 && + // the backward-combining character is not blocked + (prevCC=0) { + // The starter and the combining mark (c) do combine. + int composite=compositeAndFwd>>1; + + // Remove the combining mark. + pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark + sb.delete(pRemove, p); + p=pRemove; + // Replace the starter with the composite. + if(starterIsSupplementary) { + if(composite>0xffff) { + // both are supplementary + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); + } else { + sb.setCharAt(starter, (char)c); + sb.deleteCharAt(starter+1); + // The composite is shorter than the starter, + // move the intermediate characters forward one. + starterIsSupplementary=false; + --p; + } + } else if(composite>0xffff) { + // The composite is longer than the starter, + // move the intermediate characters back one. 
+ starterIsSupplementary=true; + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); + ++p; + } else { + // both are on the BMP + sb.setCharAt(starter, (char)composite); + } + + // Keep prevCC because we removed the combining mark. + + if(p==sb.length()) { + break; + } + // Is the composite a starter that combines forward? + if((compositeAndFwd&1)!=0) { + compositionsList= + getCompositionsListForComposite(getRawNorm16(composite)); + } else { + compositionsList=-1; + } + + // We combined; continue with looking for compositions. + continue; + } + } + + // no combination this time + prevCC=cc; + if(p==sb.length()) { + break; + } + + // If c did not combine, then check if it is a starter. + if(cc==0) { + // Found a new starter. + if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { + // It may combine with something, prepare for it. + if(c<=0xffff) { + starterIsSupplementary=false; + starter=p-1; + } else { + starterIsSupplementary=true; + starter=p-2; + } + } + } else if(onlyContiguous) { + // FCC: no discontiguous compositions; any intervening character blocks. + compositionsList=-1; + } + } + buffer.flush(); + } + + /** + * Does c have a composition boundary before it? + * True if its decomposition begins with a character that has + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes + * (isCompYesAndZeroCC()) so we need not decompose. 
+ */ + private boolean hasCompBoundaryBefore(int c, int norm16) { + return c> OFFSET_SHIFT) <= 0x1ff); + } + + private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { + while(p>0) { + int c=Character.codePointBefore(s, p); + int norm16 = getNorm16(c); + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + break; + } + p-=Character.charCount(c); + if(hasCompBoundaryBefore(c, norm16)) { + break; + } + } + return p; + } + private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { + while(p= 0x0009 && c <= 0x000D) || + (c >= 0x0020 && c <= 0x002F) || + (c >= 0x003A && c <= 0x0040) || + (c >= 0x005B && c <= 0x0060) || + (c >= 0x007B && c <= 0x007E); + } + + public static String canonicalDecomposeWithSingleQuotation(String string) { + Normalizer2 impl = Normalizer2.getNFDInstance(); + char[] src = string.toCharArray(); + int srcIndex = 0; + int srcLimit = src.length; + char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 + int destIndex = 0; + int destLimit = dest.length; + + int prevSrc; + String norm; + int reorderStartIndex, length; + char c1, c2; + int cp; + int minNoMaybe = 0x00c0; + int cc, prevCC, trailCC; + char[] p; + int pStart; + + // initialize + reorderStartIndex = 0; + prevCC = 0; + norm = null; + cp = 0; + pStart = 0; + + cc = trailCC = -1; // initialize to bogus value + c1 = 0; + for (;;) { + prevSrc=srcIndex; + //quick check (1)less than minNoMaybe (2)no decomp (3)hangual + while (srcIndex != srcLimit && + ((c1 = src[srcIndex]) < minNoMaybe || + (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || + (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables + prevCC = 0; + srcIndex += (cp < 0x10000) ? 
1 : 2; + } + + // copy these code units all at once + if (srcIndex != prevSrc) { + length = srcIndex - prevSrc; + if ((destIndex + length) <= destLimit) { + System.arraycopy(src,prevSrc,dest,destIndex,length); + } + + destIndex += length; + reorderStartIndex = destIndex; + } + + // end of source reached? + if (srcIndex == srcLimit) { + break; + } + + // cp already contains *src and norm32 is set for it, increment src + srcIndex += (cp < 0x10000) ? 1 : 2; + + if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + c2 = 0; + length = 1; + + if (Character.isHighSurrogate(c1) + || Character.isLowSurrogate(c1)) { + norm = null; + } + } else { + length = 2; + c2 = src[srcIndex-1]; + } + + // get the decomposition and the lead and trail cc's + if (norm == null) { + // cp does not decompose + cc = trailCC = UCharacter.getCombiningClass(cp); + p = null; + pStart = -1; + } else { + + pStart = 0; + p = norm.toCharArray(); + length = p.length; + int cpNum = norm.codePointCount(0, length); + cc= UCharacter.getCombiningClass(norm.codePointAt(0)); + trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); + if (length == 1) { + // fastpath a single code unit from decomposition + c1 = p[pStart]; + c2 = 0; + p = null; + pStart = -1; + } + } + + if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations + // buffer overflow + char[] tmpBuf = new char[destLimit * 2]; + System.arraycopy(dest, 0, tmpBuf, 0, destIndex); + dest = tmpBuf; + destLimit = dest.length; + } + + // append the decomposition to the destination buffer, assume length>0 + { + int reorderSplit = destIndex; + if (p == null) { + // fastpath: single code point + if (needSingleQuotation(c1)) { + //if we need single quotation, no need to consider "prevCC" + //and it must NOT be a supplementary pair + dest[destIndex++] = '\''; + dest[destIndex++] = c1; + dest[destIndex++] = '\''; + trailCC = 0; + } else if(cc != 0 && cc < prevCC) { + // (c1, c2) is out of order with respect to the preceding + // text + 
destIndex += length; + trailCC = insertOrdered(dest, reorderStartIndex, + reorderSplit, destIndex, c1, c2, cc); + } else { + // just append (c1, c2) + dest[destIndex++] = c1; + if(c2 != 0) { + dest[destIndex++] = c2; + } + } + } else { + // general: multiple code points (ordered by themselves) + // from decomposition + if (needSingleQuotation(p[pStart])) { + dest[destIndex++] = '\''; + dest[destIndex++] = p[pStart++]; + dest[destIndex++] = '\''; + length--; + do { + dest[destIndex++] = p[pStart++]; + } while(--length > 0); + } else if (cc != 0 && cc < prevCC) { + destIndex += length; + trailCC = mergeOrdered(dest, reorderStartIndex, + reorderSplit, p, pStart, + pStart+length); + } else { + // just append the decomposition + do { + dest[destIndex++] = p[pStart++]; + } while (--length > 0); + } + } + } + prevCC = trailCC; + if(prevCC == 0) { + reorderStartIndex = destIndex; + } + } + + return new String(dest, 0, destIndex); + } + + /** + * simpler, single-character version of mergeOrdered() - + * bubble-insert one single code point into the preceding string + * which is already canonically ordered + * (c, c2) may or may not yet have been inserted at src[current]..src[p] + * + * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 
1 : 2) + * + * before: src[start]..src[current] is already ordered, and + * src[current]..src[p] may or may not hold (c, c2) but + * must be exactly the same length as (c, c2) + * after: src[start]..src[p] is ordered + * + * @return the trailing combining class + */ + private static int/*unsigned byte*/ insertOrdered(char[] source, + int start, + int current, int p, + char c1, char c2, + int/*unsigned byte*/ cc) { + int back, preBack; + int r; + int prevCC, trailCC=cc; + + if (start=prevCC + preBack=back=current; + + PrevArgs prevArgs = new PrevArgs(); + prevArgs.current = current; + prevArgs.start = start; + prevArgs.src = source; + prevArgs.c1 = c1; + prevArgs.c2 = c2; + + // get the prevCC + prevCC=getPrevCC(prevArgs); + preBack = prevArgs.current; + + if(cc=prevCC) { + break; + } + back=preBack; + } + + // this is where we are right now with all these indicies: + // [start]..[pPreBack] 0..? code points that we can ignore + // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc + // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) + // [current]..[p] 1 code point (c, c2) with cc + + // move the code units in between up + r=p; + do { + source[--r]=source[--current]; + } while (back!=current); + } + } + + // insert (c1, c2) + source[current] = c1; + if (c2!=0) { + source[(current+1)] = c2; + } + + // we know the cc of the last code point + return trailCC; + } + /** + * merge two UTF-16 string parts together + * to canonically order (order by combining classes) their concatenation + * + * the two strings may already be adjacent, so that the merging is done + * in-place if the two strings are not adjacent, then the buffer holding the + * first one must be large enough + * the second string may or may not be ordered in itself + * + * before: [start]..[current] is already ordered, and + * [next]..[limit] may be ordered in itself, but + * is not in relation to [start..current[ + * after: [start..current+(limit-next)[ is ordered + * + * the 
algorithm is a simple bubble-sort that takes the characters from + * src[next++] and inserts them in correct combining class order into the + * preceding part of the string + * + * since this function is called much less often than the single-code point + * insertOrdered(), it just uses that for easier maintenance + * + * @return the trailing combining class + */ + private static int /*unsigned byte*/ mergeOrdered(char[] source, + int start, + int current, + char[] data, + int next, + int limit) { + int r; + int /*unsigned byte*/ cc, trailCC=0; + boolean adjacent; + + adjacent= current==next; + NextCCArgs ncArgs = new NextCCArgs(); + ncArgs.source = data; + ncArgs.next = next; + ncArgs.limit = limit; + + if(start!=current) { + + while(ncArgs.next((BASE-TMIN)*TMAX)/2; count+=BASE) { - delta/=(BASE-TMIN); - } - - return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); - } - - /** - * basicToDigit[] contains the numeric value of a basic code - * point (for use in representing integers) in the range 0 to - * BASE-1, or -1 if b is does not represent a value. 
- */ - static final int[] basicToDigit= new int[]{ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - - private static char asciiCaseMap(char b, boolean uppercase) { - if(uppercase) { - if(SMALL_A<=b && b<=SMALL_Z) { - b-=(SMALL_A-CAPITAL_A); - } - } else { - if(CAPITAL_A<=b && b<=CAPITAL_Z) { - b+=(SMALL_A-CAPITAL_A); - } - } - return b; - } - - /** - * digitToBasic() returns the basic code point whose value - * (when used for representing integers) is d, which must be in the - * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is - * nonzero, in which case the uppercase form is used. - */ - private static char digitToBasic(int digit, boolean uppercase) { - /* 0..25 map to ASCII a..z or A..Z */ - /* 26..35 map to ASCII 0..9 */ - if(digit<26) { - if(uppercase) { - return (char)(CAPITAL_A+digit); - } else { - return (char)(SMALL_A+digit); - } - } else { - return (char)((ZERO-26)+digit); - } - } - /** - * Converts Unicode to Punycode. 
- * The input string must not contain single, unpaired surrogates. - * The output will be represented as an array of ASCII code points. - * - * @param src - * @param caseFlags - * @return - * @throws ParseException - */ - public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{ - - int[] cpBuffer = new int[MAX_CP_COUNT]; - int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; - char c, c2; - int srcLength = src.length(); - int destCapacity = MAX_CP_COUNT; - char[] dest = new char[destCapacity]; - StringBuffer result = new StringBuffer(); - /* - * Handle the basic code points and - * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): - */ - srcCPCount=destLength=0; - - for(j=0; j0) { - if(destLength state to , but guard against overflow: - */ - if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { - throw new RuntimeException("Internal program error"); - } - delta+=(m-n)*(handledCPCount+1); - n=m; - - /* Encode a sequence of same code points n */ - for(j=0; jTMAX) { - t=TMAX; - } - */ - - t=k-bias; - if(t=(bias+TMAX)) { - t=TMAX; - } - - if(q0;) { - if(src.charAt(--j)==DELIMITER) { - break; - } - } - destLength=basicLength=destCPCount=j; - - while(j>0) { - b=src.charAt(--j); - if(!isBasic(b)) { - throw new ParseException("Illegal char found", -1); - } - - if(j0 ? 
basicLength+1 : 0; in=srcLength) { - throw new ParseException("Illegal char found", -1); - } - - digit=basicToDigit[(byte)src.charAt(in++)]; - if(digit<0) { - throw new ParseException("Invalid char found", -1); - } - if(digit>(0x7fffffff-i)/w) { - /* integer overflow */ - throw new ParseException("Illegal char found", -1); - } - - i+=digit*w; - t=k-bias; - if(t=(bias+TMAX)) { - t=TMAX; - } - if(digit0x7fffffff/(BASE-t)) { - /* integer overflow */ - throw new ParseException("Illegal char found", -1); - } - w*=BASE-t; - } - - /* - * Modification from sample code: - * Increments destCPCount here, - * where needed instead of in for() loop tail. - */ - ++destCPCount; - bias=adaptBias(i-oldi, destCPCount, (oldi==0)); - - /* - * i was supposed to wrap around from (incremented) destCPCount to 0, - * incrementing n each time, so we'll fix that now: - */ - if(i/destCPCount>(0x7fffffff-n)) { - /* integer overflow */ - throw new ParseException("Illegal char found", -1); - } - - n+=i/destCPCount; - i%=destCPCount; - /* not needed for Punycode: */ - /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ - - if(n>0x10ffff || isSurrogate(n)) { - /* Unicode code point overflow */ - throw new ParseException("Illegal char found", -1); - } - - /* Insert n at position i of the output: */ - cpLength=UTF16.getCharCount(n); - if((destLength+cpLength)1) { - firstSupplementaryIndex=codeUnitIndex; - } else { - ++firstSupplementaryIndex; - } - } else { - codeUnitIndex=firstSupplementaryIndex; - codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex); - } - - /* use the UChar index codeUnitIndex instead of the code point index i */ - if(codeUnitIndex((BASE-TMIN)*TMAX)/2; count+=BASE) { + delta/=(BASE-TMIN); + } + + return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); + } + + /** + * basicToDigit[] contains the numeric value of a basic code + * point (for use in representing integers) in the range 0 to + * BASE-1, or -1 if b is does not represent a 
value. + */ + static final int[] basicToDigit= new int[]{ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + + private static char asciiCaseMap(char b, boolean uppercase) { + if(uppercase) { + if(SMALL_A<=b && b<=SMALL_Z) { + b-=(SMALL_A-CAPITAL_A); + } + } else { + if(CAPITAL_A<=b && b<=CAPITAL_Z) { + b+=(SMALL_A-CAPITAL_A); + } + } + return b; + } + + /** + * digitToBasic() returns the basic code point whose value + * (when used for representing integers) is d, which must be in the + * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is + * nonzero, in which case the uppercase form is used. + */ + private static char digitToBasic(int digit, boolean uppercase) { + /* 0..25 map to ASCII a..z or A..Z */ + /* 26..35 map to ASCII 0..9 */ + if(digit<26) { + if(uppercase) { + return (char)(CAPITAL_A+digit); + } else { + return (char)(SMALL_A+digit); + } + } else { + return (char)((ZERO-26)+digit); + } + } + /** + * Converts Unicode to Punycode. 
+ * The input string must not contain single, unpaired surrogates. + * The output will be represented as an array of ASCII code points. + * + * @param src + * @param caseFlags + * @return + * @throws ParseException + */ + public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{ + + int[] cpBuffer = new int[MAX_CP_COUNT]; + int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; + char c, c2; + int srcLength = src.length(); + int destCapacity = MAX_CP_COUNT; + char[] dest = new char[destCapacity]; + StringBuffer result = new StringBuffer(); + /* + * Handle the basic code points and + * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): + */ + srcCPCount=destLength=0; + + for(j=0; j0) { + if(destLength state to , but guard against overflow: + */ + if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { + throw new RuntimeException("Internal program error"); + } + delta+=(m-n)*(handledCPCount+1); + n=m; + + /* Encode a sequence of same code points n */ + for(j=0; jTMAX) { + t=TMAX; + } + */ + + t=k-bias; + if(t=(bias+TMAX)) { + t=TMAX; + } + + if(q0;) { + if(src.charAt(--j)==DELIMITER) { + break; + } + } + destLength=basicLength=destCPCount=j; + + while(j>0) { + b=src.charAt(--j); + if(!isBasic(b)) { + throw new ParseException("Illegal char found", -1); + } + + if(j0 ? 
basicLength+1 : 0; in=srcLength) { + throw new ParseException("Illegal char found", -1); + } + + digit=basicToDigit[(byte)src.charAt(in++)]; + if(digit<0) { + throw new ParseException("Invalid char found", -1); + } + if(digit>(0x7fffffff-i)/w) { + /* integer overflow */ + throw new ParseException("Illegal char found", -1); + } + + i+=digit*w; + t=k-bias; + if(t=(bias+TMAX)) { + t=TMAX; + } + if(digit0x7fffffff/(BASE-t)) { + /* integer overflow */ + throw new ParseException("Illegal char found", -1); + } + w*=BASE-t; + } + + /* + * Modification from sample code: + * Increments destCPCount here, + * where needed instead of in for() loop tail. + */ + ++destCPCount; + bias=adaptBias(i-oldi, destCPCount, (oldi==0)); + + /* + * i was supposed to wrap around from (incremented) destCPCount to 0, + * incrementing n each time, so we'll fix that now: + */ + if(i/destCPCount>(0x7fffffff-n)) { + /* integer overflow */ + throw new ParseException("Illegal char found", -1); + } + + n+=i/destCPCount; + i%=destCPCount; + /* not needed for Punycode: */ + /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ + + if(n>0x10ffff || isSurrogate(n)) { + /* Unicode code point overflow */ + throw new ParseException("Illegal char found", -1); + } + + /* Insert n at position i of the output: */ + cpLength=UTF16.getCharCount(n); + if((destLength+cpLength)1) { + firstSupplementaryIndex=codeUnitIndex; + } else { + ++firstSupplementaryIndex; + } + } else { + codeUnitIndex=firstSupplementaryIndex; + codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex); + } + + /* use the UChar index codeUnitIndex instead of the code point index i */ + if(codeUnitIndexReplaceableobject - * @return copy of this iterator - */ - public Object clone(){ - try { - return super.clone(); - } catch (CloneNotSupportedException e) { - return null; // never invoked - } - } - - /** - * Returns the current UTF16 character. 
- * @return current UTF16 character - */ - public int current(){ - if (currentIndex < replaceable.length()) { - return replaceable.charAt(currentIndex); - } - return DONE; - } - - /** - * Returns the length of the text - * @return length of the text - */ - public int getLength(){ - return replaceable.length(); - } - - /** - * Gets the current currentIndex in text. - * @return current currentIndex in text. - */ - public int getIndex(){ - return currentIndex; - } - - /** - * Returns next UTF16 character and increments the iterator's currentIndex by 1. - * If the resulting currentIndex is greater or equal to the text length, the - * currentIndex is reset to the text length and a value of DONECODEPOINT is - * returned. - * @return next UTF16 character in text or DONE if the new currentIndex is off the - * end of the text range. - */ - public int next(){ - if (currentIndex < replaceable.length()) { - return replaceable.charAt(currentIndex++); - } - return DONE; - } - - - /** - * Returns previous UTF16 character and decrements the iterator's currentIndex by - * 1. - * If the resulting currentIndex is less than 0, the currentIndex is reset to 0 and a - * value of DONECODEPOINT is returned. - * @return next UTF16 character in text or DONE if the new currentIndex is off the - * start of the text range. - */ - public int previous(){ - if (currentIndex > 0) { - return replaceable.charAt(--currentIndex); - } - return DONE; - } - - /** - * Sets the currentIndex to the specified currentIndex in the text and returns that - * single UTF16 character at currentIndex. - * This assumes the text is stored as 16-bit code units. - * @param currentIndex the currentIndex within the text. - * @exception IllegalArgumentException is thrown if an invalid currentIndex is - * supplied. i.e. currentIndex is out of bounds. 
- */ - public void setIndex(int currentIndex) { - if (currentIndex < 0 || currentIndex > replaceable.length()) { - throw new IllegalArgumentException(); - } - this.currentIndex = currentIndex; - } - - public int getText(char[] fillIn, int offset){ - int length = replaceable.length(); - if(offset < 0 || offset + length > fillIn.length){ - throw new IndexOutOfBoundsException(Integer.toString(length)); - } - replaceable.getChars(0,length,fillIn,offset); - return length; - } - - // private data members ---------------------------------------------------- - - /** - * Replaceable object - */ - private Replaceable replaceable; - /** - * Current currentIndex - */ - private int currentIndex; - -} --- /dev/null 2020-01-10 15:57:33.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/ReplaceableUCharacterIterator.java 2020-01-10 15:57:33.000000000 -0800 @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * + * * + * The original version of this source code and documentation is copyrighted * + * and owned by IBM, These materials are provided under terms of a License * + * Agreement between IBM and Sun. This technology is protected by multiple * + * US and International patents. This notice and attribution to IBM may not * + * to removed. * + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.text.Replaceable; +import jdk.internal.icu.text.ReplaceableString; +import jdk.internal.icu.text.UCharacterIterator; + +/** + * DLF docs must define behavior when Replaceable is mutated underneath + * the iterator. + * + * This and ICUCharacterIterator share some code, maybe they should share + * an implementation, or the common state and implementation should be + * moved up into UCharacterIterator. + * + * What are first, last, and getBeginIndex doing here?!?!?! 
+ */ +public class ReplaceableUCharacterIterator extends UCharacterIterator { + + // public constructor ------------------------------------------------------ + + /** + * Public constructor + * @param str text which the iterator will be based on + */ + public ReplaceableUCharacterIterator(String str){ + if(str==null){ + throw new IllegalArgumentException(); + } + this.replaceable = new ReplaceableString(str); + this.currentIndex = 0; + } + + /** + * Public constructor + * @param buf buffer of text on which the iterator will be based + */ + public ReplaceableUCharacterIterator(StringBuffer buf){ + if(buf==null){ + throw new IllegalArgumentException(); + } + this.replaceable = new ReplaceableString(buf); + this.currentIndex = 0; + } + + // public methods ---------------------------------------------------------- + + /** + * Creates a copy of this iterator, does not clone the underlying + * Replaceableobject + * @return copy of this iterator + */ + public Object clone(){ + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + return null; // never invoked + } + } + + /** + * Returns the current UTF16 character. + * @return current UTF16 character + */ + public int current(){ + if (currentIndex < replaceable.length()) { + return replaceable.charAt(currentIndex); + } + return DONE; + } + + /** + * Returns the length of the text + * @return length of the text + */ + public int getLength(){ + return replaceable.length(); + } + + /** + * Gets the current currentIndex in text. + * @return current currentIndex in text. + */ + public int getIndex(){ + return currentIndex; + } + + /** + * Returns next UTF16 character and increments the iterator's currentIndex by 1. + * If the resulting currentIndex is greater or equal to the text length, the + * currentIndex is reset to the text length and a value of DONECODEPOINT is + * returned. + * @return next UTF16 character in text or DONE if the new currentIndex is off the + * end of the text range. 
+ */ + public int next(){ + if (currentIndex < replaceable.length()) { + return replaceable.charAt(currentIndex++); + } + return DONE; + } + + + /** + * Returns previous UTF16 character and decrements the iterator's currentIndex by + * 1. + * If the resulting currentIndex is less than 0, the currentIndex is reset to 0 and a + * value of DONECODEPOINT is returned. + * @return next UTF16 character in text or DONE if the new currentIndex is off the + * start of the text range. + */ + public int previous(){ + if (currentIndex > 0) { + return replaceable.charAt(--currentIndex); + } + return DONE; + } + + /** + * Sets the currentIndex to the specified currentIndex in the text and returns that + * single UTF16 character at currentIndex. + * This assumes the text is stored as 16-bit code units. + * @param currentIndex the currentIndex within the text. + * @exception IllegalArgumentException is thrown if an invalid currentIndex is + * supplied. i.e. currentIndex is out of bounds. + */ + public void setIndex(int currentIndex) { + if (currentIndex < 0 || currentIndex > replaceable.length()) { + throw new IllegalArgumentException(); + } + this.currentIndex = currentIndex; + } + + public int getText(char[] fillIn, int offset){ + int length = replaceable.length(); + if(offset < 0 || offset + length > fillIn.length){ + throw new IndexOutOfBoundsException(Integer.toString(length)); + } + replaceable.getChars(0,length,fillIn,offset); + return length; + } + + // private data members ---------------------------------------------------- + + /** + * Replaceable object + */ + private Replaceable replaceable; + /** + * Current currentIndex + */ + private int currentIndex; + +} --- old/src/java.base/share/classes/sun/net/idn/StringPrepDataReader.java 2020-01-10 15:57:35.000000000 -0800 +++ /dev/null 2020-01-10 15:57:35.000000000 -0800 @@ -1,127 +0,0 @@ -/* - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* -/* - ****************************************************************************** - * Copyright (C) 2003, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ****************************************************************************** - * - * Created on May 2, 2003 - * - * To change the template for this generated file go to - * Window>Preferences>Java>Code Generation>Code and Comments - */ -// CHANGELOG -// 2005-05-19 Edward Wang -// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/impl/StringPrepDataReader.java -// - move from package com.ibm.icu.impl to package sun.net.idn -// -package sun.net.idn; - -import java.io.DataInputStream; -import java.io.IOException; -import java.io.InputStream; - -import sun.text.normalizer.ICUBinary; - - -/** - * @author ram - * - * To change the template for this generated type comment go to - * Window>Preferences>Java>Code Generation>Code and Comments - */ -final class StringPrepDataReader implements ICUBinary.Authenticate { - - /** - *

private constructor.

- * @param inputStream ICU uprop.dat file input stream - * @exception IOException throw if data file fails authentication - * @draft 2.1 - */ - public StringPrepDataReader(InputStream inputStream) - throws IOException{ - - unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this); - - - dataInputStream = new DataInputStream(inputStream); - - } - - public void read(byte[] idnaBytes, - char[] mappingTable) - throws IOException{ - - //Read the bytes that make up the idnaTrie - dataInputStream.read(idnaBytes); - - //Read the extra data - for(int i=0;iPreferences>Java>Code Generation>Code and Comments + */ +// CHANGELOG +// 2005-05-19 Edward Wang +// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/impl/StringPrepDataReader.java +// - move from package com.ibm.icu.impl to package sun.net.idn +// +package jdk.internal.icu.impl; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; + +import jdk.internal.icu.impl.ICUBinary; + + +/** + * @author ram + * + * To change the template for this generated type comment go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +public final class StringPrepDataReader implements ICUBinary.Authenticate { + + /** + *

private constructor.

+ * @param inputStream ICU uprop.dat file input stream + * @exception IOException throw if data file fails authentication + * @draft 2.1 + */ + public StringPrepDataReader(InputStream inputStream) + throws IOException{ + + unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this); + + + dataInputStream = new DataInputStream(inputStream); + + } + + public void read(byte[] idnaBytes, + char[] mappingTable) + throws IOException{ + + //Read the bytes that make up the idnaTrie + dataInputStream.read(idnaBytes); + + //Read the extra data + for(int i=0;iA trie is a kind of compressed, serializable table of values - * associated with Unicode code points (0..0x10ffff).

- *

This class defines the basic structure of a trie and provides methods - * to retrieve the offsets to the actual data.

- *

Data will be the form of an array of basic types, char or int.

- *

The actual data format will have to be specified by the user in the - * inner static interface com.ibm.icu.impl.Trie.DataManipulate.

- *

This trie implementation is optimized for getting offset while walking - * forward through a UTF-16 string. - * Therefore, the simplest and fastest access macros are the - * fromLead() and fromOffsetTrail() methods. - * The fromBMP() method are a little more complicated; they get offsets even - * for lead surrogate codepoints, while the fromLead() method get special - * "folded" offsets for lead surrogate code units if there is relevant data - * associated with them. - * From such a folded offsets, an offset needs to be extracted to supply - * to the fromOffsetTrail() methods. - * To handle such supplementary codepoints, some offset information are kept - * in the data.

- *

Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve - * that offset from the folded value for the lead surrogate unit.

- *

For examples of use, see com.ibm.icu.impl.CharTrie or - * com.ibm.icu.impl.IntTrie.

- * @author synwee - * @see com.ibm.icu.impl.CharTrie - * @see com.ibm.icu.impl.IntTrie - * @since release 2.1, Jan 01 2002 - */ -public abstract class Trie -{ - // public class declaration ---------------------------------------- - - /** - * Character data in com.ibm.impl.Trie have different user-specified format - * for different purposes. - * This interface specifies methods to be implemented in order for - * com.ibm.impl.Trie, to surrogate offset information encapsulated within - * the data. - */ - public static interface DataManipulate - { - /** - * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's - * data - * the index array offset of the indexes for that lead surrogate. - * @param value data value for a surrogate from the trie, including the - * folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - public int getFoldingOffset(int value); - } - - // default implementation - private static class DefaultGetFoldingOffset implements DataManipulate { - public int getFoldingOffset(int value) { - return value; - } - } - - // protected constructor ------------------------------------------- - - /** - * Trie constructor for CharTrie use. - * @param inputStream ICU data file input stream which contains the - * trie - * @param dataManipulate object containing the information to parse the - * trie data - * @throws IOException thrown when input stream does not have the - * right header. - */ - protected Trie(InputStream inputStream, - DataManipulate dataManipulate) throws IOException - { - DataInputStream input = new DataInputStream(inputStream); - // Magic number to authenticate the data. 
- int signature = input.readInt(); - m_options_ = input.readInt(); - - if (!checkHeader(signature)) { - throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); - } - - if(dataManipulate != null) { - m_dataManipulate_ = dataManipulate; - } else { - m_dataManipulate_ = new DefaultGetFoldingOffset(); - } - m_isLatin1Linear_ = (m_options_ & - HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; - m_dataOffset_ = input.readInt(); - m_dataLength_ = input.readInt(); - unserialize(inputStream); - } - - // protected data members ------------------------------------------ - - /** - * Lead surrogate code points' index displacement in the index array. - *
{@code
-     * 0x10000-0xd800=0x2800
-     * 0x2800 >> INDEX_STAGE_1_SHIFT_
-     * }
- */ - protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; - /** - * Shift size for shifting right the input index. 1..9 - */ - protected static final int INDEX_STAGE_1_SHIFT_ = 5; - /** - * Shift size for shifting left the index array values. - * Increases possible data size with 16-bit index values at the cost - * of compactability. - * This requires blocks of stage 2 data to be aligned by - * DATA_GRANULARITY. - * 0..INDEX_STAGE_1_SHIFT - */ - protected static final int INDEX_STAGE_2_SHIFT_ = 2; - /** - * Number of data values in a stage 2 (data array) block. - */ - protected static final int DATA_BLOCK_LENGTH=1< - * getRawOffset(0, ch); - *

- * will do. Otherwise if it is a supplementary character formed by - * surrogates lead and trail. Then we would have to call getRawOffset() - * with getFoldingIndexOffset(). See getSurrogateOffset(). - * @param offset index offset which ch is to start from - * @param ch index to be used after offset - * @return offset to the data - */ - protected final int getRawOffset(int offset, char ch) - { - return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] - << INDEX_STAGE_2_SHIFT_) - + (ch & INDEX_STAGE_3_MASK_); - } - - /** - * Gets the offset to data which the BMP character points to - * Treats a lead surrogate as a normal code point. - * @param ch BMP character - * @return offset to data - */ - protected final int getBMPOffset(char ch) - { - return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE - && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) - ? getRawOffset(LEAD_INDEX_OFFSET_, ch) - : getRawOffset(0, ch); - // using a getRawOffset(ch) makes no diff - } - - /** - * Gets the offset to the data which this lead surrogate character points - * to. - * Data at the returned offset may contain folding offset information for - * the next trailing surrogate character. - * @param ch lead surrogate character - * @return offset to data - */ - protected final int getLeadOffset(char ch) - { - return getRawOffset(0, ch); - } - - /** - * Internal trie getter from a code point. - * Could be faster(?) 
but longer with - * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} - * Gets the offset to data which the codepoint points to - * @param ch codepoint - * @return offset to data - */ - protected final int getCodePointOffset(int ch) - { - // if ((ch >> 16) == 0) slower - if (ch < 0) { - return -1; - } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { - // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works - return getRawOffset(0, (char)ch); - } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { - // BMP codepoint - return getBMPOffset((char)ch); - } else if (ch <= UCharacter.MAX_VALUE) { - // look at the construction of supplementary characters - // trail forms the ends of it. - return getSurrogateOffset(UTF16.getLeadSurrogate(ch), - (char)(ch & SURROGATE_MASK_)); - } else { - // return -1 if there is an error, in this case we return - return -1; - } - } - - /** - *

Parses the inputstream and creates the trie index with it.

- *

This is overwritten by the child classes. - * @param inputStream input stream containing the trie information - * @exception IOException thrown when data reading fails. - */ - protected void unserialize(InputStream inputStream) throws IOException - { - //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ - m_index_ = new char[m_dataOffset_]; - DataInputStream input = new DataInputStream(inputStream); - for (int i = 0; i < m_dataOffset_; i ++) { - m_index_[i] = input.readChar(); - } - } - - /** - * Determines if this is a 16 bit trie - * @return true if this is a 16 bit trie - */ - protected final boolean isCharTrie() - { - return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; - } - - // private data members -------------------------------------------- - - /** - * Latin 1 option mask - */ - protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; - /** - * Constant number to authenticate the byte block - */ - protected static final int HEADER_SIGNATURE_ = 0x54726965; - /** - * Header option formatting - */ - private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; - protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; - protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; - - /** - * Flag indicator for Latin quick access data block - */ - private boolean m_isLatin1Linear_; - - /** - *

Trie options field.

- *

options bit field:
- * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH
- * 8 0 = 16-bit data, 1=32-bit data
- * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT
- * 3..0 INDEX_STAGE_2_SHIFT // 1..9
- */ - private int m_options_; - - // private methods --------------------------------------------------- - - /** - * Authenticates raw data header. - * Checking the header information, signature and options. - * @param signature This contains the options and type of a Trie - * @return true if the header is authenticated valid - */ - private final boolean checkHeader(int signature) - { - // check the signature - // Trie in big-endian US-ASCII (0x54726965). - // Magic number to authenticate the data. - if (signature != HEADER_SIGNATURE_) { - return false; - } - - if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != - INDEX_STAGE_1_SHIFT_ || - ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & - HEADER_OPTIONS_SHIFT_MASK_) - != INDEX_STAGE_2_SHIFT_) { - return false; - } - return true; - } -} --- /dev/null 2020-01-10 15:57:36.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/Trie.java 2020-01-10 15:57:36.000000000 -0800 @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ****************************************************************************** + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.text.UTF16; + +import java.io.DataInputStream; +import java.io.InputStream; +import java.io.IOException; + +/** + *

A trie is a kind of compressed, serializable table of values + * associated with Unicode code points (0..0x10ffff).

+ *

This class defines the basic structure of a trie and provides methods + * to retrieve the offsets to the actual data.

+ *

Data will be in the form of an array of basic types, char or int.

+ *

The actual data format will have to be specified by the user in the + * inner static interface com.ibm.icu.impl.Trie.DataManipulate.

+ *

This trie implementation is optimized for getting offsets while walking + * forward through a UTF-16 string. + * Therefore, the simplest and fastest access macros are the + * fromLead() and fromOffsetTrail() methods. + * The fromBMP() methods are a little more complicated; they get offsets even + * for lead surrogate codepoints, while the fromLead() methods get special + * "folded" offsets for lead surrogate code units if there is relevant data + * associated with them. + * From such a folded offset, an offset needs to be extracted to supply + * to the fromOffsetTrail() methods. + * To handle such supplementary codepoints, some offset information is kept + * in the data.

+ *

Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve + * that offset from the folded value for the lead surrogate unit.

+ *

For examples of use, see com.ibm.icu.impl.CharTrie or + * com.ibm.icu.impl.IntTrie.

+ * @author synwee + * @see com.ibm.icu.impl.CharTrie + * @see com.ibm.icu.impl.IntTrie + * @since release 2.1, Jan 01 2002 + */ +public abstract class Trie +{ + // public class declaration ---------------------------------------- + + /** + * Character data in com.ibm.impl.Trie have different user-specified format + * for different purposes. + * This interface specifies methods to be implemented in order for + * com.ibm.impl.Trie, to surrogate offset information encapsulated within + * the data. + */ + public static interface DataManipulate + { + /** + * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's + * data + * the index array offset of the indexes for that lead surrogate. + * @param value data value for a surrogate from the trie, including the + * folding offset + * @return data offset or 0 if there is no data for the lead surrogate + */ + public int getFoldingOffset(int value); + } + + // default implementation + private static class DefaultGetFoldingOffset implements DataManipulate { + public int getFoldingOffset(int value) { + return value; + } + } + + // protected constructor ------------------------------------------- + + /** + * Trie constructor for CharTrie use. + * @param inputStream ICU data file input stream which contains the + * trie + * @param dataManipulate object containing the information to parse the + * trie data + * @throws IOException thrown when input stream does not have the + * right header. + */ + protected Trie(InputStream inputStream, + DataManipulate dataManipulate) throws IOException + { + DataInputStream input = new DataInputStream(inputStream); + // Magic number to authenticate the data. 
+ int signature = input.readInt(); + m_options_ = input.readInt(); + + if (!checkHeader(signature)) { + throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); + } + + if(dataManipulate != null) { + m_dataManipulate_ = dataManipulate; + } else { + m_dataManipulate_ = new DefaultGetFoldingOffset(); + } + m_isLatin1Linear_ = (m_options_ & + HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; + m_dataOffset_ = input.readInt(); + m_dataLength_ = input.readInt(); + unserialize(inputStream); + } + + // protected data members ------------------------------------------ + + /** + * Lead surrogate code points' index displacement in the index array. + *
{@code
+     * 0x10000-0xd800=0x2800
+     * 0x2800 >> INDEX_STAGE_1_SHIFT_
+     * }
+ */ + protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; + /** + * Shift size for shifting right the input index. 1..9 + */ + protected static final int INDEX_STAGE_1_SHIFT_ = 5; + /** + * Shift size for shifting left the index array values. + * Increases possible data size with 16-bit index values at the cost + * of compactability. + * This requires blocks of stage 2 data to be aligned by + * DATA_GRANULARITY. + * 0..INDEX_STAGE_1_SHIFT + */ + protected static final int INDEX_STAGE_2_SHIFT_ = 2; + /** + * Number of data values in a stage 2 (data array) block. + */ + protected static final int DATA_BLOCK_LENGTH=1< + * getRawOffset(0, ch); + *

+ * will do. Otherwise if it is a supplementary character formed by + * surrogates lead and trail. Then we would have to call getRawOffset() + * with getFoldingIndexOffset(). See getSurrogateOffset(). + * @param offset index offset which ch is to start from + * @param ch index to be used after offset + * @return offset to the data + */ + protected final int getRawOffset(int offset, char ch) + { + return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] + << INDEX_STAGE_2_SHIFT_) + + (ch & INDEX_STAGE_3_MASK_); + } + + /** + * Gets the offset to data which the BMP character points to + * Treats a lead surrogate as a normal code point. + * @param ch BMP character + * @return offset to data + */ + protected final int getBMPOffset(char ch) + { + return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE + && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) + ? getRawOffset(LEAD_INDEX_OFFSET_, ch) + : getRawOffset(0, ch); + // using a getRawOffset(ch) makes no diff + } + + /** + * Gets the offset to the data which this lead surrogate character points + * to. + * Data at the returned offset may contain folding offset information for + * the next trailing surrogate character. + * @param ch lead surrogate character + * @return offset to data + */ + protected final int getLeadOffset(char ch) + { + return getRawOffset(0, ch); + } + + /** + * Internal trie getter from a code point. + * Could be faster(?) 
but longer with + * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} + * Gets the offset to data which the codepoint points to + * @param ch codepoint + * @return offset to data + */ + protected final int getCodePointOffset(int ch) + { + // if ((ch >> 16) == 0) slower + if (ch < 0) { + return -1; + } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { + // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works + return getRawOffset(0, (char)ch); + } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { + // BMP codepoint + return getBMPOffset((char)ch); + } else if (ch <= UCharacter.MAX_VALUE) { + // look at the construction of supplementary characters + // trail forms the ends of it. + return getSurrogateOffset(UTF16.getLeadSurrogate(ch), + (char)(ch & SURROGATE_MASK_)); + } else { + // return -1 if there is an error, in this case we return + return -1; + } + } + + /** + *

Parses the input stream and creates the trie index with it.

+ *

This is overwritten by the child classes. + * @param inputStream input stream containing the trie information + * @exception IOException thrown when data reading fails. + */ + protected void unserialize(InputStream inputStream) throws IOException + { + //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ + m_index_ = new char[m_dataOffset_]; + DataInputStream input = new DataInputStream(inputStream); + for (int i = 0; i < m_dataOffset_; i ++) { + m_index_[i] = input.readChar(); + } + } + + /** + * Determines if this is a 16 bit trie + * @return true if this is a 16 bit trie + */ + protected final boolean isCharTrie() + { + return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; + } + + // private data members -------------------------------------------- + + /** + * Latin 1 option mask + */ + protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; + /** + * Constant number to authenticate the byte block + */ + protected static final int HEADER_SIGNATURE_ = 0x54726965; + /** + * Header option formatting + */ + private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; + protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; + protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; + + /** + * Flag indicator for Latin quick access data block + */ + private boolean m_isLatin1Linear_; + + /** + *

Trie options field.

+ *

options bit field:
+ * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH
+ * 8 0 = 16-bit data, 1=32-bit data
+ * 7..4 INDEX_STAGE_2_SHIFT // 0..INDEX_STAGE_1_SHIFT
+ * 3..0 INDEX_STAGE_1_SHIFT // 1..9
+ */ + private int m_options_; + + // private methods --------------------------------------------------- + + /** + * Authenticates raw data header. + * Checking the header information, signature and options. + * @param signature This contains the options and type of a Trie + * @return true if the header is authenticated valid + */ + private final boolean checkHeader(int signature) + { + // check the signature + // Trie in big-endian US-ASCII (0x54726965). + // Magic number to authenticate the data. + if (signature != HEADER_SIGNATURE_) { + return false; + } + + if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != + INDEX_STAGE_1_SHIFT_ || + ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & + HEADER_OPTIONS_SHIFT_MASK_) + != INDEX_STAGE_2_SHIFT_) { + return false; + } + return true; + } +} --- old/src/java.base/share/classes/sun/text/normalizer/Trie2.java 2020-01-10 15:57:37.000000000 -0800 +++ /dev/null 2020-01-10 15:57:37.000000000 -0800 @@ -1,655 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). 
- * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2009-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Iterator; -import java.util.NoSuchElementException; - - -/** - * This is the interface and common implementation of a Unicode Trie2. - * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) - * to 16- or 32-bit integer values. It works best when there are ranges of - * characters with the same value, which is generally the case with Unicode - * character properties. - * - * This is the second common version of a Unicode trie (hence the name Trie2). - * - */ -abstract class Trie2 implements Iterable { - - /** - * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). - * - * Reads from the current position and leaves the buffer after the end of the trie. - * - * The serialized format is identical between ICU4C and ICU4J, so this function - * will work with serialized Trie2s from either. - * - * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending - * on the width of the data. - * - * To obtain the width of the Trie2, check the actual class type of the returned Trie2. 
- * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will - * return only Tries of their specific type/size. - * - * The serialized Trie2 on the stream may be in either little or big endian byte order. - * This allows using serialized Tries from ICU4C without needing to consider the - * byte order of the system that created them. - * - * @param bytes a byte buffer to the serialized form of a UTrie2. - * @return An unserialized Trie2, ready for use. - * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. - * @throws IOException if a read error occurs in the buffer. - * - */ - public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { - // From ICU4C utrie2_impl.h - // * Trie2 data structure in serialized form: - // * - // * UTrie2Header header; - // * uint16_t index[header.index2Length]; - // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] - // * @internal - // */ - // typedef struct UTrie2Header { - // /** "Tri2" in big-endian US-ASCII (0x54726932) */ - // uint32_t signature; - - // /** - // * options bit field: - // * 15.. 4 reserved (0) - // * 3.. 0 UTrie2ValueBits valueBits - // */ - // uint16_t options; - // - // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ - // uint16_t indexLength; - // - // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ - // uint16_t shiftedDataLength; - // - // /** Null index and data blocks, not shifted. */ - // uint16_t index2NullOffset, dataNullOffset; - // - // /** - // * First code point of the single-value range ending with U+10ffff, - // * rounded up and then shifted right by UTRIE2_SHIFT_1. 
- // */ - // uint16_t shiftedHighStart; - // } UTrie2Header; - - ByteOrder outerByteOrder = bytes.order(); - try { - UTrie2Header header = new UTrie2Header(); - - /* check the signature */ - header.signature = bytes.getInt(); - switch (header.signature) { - case 0x54726932: - // The buffer is already set to the trie data byte order. - break; - case 0x32697254: - // Temporarily reverse the byte order. - boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; - bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); - header.signature = 0x54726932; - break; - default: - throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); - } - - header.options = bytes.getChar(); - header.indexLength = bytes.getChar(); - header.shiftedDataLength = bytes.getChar(); - header.index2NullOffset = bytes.getChar(); - header.dataNullOffset = bytes.getChar(); - header.shiftedHighStart = bytes.getChar(); - - if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) { - throw new IllegalArgumentException("UTrie2 serialized format error."); - } - - Trie2 This; - This = new Trie2_16(); - This.header = header; - - /* get the length values and offsets */ - This.indexLength = header.indexLength; - This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; - This.index2NullOffset = header.index2NullOffset; - This.dataNullOffset = header.dataNullOffset; - This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; - This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; - This.highValueIndex += This.indexLength; - - // Allocate the Trie2 index array. If the data width is 16 bits, the array also - // includes the space for the data. 
- - int indexArraySize = This.indexLength; - indexArraySize += This.dataLength; - This.index = new char[indexArraySize]; - - /* Read in the index */ - int i; - for (i=0; i iterator() { - return iterator(defaultValueMapper); - } - - private static ValueMapper defaultValueMapper = new ValueMapper() { - public int map(int in) { - return in; - } - }; - - /** - * Create an iterator over the value ranges from this Trie2. - * Values from the Trie2 are passed through a caller-supplied remapping function, - * and it is the remapped values that determine the ranges that - * will be produced by the iterator. - * - * - * @param mapper provides a function to remap values obtained from the Trie2. - * @return an Iterator - */ - public Iterator iterator(ValueMapper mapper) { - return new Trie2Iterator(mapper); - } - - /** - * When iterating over the contents of a Trie2, an instance of TrieValueMapper may - * be used to remap the values from the Trie2. The remapped values will be used - * both in determining the ranges of codepoints and as the value to be returned - * for each range. - * - * Example of use, with an anonymous subclass of TrieValueMapper: - * - * - * ValueMapper m = new ValueMapper() { - * int map(int in) {return in & 0x1f;}; - * } - * for (Iterator iter = trie.iterator(m); i.hasNext(); ) { - * Trie2EnumRange r = i.next(); - * ... // Do something with the range r. - * } - * - */ - public interface ValueMapper { - public int map(int originalVal); - } - - //-------------------------------------------------------------------------------- - // - // Below this point are internal implementation items. No further public API. - // - //-------------------------------------------------------------------------------- - - /** - * Trie2 data structure in serialized form: - * - * UTrie2Header header; - * uint16_t index[header.index2Length]; - * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] 
- * - * For Java, this is read from the stream into an instance of UTrie2Header. - * (The C version just places a struct over the raw serialized data.) - * - * @internal - */ - static class UTrie2Header { - /** "Tri2" in big-endian US-ASCII (0x54726932) */ - int signature; - - /** - * options bit field (uint16_t): - * 15.. 4 reserved (0) - * 3.. 0 UTrie2ValueBits valueBits - */ - int options; - - /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ - int indexLength; - - /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ - int shiftedDataLength; - - /** Null index and data blocks, not shifted. (uint16_t) */ - int index2NullOffset, dataNullOffset; - - /** - * First code point of the single-value range ending with U+10ffff, - * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) - */ - int shiftedHighStart; - } - - // - // Data members of UTrie2. - // - UTrie2Header header; - char index[]; // Index array. Includes data for 16 bit Tries. - int data16; // Offset to data portion of the index array, if 16 bit data. - // zero if 32 bit data. - int data32[]; // NULL if 16b data is used via index - - int indexLength; - int dataLength; - int index2NullOffset; // 0xffff if there is no dedicated index-2 null block - int initialValue; - - /** Value returned for out-of-range code points and illegal UTF-8. */ - int errorValue; - - /* Start of the last range which ends at U+10ffff, and its value. */ - int highStart; - int highValueIndex; - - int dataNullOffset; - - /** - * Trie2 constants, defining shift widths, index array lengths, etc. - * - * These are needed for the runtime macros but users can treat these as - * implementation details and skip to the actual public API further below. - */ - - static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; - - - /** Shift size for getting the index-1 table offset. */ - static final int UTRIE2_SHIFT_1=6+5; - - /** Shift size for getting the index-2 table offset. 
*/ - static final int UTRIE2_SHIFT_2=5; - - /** - * Difference between the two shift sizes, - * for getting an index-1 offset from an index-2 offset. 6=11-5 - */ - static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; - - /** - * Number of index-1 entries for the BMP. 32=0x20 - * This part of the index-1 table is omitted from the serialized form. - */ - static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; - - /** Number of entries in an index-2 block. 64=0x40 */ - static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) - */ - static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; - static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; - - /** Count the lengths of both BMP pieces. 2080=0x820 */ - static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; - - /** - * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. - * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. - */ - static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; - static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ - - /** - * The index-1 table, only used for supplementary code points, at offset 2112=0x840. - * Variable length, for code points up to highStart, where the last single-value range starts. - * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. - * (For 0x100000 supplementary code points U+10000..U+10ffff.) - * - * The part of the index-2 table for supplementary code points starts - * after this index-1 table. - * - * Both the index-1 table and the following part of the index-2 table - * are omitted completely if there is only BMP data. 
- */ - static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; - - /** - * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. - * Used with linear access for single bytes 0..0xbf for simple error handling. - * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. - */ - static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; - - /** - * Implementation class for an iterator over a Trie2. - * - * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, - * then returns the special alternate values for the lead surrogates - * - * @internal - */ - class Trie2Iterator implements Iterator { - - // The normal constructor that configures the iterator to cover the complete - // contents of the Trie2 - Trie2Iterator(ValueMapper vm) { - mapper = vm; - nextStart = 0; - limitCP = 0x110000; - doLeadSurrogates = true; - } - - /** - * The main next() function for Trie2 iterators - * - */ - public Range next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - if (nextStart >= limitCP) { - // Switch over from iterating normal code point values to - // doing the alternate lead-surrogate values. - doingCodePoints = false; - nextStart = 0xd800; - } - int endOfRange = 0; - int val = 0; - int mappedVal = 0; - - if (doingCodePoints) { - // Iteration over code point values. - val = get(nextStart); - mappedVal = mapper.map(val); - endOfRange = rangeEnd(nextStart, limitCP, val); - // Loop once for each range in the Trie2 with the same raw (unmapped) value. - // Loop continues so long as the mapped values are the same. - for (;;) { - if (endOfRange >= limitCP-1) { - break; - } - val = get(endOfRange+1); - if (mapper.map(val) != mappedVal) { - break; - } - endOfRange = rangeEnd(endOfRange+1, limitCP, val); - } - } else { - // Iteration over the alternate lead surrogate values. 
- val = getFromU16SingleLead((char)nextStart); - mappedVal = mapper.map(val); - endOfRange = rangeEndLS((char)nextStart); - // Loop once for each range in the Trie2 with the same raw (unmapped) value. - // Loop continues so long as the mapped values are the same. - for (;;) { - if (endOfRange >= 0xdbff) { - break; - } - val = getFromU16SingleLead((char)(endOfRange+1)); - if (mapper.map(val) != mappedVal) { - break; - } - endOfRange = rangeEndLS((char)(endOfRange+1)); - } - } - returnValue.startCodePoint = nextStart; - returnValue.endCodePoint = endOfRange; - returnValue.value = mappedVal; - returnValue.leadSurrogate = !doingCodePoints; - nextStart = endOfRange+1; - return returnValue; - } - - /** - * - */ - public boolean hasNext() { - return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; - } - - private int rangeEndLS(char startingLS) { - if (startingLS >= 0xdbff) { - return 0xdbff; - } - - int c; - int val = getFromU16SingleLead(startingLS); - for (c = startingLS+1; c <= 0x0dbff; c++) { - if (getFromU16SingleLead((char)c) != val) { - break; - } - } - return c-1; - } - - // - // Iteration State Variables - // - private ValueMapper mapper; - private Range returnValue = new Range(); - // The starting code point for the next range to be returned. - private int nextStart; - // The upper limit for the last normal range to be returned. Normally 0x110000, but - // may be lower when iterating over the code points for a single lead surrogate. - private int limitCP; - - // True while iterating over the Trie2 values for code points. - // False while iterating over the alternate values for lead surrogates. - private boolean doingCodePoints = true; - - // True if the iterator should iterate the special values for lead surrogates in - // addition to the normal values for code points. 
- private boolean doLeadSurrogates = true; - } - - /** - * Find the last character in a contiguous range of characters with the - * same Trie2 value as the input character. - * - * @param c The character to begin with. - * @return The last contiguous character with the same value. - */ - int rangeEnd(int start, int limitp, int val) { - int c; - int limit = Math.min(highStart, limitp); - - for (c = start+1; c < limit; c++) { - if (get(c) != val) { - break; - } - } - if (c >= highStart) { - c = limitp; - } - return c - 1; - } - - - // - // Hashing implementation functions. FNV hash. Respected public domain algorithm. - // - private static int initHash() { - return 0x811c9DC5; // unsigned 2166136261 - } - - private static int hashByte(int h, int b) { - h = h * 16777619; - h = h ^ b; - return h; - } - - private static int hashUChar32(int h, int c) { - h = Trie2.hashByte(h, c & 255); - h = Trie2.hashByte(h, (c>>8) & 255); - h = Trie2.hashByte(h, c>>16); - return h; - } - - private static int hashInt(int h, int i) { - h = Trie2.hashByte(h, i & 255); - h = Trie2.hashByte(h, (i>>8) & 255); - h = Trie2.hashByte(h, (i>>16) & 255); - h = Trie2.hashByte(h, (i>>24) & 255); - return h; - } - -} --- /dev/null 2020-01-10 15:57:37.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/Trie2.java 2020-01-10 15:57:37.000000000 -0800 @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * This is the interface and common implementation of a Unicode Trie2. + * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) + * to 16- or 32-bit integer values. It works best when there are ranges of + * characters with the same value, which is generally the case with Unicode + * character properties. + * + * This is the second common version of a Unicode trie (hence the name Trie2). + * + */ +abstract class Trie2 implements Iterable { + + /** + * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). + * + * Reads from the current position and leaves the buffer after the end of the trie. + * + * The serialized format is identical between ICU4C and ICU4J, so this function + * will work with serialized Trie2s from either. 
+ * + * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending + * on the width of the data. + * + * To obtain the width of the Trie2, check the actual class type of the returned Trie2. + * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will + * return only Tries of their specific type/size. + * + * The serialized Trie2 on the stream may be in either little or big endian byte order. + * This allows using serialized Tries from ICU4C without needing to consider the + * byte order of the system that created them. + * + * @param bytes a byte buffer to the serialized form of a UTrie2. + * @return An unserialized Trie2, ready for use. + * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. + * @throws IOException if a read error occurs in the buffer. + * + */ + public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { + // From ICU4C utrie2_impl.h + // * Trie2 data structure in serialized form: + // * + // * UTrie2Header header; + // * uint16_t index[header.index2Length]; + // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] + // * @internal + // */ + // typedef struct UTrie2Header { + // /** "Tri2" in big-endian US-ASCII (0x54726932) */ + // uint32_t signature; + + // /** + // * options bit field: + // * 15.. 4 reserved (0) + // * 3.. 0 UTrie2ValueBits valueBits + // */ + // uint16_t options; + // + // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ + // uint16_t indexLength; + // + // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ + // uint16_t shiftedDataLength; + // + // /** Null index and data blocks, not shifted. */ + // uint16_t index2NullOffset, dataNullOffset; + // + // /** + // * First code point of the single-value range ending with U+10ffff, + // * rounded up and then shifted right by UTRIE2_SHIFT_1. 
+ // */ + // uint16_t shiftedHighStart; + // } UTrie2Header; + + ByteOrder outerByteOrder = bytes.order(); + try { + UTrie2Header header = new UTrie2Header(); + + /* check the signature */ + header.signature = bytes.getInt(); + switch (header.signature) { + case 0x54726932: + // The buffer is already set to the trie data byte order. + break; + case 0x32697254: + // Temporarily reverse the byte order. + boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; + bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); + header.signature = 0x54726932; + break; + default: + throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); + } + + header.options = bytes.getChar(); + header.indexLength = bytes.getChar(); + header.shiftedDataLength = bytes.getChar(); + header.index2NullOffset = bytes.getChar(); + header.dataNullOffset = bytes.getChar(); + header.shiftedHighStart = bytes.getChar(); + + if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) { + throw new IllegalArgumentException("UTrie2 serialized format error."); + } + + Trie2 This; + This = new Trie2_16(); + This.header = header; + + /* get the length values and offsets */ + This.indexLength = header.indexLength; + This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; + This.index2NullOffset = header.index2NullOffset; + This.dataNullOffset = header.dataNullOffset; + This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; + This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; + This.highValueIndex += This.indexLength; + + // Allocate the Trie2 index array. If the data width is 16 bits, the array also + // includes the space for the data. 
+ + int indexArraySize = This.indexLength; + indexArraySize += This.dataLength; + This.index = new char[indexArraySize]; + + /* Read in the index */ + int i; + for (i=0; i iterator() { + return iterator(defaultValueMapper); + } + + private static ValueMapper defaultValueMapper = new ValueMapper() { + public int map(int in) { + return in; + } + }; + + /** + * Create an iterator over the value ranges from this Trie2. + * Values from the Trie2 are passed through a caller-supplied remapping function, + * and it is the remapped values that determine the ranges that + * will be produced by the iterator. + * + * + * @param mapper provides a function to remap values obtained from the Trie2. + * @return an Iterator + */ + public Iterator iterator(ValueMapper mapper) { + return new Trie2Iterator(mapper); + } + + /** + * When iterating over the contents of a Trie2, an instance of TrieValueMapper may + * be used to remap the values from the Trie2. The remapped values will be used + * both in determining the ranges of codepoints and as the value to be returned + * for each range. + * + * Example of use, with an anonymous subclass of TrieValueMapper: + * + * + * ValueMapper m = new ValueMapper() { + * int map(int in) {return in & 0x1f;}; + * } + * for (Iterator iter = trie.iterator(m); i.hasNext(); ) { + * Trie2EnumRange r = i.next(); + * ... // Do something with the range r. + * } + * + */ + public interface ValueMapper { + public int map(int originalVal); + } + + //-------------------------------------------------------------------------------- + // + // Below this point are internal implementation items. No further public API. + // + //-------------------------------------------------------------------------------- + + /** + * Trie2 data structure in serialized form: + * + * UTrie2Header header; + * uint16_t index[header.index2Length]; + * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] 
+ * + * For Java, this is read from the stream into an instance of UTrie2Header. + * (The C version just places a struct over the raw serialized data.) + * + * @internal + */ + static class UTrie2Header { + /** "Tri2" in big-endian US-ASCII (0x54726932) */ + int signature; + + /** + * options bit field (uint16_t): + * 15.. 4 reserved (0) + * 3.. 0 UTrie2ValueBits valueBits + */ + int options; + + /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ + int indexLength; + + /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ + int shiftedDataLength; + + /** Null index and data blocks, not shifted. (uint16_t) */ + int index2NullOffset, dataNullOffset; + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) + */ + int shiftedHighStart; + } + + // + // Data members of UTrie2. + // + UTrie2Header header; + char index[]; // Index array. Includes data for 16 bit Tries. + int data16; // Offset to data portion of the index array, if 16 bit data. + // zero if 32 bit data. + int data32[]; // NULL if 16b data is used via index + + int indexLength; + int dataLength; + int index2NullOffset; // 0xffff if there is no dedicated index-2 null block + int initialValue; + + /** Value returned for out-of-range code points and illegal UTF-8. */ + int errorValue; + + /* Start of the last range which ends at U+10ffff, and its value. */ + int highStart; + int highValueIndex; + + int dataNullOffset; + + /** + * Trie2 constants, defining shift widths, index array lengths, etc. + * + * These are needed for the runtime macros but users can treat these as + * implementation details and skip to the actual public API further below. + */ + + static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; + + + /** Shift size for getting the index-1 table offset. */ + static final int UTRIE2_SHIFT_1=6+5; + + /** Shift size for getting the index-2 table offset. 
*/ + static final int UTRIE2_SHIFT_2=5; + + /** + * Difference between the two shift sizes, + * for getting an index-1 offset from an index-2 offset. 6=11-5 + */ + static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; + + /** + * Number of index-1 entries for the BMP. 32=0x20 + * This part of the index-1 table is omitted from the serialized form. + */ + static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; + + /** Number of entries in an index-2 block. 64=0x40 */ + static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) + */ + static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; + static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; + + /** Count the lengths of both BMP pieces. 2080=0x820 */ + static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; + + /** + * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. + * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. + */ + static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; + static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ + + /** + * The index-1 table, only used for supplementary code points, at offset 2112=0x840. + * Variable length, for code points up to highStart, where the last single-value range starts. + * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. + * (For 0x100000 supplementary code points U+10000..U+10ffff.) + * + * The part of the index-2 table for supplementary code points starts + * after this index-1 table. + * + * Both the index-1 table and the following part of the index-2 table + * are omitted completely if there is only BMP data. 
+ */ + static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; + + /** + * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. + * Used with linear access for single bytes 0..0xbf for simple error handling. + * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. + */ + static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; + + /** + * Implementation class for an iterator over a Trie2. + * + * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, + * then returns the special alternate values for the lead surrogates + * + * @internal + */ + class Trie2Iterator implements Iterator { + + // The normal constructor that configures the iterator to cover the complete + // contents of the Trie2 + Trie2Iterator(ValueMapper vm) { + mapper = vm; + nextStart = 0; + limitCP = 0x110000; + doLeadSurrogates = true; + } + + /** + * The main next() function for Trie2 iterators + * + */ + public Range next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + if (nextStart >= limitCP) { + // Switch over from iterating normal code point values to + // doing the alternate lead-surrogate values. + doingCodePoints = false; + nextStart = 0xd800; + } + int endOfRange = 0; + int val = 0; + int mappedVal = 0; + + if (doingCodePoints) { + // Iteration over code point values. + val = get(nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEnd(nextStart, limitCP, val); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. + for (;;) { + if (endOfRange >= limitCP-1) { + break; + } + val = get(endOfRange+1); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEnd(endOfRange+1, limitCP, val); + } + } else { + // Iteration over the alternate lead surrogate values. 
+ val = getFromU16SingleLead((char)nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEndLS((char)nextStart); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. + for (;;) { + if (endOfRange >= 0xdbff) { + break; + } + val = getFromU16SingleLead((char)(endOfRange+1)); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEndLS((char)(endOfRange+1)); + } + } + returnValue.startCodePoint = nextStart; + returnValue.endCodePoint = endOfRange; + returnValue.value = mappedVal; + returnValue.leadSurrogate = !doingCodePoints; + nextStart = endOfRange+1; + return returnValue; + } + + /** + * + */ + public boolean hasNext() { + return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; + } + + private int rangeEndLS(char startingLS) { + if (startingLS >= 0xdbff) { + return 0xdbff; + } + + int c; + int val = getFromU16SingleLead(startingLS); + for (c = startingLS+1; c <= 0x0dbff; c++) { + if (getFromU16SingleLead((char)c) != val) { + break; + } + } + return c-1; + } + + // + // Iteration State Variables + // + private ValueMapper mapper; + private Range returnValue = new Range(); + // The starting code point for the next range to be returned. + private int nextStart; + // The upper limit for the last normal range to be returned. Normally 0x110000, but + // may be lower when iterating over the code points for a single lead surrogate. + private int limitCP; + + // True while iterating over the Trie2 values for code points. + // False while iterating over the alternate values for lead surrogates. + private boolean doingCodePoints = true; + + // True if the iterator should iterate the special values for lead surrogates in + // addition to the normal values for code points. 
+ private boolean doLeadSurrogates = true; + } + + /** + * Find the last character in a contiguous range of characters with the + * same Trie2 value as the input character. + * + * @param c The character to begin with. + * @return The last contiguous character with the same value. + */ + int rangeEnd(int start, int limitp, int val) { + int c; + int limit = Math.min(highStart, limitp); + + for (c = start+1; c < limit; c++) { + if (get(c) != val) { + break; + } + } + if (c >= highStart) { + c = limitp; + } + return c - 1; + } + + + // + // Hashing implementation functions. FNV hash. Respected public domain algorithm. + // + private static int initHash() { + return 0x811c9DC5; // unsigned 2166136261 + } + + private static int hashByte(int h, int b) { + h = h * 16777619; + h = h ^ b; + return h; + } + + private static int hashUChar32(int h, int c) { + h = Trie2.hashByte(h, c & 255); + h = Trie2.hashByte(h, (c>>8) & 255); + h = Trie2.hashByte(h, c>>16); + return h; + } + + private static int hashInt(int h, int i) { + h = Trie2.hashByte(h, i & 255); + h = Trie2.hashByte(h, (i>>8) & 255); + h = Trie2.hashByte(h, (i>>16) & 255); + h = Trie2.hashByte(h, (i>>24) & 255); + return h; + } + +} --- old/src/java.base/share/classes/sun/text/normalizer/Trie2_16.java 2020-01-10 15:57:39.000000000 -0800 +++ /dev/null 2020-01-10 15:57:39.000000000 -0800 @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. 
- * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2009-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.IOException; -import java.nio.ByteBuffer; - - -/** - * @author aheninger - * - * A read-only Trie2, holding 16 bit data values. - * - * A Trie2 is a highly optimized data structure for mapping from Unicode - * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value. - * - * See class Trie2 for descriptions of the API for accessing the contents of a trie. - * - * The fundamental data access methods are declared final in this class, with - * the intent that applications might gain a little extra performance, when compared - * with calling the same methods via the abstract UTrie2 base class. - */ -public final class Trie2_16 extends Trie2 { - - /** - * Internal constructor, not for general use. - */ - Trie2_16() { - } - - - /** - * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). 
- * The serialized format is identical between ICU4C and ICU4J, so this function - * will work with serialized Trie2s from either. - * - * The serialized Trie2 in the bytes may be in either little or big endian byte order. - * This allows using serialized Tries from ICU4C without needing to consider the - * byte order of the system that created them. - * - * @param bytes a byte buffer to the serialized form of a UTrie2. - * @return An unserialized Trie2_16, ready for use. - * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2. - * @throws IOException if a read error occurs in the buffer. - * @throws ClassCastException if the bytes contain a serialized Trie2_32 - */ - public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException { - return (Trie2_16) Trie2.createFromSerialized(bytes); - } - - /** - * Get the value for a code point as stored in the Trie2. - * - * @param codePoint the code point - * @return the value - */ - @Override - public final int get(int codePoint) { - int value; - int ix; - - if (codePoint >= 0) { - if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) { - // Ordinary BMP code point, excluding leading surrogates. - // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index. - // 16 bit data is stored in the index array itself. - ix = index[codePoint >> UTRIE2_SHIFT_2]; - ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); - value = index[ix]; - return value; - } - if (codePoint <= 0xffff) { - // Lead Surrogate Code Point. A Separate index section is stored for - // lead surrogate code units and code points. - // The main index has the code unit data. - // For this function, we need the code point data. - // Note: this expression could be refactored for slightly improved efficiency, but - // surrogate code points will be so rare in practice that it's not worth it. 
- ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)]; - ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); - value = index[ix]; - return value; - } - if (codePoint < highStart) { - // Supplemental code point, use two-level lookup. - ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1); - ix = index[ix]; - ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK; - ix = index[ix]; - ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); - value = index[ix]; - return value; - } - if (codePoint <= 0x10ffff) { - value = index[highValueIndex]; - return value; - } - } - - // Fall through. The code point is outside of the legal range of 0..0x10ffff. - return errorValue; - } - - - /** - * Get a Trie2 value for a UTF-16 code unit. - * - * This function returns the same value as get() if the input - * character is outside of the lead surrogate range - * - * There are two values stored in a Trie2 for inputs in the lead - * surrogate range. This function returns the alternate value, - * while Trie2.get() returns the main value. - * - * @param codeUnit a 16 bit code unit or lead surrogate value. - * @return the value - */ - @Override - public int getFromU16SingleLead(char codeUnit) { - int value; - int ix; - - // Because the input is a 16 bit char, we can skip the tests for it being in - // the BMP range. It is. - ix = index[codeUnit >> UTRIE2_SHIFT_2]; - ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK); - value = index[ix]; - return value; - } - - /** - * @return the number of bytes of the serialized trie - */ - public int getSerializedLength() { - return 16+(header.indexLength+dataLength)*2; - } -} --- /dev/null 2020-01-10 15:57:39.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/Trie2_16.java 2020-01-10 15:57:38.000000000 -0800 @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; + + +/** + * @author aheninger + * + * A read-only Trie2, holding 16 bit data values. + * + * A Trie2 is a highly optimized data structure for mapping from Unicode + * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value. + * + * See class Trie2 for descriptions of the API for accessing the contents of a trie. 
+ * + * The fundamental data access methods are declared final in this class, with + * the intent that applications might gain a little extra performance, when compared + * with calling the same methods via the abstract UTrie2 base class. + */ +public final class Trie2_16 extends Trie2 { + + /** + * Internal constructor, not for general use. + */ + Trie2_16() { + } + + + /** + * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). + * The serialized format is identical between ICU4C and ICU4J, so this function + * will work with serialized Trie2s from either. + * + * The serialized Trie2 in the bytes may be in either little or big endian byte order. + * This allows using serialized Tries from ICU4C without needing to consider the + * byte order of the system that created them. + * + * @param bytes a byte buffer to the serialized form of a UTrie2. + * @return An unserialized Trie2_16, ready for use. + * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2. + * @throws IOException if a read error occurs in the buffer. + * @throws ClassCastException if the bytes contain a serialized Trie2_32 + */ + public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException { + return (Trie2_16) Trie2.createFromSerialized(bytes); + } + + /** + * Get the value for a code point as stored in the Trie2. + * + * @param codePoint the code point + * @return the value + */ + @Override + public final int get(int codePoint) { + int value; + int ix; + + if (codePoint >= 0) { + if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) { + // Ordinary BMP code point, excluding leading surrogates. + // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index. + // 16 bit data is stored in the index array itself. 
+ ix = index[codePoint >> UTRIE2_SHIFT_2]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + if (codePoint <= 0xffff) { + // Lead Surrogate Code Point. A Separate index section is stored for + // lead surrogate code units and code points. + // The main index has the code unit data. + // For this function, we need the code point data. + // Note: this expression could be refactored for slightly improved efficiency, but + // surrogate code points will be so rare in practice that it's not worth it. + ix = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + if (codePoint < highStart) { + // Supplemental code point, use two-level lookup. + ix = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH) + (codePoint >> UTRIE2_SHIFT_1); + ix = index[ix]; + ix += (codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK; + ix = index[ix]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + if (codePoint <= 0x10ffff) { + value = index[highValueIndex]; + return value; + } + } + + // Fall through. The code point is outside of the legal range of 0..0x10ffff. + return errorValue; + } + + + /** + * Get a Trie2 value for a UTF-16 code unit. + * + * This function returns the same value as get() if the input + * character is outside of the lead surrogate range + * + * There are two values stored in a Trie2 for inputs in the lead + * surrogate range. This function returns the alternate value, + * while Trie2.get() returns the main value. + * + * @param codeUnit a 16 bit code unit or lead surrogate value. + * @return the value + */ + @Override + public int getFromU16SingleLead(char codeUnit) { + int value; + int ix; + + // Because the input is a 16 bit char, we can skip the tests for it being in + // the BMP range. It is. 
+ ix = index[codeUnit >> UTRIE2_SHIFT_2]; + ix = (ix << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK); + value = index[ix]; + return value; + } + + /** + * @return the number of bytes of the serialized trie + */ + public int getSerializedLength() { + return 16+(header.indexLength+dataLength)*2; + } +} --- old/src/java.base/share/classes/sun/text/normalizer/UBiDiProps.java 2020-01-10 15:57:40.000000000 -0800 +++ /dev/null 2020-01-10 15:57:40.000000000 -0800 @@ -1,267 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * - * Copyright (C) 2004-2014, International Business Machines - * Corporation and others. All Rights Reserved. 
- * - ******************************************************************************* - * file name: UBiDiProps.java - * encoding: US-ASCII - * tab size: 8 (not used) - * indentation:4 - * - * created on: 2005jan16 - * created by: Markus W. Scherer - * - * Low-level Unicode bidi/shaping properties access. - * Java port of ubidi_props.h/.c. - */ - -package sun.text.normalizer; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.MissingResourceException; - -public final class UBiDiProps { - // constructors etc. --------------------------------------------------- *** - - // port of ubidi_openProps() - private UBiDiProps() throws IOException{ - ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME); - readData(bytes); - } - - private void readData(ByteBuffer bytes) throws IOException { - // read the header - ICUBinary.readHeader(bytes, FMT, new IsAcceptable()); - - // read indexes[] - int i, count; - count=bytes.getInt(); - if(countexpectedTrieLength) { - throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie"); - } - // skip padding after trie bytes - ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength); - - // read mirrors[] - count=indexes[IX_MIRROR_LENGTH]; - if(count>0) { - mirrors=new int[count]; - for(i=0; i>JT_SHIFT; - } - - public final int getJoiningGroup(int c) { - int start, limit; - - start=indexes[IX_JG_START]; - limit=indexes[IX_JG_LIMIT]; - if(start<=c && c>BPT_SHIFT; - } - - public final int getPairedBracket(int c) { - int props=trie.get(c); - if((props&BPT_MASK)==0) { - return c; - } else { - return getMirror(c, props); - } - } - - // data members -------------------------------------------------------- *** - private int indexes[]; - private int mirrors[]; - private byte jgArray[]; - private byte jgArray2[]; - - private Trie2_16 trie; - - // data format constants ----------------------------------------------- *** - private static final String DATA_FILE_NAME = "/sun/text/resources/ubidi.icu"; - - /* 
format "BiDi" */ - private static final int FMT=0x42694469; - - /* indexes into indexes[] */ - private static final int IX_TRIE_SIZE=2; - private static final int IX_MIRROR_LENGTH=3; - - private static final int IX_JG_START=4; - private static final int IX_JG_LIMIT=5; - private static final int IX_JG_START2=6; /* new in format version 2.2, ICU 54 */ - private static final int IX_JG_LIMIT2=7; - - private static final int IX_TOP=16; - - // definitions for 16-bit bidi/shaping properties word ----------------- *** - - /* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */ - private static final int JT_SHIFT=5; /* joining type: 3 bits (7..5) */ - - private static final int BPT_SHIFT=8; /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */ - - private static final int MIRROR_DELTA_SHIFT=13; /* bidi mirroring delta: 3 bits (15..13) */ - - private static final int CLASS_MASK= 0x0000001f; - private static final int JT_MASK= 0x000000e0; - private static final int BPT_MASK= 0x00000300; - - private static final int getClassFromProps(int props) { - return props&CLASS_MASK; - } - private static final boolean getFlagFromProps(int props, int shift) { - return ((props>>shift)&1)!=0; - } - private static final int getMirrorDeltaFromProps(int props) { - return (short)props>>MIRROR_DELTA_SHIFT; - } - - private static final int ESC_MIRROR_DELTA=-4; - - // definitions for 32-bit mirror table entry --------------------------- *** - - /* the source Unicode code point takes 21 bits (20..0) */ - private static final int MIRROR_INDEX_SHIFT=21; - - private static final int getMirrorCodePoint(int m) { - return m&0x1fffff; - } - private static final int getMirrorIndex(int m) { - return m>>>MIRROR_INDEX_SHIFT; - } - - - /* - * public singleton instance - */ - public static final UBiDiProps INSTANCE; - - // This static initializer block must be placed after - // other static member initialization - static { - try { - INSTANCE = new UBiDiProps(); - } catch (IOException e) { - throw new 
MissingResourceException(e.getMessage(),DATA_FILE_NAME,""); - } - } -} --- /dev/null 2020-01-10 15:57:40.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/UBiDiProps.java 2020-01-10 15:57:40.000000000 -0800 @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* + ******************************************************************************* + * + * Copyright (C) 2004-2014, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************* + * file name: UBiDiProps.java + * encoding: US-ASCII + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2005jan16 + * created by: Markus W. Scherer + * + * Low-level Unicode bidi/shaping properties access. 
+ * Java port of ubidi_props.h/.c. + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.util.VersionInfo; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.MissingResourceException; + +public final class UBiDiProps { + // constructors etc. --------------------------------------------------- *** + + // port of ubidi_openProps() + private UBiDiProps() throws IOException{ + ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME); + readData(bytes); + } + + private void readData(ByteBuffer bytes) throws IOException { + // read the header + ICUBinary.readHeader(bytes, FMT, new IsAcceptable()); + + // read indexes[] + int i, count; + count=bytes.getInt(); + if(countexpectedTrieLength) { + throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength); + + // read mirrors[] + count=indexes[IX_MIRROR_LENGTH]; + if(count>0) { + mirrors=new int[count]; + for(i=0; i>JT_SHIFT; + } + + public final int getJoiningGroup(int c) { + int start, limit; + + start=indexes[IX_JG_START]; + limit=indexes[IX_JG_LIMIT]; + if(start<=c && c>BPT_SHIFT; + } + + public final int getPairedBracket(int c) { + int props=trie.get(c); + if((props&BPT_MASK)==0) { + return c; + } else { + return getMirror(c, props); + } + } + + // data members -------------------------------------------------------- *** + private int indexes[]; + private int mirrors[]; + private byte jgArray[]; + private byte jgArray2[]; + + private Trie2_16 trie; + + // data format constants ----------------------------------------------- *** + @SuppressWarnings("deprecation") + private static final String DATA_FILE_NAME = + "/jdk/internal/icu/impl/data/icudt" + + VersionInfo.ICU_DATA_VERSION_PATH + + "/ubidi.icu"; + + /* format "BiDi" */ + private static final int FMT=0x42694469; + + /* indexes into indexes[] */ + private static final int 
IX_TRIE_SIZE=2; + private static final int IX_MIRROR_LENGTH=3; + + private static final int IX_JG_START=4; + private static final int IX_JG_LIMIT=5; + private static final int IX_JG_START2=6; /* new in format version 2.2, ICU 54 */ + private static final int IX_JG_LIMIT2=7; + + private static final int IX_TOP=16; + + // definitions for 16-bit bidi/shaping properties word ----------------- *** + + /* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */ + private static final int JT_SHIFT=5; /* joining type: 3 bits (7..5) */ + + private static final int BPT_SHIFT=8; /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */ + + private static final int MIRROR_DELTA_SHIFT=13; /* bidi mirroring delta: 3 bits (15..13) */ + + private static final int CLASS_MASK= 0x0000001f; + private static final int JT_MASK= 0x000000e0; + private static final int BPT_MASK= 0x00000300; + + private static final int getClassFromProps(int props) { + return props&CLASS_MASK; + } + private static final boolean getFlagFromProps(int props, int shift) { + return ((props>>shift)&1)!=0; + } + private static final int getMirrorDeltaFromProps(int props) { + return (short)props>>MIRROR_DELTA_SHIFT; + } + + private static final int ESC_MIRROR_DELTA=-4; + + // definitions for 32-bit mirror table entry --------------------------- *** + + /* the source Unicode code point takes 21 bits (20..0) */ + private static final int MIRROR_INDEX_SHIFT=21; + + private static final int getMirrorCodePoint(int m) { + return m&0x1fffff; + } + private static final int getMirrorIndex(int m) { + return m>>>MIRROR_INDEX_SHIFT; + } + + + /* + * public singleton instance + */ + public static final UBiDiProps INSTANCE; + + // This static initializer block must be placed after + // other static member initialization + static { + try { + INSTANCE = new UBiDiProps(); + } catch (IOException e) { + throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME,""); + } + } +} --- 
old/src/java.base/share/classes/sun/text/normalizer/UCharacterProperty.java 2020-01-10 15:57:41.000000000 -0800 +++ /dev/null 2020-01-10 15:57:41.000000000 -0800 @@ -1,607 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * Copyright (C) 1996-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.Iterator; -import java.util.MissingResourceException; - -import sun.text.normalizer.UCharacter.HangulSyllableType; -import sun.text.normalizer.UCharacter.NumericType; - -/** -*

Internal class used for Unicode character property database.

-*

This classes store binary data read from uprops.icu. -* It does not have the capability to parse the data into more high-level -* information. It only returns bytes of information when required.

-*

Due to the form most commonly used for retrieval, array of char is used -* to store the binary data.

-*

UCharacterPropertyDB also contains information on accessing indexes to -* significant points in the binary data.

-*

Responsibility for molding the binary data into more meaning form lies on -* UCharacter.

-* @author Syn Wee Quek -* @since release 2.1, february 1st 2002 -*/ - -final class UCharacterProperty -{ - // public data members ----------------------------------------------- - - /* - * public singleton instance - */ - public static final UCharacterProperty INSTANCE; - - /** - * Trie data - */ - public Trie2_16 m_trie_; - - /** - * Unicode version - */ - public VersionInfo m_unicodeVersion_; - - /** - * Character type mask - */ - public static final int TYPE_MASK = 0x1F; - - // uprops.h enum UPropertySource --------------------------------------- *** - - /** From uchar.c/uprops.icu main trie */ - public static final int SRC_CHAR=1; - /** From uchar.c/uprops.icu properties vectors trie */ - public static final int SRC_PROPSVEC=2; - /** From ubidi_props.c/ubidi.icu */ - public static final int SRC_BIDI=5; - /** From normalizer2impl.cpp/nfc.nrm */ - public static final int SRC_NFC=8; - /** From normalizer2impl.cpp/nfkc.nrm */ - public static final int SRC_NFKC=9; - - // public methods ---------------------------------------------------- - - /** - * Gets the main property value for code point ch. - * @param ch code point whose property value is to be retrieved - * @return property value of code point - */ - public final int getProperty(int ch) - { - return m_trie_.get(ch); - } - - /** - * Gets the unicode additional properties. - * Java version of C u_getUnicodeProperties(). - * @param codepoint codepoint whose additional properties is to be - * retrieved - * @param column The column index. - * @return unicode properties - */ - public int getAdditional(int codepoint, int column) { - assert column >= 0; - if (column >= m_additionalColumnsCount_) { - return 0; - } - return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; - } - - /** - *

Get the "age" of the code point.

- *

The "age" is the Unicode version when the code point was first - * designated (as a non-character or for Private Use) or assigned a - * character.

- *

This can be useful to avoid emitting code points to receiving - * processes that do not accept newer characters.

- *

The data is from the UCD file DerivedAge.txt.

- *

This API does not check the validity of the codepoint.

- * @param codepoint The code point. - * @return the Unicode version number - */ - public VersionInfo getAge(int codepoint) - { - int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; - return VersionInfo.getInstance( - (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, - version & LAST_NIBBLE_MASK_, 0, 0); - } - - // int-value and enumerated properties --------------------------------- *** - - public int getType(int c) { - return getProperty(c)&TYPE_MASK; - } - - /* - * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. - * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. - */ - private static final int /* UHangulSyllableType */ gcbToHst[]={ - HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ - HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ - HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ - HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ - HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ - HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ - HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ - HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ - HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ - HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ - /* - * Omit GCB values beyond what we need for hst. - * The code below checks for the array length. 
- */ - }; - - private class IntProperty { - int column; // SRC_PROPSVEC column, or "source" if mask==0 - int mask; - int shift; - - IntProperty(int column, int mask, int shift) { - this.column=column; - this.mask=mask; - this.shift=shift; - } - - IntProperty(int source) { - this.column=source; - this.mask=0; - } - - int getValue(int c) { - // systematic, directly stored properties - return (getAdditional(c, column)&mask)>>>shift; - } - } - - private class BiDiIntProperty extends IntProperty { - BiDiIntProperty() { - super(SRC_BIDI); - } - } - - private class CombiningClassIntProperty extends IntProperty { - CombiningClassIntProperty(int source) { - super(source); - } - } - - private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties - int which; - int max; - - NormQuickCheckIntProperty(int source, int which, int max) { - super(source); - this.which=which; - this.max=max; - } - } - - private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE - int getValue(int c) { - return UBiDiProps.INSTANCE.getPairedBracketType(c); - } - }; - - public int getIntPropertyValue(int c, int which) { - if (which == BIDI_PAIRED_BRACKET_TYPE) { - return intProp.getValue(c); - } - return 0; // undefined - } - - /** - * Forms a supplementary code point from the argument character
- * Note this is for internal use hence no checks for the validity of the - * surrogate characters are done - * @param lead lead surrogate character - * @param trail trailing surrogate character - * @return code point of the supplementary character - */ - public static int getRawSupplementary(char lead, char trail) - { - return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; - } - - /** - * Gets the type mask - * @param type character type - * @return mask - */ - public static final int getMask(int type) - { - return 1 << type; - } - - /** - * Returns the digit values of characters like 'A' - 'Z', normal, - * half-width and full-width. This method assumes that the other digit - * characters are checked by the calling method. - * @param ch character to test - * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise - * its corresponding digit will be returned. - */ - public static int getEuropeanDigit(int ch) { - if ((ch > 0x7a && ch < 0xff21) - || ch < 0x41 || (ch > 0x5a && ch < 0x61) - || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { - return -1; - } - if (ch <= 0x7a) { - // ch >= 0x41 or ch < 0x61 - return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); - } - // ch >= 0xff21 - if (ch <= 0xff3a) { - return ch + 10 - 0xff21; - } - // ch >= 0xff41 && ch <= 0xff5a - return ch + 10 - 0xff41; - } - - public int digit(int c) { - int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; - if(value<=9) { - return value; - } else { - return -1; - } - } - - // protected variables ----------------------------------------------- - - /** - * Extra property trie - */ - Trie2_16 m_additionalTrie_; - /** - * Extra property vectors, 1st column for age and second for binary - * properties. 
- */ - int m_additionalVectors_[]; - /** - * Number of additional columns - */ - int m_additionalColumnsCount_; - /** - * Maximum values for block, bits used as in vector word - * 0 - */ - int m_maxBlockScriptValue_; - /** - * Maximum values for script, bits used as in vector word - * 0 - */ - int m_maxJTGValue_; - /** - * Script_Extensions data - */ - public char[] m_scriptExtensions_; - - // private variables ------------------------------------------------- - - /** - * Default name of the datafile - */ - private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu"; - - /** - * Shift value for lead surrogate to form a supplementary character. - */ - private static final int LEAD_SURROGATE_SHIFT_ = 10; - /** - * Offset to add to combined surrogate pair to avoid masking. - */ - private static final int SURROGATE_OFFSET_ = - UTF16.SUPPLEMENTARY_MIN_VALUE - - (UTF16.SURROGATE_MIN_VALUE << - LEAD_SURROGATE_SHIFT_) - - UTF16.TRAIL_SURROGATE_MIN_VALUE; - - - // property data constants ------------------------------------------------- - - /** - * Numeric types and values in the main properties words. - */ - private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; - private static final int getNumericTypeValue(int props) { - return props >> NUMERIC_TYPE_VALUE_SHIFT_; - } - - /* constants for the storage form of numeric types and values */ - /** No numeric value. */ - private static final int NTV_NONE_ = 0; - /** Decimal digits: nv=0..9 */ - private static final int NTV_DECIMAL_START_ = 1; - /** Other digits: nv=0..9 */ - private static final int NTV_DIGIT_START_ = 11; - /** Small integers: nv=0..154 */ - private static final int NTV_NUMERIC_START_ = 21; - - private static final int ntvGetType(int ntv) { - return - (ntv==NTV_NONE_) ? 
NumericType.NONE : - (ntv expectedTrieLength) { - throw new IOException("uprops.icu: not enough bytes for main trie"); - } - // skip padding after trie bytes - ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); - - // skip unused intervening data structures - ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); - - if(m_additionalColumnsCount_ > 0) { - // reads the additional property block - m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); - expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; - trieLength = m_additionalTrie_.getSerializedLength(); - if(trieLength > expectedTrieLength) { - throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); - } - // skip padding after trie bytes - ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); - - // additional properties - int size = scriptExtensionsOffset - additionalVectorsOffset; - m_additionalVectors_ = new int[size]; - for (int i = 0; i < size; i ++) { - m_additionalVectors_[i] = bytes.getInt(); - } - } - - // Script_Extensions - int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; - if(numChars > 0) { - m_scriptExtensions_ = new char[numChars]; - for(int i = 0; i < numChars; ++i) { - m_scriptExtensions_[i] = bytes.getChar(); - } - } - } - - private static final class IsAcceptable implements ICUBinary.Authenticate { - // @Override when we switch to Java 6 - public boolean isDataVersionAcceptable(byte version[]) { - return version[0] == 7; - } - } - - private static final int DATA_FORMAT = 0x5550726F; // "UPro" - - public void upropsvec_addPropertyStarts(UnicodeSet set) { - /* add the start code point of each same-value range of the properties vectors trie */ - if(m_additionalColumnsCount_>0) { - /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ - Iterator trieIterator = m_additionalTrie_.iterator(); - Trie2.Range range; - while(trieIterator.hasNext() && 
!(range=trieIterator.next()).leadSurrogate) { - set.add(range.startCodePoint); - } - } - } - - // This static initializer block must be placed after - // other static member initialization - static { - try { - INSTANCE = new UCharacterProperty(); - } - catch (IOException e) { - throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); - } - } - - - // Moved from UProperty.java - /** - * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). - * Used in UAX #9: Unicode Bidirectional Algorithm - * (http://www.unicode.org/reports/tr9/) - * Returns UCharacter.BidiPairedBracketType values. - * @stable ICU 52 - */ - public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; - -} --- /dev/null 2020-01-10 15:57:41.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/UCharacterProperty.java 2020-01-10 15:57:41.000000000 -0800 @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* + ******************************************************************************* + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; +import java.util.MissingResourceException; + +import jdk.internal.icu.lang.UCharacter.HangulSyllableType; +import jdk.internal.icu.lang.UCharacter.NumericType; +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.text.UnicodeSet; +import jdk.internal.icu.util.VersionInfo; + +/** +*

Internal class used for Unicode character property database.

+*

This class stores binary data read from uprops.icu. +* It does not have the capability to parse the data into more high-level +* information. It only returns bytes of information when required.

+*

Due to the form most commonly used for retrieval, an array of char is used +* to store the binary data.

+*

UCharacterPropertyDB also contains information on accessing indexes to +* significant points in the binary data.

+*

Responsibility for molding the binary data into a more meaningful form lies with +* UCharacter.

+* @author Syn Wee Quek +* @since release 2.1, february 1st 2002 +*/ + +public final class UCharacterProperty +{ + // public data members ----------------------------------------------- + + /* + * public singleton instance + */ + public static final UCharacterProperty INSTANCE; + + /** + * Trie data + */ + public Trie2_16 m_trie_; + + /** + * Unicode version + */ + public VersionInfo m_unicodeVersion_; + + /** + * Character type mask + */ + public static final int TYPE_MASK = 0x1F; + + // uprops.h enum UPropertySource --------------------------------------- *** + + /** From uchar.c/uprops.icu main trie */ + public static final int SRC_CHAR=1; + /** From uchar.c/uprops.icu properties vectors trie */ + public static final int SRC_PROPSVEC=2; + /** From ubidi_props.c/ubidi.icu */ + public static final int SRC_BIDI=5; + /** From normalizer2impl.cpp/nfc.nrm */ + public static final int SRC_NFC=8; + /** From normalizer2impl.cpp/nfkc.nrm */ + public static final int SRC_NFKC=9; + + // public methods ---------------------------------------------------- + + /** + * Gets the main property value for code point ch. + * @param ch code point whose property value is to be retrieved + * @return property value of code point + */ + public final int getProperty(int ch) + { + return m_trie_.get(ch); + } + + /** + * Gets the unicode additional properties. + * Java version of C u_getUnicodeProperties(). + * @param codepoint codepoint whose additional properties is to be + * retrieved + * @param column The column index. + * @return unicode properties + */ + public int getAdditional(int codepoint, int column) { + assert column >= 0; + if (column >= m_additionalColumnsCount_) { + return 0; + } + return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; + } + + /** + *

Get the "age" of the code point.

+ *

The "age" is the Unicode version when the code point was first + * designated (as a non-character or for Private Use) or assigned a + * character.

+ *

This can be useful to avoid emitting code points to receiving + * processes that do not accept newer characters.

+ *

The data is from the UCD file DerivedAge.txt.

+ *

This API does not check the validity of the codepoint.

+ * @param codepoint The code point. + * @return the Unicode version number + */ + public VersionInfo getAge(int codepoint) + { + int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; + return VersionInfo.getInstance( + (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, + version & LAST_NIBBLE_MASK_, 0, 0); + } + + // int-value and enumerated properties --------------------------------- *** + + public int getType(int c) { + return getProperty(c)&TYPE_MASK; + } + + /* + * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. + * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. + */ + private static final int /* UHangulSyllableType */ gcbToHst[]={ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ + HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ + HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ + HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ + HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ + HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ + /* + * Omit GCB values beyond what we need for hst. + * The code below checks for the array length. 
+ */ + }; + + private class IntProperty { + int column; // SRC_PROPSVEC column, or "source" if mask==0 + int mask; + int shift; + + IntProperty(int column, int mask, int shift) { + this.column=column; + this.mask=mask; + this.shift=shift; + } + + IntProperty(int source) { + this.column=source; + this.mask=0; + } + + int getValue(int c) { + // systematic, directly stored properties + return (getAdditional(c, column)&mask)>>>shift; + } + } + + private class BiDiIntProperty extends IntProperty { + BiDiIntProperty() { + super(SRC_BIDI); + } + } + + private class CombiningClassIntProperty extends IntProperty { + CombiningClassIntProperty(int source) { + super(source); + } + } + + private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties + int which; + int max; + + NormQuickCheckIntProperty(int source, int which, int max) { + super(source); + this.which=which; + this.max=max; + } + } + + private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE + int getValue(int c) { + return UBiDiProps.INSTANCE.getPairedBracketType(c); + } + }; + + public int getIntPropertyValue(int c, int which) { + if (which == BIDI_PAIRED_BRACKET_TYPE) { + return intProp.getValue(c); + } + return 0; // undefined + } + + /** + * Forms a supplementary code point from the argument character
+ * Note this is for internal use hence no checks for the validity of the + * surrogate characters are done + * @param lead lead surrogate character + * @param trail trailing surrogate character + * @return code point of the supplementary character + */ + public static int getRawSupplementary(char lead, char trail) + { + return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; + } + + /** + * Gets the type mask + * @param type character type + * @return mask + */ + public static final int getMask(int type) + { + return 1 << type; + } + + /** + * Returns the digit values of characters like 'A' - 'Z', normal, + * half-width and full-width. This method assumes that the other digit + * characters are checked by the calling method. + * @param ch character to test + * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise + * its corresponding digit will be returned. + */ + public static int getEuropeanDigit(int ch) { + if ((ch > 0x7a && ch < 0xff21) + || ch < 0x41 || (ch > 0x5a && ch < 0x61) + || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { + return -1; + } + if (ch <= 0x7a) { + // ch >= 0x41 or ch < 0x61 + return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); + } + // ch >= 0xff21 + if (ch <= 0xff3a) { + return ch + 10 - 0xff21; + } + // ch >= 0xff41 && ch <= 0xff5a + return ch + 10 - 0xff41; + } + + public int digit(int c) { + int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; + if(value<=9) { + return value; + } else { + return -1; + } + } + + // protected variables ----------------------------------------------- + + /** + * Extra property trie + */ + Trie2_16 m_additionalTrie_; + /** + * Extra property vectors, 1st column for age and second for binary + * properties. 
+ */ + int m_additionalVectors_[]; + /** + * Number of additional columns + */ + int m_additionalColumnsCount_; + /** + * Maximum values for block, bits used as in vector word + * 0 + */ + int m_maxBlockScriptValue_; + /** + * Maximum values for script, bits used as in vector word + * 0 + */ + int m_maxJTGValue_; + /** + * Script_Extensions data + */ + public char[] m_scriptExtensions_; + + // private variables ------------------------------------------------- + + /** + * Default name of the datafile + */ + @SuppressWarnings("deprecation") + private static final String DATA_FILE_NAME_ = + "/jdk/internal/icu/impl/data/icudt" + + VersionInfo.ICU_DATA_VERSION_PATH + + "/uprops.icu"; + + /** + * Shift value for lead surrogate to form a supplementary character. + */ + private static final int LEAD_SURROGATE_SHIFT_ = 10; + /** + * Offset to add to combined surrogate pair to avoid masking. + */ + private static final int SURROGATE_OFFSET_ = + UTF16.SUPPLEMENTARY_MIN_VALUE - + (UTF16.SURROGATE_MIN_VALUE << + LEAD_SURROGATE_SHIFT_) - + UTF16.TRAIL_SURROGATE_MIN_VALUE; + + + // property data constants ------------------------------------------------- + + /** + * Numeric types and values in the main properties words. + */ + private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; + private static final int getNumericTypeValue(int props) { + return props >> NUMERIC_TYPE_VALUE_SHIFT_; + } + + /* constants for the storage form of numeric types and values */ + /** No numeric value. */ + private static final int NTV_NONE_ = 0; + /** Decimal digits: nv=0..9 */ + private static final int NTV_DECIMAL_START_ = 1; + /** Other digits: nv=0..9 */ + private static final int NTV_DIGIT_START_ = 11; + /** Small integers: nv=0..154 */ + private static final int NTV_NUMERIC_START_ = 21; + + private static final int ntvGetType(int ntv) { + return + (ntv==NTV_NONE_) ? 
NumericType.NONE : + (ntv expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for main trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // skip unused intervening data structures + ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); + + if(m_additionalColumnsCount_ > 0) { + // reads the additional property block + m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); + expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; + trieLength = m_additionalTrie_.getSerializedLength(); + if(trieLength > expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // additional properties + int size = scriptExtensionsOffset - additionalVectorsOffset; + m_additionalVectors_ = new int[size]; + for (int i = 0; i < size; i ++) { + m_additionalVectors_[i] = bytes.getInt(); + } + } + + // Script_Extensions + int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; + if(numChars > 0) { + m_scriptExtensions_ = new char[numChars]; + for(int i = 0; i < numChars; ++i) { + m_scriptExtensions_[i] = bytes.getChar(); + } + } + } + + private static final class IsAcceptable implements ICUBinary.Authenticate { + // @Override when we switch to Java 6 + public boolean isDataVersionAcceptable(byte version[]) { + return version[0] == 7; + } + } + + private static final int DATA_FORMAT = 0x5550726F; // "UPro" + + public void upropsvec_addPropertyStarts(UnicodeSet set) { + /* add the start code point of each same-value range of the properties vectors trie */ + if(m_additionalColumnsCount_>0) { + /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ + Iterator trieIterator = m_additionalTrie_.iterator(); + Trie2.Range range; + while(trieIterator.hasNext() && 
!(range=trieIterator.next()).leadSurrogate) { + set.add(range.startCodePoint); + } + } + } + + // This static initializer block must be placed after + // other static member initialization + static { + try { + INSTANCE = new UCharacterProperty(); + } + catch (IOException e) { + throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); + } + } + + + // Moved from UProperty.java + /** + * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + * Used in UAX #9: Unicode Bidirectional Algorithm + * (http://www.unicode.org/reports/tr9/) + * Returns UCharacter.BidiPairedBracketType values. + * @stable ICU 52 + */ + public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; + +} --- old/src/java.base/share/classes/sun/text/normalizer/UnicodeSetStringSpan.java 2020-01-10 15:57:42.000000000 -0800 +++ /dev/null 2020-01-10 15:57:43.000000000 -0800 @@ -1,1165 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ****************************************************************************** - * - * Copyright (C) 2009-2014, International Business Machines - * Corporation and others. All Rights Reserved. - * - ****************************************************************************** - */ - -package sun.text.normalizer; - -import java.util.ArrayList; - -import sun.text.normalizer.UnicodeSet.SpanCondition; - -/* - * Implement span() etc. for a set with strings. - * Avoid recursion because of its exponential complexity. - * Instead, try multiple paths at once and track them with an IndexList. - */ -class UnicodeSetStringSpan { - - /* - * Which span() variant will be used? The object is either built for one variant and used once, - * or built for all and may be used many times. - */ - public static final int WITH_COUNT = 0x40; // spanAndCount() may be called - public static final int FWD = 0x20; - public static final int BACK = 0x10; - // public static final int UTF16 = 8; - public static final int CONTAINED = 2; - public static final int NOT_CONTAINED = 1; - - public static final int ALL = 0x7f; - - public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; - public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; - public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; - public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; - - /** - * Special spanLength short values. (since Java has not unsigned byte type) - * All code points in the string are contained in the parent set. - */ - static final short ALL_CP_CONTAINED = 0xff; - - /** The spanLength is >=0xfe. */ - static final short LONG_SPAN = ALL_CP_CONTAINED - 1; - - /** Set for span(). Same as parent but without strings. 
*/ - private UnicodeSet spanSet; - - /** - * Set for span(not contained). - * Same as spanSet, plus characters that start or end strings. - */ - private UnicodeSet spanNotSet; - - /** The strings of the parent set. */ - private ArrayList strings; - - /** The lengths of span(), spanBack() etc. for each string. */ - private short[] spanLengths; - - /** Maximum lengths of relevant strings. */ - private int maxLength16; - - /** Are there strings that are not fully contained in the code point set? */ - private boolean someRelevant; - - /** Set up for all variants of span()? */ - private boolean all; - - /** Span helper */ - private OffsetList offsets; - - /** - * Constructs for all variants of span(), or only for any one variant. - * Initializes as little as possible, for single use. - */ - public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList setStrings, int which) { - spanSet = new UnicodeSet(0, 0x10ffff); - // TODO: With Java 6, just take the parent set's strings as is, - // as a NavigableSet, rather than as an ArrayList copy of the set of strings. - // Then iterate via the first() and higher() methods. - // (We do not want to create multiple Iterator objects in each span().) - // See ICU ticket #7454. - strings = setStrings; - all = (which == ALL); - spanSet.retainAll(set); - if (0 != (which & NOT_CONTAINED)) { - // Default to the same sets. - // addToSpanNotSet() will create a separate set if necessary. - spanNotSet = spanSet; - } - offsets = new OffsetList(); - - // Determine if the strings even need to be taken into account at all for span() etc. - // If any string is relevant, then all strings need to be used for - // span(longest match) but only the relevant ones for span(while contained). - // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH - // and do not store UTF-8 strings if !thisRelevant and CONTAINED. - // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 
- // Also count the lengths of the UTF-8 versions of the strings for memory allocation. - int stringsLength = strings.size(); - - int i, spanLength; - someRelevant = false; - for (i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - spanLength = spanSet.span(string, SpanCondition.CONTAINED); - if (spanLength < length16) { // Relevant string. - someRelevant = true; - } - if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { - maxLength16 = length16; - } - } - if (!someRelevant && (which & WITH_COUNT) == 0) { - return; - } - - // Freeze after checking for the need to use strings at all because freezing - // a set takes some time and memory which are wasted if there are no relevant strings. - if (all) { - spanSet.freeze(); - } - - int spanBackLengthsOffset; - - // Allocate a block of meta data. - int allocSize; - if (all) { - // 2 sets of span lengths - allocSize = stringsLength * (2); - } else { - allocSize = stringsLength; // One set of span lengths. - } - spanLengths = new short[allocSize]; - - if (all) { - // Store span lengths for all span() variants. - spanBackLengthsOffset = stringsLength; - } else { - // Store span lengths for only one span() variant. - spanBackLengthsOffset = 0; - } - - // Set the meta data and spanNotSet and write the UTF-8 strings. - - for (i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - spanLength = spanSet.span(string, SpanCondition.CONTAINED); - if (spanLength < length16) { // Relevant string. 
- if (true /* 0 != (which & UTF16) */) { - if (0 != (which & CONTAINED)) { - if (0 != (which & FWD)) { - spanLengths[i] = makeSpanLengthByte(spanLength); - } - if (0 != (which & BACK)) { - spanLength = length16 - - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); - spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); - } - } else /* not CONTAINED, not all, but NOT_CONTAINED */{ - spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant - // flag. - } - } - if (0 != (which & NOT_CONTAINED)) { - // Add string start and end code points to the spanNotSet so that - // a span(while not contained) stops before any string. - int c; - if (0 != (which & FWD)) { - c = string.codePointAt(0); - addToSpanNotSet(c); - } - if (0 != (which & BACK)) { - c = string.codePointBefore(length16); - addToSpanNotSet(c); - } - } - } else { // Irrelevant string. - if (all) { - spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; - } else { - // All spanXYZLengths pointers contain the same address. - spanLengths[i] = ALL_CP_CONTAINED; - } - } - } - - // Finish. - if (all) { - spanNotSet.freeze(); - } - } - - /** - * Do the strings need to be checked in span() etc.? - * - * @return true if strings need to be checked (call span() here), - * false if not (use a BMPSet for best performance). - */ - public boolean needsStringSpanUTF16() { - return someRelevant; - } - - /** For fast UnicodeSet::contains(c). */ - public boolean contains(int c) { - return spanSet.contains(c); - } - - /** - * Adds a starting or ending string character to the spanNotSet - * so that a character span ends before any string. - */ - private void addToSpanNotSet(int c) { - if (spanNotSet == null || spanNotSet == spanSet) { - if (spanSet.contains(c)) { - return; // Nothing to do. 
- } - spanNotSet = spanSet.cloneAsThawed(); - } - spanNotSet.add(c); - } - - /* - * Note: In span() when spanLength==0 - * (after a string match, or at the beginning after an empty code point span) - * and in spanNot() and spanNotUTF8(), - * string matching could use a binary search because all string matches are done - * from the same start index. - * - * For UTF-8, this would require a comparison function that returns UTF-16 order. - * - * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets - * with strings have very few very short strings. For cases with many strings, it might be better to use a different - * API and implementation with a DFA (state machine). - */ - - /* - * Algorithm for span(SpanCondition.CONTAINED) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then remember to continue after it. - * + If a set string matches at the current position, then remember to continue after it. - * + Either recursively span for each code point or string match, or recursively span - * for all but the shortest one and iteratively continue the span with the shortest local match. - * + Remember the longest recursive span (the farthest end point). - * + If there is no match at the current position, - * neither for the code point there nor for any set string, - * then stop and return the longest recursive span length. - * - * Optimized implementation: - * - * (We assume that most sets will have very few very short strings. - * A span using a string-less set is extremely fast.) - * - * Create and cache a spanSet which contains all of the single code points of the original set - * but none of its strings. - * - * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - * - Loop: - * + Try to match each set string at the end of the spanLength. 
- * ~ Set strings that start with set-contained code points - * must be matched with a partial overlap - * because the recursive algorithm would have tried to match them at every position. - * ~ Set strings that entirely consist of set-contained code points - * are irrelevant for span(SpanCondition.CONTAINED) - * because the recursive algorithm would continue after them anyway and - * find the longest recursive match from their end. - * ~ Rather than recursing, note each end point of a set string match. - * + If no set string matched after spanSet.span(), - * then return with where the spanSet.span() ended. - * + If at least one set string matched after spanSet.span(), - * then pop the shortest string match end point and continue the loop, - * trying to match all set strings from there. - * + If at least one more set string matched after a previous string match, then test if the - * code point after the previous string match is also contained in the set. - * Continue the loop with the shortest end point of - * either this code point or a matching set string. - * + If no more set string matched after a previous string match, - * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). - * Stop if spanLength==0, otherwise continue the loop. - * - * By noting each end point of a set string match, the function visits each string position at most once and - * finishes in linear time. - * - * The recursive algorithm may visit the same string position many times - * if multiple paths lead to it and finishes in exponential time. - */ - - /* - * Algorithm for span(SIMPLE) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then remember to continue after it. - * + If a set string matches at the current position, then remember to continue after it. - * + Continue from the farthest match position and ignore all others. 
- * + If there is no match at the current position, then stop and return the current position. - * - * Optimized implementation: - * - * (Same assumption and spanSet as above.) - * - * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). - * - Loop: - * + Try to match each set string at the end of the spanLength. - * ~ Set strings that start with set-contained code points - * must be matched with a partial overlap - * because the standard algorithm would have tried to match them earlier. - * ~ Set strings that entirely consist of set-contained code points - * must be matched with a full overlap because the longest-match algorithm - * would hide set string matches that end earlier. - * Such set strings need not be matched earlier inside the code point span - * because the standard algorithm would then have - * continued after the set string match anyway. - * ~ Remember the longest set string match (farthest end point) - * from the earliest starting point. - * + If no set string matched after spanSet.span(), - * then return with where the spanSet.span() ended. - * + If at least one set string matched, - * then continue the loop after the longest match from the earliest position. - * + If no more set string matched after a previous string match, - * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). - * Stop if spanLength==0, otherwise continue the loop. - */ - /** - * Spans a string. 
- * - * @param s The string to be spanned - * @param start The start index that the span begins - * @param spanCondition The span condition - * @return the limit (exclusive end) of the span - */ - public int span(CharSequence s, int start, SpanCondition spanCondition) { - if (spanCondition == SpanCondition.NOT_CONTAINED) { - return spanNot(s, start, null); - } - int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); - if (spanLimit == s.length()) { - return spanLimit; - } - return spanWithStrings(s, start, spanLimit, spanCondition); - } - - /** - * Synchronized method for complicated spans using the offsets. - * Avoids synchronization for simple cases. - * - * @param spanLimit = spanSet.span(s, start, CONTAINED) - */ - private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, - SpanCondition spanCondition) { - // Consider strings; they may overlap with the span. - int initSize = 0; - if (spanCondition == SpanCondition.CONTAINED) { - // Use offset list to try all possibilities. - initSize = maxLength16; - } - offsets.setMaxLength(initSize); - int length = s.length(); - int pos = spanLimit, rest = length - spanLimit; - int spanLength = spanLimit - start; - int i, stringsLength = strings.size(); - for (;;) { - if (spanCondition == SpanCondition.CONTAINED) { - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[i]; - if (overlap == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-overlap..pos. - if (overlap >= LONG_SPAN) { - overlap = length16; - // While contained: No point matching fully inside the code point span. - overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code - // point. - } - if (overlap > spanLength) { - overlap = spanLength; - } - int inc = length16 - overlap; // Keep overlap+inc==length16. 
- for (;;) { - if (inc > rest) { - break; - } - // Try to match if the increment is not listed already. - if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { - if (inc == rest) { - return length; // Reached the end of the string. - } - offsets.addOffset(inc); - } - if (overlap == 0) { - break; - } - --overlap; - ++inc; - } - } - } else /* SIMPLE */{ - int maxInc = 0, maxOverlap = 0; - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the earliest start. - - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-overlap..pos. - if (overlap >= LONG_SPAN) { - overlap = length16; - // Longest match: Need to match fully inside the code point span - // to find the match from the earliest start. - } - if (overlap > spanLength) { - overlap = spanLength; - } - int inc = length16 - overlap; // Keep overlap+inc==length16. - for (;;) { - if (inc > rest || overlap < maxOverlap) { - break; - } - // Try to match if the string is longer or starts earlier. - if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) - && matches16CPB(s, pos - overlap, length, string, length16)) { - maxInc = inc; // Longest match from earliest start. - maxOverlap = overlap; - break; - } - --overlap; - ++inc; - } - } - - if (maxInc != 0 || maxOverlap != 0) { - // Longest-match algorithm, and there was a string match. - // Simply continue after it. - pos += maxInc; - rest -= maxInc; - if (rest == 0) { - return length; // Reached the end of the string. - } - spanLength = 0; // Match strings from after a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if (spanLength != 0 || pos == 0) { - // The position is after an unlimited code point span (spanLength!=0), - // not after a string match. 
- // The only position where spanLength==0 after a span is pos==0. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if (offsets.isEmpty()) { - return pos; // No strings matched after a span. - } - // Match strings from after the next string match. - } else { - // The position is after a string match (or a single code point). - if (offsets.isEmpty()) { - // No more strings matched after a previous string match. - // Try another code point span from after the last string match. - spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); - spanLength = spanLimit - pos; - if (spanLength == rest || // Reached the end of the string, or - spanLength == 0 // neither strings nor span progressed. - ) { - return spanLimit; - } - pos += spanLength; - rest -= spanLength; - continue; // spanLength>0: Match strings from after a span. - } else { - // Try to match only one code point from after a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength = spanOne(spanSet, s, pos, rest); - if (spanLength > 0) { - if (spanLength == rest) { - return length; // Reached the end of the string. - } - // Match strings after this code point. - // There cannot be any increments below it because UnicodeSet strings - // contain multiple code points. - pos += spanLength; - rest -= spanLength; - offsets.shift(spanLength); - spanLength = 0; - continue; // Match strings from after a single code point. - } - // Match strings from after the next string match. - } - } - int minOffset = offsets.popMinimum(null); - pos += minOffset; - rest -= minOffset; - spanLength = 0; // Match strings from after a string match. - } - } - - /** - * Spans a string and counts the smallest number of set elements on any path across the span. - * - *

For proper counting, we cannot ignore strings that are fully contained in code point spans. - * - *

If the set does not have any fully-contained strings, then we could optimize this - * like span(), but such sets are likely rare, and this is at least still linear. - * - * @param s The string to be spanned - * @param start The start index that the span begins - * @param spanCondition The span condition - * @param outCount The count - * @return the limit (exclusive end) of the span - */ - public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, - OutputInt outCount) { - if (spanCondition == SpanCondition.NOT_CONTAINED) { - return spanNot(s, start, outCount); - } - // Consider strings; they may overlap with the span, - // and they may result in a smaller count that with just code points. - if (spanCondition == SpanCondition.CONTAINED) { - return spanContainedAndCount(s, start, outCount); - } - // SIMPLE (not synchronized, does not use offsets) - int stringsLength = strings.size(); - int length = s.length(); - int pos = start; - int rest = length - start; - int count = 0; - while (rest != 0) { - // Try to match the next code point. - int cpLength = spanOne(spanSet, s, pos, rest); - int maxInc = (cpLength > 0) ? cpLength : 0; - // Try to match all of the strings. - for (int i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - if (maxInc < length16 && length16 <= rest && - matches16CPB(s, pos, length, string, length16)) { - maxInc = length16; - } - } - // We are done if there is no match beyond pos. - if (maxInc == 0) { - outCount.value = count; - return pos; - } - // Continue from the longest match. - ++count; - pos += maxInc; - rest -= maxInc; - } - outCount.value = count; - return pos; - } - - private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { - // Use offset list to try all possibilities. 
- offsets.setMaxLength(maxLength16); - int stringsLength = strings.size(); - int length = s.length(); - int pos = start; - int rest = length - start; - int count = 0; - while (rest != 0) { - // Try to match the next code point. - int cpLength = spanOne(spanSet, s, pos, rest); - if (cpLength > 0) { - offsets.addOffsetAndCount(cpLength, count + 1); - } - // Try to match all of the strings. - for (int i = 0; i < stringsLength; ++i) { - String string = strings.get(i); - int length16 = string.length(); - // Note: If the strings were sorted by length, then we could also - // avoid trying to match if there is already a match of the same length. - if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && - matches16CPB(s, pos, length, string, length16)) { - offsets.addOffsetAndCount(length16, count + 1); - } - } - // We are done if there is no match beyond pos. - if (offsets.isEmpty()) { - outCount.value = count; - return pos; - } - // Continue from the nearest match. - int minOffset = offsets.popMinimum(outCount); - count = outCount.value; - pos += minOffset; - rest -= minOffset; - } - outCount.value = count; - return pos; - } - - /** - * Span a string backwards. - * - * @param s The string to be spanned - * @param spanCondition The span condition - * @return The string index which starts the span (i.e. inclusive). - */ - public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { - if (spanCondition == SpanCondition.NOT_CONTAINED) { - return spanNotBack(s, length); - } - int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); - if (pos == 0) { - return 0; - } - int spanLength = length - pos; - - // Consider strings; they may overlap with the span. - int initSize = 0; - if (spanCondition == SpanCondition.CONTAINED) { - // Use offset list to try all possibilities. 
- initSize = maxLength16; - } - offsets.setMaxLength(initSize); - int i, stringsLength = strings.size(); - int spanBackLengthsOffset = 0; - if (all) { - spanBackLengthsOffset = stringsLength; - } - for (;;) { - if (spanCondition == SpanCondition.CONTAINED) { - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[spanBackLengthsOffset + i]; - if (overlap == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-(length16-overlap)..pos-length16. - if (overlap >= LONG_SPAN) { - overlap = length16; - // While contained: No point matching fully inside the code point span. - int len1 = 0; - len1 = string.offsetByCodePoints(0, 1); - overlap -= len1; // Length of the string minus the first code point. - } - if (overlap > spanLength) { - overlap = spanLength; - } - int dec = length16 - overlap; // Keep dec+overlap==length16. - for (;;) { - if (dec > pos) { - break; - } - // Try to match if the decrement is not listed already. - if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { - if (dec == pos) { - return 0; // Reached the start of the string. - } - offsets.addOffset(dec); - } - if (overlap == 0) { - break; - } - --overlap; - ++dec; - } - } - } else /* SIMPLE */{ - int maxDec = 0, maxOverlap = 0; - for (i = 0; i < stringsLength; ++i) { - int overlap = spanLengths[spanBackLengthsOffset + i]; - // For longest match, we do need to try to match even an all-contained string - // to find the match from the latest end. - - String string = strings.get(i); - - int length16 = string.length(); - - // Try to match this string at pos-(length16-overlap)..pos-length16. - if (overlap >= LONG_SPAN) { - overlap = length16; - // Longest match: Need to match fully inside the code point span - // to find the match from the latest end. 
- } - if (overlap > spanLength) { - overlap = spanLength; - } - int dec = length16 - overlap; // Keep dec+overlap==length16. - for (;;) { - if (dec > pos || overlap < maxOverlap) { - break; - } - // Try to match if the string is longer or ends later. - if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) - && matches16CPB(s, pos - dec, length, string, length16)) { - maxDec = dec; // Longest match from latest end. - maxOverlap = overlap; - break; - } - --overlap; - ++dec; - } - } - - if (maxDec != 0 || maxOverlap != 0) { - // Longest-match algorithm, and there was a string match. - // Simply continue before it. - pos -= maxDec; - if (pos == 0) { - return 0; // Reached the start of the string. - } - spanLength = 0; // Match strings from before a string match. - continue; - } - } - // Finished trying to match all strings at pos. - - if (spanLength != 0 || pos == length) { - // The position is before an unlimited code point span (spanLength!=0), - // not before a string match. - // The only position where spanLength==0 before a span is pos==length. - // Otherwise, an unlimited code point span is only tried again when no - // strings match, and if such a non-initial span fails we stop. - if (offsets.isEmpty()) { - return pos; // No strings matched before a span. - } - // Match strings from before the next string match. - } else { - // The position is before a string match (or a single code point). - if (offsets.isEmpty()) { - // No more strings matched before a previous string match. - // Try another code point span from before the last string match. - int oldPos = pos; - pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); - spanLength = oldPos - pos; - if (pos == 0 || // Reached the start of the string, or - spanLength == 0 // neither strings nor span progressed. - ) { - return pos; - } - continue; // spanLength>0: Match strings from before a span. 
- } else { - // Try to match only one code point from before a string match if some - // string matched beyond it, so that we try all possible positions - // and don't overshoot. - spanLength = spanOneBack(spanSet, s, pos); - if (spanLength > 0) { - if (spanLength == pos) { - return 0; // Reached the start of the string. - } - // Match strings before this code point. - // There cannot be any decrements below it because UnicodeSet strings - // contain multiple code points. - pos -= spanLength; - offsets.shift(spanLength); - spanLength = 0; - continue; // Match strings from before a single code point. - } - // Match strings from before the next string match. - } - } - pos -= offsets.popMinimum(null); - spanLength = 0; // Match strings from before a string match. - } - } - - /** - * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) - * - * Theoretical algorithm: - * - Iterate through the string, and at each code point boundary: - * + If the code point there is in the set, then return with the current position. - * + If a set string matches at the current position, then return with the current position. - * - * Optimized implementation: - * - * (Same assumption as for span() above.) - * - * Create and cache a spanNotSet which contains - * all of the single code points of the original set but none of its strings. - * For each set string add its initial code point to the spanNotSet. - * (Also add its final code point for spanNotBack().) - * - * - Loop: - * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). - * + If the current code point is in the original set, then return the current position. - * + If any set string matches at the current position, then return the current position. - * + If there is no match at the current position, neither for the code point - * there nor for any set string, then skip this code point and continue the loop. 
- * This happens for set-string-initial code points that were added to spanNotSet - * when there is not actually a match for such a set string. - * - * @param s The string to be spanned - * @param start The start index that the span begins - * @param outCount If not null: Receives the number of code points across the span. - * @return the limit (exclusive end) of the span - */ - private int spanNot(CharSequence s, int start, OutputInt outCount) { - int length = s.length(); - int pos = start, rest = length - start; - int stringsLength = strings.size(); - int count = 0; - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - int spanLimit; - if (outCount == null) { - spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); - } else { - spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); - outCount.value = count = count + outCount.value; - } - if (spanLimit == length) { - return length; // Reached the end of the string. - } - pos = spanLimit; - rest = length - spanLimit; - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int cpLength = spanOne(spanSet, s, pos, rest); - if (cpLength > 0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - for (int i = 0; i < stringsLength; ++i) { - if (spanLengths[i] == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { - return pos; // There is a set element at pos. - } - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. 
- // cpLength<0 - pos -= cpLength; - rest += cpLength; - ++count; - } while (rest != 0); - if (outCount != null) { - outCount.value = count; - } - return length; // Reached the end of the string. - } - - private int spanNotBack(CharSequence s, int length) { - int pos = length; - int i, stringsLength = strings.size(); - do { - // Span until we find a code point from the set, - // or a code point that starts or ends some string. - pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); - if (pos == 0) { - return 0; // Reached the start of the string. - } - - // Check whether the current code point is in the original set, - // without the string starts and ends. - int cpLength = spanOneBack(spanSet, s, pos); - if (cpLength > 0) { - return pos; // There is a set element at pos. - } - - // Try to match the strings at pos. - for (i = 0; i < stringsLength; ++i) { - // Use spanLengths rather than a spanLengths pointer because - // it is easier and we only need to know whether the string is irrelevant - // which is the same in either array. - if (spanLengths[i] == ALL_CP_CONTAINED) { - continue; // Irrelevant string. - } - String string = strings.get(i); - - int length16 = string.length(); - if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { - return pos; // There is a set element at pos. - } - } - - // The span(while not contained) ended on a string start/end which is - // not in the original set. Skip this code point and continue. - // cpLength<0 - pos += cpLength; - } while (pos != 0); - return 0; // Reached the start of the string. - } - - static short makeSpanLengthByte(int spanLength) { - // 0xfe==UnicodeSetStringSpan::LONG_SPAN - return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; - } - - // Compare strings without any argument checks. Requires length>0. 
- private static boolean matches16(CharSequence s, int start, final String t, int length) { - int end = start + length; - while (length-- > 0) { - if (s.charAt(--end) != t.charAt(length)) { - return false; - } - } - return true; - } - - /** - * Compare 16-bit Unicode strings (which may be malformed UTF-16) - * at code point boundaries. - * That is, each edge of a match must not be in the middle of a surrogate pair. - * @param s The string to match in. - * @param start The start index of s. - * @param limit The limit of the subsequence of s being spanned. - * @param t The substring to be matched in s. - * @param tlength The length of t. - */ - static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { - return matches16(s, start, t, tlength) - && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && - Character.isLowSurrogate(s.charAt(start))) - && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && - Character.isLowSurrogate(s.charAt(start + tlength))); - } - - /** - * Does the set contain the next code point? - * If so, return its length; otherwise return its negative length. - */ - static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { - char c = s.charAt(start); - if (c >= 0xd800 && c <= 0xdbff && length >= 2) { - char c2 = s.charAt(start + 1); - if (UTF16.isTrailSurrogate(c2)) { - int supplementary = UCharacterProperty.getRawSupplementary(c, c2); - return set.contains(supplementary) ? 2 : -2; - } - } - return set.contains(c) ? 1 : -1; - } - - static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { - char c = s.charAt(length - 1); - if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { - char c2 = s.charAt(length - 2); - if (UTF16.isLeadSurrogate(c2)) { - int supplementary = UCharacterProperty.getRawSupplementary(c2, c); - return set.contains(supplementary) ? 2 : -2; - } - } - return set.contains(c) ? 
1 : -1; - } - - /** - * Helper class for UnicodeSetStringSpan. - * - *

<p>List of offsets from the current position from where to try matching - * a code point or a string. - * Stores offsets rather than indexes to simplify the code and use the same list - * for both increments (in span()) and decrements (in spanBack()). - * - *

<p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time - * are relatively dense, that is, - * there are normally no gaps of hundreds or thousands of offset values. - * - *

<p>This class optionally also tracks the minimum non-negative count for each position, - * intended to count the smallest number of elements of any path leading to that position. - * - *

<p>The implementation uses a circular buffer of count integers, - * each indicating whether the corresponding offset is in the list, - * and its path element count. - * This avoids inserting into a sorted list of offsets (or absolute indexes) - * and physically moving part of the list. - * - *

<p>Note: In principle, the caller should setMaxLength() to - * the maximum of the max string length and U16_LENGTH/U8_LENGTH - * to account for "long" single code points. - * - *

<p>Note: An earlier version did not track counts and stored only byte flags. - * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, - * the list could be stored as bit flags in a single integer. - * Rather than handling a circular buffer with a start list index, - * the integer would simply be shifted when lower offsets are removed. - * UnicodeSet does not have a limit on the lengths of strings. - */ - private static final class OffsetList { - private int[] list; - private int length; - private int start; - - public OffsetList() { - list = new int[16]; // default size - } - - public void setMaxLength(int maxLength) { - if (maxLength > list.length) { - list = new int[maxLength]; - } - clear(); - } - - public void clear() { - for (int i = list.length; i-- > 0;) { - list[i] = 0; - } - start = length = 0; - } - - public boolean isEmpty() { - return (length == 0); - } - - /** - * Reduces all stored offsets by delta, used when the current position moves by delta. - * There must not be any offsets lower than delta. - * If there is an offset equal to delta, it is removed. - * - * @param delta [1..maxLength] - */ - public void shift(int delta) { - int i = start + delta; - if (i >= list.length) { - i -= list.length; - } - if (list[i] != 0) { - list[i] = 0; - --length; - } - start = i; - } - - /** - * Adds an offset. The list must not contain it yet. - * @param offset [1..maxLength] - */ - public void addOffset(int offset) { - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - assert list[i] == 0; - list[i] = 1; - ++length; - } - - /** - * Adds an offset and updates its count. - * The list may already contain the offset.
- * @param offset [1..maxLength] - */ - public void addOffsetAndCount(int offset, int count) { - assert count > 0; - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - if (list[i] == 0) { - list[i] = count; - ++length; - } else if (count < list[i]) { - list[i] = count; - } - } - - /** - * @param offset [1..maxLength] - */ - public boolean containsOffset(int offset) { - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - return list[i] != 0; - } - - /** - * @param offset [1..maxLength] - */ - public boolean hasCountAtOffset(int offset, int count) { - int i = start + offset; - if (i >= list.length) { - i -= list.length; - } - int oldCount = list[i]; - return oldCount != 0 && oldCount <= count; - } - - /** - * Finds the lowest stored offset from a non-empty list, removes it, - * and reduces all other offsets by this minimum. - * @return min=[1..maxLength] - */ - public int popMinimum(OutputInt outCount) { - // Look for the next offset in list[start+1..list.length-1]. - int i = start, result; - while (++i < list.length) { - int count = list[i]; - if (count != 0) { - list[i] = 0; - --length; - result = i - start; - start = i; - if (outCount != null) { outCount.value = count; } - return result; - } - } - // i==list.length - - // Wrap around and look for the next offset in list[0..start]. - // Since the list is not empty, there will be one. - result = list.length - start; - i = 0; - int count; - while ((count = list[i]) == 0) { - ++i; - } - list[i] = 0; - --length; - start = i; - if (outCount != null) { outCount.value = count; } - return result + i; - } - } -} --- /dev/null 2020-01-10 15:57:43.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/UnicodeSetStringSpan.java 2020-01-10 15:57:42.000000000 -0800 @@ -0,0 +1,1168 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. + * + ****************************************************************************** + */ + +package jdk.internal.icu.impl; + +import java.util.ArrayList; + +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.text.UnicodeSet; +import jdk.internal.icu.text.UnicodeSet.SpanCondition; +import jdk.internal.icu.util.OutputInt; + +/* + * Implement span() etc. for a set with strings. + * Avoid recursion because of its exponential complexity. + * Instead, try multiple paths at once and track them with an IndexList. + */ +public class UnicodeSetStringSpan { + + /* + * Which span() variant will be used? 
The object is either built for one variant and used once, + * or built for all and may be used many times. + */ + public static final int WITH_COUNT = 0x40; // spanAndCount() may be called + public static final int FWD = 0x20; + public static final int BACK = 0x10; + // public static final int UTF16 = 8; + public static final int CONTAINED = 2; + public static final int NOT_CONTAINED = 1; + + public static final int ALL = 0x7f; + + public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; + public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; + public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; + public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; + + /** + * Special spanLength short values. (since Java has no unsigned byte type) + * All code points in the string are contained in the parent set. + */ + static final short ALL_CP_CONTAINED = 0xff; + + /** The spanLength is >=0xfe. */ + static final short LONG_SPAN = ALL_CP_CONTAINED - 1; + + /** Set for span(). Same as parent but without strings. */ + private UnicodeSet spanSet; + + /** + * Set for span(not contained). + * Same as spanSet, plus characters that start or end strings. + */ + private UnicodeSet spanNotSet; + + /** The strings of the parent set. */ + private ArrayList<String> strings; + + /** The lengths of span(), spanBack() etc. for each string. */ + private short[] spanLengths; + + /** Maximum lengths of relevant strings. */ + private int maxLength16; + + /** Are there strings that are not fully contained in the code point set? */ + private boolean someRelevant; + + /** Set up for all variants of span()? */ + private boolean all; + + /** Span helper */ + private OffsetList offsets; + + /** + * Constructs for all variants of span(), or only for any one variant. + * Initializes as little as possible, for single use.
+ */ + public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { + spanSet = new UnicodeSet(0, 0x10ffff); + // TODO: With Java 6, just take the parent set's strings as is, + // as a NavigableSet, rather than as an ArrayList copy of the set of strings. + // Then iterate via the first() and higher() methods. + // (We do not want to create multiple Iterator objects in each span().) + // See ICU ticket #7454. + strings = setStrings; + all = (which == ALL); + spanSet.retainAll(set); + if (0 != (which & NOT_CONTAINED)) { + // Default to the same sets. + // addToSpanNotSet() will create a separate set if necessary. + spanNotSet = spanSet; + } + offsets = new OffsetList(); + + // Determine if the strings even need to be taken into account at all for span() etc. + // If any string is relevant, then all strings need to be used for + // span(longest match) but only the relevant ones for span(while contained). + // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH + // and do not store UTF-8 strings if !thisRelevant and CONTAINED. + // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) + // Also count the lengths of the UTF-8 versions of the strings for memory allocation. + int stringsLength = strings.size(); + + int i, spanLength; + someRelevant = false; + for (i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + spanLength = spanSet.span(string, SpanCondition.CONTAINED); + if (spanLength < length16) { // Relevant string. + someRelevant = true; + } + if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { + maxLength16 = length16; + } + } + if (!someRelevant && (which & WITH_COUNT) == 0) { + return; + } + + // Freeze after checking for the need to use strings at all because freezing + // a set takes some time and memory which are wasted if there are no relevant strings.
+ if (all) { + spanSet.freeze(); + } + + int spanBackLengthsOffset; + + // Allocate a block of meta data. + int allocSize; + if (all) { + // 2 sets of span lengths + allocSize = stringsLength * (2); + } else { + allocSize = stringsLength; // One set of span lengths. + } + spanLengths = new short[allocSize]; + + if (all) { + // Store span lengths for all span() variants. + spanBackLengthsOffset = stringsLength; + } else { + // Store span lengths for only one span() variant. + spanBackLengthsOffset = 0; + } + + // Set the meta data and spanNotSet and write the UTF-8 strings. + + for (i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + spanLength = spanSet.span(string, SpanCondition.CONTAINED); + if (spanLength < length16) { // Relevant string. + if (true /* 0 != (which & UTF16) */) { + if (0 != (which & CONTAINED)) { + if (0 != (which & FWD)) { + spanLengths[i] = makeSpanLengthByte(spanLength); + } + if (0 != (which & BACK)) { + spanLength = length16 + - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); + spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); + } + } else /* not CONTAINED, not all, but NOT_CONTAINED */{ + spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant + // flag. + } + } + if (0 != (which & NOT_CONTAINED)) { + // Add string start and end code points to the spanNotSet so that + // a span(while not contained) stops before any string. + int c; + if (0 != (which & FWD)) { + c = string.codePointAt(0); + addToSpanNotSet(c); + } + if (0 != (which & BACK)) { + c = string.codePointBefore(length16); + addToSpanNotSet(c); + } + } + } else { // Irrelevant string. + if (all) { + spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; + } else { + // All spanXYZLengths pointers contain the same address. + spanLengths[i] = ALL_CP_CONTAINED; + } + } + } + + // Finish. 
+ if (all) { + spanNotSet.freeze(); + } + } + + /** + * Do the strings need to be checked in span() etc.? + * + * @return true if strings need to be checked (call span() here), + * false if not (use a BMPSet for best performance). + */ + public boolean needsStringSpanUTF16() { + return someRelevant; + } + + /** For fast UnicodeSet::contains(c). */ + public boolean contains(int c) { + return spanSet.contains(c); + } + + /** + * Adds a starting or ending string character to the spanNotSet + * so that a character span ends before any string. + */ + private void addToSpanNotSet(int c) { + if (spanNotSet == null || spanNotSet == spanSet) { + if (spanSet.contains(c)) { + return; // Nothing to do. + } + spanNotSet = spanSet.cloneAsThawed(); + } + spanNotSet.add(c); + } + + /* + * Note: In span() when spanLength==0 + * (after a string match, or at the beginning after an empty code point span) + * and in spanNot() and spanNotUTF8(), + * string matching could use a binary search because all string matches are done + * from the same start index. + * + * For UTF-8, this would require a comparison function that returns UTF-16 order. + * + * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets + * with strings have very few very short strings. For cases with many strings, it might be better to use a different + * API and implementation with a DFA (state machine). + */ + + /* + * Algorithm for span(SpanCondition.CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Either recursively span for each code point or string match, or recursively span + * for all but the shortest one and iteratively continue the span with the shortest local match. 
+ * + Remember the longest recursive span (the farthest end point). + * + If there is no match at the current position, + * neither for the code point there nor for any set string, + * then stop and return the longest recursive span length. + * + * Optimized implementation: + * + * (We assume that most sets will have very few very short strings. + * A span using a string-less set is extremely fast.) + * + * Create and cache a spanSet which contains all of the single code points of the original set + * but none of its strings. + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the recursive algorithm would have tried to match them at every position. + * ~ Set strings that entirely consist of set-contained code points + * are irrelevant for span(SpanCondition.CONTAINED) + * because the recursive algorithm would continue after them anyway and + * find the longest recursive match from their end. + * ~ Rather than recursing, note each end point of a set string match. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched after spanSet.span(), + * then pop the shortest string match end point and continue the loop, + * trying to match all set strings from there. + * + If at least one more set string matched after a previous string match, then test if the + * code point after the previous string match is also contained in the set. + * Continue the loop with the shortest end point of + * either this code point or a matching set string. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. 
+ * + * By noting each end point of a set string match, the function visits each string position at most once and + * finishes in linear time. + * + * The recursive algorithm may visit the same string position many times + * if multiple paths lead to it and finishes in exponential time. + */ + + /* + * Algorithm for span(SIMPLE) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Continue from the farthest match position and ignore all others. + * + If there is no match at the current position, then stop and return the current position. + * + * Optimized implementation: + * + * (Same assumption and spanSet as above.) + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the standard algorithm would have tried to match them earlier. + * ~ Set strings that entirely consist of set-contained code points + * must be matched with a full overlap because the longest-match algorithm + * would hide set string matches that end earlier. + * Such set strings need not be matched earlier inside the code point span + * because the standard algorithm would then have + * continued after the set string match anyway. + * ~ Remember the longest set string match (farthest end point) + * from the earliest starting point. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched, + * then continue the loop after the longest match from the earliest position. 
+ * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + */ + /** + * Spans a string. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @return the limit (exclusive end) of the span + */ + public int span(CharSequence s, int start, SpanCondition spanCondition) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNot(s, start, null); + } + int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); + if (spanLimit == s.length()) { + return spanLimit; + } + return spanWithStrings(s, start, spanLimit, spanCondition); + } + + /** + * Synchronized method for complicated spans using the offsets. + * Avoids synchronization for simple cases. + * + * @param spanLimit = spanSet.span(s, start, CONTAINED) + */ + private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, + SpanCondition spanCondition) { + // Consider strings; they may overlap with the span. + int initSize = 0; + if (spanCondition == SpanCondition.CONTAINED) { + // Use offset list to try all possibilities. + initSize = maxLength16; + } + offsets.setMaxLength(initSize); + int length = s.length(); + int pos = spanLimit, rest = length - spanLimit; + int spanLength = spanLimit - start; + int i, stringsLength = strings.size(); + for (;;) { + if (spanCondition == SpanCondition.CONTAINED) { + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[i]; + if (overlap == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-overlap..pos. + if (overlap >= LONG_SPAN) { + overlap = length16; + // While contained: No point matching fully inside the code point span. 
+ overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code + // point. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int inc = length16 - overlap; // Keep overlap+inc==length16. + for (;;) { + if (inc > rest) { + break; + } + // Try to match if the increment is not listed already. + if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { + if (inc == rest) { + return length; // Reached the end of the string. + } + offsets.addOffset(inc); + } + if (overlap == 0) { + break; + } + --overlap; + ++inc; + } + } + } else /* SIMPLE */{ + int maxInc = 0, maxOverlap = 0; + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the earliest start. + + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-overlap..pos. + if (overlap >= LONG_SPAN) { + overlap = length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the earliest start. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int inc = length16 - overlap; // Keep overlap+inc==length16. + for (;;) { + if (inc > rest || overlap < maxOverlap) { + break; + } + // Try to match if the string is longer or starts earlier. + if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) + && matches16CPB(s, pos - overlap, length, string, length16)) { + maxInc = inc; // Longest match from earliest start. + maxOverlap = overlap; + break; + } + --overlap; + ++inc; + } + } + + if (maxInc != 0 || maxOverlap != 0) { + // Longest-match algorithm, and there was a string match. + // Simply continue after it. + pos += maxInc; + rest -= maxInc; + if (rest == 0) { + return length; // Reached the end of the string. + } + spanLength = 0; // Match strings from after a string match. 
+ continue; + } + } + // Finished trying to match all strings at pos. + + if (spanLength != 0 || pos == 0) { + // The position is after an unlimited code point span (spanLength!=0), + // not after a string match. + // The only position where spanLength==0 after a span is pos==0. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if (offsets.isEmpty()) { + return pos; // No strings matched after a span. + } + // Match strings from after the next string match. + } else { + // The position is after a string match (or a single code point). + if (offsets.isEmpty()) { + // No more strings matched after a previous string match. + // Try another code point span from after the last string match. + spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); + spanLength = spanLimit - pos; + if (spanLength == rest || // Reached the end of the string, or + spanLength == 0 // neither strings nor span progressed. + ) { + return spanLimit; + } + pos += spanLength; + rest -= spanLength; + continue; // spanLength>0: Match strings from after a span. + } else { + // Try to match only one code point from after a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength = spanOne(spanSet, s, pos, rest); + if (spanLength > 0) { + if (spanLength == rest) { + return length; // Reached the end of the string. + } + // Match strings after this code point. + // There cannot be any increments below it because UnicodeSet strings + // contain multiple code points. + pos += spanLength; + rest -= spanLength; + offsets.shift(spanLength); + spanLength = 0; + continue; // Match strings from after a single code point. + } + // Match strings from after the next string match. + } + } + int minOffset = offsets.popMinimum(null); + pos += minOffset; + rest -= minOffset; + spanLength = 0; // Match strings from after a string match. 
+ } + } + + /** + * Spans a string and counts the smallest number of set elements on any path across the span. + * + *

<p>For proper counting, we cannot ignore strings that are fully contained in code point spans. + * + *

<p>If the set does not have any fully-contained strings, then we could optimize this + * like span(), but such sets are likely rare, and this is at least still linear. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @param outCount The count + * @return the limit (exclusive end) of the span + */ + public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, + OutputInt outCount) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNot(s, start, outCount); + } + // Consider strings; they may overlap with the span, + // and they may result in a smaller count than with just code points. + if (spanCondition == SpanCondition.CONTAINED) { + return spanContainedAndCount(s, start, outCount); + } + // SIMPLE (not synchronized, does not use offsets) + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + int maxInc = (cpLength > 0) ? cpLength : 0; + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + if (maxInc < length16 && length16 <= rest && + matches16CPB(s, pos, length, string, length16)) { + maxInc = length16; + } + } + // We are done if there is no match beyond pos. + if (maxInc == 0) { + outCount.value = count; + return pos; + } + // Continue from the longest match. + ++count; + pos += maxInc; + rest -= maxInc; + } + outCount.value = count; + return pos; + } + + private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { + // Use offset list to try all possibilities.
+ offsets.setMaxLength(maxLength16); + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + offsets.addOffsetAndCount(cpLength, count + 1); + } + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + // Note: If the strings were sorted by length, then we could also + // avoid trying to match if there is already a match of the same length. + if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && + matches16CPB(s, pos, length, string, length16)) { + offsets.addOffsetAndCount(length16, count + 1); + } + } + // We are done if there is no match beyond pos. + if (offsets.isEmpty()) { + outCount.value = count; + return pos; + } + // Continue from the nearest match. + int minOffset = offsets.popMinimum(outCount); + count = outCount.value; + pos += minOffset; + rest -= minOffset; + } + outCount.value = count; + return pos; + } + + /** + * Span a string backwards. + * + * @param s The string to be spanned + * @param spanCondition The span condition + * @return The string index which starts the span (i.e. inclusive). + */ + public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNotBack(s, length); + } + int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); + if (pos == 0) { + return 0; + } + int spanLength = length - pos; + + // Consider strings; they may overlap with the span. + int initSize = 0; + if (spanCondition == SpanCondition.CONTAINED) { + // Use offset list to try all possibilities. 
+ initSize = maxLength16; + } + offsets.setMaxLength(initSize); + int i, stringsLength = strings.size(); + int spanBackLengthsOffset = 0; + if (all) { + spanBackLengthsOffset = stringsLength; + } + for (;;) { + if (spanCondition == SpanCondition.CONTAINED) { + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[spanBackLengthsOffset + i]; + if (overlap == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if (overlap >= LONG_SPAN) { + overlap = length16; + // While contained: No point matching fully inside the code point span. + int len1 = 0; + len1 = string.offsetByCodePoints(0, 1); + overlap -= len1; // Length of the string minus the first code point. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int dec = length16 - overlap; // Keep dec+overlap==length16. + for (;;) { + if (dec > pos) { + break; + } + // Try to match if the decrement is not listed already. + if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { + if (dec == pos) { + return 0; // Reached the start of the string. + } + offsets.addOffset(dec); + } + if (overlap == 0) { + break; + } + --overlap; + ++dec; + } + } + } else /* SIMPLE */{ + int maxDec = 0, maxOverlap = 0; + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[spanBackLengthsOffset + i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the latest end. + + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if (overlap >= LONG_SPAN) { + overlap = length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the latest end. 
+ } + if (overlap > spanLength) { + overlap = spanLength; + } + int dec = length16 - overlap; // Keep dec+overlap==length16. + for (;;) { + if (dec > pos || overlap < maxOverlap) { + break; + } + // Try to match if the string is longer or ends later. + if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) + && matches16CPB(s, pos - dec, length, string, length16)) { + maxDec = dec; // Longest match from latest end. + maxOverlap = overlap; + break; + } + --overlap; + ++dec; + } + } + + if (maxDec != 0 || maxOverlap != 0) { + // Longest-match algorithm, and there was a string match. + // Simply continue before it. + pos -= maxDec; + if (pos == 0) { + return 0; // Reached the start of the string. + } + spanLength = 0; // Match strings from before a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if (spanLength != 0 || pos == length) { + // The position is before an unlimited code point span (spanLength!=0), + // not before a string match. + // The only position where spanLength==0 before a span is pos==length. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if (offsets.isEmpty()) { + return pos; // No strings matched before a span. + } + // Match strings from before the next string match. + } else { + // The position is before a string match (or a single code point). + if (offsets.isEmpty()) { + // No more strings matched before a previous string match. + // Try another code point span from before the last string match. + int oldPos = pos; + pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); + spanLength = oldPos - pos; + if (pos == 0 || // Reached the start of the string, or + spanLength == 0 // neither strings nor span progressed. + ) { + return pos; + } + continue; // spanLength>0: Match strings from before a span. 
+ } else { + // Try to match only one code point from before a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength = spanOneBack(spanSet, s, pos); + if (spanLength > 0) { + if (spanLength == pos) { + return 0; // Reached the start of the string. + } + // Match strings before this code point. + // There cannot be any decrements below it because UnicodeSet strings + // contain multiple code points. + pos -= spanLength; + offsets.shift(spanLength); + spanLength = 0; + continue; // Match strings from before a single code point. + } + // Match strings from before the next string match. + } + } + pos -= offsets.popMinimum(null); + spanLength = 0; // Match strings from before a string match. + } + } + + /** + * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then return with the current position. + * + If a set string matches at the current position, then return with the current position. + * + * Optimized implementation: + * + * (Same assumption as for span() above.) + * + * Create and cache a spanNotSet which contains + * all of the single code points of the original set but none of its strings. + * For each set string add its initial code point to the spanNotSet. + * (Also add its final code point for spanNotBack().) + * + * - Loop: + * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). + * + If the current code point is in the original set, then return the current position. + * + If any set string matches at the current position, then return the current position. + * + If there is no match at the current position, neither for the code point + * there nor for any set string, then skip this code point and continue the loop. 
+ * This happens for set-string-initial code points that were added to spanNotSet + * when there is not actually a match for such a set string. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param outCount If not null: Receives the number of code points across the span. + * @return the limit (exclusive end) of the span + */ + private int spanNot(CharSequence s, int start, OutputInt outCount) { + int length = s.length(); + int pos = start, rest = length - start; + int stringsLength = strings.size(); + int count = 0; + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + int spanLimit; + if (outCount == null) { + spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); + } else { + spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); + outCount.value = count = count + outCount.value; + } + if (spanLimit == length) { + return length; // Reached the end of the string. + } + pos = spanLimit; + rest = length - spanLimit; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (int i = 0; i < stringsLength; ++i) { + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. 
+ // cpLength<0 + pos -= cpLength; + rest += cpLength; + ++count; + } while (rest != 0); + if (outCount != null) { + outCount.value = count; + } + return length; // Reached the end of the string. + } + + private int spanNotBack(CharSequence s, int length) { + int pos = length; + int i, stringsLength = strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); + if (pos == 0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOneBack(spanSet, s, pos); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (i = 0; i < stringsLength; ++i) { + // Use spanLengths rather than a spanLengths pointer because + // it is easier and we only need to know whether the string is irrelevant + // which is the same in either array. + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos += cpLength; + } while (pos != 0); + return 0; // Reached the start of the string. + } + + static short makeSpanLengthByte(int spanLength) { + // 0xfe==UnicodeSetStringSpan::LONG_SPAN + return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; + } + + // Compare strings without any argument checks. Requires length>0. 
+ private static boolean matches16(CharSequence s, int start, final String t, int length) { + int end = start + length; + while (length-- > 0) { + if (s.charAt(--end) != t.charAt(length)) { + return false; + } + } + return true; + } + + /** + * Compare 16-bit Unicode strings (which may be malformed UTF-16) + * at code point boundaries. + * That is, each edge of a match must not be in the middle of a surrogate pair. + * @param s The string to match in. + * @param start The start index of s. + * @param limit The limit of the subsequence of s being spanned. + * @param t The substring to be matched in s. + * @param tlength The length of t. + */ + static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { + return matches16(s, start, t, tlength) + && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && + Character.isLowSurrogate(s.charAt(start))) + && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && + Character.isLowSurrogate(s.charAt(start + tlength))); + } + + /** + * Does the set contain the next code point? + * If so, return its length; otherwise return its negative length. + */ + static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { + char c = s.charAt(start); + if (c >= 0xd800 && c <= 0xdbff && length >= 2) { + char c2 = s.charAt(start + 1); + if (UTF16.isTrailSurrogate(c2)) { + int supplementary = UCharacterProperty.getRawSupplementary(c, c2); + return set.contains(supplementary) ? 2 : -2; + } + } + return set.contains(c) ? 1 : -1; + } + + static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { + char c = s.charAt(length - 1); + if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { + char c2 = s.charAt(length - 2); + if (UTF16.isLeadSurrogate(c2)) { + int supplementary = UCharacterProperty.getRawSupplementary(c2, c); + return set.contains(supplementary) ? 2 : -2; + } + } + return set.contains(c) ? 
1 : -1; + } + + /** + * Helper class for UnicodeSetStringSpan. + * + *

List of offsets from the current position from where to try matching + * a code point or a string. + * Stores offsets rather than indexes to simplify the code and use the same list + * for both increments (in span()) and decrements (in spanBack()). + * + *

Assumption: The maximum offset is limited, and the offsets that are stored at any one time + * are relatively dense, that is, + * there are normally no gaps of hundreds or thousands of offset values. + * + *

This class optionally also tracks the minimum non-negative count for each position, + * intended to count the smallest number of elements of any path leading to that position. + * + *

The implementation uses a circular buffer of count integers, + * each indicating whether the corresponding offset is in the list, + * and its path element count. + * This avoids inserting into a sorted list of offsets (or absolute indexes) + * and physically moving part of the list. + * + *

Note: In principle, the caller should setMaxLength() to + * the maximum of the max string length and U16_LENGTH/U8_LENGTH + * to account for "long" single code points. + * + *

Note: An earlier version did not track counts and stored only byte flags. + * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, + * the list could be stored as bit flags in a single integer. + * Rather than handling a circular buffer with a start list index, + * the integer would simply be shifted when lower offsets are removed. + * UnicodeSet does not have a limit on the lengths of strings. + */ + private static final class OffsetList { + private int[] list; + private int length; + private int start; + + public OffsetList() { + list = new int[16]; // default size + } + + public void setMaxLength(int maxLength) { + if (maxLength > list.length) { + list = new int[maxLength]; + } + clear(); + } + + public void clear() { + for (int i = list.length; i-- > 0;) { + list[i] = 0; + } + start = length = 0; + } + + public boolean isEmpty() { + return (length == 0); + } + + /** + * Reduces all stored offsets by delta, used when the current position moves by delta. + * There must not be any offsets lower than delta. + * If there is an offset equal to delta, it is removed. + * + * @param delta [1..maxLength] + */ + public void shift(int delta) { + int i = start + delta; + if (i >= list.length) { + i -= list.length; + } + if (list[i] != 0) { + list[i] = 0; + --length; + } + start = i; + } + + /** + * Adds an offset. The list must not contain it yet. + * @param offset [1..maxLength] + */ + public void addOffset(int offset) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + assert list[i] == 0; + list[i] = 1; + ++length; + } + + /** + * Adds an offset and updates its count. + * The list may already contain the offset. 
+ * @param offset [1..maxLength] + */ + public void addOffsetAndCount(int offset, int count) { + assert count > 0; + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + if (list[i] == 0) { + list[i] = count; + ++length; + } else if (count < list[i]) { + list[i] = count; + } + } + + /** + * @param offset [1..maxLength] + */ + public boolean containsOffset(int offset) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + return list[i] != 0; + } + + /** + * @param offset [1..maxLength] + */ + public boolean hasCountAtOffset(int offset, int count) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + int oldCount = list[i]; + return oldCount != 0 && oldCount <= count; + } + + /** + * Finds the lowest stored offset from a non-empty list, removes it, + * and reduces all other offsets by this minimum. + * @return min=[1..maxLength] + */ + public int popMinimum(OutputInt outCount) { + // Look for the next offset in list[start+1..list.length-1]. + int i = start, result; + while (++i < list.length) { + int count = list[i]; + if (count != 0) { + list[i] = 0; + --length; + result = i - start; + start = i; + if (outCount != null) { outCount.value = count; } + return result; + } + } + // i==list.length + + // Wrap around and look for the next offset in list[0..start]. + // Since the list is not empty, there will be one. + result = list.length - start; + i = 0; + int count; + while ((count = list[i]) == 0) { + ++i; + } + list[i] = 0; + --length; + start = i; + if (outCount != null) { outCount.value = count; } + return result + i; + } + } +} --- old/src/java.base/share/classes/sun/text/normalizer/Utility.java 2020-01-10 15:57:44.000000000 -0800 +++ /dev/null 2020-01-10 15:57:44.000000000 -0800 @@ -1,273 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * Copyright (C) 1996-2011, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.io.IOException; -import java.util.Locale; - -final class Utility { - - /** - * Convert characters outside the range U+0020 to U+007F to - * Unicode escapes, and convert backslash to a double backslash. - */ - public static final String escape(String s) { - StringBuilder buf = new StringBuilder(); - for (int i=0; i= ' ' && c <= 0x007F) { - if (c == '\\') { - buf.append("\\\\"); // That is, "\\" - } else { - buf.append((char)c); - } - } else { - boolean four = c <= 0xFFFF; - buf.append(four ? "\\u" : "\\U"); - buf.append(hex(c, four ? 
4 : 8)); - } - } - return buf.toString(); - } - - /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ - private static final char[] UNESCAPE_MAP = { - /*" 0x22, 0x22 */ - /*' 0x27, 0x27 */ - /*? 0x3F, 0x3F */ - /*\ 0x5C, 0x5C */ - /*a*/ 0x61, 0x07, - /*b*/ 0x62, 0x08, - /*e*/ 0x65, 0x1b, - /*f*/ 0x66, 0x0c, - /*n*/ 0x6E, 0x0a, - /*r*/ 0x72, 0x0d, - /*t*/ 0x74, 0x09, - /*v*/ 0x76, 0x0b - }; - - /** - * Convert an escape to a 32-bit code point value. We attempt - * to parallel the icu4c unescapeAt() function. - * @param offset16 an array containing offset to the character - * after the backslash. Upon return offset16[0] will - * be updated to point after the escape sequence. - * @return character value from 0 to 10FFFF, or -1 on error. - */ - public static int unescapeAt(String s, int[] offset16) { - int c; - int result = 0; - int n = 0; - int minDig = 0; - int maxDig = 0; - int bitsPerDigit = 4; - int dig; - int i; - boolean braces = false; - - /* Check that offset is in range */ - int offset = offset16[0]; - int length = s.length(); - if (offset < 0 || offset >= length) { - return -1; - } - - /* Fetch first UChar after '\\' */ - c = Character.codePointAt(s, offset); - offset += UTF16.getCharCount(c); - - /* Convert hexadecimal and octal escapes */ - switch (c) { - case 'u': - minDig = maxDig = 4; - break; - case 'U': - minDig = maxDig = 8; - break; - case 'x': - minDig = 1; - if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { - ++offset; - braces = true; - maxDig = 8; - } else { - maxDig = 2; - } - break; - default: - dig = UCharacter.digit(c, 8); - if (dig >= 0) { - minDig = 1; - maxDig = 3; - n = 1; /* Already have first octal digit */ - bitsPerDigit = 3; - result = dig; - } - break; - } - if (minDig != 0) { - while (offset < length && n < maxDig) { - c = UTF16.charAt(s, offset); - dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 
8 : 16); - if (dig < 0) { - break; - } - result = (result << bitsPerDigit) | dig; - offset += UTF16.getCharCount(c); - ++n; - } - if (n < minDig) { - return -1; - } - if (braces) { - if (c != 0x7D /*}*/) { - return -1; - } - ++offset; - } - if (result < 0 || result >= 0x110000) { - return -1; - } - // If an escape sequence specifies a lead surrogate, see - // if there is a trail surrogate after it, either as an - // escape or as a literal. If so, join them up into a - // supplementary. - if (offset < length && - UTF16.isLeadSurrogate((char) result)) { - int ahead = offset+1; - c = s.charAt(offset); // [sic] get 16-bit code unit - if (c == '\\' && ahead < length) { - int o[] = new int[] { ahead }; - c = unescapeAt(s, o); - ahead = o[0]; - } - if (UTF16.isTrailSurrogate((char) c)) { - offset = ahead; - result = UCharacterProperty.getRawSupplementary( - (char) result, (char) c); - } - } - offset16[0] = offset; - return result; - } - - /* Convert C-style escapes in table */ - for (i=0; i= 0x20 && c <= 0x7E); - } - - /** - * Escape unprintable characters using uxxxx notation - * for U+0000 to U+FFFF and Uxxxxxxxx for U+10000 and - * above. If the character is printable ASCII, then do nothing - * and return FALSE. Otherwise, append the escaped notation and - * return TRUE. 
- */ - public static boolean escapeUnprintable(T result, int c) { - try { - if (isUnprintable(c)) { - result.append('\\'); - if ((c & ~0xFFFF) != 0) { - result.append('U'); - result.append(DIGITS[0xF&(c>>28)]); - result.append(DIGITS[0xF&(c>>24)]); - result.append(DIGITS[0xF&(c>>20)]); - result.append(DIGITS[0xF&(c>>16)]); - } else { - result.append('u'); - } - result.append(DIGITS[0xF&(c>>12)]); - result.append(DIGITS[0xF&(c>>8)]); - result.append(DIGITS[0xF&(c>>4)]); - result.append(DIGITS[0xF&c]); - return true; - } - return false; - } catch (IOException e) { - throw new IllegalArgumentException(e); - } - } -} --- /dev/null 2020-01-10 15:57:44.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/impl/Utility.java 2020-01-10 15:57:43.000000000 -0800 @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* + ******************************************************************************* + * Copyright (C) 1996-2011, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.text.UTF16; + +import java.io.IOException; +import java.util.Locale; + +public final class Utility { + + /** + * Convert characters outside the range U+0020 to U+007F to + * Unicode escapes, and convert backslash to a double backslash. + */ + public static final String escape(String s) { + StringBuilder buf = new StringBuilder(); + for (int i=0; i= ' ' && c <= 0x007F) { + if (c == '\\') { + buf.append("\\\\"); // That is, "\\" + } else { + buf.append((char)c); + } + } else { + boolean four = c <= 0xFFFF; + buf.append(four ? "\\u" : "\\U"); + buf.append(hex(c, four ? 4 : 8)); + } + } + return buf.toString(); + } + + /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ + private static final char[] UNESCAPE_MAP = { + /*" 0x22, 0x22 */ + /*' 0x27, 0x27 */ + /*? 0x3F, 0x3F */ + /*\ 0x5C, 0x5C */ + /*a*/ 0x61, 0x07, + /*b*/ 0x62, 0x08, + /*e*/ 0x65, 0x1b, + /*f*/ 0x66, 0x0c, + /*n*/ 0x6E, 0x0a, + /*r*/ 0x72, 0x0d, + /*t*/ 0x74, 0x09, + /*v*/ 0x76, 0x0b + }; + + /** + * Convert an escape to a 32-bit code point value. We attempt + * to parallel the icu4c unescapeAt() function. + * @param offset16 an array containing offset to the character + * after the backslash. Upon return offset16[0] will + * be updated to point after the escape sequence. + * @return character value from 0 to 10FFFF, or -1 on error. 
+ */ + public static int unescapeAt(String s, int[] offset16) { + int c; + int result = 0; + int n = 0; + int minDig = 0; + int maxDig = 0; + int bitsPerDigit = 4; + int dig; + int i; + boolean braces = false; + + /* Check that offset is in range */ + int offset = offset16[0]; + int length = s.length(); + if (offset < 0 || offset >= length) { + return -1; + } + + /* Fetch first UChar after '\\' */ + c = Character.codePointAt(s, offset); + offset += UTF16.getCharCount(c); + + /* Convert hexadecimal and octal escapes */ + switch (c) { + case 'u': + minDig = maxDig = 4; + break; + case 'U': + minDig = maxDig = 8; + break; + case 'x': + minDig = 1; + if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { + ++offset; + braces = true; + maxDig = 8; + } else { + maxDig = 2; + } + break; + default: + dig = UCharacter.digit(c, 8); + if (dig >= 0) { + minDig = 1; + maxDig = 3; + n = 1; /* Already have first octal digit */ + bitsPerDigit = 3; + result = dig; + } + break; + } + if (minDig != 0) { + while (offset < length && n < maxDig) { + c = UTF16.charAt(s, offset); + dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); + if (dig < 0) { + break; + } + result = (result << bitsPerDigit) | dig; + offset += UTF16.getCharCount(c); + ++n; + } + if (n < minDig) { + return -1; + } + if (braces) { + if (c != 0x7D /*}*/) { + return -1; + } + ++offset; + } + if (result < 0 || result >= 0x110000) { + return -1; + } + // If an escape sequence specifies a lead surrogate, see + // if there is a trail surrogate after it, either as an + // escape or as a literal. If so, join them up into a + // supplementary. 
+ if (offset < length && + UTF16.isLeadSurrogate((char) result)) { + int ahead = offset+1; + c = s.charAt(offset); // [sic] get 16-bit code unit + if (c == '\\' && ahead < length) { + int o[] = new int[] { ahead }; + c = unescapeAt(s, o); + ahead = o[0]; + } + if (UTF16.isTrailSurrogate((char) c)) { + offset = ahead; + result = UCharacterProperty.getRawSupplementary( + (char) result, (char) c); + } + } + offset16[0] = offset; + return result; + } + + /* Convert C-style escapes in table */ + for (i=0; i= 0x20 && c <= 0x7E); + } + + /** + * Escape unprintable characters using uxxxx notation + * for U+0000 to U+FFFF and Uxxxxxxxx for U+10000 and + * above. If the character is printable ASCII, then do nothing + * and return FALSE. Otherwise, append the escaped notation and + * return TRUE. + */ + public static boolean escapeUnprintable(T result, int c) { + try { + if (isUnprintable(c)) { + result.append('\\'); + if ((c & ~0xFFFF) != 0) { + result.append('U'); + result.append(DIGITS[0xF&(c>>28)]); + result.append(DIGITS[0xF&(c>>24)]); + result.append(DIGITS[0xF&(c>>20)]); + result.append(DIGITS[0xF&(c>>16)]); + } else { + result.append('u'); + } + result.append(DIGITS[0xF&(c>>12)]); + result.append(DIGITS[0xF&(c>>8)]); + result.append(DIGITS[0xF&(c>>4)]); + result.append(DIGITS[0xF&c]); + return true; + } + return false; + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } +} Binary files old/src/java.base/share/classes/sun/text/resources/nfc.nrm and /dev/null differ Binary files /dev/null and new/src/java.base/share/classes/jdk/internal/icu/impl/data/icudt64b/nfc.nrm differ Binary files old/src/java.base/share/classes/sun/text/resources/nfkc.nrm and /dev/null differ Binary files /dev/null and new/src/java.base/share/classes/jdk/internal/icu/impl/data/icudt64b/nfkc.nrm differ Binary files old/src/java.base/share/classes/sun/text/resources/ubidi.icu and /dev/null differ Binary files /dev/null and 
new/src/java.base/share/classes/jdk/internal/icu/impl/data/icudt64b/ubidi.icu differ Binary files old/src/java.base/share/classes/sun/text/resources/uprops.icu and /dev/null differ Binary files /dev/null and new/src/java.base/share/classes/jdk/internal/icu/impl/data/icudt64b/uprops.icu differ --- old/src/java.base/share/classes/sun/text/normalizer/UCharacter.java 2020-01-10 15:57:48.000000000 -0800 +++ /dev/null 2020-01-10 15:57:48.000000000 -0800 @@ -1,539 +0,0 @@ -/* - * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/** -******************************************************************************* -* Copyright (C) 1996-2014, International Business Machines Corporation and -* others. All Rights Reserved. 
-******************************************************************************* -*/ - -package sun.text.normalizer; - -/** - *

The UCharacter class provides extensions to the - * - * java.lang.Character class. These extensions provide support for - * more Unicode properties and together with the UTF16 - * class, provide support for supplementary characters (those with code - * points above U+FFFF). - * Each ICU release supports the latest version of Unicode available at that time. - * - *

Code points are represented in these API using ints. While it would be - * more convenient in Java to have a separate primitive datatype for them, - * ints suffice in the meantime. - * - *

To use this class please add the jar file name icu4j.jar to the - * class path, since it contains data files which supply the information used - * by this file.
- * E.g. In Windows
- * set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar.
- * Otherwise, another method would be to copy the files uprops.dat and - * unames.icu from the icu4j source subdirectory - * $ICU4J_SRC/src/com.ibm.icu.impl.data to your class directory - * $ICU4J_CLASS/com.ibm.icu.impl.data. - * - *

Aside from the additions for UTF-16 support, and the updated Unicode - * properties, the main differences between UCharacter and Character are: - *

    - *
  • UCharacter is not designed to be a char wrapper and does not have - * APIs to which involves management of that single char.
    - * These include: - *
      - *
    • char charValue(), - *
    • int compareTo(java.lang.Character, java.lang.Character), etc. - *
    - *
  • UCharacter does not include Character APIs that are deprecated, nor - * does it include the Java-specific character information, such as - * boolean isJavaIdentifierPart(char ch). - *
  • Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric - * values '10' - '35'. UCharacter also does this in digit and - * getNumericValue, to adhere to the java semantics of these - * methods. New methods unicodeDigit, and - * getUnicodeNumericValue do not treat the above code points - * as having numeric values. This is a semantic change from ICU4J 1.3.1. - *
- *

- * Further detail on differences can be determined using the program - * - * com.ibm.icu.dev.test.lang.UCharacterCompare - *

- *

- * In addition to Java compatibility functions, which calculate derived properties, - * this API provides low-level access to the Unicode Character Database. - *

- *

- * Unicode assigns each code point (not just assigned character) values for - * many properties. - * Most of them are simple boolean flags, or constants from a small enumerated list. - * For some properties, values are strings or other relatively more complex types. - *

- *

- * For more information see - * "About the Unicode Character Database" - * (http://www.unicode.org/ucd/) - * and the ICU - * User Guide chapter on Properties - * (http://www.icu-project.org/userguide/properties.html). - *

- *

- * There are also functions that provide easy migration from C/POSIX functions - * like isblank(). Their use is generally discouraged because the C/POSIX - * standards do not define their semantics beyond the ASCII range, which means - * that different implementations exhibit very different behavior. - * Instead, Unicode properties should be used directly. - *

- *

- * There are also only a few, broad C/POSIX character classes, and they tend - * to be used for conflicting purposes. For example, the "isalpha()" class - * is sometimes used to determine word boundaries, while a more sophisticated - * approach would at least distinguish initial letters from continuation - * characters (the latter including combining marks). - * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) - * Another example: There is no "istitle()" class for titlecase characters. - *

- *

- * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. - * ICU implements them according to the Standard Recommendations in - * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions - * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). - *

- *

- * API access for C/POSIX character classes is as follows: - *

{@code
- * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
- * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
- * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
- * - punct:     ((1<
- * 

- *

- * The C/POSIX character classes are also available in UnicodeSet patterns, - * using patterns like [:graph:] or \p{graph}. - *

- * - * There are several ICU (and Java) whitespace functions. - * Comparison:
    - *
  • isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; - * most of general categories "Z" (separators) + most whitespace ISO controls - * (including no-break spaces, but excluding IS1..IS4 and ZWSP) - *
  • isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces - *
  • isSpaceChar: just Z (including no-break spaces)
- *

- *

- * This class is not subclassable. - *

- * @author Syn Wee Quek - * @stable ICU 2.1 - * @see com.ibm.icu.lang.UCharacterEnums - */ - -public final class UCharacter -{ - - /** - * Joining Group constants. - * @see UProperty#JOINING_GROUP - * @stable ICU 2.4 - */ - public static interface JoiningGroup - { - /** - * @stable ICU 2.4 - */ - public static final int NO_JOINING_GROUP = 0; - } - - /** - * Numeric Type constants. - * @see UProperty#NUMERIC_TYPE - * @stable ICU 2.4 - */ - public static interface NumericType - { - /** - * @stable ICU 2.4 - */ - public static final int NONE = 0; - /** - * @stable ICU 2.4 - */ - public static final int DECIMAL = 1; - /** - * @stable ICU 2.4 - */ - public static final int DIGIT = 2; - /** - * @stable ICU 2.4 - */ - public static final int NUMERIC = 3; - /** - * @stable ICU 2.4 - */ - public static final int COUNT = 4; - } - - /** - * Hangul Syllable Type constants. - * - * @see UProperty#HANGUL_SYLLABLE_TYPE - * @stable ICU 2.6 - */ - public static interface HangulSyllableType - { - /** - * @stable ICU 2.6 - */ - public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/ - /** - * @stable ICU 2.6 - */ - public static final int LEADING_JAMO = 1; /*[L]*/ - /** - * @stable ICU 2.6 - */ - public static final int VOWEL_JAMO = 2; /*[V]*/ - /** - * @stable ICU 2.6 - */ - public static final int TRAILING_JAMO = 3; /*[T]*/ - /** - * @stable ICU 2.6 - */ - public static final int LV_SYLLABLE = 4; /*[LV]*/ - /** - * @stable ICU 2.6 - */ - public static final int LVT_SYLLABLE = 5; /*[LVT]*/ - /** - * @stable ICU 2.6 - */ - public static final int COUNT = 6; - } - - // public data members ----------------------------------------------- - - /** - * The lowest Unicode code point value. - * @stable ICU 2.1 - */ - public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE; - - /** - * The highest Unicode code point value (scalar value) according to the - * Unicode Standard. - * This is a 21-bit value (21 bits, rounded up).
- * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE - * @stable ICU 2.1 - */ - public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE; - - // public methods ---------------------------------------------------- - - /** - * Returns the numeric value of a decimal digit code point. - *
This method observes the semantics of - * java.lang.Character.digit(). Note that this - * will return positive values for code points for which isDigit - * returns false, just like java.lang.Character. - *
Semantic Change: In release 1.3.1 and - * prior, this did not treat the European letters as having a - * digit value, and also treated numeric letters and other numbers as - * digits. - * This has been changed to conform to the java semantics. - *
A code point is a valid digit if and only if: - *
    - *
  • ch is a decimal digit or one of the european letters, and - *
  • the value of ch is less than the specified radix. - *
- * @param ch the code point to query - * @param radix the radix - * @return the numeric value represented by the code point in the - * specified radix, or -1 if the code point is not a decimal digit - * or if its value is too large for the radix - * @stable ICU 2.1 - */ - public static int digit(int ch, int radix) - { - if (2 <= radix && radix <= 36) { - int value = digit(ch); - if (value < 0) { - // ch is not a decimal digit, try latin letters - value = UCharacterProperty.getEuropeanDigit(ch); - } - return (value < radix) ? value : -1; - } else { - return -1; // invalid radix - } - } - - /** - * Returns the numeric value of a decimal digit code point. - *
This is a convenience overload of digit(int, int) - * that provides a decimal radix. - *
Semantic Change: In release 1.3.1 and prior, this - * treated numeric letters and other numbers as digits. This has - * been changed to conform to the java semantics. - * @param ch the code point to query - * @return the numeric value represented by the code point, - * or -1 if the code point is not a decimal digit or if its - * value is too large for a decimal radix - * @stable ICU 2.1 - */ - public static int digit(int ch) - { - return UCharacterProperty.INSTANCE.digit(ch); - } - - /** - * Returns a value indicating a code point's Unicode category. - * Up-to-date Unicode implementation of java.lang.Character.getType() - * except for the above mentioned code points that had their category - * changed.
- * Return results are constants from the interface - * UCharacterCategory
- * NOTE: the UCharacterCategory values are not compatible with - * those returned by java.lang.Character.getType. UCharacterCategory values - * match the ones used in ICU4C, while java.lang.Character type - * values, though similar, skip the value 17.

- * @param ch code point whose type is to be determined - * @return category which is a value of UCharacterCategory - * @stable ICU 2.1 - */ - public static int getType(int ch) - { - return UCharacterProperty.INSTANCE.getType(ch); - } - - /** - * Returns the Bidirection property of a code point. - * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional - * property.
- * Result returned belongs to the interface - * UCharacterDirection - * @param ch the code point to be determined its direction - * @return direction constant from UCharacterDirection. - * @stable ICU 2.1 - */ - public static int getDirection(int ch) - { - return UBiDiProps.INSTANCE.getClass(ch); - } - - /** - * Maps the specified code point to a "mirror-image" code point. - * For code points with the "mirrored" property, implementations sometimes - * need a "poor man's" mapping to another code point such that the default - * glyph may serve as the mirror-image of the default glyph of the - * specified code point.
- * This is useful for text conversion to and from codepages with visual - * order, and for displays without glyph selection capabilities. - * @param ch code point whose mirror is to be retrieved - * @return another code point that may serve as a mirror-image substitute, - * or ch itself if there is no such mapping or ch does not have the - * "mirrored" property - * @stable ICU 2.1 - */ - public static int getMirror(int ch) - { - return UBiDiProps.INSTANCE.getMirror(ch); - } - - /** - * Maps the specified character to its paired bracket character. - * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int). - * Otherwise c itself is returned. - * See http://www.unicode.org/reports/tr9/ - * - * @param c the code point to be mapped - * @return the paired bracket code point, - * or c itself if there is no such mapping - * (Bidi_Paired_Bracket_Type=None) - * - * @see UProperty#BIDI_PAIRED_BRACKET - * @see UProperty#BIDI_PAIRED_BRACKET_TYPE - * @see #getMirror(int) - * @stable ICU 52 - */ - public static int getBidiPairedBracket(int c) { - return UBiDiProps.INSTANCE.getPairedBracket(c); - } - - /** - * Returns the combining class of the argument codepoint - * @param ch code point whose combining is to be retrieved - * @return the combining class of the codepoint - * @stable ICU 2.1 - */ - public static int getCombiningClass(int ch) - { - return Normalizer2.getNFDInstance().getCombiningClass(ch); - } - - /** - * Returns the version of Unicode data used. - * @return the unicode version number used - * @stable ICU 2.1 - */ - public static VersionInfo getUnicodeVersion() - { - return UCharacterProperty.INSTANCE.m_unicodeVersion_; - } - - /** - * Returns a code point corresponding to the two UTF16 characters. - * @param lead the lead char - * @param trail the trail char - * @return code point if surrogate characters are valid. 
- * @exception IllegalArgumentException thrown when argument characters do - * not form a valid codepoint - * @stable ICU 2.1 - */ - public static int getCodePoint(char lead, char trail) - { - if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) { - return UCharacterProperty.getRawSupplementary(lead, trail); - } - throw new IllegalArgumentException("Illegal surrogate characters"); - } - - /** - * Returns the "age" of the code point.

- *

The "age" is the Unicode version when the code point was first - * designated (as a non-character or for Private Use) or assigned a - * character. - *

This can be useful to avoid emitting code points to receiving - * processes that do not accept newer characters.

- *

The data is from the UCD file DerivedAge.txt.

- * @param ch The code point. - * @return the Unicode version number - * @stable ICU 2.6 - */ - public static VersionInfo getAge(int ch) - { - if (ch < MIN_VALUE || ch > MAX_VALUE) { - throw new IllegalArgumentException("Codepoint out of bounds"); - } - return UCharacterProperty.INSTANCE.getAge(ch); - } - - /** - * Returns the property value for an Unicode property type of a code point. - * Also returns binary and mask property values.

- *

Unicode, especially in version 3.2, defines many more properties than - * the original set in UnicodeData.txt.

- *

The properties APIs are intended to reflect Unicode properties as - * defined in the Unicode Character Database (UCD) and Unicode Technical - * Reports (UTR). For details about the properties see - * http://www.unicode.org/.

- *

For names of Unicode properties see the UCD file PropertyAliases.txt. - *

- *
-     * Sample usage:
-     * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
-     * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
-     * boolean b = (ideo == 1) ? true : false;
-     * 
- * @param ch code point to test. - * @param type UProperty selector constant, identifies which binary - * property to check. Must be - * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or - * UProperty.INT_START <= type < UProperty.INT_LIMIT or - * UProperty.MASK_START <= type < UProperty.MASK_LIMIT. - * @return numeric value that is directly the property value or, - * for enumerated properties, corresponds to the numeric value of - * the enumerated constant of the respective property value - * enumeration type (cast to enum type if necessary). - * Returns 0 or 1 (for false / true) for binary Unicode properties. - * Returns a bit-mask for mask properties. - * Returns 0 if 'type' is out of bounds or if the Unicode version - * does not have data for the property at all, or not for this code - * point. - * @see UProperty - * @see #hasBinaryProperty - * @see #getIntPropertyMinValue - * @see #getIntPropertyMaxValue - * @see #getUnicodeVersion - * @stable ICU 2.4 - */ - // for BiDiBase.java - public static int getIntPropertyValue(int ch, int type) { - return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type); - } - - // private constructor ----------------------------------------------- - - /** - * Private constructor to prevent instantiation - */ - private UCharacter() { } - - /* - * Copied from UCharacterEnums.java - */ - - /** - * Character type Mn - * @stable ICU 2.1 - */ - public static final byte NON_SPACING_MARK = 6; - /** - * Character type Me - * @stable ICU 2.1 - */ - public static final byte ENCLOSING_MARK = 7; - /** - * Character type Mc - * @stable ICU 2.1 - */ - public static final byte COMBINING_SPACING_MARK = 8; - /** - * Character type count - * @stable ICU 2.1 - */ - public static final byte CHAR_CATEGORY_COUNT = 30; - - /** - * Directional type R - * @stable ICU 2.1 - */ - public static final int RIGHT_TO_LEFT = 1; - /** - * Directional type AL - * @stable ICU 2.1 - */ - public static final int RIGHT_TO_LEFT_ARABIC = 13; -} --- 
/dev/null 2020-01-10 15:57:48.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/lang/UCharacter.java 2020-01-10 15:57:48.000000000 -0800 @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** +******************************************************************************* +* Copyright (C) 1996-2014, International Business Machines Corporation and +* others. All Rights Reserved. +******************************************************************************* +*/ + +package jdk.internal.icu.lang; + +import jdk.internal.icu.impl.UBiDiProps; +import jdk.internal.icu.impl.UCharacterProperty; +import jdk.internal.icu.text.Normalizer2; +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.util.VersionInfo; + +/** + *

The UCharacter class provides extensions to the + * + * java.lang.Character class. These extensions provide support for + * more Unicode properties and together with the UTF16 + * class, provide support for supplementary characters (those with code + * points above U+FFFF). + * Each ICU release supports the latest version of Unicode available at that time. + * + *

Code points are represented in these API using ints. While it would be + * more convenient in Java to have a separate primitive datatype for them, + * ints suffice in the meantime. + * + *

To use this class please add the jar file name icu4j.jar to the + * class path, since it contains data files which supply the information used + * by this file.
+ * E.g. In Windows
+ * set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar.
+ * Otherwise, another method would be to copy the files uprops.dat and + * unames.icu from the icu4j source subdirectory + * $ICU4J_SRC/src/com.ibm.icu.impl.data to your class directory + * $ICU4J_CLASS/com.ibm.icu.impl.data. + * + *

Aside from the additions for UTF-16 support, and the updated Unicode + * properties, the main differences between UCharacter and Character are: + *

    + *
  • UCharacter is not designed to be a char wrapper and does not have + * APIs to which involves management of that single char.
    + * These include: + *
      + *
    • char charValue(), + *
    • int compareTo(java.lang.Character, java.lang.Character), etc. + *
    + *
  • UCharacter does not include Character APIs that are deprecated, nor + * does it include the Java-specific character information, such as + * boolean isJavaIdentifierPart(char ch). + *
  • Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric + * values '10' - '35'. UCharacter also does this in digit and + * getNumericValue, to adhere to the java semantics of these + * methods. New methods unicodeDigit, and + * getUnicodeNumericValue do not treat the above code points + * as having numeric values. This is a semantic change from ICU4J 1.3.1. + *
+ *

+ * Further detail on differences can be determined using the program + * + * com.ibm.icu.dev.test.lang.UCharacterCompare + *

+ *

+ * In addition to Java compatibility functions, which calculate derived properties, + * this API provides low-level access to the Unicode Character Database. + *

+ *

+ * Unicode assigns each code point (not just assigned character) values for + * many properties. + * Most of them are simple boolean flags, or constants from a small enumerated list. + * For some properties, values are strings or other relatively more complex types. + *

+ *

+ * For more information see + * "About the Unicode Character Database" + * (http://www.unicode.org/ucd/) + * and the ICU + * User Guide chapter on Properties + * (http://www.icu-project.org/userguide/properties.html). + *

+ *

+ * There are also functions that provide easy migration from C/POSIX functions + * like isblank(). Their use is generally discouraged because the C/POSIX + * standards do not define their semantics beyond the ASCII range, which means + * that different implementations exhibit very different behavior. + * Instead, Unicode properties should be used directly. + *

+ *

+ * There are also only a few, broad C/POSIX character classes, and they tend + * to be used for conflicting purposes. For example, the "isalpha()" class + * is sometimes used to determine word boundaries, while a more sophisticated + * approach would at least distinguish initial letters from continuation + * characters (the latter including combining marks). + * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) + * Another example: There is no "istitle()" class for titlecase characters. + *

+ *

+ * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. + * ICU implements them according to the Standard Recommendations in + * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions + * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). + *

+ *

+ * API access for C/POSIX character classes is as follows: + *

{@code
+ * - alpha:     isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
+ * - lower:     isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
+ * - upper:     isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
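+ * - punct:     ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
+ *               (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
+ *               (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
+ * - digit:     isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
+ * - xdigit:    hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
+ * - alnum:     hasBinaryProperty(c, UProperty.POSIX_ALNUM)
+ * - space:     isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
+ * - blank:     hasBinaryProperty(c, UProperty.POSIX_BLANK)
+ * - cntrl:     getType(c)==CONTROL
+ * - graph:     hasBinaryProperty(c, UProperty.POSIX_GRAPH)
+ * - print:     hasBinaryProperty(c, UProperty.POSIX_PRINT)}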
+ * 

+ *

+ * The C/POSIX character classes are also available in UnicodeSet patterns, + * using patterns like [:graph:] or \p{graph}. + *

+ * + * There are several ICU (and Java) whitespace functions. + * Comparison:
    + *
  • isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; + * most of general categories "Z" (separators) + most whitespace ISO controls + * (including no-break spaces, but excluding IS1..IS4 and ZWSP) + *
  • isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces + *
  • isSpaceChar: just Z (including no-break spaces)
+ *

+ *

+ * This class is not subclassable. + *

+ * @author Syn Wee Quek + * @stable ICU 2.1 + * @see com.ibm.icu.lang.UCharacterEnums + */ + +public final class UCharacter +{ + + /** + * Joining Group constants. + * @see UProperty#JOINING_GROUP + * @stable ICU 2.4 + */ + public static interface JoiningGroup + { + /** + * @stable ICU 2.4 + */ + public static final int NO_JOINING_GROUP = 0; + } + + /** + * Numeric Type constants. + * @see UProperty#NUMERIC_TYPE + * @stable ICU 2.4 + */ + public static interface NumericType + { + /** + * @stable ICU 2.4 + */ + public static final int NONE = 0; + /** + * @stable ICU 2.4 + */ + public static final int DECIMAL = 1; + /** + * @stable ICU 2.4 + */ + public static final int DIGIT = 2; + /** + * @stable ICU 2.4 + */ + public static final int NUMERIC = 3; + /** + * @stable ICU 2.4 + */ + public static final int COUNT = 4; + } + + /** + * Hangul Syllable Type constants. + * + * @see UProperty#HANGUL_SYLLABLE_TYPE + * @stable ICU 2.6 + */ + public static interface HangulSyllableType + { + /** + * @stable ICU 2.6 + */ + public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/ + /** + * @stable ICU 2.6 + */ + public static final int LEADING_JAMO = 1; /*[L]*/ + /** + * @stable ICU 2.6 + */ + public static final int VOWEL_JAMO = 2; /*[V]*/ + /** + * @stable ICU 2.6 + */ + public static final int TRAILING_JAMO = 3; /*[T]*/ + /** + * @stable ICU 2.6 + */ + public static final int LV_SYLLABLE = 4; /*[LV]*/ + /** + * @stable ICU 2.6 + */ + public static final int LVT_SYLLABLE = 5; /*[LVT]*/ + /** + * @stable ICU 2.6 + */ + public static final int COUNT = 6; + } + + // public data members ----------------------------------------------- + + /** + * The lowest Unicode code point value. + * @stable ICU 2.1 + */ + public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE; + + /** + * The highest Unicode code point value (scalar value) according to the + * Unicode Standard. + * This is a 21-bit value (21 bits, rounded up).
+ * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE + * @stable ICU 2.1 + */ + public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE; + + // public methods ---------------------------------------------------- + + /** + * Returns the numeric value of a decimal digit code point. + *
This method observes the semantics of + * java.lang.Character.digit(). Note that this + * will return positive values for code points for which isDigit + * returns false, just like java.lang.Character. + *
Semantic Change: In release 1.3.1 and + * prior, this did not treat the European letters as having a + * digit value, and also treated numeric letters and other numbers as + * digits. + * This has been changed to conform to the java semantics. + *
A code point is a valid digit if and only if: + *
    + *
  • ch is a decimal digit or one of the european letters, and + *
  • the value of ch is less than the specified radix. + *
+ * @param ch the code point to query + * @param radix the radix + * @return the numeric value represented by the code point in the + * specified radix, or -1 if the code point is not a decimal digit + * or if its value is too large for the radix + * @stable ICU 2.1 + */ + public static int digit(int ch, int radix) + { + if (2 <= radix && radix <= 36) { + int value = digit(ch); + if (value < 0) { + // ch is not a decimal digit, try latin letters + value = UCharacterProperty.getEuropeanDigit(ch); + } + return (value < radix) ? value : -1; + } else { + return -1; // invalid radix + } + } + + /** + * Returns the numeric value of a decimal digit code point. + *
This is a convenience overload of digit(int, int) + * that provides a decimal radix. + *
Semantic Change: In release 1.3.1 and prior, this + * treated numeric letters and other numbers as digits. This has + * been changed to conform to the java semantics. + * @param ch the code point to query + * @return the numeric value represented by the code point, + * or -1 if the code point is not a decimal digit or if its + * value is too large for a decimal radix + * @stable ICU 2.1 + */ + public static int digit(int ch) + { + return UCharacterProperty.INSTANCE.digit(ch); + } + + /** + * Returns a value indicating a code point's Unicode category. + * Up-to-date Unicode implementation of java.lang.Character.getType() + * except for the above mentioned code points that had their category + * changed.
+ * Return results are constants from the interface + * UCharacterCategory
+ * NOTE: the UCharacterCategory values are not compatible with + * those returned by java.lang.Character.getType. UCharacterCategory values + * match the ones used in ICU4C, while java.lang.Character type + * values, though similar, skip the value 17.

+ * @param ch code point whose type is to be determined + * @return category which is a value of UCharacterCategory + * @stable ICU 2.1 + */ + public static int getType(int ch) + { + return UCharacterProperty.INSTANCE.getType(ch); + } + + /** + * Returns the Bidirection property of a code point. + * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional + * property.
+ * Result returned belongs to the interface + * UCharacterDirection + * @param ch the code point to be determined its direction + * @return direction constant from UCharacterDirection. + * @stable ICU 2.1 + */ + public static int getDirection(int ch) + { + return UBiDiProps.INSTANCE.getClass(ch); + } + + /** + * Maps the specified code point to a "mirror-image" code point. + * For code points with the "mirrored" property, implementations sometimes + * need a "poor man's" mapping to another code point such that the default + * glyph may serve as the mirror-image of the default glyph of the + * specified code point.
+ * This is useful for text conversion to and from codepages with visual + * order, and for displays without glyph selection capabilities. + * @param ch code point whose mirror is to be retrieved + * @return another code point that may serve as a mirror-image substitute, + * or ch itself if there is no such mapping or ch does not have the + * "mirrored" property + * @stable ICU 2.1 + */ + public static int getMirror(int ch) + { + return UBiDiProps.INSTANCE.getMirror(ch); + } + + /** + * Maps the specified character to its paired bracket character. + * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int). + * Otherwise c itself is returned. + * See http://www.unicode.org/reports/tr9/ + * + * @param c the code point to be mapped + * @return the paired bracket code point, + * or c itself if there is no such mapping + * (Bidi_Paired_Bracket_Type=None) + * + * @see UProperty#BIDI_PAIRED_BRACKET + * @see UProperty#BIDI_PAIRED_BRACKET_TYPE + * @see #getMirror(int) + * @stable ICU 52 + */ + public static int getBidiPairedBracket(int c) { + return UBiDiProps.INSTANCE.getPairedBracket(c); + } + + /** + * Returns the combining class of the argument codepoint + * @param ch code point whose combining is to be retrieved + * @return the combining class of the codepoint + * @stable ICU 2.1 + */ + public static int getCombiningClass(int ch) + { + return Normalizer2.getNFDInstance().getCombiningClass(ch); + } + + /** + * Returns the version of Unicode data used. + * @return the unicode version number used + * @stable ICU 2.1 + */ + public static VersionInfo getUnicodeVersion() + { + return UCharacterProperty.INSTANCE.m_unicodeVersion_; + } + + /** + * Returns a code point corresponding to the two UTF16 characters. + * @param lead the lead char + * @param trail the trail char + * @return code point if surrogate characters are valid. 
+ * @exception IllegalArgumentException thrown when argument characters do + * not form a valid codepoint + * @stable ICU 2.1 + */ + public static int getCodePoint(char lead, char trail) + { + if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) { + return UCharacterProperty.getRawSupplementary(lead, trail); + } + throw new IllegalArgumentException("Illegal surrogate characters"); + } + + /** + * Returns the "age" of the code point.

+ *

The "age" is the Unicode version when the code point was first + * designated (as a non-character or for Private Use) or assigned a + * character. + *

This can be useful to avoid emitting code points to receiving + * processes that do not accept newer characters.

+ *

The data is from the UCD file DerivedAge.txt.

+ * @param ch The code point. + * @return the Unicode version number + * @stable ICU 2.6 + */ + public static VersionInfo getAge(int ch) + { + if (ch < MIN_VALUE || ch > MAX_VALUE) { + throw new IllegalArgumentException("Codepoint out of bounds"); + } + return UCharacterProperty.INSTANCE.getAge(ch); + } + + /** + * Returns the property value for an Unicode property type of a code point. + * Also returns binary and mask property values.

+ *

Unicode, especially in version 3.2, defines many more properties than + * the original set in UnicodeData.txt.

+ *

The properties APIs are intended to reflect Unicode properties as + * defined in the Unicode Character Database (UCD) and Unicode Technical + * Reports (UTR). For details about the properties see + * http://www.unicode.org/.

+ *

For names of Unicode properties see the UCD file PropertyAliases.txt. + *

+ *
+     * Sample usage:
+     * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
+     * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
+     * boolean b = (ideo == 1) ? true : false;
+     * 
+ * @param ch code point to test. + * @param type UProperty selector constant, identifies which binary + * property to check. Must be + * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or + * UProperty.INT_START <= type < UProperty.INT_LIMIT or + * UProperty.MASK_START <= type < UProperty.MASK_LIMIT. + * @return numeric value that is directly the property value or, + * for enumerated properties, corresponds to the numeric value of + * the enumerated constant of the respective property value + * enumeration type (cast to enum type if necessary). + * Returns 0 or 1 (for false / true) for binary Unicode properties. + * Returns a bit-mask for mask properties. + * Returns 0 if 'type' is out of bounds or if the Unicode version + * does not have data for the property at all, or not for this code + * point. + * @see UProperty + * @see #hasBinaryProperty + * @see #getIntPropertyMinValue + * @see #getIntPropertyMaxValue + * @see #getUnicodeVersion + * @stable ICU 2.4 + */ + // for BiDiBase.java + public static int getIntPropertyValue(int ch, int type) { + return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type); + } + + // private constructor ----------------------------------------------- + + /** + * Private constructor to prevent instantiation + */ + private UCharacter() { } + + /* + * Copied from UCharacterEnums.java + */ + + /** + * Character type Mn + * @stable ICU 2.1 + */ + public static final byte NON_SPACING_MARK = 6; + /** + * Character type Me + * @stable ICU 2.1 + */ + public static final byte ENCLOSING_MARK = 7; + /** + * Character type Mc + * @stable ICU 2.1 + */ + public static final byte COMBINING_SPACING_MARK = 8; + /** + * Character type count + * @stable ICU 2.1 + */ + public static final byte CHAR_CATEGORY_COUNT = 30; + + /** + * Directional type R + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT = 1; + /** + * Directional type AL + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_ARABIC = 13; +} --- 
old/src/java.base/share/classes/sun/net/idn/UCharacterDirection.java 2020-01-10 15:57:50.000000000 -0800 +++ /dev/null 2020-01-10 15:57:50.000000000 -0800 @@ -1,112 +0,0 @@ -/* - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* -/** -******************************************************************************* -* Copyright (C) 1996-2004, International Business Machines Corporation and * -* others. All Rights Reserved. * -******************************************************************************* -*/ -// CHANGELOG -// 2005-05-19 Edward Wang -// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterDirection.java -// - move from package com.ibm.icu.lang to package sun.net.idn -// - -package sun.net.idn; - -/** - * Enumerated Unicode character linguistic direction constants. - * Used as return results from UCharacter - *

- * This class is not subclassable - *

- * @author Syn Wee Quek - * @stable ICU 2.1 - */ - -@SuppressWarnings("deprecation") -final class UCharacterDirection implements UCharacterEnums.ECharacterDirection { - - // private constructor ========================================= - ///CLOVER:OFF - /** - * Private constructor to prevent initialisation - */ - private UCharacterDirection() - { - } - ///CLOVER:ON - - /** - * Gets the name of the argument direction - * @param dir direction type to retrieve name - * @return directional name - * @stable ICU 2.1 - */ - public static String toString(int dir) { - switch(dir) - { - case LEFT_TO_RIGHT : - return "Left-to-Right"; - case RIGHT_TO_LEFT : - return "Right-to-Left"; - case EUROPEAN_NUMBER : - return "European Number"; - case EUROPEAN_NUMBER_SEPARATOR : - return "European Number Separator"; - case EUROPEAN_NUMBER_TERMINATOR : - return "European Number Terminator"; - case ARABIC_NUMBER : - return "Arabic Number"; - case COMMON_NUMBER_SEPARATOR : - return "Common Number Separator"; - case BLOCK_SEPARATOR : - return "Paragraph Separator"; - case SEGMENT_SEPARATOR : - return "Segment Separator"; - case WHITE_SPACE_NEUTRAL : - return "Whitespace"; - case OTHER_NEUTRAL : - return "Other Neutrals"; - case LEFT_TO_RIGHT_EMBEDDING : - return "Left-to-Right Embedding"; - case LEFT_TO_RIGHT_OVERRIDE : - return "Left-to-Right Override"; - case RIGHT_TO_LEFT_ARABIC : - return "Right-to-Left Arabic"; - case RIGHT_TO_LEFT_EMBEDDING : - return "Right-to-Left Embedding"; - case RIGHT_TO_LEFT_OVERRIDE : - return "Right-to-Left Override"; - case POP_DIRECTIONAL_FORMAT : - return "Pop Directional Format"; - case DIR_NON_SPACING_MARK : - return "Non-Spacing Mark"; - case BOUNDARY_NEUTRAL : - return "Boundary Neutral"; - } - return "Unassigned"; - } -} --- /dev/null 2020-01-10 15:57:50.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/lang/UCharacterDirection.java 2020-01-10 15:57:49.000000000 -0800 @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2005, 2020, Oracle 
and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* +/** +******************************************************************************* +* Copyright (C) 1996-2004, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +*/ +// CHANGELOG +// 2005-05-19 Edward Wang +// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterDirection.java +// - move from package com.ibm.icu.lang to package sun.net.idn +// + +package jdk.internal.icu.lang; + +/** + * Enumerated Unicode character linguistic direction constants. + * Used as return results from UCharacter + *

+ * This class is not subclassable + *

+ * @author Syn Wee Quek + * @stable ICU 2.1 + */ + +@SuppressWarnings("deprecation") +public final class UCharacterDirection implements UCharacterEnums.ECharacterDirection { + + // private constructor ========================================= + ///CLOVER:OFF + /** + * Private constructor to prevent initialisation + */ + private UCharacterDirection() + { + } + ///CLOVER:ON + + /** + * Gets the name of the argument direction + * @param dir direction type to retrieve name + * @return directional name + * @stable ICU 2.1 + */ + public static String toString(int dir) { + switch(dir) + { + case LEFT_TO_RIGHT : + return "Left-to-Right"; + case RIGHT_TO_LEFT : + return "Right-to-Left"; + case EUROPEAN_NUMBER : + return "European Number"; + case EUROPEAN_NUMBER_SEPARATOR : + return "European Number Separator"; + case EUROPEAN_NUMBER_TERMINATOR : + return "European Number Terminator"; + case ARABIC_NUMBER : + return "Arabic Number"; + case COMMON_NUMBER_SEPARATOR : + return "Common Number Separator"; + case BLOCK_SEPARATOR : + return "Paragraph Separator"; + case SEGMENT_SEPARATOR : + return "Segment Separator"; + case WHITE_SPACE_NEUTRAL : + return "Whitespace"; + case OTHER_NEUTRAL : + return "Other Neutrals"; + case LEFT_TO_RIGHT_EMBEDDING : + return "Left-to-Right Embedding"; + case LEFT_TO_RIGHT_OVERRIDE : + return "Left-to-Right Override"; + case RIGHT_TO_LEFT_ARABIC : + return "Right-to-Left Arabic"; + case RIGHT_TO_LEFT_EMBEDDING : + return "Right-to-Left Embedding"; + case RIGHT_TO_LEFT_OVERRIDE : + return "Right-to-Left Override"; + case POP_DIRECTIONAL_FORMAT : + return "Pop Directional Format"; + case DIR_NON_SPACING_MARK : + return "Non-Spacing Mark"; + case BOUNDARY_NEUTRAL : + return "Boundary Neutral"; + } + return "Unassigned"; + } +} --- old/src/java.base/share/classes/sun/net/idn/UCharacterEnums.java 2020-01-10 15:57:51.000000000 -0800 +++ /dev/null 2020-01-10 15:57:51.000000000 -0800 @@ -1,587 +0,0 @@ -/* - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR 
THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* -/** - ******************************************************************************* - * Copyright (C) 2004, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ******************************************************************************* - */ -// CHANGELOG -// 2005-05-19 Edward Wang -// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterEnums.java -// - move from package com.ibm.icu.lang to package sun.net.idn -// -// 2011-09-06 Kurchi Subhra Hazra -// - Added @Deprecated tag to the following: -// - class UCharacterEnums -// - interfaces ECharacterCategory, ECharacterDirection -// - fields INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION, -// DIRECTIONALITY_LEFT_TO_RIGHT, DIRECTIONALITY_RIGHT_TO_LEFT, -// DIRECTIONALITY_EUROPEAN_NUMBER, DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR -// DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DIRECTIONALITY_ARABIC_NUMBER, -// DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DIRECTIONALITY_PARAGRAPH_SEPARATOR, -// DIRECTIONALITY_SEGMENT_SEPARATOR, DIRECTIONALITY_WHITESPACE, -// DIRECTIONALITY_OTHER_NEUTRALS, DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING, -// DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC, -// DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE, -// DIRECTIONALITY_POP_DIRECTIONAL_FORMAT, DIRECTIONALITY_NON_SPACING_MARK, -// DIRECTIONALITY_BOUNDARY_NEUTRAL, DIRECTIONALITY_UNDEFINED -// - -package sun.net.idn; - -/** - * A container for the different 'enumerated types' used by UCharacter. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - -@Deprecated -class UCharacterEnums { - - /** This is just a namespace, it is not instantiatable. */ - private UCharacterEnums() {}; - - /** - * 'Enum' for the CharacterCategory constants. These constants are - * compatible in name but not in value with those defined in - * java.lang.Character. - * @see UCharacterCategory - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. 
- */ - @Deprecated - public static interface ECharacterCategory { - /** - * Unassigned character type - * @stable ICU 2.1 - */ - public static final int UNASSIGNED = 0; - - /** - * Character type Cn - * Not Assigned (no characters in [UnicodeData.txt] have this property) - * @stable ICU 2.6 - */ - public static final int GENERAL_OTHER_TYPES = 0; - - /** - * Character type Lu - * @stable ICU 2.1 - */ - public static final int UPPERCASE_LETTER = 1; - - /** - * Character type Ll - * @stable ICU 2.1 - */ - public static final int LOWERCASE_LETTER = 2; - - /** - * Character type Lt - * @stable ICU 2.1 - */ - - public static final int TITLECASE_LETTER = 3; - - /** - * Character type Lm - * @stable ICU 2.1 - */ - public static final int MODIFIER_LETTER = 4; - - /** - * Character type Lo - * @stable ICU 2.1 - */ - public static final int OTHER_LETTER = 5; - - /** - * Character type Mn - * @stable ICU 2.1 - */ - public static final int NON_SPACING_MARK = 6; - - /** - * Character type Me - * @stable ICU 2.1 - */ - public static final int ENCLOSING_MARK = 7; - - /** - * Character type Mc - * @stable ICU 2.1 - */ - public static final int COMBINING_SPACING_MARK = 8; - - /** - * Character type Nd - * @stable ICU 2.1 - */ - public static final int DECIMAL_DIGIT_NUMBER = 9; - - /** - * Character type Nl - * @stable ICU 2.1 - */ - public static final int LETTER_NUMBER = 10; - - /** - * Character type No - * @stable ICU 2.1 - */ - public static final int OTHER_NUMBER = 11; - - /** - * Character type Zs - * @stable ICU 2.1 - */ - public static final int SPACE_SEPARATOR = 12; - - /** - * Character type Zl - * @stable ICU 2.1 - */ - public static final int LINE_SEPARATOR = 13; - - /** - * Character type Zp - * @stable ICU 2.1 - */ - public static final int PARAGRAPH_SEPARATOR = 14; - - /** - * Character type Cc - * @stable ICU 2.1 - */ - public static final int CONTROL = 15; - - /** - * Character type Cf - * @stable ICU 2.1 - */ - public static final int FORMAT = 16; - - /** - * 
Character type Co - * @stable ICU 2.1 - */ - public static final int PRIVATE_USE = 17; - - /** - * Character type Cs - * @stable ICU 2.1 - */ - public static final int SURROGATE = 18; - - /** - * Character type Pd - * @stable ICU 2.1 - */ - public static final int DASH_PUNCTUATION = 19; - - /** - * Character type Ps - * @stable ICU 2.1 - */ - public static final int START_PUNCTUATION = 20; - - /** - * Character type Pe - * @stable ICU 2.1 - */ - public static final int END_PUNCTUATION = 21; - - /** - * Character type Pc - * @stable ICU 2.1 - */ - public static final int CONNECTOR_PUNCTUATION = 22; - - /** - * Character type Po - * @stable ICU 2.1 - */ - public static final int OTHER_PUNCTUATION = 23; - - /** - * Character type Sm - * @stable ICU 2.1 - */ - public static final int MATH_SYMBOL = 24; - - /** - * Character type Sc - * @stable ICU 2.1 - */ - public static final int CURRENCY_SYMBOL = 25; - - /** - * Character type Sk - * @stable ICU 2.1 - */ - public static final int MODIFIER_SYMBOL = 26; - - /** - * Character type So - * @stable ICU 2.1 - */ - public static final int OTHER_SYMBOL = 27; - - /** - * Character type Pi - * @see #INITIAL_QUOTE_PUNCTUATION - * @stable ICU 2.1 - */ - public static final int INITIAL_PUNCTUATION = 28; - - /** - * Character type Pi - * This name is compatible with java.lang.Character's name for this type. - * @see #INITIAL_PUNCTUATION - * @draft ICU 2.8 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final int INITIAL_QUOTE_PUNCTUATION = 28; - - /** - * Character type Pf - * @see #FINAL_QUOTE_PUNCTUATION - * @stable ICU 2.1 - */ - public static final int FINAL_PUNCTUATION = 29; - - /** - * Character type Pf - * This name is compatible with java.lang.Character's name for this type. - * @see #FINAL_PUNCTUATION - * @draft ICU 2.8 - * @deprecated This is a draft API and might change in a future release of ICU. 
- */ - @Deprecated - public static final int FINAL_QUOTE_PUNCTUATION = 29; - - /** - * Character type count - * @stable ICU 2.1 - */ - public static final int CHAR_CATEGORY_COUNT = 30; - } - - /** - * 'Enum' for the CharacterDirection constants. There are two sets - * of names, those used in ICU, and those used in the JDK. The - * JDK constants are compatible in name but not in value - * with those defined in java.lang.Character. - * @see UCharacterDirection - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - - @Deprecated - public static interface ECharacterDirection { - /** - * Directional type L - * @stable ICU 2.1 - */ - public static final int LEFT_TO_RIGHT = 0; - - /** - * JDK-compatible synonum for LEFT_TO_RIGHT. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = (byte)LEFT_TO_RIGHT; - - /** - * Directional type R - * @stable ICU 2.1 - */ - public static final int RIGHT_TO_LEFT = 1; - - /** - * JDK-compatible synonum for RIGHT_TO_LEFT. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = (byte)RIGHT_TO_LEFT; - - /** - * Directional type EN - * @stable ICU 2.1 - */ - public static final int EUROPEAN_NUMBER = 2; - - /** - * JDK-compatible synonum for EUROPEAN_NUMBER. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = (byte)EUROPEAN_NUMBER; - - /** - * Directional type ES - * @stable ICU 2.1 - */ - public static final int EUROPEAN_NUMBER_SEPARATOR = 3; - - /** - * JDK-compatible synonum for EUROPEAN_NUMBER_SEPARATOR. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. 
- */ - @Deprecated - public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = (byte)EUROPEAN_NUMBER_SEPARATOR; - - /** - * Directional type ET - * @stable ICU 2.1 - */ - public static final int EUROPEAN_NUMBER_TERMINATOR = 4; - - /** - * JDK-compatible synonum for EUROPEAN_NUMBER_TERMINATOR. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = (byte)EUROPEAN_NUMBER_TERMINATOR; - - /** - * Directional type AN - * @stable ICU 2.1 - */ - public static final int ARABIC_NUMBER = 5; - - /** - * JDK-compatible synonum for ARABIC_NUMBER. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_ARABIC_NUMBER = (byte)ARABIC_NUMBER; - - /** - * Directional type CS - * @stable ICU 2.1 - */ - public static final int COMMON_NUMBER_SEPARATOR = 6; - - /** - * JDK-compatible synonum for COMMON_NUMBER_SEPARATOR. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = (byte)COMMON_NUMBER_SEPARATOR; - - /** - * Directional type B - * @stable ICU 2.1 - */ - public static final int BLOCK_SEPARATOR = 7; - - /** - * JDK-compatible synonum for BLOCK_SEPARATOR. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = (byte)BLOCK_SEPARATOR; - - /** - * Directional type S - * @stable ICU 2.1 - */ - public static final int SEGMENT_SEPARATOR = 8; - - /** - * JDK-compatible synonum for SEGMENT_SEPARATOR. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. 
- */ - @Deprecated - public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = (byte)SEGMENT_SEPARATOR; - - /** - * Directional type WS - * @stable ICU 2.1 - */ - public static final int WHITE_SPACE_NEUTRAL = 9; - - /** - * JDK-compatible synonum for WHITE_SPACE_NEUTRAL. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_WHITESPACE = (byte)WHITE_SPACE_NEUTRAL; - - /** - * Directional type ON - * @stable ICU 2.1 - */ - public static final int OTHER_NEUTRAL = 10; - - /** - * JDK-compatible synonum for OTHER_NEUTRAL. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_OTHER_NEUTRALS = (byte)OTHER_NEUTRAL; - - /** - * Directional type LRE - * @stable ICU 2.1 - */ - public static final int LEFT_TO_RIGHT_EMBEDDING = 11; - - /** - * JDK-compatible synonum for LEFT_TO_RIGHT_EMBEDDING. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = (byte)LEFT_TO_RIGHT_EMBEDDING; - - /** - * Directional type LRO - * @stable ICU 2.1 - */ - public static final int LEFT_TO_RIGHT_OVERRIDE = 12; - - /** - * JDK-compatible synonum for LEFT_TO_RIGHT_OVERRIDE. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = (byte)LEFT_TO_RIGHT_OVERRIDE; - - /** - * Directional type AL - * @stable ICU 2.1 - */ - public static final int RIGHT_TO_LEFT_ARABIC = 13; - - /** - * JDK-compatible synonum for RIGHT_TO_LEFT_ARABIC. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. 
- */ - @Deprecated - public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = (byte)RIGHT_TO_LEFT_ARABIC; - - /** - * Directional type RLE - * @stable ICU 2.1 - */ - public static final int RIGHT_TO_LEFT_EMBEDDING = 14; - - /** - * JDK-compatible synonum for RIGHT_TO_LEFT_EMBEDDING. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = (byte)RIGHT_TO_LEFT_EMBEDDING; - - /** - * Directional type RLO - * @stable ICU 2.1 - */ - public static final int RIGHT_TO_LEFT_OVERRIDE = 15; - - /** - * JDK-compatible synonum for RIGHT_TO_LEFT_OVERRIDE. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = (byte)RIGHT_TO_LEFT_OVERRIDE; - - /** - * Directional type PDF - * @stable ICU 2.1 - */ - public static final int POP_DIRECTIONAL_FORMAT = 16; - - /** - * JDK-compatible synonum for POP_DIRECTIONAL_FORMAT. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = (byte)POP_DIRECTIONAL_FORMAT; - - /** - * Directional type NSM - * @stable ICU 2.1 - */ - public static final int DIR_NON_SPACING_MARK = 17; - - /** - * JDK-compatible synonum for DIR_NON_SPACING_MARK. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_NON_SPACING_MARK = (byte)DIR_NON_SPACING_MARK; - - /** - * Directional type BN - * @stable ICU 2.1 - */ - public static final int BOUNDARY_NEUTRAL = 18; - - /** - * JDK-compatible synonum for BOUNDARY_NEUTRAL. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. 
- */ - @Deprecated - public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = (byte)BOUNDARY_NEUTRAL; - - /** - * Number of directional types - * @stable ICU 2.1 - */ - public static final int CHAR_DIRECTION_COUNT = 19; - - /** - * Undefined bidirectional character type. Undefined char - * values have undefined directionality in the Unicode specification. - * @draft ICU 3.0 - * @deprecated This is a draft API and might change in a future release of ICU. - */ - @Deprecated - public static final byte DIRECTIONALITY_UNDEFINED = -1; - } -} --- /dev/null 2020-01-10 15:57:51.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/lang/UCharacterEnums.java 2020-01-10 15:57:51.000000000 -0800 @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +/* +/** + ******************************************************************************* + * Copyright (C) 2004, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ +// CHANGELOG +// 2005-05-19 Edward Wang +// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterEnums.java +// - move from package com.ibm.icu.lang to package sun.net.idn +// +// 2011-09-06 Kurchi Subhra Hazra +// - Added @Deprecated tag to the following: +// - class UCharacterEnums +// - interfaces ECharacterCategory, ECharacterDirection +// - fields INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION, +// DIRECTIONALITY_LEFT_TO_RIGHT, DIRECTIONALITY_RIGHT_TO_LEFT, +// DIRECTIONALITY_EUROPEAN_NUMBER, DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR +// DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DIRECTIONALITY_ARABIC_NUMBER, +// DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DIRECTIONALITY_PARAGRAPH_SEPARATOR, +// DIRECTIONALITY_SEGMENT_SEPARATOR, DIRECTIONALITY_WHITESPACE, +// DIRECTIONALITY_OTHER_NEUTRALS, DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING, +// DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC, +// DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE, +// DIRECTIONALITY_POP_DIRECTIONAL_FORMAT, DIRECTIONALITY_NON_SPACING_MARK, +// DIRECTIONALITY_BOUNDARY_NEUTRAL, DIRECTIONALITY_UNDEFINED +// + +package jdk.internal.icu.lang; + +/** + * A container for the different 'enumerated types' used by UCharacter. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + +@Deprecated +class UCharacterEnums { + + /** This is just a namespace; it cannot be instantiated. */ + private UCharacterEnums() {}; + + /** + * 'Enum' for the CharacterCategory constants. These constants are + * compatible in name but not in value with those defined in + * java.lang.Character.
+ * @see UCharacterCategory + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static interface ECharacterCategory { + /** + * Unassigned character type + * @stable ICU 2.1 + */ + public static final int UNASSIGNED = 0; + + /** + * Character type Cn + * Not Assigned (no characters in [UnicodeData.txt] have this property) + * @stable ICU 2.6 + */ + public static final int GENERAL_OTHER_TYPES = 0; + + /** + * Character type Lu + * @stable ICU 2.1 + */ + public static final int UPPERCASE_LETTER = 1; + + /** + * Character type Ll + * @stable ICU 2.1 + */ + public static final int LOWERCASE_LETTER = 2; + + /** + * Character type Lt + * @stable ICU 2.1 + */ + + public static final int TITLECASE_LETTER = 3; + + /** + * Character type Lm + * @stable ICU 2.1 + */ + public static final int MODIFIER_LETTER = 4; + + /** + * Character type Lo + * @stable ICU 2.1 + */ + public static final int OTHER_LETTER = 5; + + /** + * Character type Mn + * @stable ICU 2.1 + */ + public static final int NON_SPACING_MARK = 6; + + /** + * Character type Me + * @stable ICU 2.1 + */ + public static final int ENCLOSING_MARK = 7; + + /** + * Character type Mc + * @stable ICU 2.1 + */ + public static final int COMBINING_SPACING_MARK = 8; + + /** + * Character type Nd + * @stable ICU 2.1 + */ + public static final int DECIMAL_DIGIT_NUMBER = 9; + + /** + * Character type Nl + * @stable ICU 2.1 + */ + public static final int LETTER_NUMBER = 10; + + /** + * Character type No + * @stable ICU 2.1 + */ + public static final int OTHER_NUMBER = 11; + + /** + * Character type Zs + * @stable ICU 2.1 + */ + public static final int SPACE_SEPARATOR = 12; + + /** + * Character type Zl + * @stable ICU 2.1 + */ + public static final int LINE_SEPARATOR = 13; + + /** + * Character type Zp + * @stable ICU 2.1 + */ + public static final int PARAGRAPH_SEPARATOR = 14; + + /** + * Character type Cc + * @stable ICU 2.1 + */ + public static final 
int CONTROL = 15; + + /** + * Character type Cf + * @stable ICU 2.1 + */ + public static final int FORMAT = 16; + + /** + * Character type Co + * @stable ICU 2.1 + */ + public static final int PRIVATE_USE = 17; + + /** + * Character type Cs + * @stable ICU 2.1 + */ + public static final int SURROGATE = 18; + + /** + * Character type Pd + * @stable ICU 2.1 + */ + public static final int DASH_PUNCTUATION = 19; + + /** + * Character type Ps + * @stable ICU 2.1 + */ + public static final int START_PUNCTUATION = 20; + + /** + * Character type Pe + * @stable ICU 2.1 + */ + public static final int END_PUNCTUATION = 21; + + /** + * Character type Pc + * @stable ICU 2.1 + */ + public static final int CONNECTOR_PUNCTUATION = 22; + + /** + * Character type Po + * @stable ICU 2.1 + */ + public static final int OTHER_PUNCTUATION = 23; + + /** + * Character type Sm + * @stable ICU 2.1 + */ + public static final int MATH_SYMBOL = 24; + + /** + * Character type Sc + * @stable ICU 2.1 + */ + public static final int CURRENCY_SYMBOL = 25; + + /** + * Character type Sk + * @stable ICU 2.1 + */ + public static final int MODIFIER_SYMBOL = 26; + + /** + * Character type So + * @stable ICU 2.1 + */ + public static final int OTHER_SYMBOL = 27; + + /** + * Character type Pi + * @see #INITIAL_QUOTE_PUNCTUATION + * @stable ICU 2.1 + */ + public static final int INITIAL_PUNCTUATION = 28; + + /** + * Character type Pi + * This name is compatible with java.lang.Character's name for this type. + * @see #INITIAL_PUNCTUATION + * @draft ICU 2.8 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final int INITIAL_QUOTE_PUNCTUATION = 28; + + /** + * Character type Pf + * @see #FINAL_QUOTE_PUNCTUATION + * @stable ICU 2.1 + */ + public static final int FINAL_PUNCTUATION = 29; + + /** + * Character type Pf + * This name is compatible with java.lang.Character's name for this type.
+ * @see #FINAL_PUNCTUATION + * @draft ICU 2.8 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final int FINAL_QUOTE_PUNCTUATION = 29; + + /** + * Character type count + * @stable ICU 2.1 + */ + public static final int CHAR_CATEGORY_COUNT = 30; + } + + /** + * 'Enum' for the CharacterDirection constants. There are two sets + * of names, those used in ICU, and those used in the JDK. The + * JDK constants are compatible in name but not in value + * with those defined in java.lang.Character. + * @see UCharacterDirection + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + + @Deprecated + public static interface ECharacterDirection { + /** + * Directional type L + * @stable ICU 2.1 + */ + public static final int LEFT_TO_RIGHT = 0; + + /** + * JDK-compatible synonym for LEFT_TO_RIGHT. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = (byte)LEFT_TO_RIGHT; + + /** + * Directional type R + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT = 1; + + /** + * JDK-compatible synonym for RIGHT_TO_LEFT. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = (byte)RIGHT_TO_LEFT; + + /** + * Directional type EN + * @stable ICU 2.1 + */ + public static final int EUROPEAN_NUMBER = 2; + + /** + * JDK-compatible synonym for EUROPEAN_NUMBER. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU.
+ */ + @Deprecated + public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = (byte)EUROPEAN_NUMBER; + + /** + * Directional type ES + * @stable ICU 2.1 + */ + public static final int EUROPEAN_NUMBER_SEPARATOR = 3; + + /** + * JDK-compatible synonym for EUROPEAN_NUMBER_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = (byte)EUROPEAN_NUMBER_SEPARATOR; + + /** + * Directional type ET + * @stable ICU 2.1 + */ + public static final int EUROPEAN_NUMBER_TERMINATOR = 4; + + /** + * JDK-compatible synonym for EUROPEAN_NUMBER_TERMINATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = (byte)EUROPEAN_NUMBER_TERMINATOR; + + /** + * Directional type AN + * @stable ICU 2.1 + */ + public static final int ARABIC_NUMBER = 5; + + /** + * JDK-compatible synonym for ARABIC_NUMBER. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_ARABIC_NUMBER = (byte)ARABIC_NUMBER; + + /** + * Directional type CS + * @stable ICU 2.1 + */ + public static final int COMMON_NUMBER_SEPARATOR = 6; + + /** + * JDK-compatible synonym for COMMON_NUMBER_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = (byte)COMMON_NUMBER_SEPARATOR; + + /** + * Directional type B + * @stable ICU 2.1 + */ + public static final int BLOCK_SEPARATOR = 7; + + /** + * JDK-compatible synonym for BLOCK_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU.
+ */ + @Deprecated + public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = (byte)BLOCK_SEPARATOR; + + /** + * Directional type S + * @stable ICU 2.1 + */ + public static final int SEGMENT_SEPARATOR = 8; + + /** + * JDK-compatible synonym for SEGMENT_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = (byte)SEGMENT_SEPARATOR; + + /** + * Directional type WS + * @stable ICU 2.1 + */ + public static final int WHITE_SPACE_NEUTRAL = 9; + + /** + * JDK-compatible synonym for WHITE_SPACE_NEUTRAL. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_WHITESPACE = (byte)WHITE_SPACE_NEUTRAL; + + /** + * Directional type ON + * @stable ICU 2.1 + */ + public static final int OTHER_NEUTRAL = 10; + + /** + * JDK-compatible synonym for OTHER_NEUTRAL. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_OTHER_NEUTRALS = (byte)OTHER_NEUTRAL; + + /** + * Directional type LRE + * @stable ICU 2.1 + */ + public static final int LEFT_TO_RIGHT_EMBEDDING = 11; + + /** + * JDK-compatible synonym for LEFT_TO_RIGHT_EMBEDDING. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = (byte)LEFT_TO_RIGHT_EMBEDDING; + + /** + * Directional type LRO + * @stable ICU 2.1 + */ + public static final int LEFT_TO_RIGHT_OVERRIDE = 12; + + /** + * JDK-compatible synonym for LEFT_TO_RIGHT_OVERRIDE. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU.
+ */ + @Deprecated + public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = (byte)LEFT_TO_RIGHT_OVERRIDE; + + /** + * Directional type AL + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_ARABIC = 13; + + /** + * JDK-compatible synonym for RIGHT_TO_LEFT_ARABIC. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = (byte)RIGHT_TO_LEFT_ARABIC; + + /** + * Directional type RLE + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_EMBEDDING = 14; + + /** + * JDK-compatible synonym for RIGHT_TO_LEFT_EMBEDDING. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = (byte)RIGHT_TO_LEFT_EMBEDDING; + + /** + * Directional type RLO + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_OVERRIDE = 15; + + /** + * JDK-compatible synonym for RIGHT_TO_LEFT_OVERRIDE. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = (byte)RIGHT_TO_LEFT_OVERRIDE; + + /** + * Directional type PDF + * @stable ICU 2.1 + */ + public static final int POP_DIRECTIONAL_FORMAT = 16; + + /** + * JDK-compatible synonym for POP_DIRECTIONAL_FORMAT. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = (byte)POP_DIRECTIONAL_FORMAT; + + /** + * Directional type NSM + * @stable ICU 2.1 + */ + public static final int DIR_NON_SPACING_MARK = 17; + + /** + * JDK-compatible synonym for DIR_NON_SPACING_MARK. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU.
+ */ + @Deprecated + public static final byte DIRECTIONALITY_NON_SPACING_MARK = (byte)DIR_NON_SPACING_MARK; + + /** + * Directional type BN + * @stable ICU 2.1 + */ + public static final int BOUNDARY_NEUTRAL = 18; + + /** + * JDK-compatible synonym for BOUNDARY_NEUTRAL. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = (byte)BOUNDARY_NEUTRAL; + + /** + * Number of directional types + * @stable ICU 2.1 + */ + public static final int CHAR_DIRECTION_COUNT = 19; + + /** + * Undefined bidirectional character type. Undefined char + * values have undefined directionality in the Unicode specification. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_UNDEFINED = -1; + } +} --- old/src/java.base/share/classes/sun/text/bidi/BidiBase.java 2020-01-10 15:57:52.000000000 -0800 +++ /dev/null 2020-01-10 15:57:52.000000000 -0800 @@ -1,4780 +0,0 @@ -/* - * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code).
- * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* -******************************************************************************* -* Copyright (C) 2001-2014, International Business Machines -* Corporation and others. All Rights Reserved. -******************************************************************************* -*/ - -/* FOOD FOR THOUGHT: currently the reordering modes are a mixture of - * algorithm for direct BiDi, algorithm for inverse Bidi and the bizarre - * concept of RUNS_ONLY which is a double operation. - * It could be advantageous to divide this into 3 concepts: - * a) Operation: direct / inverse / RUNS_ONLY - * b) Direct algorithm: default / NUMBERS_SPECIAL / GROUP_NUMBERS_WITH_L - * c) Inverse algorithm: default / INVERSE_LIKE_DIRECT / NUMBERS_SPECIAL - * This would allow combinations not possible today like RUNS_ONLY with - * NUMBERS_SPECIAL. - * Also allow to set INSERT_MARKS for the direct step of RUNS_ONLY and - * REMOVE_CONTROLS for the inverse step. - * Not all combinations would be supported, and probably not all do make sense. - * This would need to document which ones are supported and what are the - * fallbacks for unsupported combinations. - */ - -package sun.text.bidi; - -import java.lang.reflect.Array; -import java.text.AttributedCharacterIterator; -import java.text.Bidi; -import java.util.Arrays; -import jdk.internal.access.JavaAWTFontAccess; -import jdk.internal.access.SharedSecrets; -import sun.text.normalizer.UBiDiProps; -import sun.text.normalizer.UCharacter; -import sun.text.normalizer.UTF16; - -/** - * - *

Bidi algorithm for ICU

- * - * This is an implementation of the Unicode Bidirectional Algorithm. The - * algorithm is defined in the Unicode Standard Annex #9. - *

- * - * Note: Libraries that perform a bidirectional algorithm and reorder strings - * accordingly are sometimes called "Storage Layout Engines". ICU's Bidi and - * shaping (ArabicShaping) classes can be used at the core of such "Storage - * Layout Engines". - * - *

General remarks about the API:

- * - * The "limit" of a sequence of characters is the position just after - * their last character, i.e., one more than that position. - *

- * - * Some of the API methods provide access to "runs". Such a - * "run" is defined as a sequence of characters that are at the same - * embedding level after performing the Bidi algorithm. - * - *

Basic concept: paragraph

- * A piece of text can be divided into several paragraphs by characters - * with the Bidi class Block Separator. For handling of - * paragraphs, see: - *
    - *
  • {@link #countParagraphs} - *
  • {@link #getParaLevel} - *
  • {@link #getParagraph} - *
  • {@link #getParagraphByIndex} - *
- * - *

Basic concept: text direction

- * The direction of a piece of text may be: - *
    - *
  • {@link #LTR} - *
  • {@link #RTL} - *
  • {@link #MIXED} - *
  • {@link #NEUTRAL} - *
- * - *

Basic concept: levels

- * - * Levels in this API represent embedding levels according to the Unicode - * Bidirectional Algorithm. - * Their low-order bit (even/odd value) indicates the visual direction.

- * - * Levels can be abstract values when used for the - * paraLevel and embeddingLevels - * arguments of setPara(); there: - *

    - *
  • the high-order bit of an embeddingLevels[] - * value indicates whether the using application is - * specifying the level of a character to override whatever the - * Bidi implementation would resolve it to.
  • - *
  • paraLevel can be set to the - * pseudo-level values LEVEL_DEFAULT_LTR - * and LEVEL_DEFAULT_RTL.
  • - *
- * - *

The related constants are not real, valid level values. - * DEFAULT_XXX can be used to specify - * a default for the paragraph level for - * when the setPara() method - * shall determine it but there is no - * strongly typed character in the input.

- * - * Note that the value for LEVEL_DEFAULT_LTR is even - * and the one for LEVEL_DEFAULT_RTL is odd, - * just like with normal LTR and RTL level values - - * these special values are designed that way. Also, the implementation - * assumes that MAX_EXPLICIT_LEVEL is odd. - * - *

See Also: - *

    - *
  • {@link #LEVEL_DEFAULT_LTR} - *
  • {@link #LEVEL_DEFAULT_RTL} - *
  • {@link #LEVEL_OVERRIDE} - *
  • {@link #MAX_EXPLICIT_LEVEL} - *
  • {@link #setPara} - *
- * - *

Basic concept: Reordering Mode

- * Reordering mode values indicate which variant of the Bidi algorithm to - * use. - * - *

See Also: - *

    - *
  • {@link #setReorderingMode} - *
  • {@link #REORDER_DEFAULT} - *
  • {@link #REORDER_NUMBERS_SPECIAL} - *
  • {@link #REORDER_GROUP_NUMBERS_WITH_R} - *
  • {@link #REORDER_RUNS_ONLY} - *
  • {@link #REORDER_INVERSE_NUMBERS_AS_L} - *
  • {@link #REORDER_INVERSE_LIKE_DIRECT} - *
  • {@link #REORDER_INVERSE_FOR_NUMBERS_SPECIAL} - *
- * - *

Basic concept: Reordering Options

- * Reordering options can be applied during Bidi text transformations. - * - *

See Also: - *

    - *
  • {@link #setReorderingOptions} - *
  • {@link #OPTION_DEFAULT} - *
  • {@link #OPTION_INSERT_MARKS} - *
  • {@link #OPTION_REMOVE_CONTROLS} - *
  • {@link #OPTION_STREAMING} - *
- * - * - * @author Simon Montagu, Matitiahu Allouche (ported from C code written by Markus W. Scherer) - * @stable ICU 3.8 - * - * - *

Sample code for the ICU Bidi API

- * - *
Rendering a paragraph with the ICU Bidi API
- * - * This is (hypothetical) sample code that illustrates how the ICU Bidi API - * could be used to render a paragraph of text. Rendering code depends highly on - * the graphics system, therefore this sample code must make a lot of - * assumptions, which may or may not match any existing graphics system's - * properties. - * - *

- * The basic assumptions are: - *

- *
    - *
  • Rendering is done from left to right on a horizontal line.
  • - *
  • A run of single-style, unidirectional text can be rendered at once. - *
  • - *
  • Such a run of text is passed to the graphics system with characters - * (code units) in logical order.
  • - *
  • The line-breaking algorithm is very complicated and Locale-dependent - - * and therefore its implementation omitted from this sample code.
  • - *
- * - *
{@code
- *
- *  package com.ibm.icu.dev.test.bidi;
- *
- *  import com.ibm.icu.text.Bidi;
- *  import com.ibm.icu.text.BidiRun;
- *
- *  public class Sample {
- *
- *      static final int styleNormal = 0;
- *      static final int styleSelected = 1;
- *      static final int styleBold = 2;
- *      static final int styleItalics = 4;
- *      static final int styleSuper=8;
- *      static final int styleSub = 16;
- *
- *      static class StyleRun {
- *          int limit;
- *          int style;
- *
- *          public StyleRun(int limit, int style) {
- *              this.limit = limit;
- *              this.style = style;
- *          }
- *      }
- *
- *      static class Bounds {
- *          int start;
- *          int limit;
- *
- *          public Bounds(int start, int limit) {
- *              this.start = start;
- *              this.limit = limit;
- *          }
- *      }
- *
- *      static int getTextWidth(String text, int start, int limit,
- *                              StyleRun[] styleRuns, int styleRunCount) {
- *          // simplistic way to compute the width
- *          return limit - start;
- *      }
- *
- *      // set limit and StyleRun limit for a line
- *      // from text[start] and from styleRuns[styleRunStart]
- *      // using Bidi.getLogicalRun(...)
- *      // returns line width
- *      static int getLineBreak(String text, Bounds line, Bidi para,
- *                              StyleRun styleRuns[], Bounds styleRun) {
- *          // dummy return
- *          return 0;
- *      }
- *
- *      // render runs on a line sequentially, always from left to right
- *
- *      // prepare rendering a new line
- *      static void startLine(byte textDirection, int lineWidth) {
- *          System.out.println();
- *      }
- *
- *      // render a run of text and advance to the right by the run width
- *      // the text[start..limit-1] is always in logical order
- *      static void renderRun(String text, int start, int limit,
- *                            byte textDirection, int style) {
- *      }
- *
- *      // We could compute a cross-product
- *      // from the style runs with the directional runs
- *      // and then reorder it.
- *      // Instead, here we iterate over each run type
- *      // and render the intersections -
- *      // with shortcuts in simple (and common) cases.
- *      // renderParagraph() is the main function.
- *
- *      // render a directional run with
- *      // (possibly) multiple style runs intersecting with it
- *      static void renderDirectionalRun(String text, int start, int limit,
- *                                       byte direction, StyleRun styleRuns[],
- *                                       int styleRunCount) {
- *          int i;
- *
- *          // iterate over style runs
- *          if (direction == Bidi.LTR) {
- *              int styleLimit;
- *              for (i = 0; i < styleRunCount; ++i) {
- *                  styleLimit = styleRuns[i].limit;
- *                  if (start < styleLimit) {
- *                      if (styleLimit > limit) {
- *                          styleLimit = limit;
- *                      }
- *                      renderRun(text, start, styleLimit,
- *                                direction, styleRuns[i].style);
- *                      if (styleLimit == limit) {
- *                          break;
- *                      }
- *                      start = styleLimit;
- *                  }
- *              }
- *          } else {
- *              int styleStart;
- *
- *              for (i = styleRunCount-1; i >= 0; --i) {
- *                  if (i > 0) {
- *                      styleStart = styleRuns[i-1].limit;
- *                  } else {
- *                      styleStart = 0;
- *                  }
- *                  if (limit >= styleStart) {
- *                      if (styleStart < start) {
- *                          styleStart = start;
- *                      }
- *                      renderRun(text, styleStart, limit, direction,
- *                                styleRuns[i].style);
- *                      if (styleStart == start) {
- *                          break;
- *                      }
- *                      limit = styleStart;
- *                  }
- *              }
- *          }
- *      }
- *
- *      // the line object represents text[start..limit-1]
- *      static void renderLine(Bidi line, String text, int start, int limit,
- *                             StyleRun styleRuns[], int styleRunCount) {
- *          byte direction = line.getDirection();
- *          if (direction != Bidi.MIXED) {
- *              // unidirectional
- *              if (styleRunCount <= 1) {
- *                  renderRun(text, start, limit, direction, styleRuns[0].style);
- *              } else {
- *                  renderDirectionalRun(text, start, limit, direction,
- *                                       styleRuns, styleRunCount);
- *              }
- *          } else {
- *              // mixed-directional
- *              int count, i;
- *              BidiRun run;
- *
- *              try {
- *                  count = line.countRuns();
- *              } catch (IllegalStateException e) {
- *                  e.printStackTrace();
- *                  return;
- *              }
- *              if (styleRunCount <= 1) {
- *                  int style = styleRuns[0].style;
- *
- *                  // iterate over directional runs
- *                  for (i = 0; i < count; ++i) {
- *                      run = line.getVisualRun(i);
- *                      renderRun(text, run.getStart(), run.getLimit(),
- *                                run.getDirection(), style);
- *                  }
- *              } else {
- *                  // iterate over both directional and style runs
- *                  for (i = 0; i < count; ++i) {
- *                      run = line.getVisualRun(i);
- *                      renderDirectionalRun(text, run.getStart(),
- *                                           run.getLimit(), run.getDirection(),
- *                                           styleRuns, styleRunCount);
- *                  }
- *              }
- *          }
- *      }
- *
- *      static void renderParagraph(String text, byte textDirection,
- *                                  StyleRun styleRuns[], int styleRunCount,
- *                                  int lineWidth) {
- *          int length = text.length();
- *          Bidi para = new Bidi();
- *          try {
- *              para.setPara(text,
- *                           textDirection != 0 ? Bidi.LEVEL_DEFAULT_RTL
- *                                              : Bidi.LEVEL_DEFAULT_LTR,
- *                           null);
- *          } catch (Exception e) {
- *              e.printStackTrace();
- *              return;
- *          }
- *          byte paraLevel = (byte)(1 & para.getParaLevel());
- *          StyleRun styleRun = new StyleRun(length, styleNormal);
- *
- *          if (styleRuns == null || styleRunCount <= 0) {
- *              styleRuns = new StyleRun[1];
- *              styleRunCount = 1;
- *              styleRuns[0] = styleRun;
- *          }
- *          // assume styleRuns[styleRunCount-1].limit>=length
- *
- *          int width = getTextWidth(text, 0, length, styleRuns, styleRunCount);
- *          if (width <= lineWidth) {
- *              // everything fits onto one line
- *
- *              // prepare rendering a new line from either left or right
- *              startLine(paraLevel, width);
- *
- *              renderLine(para, text, 0, length, styleRuns, styleRunCount);
- *          } else {
- *              // we need to render several lines
- *              Bidi line = new Bidi(length, 0);
- *              int start = 0, limit;
- *              int styleRunStart = 0, styleRunLimit;
- *
- *              for (;;) {
- *                  limit = length;
- *                  styleRunLimit = styleRunCount;
- *                  width = getLineBreak(text, new Bounds(start, limit),
- *                                       para, styleRuns,
- *                                       new Bounds(styleRunStart, styleRunLimit));
- *                  try {
- *                      line = para.setLine(start, limit);
- *                  } catch (Exception e) {
- *                      e.printStackTrace();
- *                      return;
- *                  }
- *                  // prepare rendering a new line
- *                  // from either left or right
- *                  startLine(paraLevel, width);
- *
- *                  if (styleRunStart > 0) {
- *                      int newRunCount = styleRuns.length - styleRunStart;
- *                      StyleRun[] newRuns = new StyleRun[newRunCount];
- *                      System.arraycopy(styleRuns, styleRunStart, newRuns, 0,
- *                                       newRunCount);
- *                      renderLine(line, text, start, limit, newRuns,
- *                                 styleRunLimit - styleRunStart);
- *                  } else {
- *                      renderLine(line, text, start, limit, styleRuns,
- *                                 styleRunLimit - styleRunStart);
- *                  }
- *                  if (limit == length) {
- *                      break;
- *                  }
- *                  start = limit;
- *                  styleRunStart = styleRunLimit - 1;
- *                  if (start >= styleRuns[styleRunStart].limit) {
- *                      ++styleRunStart;
- *                  }
- *              }
- *          }
- *      }
- *
- *      public static void main(String[] args)
- *      {
- *          renderParagraph("Some Latin text...", Bidi.LTR, null, 0, 80);
- *          renderParagraph("Some Hebrew text...", Bidi.RTL, null, 0, 60);
- *      }
- *  }
- *
- * }
- */ - -/* - * General implementation notes: - * - * Throughout the implementation, there are comments like (W2) that refer to - * rules of the BiDi algorithm, in this example to the second rule of the - * resolution of weak types. - * - * For handling surrogate pairs, where two UChar's form one "abstract" (or UTF-32) - * character according to UTF-16, the second UChar gets the directional property of - * the entire character assigned, while the first one gets a BN, a boundary - * neutral, type, which is ignored by most of the algorithm according to - * rule (X9) and the implementation suggestions of the BiDi algorithm. - * - * Later, adjustWSLevels() will set the level for each BN to that of the - * following character (UChar), which results in surrogate pairs getting the - * same level on each of their surrogates. - * - * In a UTF-8 implementation, the same thing could be done: the last byte of - * a multi-byte sequence would get the "real" property, while all previous - * bytes of that sequence would get BN. - * - * It is not possible to assign all those parts of a character the same real - * property because this would fail in the resolution of weak types with rules - * that look at immediately surrounding types. - * - * As a related topic, this implementation does not remove Boundary Neutral - * types from the input, but ignores them wherever this is relevant. - * For example, the loop for the resolution of the weak types reads - * types until it finds a non-BN. - * Also, explicit embedding codes are neither changed into BN nor removed. - * They are only treated the same way real BNs are. - * As stated before, adjustWSLevels() takes care of them at the end. - * For the purpose of conformance, the levels of all these codes - * do not matter. - * - * Note that this implementation modifies the dirProps - * after the initial setup, when applying X5c (replace FSI by LRI or RLI), - * X6, N0 (replace paired brackets by L or R). 
- * - * In this implementation, the resolution of weak types (W1 to W6), - * neutrals (N1 and N2), and the assignment of the resolved level (In) - * are all done in one single loop, in resolveImplicitLevels(). - * Changes of dirProp values are done on the fly, without writing - * them back to the dirProps array. - * - * - * This implementation contains code that allows to bypass steps of the - * algorithm that are not needed on the specific paragraph - * in order to speed up the most common cases considerably, - * like text that is entirely LTR, or RTL text without numbers. - * - * Most of this is done by setting a bit for each directional property - * in a flags variable and later checking for whether there are - * any LTR characters or any RTL characters, or both, whether - * there are any explicit embedding codes, etc. - * - * If the (Xn) steps are performed, then the flags are re-evaluated, - * because they will then not contain the embedding codes any more - * and will be adjusted for override codes, so that subsequently - * more bypassing may be possible than what the initial flags suggested. - * - * If the text is not mixed-directional, then the - * algorithm steps for the weak type resolution are not performed, - * and all levels are set to the paragraph level. - * - * If there are no explicit embedding codes, then the (Xn) steps - * are not performed. - * - * If embedding levels are supplied as a parameter, then all - * explicit embedding codes are ignored, and the (Xn) steps - * are not performed. - * - * White Space types could get the level of the run they belong to, - * and are checked with a test of (flags&MASK_EMBEDDING) to - * consider if the paragraph direction should be considered in - * the flags variable. - * - * If there are no White Space types in the paragraph, then - * (L1) is not necessary in adjustWSLevels(). 
- */ - -public class BidiBase { - - static class Point { - int pos; /* position in text */ - int flag; /* flag for LRM/RLM, before/after */ - } - - static class InsertPoints { - int size; - int confirmed; - Point[] points = new Point[0]; - } - - static class Opening { - int position; /* position of opening bracket */ - int match; /* matching char or -position of closing bracket */ - int contextPos; /* position of last strong char found before opening */ - short flags; /* bits for L or R/AL found within the pair */ - byte contextDir; /* L or R according to last strong char before opening */ - } - - static class IsoRun { - int contextPos; /* position of char determining context */ - short start; /* index of first opening entry for this run */ - short limit; /* index after last opening entry for this run */ - byte level; /* level of this run */ - byte lastStrong; /* bidi class of last strong char found in this run */ - byte lastBase; /* bidi class of last base char found in this run */ - byte contextDir; /* L or R to use as context for following openings */ - } - - static class BracketData { - Opening[] openings = new Opening[SIMPLE_PARAS_COUNT]; - int isoRunLast; /* index of last used entry */ - /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL - + 1 for index 0, + 1 for before the first isolated sequence */ - IsoRun[] isoRuns = new IsoRun[MAX_EXPLICIT_LEVEL+2]; - boolean isNumbersSpecial; /*reordering mode for NUMBERS_SPECIAL */ - } - - static class Isolate { - int startON; - int start1; - short stateImp; - short state; - } - - /** Paragraph level setting

- * - * Constant indicating that the base direction depends on the first strong - * directional character in the text according to the Unicode Bidirectional - * Algorithm. If no strong directional character is present, - * then set the paragraph level to 0 (left-to-right).

- * - * If this value is used in conjunction with reordering modes - * REORDER_INVERSE_LIKE_DIRECT or - * REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder - * is assumed to be visual LTR, and the text after reordering is required - * to be the corresponding logical string with appropriate contextual - * direction. The direction of the result string will be RTL if either - * the rightmost or leftmost strong character of the source text is RTL - * or Arabic Letter, the direction will be LTR otherwise.

- * - * If reordering option OPTION_INSERT_MARKS is set, an RLM may - * be added at the beginning of the result string to ensure round trip - * (that the result string, when reordered back to visual, will produce - * the original source text). - * @see #REORDER_INVERSE_LIKE_DIRECT - * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL - * @stable ICU 3.8 - */ - public static final byte LEVEL_DEFAULT_LTR = (byte)0x7e; - - /** Paragraph level setting

- * - * Constant indicating that the base direction depends on the first strong - * directional character in the text according to the Unicode Bidirectional - * Algorithm. If no strong directional character is present, - * then set the paragraph level to 1 (right-to-left).

- * - * If this value is used in conjunction with reordering modes - * REORDER_INVERSE_LIKE_DIRECT or - * REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder - * is assumed to be visual LTR, and the text after reordering is required - * to be the corresponding logical string with appropriate contextual - * direction. The direction of the result string will be RTL if either - * the rightmost or leftmost strong character of the source text is RTL - * or Arabic Letter, or if the text contains no strong character; - * the direction will be LTR otherwise.

- * - * If reordering option OPTION_INSERT_MARKS is set, an RLM may - * be added at the beginning of the result string to ensure round trip - * (that the result string, when reordered back to visual, will produce - * the original source text). - * @see #REORDER_INVERSE_LIKE_DIRECT - * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL - * @stable ICU 3.8 - */ - public static final byte LEVEL_DEFAULT_RTL = (byte)0x7f; - - /** - * Maximum explicit embedding level. - * (The maximum resolved level can be up to MAX_EXPLICIT_LEVEL+1). - * @stable ICU 3.8 - */ - public static final byte MAX_EXPLICIT_LEVEL = 125; - - /** - * Bit flag for level input. - * Overrides directional properties. - * @stable ICU 3.8 - */ - public static final byte LEVEL_OVERRIDE = (byte)0x80; - - /** - * Special value which can be returned by the mapping methods when a - * logical index has no corresponding visual index or vice-versa. This may - * happen for the logical-to-visual mapping of a Bidi control when option - * OPTION_REMOVE_CONTROLS is - * specified. This can also happen for the visual-to-logical mapping of a - * Bidi mark (LRM or RLM) inserted by option - * OPTION_INSERT_MARKS. - * @see #getVisualIndex - * @see #getVisualMap - * @see #getLogicalIndex - * @see #getLogicalMap - * @see #OPTION_INSERT_MARKS - * @see #OPTION_REMOVE_CONTROLS - * @stable ICU 3.8 - */ - public static final int MAP_NOWHERE = -1; - - /** - * Left-to-right text. - *

    - *
  • As return value for getDirection(), it means - * that the source string contains no right-to-left characters, or - * that the source string is empty and the paragraph level is even. - *
  • As return value for getBaseDirection(), it - * means that the first strong character of the source string has - * a left-to-right direction. - *
- * @stable ICU 3.8 - */ - public static final byte LTR = 0; - - /** - * Right-to-left text. - *
    - *
  • As return value for getDirection(), it means - * that the source string contains no left-to-right characters, or - * that the source string is empty and the paragraph level is odd. - *
  • As return value for getBaseDirection(), it - * means that the first strong character of the source string has - * a right-to-left direction. - *
- * @stable ICU 3.8 - */ - public static final byte RTL = 1; - - /** - * Mixed-directional text. - *

As return value for getDirection(), it means - * that the source string contains both left-to-right and - * right-to-left characters. - * @stable ICU 3.8 - */ - public static final byte MIXED = 2; - - /** - * option bit for writeReordered(): - * keep combining characters after their base characters in RTL runs - * - * @see #writeReordered - * @stable ICU 3.8 - */ - public static final short KEEP_BASE_COMBINING = 1; - - /** - * option bit for writeReordered(): - * replace characters with the "mirrored" property in RTL runs - * by their mirror-image mappings - * - * @see #writeReordered - * @stable ICU 3.8 - */ - public static final short DO_MIRRORING = 2; - - /** - * option bit for writeReordered(): - * surround the run with LRMs if necessary; - * this is part of the approximate "inverse Bidi" algorithm - * - *

This option does not imply corresponding adjustment of the index - * mappings.

- * - * @see #setInverse - * @see #writeReordered - * @stable ICU 3.8 - */ - public static final short INSERT_LRM_FOR_NUMERIC = 4; - - /** - * option bit for writeReordered(): - * remove Bidi control characters - * (this does not affect INSERT_LRM_FOR_NUMERIC) - * - *

This option does not imply corresponding adjustment of the index - * mappings.

- * - * @see #writeReordered - * @see #INSERT_LRM_FOR_NUMERIC - * @stable ICU 3.8 - */ - public static final short REMOVE_BIDI_CONTROLS = 8; - - /** - * option bit for writeReordered(): - * write the output in reverse order - * - *

This has the same effect as calling writeReordered() - * first without this option, and then calling - * writeReverse() without mirroring. - * Doing this in the same step is faster and avoids a temporary buffer. - * An example for using this option is output to a character terminal that - * is designed for RTL scripts and stores text in reverse order.

- * - * @see #writeReordered - * @stable ICU 3.8 - */ - public static final short OUTPUT_REVERSE = 16; - - /** Reordering mode: Regular Logical to Visual Bidi algorithm according to Unicode. - * @see #setReorderingMode - * @stable ICU 3.8 - */ - private static final short REORDER_DEFAULT = 0; - - /** Reordering mode: Logical to Visual algorithm which handles numbers in - * a way which mimicks the behavior of Windows XP. - * @see #setReorderingMode - * @stable ICU 3.8 - */ - private static final short REORDER_NUMBERS_SPECIAL = 1; - - /** Reordering mode: Logical to Visual algorithm grouping numbers with - * adjacent R characters (reversible algorithm). - * @see #setReorderingMode - * @stable ICU 3.8 - */ - private static final short REORDER_GROUP_NUMBERS_WITH_R = 2; - - /** Reordering mode: Reorder runs only to transform a Logical LTR string - * to the logical RTL string with the same display, or vice-versa.
- * If this mode is set together with option - * OPTION_INSERT_MARKS, some Bidi controls in the source - * text may be removed and other controls may be added to produce the - * minimum combination which has the required display. - * @see #OPTION_INSERT_MARKS - * @see #setReorderingMode - * @stable ICU 3.8 - */ - static final short REORDER_RUNS_ONLY = 3; - - /** Reordering mode: Visual to Logical algorithm which handles numbers - * like L (same algorithm as selected by setInverse(true). - * @see #setInverse - * @see #setReorderingMode - * @stable ICU 3.8 - */ - static final short REORDER_INVERSE_NUMBERS_AS_L = 4; - - /** Reordering mode: Visual to Logical algorithm equivalent to the regular - * Logical to Visual algorithm. - * @see #setReorderingMode - * @stable ICU 3.8 - */ - static final short REORDER_INVERSE_LIKE_DIRECT = 5; - - /** Reordering mode: Inverse Bidi (Visual to Logical) algorithm for the - * REORDER_NUMBERS_SPECIAL Bidi algorithm. - * @see #setReorderingMode - * @stable ICU 3.8 - */ - static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6; - - /* Reordering mode values must be ordered so that all the regular logical to - * visual modes come first, and all inverse Bidi modes come last. - */ - private static final short REORDER_LAST_LOGICAL_TO_VISUAL = - REORDER_NUMBERS_SPECIAL; - - /** - * Option bit for setReorderingOptions: - * insert Bidi marks (LRM or RLM) when needed to ensure correct result of - * a reordering to a Logical order - * - *

This option must be set or reset before calling - * setPara.

- * - *

This option is significant only with reordering modes which generate - * a result with Logical order, specifically.

- *
    - *
  • REORDER_RUNS_ONLY
  • - *
  • REORDER_INVERSE_NUMBERS_AS_L
  • - *
  • REORDER_INVERSE_LIKE_DIRECT
  • - *
  • REORDER_INVERSE_FOR_NUMBERS_SPECIAL
  • - *
- * - *

If this option is set in conjunction with reordering mode - * REORDER_INVERSE_NUMBERS_AS_L or with calling - * setInverse(true), it implies option - * INSERT_LRM_FOR_NUMERIC in calls to method - * writeReordered().

- * - *

For other reordering modes, a minimum number of LRM or RLM characters - * will be added to the source text after reordering it so as to ensure - * round trip, i.e. when applying the inverse reordering mode on the - * resulting logical text with removal of Bidi marks - * (option OPTION_REMOVE_CONTROLS set before calling - * setPara() or option - * REMOVE_BIDI_CONTROLS in - * writeReordered), the result will be identical to the - * source text in the first transformation. - * - *

This option will be ignored if specified together with option - * OPTION_REMOVE_CONTROLS. It inhibits option - * REMOVE_BIDI_CONTROLS in calls to method - * writeReordered() and it implies option - * INSERT_LRM_FOR_NUMERIC in calls to method - * writeReordered() if the reordering mode is - * REORDER_INVERSE_NUMBERS_AS_L.

- * - * @see #setReorderingMode - * @see #setReorderingOptions - * @see #INSERT_LRM_FOR_NUMERIC - * @see #REMOVE_BIDI_CONTROLS - * @see #OPTION_REMOVE_CONTROLS - * @see #REORDER_RUNS_ONLY - * @see #REORDER_INVERSE_NUMBERS_AS_L - * @see #REORDER_INVERSE_LIKE_DIRECT - * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL - * @stable ICU 3.8 - */ - static final int OPTION_INSERT_MARKS = 1; - - /** - * Option bit for setReorderingOptions: - * remove Bidi control characters - * - *

This option must be set or reset before calling - * setPara.

- * - *

This option nullifies option - * OPTION_INSERT_MARKS. It inhibits option - * INSERT_LRM_FOR_NUMERIC in calls to method - * writeReordered() and it implies option - * REMOVE_BIDI_CONTROLS in calls to that method.

- * - * @see #setReorderingMode - * @see #setReorderingOptions - * @see #OPTION_INSERT_MARKS - * @see #INSERT_LRM_FOR_NUMERIC - * @see #REMOVE_BIDI_CONTROLS - * @stable ICU 3.8 - */ - static final int OPTION_REMOVE_CONTROLS = 2; - - /** - * Option bit for setReorderingOptions: - * process the output as part of a stream to be continued - * - *

This option must be set or reset before calling - * setPara.

- * - *

This option specifies that the caller is interested in processing - * large text object in parts. The results of the successive calls are - * expected to be concatenated by the caller. Only the call for the last - * part will have this option bit off.

- * - *

When this option bit is on, setPara() may process - * less than the full source text in order to truncate the text at a - * meaningful boundary. The caller should call - * getProcessedLength() immediately after calling - * setPara() in order to determine how much of the source - * text has been processed. Source text beyond that length should be - * resubmitted in following calls to setPara. The - * processed length may be less than the length of the source text if a - * character preceding the last character of the source text constitutes a - * reasonable boundary (like a block separator) for text to be continued.
- * If the last character of the source text constitutes a reasonable - * boundary, the whole text will be processed at once.
- * If nowhere in the source text there exists - * such a reasonable boundary, the processed length will be zero.
- * The caller should check for such an occurrence and do one of the following: - *

  • submit a larger amount of text with a better chance to include - * a reasonable boundary.
  • - *
  • resubmit the same text after turning off option - * OPTION_STREAMING.
- * In all cases, this option should be turned off before processing the last - * part of the text.

- * - *

When the OPTION_STREAMING option is used, it is - * recommended to call orderParagraphsLTR(true) before calling - * setPara() so that later paragraphs may be concatenated to - * previous paragraphs on the right. - *

- * - * @see #setReorderingMode - * @see #setReorderingOptions - * @see #getProcessedLength - * @stable ICU 3.8 - */ - private static final int OPTION_STREAMING = 4; - - /* - * Comparing the description of the Bidi algorithm with this implementation - * is easier with the same names for the Bidi types in the code as there. - * See UCharacterDirection - */ - /* private */ static final byte L = 0; - private static final byte R = 1; - private static final byte EN = 2; - private static final byte ES = 3; - private static final byte ET = 4; - private static final byte AN = 5; - private static final byte CS = 6; - static final byte B = 7; - private static final byte S = 8; - private static final byte WS = 9; - private static final byte ON = 10; - private static final byte LRE = 11; - private static final byte LRO = 12; - private static final byte AL = 13; - private static final byte RLE = 14; - private static final byte RLO = 15; - private static final byte PDF = 16; - private static final byte NSM = 17; - private static final byte BN = 18; - private static final byte FSI = 19; - private static final byte LRI = 20; - private static final byte RLI = 21; - private static final byte PDI = 22; - private static final byte ENL = PDI + 1; /* EN after W7 */ - private static final byte ENR = ENL + 1; /* EN not subject to W7 */ - - // Number of directional types - private static final int CHAR_DIRECTION_COUNT = 23; - - /** - * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). - * Used in UAX #9: Unicode Bidirectional Algorithm - * (http://www.unicode.org/reports/tr9/) - * Returns UCharacter.BidiPairedBracketType values. - * @stable ICU 52 - */ - public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; - - /** - * Bidi Paired Bracket Type constants. - * - * @see UProperty#BIDI_PAIRED_BRACKET_TYPE - * @stable ICU 52 - */ - public static interface BidiPairedBracketType { - /** - * Not a paired bracket. 
- * @stable ICU 52 - */ - public static final int NONE = 0; - /** - * Open paired bracket. - * @stable ICU 52 - */ - public static final int OPEN = 1; - /** - * Close paired bracket. - * @stable ICU 52 - */ - public static final int CLOSE = 2; - /** - * @stable ICU 52 - */ - public static final int COUNT = 3; - } - - /* number of paras entries allocated initially */ - static final int SIMPLE_PARAS_COUNT = 10; - - private static final char CR = '\r'; - private static final char LF = '\n'; - - static final int LRM_BEFORE = 1; - static final int LRM_AFTER = 2; - static final int RLM_BEFORE = 4; - static final int RLM_AFTER = 8; - - /* flags for Opening.flags */ - static final byte FOUND_L = (byte)DirPropFlag(L); - static final byte FOUND_R = (byte)DirPropFlag(R); - - /* - * The following bit is used for the directional isolate status. - * Stack entries corresponding to isolate sequences are greater than ISOLATE. - */ - static final int ISOLATE = 0x0100; - - /* - * reference to parent paragraph object (reference to self if this object is - * a paragraph object); set to null in a newly opened object; set to a - * real value after a successful execution of setPara or setLine - */ - BidiBase paraBidi; - - final UBiDiProps bdp; - - /* character array representing the current text */ - char[] text; - - /* length of the current text */ - int originalLength; - - /* if the option OPTION_STREAMING is set, this is the length of - * text actually processed by setPara, which may be shorter - * than the original length. Otherwise, it is identical to the original - * length. - */ - public int length; - - /* if option OPTION_REMOVE_CONTROLS is set, and/or Bidi - * marks are allowed to be inserted in one of the reordering modes, the - * length of the result string may be different from the processed length. 
- */ - int resultLength; - - /* indicators for whether memory may be allocated after construction */ - boolean mayAllocateText; - boolean mayAllocateRuns; - - /* arrays with one value per text-character */ - byte[] dirPropsMemory = new byte[1]; - byte[] levelsMemory = new byte[1]; - byte[] dirProps; - byte[] levels; - - /* are we performing an approximation of the "inverse Bidi" algorithm? */ - boolean isInverse; - - /* are we using the basic algorithm or its variation? */ - int reorderingMode; - - /* bitmask for reordering options */ - int reorderingOptions; - - /* must block separators receive level 0? */ - boolean orderParagraphsLTR; - - /* the paragraph level */ - byte paraLevel; - - /* original paraLevel when contextual */ - /* must be one of DEFAULT_xxx or 0 if not contextual */ - byte defaultParaLevel; - - /* the following is set in setPara, used in processPropertySeq */ - - ImpTabPair impTabPair; /* reference to levels state table pair */ - - /* the overall paragraph or line directionality*/ - byte direction; - - /* flags is a bit set for which directional properties are in the text */ - int flags; - - /* lastArabicPos is index to the last AL in the text, -1 if none */ - int lastArabicPos; - - /* characters after trailingWSStart are WS and are */ - /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */ - int trailingWSStart; - - /* fields for paragraph handling, set in getDirProps() */ - int paraCount; - int[] paras_limit = new int[SIMPLE_PARAS_COUNT]; - byte[] paras_level = new byte[SIMPLE_PARAS_COUNT]; - - /* fields for line reordering */ - int runCount; /* ==-1: runs not set up yet */ - BidiRun[] runsMemory = new BidiRun[0]; - BidiRun[] runs; - - /* for non-mixed text, we only need a tiny array of runs (no allocation) */ - BidiRun[] simpleRuns = {new BidiRun()}; - - /* fields for managing isolate sequences */ - Isolate[] isolates; - - /* maximum or current nesting depth of isolate sequences */ - /* Within resolveExplicitLevels() and 
checkExplicitLevels(), this is the maximal - nesting encountered. - Within resolveImplicitLevels(), this is the index of the current isolates - stack entry. */ - int isolateCount; - - /* mapping of runs in logical order to visual order */ - int[] logicalToVisualRunsMap; - /* flag to indicate that the map has been updated */ - boolean isGoodLogicalToVisualRunsMap; - - /* for inverse Bidi with insertion of directional marks */ - InsertPoints insertPoints = new InsertPoints(); - - /* for option OPTION_REMOVE_CONTROLS */ - int controlCount; - - /* - * Sometimes, bit values are more appropriate - * to deal with directionality properties. - * Abbreviations in these method names refer to names - * used in the Bidi algorithm. - */ - static int DirPropFlag(byte dir) { - return (1 << dir); - } - - boolean testDirPropFlagAt(int flag, int index) { - return ((DirPropFlag(dirProps[index]) & flag) != 0); - } - - static final int DirPropFlagMultiRuns = DirPropFlag((byte)31); - - /* to avoid some conditional statements, use tiny constant arrays */ - static final int DirPropFlagLR[] = { DirPropFlag(L), DirPropFlag(R) }; - static final int DirPropFlagE[] = { DirPropFlag(LRE), DirPropFlag(RLE) }; - static final int DirPropFlagO[] = { DirPropFlag(LRO), DirPropFlag(RLO) }; - - static final int DirPropFlagLR(byte level) { return DirPropFlagLR[level & 1]; } - static final int DirPropFlagE(byte level) { return DirPropFlagE[level & 1]; } - static final int DirPropFlagO(byte level) { return DirPropFlagO[level & 1]; } - static final byte DirFromStrong(byte strong) { return strong == L ? L : R; } - static final byte NoOverride(byte level) { return (byte)(level & ~LEVEL_OVERRIDE); } - - /* are there any characters that are LTR or RTL? 
*/ - static final int MASK_LTR = - DirPropFlag(L)|DirPropFlag(EN)|DirPropFlag(ENL)|DirPropFlag(ENR)|DirPropFlag(AN)|DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(LRI); - static final int MASK_RTL = DirPropFlag(R)|DirPropFlag(AL)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(RLI); - - static final int MASK_R_AL = DirPropFlag(R)|DirPropFlag(AL); - - /* explicit embedding codes */ - private static final int MASK_EXPLICIT = DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(PDF); - private static final int MASK_BN_EXPLICIT = DirPropFlag(BN)|MASK_EXPLICIT; - - /* explicit isolate codes */ - private static final int MASK_ISO = DirPropFlag(LRI)|DirPropFlag(RLI)|DirPropFlag(FSI)|DirPropFlag(PDI); - - /* paragraph and segment separators */ - private static final int MASK_B_S = DirPropFlag(B)|DirPropFlag(S); - - /* all types that are counted as White Space or Neutral in some steps */ - static final int MASK_WS = MASK_B_S|DirPropFlag(WS)|MASK_BN_EXPLICIT|MASK_ISO; - - /* types that are neutrals or could becomes neutrals in (Wn) */ - private static final int MASK_POSSIBLE_N = DirPropFlag(ON)|DirPropFlag(CS)|DirPropFlag(ES)|DirPropFlag(ET)|MASK_WS; - - /* - * These types may be changed to "e", - * the embedding type (L or R) of the run, - * in the Bidi algorithm (N2) - */ - private static final int MASK_EMBEDDING = DirPropFlag(NSM)|MASK_POSSIBLE_N; - - /* - * the dirProp's L and R are defined to 0 and 1 values in UCharacterDirection.java - */ - private static byte GetLRFromLevel(byte level) - { - return (byte)(level & 1); - } - - private static boolean IsDefaultLevel(byte level) - { - return ((level & LEVEL_DEFAULT_LTR) == LEVEL_DEFAULT_LTR); - } - - static boolean IsBidiControlChar(int c) - { - /* check for range 0x200c to 0x200f (ZWNJ, ZWJ, LRM, RLM) or - 0x202a to 0x202e (LRE, RLE, PDF, LRO, RLO) */ - return (((c & 0xfffffffc) == 0x200c) || ((c >= 0x202a) && (c <= 0x202e)) - || ((c >= 0x2066) && (c <= 0x2069))); - } - - void verifyValidPara() - { 
- if (!(this == this.paraBidi)) { - throw new IllegalStateException(); - } - } - - void verifyValidParaOrLine() - { - BidiBase para = this.paraBidi; - /* verify Para */ - if (this == para) { - return; - } - /* verify Line */ - if ((para == null) || (para != para.paraBidi)) { - throw new IllegalStateException(); - } - } - - void verifyRange(int index, int start, int limit) - { - if (index < start || index >= limit) { - throw new IllegalArgumentException("Value " + index + - " is out of range " + start + " to " + limit); - } - } - - /** - * Allocate a Bidi object with preallocated memory - * for internal structures. - * This method provides a Bidi object like the default constructor - * but it also preallocates memory for internal structures - * according to the sizings supplied by the caller.

- * The preallocation can be limited to some of the internal memory - * by setting some values to 0 here. That means that if, e.g., - * maxRunCount cannot be reasonably predetermined and should not - * be set to maxLength (the only failproof value) to avoid - * wasting memory, then maxRunCount could be set to 0 here - * and the internal structures that are associated with it will be allocated - * on demand, just like with the default constructor. - * - * @param maxLength is the maximum text or line length that internal memory - * will be preallocated for. An attempt to associate this object with a - * longer text will fail, unless this value is 0, which leaves the allocation - * up to the implementation. - * - * @param maxRunCount is the maximum anticipated number of same-level runs - * that internal memory will be preallocated for. An attempt to access - * visual runs on an object that was not preallocated for as many runs - * as the text was actually resolved to will fail, - * unless this value is 0, which leaves the allocation up to the implementation.

- * The number of runs depends on the actual text and maybe anywhere between - * 1 and maxLength. It is typically small. - * - * @throws IllegalArgumentException if maxLength or maxRunCount is less than 0 - * @stable ICU 3.8 - */ - public BidiBase(int maxLength, int maxRunCount) - { - /* check the argument values */ - if (maxLength < 0 || maxRunCount < 0) { - throw new IllegalArgumentException(); - } - - /* reset the object, all reference variables null, all flags false, - all sizes 0. - In fact, we don't need to do anything, since class members are - initialized as zero when an instance is created. - */ - /* - mayAllocateText = false; - mayAllocateRuns = false; - orderParagraphsLTR = false; - paraCount = 0; - runCount = 0; - trailingWSStart = 0; - flags = 0; - paraLevel = 0; - defaultParaLevel = 0; - direction = 0; - */ - /* get Bidi properties */ - bdp = UBiDiProps.INSTANCE; - - /* allocate memory for arrays as requested */ - if (maxLength > 0) { - getInitialDirPropsMemory(maxLength); - getInitialLevelsMemory(maxLength); - } else { - mayAllocateText = true; - } - - if (maxRunCount > 0) { - // if maxRunCount == 1, use simpleRuns[] - if (maxRunCount > 1) { - getInitialRunsMemory(maxRunCount); - } - } else { - mayAllocateRuns = true; - } - } - - /* - * We are allowed to allocate memory if object==null or - * mayAllocate==true for each array that we need. - * - * Assume sizeNeeded>0. - * If object != null, then assume size > 0. 
- */ - private Object getMemory(String label, Object array, Class arrayClass, - boolean mayAllocate, int sizeNeeded) - { - int len = Array.getLength(array); - - /* we have at least enough memory and must not allocate */ - if (sizeNeeded == len) { - return array; - } - if (!mayAllocate) { - /* we must not allocate */ - if (sizeNeeded <= len) { - return array; - } - throw new OutOfMemoryError("Failed to allocate memory for " - + label); - } - /* we may try to grow or shrink */ - /* FOOD FOR THOUGHT: when shrinking it should be possible to avoid - the allocation altogether and rely on this.length */ - try { - return Array.newInstance(arrayClass, sizeNeeded); - } catch (Exception e) { - throw new OutOfMemoryError("Failed to allocate memory for " - + label); - } - } - - /* helper methods for each allocated array */ - private void getDirPropsMemory(boolean mayAllocate, int len) - { - Object array = getMemory("DirProps", dirPropsMemory, Byte.TYPE, mayAllocate, len); - dirPropsMemory = (byte[]) array; - } - - void getDirPropsMemory(int len) - { - getDirPropsMemory(mayAllocateText, len); - } - - private void getLevelsMemory(boolean mayAllocate, int len) - { - Object array = getMemory("Levels", levelsMemory, Byte.TYPE, mayAllocate, len); - levelsMemory = (byte[]) array; - } - - void getLevelsMemory(int len) - { - getLevelsMemory(mayAllocateText, len); - } - - private void getRunsMemory(boolean mayAllocate, int len) - { - Object array = getMemory("Runs", runsMemory, BidiRun.class, mayAllocate, len); - runsMemory = (BidiRun[]) array; - } - - void getRunsMemory(int len) - { - getRunsMemory(mayAllocateRuns, len); - } - - /* additional methods used by constructor - always allow allocation */ - private void getInitialDirPropsMemory(int len) - { - getDirPropsMemory(true, len); - } - - private void getInitialLevelsMemory(int len) - { - getLevelsMemory(true, len); - } - - private void getInitialRunsMemory(int len) - { - getRunsMemory(true, len); - } - - /** - * Is this Bidi object 
set to perform the inverse Bidi - * algorithm? - *

Note: calling this method after setting the reordering mode with - * setReorderingMode will return true if the - * reordering mode was set to - * REORDER_INVERSE_NUMBERS_AS_L, false - * for all other values.

- * - * @return true if the Bidi object is set to - * perform the inverse Bidi algorithm by handling numbers as L. - * - * @see #setInverse - * @see #setReorderingMode - * @see #REORDER_INVERSE_NUMBERS_AS_L - * @stable ICU 3.8 - */ - public boolean isInverse() { - return isInverse; - } - - /* perform (P2)..(P3) ------------------------------------------------------- */ - - /* - * Check that there are enough entries in the arrays paras_limit and paras_level - */ - private void checkParaCount() { - int[] saveLimits; - byte[] saveLevels; - int count = paraCount; - if (count <= paras_level.length) - return; - int oldLength = paras_level.length; - saveLimits = paras_limit; - saveLevels = paras_level; - try { - paras_limit = new int[count * 2]; - paras_level = new byte[count * 2]; - } catch (Exception e) { - throw new OutOfMemoryError("Failed to allocate memory for paras"); - } - System.arraycopy(saveLimits, 0, paras_limit, 0, oldLength); - System.arraycopy(saveLevels, 0, paras_level, 0, oldLength); - } - - /* - * Get the directional properties for the text, calculate the flags bit-set, and - * determine the paragraph level if necessary (in paras_level[i]). - * FSI initiators are also resolved and their dirProp replaced with LRI or RLI. - * When encountering an FSI, it is initially replaced with an LRI, which is the - * default. Only if a strong R or AL is found within its scope will the LRI be - * replaced by an RLI. 
- */ - static final int NOT_SEEKING_STRONG = 0; /* 0: not contextual paraLevel, not after FSI */ - static final int SEEKING_STRONG_FOR_PARA = 1; /* 1: looking for first strong char in para */ - static final int SEEKING_STRONG_FOR_FSI = 2; /* 2: looking for first strong after FSI */ - static final int LOOKING_FOR_PDI = 3; /* 3: found strong after FSI, looking for PDI */ - - private void getDirProps() - { - int i = 0, i0, i1; - flags = 0; /* collect all directionalities in the text */ - int uchar; - byte dirProp; - byte defaultParaLevel = 0; /* initialize to avoid compiler warnings */ - boolean isDefaultLevel = IsDefaultLevel(paraLevel); - /* for inverse Bidi, the default para level is set to RTL if there is a - strong R or AL character at either end of the text */ - boolean isDefaultLevelInverse=isDefaultLevel && - (reorderingMode == REORDER_INVERSE_LIKE_DIRECT || - reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL); - lastArabicPos = -1; - int controlCount = 0; - boolean removeBidiControls = (reorderingOptions & OPTION_REMOVE_CONTROLS) != 0; - - byte state; - byte lastStrong = ON; /* for default level & inverse Bidi */ - /* The following stacks are used to manage isolate sequences. Those - sequences may be nested, but obviously never more deeply than the - maximum explicit embedding level. - lastStack is the index of the last used entry in the stack. A value of -1 - means that there is no open isolate sequence. - lastStack is reset to -1 on paragraph boundaries. 
*/ - /* The following stack contains the position of the initiator of - each open isolate sequence */ - int[] isolateStartStack= new int[MAX_EXPLICIT_LEVEL+1]; - /* The following stack contains the last known state before - encountering the initiator of an isolate sequence */ - byte[] previousStateStack = new byte[MAX_EXPLICIT_LEVEL+1]; - int stackLast=-1; - - if ((reorderingOptions & OPTION_STREAMING) != 0) - length = 0; - defaultParaLevel = (byte)(paraLevel & 1); - - if (isDefaultLevel) { - paras_level[0] = defaultParaLevel; - lastStrong = defaultParaLevel; - state = SEEKING_STRONG_FOR_PARA; - } else { - paras_level[0] = paraLevel; - state = NOT_SEEKING_STRONG; - } - /* count paragraphs and determine the paragraph level (P2..P3) */ - /* - * see comment on constant fields: - * the LEVEL_DEFAULT_XXX values are designed so that - * their low-order bit alone yields the intended default - */ - - for (i = 0; i < originalLength; /* i is incremented in the loop */) { - i0 = i; /* index of first code unit */ - uchar = UTF16.charAt(text, 0, originalLength, i); - i += UTF16.getCharCount(uchar); - i1 = i - 1; /* index of last code unit, gets the directional property */ - - dirProp = (byte)getCustomizedClass(uchar); - flags |= DirPropFlag(dirProp); - dirProps[i1] = dirProp; - if (i1 > i0) { /* set previous code units' properties to BN */ - flags |= DirPropFlag(BN); - do { - dirProps[--i1] = BN; - } while (i1 > i0); - } - if (removeBidiControls && IsBidiControlChar(uchar)) { - controlCount++; - } - if (dirProp == L) { - if (state == SEEKING_STRONG_FOR_PARA) { - paras_level[paraCount - 1] = 0; - state = NOT_SEEKING_STRONG; - } - else if (state == SEEKING_STRONG_FOR_FSI) { - if (stackLast <= MAX_EXPLICIT_LEVEL) { - /* no need for next statement, already set by default */ - /* dirProps[isolateStartStack[stackLast]] = LRI; */ - flags |= DirPropFlag(LRI); - } - state = LOOKING_FOR_PDI; - } - lastStrong = L; - continue; - } - if (dirProp == R || dirProp == AL) { - if (state == 
SEEKING_STRONG_FOR_PARA) { - paras_level[paraCount - 1] = 1; - state = NOT_SEEKING_STRONG; - } - else if (state == SEEKING_STRONG_FOR_FSI) { - if (stackLast <= MAX_EXPLICIT_LEVEL) { - dirProps[isolateStartStack[stackLast]] = RLI; - flags |= DirPropFlag(RLI); - } - state = LOOKING_FOR_PDI; - } - lastStrong = R; - if (dirProp == AL) - lastArabicPos = i - 1; - continue; - } - if (dirProp >= FSI && dirProp <= RLI) { /* FSI, LRI or RLI */ - stackLast++; - if (stackLast <= MAX_EXPLICIT_LEVEL) { - isolateStartStack[stackLast] = i - 1; - previousStateStack[stackLast] = state; - } - if (dirProp == FSI) { - dirProps[i-1] = LRI; /* default if no strong char */ - state = SEEKING_STRONG_FOR_FSI; - } - else - state = LOOKING_FOR_PDI; - continue; - } - if (dirProp == PDI) { - if (state == SEEKING_STRONG_FOR_FSI) { - if (stackLast <= MAX_EXPLICIT_LEVEL) { - /* no need for next statement, already set by default */ - /* dirProps[isolateStartStack[stackLast]] = LRI; */ - flags |= DirPropFlag(LRI); - } - } - if (stackLast >= 0) { - if (stackLast <= MAX_EXPLICIT_LEVEL) - state = previousStateStack[stackLast]; - stackLast--; - } - continue; - } - if (dirProp == B) { - if (i < originalLength && uchar == CR && text[i] == LF) /* do nothing on the CR */ - continue; - paras_limit[paraCount - 1] = i; - if (isDefaultLevelInverse && lastStrong == R) - paras_level[paraCount - 1] = 1; - if ((reorderingOptions & OPTION_STREAMING) != 0) { - /* When streaming, we only process whole paragraphs - thus some updates are only done on paragraph boundaries */ - length = i; /* i is index to next character */ - this.controlCount = controlCount; - } - if (i < originalLength) { /* B not last char in text */ - paraCount++; - checkParaCount(); /* check that there is enough memory for a new para entry */ - if (isDefaultLevel) { - paras_level[paraCount - 1] = defaultParaLevel; - state = SEEKING_STRONG_FOR_PARA; - lastStrong = defaultParaLevel; - } else { - paras_level[paraCount - 1] = paraLevel; - state = 
NOT_SEEKING_STRONG; - } - stackLast = -1; - } - continue; - } - } - /* +Ignore still open isolate sequences with overflow */ - if (stackLast > MAX_EXPLICIT_LEVEL) { - stackLast = MAX_EXPLICIT_LEVEL; - state=SEEKING_STRONG_FOR_FSI; /* to be on the safe side */ - } - /* Resolve direction of still unresolved open FSI sequences */ - while (stackLast >= 0) { - if (state == SEEKING_STRONG_FOR_FSI) { - /* no need for next statement, already set by default */ - /* dirProps[isolateStartStack[stackLast]] = LRI; */ - flags |= DirPropFlag(LRI); - break; - } - state = previousStateStack[stackLast]; - stackLast--; - } - /* When streaming, ignore text after the last paragraph separator */ - if ((reorderingOptions & OPTION_STREAMING) != 0) { - if (length < originalLength) - paraCount--; - } else { - paras_limit[paraCount - 1] = originalLength; - this.controlCount = controlCount; - } - /* For inverse bidi, default para direction is RTL if there is - a strong R or AL at either end of the paragraph */ - if (isDefaultLevelInverse && lastStrong == R) { - paras_level[paraCount - 1] = 1; - } - if (isDefaultLevel) { - paraLevel = paras_level[0]; - } - /* The following is needed to resolve the text direction for default level - paragraphs containing no strong character */ - for (i = 0; i < paraCount; i++) - flags |= DirPropFlagLR(paras_level[i]); - - if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) { - flags |= DirPropFlag(L); - } - } - - /* determine the paragraph level at position index */ - byte GetParaLevelAt(int pindex) - { - if (defaultParaLevel == 0 || pindex < paras_limit[0]) - return paraLevel; - int i; - for (i = 1; i < paraCount; i++) - if (pindex < paras_limit[i]) - break; - if (i >= paraCount) - i = paraCount - 1; - return paras_level[i]; - } - - /* Functions for handling paired brackets ----------------------------------- */ - - /* In the isoRuns array, the first entry is used for text outside of any - isolate sequence. 
Higher entries are used for each more deeply nested - isolate sequence. isoRunLast is the index of the last used entry. The - openings array is used to note the data of opening brackets not yet - matched by a closing bracket, or matched but still susceptible to change - level. - Each isoRun entry contains the index of the first and - one-after-last openings entries for pending opening brackets it - contains. The next openings entry to use is the one-after-last of the - most deeply nested isoRun entry. - isoRun entries also contain their current embedding level and the last - encountered strong character, since these will be needed to resolve - the level of paired brackets. */ - - private void bracketInit(BracketData bd) { - bd.isoRunLast = 0; - bd.isoRuns[0] = new IsoRun(); - bd.isoRuns[0].start = 0; - bd.isoRuns[0].limit = 0; - bd.isoRuns[0].level = GetParaLevelAt(0); - bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(GetParaLevelAt(0) & 1); - bd.isoRuns[0].contextPos = 0; - bd.openings = new Opening[SIMPLE_PARAS_COUNT]; - bd.isNumbersSpecial = reorderingMode == REORDER_NUMBERS_SPECIAL || - reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL; - } - - /* paragraph boundary */ - private void bracketProcessB(BracketData bd, byte level) { - bd.isoRunLast = 0; - bd.isoRuns[0].limit = 0; - bd.isoRuns[0].level = level; - bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(level & 1); - bd.isoRuns[0].contextPos = 0; - } - - /* LRE, LRO, RLE, RLO, PDF */ - private void bracketProcessBoundary(BracketData bd, int lastCcPos, - byte contextLevel, byte embeddingLevel) { - IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; - if ((DirPropFlag(dirProps[lastCcPos]) & MASK_ISO) != 0) /* after an isolate */ - return; - if (NoOverride(embeddingLevel) > NoOverride(contextLevel)) /* not a PDF */ - contextLevel = embeddingLevel; - pLastIsoRun.limit = pLastIsoRun.start; - pLastIsoRun.level = embeddingLevel; - 
pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(contextLevel & 1); - pLastIsoRun.contextPos = lastCcPos; - } - - /* LRI or RLI */ - private void bracketProcessLRI_RLI(BracketData bd, byte level) { - IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; - short lastLimit; - pLastIsoRun.lastBase = ON; - lastLimit = pLastIsoRun.limit; - bd.isoRunLast++; - pLastIsoRun = bd.isoRuns[bd.isoRunLast]; - if (pLastIsoRun == null) - pLastIsoRun = bd.isoRuns[bd.isoRunLast] = new IsoRun(); - pLastIsoRun.start = pLastIsoRun.limit = lastLimit; - pLastIsoRun.level = level; - pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(level & 1); - pLastIsoRun.contextPos = 0; - } - - /* PDI */ - private void bracketProcessPDI(BracketData bd) { - IsoRun pLastIsoRun; - bd.isoRunLast--; - pLastIsoRun = bd.isoRuns[bd.isoRunLast]; - pLastIsoRun.lastBase = ON; - } - - /* newly found opening bracket: create an openings entry */ - private void bracketAddOpening(BracketData bd, char match, int position) { - IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; - Opening pOpening; - if (pLastIsoRun.limit >= bd.openings.length) { /* no available new entry */ - Opening[] saveOpenings = bd.openings; - int count; - try { - count = bd.openings.length; - bd.openings = new Opening[count * 2]; - } catch (Exception e) { - throw new OutOfMemoryError("Failed to allocate memory for openings"); - } - System.arraycopy(saveOpenings, 0, bd.openings, 0, count); - } - pOpening = bd.openings[pLastIsoRun.limit]; - if (pOpening == null) - pOpening = bd.openings[pLastIsoRun.limit]= new Opening(); - pOpening.position = position; - pOpening.match = match; - pOpening.contextDir = pLastIsoRun.contextDir; - pOpening.contextPos = pLastIsoRun.contextPos; - pOpening.flags = 0; - pLastIsoRun.limit++; - } - - /* change N0c1 to N0c2 when a preceding bracket is assigned the embedding level */ - private void fixN0c(BracketData bd, int openingIndex, int newPropPosition, byte newProp) { - 
/* This function calls itself recursively */ - IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; - Opening qOpening; - int k, openingPosition, closingPosition; - for (k = openingIndex+1; k < pLastIsoRun.limit; k++) { - qOpening = bd.openings[k]; - if (qOpening.match >= 0) /* not an N0c match */ - continue; - if (newPropPosition < qOpening.contextPos) - break; - if (newPropPosition >= qOpening.position) - continue; - if (newProp == qOpening.contextDir) - break; - openingPosition = qOpening.position; - dirProps[openingPosition] = newProp; - closingPosition = -(qOpening.match); - dirProps[closingPosition] = newProp; - qOpening.match = 0; /* prevent further changes */ - fixN0c(bd, k, openingPosition, newProp); - fixN0c(bd, k, closingPosition, newProp); - } - } - - /* process closing bracket; return L or R if N0b or N0c, ON if N0d */ - private byte bracketProcessClosing(BracketData bd, int openIdx, int position) { - IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast]; - Opening pOpening, qOpening; - byte direction; - boolean stable; - byte newProp; - pOpening = bd.openings[openIdx]; - direction = (byte)(pLastIsoRun.level & 1); - stable = true; /* assume stable until proved otherwise */ - - /* The stable flag is set when brackets are paired and their - level is resolved and cannot be changed by what will be - found later in the source string. - An unstable match can occur only when applying N0c, where - the resolved level depends on the preceding context, and - this context may be affected by text occurring later. - Example: RTL paragraph containing: abc[(latin) HEBREW] - When the closing parenthesis is encountered, it appears - that N0c1 must be applied since 'abc' sets an opposite - direction context and both parentheses receive level 2. - However, when the closing square bracket is processed, - N0b applies because of 'HEBREW' being included within the - brackets, thus the square brackets are treated like R and - receive level 1. 
However, this changes the preceding - context of the opening parenthesis, and it now appears - that N0c2 must be applied to the parentheses rather than - N0c1. */ - - if ((direction == 0 && (pOpening.flags & FOUND_L) > 0) || - (direction == 1 && (pOpening.flags & FOUND_R) > 0)) { /* N0b */ - newProp = direction; - } - else if ((pOpening.flags & (FOUND_L | FOUND_R)) != 0) { /* N0c */ - /* it is stable if there is no preceding text or in - conditions too complicated and not worth checking */ - stable = (openIdx == pLastIsoRun.start); - if (direction != pOpening.contextDir) - newProp = pOpening.contextDir; /* N0c1 */ - else - newProp = direction; /* N0c2 */ - } else { - /* forget this and any brackets nested within this pair */ - pLastIsoRun.limit = (short)openIdx; - return ON; /* N0d */ - } - dirProps[pOpening.position] = newProp; - dirProps[position] = newProp; - /* Update nested N0c pairs that may be affected */ - fixN0c(bd, openIdx, pOpening.position, newProp); - if (stable) { - pLastIsoRun.limit = (short)openIdx; /* forget any brackets nested within this pair */ - /* remove lower located synonyms if any */ - while (pLastIsoRun.limit > pLastIsoRun.start && - bd.openings[pLastIsoRun.limit - 1].position == pOpening.position) - pLastIsoRun.limit--; - } else { - int k; - pOpening.match = -position; - /* neutralize lower located synonyms if any */ - k = openIdx - 1; - while (k >= pLastIsoRun.start && - bd.openings[k].position == pOpening.position) - bd.openings[k--].match = 0; - /* neutralize any unmatched opening between the current pair; - this will also neutralize higher located synonyms if any */ - for (k = openIdx + 1; k < pLastIsoRun.limit; k++) { - qOpening =bd.openings[k]; - if (qOpening.position >= position) - break; - if (qOpening.match > 0) - qOpening.match = 0; - } - } - return newProp; - } - - /* handle strong characters, digits and candidates for closing brackets */ - private void bracketProcessChar(BracketData bd, int position) { - IsoRun pLastIsoRun = 
bd.isoRuns[bd.isoRunLast]; - byte dirProp, newProp; - byte level; - dirProp = dirProps[position]; - if (dirProp == ON) { - char c, match; - int idx; - /* First see if it is a matching closing bracket. Hopefully, this is - more efficient than checking if it is a closing bracket at all */ - c = text[position]; - for (idx = pLastIsoRun.limit - 1; idx >= pLastIsoRun.start; idx--) { - if (bd.openings[idx].match != c) - continue; - /* We have a match */ - newProp = bracketProcessClosing(bd, idx, position); - if(newProp == ON) { /* N0d */ - c = 0; /* prevent handling as an opening */ - break; - } - pLastIsoRun.lastBase = ON; - pLastIsoRun.contextDir = newProp; - pLastIsoRun.contextPos = position; - level = levels[position]; - if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */ - short flag; - int i; - newProp = (byte)(level & 1); - pLastIsoRun.lastStrong = newProp; - flag = (short)DirPropFlag(newProp); - for (i = pLastIsoRun.start; i < idx; i++) - bd.openings[i].flags |= flag; - /* matching brackets are not overridden by LRO/RLO */ - levels[position] &= ~LEVEL_OVERRIDE; - } - /* matching brackets are not overridden by LRO/RLO */ - levels[bd.openings[idx].position] &= ~LEVEL_OVERRIDE; - return; - } - /* We get here only if the ON character is not a matching closing - bracket or it is a case of N0d */ - /* Now see if it is an opening bracket */ - if (c != 0) { - match = (char)UCharacter.getBidiPairedBracket(c); /* get the matching char */ - } else { - match = 0; - } - if (match != c && /* has a matching char */ - UCharacter.getIntPropertyValue(c, BIDI_PAIRED_BRACKET_TYPE) == - /* opening bracket */ BidiPairedBracketType.OPEN) { - /* special case: process synonyms - create an opening entry for each synonym */ - if (match == 0x232A) { /* RIGHT-POINTING ANGLE BRACKET */ - bracketAddOpening(bd, (char)0x3009, position); - } - else if (match == 0x3009) { /* RIGHT ANGLE BRACKET */ - bracketAddOpening(bd, (char)0x232A, position); - } - bracketAddOpening(bd, match, position); - } - } 
- level = levels[position]; - if ((level & LEVEL_OVERRIDE) != 0) { /* X4, X5 */ - newProp = (byte)(level & 1); - if (dirProp != S && dirProp != WS && dirProp != ON) - dirProps[position] = newProp; - pLastIsoRun.lastBase = newProp; - pLastIsoRun.lastStrong = newProp; - pLastIsoRun.contextDir = newProp; - pLastIsoRun.contextPos = position; - } - else if (dirProp <= R || dirProp == AL) { - newProp = DirFromStrong(dirProp); - pLastIsoRun.lastBase = dirProp; - pLastIsoRun.lastStrong = dirProp; - pLastIsoRun.contextDir = newProp; - pLastIsoRun.contextPos = position; - } - else if(dirProp == EN) { - pLastIsoRun.lastBase = EN; - if (pLastIsoRun.lastStrong == L) { - newProp = L; /* W7 */ - if (!bd.isNumbersSpecial) - dirProps[position] = ENL; - pLastIsoRun.contextDir = L; - pLastIsoRun.contextPos = position; - } - else { - newProp = R; /* N0 */ - if (pLastIsoRun.lastStrong == AL) - dirProps[position] = AN; /* W2 */ - else - dirProps[position] = ENR; - pLastIsoRun.contextDir = R; - pLastIsoRun.contextPos = position; - } - } - else if (dirProp == AN) { - newProp = R; /* N0 */ - pLastIsoRun.lastBase = AN; - pLastIsoRun.contextDir = R; - pLastIsoRun.contextPos = position; - } - else if (dirProp == NSM) { - /* if the last real char was ON, change NSM to ON so that it - will stay ON even if the last real char is a bracket which - may be changed to L or R */ - newProp = pLastIsoRun.lastBase; - if (newProp == ON) - dirProps[position] = newProp; - } - else { - newProp = dirProp; - pLastIsoRun.lastBase = dirProp; - } - if (newProp <= R || newProp == AL) { - int i; - short flag = (short)DirPropFlag(DirFromStrong(newProp)); - for (i = pLastIsoRun.start; i < pLastIsoRun.limit; i++) - if (position > bd.openings[i].position) - bd.openings[i].flags |= flag; - } - } - - /* perform (X1)..(X9) ------------------------------------------------------- */ - - /* determine if the text is mixed-directional or single-directional */ - private byte directionFromFlags() { - - /* if the text contains AN 
and neutrals, then some neutrals may become RTL */ - if (!((flags & MASK_RTL) != 0 || - ((flags & DirPropFlag(AN)) != 0 && - (flags & MASK_POSSIBLE_N) != 0))) { - return LTR; - } else if ((flags & MASK_LTR) == 0) { - return RTL; - } else { - return MIXED; - } - } - - /* - * Resolve the explicit levels as specified by explicit embedding codes. - * Recalculate the flags to have them reflect the real properties - * after taking the explicit embeddings into account. - * - * The BiDi algorithm is designed to result in the same behavior whether embedding - * levels are externally specified (from "styled text", supposedly the preferred - * method) or set by explicit embedding codes (LRx, RLx, PDF, FSI, PDI) in the plain text. - * That is why (X9) instructs to remove all not-isolate explicit codes (and BN). - * However, in a real implementation, the removal of these codes and their index - * positions in the plain text is undesirable since it would result in - * reallocated, reindexed text. - * Instead, this implementation leaves the codes in there and just ignores them - * in the subsequent processing. - * In order to get the same reordering behavior, positions with a BN or a not-isolate - * explicit embedding code just get the same level assigned as the last "real" - * character. - * - * Some implementations, not this one, then overwrite some of these - * directionality properties at "real" same-level-run boundaries by - * L or R codes so that the resolution of weak types can be performed on the - * entire paragraph at once instead of having to parse it once more and - * perform that resolution on same-level-runs. - * This limits the scope of the implicit rules in effectively - * the same way as the run limits. - * - * Instead, this implementation does not modify these codes, except for - * paired brackets whose properties (ON) may be replaced by L or R. 
- * On one hand, the paragraph has to be scanned for same-level-runs, but - * on the other hand, this saves another loop to reset these codes, - * or saves making and modifying a copy of dirProps[]. - * - * - * Note that (Pn) and (Xn) changed significantly from version 4 of the BiDi algorithm. - * - * - * Handling the stack of explicit levels (Xn): - * - * With the BiDi stack of explicit levels, as pushed with each - * LRE, RLE, LRO, RLO, LRI, RLI and FSI and popped with each PDF and PDI, - * the explicit level must never exceed MAX_EXPLICIT_LEVEL. - * - * In order to have a correct push-pop semantics even in the case of overflows, - * overflow counters and a valid isolate counter are used as described in UAX#9 - * section 3.3.2 "Explicit Levels and Directions". - * - * This implementation assumes that MAX_EXPLICIT_LEVEL is odd. - * - * Returns the direction - * - */ - private byte resolveExplicitLevels() { - int i = 0; - byte dirProp; - byte level = GetParaLevelAt(0); - byte dirct; - isolateCount = 0; - - /* determine if the text is mixed-directional or single-directional */ - dirct = directionFromFlags(); - - /* we may not need to resolve any explicit levels */ - if (dirct != MIXED) { - /* not mixed directionality: levels don't matter - trailingWSStart will be 0 */ - return dirct; - } - - if (reorderingMode > REORDER_LAST_LOGICAL_TO_VISUAL) { - /* inverse BiDi: mixed, but all characters are at the same embedding level */ - /* set all levels to the paragraph level */ - int paraIndex, start, limit; - for (paraIndex = 0; paraIndex < paraCount; paraIndex++) { - if (paraIndex == 0) - start = 0; - else - start = paras_limit[paraIndex - 1]; - limit = paras_limit[paraIndex]; - level = paras_level[paraIndex]; - for (i = start; i < limit; i++) - levels[i] =level; - } - return dirct; /* no bracket matching for inverse BiDi */ - } - if ((flags & (MASK_EXPLICIT | MASK_ISO)) == 0) { - /* no embeddings, set all levels to the paragraph level */ - /* we still have to perform 
bracket matching */ - int paraIndex, start, limit; - BracketData bracketData = new BracketData(); - bracketInit(bracketData); - for (paraIndex = 0; paraIndex < paraCount; paraIndex++) { - if (paraIndex == 0) - start = 0; - else - start = paras_limit[paraIndex-1]; - limit = paras_limit[paraIndex]; - level = paras_level[paraIndex]; - for (i = start; i < limit; i++) { - levels[i] = level; - dirProp = dirProps[i]; - if (dirProp == BN) - continue; - if (dirProp == B) { - if ((i + 1) < length) { - if (text[i] == CR && text[i + 1] == LF) - continue; /* skip CR when followed by LF */ - bracketProcessB(bracketData, level); - } - continue; - } - bracketProcessChar(bracketData, i); - } - } - return dirct; - } - /* continue to perform (Xn) */ - - /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */ - /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */ - byte embeddingLevel = level, newLevel; - byte previousLevel = level; /* previous level for regular (not CC) characters */ - int lastCcPos = 0; /* index of last effective LRx,RLx, PDx */ - - /* The following stack remembers the embedding level and the ISOLATE flag of level runs. - stackLast points to its current entry. 
*/ - short[] stack = new short[MAX_EXPLICIT_LEVEL + 2]; /* we never push anything >= MAX_EXPLICIT_LEVEL - but we need one more entry as base */ - int stackLast = 0; - int overflowIsolateCount = 0; - int overflowEmbeddingCount = 0; - int validIsolateCount = 0; - BracketData bracketData = new BracketData(); - bracketInit(bracketData); - stack[0] = level; /* initialize base entry to para level, no override, no isolate */ - - /* recalculate the flags */ - flags = 0; - - for (i = 0; i < length; i++) { - dirProp = dirProps[i]; - switch (dirProp) { - case LRE: - case RLE: - case LRO: - case RLO: - /* (X2, X3, X4, X5) */ - flags |= DirPropFlag(BN); - levels[i] = previousLevel; - if (dirProp == LRE || dirProp == LRO) { - /* least greater even level */ - newLevel = (byte)((embeddingLevel+2) & ~(LEVEL_OVERRIDE | 1)); - } else { - /* least greater odd level */ - newLevel = (byte)((NoOverride(embeddingLevel) + 1) | 1); - } - if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0 && - overflowEmbeddingCount == 0) { - lastCcPos = i; - embeddingLevel = newLevel; - if (dirProp == LRO || dirProp == RLO) - embeddingLevel |= LEVEL_OVERRIDE; - stackLast++; - stack[stackLast] = embeddingLevel; - /* we don't need to set LEVEL_OVERRIDE off for LRE and RLE - since this has already been done for newLevel which is - the source for embeddingLevel. 
- */ - } else { - if (overflowIsolateCount == 0) - overflowEmbeddingCount++; - } - break; - case PDF: - /* (X7) */ - flags |= DirPropFlag(BN); - levels[i] = previousLevel; - /* handle all the overflow cases first */ - if (overflowIsolateCount > 0) { - break; - } - if (overflowEmbeddingCount > 0) { - overflowEmbeddingCount--; - break; - } - if (stackLast > 0 && stack[stackLast] < ISOLATE) { /* not an isolate entry */ - lastCcPos = i; - stackLast--; - embeddingLevel = (byte)stack[stackLast]; - } - break; - case LRI: - case RLI: - flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel); - levels[i] = NoOverride(embeddingLevel); - if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) { - bracketProcessBoundary(bracketData, lastCcPos, - previousLevel, embeddingLevel); - flags |= DirPropFlagMultiRuns; - } - previousLevel = embeddingLevel; - /* (X5a, X5b) */ - if (dirProp == LRI) - /* least greater even level */ - newLevel=(byte)((embeddingLevel+2)&~(LEVEL_OVERRIDE|1)); - else - /* least greater odd level */ - newLevel=(byte)((NoOverride(embeddingLevel)+1)|1); - if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0 - && overflowEmbeddingCount == 0) { - flags |= DirPropFlag(dirProp); - lastCcPos = i; - validIsolateCount++; - if (validIsolateCount > isolateCount) - isolateCount = validIsolateCount; - embeddingLevel = newLevel; - /* we can increment stackLast without checking because newLevel - will exceed UBIDI_MAX_EXPLICIT_LEVEL before stackLast overflows */ - stackLast++; - stack[stackLast] = (short)(embeddingLevel + ISOLATE); - bracketProcessLRI_RLI(bracketData, embeddingLevel); - } else { - /* make it WS so that it is handled by adjustWSLevels() */ - dirProps[i] = WS; - overflowIsolateCount++; - } - break; - case PDI: - if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) { - bracketProcessBoundary(bracketData, lastCcPos, - previousLevel, embeddingLevel); - flags |= DirPropFlagMultiRuns; - } - /* (X6a) */ - if (overflowIsolateCount > 0) { - 
overflowIsolateCount--; - /* make it WS so that it is handled by adjustWSLevels() */ - dirProps[i] = WS; - } - else if (validIsolateCount > 0) { - flags |= DirPropFlag(PDI); - lastCcPos = i; - overflowEmbeddingCount = 0; - while (stack[stackLast] < ISOLATE) /* pop embedding entries */ - stackLast--; /* until the last isolate entry */ - stackLast--; /* pop also the last isolate entry */ - validIsolateCount--; - bracketProcessPDI(bracketData); - } else - /* make it WS so that it is handled by adjustWSLevels() */ - dirProps[i] = WS; - embeddingLevel = (byte)(stack[stackLast] & ~ISOLATE); - flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel); - previousLevel = embeddingLevel; - levels[i] = NoOverride(embeddingLevel); - break; - case B: - flags |= DirPropFlag(B); - levels[i] = GetParaLevelAt(i); - if ((i + 1) < length) { - if (text[i] == CR && text[i + 1] == LF) - break; /* skip CR when followed by LF */ - overflowEmbeddingCount = overflowIsolateCount = 0; - validIsolateCount = 0; - stackLast = 0; - previousLevel = embeddingLevel = GetParaLevelAt(i + 1); - stack[0] = embeddingLevel; /* initialize base entry to para level, no override, no isolate */ - bracketProcessB(bracketData, embeddingLevel); - } - break; - case BN: - /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */ - /* they will get their levels set correctly in adjustWSLevels() */ - levels[i] = previousLevel; - flags |= DirPropFlag(BN); - break; - default: - /* all other types are normal characters and get the "real" level */ - if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) { - bracketProcessBoundary(bracketData, lastCcPos, - previousLevel, embeddingLevel); - flags |= DirPropFlagMultiRuns; - if ((embeddingLevel & LEVEL_OVERRIDE) != 0) - flags |= DirPropFlagO(embeddingLevel); - else - flags |= DirPropFlagE(embeddingLevel); - } - previousLevel = embeddingLevel; - levels[i] = embeddingLevel; - bracketProcessChar(bracketData, i); - /* the dirProp may have been changed in 
bracketProcessChar() */ - flags |= DirPropFlag(dirProps[i]); - break; - } - } - if ((flags & MASK_EMBEDDING) != 0) { - flags |= DirPropFlagLR(paraLevel); - } - if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) { - flags |= DirPropFlag(L); - } - /* again, determine if the text is mixed-directional or single-directional */ - dirct = directionFromFlags(); - - return dirct; - } - - /* - * Use a pre-specified embedding levels array: - * - * Adjust the directional properties for overrides (->LEVEL_OVERRIDE), - * ignore all explicit codes (X9), - * and check all the preset levels. - * - * Recalculate the flags to have them reflect the real properties - * after taking the explicit embeddings into account. - */ - private byte checkExplicitLevels() { - byte dirProp; - int i; - int isolateCount = 0; - - this.flags = 0; /* collect all directionalities in the text */ - byte level; - this.isolateCount = 0; - - for (i = 0; i < length; ++i) { - if (levels[i] == 0) { - levels[i] = paraLevel; - } - - // for backward compatibility - if (MAX_EXPLICIT_LEVEL < (levels[i]&0x7f)) { - if ((levels[i] & LEVEL_OVERRIDE) != 0) { - levels[i] = (byte)(paraLevel|LEVEL_OVERRIDE); - } else { - levels[i] = paraLevel; - } - } - - level = levels[i]; - dirProp = dirProps[i]; - if (dirProp == LRI || dirProp == RLI) { - isolateCount++; - if (isolateCount > this.isolateCount) - this.isolateCount = isolateCount; - } - else if (dirProp == PDI) { - isolateCount--; - } else if (dirProp == B) { - isolateCount = 0; - } - if ((level & LEVEL_OVERRIDE) != 0) { - /* keep the override flag in levels[i] but adjust the flags */ - level &= ~LEVEL_OVERRIDE; /* make the range check below simpler */ - flags |= DirPropFlagO(level); - } else { - /* set the flags */ - flags |= DirPropFlagE(level) | DirPropFlag(dirProp); - } - if ((level < GetParaLevelAt(i) && - !((0 == level) && (dirProp == B))) || - (MAX_EXPLICIT_LEVEL < level)) { - /* level out of bounds */ - throw new IllegalArgumentException("level " + level + - " 
out of bounds at " + i); - } - } - if ((flags & MASK_EMBEDDING) != 0) { - flags |= DirPropFlagLR(paraLevel); - } - /* determine if the text is mixed-directional or single-directional */ - return directionFromFlags(); - } - - /*********************************************************************/ - /* The Properties state machine table */ - /*********************************************************************/ - /* */ - /* All table cells are 8 bits: */ - /* bits 0..4: next state */ - /* bits 5..7: action to perform (if > 0) */ - /* */ - /* Cells may be of format "n" where n represents the next state */ - /* (except for the rightmost column). */ - /* Cells may also be of format "_(x,y)" where x represents an action */ - /* to perform and y represents the next state. */ - /* */ - /*********************************************************************/ - /* Definitions and type for properties state tables */ - /*********************************************************************/ - private static final int IMPTABPROPS_COLUMNS = 16; - private static final int IMPTABPROPS_RES = IMPTABPROPS_COLUMNS - 1; - private static short GetStateProps(short cell) { - return (short)(cell & 0x1f); - } - private static short GetActionProps(short cell) { - return (short)(cell >> 5); - } - - private static final short groupProp[] = /* dirProp regrouped */ - { - /* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN FSI LRI RLI PDI ENL ENR */ - 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10, 4, 4, 4, 4, 13, 14 - }; - private static final short _L = 0; - private static final short _R = 1; - private static final short _EN = 2; - private static final short _AN = 3; - private static final short _ON = 4; - private static final short _S = 5; - private static final short _B = 6; /* reduced dirProp */ - - /*********************************************************************/ - /* */ - /* PROPERTIES STATE TABLE */ - /* */ - /* In table impTabProps, */ - /* - the ON column 
regroups ON and WS, FSI, RLI, LRI and PDI */ - /* - the BN column regroups BN, LRE, RLE, LRO, RLO, PDF */ - /* - the Res column is the reduced property assigned to a run */ - /* */ - /* Action 1: process current run1, init new run1 */ - /* 2: init new run2 */ - /* 3: process run1, process run2, init new run1 */ - /* 4: process run1, set run1=run2, init new run2 */ - /* */ - /* Notes: */ - /* 1) This table is used in resolveImplicitLevels(). */ - /* 2) This table triggers actions when there is a change in the Bidi*/ - /* property of incoming characters (action 1). */ - /* 3) Most such property sequences are processed immediately (in */ - /* fact, passed to processPropertySeq(). */ - /* 4) However, numbers are assembled as one sequence. This means */ - /* that undefined situations (like CS following digits, until */ - /* it is known if the next char will be a digit) are held until */ - /* following chars define them. */ - /* Example: digits followed by CS, then comes another CS or ON; */ - /* the digits will be processed, then the CS assigned */ - /* as the start of an ON sequence (action 3). */ - /* 5) There are cases where more than one sequence must be */ - /* processed, for instance digits followed by CS followed by L: */ - /* the digits must be processed as one sequence, and the CS */ - /* must be processed as an ON sequence, all this before starting */ - /* assembling chars for the opening L sequence. 
*/ - /* */ - /* */ - private static final short impTabProps[][] = - { -/* L, R, EN, AN, ON, S, B, ES, ET, CS, BN, NSM, AL, ENL, ENR, Res */ -/* 0 Init */ { 1, 2, 4, 5, 7, 15, 17, 7, 9, 7, 0, 7, 3, 18, 21, _ON }, -/* 1 L */ { 1, 32+2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 1, 1, 32+3, 32+18, 32+21, _L }, -/* 2 R */ { 32+1, 2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 2, 2, 32+3, 32+18, 32+21, _R }, -/* 3 AL */ { 32+1, 32+2, 32+6, 32+6, 32+8, 32+16, 32+17, 32+8, 32+8, 32+8, 3, 3, 3, 32+18, 32+21, _R }, -/* 4 EN */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 64+10, 11, 64+10, 4, 4, 32+3, 18, 21, _EN }, -/* 5 AN */ { 32+1, 32+2, 32+4, 5, 32+7, 32+15, 32+17, 32+7, 32+9, 64+12, 5, 5, 32+3, 32+18, 32+21, _AN }, -/* 6 AL:EN/AN */ { 32+1, 32+2, 6, 6, 32+8, 32+16, 32+17, 32+8, 32+8, 64+13, 6, 6, 32+3, 18, 21, _AN }, -/* 7 ON */ { 32+1, 32+2, 32+4, 32+5, 7, 32+15, 32+17, 7, 64+14, 7, 7, 7, 32+3, 32+18, 32+21, _ON }, -/* 8 AL:ON */ { 32+1, 32+2, 32+6, 32+6, 8, 32+16, 32+17, 8, 8, 8, 8, 8, 32+3, 32+18, 32+21, _ON }, -/* 9 ET */ { 32+1, 32+2, 4, 32+5, 7, 32+15, 32+17, 7, 9, 7, 9, 9, 32+3, 18, 21, _ON }, -/*10 EN+ES/CS */ { 96+1, 96+2, 4, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 10, 128+7, 96+3, 18, 21, _EN }, -/*11 EN+ET */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 32+7, 11, 32+7, 11, 11, 32+3, 18, 21, _EN }, -/*12 AN+CS */ { 96+1, 96+2, 96+4, 5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 12, 128+7, 96+3, 96+18, 96+21, _AN }, -/*13 AL:EN/AN+CS */ { 96+1, 96+2, 6, 6, 128+8, 96+16, 96+17, 128+8, 128+8, 128+8, 13, 128+8, 96+3, 18, 21, _AN }, -/*14 ON+ET */ { 32+1, 32+2, 128+4, 32+5, 7, 32+15, 32+17, 7, 14, 7, 14, 14, 32+3,128+18,128+21, _ON }, -/*15 S */ { 32+1, 32+2, 32+4, 32+5, 32+7, 15, 32+17, 32+7, 32+9, 32+7, 15, 32+7, 32+3, 32+18, 32+21, _S }, -/*16 AL:S */ { 32+1, 32+2, 32+6, 32+6, 32+8, 16, 32+17, 32+8, 32+8, 32+8, 16, 32+8, 32+3, 32+18, 32+21, _S }, -/*17 B */ { 32+1, 32+2, 32+4, 32+5, 32+7, 32+15, 17, 32+7, 32+9, 32+7, 17, 32+7, 32+3, 32+18, 
32+21, _B }, -/*18 ENL */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 64+19, 20, 64+19, 18, 18, 32+3, 18, 21, _L }, -/*19 ENL+ES/CS */ { 96+1, 96+2, 18, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 19, 128+7, 96+3, 18, 21, _L }, -/*20 ENL+ET */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 32+7, 20, 32+7, 20, 20, 32+3, 18, 21, _L }, -/*21 ENR */ { 32+1, 32+2, 21, 32+5, 32+7, 32+15, 32+17, 64+22, 23, 64+22, 21, 21, 32+3, 18, 21, _AN }, -/*22 ENR+ES/CS */ { 96+1, 96+2, 21, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 22, 128+7, 96+3, 18, 21, _AN }, -/*23 ENR+ET */ { 32+1, 32+2, 21, 32+5, 32+7, 32+15, 32+17, 32+7, 23, 32+7, 23, 23, 32+3, 18, 21, _AN } - }; - - /*********************************************************************/ - /* The levels state machine tables */ - /*********************************************************************/ - /* */ - /* All table cells are 8 bits: */ - /* bits 0..3: next state */ - /* bits 4..7: action to perform (if > 0) */ - /* */ - /* Cells may be of format "n" where n represents the next state */ - /* (except for the rightmost column). */ - /* Cells may also be of format "_(x,y)" where x represents an action */ - /* to perform and y represents the next state. 
*/ - /* */ - /* This format limits each table to 16 states each and to 15 actions.*/ - /* */ - /*********************************************************************/ - /* Definitions and type for levels state tables */ - /*********************************************************************/ - private static final int IMPTABLEVELS_COLUMNS = _B + 2; - private static final int IMPTABLEVELS_RES = IMPTABLEVELS_COLUMNS - 1; - private static short GetState(byte cell) { return (short)(cell & 0x0f); } - private static short GetAction(byte cell) { return (short)(cell >> 4); } - - private static class ImpTabPair { - byte[][][] imptab; - short[][] impact; - - ImpTabPair(byte[][] table1, byte[][] table2, - short[] act1, short[] act2) { - imptab = new byte[][][] {table1, table2}; - impact = new short[][] {act1, act2}; - } - } - - /*********************************************************************/ - /* */ - /* LEVELS STATE TABLES */ - /* */ - /* In all levels state tables, */ - /* - state 0 is the initial state */ - /* - the Res column is the increment to add to the text level */ - /* for this property sequence. */ - /* */ - /* The impact arrays for each table of a pair map the local action */ - /* numbers of the table to the total list of actions. For instance, */ - /* action 2 in a given table corresponds to the action number which */ - /* appears in entry [2] of the impact array for that table. */ - /* The first entry of all impact arrays must be 0. */ - /* */ - /* Action 1: init conditional sequence */ - /* 2: prepend conditional sequence to current sequence */ - /* 3: set ON sequence to new level - 1 */ - /* 4: init EN/AN/ON sequence */ - /* 5: fix EN/AN/ON sequence followed by R */ - /* 6: set previous level sequence to level 2 */ - /* */ - /* Notes: */ - /* 1) These tables are used in processPropertySeq(). The input */ - /* is property sequences as determined by resolveImplicitLevels. 
*/ - /* 2) Most such property sequences are processed immediately */ - /* (levels are assigned). */ - /* 3) However, some sequences cannot be assigned a final level till */ - /* one or more following sequences are received. For instance, */ - /* ON following an R sequence within an even-level paragraph. */ - /* If the following sequence is R, the ON sequence will be */ - /* assigned basic run level+1, and so will the R sequence. */ - /* 4) S is generally handled like ON, since its level will be fixed */ - /* to paragraph level in adjustWSLevels(). */ - /* */ - - private static final byte impTabL_DEFAULT[][] = /* Even paragraph level */ - /* In this table, conditional sequences receive the lower possible level - until proven otherwise. - */ - { - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 0, 1, 0, 2, 0, 0, 0, 0 }, - /* 1 : R */ { 0, 1, 3, 3, 0x14, 0x14, 0, 1 }, - /* 2 : AN */ { 0, 1, 0, 2, 0x15, 0x15, 0, 2 }, - /* 3 : R+EN/AN */ { 0, 1, 3, 3, 0x14, 0x14, 0, 2 }, - /* 4 : R+ON */ { 0, 0x21, 0x33, 0x33, 4, 4, 0, 0 }, - /* 5 : AN+ON */ { 0, 0x21, 0, 0x32, 5, 5, 0, 0 } - }; - - private static final byte impTabR_DEFAULT[][] = /* Odd paragraph level */ - /* In this table, conditional sequences receive the lower possible level - until proven otherwise. - */ - { - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 1, 0, 2, 2, 0, 0, 0, 0 }, - /* 1 : L */ { 1, 0, 1, 3, 0x14, 0x14, 0, 1 }, - /* 2 : EN/AN */ { 1, 0, 2, 2, 0, 0, 0, 1 }, - /* 3 : L+AN */ { 1, 0, 1, 3, 5, 5, 0, 1 }, - /* 4 : L+ON */ { 0x21, 0, 0x21, 3, 4, 4, 0, 0 }, - /* 5 : L+AN+ON */ { 1, 0, 1, 3, 5, 5, 0, 0 } - }; - - private static final short[] impAct0 = {0,1,2,3,4}; - - private static final ImpTabPair impTab_DEFAULT = new ImpTabPair( - impTabL_DEFAULT, impTabR_DEFAULT, impAct0, impAct0); - - private static final byte impTabL_NUMBERS_SPECIAL[][] = { /* Even paragraph level */ - /* In this table, conditional sequences receive the lower possible - level until proven otherwise. 
- */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 0, 2, 0x11, 0x11, 0, 0, 0, 0 }, - /* 1 : L+EN/AN */ { 0, 0x42, 1, 1, 0, 0, 0, 0 }, - /* 2 : R */ { 0, 2, 4, 4, 0x13, 0x13, 0, 1 }, - /* 3 : R+ON */ { 0, 0x22, 0x34, 0x34, 3, 3, 0, 0 }, - /* 4 : R+EN/AN */ { 0, 2, 4, 4, 0x13, 0x13, 0, 2 } - }; - private static final ImpTabPair impTab_NUMBERS_SPECIAL = new ImpTabPair( - impTabL_NUMBERS_SPECIAL, impTabR_DEFAULT, impAct0, impAct0); - - private static final byte impTabL_GROUP_NUMBERS_WITH_R[][] = { - /* In this table, EN/AN+ON sequences receive levels as if associated with R - until proven that there is L or sor/eor on both sides. AN is handled like EN. - */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 init */ { 0, 3, 0x11, 0x11, 0, 0, 0, 0 }, - /* 1 EN/AN */ { 0x20, 3, 1, 1, 2, 0x20, 0x20, 2 }, - /* 2 EN/AN+ON */ { 0x20, 3, 1, 1, 2, 0x20, 0x20, 1 }, - /* 3 R */ { 0, 3, 5, 5, 0x14, 0, 0, 1 }, - /* 4 R+ON */ { 0x20, 3, 5, 5, 4, 0x20, 0x20, 1 }, - /* 5 R+EN/AN */ { 0, 3, 5, 5, 0x14, 0, 0, 2 } - }; - private static final byte impTabR_GROUP_NUMBERS_WITH_R[][] = { - /* In this table, EN/AN+ON sequences receive levels as if associated with R - until proven that there is L on both sides. AN is handled like EN. - */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 init */ { 2, 0, 1, 1, 0, 0, 0, 0 }, - /* 1 EN/AN */ { 2, 0, 1, 1, 0, 0, 0, 1 }, - /* 2 L */ { 2, 0, 0x14, 0x14, 0x13, 0, 0, 1 }, - /* 3 L+ON */ { 0x22, 0, 4, 4, 3, 0, 0, 0 }, - /* 4 L+EN/AN */ { 0x22, 0, 4, 4, 3, 0, 0, 1 } - }; - private static final ImpTabPair impTab_GROUP_NUMBERS_WITH_R = new - ImpTabPair(impTabL_GROUP_NUMBERS_WITH_R, - impTabR_GROUP_NUMBERS_WITH_R, impAct0, impAct0); - - private static final byte impTabL_INVERSE_NUMBERS_AS_L[][] = { - /* This table is identical to the Default LTR table except that EN and AN - are handled like L. 
- */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 0, 1, 0, 0, 0, 0, 0, 0 }, - /* 1 : R */ { 0, 1, 0, 0, 0x14, 0x14, 0, 1 }, - /* 2 : AN */ { 0, 1, 0, 0, 0x15, 0x15, 0, 2 }, - /* 3 : R+EN/AN */ { 0, 1, 0, 0, 0x14, 0x14, 0, 2 }, - /* 4 : R+ON */ { 0x20, 1, 0x20, 0x20, 4, 4, 0x20, 1 }, - /* 5 : AN+ON */ { 0x20, 1, 0x20, 0x20, 5, 5, 0x20, 1 } - }; - private static final byte impTabR_INVERSE_NUMBERS_AS_L[][] = { - /* This table is identical to the Default RTL table except that EN and AN - are handled like L. - */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 1, 0, 1, 1, 0, 0, 0, 0 }, - /* 1 : L */ { 1, 0, 1, 1, 0x14, 0x14, 0, 1 }, - /* 2 : EN/AN */ { 1, 0, 1, 1, 0, 0, 0, 1 }, - /* 3 : L+AN */ { 1, 0, 1, 1, 5, 5, 0, 1 }, - /* 4 : L+ON */ { 0x21, 0, 0x21, 0x21, 4, 4, 0, 0 }, - /* 5 : L+AN+ON */ { 1, 0, 1, 1, 5, 5, 0, 0 } - }; - private static final ImpTabPair impTab_INVERSE_NUMBERS_AS_L = new ImpTabPair - (impTabL_INVERSE_NUMBERS_AS_L, impTabR_INVERSE_NUMBERS_AS_L, - impAct0, impAct0); - - private static final byte impTabR_INVERSE_LIKE_DIRECT[][] = { /* Odd paragraph level */ - /* In this table, conditional sequences receive the lower possible level - until proven otherwise. 
- */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 1, 0, 2, 2, 0, 0, 0, 0 }, - /* 1 : L */ { 1, 0, 1, 2, 0x13, 0x13, 0, 1 }, - /* 2 : EN/AN */ { 1, 0, 2, 2, 0, 0, 0, 1 }, - /* 3 : L+ON */ { 0x21, 0x30, 6, 4, 3, 3, 0x30, 0 }, - /* 4 : L+ON+AN */ { 0x21, 0x30, 6, 4, 5, 5, 0x30, 3 }, - /* 5 : L+AN+ON */ { 0x21, 0x30, 6, 4, 5, 5, 0x30, 2 }, - /* 6 : L+ON+EN */ { 0x21, 0x30, 6, 4, 3, 3, 0x30, 1 } - }; - private static final short[] impAct1 = {0,1,13,14}; - private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT = new ImpTabPair( - impTabL_DEFAULT, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1); - - private static final byte impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS[][] = { - /* The case handled in this table is (visually): R EN L - */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 0, 0x63, 0, 1, 0, 0, 0, 0 }, - /* 1 : L+AN */ { 0, 0x63, 0, 1, 0x12, 0x30, 0, 4 }, - /* 2 : L+AN+ON */ { 0x20, 0x63, 0x20, 1, 2, 0x30, 0x20, 3 }, - /* 3 : R */ { 0, 0x63, 0x55, 0x56, 0x14, 0x30, 0, 3 }, - /* 4 : R+ON */ { 0x30, 0x43, 0x55, 0x56, 4, 0x30, 0x30, 3 }, - /* 5 : R+EN */ { 0x30, 0x43, 5, 0x56, 0x14, 0x30, 0x30, 4 }, - /* 6 : R+AN */ { 0x30, 0x43, 0x55, 6, 0x14, 0x30, 0x30, 4 } - }; - private static final byte impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS[][] = { - /* The cases handled in this table are (visually): R EN L - R L AN L - */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 0x13, 0, 1, 1, 0, 0, 0, 0 }, - /* 1 : R+EN/AN */ { 0x23, 0, 1, 1, 2, 0x40, 0, 1 }, - /* 2 : R+EN/AN+ON */ { 0x23, 0, 1, 1, 2, 0x40, 0, 0 }, - /* 3 : L */ { 3, 0, 3, 0x36, 0x14, 0x40, 0, 1 }, - /* 4 : L+ON */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 0 }, - /* 5 : L+ON+EN */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 1 }, - /* 6 : L+AN */ { 0x53, 0x40, 6, 6, 4, 0x40, 0x40, 3 } - }; - private static final short[] impAct2 = {0,1,2,5,6,7,8}; - private static final short[] impAct3 = {0,1,9,10,11,12}; - private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT_WITH_MARKS = - new 
ImpTabPair(impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS, - impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3); - - private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL = new ImpTabPair( - impTabL_NUMBERS_SPECIAL, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1); - - private static final byte impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS[][] = { - /* The case handled in this table is (visually): R EN L - */ - /* L, R, EN, AN, ON, S, B, Res */ - /* 0 : init */ { 0, 0x62, 1, 1, 0, 0, 0, 0 }, - /* 1 : L+EN/AN */ { 0, 0x62, 1, 1, 0, 0x30, 0, 4 }, - /* 2 : R */ { 0, 0x62, 0x54, 0x54, 0x13, 0x30, 0, 3 }, - /* 3 : R+ON */ { 0x30, 0x42, 0x54, 0x54, 3, 0x30, 0x30, 3 }, - /* 4 : R+EN/AN */ { 0x30, 0x42, 4, 4, 0x13, 0x30, 0x30, 4 } - }; - private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS = new - ImpTabPair(impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS, - impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3); - - private static class LevState { - byte[][] impTab; /* level table pointer */ - short[] impAct; /* action map array */ - int startON; /* start of ON sequence */ - int startL2EN; /* start of level 2 sequence */ - int lastStrongRTL; /* index of last found R or AL */ - int runStart; /* start position of the run */ - short state; /* current state */ - byte runLevel; /* run level before implicit solving */ - } - - /*------------------------------------------------------------------------*/ - - static final int FIRSTALLOC = 10; - /* - * param pos: position where to insert - * param flag: one of LRM_BEFORE, LRM_AFTER, RLM_BEFORE, RLM_AFTER - */ - private void addPoint(int pos, int flag) - { - Point point = new Point(); - - int len = insertPoints.points.length; - if (len == 0) { - insertPoints.points = new Point[FIRSTALLOC]; - len = FIRSTALLOC; - } - if (insertPoints.size >= len) { /* no room for new point */ - Point[] savePoints = insertPoints.points; - insertPoints.points = new Point[len * 2]; - System.arraycopy(savePoints, 0, 
insertPoints.points, 0, len); - } - point.pos = pos; - point.flag = flag; - insertPoints.points[insertPoints.size] = point; - insertPoints.size++; - } - - private void setLevelsOutsideIsolates(int start, int limit, byte level) - { - byte dirProp; - int isolateCount = 0, k; - for (k = start; k < limit; k++) { - dirProp = dirProps[k]; - if (dirProp == PDI) - isolateCount--; - if (isolateCount == 0) { - levels[k] = level; - } - if (dirProp == LRI || dirProp == RLI) - isolateCount++; - } - } - - /* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */ - - /* - * This implementation of the (Wn) rules applies all rules in one pass. - * In order to do so, it needs a look-ahead of typically 1 character - * (except for W5: sequences of ET) and keeps track of changes - * in a rule Wp that affect a later Wq (p= 0) { - addPoint(levState.startL2EN, LRM_BEFORE); - } - levState.startL2EN = -1; /* not within previous if since could also be -2 */ - /* check if we had any relevant EN/AN after R/AL */ - if ((insertPoints.points.length == 0) || - (insertPoints.size <= insertPoints.confirmed)) { - /* nothing, just clean up */ - levState.lastStrongRTL = -1; - /* check if we have a pending conditional segment */ - level = impTab[oldStateSeq][IMPTABLEVELS_RES]; - if ((level & 1) != 0 && levState.startON > 0) { /* after ON */ - start = levState.startON; /* reset to basic run level */ - } - if (_prop == _S) { /* add LRM before S */ - addPoint(start0, LRM_BEFORE); - insertPoints.confirmed = insertPoints.size; - } - break; - } - /* reset previous RTL cont to level for LTR text */ - for (k = levState.lastStrongRTL + 1; k < start0; k++) { - /* reset odd level, leave runLevel+2 as is */ - levels[k] = (byte)((levels[k] - 2) & ~1); - } - /* mark insert points as confirmed */ - insertPoints.confirmed = insertPoints.size; - levState.lastStrongRTL = -1; - if (_prop == _S) { /* add LRM before S */ - addPoint(start0, LRM_BEFORE); - insertPoints.confirmed = insertPoints.size; - } 
- break; - - case 6: /* R/AL after possible relevant EN/AN */ - /* just clean up */ - if (insertPoints.points.length > 0) - /* remove all non confirmed insert points */ - insertPoints.size = insertPoints.confirmed; - levState.startON = -1; - levState.startL2EN = -1; - levState.lastStrongRTL = limit - 1; - break; - - case 7: /* EN/AN after R/AL + possible cont */ - /* check for real AN */ - - if ((_prop == _AN) && (dirProps[start0] == AN) && - (reorderingMode != REORDER_INVERSE_FOR_NUMBERS_SPECIAL)) - { - /* real AN */ - if (levState.startL2EN == -1) { /* if no relevant EN already found */ - /* just note the rightmost digit as a strong RTL */ - levState.lastStrongRTL = limit - 1; - break; - } - if (levState.startL2EN >= 0) { /* after EN, no AN */ - addPoint(levState.startL2EN, LRM_BEFORE); - levState.startL2EN = -2; - } - /* note AN */ - addPoint(start0, LRM_BEFORE); - break; - } - /* if first EN/AN after R/AL */ - if (levState.startL2EN == -1) { - levState.startL2EN = start0; - } - break; - - case 8: /* note location of latest R/AL */ - levState.lastStrongRTL = limit - 1; - levState.startON = -1; - break; - - case 9: /* L after R+ON/EN/AN */ - /* include possible adjacent number on the left */ - for (k = start0-1; k >= 0 && ((levels[k] & 1) == 0); k--) { - } - if (k >= 0) { - addPoint(k, RLM_BEFORE); /* add RLM before */ - insertPoints.confirmed = insertPoints.size; /* confirm it */ - } - levState.startON = start0; - break; - - case 10: /* AN after L */ - /* AN numbers between L text on both sides may be trouble. 
*/ - /* tentatively bracket with LRMs; will be confirmed if followed by L */ - addPoint(start0, LRM_BEFORE); /* add LRM before */ - addPoint(start0, LRM_AFTER); /* add LRM after */ - break; - - case 11: /* R after L+ON/EN/AN */ - /* false alert, infirm LRMs around previous AN */ - insertPoints.size=insertPoints.confirmed; - if (_prop == _S) { /* add RLM before S */ - addPoint(start0, RLM_BEFORE); - insertPoints.confirmed = insertPoints.size; - } - break; - - case 12: /* L after L+ON/AN */ - level = (byte)(levState.runLevel + addLevel); - for (k=levState.startON; k < start0; k++) { - if (levels[k] < level) { - levels[k] = level; - } - } - insertPoints.confirmed = insertPoints.size; /* confirm inserts */ - levState.startON = start0; - break; - - case 13: /* L after L+ON+EN/AN/ON */ - level = levState.runLevel; - for (k = start0-1; k >= levState.startON; k--) { - if (levels[k] == level+3) { - while (levels[k] == level+3) { - levels[k--] -= 2; - } - while (levels[k] == level) { - k--; - } - } - if (levels[k] == level+2) { - levels[k] = level; - continue; - } - levels[k] = (byte)(level+1); - } - break; - - case 14: /* R after L+ON+EN/AN/ON */ - level = (byte)(levState.runLevel+1); - for (k = start0-1; k >= levState.startON; k--) { - if (levels[k] > level) { - levels[k] -= 2; - } - } - break; - - default: /* we should never get here */ - throw new IllegalStateException("Internal ICU error in processPropertySeq"); - } - } - if ((addLevel) != 0 || (start < start0)) { - level = (byte)(levState.runLevel + addLevel); - if (start >= levState.runStart) { - for (k = start; k < limit; k++) { - levels[k] = level; - } - } else { - setLevelsOutsideIsolates(start, limit, level); - } - } - } - - private void resolveImplicitLevels(int start, int limit, short sor, short eor) - { - byte dirProp; - LevState levState = new LevState(); - int i, start1, start2; - short oldStateImp, stateImp, actionImp; - short gprop, resProp, cell; - boolean inverseRTL; - short nextStrongProp = R; - int 
nextStrongPos = -1; - - /* check for RTL inverse Bidi mode */ - /* FOOD FOR THOUGHT: in case of RTL inverse Bidi, it would make sense to - * loop on the text characters from end to start. - * This would need a different properties state table (at least different - * actions) and different levels state tables (maybe very similar to the - * LTR corresponding ones. - */ - inverseRTL=((start0) && - (reorderingMode == REORDER_INVERSE_LIKE_DIRECT || - reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL)); - /* initialize for property and levels state table */ - levState.startL2EN = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ - levState.lastStrongRTL = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ - levState.runStart = start; - levState.runLevel = levels[start]; - levState.impTab = impTabPair.imptab[levState.runLevel & 1]; - levState.impAct = impTabPair.impact[levState.runLevel & 1]; - - /* The isolates[] entries contain enough information to - resume the bidi algorithm in the same state as it was - when it was interrupted by an isolate sequence. 
*/ - if (dirProps[start] == PDI) { - levState.startON = isolates[isolateCount].startON; - start1 = isolates[isolateCount].start1; - stateImp = isolates[isolateCount].stateImp; - levState.state = isolates[isolateCount].state; - isolateCount--; - } else { - levState.startON = -1; - start1 = start; - if (dirProps[start] == NSM) - stateImp = (short)(1 + sor); - else - stateImp = 0; - levState.state = 0; - processPropertySeq(levState, sor, start, start); - } - start2 = start; /* to make the Java compiler happy */ - - for (i = start; i <= limit; i++) { - if (i >= limit) { - int k; - for (k = limit - 1; - k > start && - (DirPropFlag(dirProps[k]) & MASK_BN_EXPLICIT) != 0; - k--); - dirProp = dirProps[k]; - if (dirProp == LRI || dirProp == RLI) - break; /* no forced closing for sequence ending with LRI/RLI */ - gprop = eor; - } else { - byte prop, prop1; - prop = dirProps[i]; - if (prop == B) - isolateCount = -1; /* current isolates stack entry == none */ - if (inverseRTL) { - if (prop == AL) { - /* AL before EN does not make it AN */ - prop = R; - } else if (prop == EN) { - if (nextStrongPos <= i) { - /* look for next strong char (L/R/AL) */ - int j; - nextStrongProp = R; /* set default */ - nextStrongPos = limit; - for (j = i+1; j < limit; j++) { - prop1 = dirProps[j]; - if (prop1 == L || prop1 == R || prop1 == AL) { - nextStrongProp = prop1; - nextStrongPos = j; - break; - } - } - } - if (nextStrongProp == AL) { - prop = AN; - } - } - } - gprop = groupProp[prop]; - } - oldStateImp = stateImp; - cell = impTabProps[oldStateImp][gprop]; - stateImp = GetStateProps(cell); /* isolate the new state */ - actionImp = GetActionProps(cell); /* isolate the action */ - if ((i == limit) && (actionImp == 0)) { - /* there is an unprocessed sequence if its property == eor */ - actionImp = 1; /* process the last sequence */ - } - if (actionImp != 0) { - resProp = impTabProps[oldStateImp][IMPTABPROPS_RES]; - switch (actionImp) { - case 1: /* process current seq1, init new seq1 */ - 
processPropertySeq(levState, resProp, start1, i); - start1 = i; - break; - case 2: /* init new seq2 */ - start2 = i; - break; - case 3: /* process seq1, process seq2, init new seq1 */ - processPropertySeq(levState, resProp, start1, start2); - processPropertySeq(levState, _ON, start2, i); - start1 = i; - break; - case 4: /* process seq1, set seq1=seq2, init new seq2 */ - processPropertySeq(levState, resProp, start1, start2); - start1 = start2; - start2 = i; - break; - default: /* we should never get here */ - throw new IllegalStateException("Internal ICU error in resolveImplicitLevels"); - } - } - } - - /* look for the last char not a BN or LRE/RLE/LRO/RLO/PDF */ - for (i = limit - 1; - i > start && - (DirPropFlag(dirProps[i]) & MASK_BN_EXPLICIT) != 0; - i--); - dirProp = dirProps[i]; - if ((dirProp == LRI || dirProp == RLI) && limit < length) { - isolateCount++; - if (isolates[isolateCount] == null) - isolates[isolateCount] = new Isolate(); - isolates[isolateCount].stateImp = stateImp; - isolates[isolateCount].state = levState.state; - isolates[isolateCount].start1 = start1; - isolates[isolateCount].startON = levState.startON; - } - else - processPropertySeq(levState, eor, limit, limit); - } - - /* perform (L1) and (X9) ---------------------------------------------------- */ - - /* - * Reset the embedding levels for some non-graphic characters (L1). - * This method also sets appropriate levels for BN, and - * explicit embedding types that are supposed to have been removed - * from the paragraph in (X9). 
- */ - private void adjustWSLevels() { - int i; - - if ((flags & MASK_WS) != 0) { - int flag; - i = trailingWSStart; - while (i > 0) { - /* reset a sequence of WS/BN before eop and B/S to the paragraph paraLevel */ - while (i > 0 && ((flag = DirPropFlag(dirProps[--i])) & MASK_WS) != 0) { - if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) { - levels[i] = 0; - } else { - levels[i] = GetParaLevelAt(i); - } - } - - /* reset BN to the next character's paraLevel until B/S, which restarts above loop */ - /* here, i+1 is guaranteed to be 0) { - flag = DirPropFlag(dirProps[--i]); - if ((flag & MASK_BN_EXPLICIT) != 0) { - levels[i] = levels[i + 1]; - } else if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) { - levels[i] = 0; - break; - } else if ((flag & MASK_B_S) != 0){ - levels[i] = GetParaLevelAt(i); - break; - } - } - } - } - } - - private void setParaSuccess() { - paraBidi = this; /* mark successful setPara */ - } - - private int Bidi_Min(int x, int y) { - return x < y ? x : y; - } - - private int Bidi_Abs(int x) { - return x >= 0 ? 
x : -x; - } - - void setParaRunsOnly(char[] parmText, byte parmParaLevel) { - int[] visualMap; - String visualText; - int saveLength, saveTrailingWSStart; - byte[] saveLevels; - byte saveDirection; - int i, j, visualStart, logicalStart, - oldRunCount, runLength, addedRuns, insertRemove, - start, limit, step, indexOddBit, logicalPos, - index, index1; - int saveOptions; - - reorderingMode = REORDER_DEFAULT; - int parmLength = parmText.length; - if (parmLength == 0) { - setPara(parmText, parmParaLevel, null); - reorderingMode = REORDER_RUNS_ONLY; - return; - } - /* obtain memory for mapping table and visual text */ - saveOptions = reorderingOptions; - if ((saveOptions & OPTION_INSERT_MARKS) > 0) { - reorderingOptions &= ~OPTION_INSERT_MARKS; - reorderingOptions |= OPTION_REMOVE_CONTROLS; - } - parmParaLevel &= 1; /* accept only 0 or 1 */ - setPara(parmText, parmParaLevel, null); - /* we cannot access directly levels since it is not yet set if - * direction is not MIXED - */ - saveLevels = new byte[this.length]; - System.arraycopy(getLevels(), 0, saveLevels, 0, this.length); - saveTrailingWSStart = trailingWSStart; - - /* FOOD FOR THOUGHT: instead of writing the visual text, we could use - * the visual map and the dirProps array to drive the second call - * to setPara (but must make provision for possible removal of - * Bidi controls. Alternatively, only use the dirProps array via - * customized classifier callback. 
- */ - visualText = writeReordered(DO_MIRRORING); - visualMap = getVisualMap(); - this.reorderingOptions = saveOptions; - saveLength = this.length; - saveDirection=this.direction; - - this.reorderingMode = REORDER_INVERSE_LIKE_DIRECT; - parmParaLevel ^= 1; - setPara(visualText, parmParaLevel, null); - BidiLine.getRuns(this); - /* check if some runs must be split, count how many splits */ - addedRuns = 0; - oldRunCount = this.runCount; - visualStart = 0; - for (i = 0; i < oldRunCount; i++, visualStart += runLength) { - runLength = runs[i].limit - visualStart; - if (runLength < 2) { - continue; - } - logicalStart = runs[i].start; - for (j = logicalStart+1; j < logicalStart+runLength; j++) { - index = visualMap[j]; - index1 = visualMap[j-1]; - if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) { - addedRuns++; - } - } - } - if (addedRuns > 0) { - getRunsMemory(oldRunCount + addedRuns); - if (runCount == 1) { - /* because we switch from UBiDi.simpleRuns to UBiDi.runs */ - runsMemory[0] = runs[0]; - } else { - System.arraycopy(runs, 0, runsMemory, 0, runCount); - } - runs = runsMemory; - runCount += addedRuns; - for (i = oldRunCount; i < runCount; i++) { - if (runs[i] == null) { - runs[i] = new BidiRun(0, 0, (byte)0); - } - } - } - /* split runs which are not consecutive in source text */ - int newI; - for (i = oldRunCount-1; i >= 0; i--) { - newI = i + addedRuns; - runLength = i==0 ? 
runs[0].limit : - runs[i].limit - runs[i-1].limit; - logicalStart = runs[i].start; - indexOddBit = runs[i].level & 1; - if (runLength < 2) { - if (addedRuns > 0) { - runs[newI].copyFrom(runs[i]); - } - logicalPos = visualMap[logicalStart]; - runs[newI].start = logicalPos; - runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); - continue; - } - if (indexOddBit > 0) { - start = logicalStart; - limit = logicalStart + runLength - 1; - step = 1; - } else { - start = logicalStart + runLength - 1; - limit = logicalStart; - step = -1; - } - for (j = start; j != limit; j += step) { - index = visualMap[j]; - index1 = visualMap[j+step]; - if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) { - logicalPos = Bidi_Min(visualMap[start], index); - runs[newI].start = logicalPos; - runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); - runs[newI].limit = runs[i].limit; - runs[i].limit -= Bidi_Abs(j - start) + 1; - insertRemove = runs[i].insertRemove & (LRM_AFTER|RLM_AFTER); - runs[newI].insertRemove = insertRemove; - runs[i].insertRemove &= ~insertRemove; - start = j + step; - addedRuns--; - newI--; - } - } - if (addedRuns > 0) { - runs[newI].copyFrom(runs[i]); - } - logicalPos = Bidi_Min(visualMap[start], visualMap[limit]); - runs[newI].start = logicalPos; - runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); - } - - cleanup1: - /* restore initial paraLevel */ - this.paraLevel ^= 1; - cleanup2: - /* restore real text */ - this.text = parmText; - this.length = saveLength; - this.originalLength = parmLength; - this.direction=saveDirection; - this.levels = saveLevels; - this.trailingWSStart = saveTrailingWSStart; - if (runCount > 1) { - this.direction = MIXED; - } - cleanup3: - this.reorderingMode = REORDER_RUNS_ONLY; - } - - /** - * Perform the Unicode Bidi algorithm. It is defined in the - * Unicode Standard Annex #9, - * version 13, - * also described in The Unicode Standard, Version 4.0 .

- * - * This method takes a piece of plain text containing one or more paragraphs, - * with or without externally specified embedding levels from styled - * text and computes the left-right-directionality of each character.

- * - * If the entire text is all of the same directionality, then - * the method may not perform all the steps described by the algorithm, - * i.e., some levels may not be the same as if all steps were performed. - * This is not relevant for unidirectional text.
- * For example, in pure LTR text with numbers the numbers would get - * a resolved level of 2 higher than the surrounding text according to - * the algorithm. This implementation may set all resolved levels to - * the same value in such a case.

- * - * The text can be composed of multiple paragraphs. Occurrence of a block - * separator in the text terminates a paragraph, and whatever comes next starts - * a new paragraph. The exception to this rule is when a Carriage Return (CR) - * is followed by a Line Feed (LF). Both CR and LF are block separators, but - * in that case, the pair of characters is considered as terminating the - * preceding paragraph, and a new paragraph will be started by a character - * coming after the LF. - * - * Although the text is passed here as a String, it is - * stored internally as an array of characters. Therefore the - * documentation will refer to indexes of the characters in the text. - * - * @param text contains the text that the Bidi algorithm will be performed - * on. This text can be retrieved with getText() or - * getTextAsString.
- * - * @param paraLevel specifies the default level for the text; - * it is typically 0 (LTR) or 1 (RTL). - * If the method shall determine the paragraph level from the text, - * then paraLevel can be set to - * either LEVEL_DEFAULT_LTR - * or LEVEL_DEFAULT_RTL; if the text contains multiple - * paragraphs, the paragraph level shall be determined separately for - * each paragraph; if a paragraph does not include any strongly typed - * character, then the desired default is used (0 for LTR or 1 for RTL). - * Any other value between 0 and MAX_EXPLICIT_LEVEL - * is also valid, with odd levels indicating RTL. - * - * @param embeddingLevels (in) may be used to preset the embedding and override levels, - * ignoring characters like LRE and PDF in the text. - * A level overrides the directional property of its corresponding - * (same index) character if the level has the - * LEVEL_OVERRIDE bit set.

- * Except for that bit, it must be - * paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL, - * with one exception: a level of zero may be specified for a - * paragraph separator even if paraLevel>0 when multiple - * paragraphs are submitted in the same call to setPara().

- * Caution: A reference to this array, not a copy - * of the levels, will be stored in the Bidi object; - * the embeddingLevels - * should not be modified to avoid unexpected results on subsequent - * Bidi operations. However, the setPara() and - * setLine() methods may modify some or all of the - * levels.

- * Note: the embeddingLevels array must - * have one entry for each character in text. - * - * @throws IllegalArgumentException if the values in embeddingLevels are - * not within the allowed range - * - * @see #LEVEL_DEFAULT_LTR - * @see #LEVEL_DEFAULT_RTL - * @see #LEVEL_OVERRIDE - * @see #MAX_EXPLICIT_LEVEL - * @stable ICU 3.8 - */ - void setPara(String text, byte paraLevel, byte[] embeddingLevels) - { - if (text == null) { - setPara(new char[0], paraLevel, embeddingLevels); - } else { - setPara(text.toCharArray(), paraLevel, embeddingLevels); - } - } - - /** - * Perform the Unicode Bidi algorithm. It is defined in the - * Unicode Standard Annex #9, - * version 13, - * also described in The Unicode Standard, Version 4.0 .

- * - * This method takes a piece of plain text containing one or more paragraphs, - * with or without externally specified embedding levels from styled - * text and computes the left-right-directionality of each character.

- * - * If the entire text is all of the same directionality, then - * the method may not perform all the steps described by the algorithm, - * i.e., some levels may not be the same as if all steps were performed. - * This is not relevant for unidirectional text.
- * For example, in pure LTR text with numbers the numbers would get - * a resolved level of 2 higher than the surrounding text according to - * the algorithm. This implementation may set all resolved levels to - * the same value in such a case. - * - * The text can be composed of multiple paragraphs. Occurrence of a block - * separator in the text terminates a paragraph, and whatever comes next starts - * a new paragraph. The exception to this rule is when a Carriage Return (CR) - * is followed by a Line Feed (LF). Both CR and LF are block separators, but - * in that case, the pair of characters is considered as terminating the - * preceding paragraph, and a new paragraph will be started by a character - * coming after the LF. - * - * The text is stored internally as an array of characters. Therefore the - * documentation will refer to indexes of the characters in the text. - * - * @param chars contains the text that the Bidi algorithm will be performed - * on. This text can be retrieved with getText() or - * getTextAsString.
- * - * @param paraLevel specifies the default level for the text; - * it is typically 0 (LTR) or 1 (RTL). - * If the method shall determine the paragraph level from the text, - * then paraLevel can be set to - * either LEVEL_DEFAULT_LTR - * or LEVEL_DEFAULT_RTL; if the text contains multiple - * paragraphs, the paragraph level shall be determined separately for - * each paragraph; if a paragraph does not include any strongly typed - * character, then the desired default is used (0 for LTR or 1 for RTL). - * Any other value between 0 and MAX_EXPLICIT_LEVEL - * is also valid, with odd levels indicating RTL. - * - * @param embeddingLevels (in) may be used to preset the embedding and - * override levels, ignoring characters like LRE and PDF in the text. - * A level overrides the directional property of its corresponding - * (same index) character if the level has the - * LEVEL_OVERRIDE bit set.

- * Except for that bit, it must be - * paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL, - * with one exception: a level of zero may be specified for a - * paragraph separator even if paraLevel>0 when multiple - * paragraphs are submitted in the same call to setPara().

- * Caution: A reference to this array, not a copy - * of the levels, will be stored in the Bidi object; - * the embeddingLevels - * should not be modified to avoid unexpected results on subsequent - * Bidi operations. However, the setPara() and - * setLine() methods may modify some or all of the - * levels.

- * Note: the embeddingLevels array must - * have one entry for each character in text. - * - * @throws IllegalArgumentException if the values in embeddingLevels are - * not within the allowed range - * - * @see #LEVEL_DEFAULT_LTR - * @see #LEVEL_DEFAULT_RTL - * @see #LEVEL_OVERRIDE - * @see #MAX_EXPLICIT_LEVEL - * @stable ICU 3.8 - */ - void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels) - { - /* check the argument values */ - if (paraLevel < LEVEL_DEFAULT_LTR) { - verifyRange(paraLevel, 0, MAX_EXPLICIT_LEVEL + 1); - } - if (chars == null) { - chars = new char[0]; - } - - /* special treatment for RUNS_ONLY mode */ - if (reorderingMode == REORDER_RUNS_ONLY) { - setParaRunsOnly(chars, paraLevel); - return; - } - - /* initialize the Bidi object */ - this.paraBidi = null; /* mark unfinished setPara */ - this.text = chars; - this.length = this.originalLength = this.resultLength = text.length; - this.paraLevel = paraLevel; - this.direction = (byte)(paraLevel & 1); - this.paraCount = 1; - - /* Allocate zero-length arrays instead of setting to null here; then - * checks for null in various places can be eliminated. - */ - dirProps = new byte[0]; - levels = new byte[0]; - runs = new BidiRun[0]; - isGoodLogicalToVisualRunsMap = false; - insertPoints.size = 0; /* clean up from last call */ - insertPoints.confirmed = 0; /* clean up from last call */ - - /* - * Save the original paraLevel if contextual; otherwise, set to 0. - */ - defaultParaLevel = IsDefaultLevel(paraLevel) ? paraLevel : 0; - - if (length == 0) { - /* - * For an empty paragraph, create a Bidi object with the paraLevel and - * the flags and the direction set but without allocating zero-length arrays. - * There is nothing more to do. 
- */ - if (IsDefaultLevel(paraLevel)) { - this.paraLevel &= 1; - defaultParaLevel = 0; - } - flags = DirPropFlagLR(paraLevel); - runCount = 0; - paraCount = 0; - setParaSuccess(); - return; - } - - runCount = -1; - - /* - * Get the directional properties, - * the flags bit-set, and - * determine the paragraph level if necessary. - */ - getDirPropsMemory(length); - dirProps = dirPropsMemory; - getDirProps(); - /* the processed length may have changed if OPTION_STREAMING is set */ - trailingWSStart = length; /* the levels[] will reflect the WS run */ - - /* are explicit levels specified? */ - if (embeddingLevels == null) { - /* no: determine explicit levels according to the (Xn) rules */ - getLevelsMemory(length); - levels = levelsMemory; - direction = resolveExplicitLevels(); - } else { - /* set BN for all explicit codes, check that all levels are 0 or paraLevel..MAX_EXPLICIT_LEVEL */ - levels = embeddingLevels; - direction = checkExplicitLevels(); - } - - /* allocate isolate memory */ - if (isolateCount > 0) { - if (isolates == null || isolates.length < isolateCount) - isolates = new Isolate[isolateCount + 3]; /* keep some reserve */ - } - isolateCount = -1; /* current isolates stack entry == none */ - - /* - * The steps after (X9) in the Bidi algorithm are performed only if - * the paragraph text has mixed directionality! 
- */ - switch (direction) { - case LTR: - /* all levels are implicitly at paraLevel (important for getLevels()) */ - trailingWSStart = 0; - break; - case RTL: - /* all levels are implicitly at paraLevel (important for getLevels()) */ - trailingWSStart = 0; - break; - default: - /* - * Choose the right implicit state table - */ - switch(reorderingMode) { - case REORDER_DEFAULT: - this.impTabPair = impTab_DEFAULT; - break; - case REORDER_NUMBERS_SPECIAL: - this.impTabPair = impTab_NUMBERS_SPECIAL; - break; - case REORDER_GROUP_NUMBERS_WITH_R: - this.impTabPair = impTab_GROUP_NUMBERS_WITH_R; - break; - case REORDER_RUNS_ONLY: - /* we should never get here */ - throw new InternalError("Internal ICU error in setPara"); - /* break; */ - case REORDER_INVERSE_NUMBERS_AS_L: - this.impTabPair = impTab_INVERSE_NUMBERS_AS_L; - break; - case REORDER_INVERSE_LIKE_DIRECT: - if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) { - this.impTabPair = impTab_INVERSE_LIKE_DIRECT_WITH_MARKS; - } else { - this.impTabPair = impTab_INVERSE_LIKE_DIRECT; - } - break; - case REORDER_INVERSE_FOR_NUMBERS_SPECIAL: - if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) { - this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS; - } else { - this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL; - } - break; - } - /* - * If there are no external levels specified and there - * are no significant explicit level codes in the text, - * then we can treat the entire paragraph as one run. - * Otherwise, we need to perform the following rules on runs of - * the text with the same embedding levels. (X10) - * "Significant" explicit level codes are ones that actually - * affect non-BN characters. - * Examples for "insignificant" ones are empty embeddings - * LRE-PDF, LRE-RLE-PDF-PDF, etc. 
- */ - if (embeddingLevels == null && paraCount <= 1 && - (flags & DirPropFlagMultiRuns) == 0) { - resolveImplicitLevels(0, length, - GetLRFromLevel(GetParaLevelAt(0)), - GetLRFromLevel(GetParaLevelAt(length - 1))); - } else { - /* sor, eor: start and end types of same-level-run */ - int start, limit = 0; - byte level, nextLevel; - short sor, eor; - - /* determine the first sor and set eor to it because of the loop body (sor=eor there) */ - level = GetParaLevelAt(0); - nextLevel = levels[0]; - if (level < nextLevel) { - eor = GetLRFromLevel(nextLevel); - } else { - eor = GetLRFromLevel(level); - } - - do { - /* determine start and limit of the run (end points just behind the run) */ - - /* the values for this run's start are the same as for the previous run's end */ - start = limit; - level = nextLevel; - if ((start > 0) && (dirProps[start - 1] == B)) { - /* except if this is a new paragraph, then set sor = para level */ - sor = GetLRFromLevel(GetParaLevelAt(start)); - } else { - sor = eor; - } - - /* search for the limit of this run */ - while ((++limit < length) && - ((levels[limit] == level) || - ((DirPropFlag(dirProps[limit]) & MASK_BN_EXPLICIT) != 0))) {} - - /* get the correct level of the next run */ - if (limit < length) { - nextLevel = levels[limit]; - } else { - nextLevel = GetParaLevelAt(length - 1); - } - - /* determine eor from max(level, nextLevel); sor is last run's eor */ - if (NoOverride(level) < NoOverride(nextLevel)) { - eor = GetLRFromLevel(nextLevel); - } else { - eor = GetLRFromLevel(level); - } - - /* if the run consists of overridden directional types, then there - are no implicit types to be resolved */ - if ((level & LEVEL_OVERRIDE) == 0) { - resolveImplicitLevels(start, limit, sor, eor); - } else { - /* remove the LEVEL_OVERRIDE flags */ - do { - levels[start++] &= ~LEVEL_OVERRIDE; - } while (start < limit); - } - } while (limit < length); - } - - /* reset the embedding levels for some non-graphic characters (L1), (X9) */ - 
adjustWSLevels(); - - break; - } - - /* add RLM for inverse Bidi with contextual orientation resolving - * to RTL which would not round-trip otherwise - */ - if ((defaultParaLevel > 0) && - ((reorderingOptions & OPTION_INSERT_MARKS) != 0) && - ((reorderingMode == REORDER_INVERSE_LIKE_DIRECT) || - (reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL))) { - int start, last; - byte level; - byte dirProp; - for (int i = 0; i < paraCount; i++) { - last = paras_limit[i] - 1; - level = paras_level[i]; - if (level == 0) - continue; /* LTR paragraph */ - start = i == 0 ? 0 : paras_limit[i - 1]; - for (int j = last; j >= start; j--) { - dirProp = dirProps[j]; - if (dirProp == L) { - if (j < last) { - while (dirProps[last] == B) { - last--; - } - } - addPoint(last, RLM_BEFORE); - break; - } - if ((DirPropFlag(dirProp) & MASK_R_AL) != 0) { - break; - } - } - } - } - - if ((reorderingOptions & OPTION_REMOVE_CONTROLS) != 0) { - resultLength -= controlCount; - } else { - resultLength += insertPoints.size; - } - setParaSuccess(); - } - - /** - * Perform the Unicode Bidi algorithm on a given paragraph, as defined in the - * Unicode Standard Annex #9, - * version 13, - * also described in The Unicode Standard, Version 4.0 .

- * - * This method takes a paragraph of text and computes the - * left-right-directionality of each character. The text should not - * contain any Unicode block separators.

- * - * The RUN_DIRECTION attribute in the text, if present, determines the base - * direction (left-to-right or right-to-left). If not present, the base - * direction is computed using the Unicode Bidirectional Algorithm, - * defaulting to left-to-right if there are no strong directional characters - * in the text. This attribute, if present, must be applied to all the text - * in the paragraph.

- * - * The BIDI_EMBEDDING attribute in the text, if present, represents - * embedding level information. Negative values from -1 to -62 indicate - * overrides at the absolute value of the level. Positive values from 1 to - * 62 indicate embeddings. Where values are zero or not defined, the base - * embedding level as determined by the base direction is assumed.

- * - * The NUMERIC_SHAPING attribute in the text, if present, converts European - * digits to other decimal digits before running the bidi algorithm. This - * attribute, if present, must be applied to all the text in the paragraph. - * - * If the entire text is all of the same directionality, then - * the method may not perform all the steps described by the algorithm, - * i.e., some levels may not be the same as if all steps were performed. - * This is not relevant for unidirectional text.
- * For example, in pure LTR text with numbers the numbers would get - * a resolved level of 2 higher than the surrounding text according to - * the algorithm. This implementation may set all resolved levels to - * the same value in such a case.

- * - * @param paragraph a paragraph of text with optional character and - * paragraph attribute information - * @stable ICU 3.8 - */ - public void setPara(AttributedCharacterIterator paragraph) - { - byte paraLvl; - char ch = paragraph.first(); - Boolean runDirection = - (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION); - Object shaper = paragraph.getAttribute(TextAttributeConstants.NUMERIC_SHAPING); - - if (runDirection == null) { - paraLvl = LEVEL_DEFAULT_LTR; - } else { - paraLvl = (runDirection.equals(TextAttributeConstants.RUN_DIRECTION_LTR)) ? - LTR : RTL; - } - - byte[] lvls = null; - int len = paragraph.getEndIndex() - paragraph.getBeginIndex(); - byte[] embeddingLevels = new byte[len]; - char[] txt = new char[len]; - int i = 0; - while (ch != AttributedCharacterIterator.DONE) { - txt[i] = ch; - Integer embedding = - (Integer) paragraph.getAttribute(TextAttributeConstants.BIDI_EMBEDDING); - if (embedding != null) { - byte level = embedding.byteValue(); - if (level == 0) { - /* no-op */ - } else if (level < 0) { - lvls = embeddingLevels; - embeddingLevels[i] = (byte)((0 - level) | LEVEL_OVERRIDE); - } else { - lvls = embeddingLevels; - embeddingLevels[i] = level; - } - } - ch = paragraph.next(); - ++i; - } - - if (shaper != null) { - NumericShapings.shape(shaper, txt, 0, len); - } - setPara(txt, paraLvl, lvls); - } - - /** - * Specify whether block separators must be allocated level zero, - * so that successive paragraphs will progress from left to right. - * This method must be called before setPara(). - * Paragraph separators (B) may appear in the text. Setting them to level zero - * means that all paragraph separators (including one possibly appearing - * in the last text position) are kept in the reordered text after the text - * that they follow in the source text. 
- * When this feature is not enabled, a paragraph separator at the last - * position of the text before reordering will go to the first position - * of the reordered text when the paragraph level is odd. - * - * @param ordarParaLTR specifies whether paragraph separators (B) must - * receive level 0, so that successive paragraphs progress from left to right. - * - * @see #setPara - * @stable ICU 3.8 - */ - public void orderParagraphsLTR(boolean ordarParaLTR) { - orderParagraphsLTR = ordarParaLTR; - } - - /** - * Get the directionality of the text. - * - * @return a value of LTR, RTL or MIXED - * that indicates if the entire text - * represented by this object is unidirectional, - * and which direction, or if it is mixed-directional. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * - * @see #LTR - * @see #RTL - * @see #MIXED - * @stable ICU 3.8 - */ - public byte getDirection() - { - verifyValidParaOrLine(); - return direction; - } - - /** - * Get the length of the text. - * - * @return The length of the text that the Bidi object was - * created for. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @stable ICU 3.8 - */ - public int getLength() - { - verifyValidParaOrLine(); - return originalLength; - } - - /* paragraphs API methods ------------------------------------------------- */ - - /** - * Get the paragraph level of the text. - * - * @return The paragraph level. If there are multiple paragraphs, their - * level may vary if the required paraLevel is LEVEL_DEFAULT_LTR or - * LEVEL_DEFAULT_RTL. In that case, the level of the first paragraph - * is returned. 
- * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * - * @see #LEVEL_DEFAULT_LTR - * @see #LEVEL_DEFAULT_RTL - * @see #getParagraph - * @see #getParagraphByIndex - * @stable ICU 3.8 - */ - public byte getParaLevel() - { - verifyValidParaOrLine(); - return paraLevel; - } - - /** - * Retrieves the Bidi class for a given code point. - *

If a BidiClassifier is defined and returns a value - * other than CLASS_DEFAULT, that value is used; otherwise - * the default class determination mechanism is invoked.

- * - * @param c The code point to get a Bidi class for. - * - * @return The Bidi class for the character c that is in effect - * for this Bidi instance. - * - * @stable ICU 3.8 - */ - public int getCustomizedClass(int c) { - int dir; - - dir = bdp.getClass(c); - if (dir >= CHAR_DIRECTION_COUNT) - dir = ON; - return dir; - } - - /** - * setLine() returns a Bidi object to - * contain the reordering information, especially the resolved levels, - * for all the characters in a line of text. This line of text is - * specified by referring to a Bidi object representing - * this information for a piece of text containing one or more paragraphs, - * and by specifying a range of indexes in this text.

- * In the new line object, the indexes will range from 0 to limit-start-1.

- * - * This is used after calling setPara() - * for a piece of text, and after line-breaking on that text. - * It is not necessary if each paragraph is treated as a single line.

- * - * After line-breaking, rules (L1) and (L2) for the treatment of - * trailing WS and for reordering are performed on - * a Bidi object that represents a line.

- * - * Important: the line Bidi object may - * reference data within the global text Bidi object. - * You should not alter the content of the global text object until - * you are finished using the line object. - * - * @param start is the line's first index into the text. - * - * @param limit is just behind the line's last index into the text - * (its last index +1). - * - * @return a Bidi object that will now represent a line of the text. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara - * @throws IllegalArgumentException if start and limit are not in the range - * 0<=start<limit<=getProcessedLength(), - * or if the specified line crosses a paragraph boundary - * - * @see #setPara - * @see #getProcessedLength - * @stable ICU 3.8 - */ - public Bidi setLine(Bidi bidi, BidiBase bidiBase, Bidi newBidi, BidiBase newBidiBase, int start, int limit) - { - verifyValidPara(); - verifyRange(start, 0, limit); - verifyRange(limit, 0, length+1); - - return BidiLine.setLine(this, newBidi, newBidiBase, start, limit); - } - - /** - * Get the level for one character. - * - * @param charIndex the index of a character. - * - * @return The level for the character at charIndex. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @throws IllegalArgumentException if charIndex is not in the range - * 0<=charIndex<getProcessedLength() - * - * @see #getProcessedLength - * @stable ICU 3.8 - */ - public byte getLevelAt(int charIndex) - { - // for backward compatibility - if (charIndex < 0 || charIndex >= length) { - return (byte)getBaseLevel(); - } - - verifyValidParaOrLine(); - verifyRange(charIndex, 0, length); - return BidiLine.getLevelAt(this, charIndex); - } - - /** - * Get an array of levels for each character.

- * - * Note that this method may allocate memory under some - * circumstances, unlike getLevelAt(). - * - * @return The levels array for the text, - * or null if an error occurs. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @stable ICU 3.8 - */ - byte[] getLevels() - { - verifyValidParaOrLine(); - if (length <= 0) { - return new byte[0]; - } - return BidiLine.getLevels(this); - } - - /** - * Get the number of runs. - * This method may invoke the actual reordering on the - * Bidi object, after setPara() - * may have resolved only the levels of the text. Therefore, - * countRuns() may have to allocate memory, - * and may throw an exception if it fails to do so. - * - * @return The number of runs. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @stable ICU 3.8 - */ - public int countRuns() - { - verifyValidParaOrLine(); - BidiLine.getRuns(this); - return runCount; - } - - /** - * - * Get a BidiRun object according to its index. BidiRun methods - * may be used to retrieve the run's logical start, length and level, - * which can be even for an LTR run or odd for an RTL run. - * In an RTL run, the character at the logical start is - * visually on the right of the displayed run. - * The length is the number of characters in the run.

- * countRuns() is normally called - * before the runs are retrieved. - * - *

- * Example: - *

-     *  Bidi bidi = new Bidi();
-     *  String text = "abc 123 DEFG xyz";
-     *  bidi.setPara(text, Bidi.RTL, null);
-     *  int i, count=bidi.countRuns(), logicalStart, visualIndex=0, length;
-     *  BidiRun run;
-     *  for (i = 0; i < count; ++i) {
-     *      run = bidi.getVisualRun(i);
-     *      logicalStart = run.getStart();
-     *      length = run.getLength();
-     *      if (Bidi.LTR == run.getEmbeddingLevel()) {
-     *          do { // LTR
-     *              show_char(text.charAt(logicalStart++), visualIndex++);
-     *          } while (--length > 0);
-     *      } else {
-     *          logicalStart += length;  // logicalLimit
-     *          do { // RTL
-     *              show_char(text.charAt(--logicalStart), visualIndex++);
-     *          } while (--length > 0);
-     *      }
-     *  }
-     * 
- *

- * Note that in right-to-left runs, code like this places - * second surrogates before first ones (which is generally a bad idea) - * and combining characters before base characters. - *

- * Use of {@link #writeReordered}, optionally with the - * {@link #KEEP_BASE_COMBINING} option, can be considered in - * order to avoid these issues. - * - * @param runIndex is the number of the run in visual order, in the - * range [0..countRuns()-1]. - * - * @return a BidiRun object containing the details of the run. The - * directionality of the run is - * LTR==0 or RTL==1, - * never MIXED. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @throws IllegalArgumentException if runIndex is not in - * the range 0<=runIndex<countRuns() - * - * @see #countRuns() - * @see com.ibm.icu.text.BidiRun - * @see com.ibm.icu.text.BidiRun#getStart() - * @see com.ibm.icu.text.BidiRun#getLength() - * @see com.ibm.icu.text.BidiRun#getEmbeddingLevel() - * @stable ICU 3.8 - */ - BidiRun getVisualRun(int runIndex) - { - verifyValidParaOrLine(); - BidiLine.getRuns(this); - verifyRange(runIndex, 0, runCount); - return BidiLine.getVisualRun(this, runIndex); - } - - /** - * Get a visual-to-logical index map (array) for the characters in the - * Bidi (paragraph or line) object. - *

- * Some values in the map may be MAP_NOWHERE if the - * corresponding text characters are Bidi marks inserted in the visual - * output by the option OPTION_INSERT_MARKS. - *

- * When the visual output is altered by using options of - * writeReordered() such as INSERT_LRM_FOR_NUMERIC, - * KEEP_BASE_COMBINING, OUTPUT_REVERSE, - * REMOVE_BIDI_CONTROLS, the logical positions returned may not - * be correct. It is advised to use, when possible, reordering options - * such as {@link #OPTION_INSERT_MARKS} and {@link #OPTION_REMOVE_CONTROLS}. - * - * @return an array of getResultLength() - * indexes which will reflect the reordering of the characters.

- * The index map will result in - * indexMap[visualIndex]==logicalIndex, where - * indexMap represents the returned array. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * - * @see #getLogicalMap - * @see #getLogicalIndex - * @see #getResultLength - * @see #MAP_NOWHERE - * @see #OPTION_INSERT_MARKS - * @see #writeReordered - * @stable ICU 3.8 - */ - private int[] getVisualMap() - { - /* countRuns() checks successful call to setPara/setLine */ - countRuns(); - if (resultLength <= 0) { - return new int[0]; - } - return BidiLine.getVisualMap(this); - } - - /** - * This is a convenience method that does not use a Bidi object. - * It is intended to be used for when an application has determined the levels - * of objects (character sequences) and just needs to have them reordered (L2). - * This is equivalent to using getVisualMap() on a - * Bidi object. - * - * @param levels is an array of levels that have been determined by - * the application. - * - * @return an array of levels.length - * indexes which will reflect the reordering of the characters.

- * The index map will result in - * indexMap[visualIndex]==logicalIndex, where - * indexMap represents the returned array. - * - * @stable ICU 3.8 - */ - private static int[] reorderVisual(byte[] levels) - { - return BidiLine.reorderVisual(levels); - } - - /** - * Constant indicating that the base direction depends on the first strong - * directional character in the text according to the Unicode Bidirectional - * Algorithm. If no strong directional character is present, the base - * direction is right-to-left. - * @stable ICU 3.8 - */ - public static final int DIRECTION_DEFAULT_RIGHT_TO_LEFT = LEVEL_DEFAULT_RTL; - - /** - * Create Bidi from the given text, embedding, and direction information. - * The embeddings array may be null. If present, the values represent - * embedding level information. Negative values from -1 to -61 indicate - * overrides at the absolute value of the level. Positive values from 1 to - * 61 indicate embeddings. Where values are zero, the base embedding level - * as determined by the base direction is assumed.

- * - * Note: this constructor calls setPara() internally. - * - * @param text an array containing the paragraph of text to process. - * @param textStart the index into the text array of the start of the - * paragraph. - * @param embeddings an array containing embedding values for each character - * in the paragraph. This can be null, in which case it is assumed - * that there is no external embedding information. - * @param embStart the index into the embedding array of the start of the - * paragraph. - * @param paragraphLength the length of the paragraph in the text and - * embeddings arrays. - * @param flags a collection of flags that control the algorithm. The - * algorithm understands the flags DIRECTION_LEFT_TO_RIGHT, - * DIRECTION_RIGHT_TO_LEFT, DIRECTION_DEFAULT_LEFT_TO_RIGHT, and - * DIRECTION_DEFAULT_RIGHT_TO_LEFT. Other values are reserved. - * - * @throws IllegalArgumentException if the values in embeddings are - * not within the allowed range - * - * @see #DIRECTION_LEFT_TO_RIGHT - * @see #DIRECTION_RIGHT_TO_LEFT - * @see #DIRECTION_DEFAULT_LEFT_TO_RIGHT - * @see #DIRECTION_DEFAULT_RIGHT_TO_LEFT - * @stable ICU 3.8 - */ - public BidiBase(char[] text, - int textStart, - byte[] embeddings, - int embStart, - int paragraphLength, - int flags) - { - this(0, 0); - byte paraLvl; - switch (flags) { - case Bidi.DIRECTION_LEFT_TO_RIGHT: - default: - paraLvl = LTR; - break; - case Bidi.DIRECTION_RIGHT_TO_LEFT: - paraLvl = RTL; - break; - case Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT: - paraLvl = LEVEL_DEFAULT_LTR; - break; - case Bidi.DIRECTION_DEFAULT_RIGHT_TO_LEFT: - paraLvl = LEVEL_DEFAULT_RTL; - break; - } - byte[] paraEmbeddings; - if (embeddings == null) { - paraEmbeddings = null; - } else { - paraEmbeddings = new byte[paragraphLength]; - byte lev; - for (int i = 0; i < paragraphLength; i++) { - lev = embeddings[i + embStart]; - if (lev < 0) { - lev = (byte)((- lev) | LEVEL_OVERRIDE); - } else if (lev == 0) { - lev = paraLvl; - if (paraLvl > MAX_EXPLICIT_LEVEL) 
{ - lev &= 1; - } - } - paraEmbeddings[i] = lev; - } - } - - char[] paraText = new char[paragraphLength]; - System.arraycopy(text, textStart, paraText, 0, paragraphLength); - setPara(paraText, paraLvl, paraEmbeddings); - } - - /** - * Return true if the line is not left-to-right or right-to-left. This means - * it either has mixed runs of left-to-right and right-to-left text, or the - * base direction differs from the direction of the only run of text. - * - * @return true if the line is not left-to-right or right-to-left. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara - * @stable ICU 3.8 - */ - public boolean isMixed() - { - return (!isLeftToRight() && !isRightToLeft()); - } - - /** - * Return true if the line is all left-to-right text and the base direction - * is left-to-right. - * - * @return true if the line is all left-to-right text and the base direction - * is left-to-right. - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara - * @stable ICU 3.8 - */ - public boolean isLeftToRight() - { - return (getDirection() == LTR && (paraLevel & 1) == 0); - } - - /** - * Return true if the line is all right-to-left text, and the base direction - * is right-to-left - * - * @return true if the line is all right-to-left text, and the base - * direction is right-to-left - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara - * @stable ICU 3.8 - */ - public boolean isRightToLeft() - { - return (getDirection() == RTL && (paraLevel & 1) == 1); - } - - /** - * Return true if the base direction is left-to-right - * - * @return true if the base direction is left-to-right - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * - * @stable ICU 3.8 - */ - public boolean baseIsLeftToRight() - { - return (getParaLevel() == LTR); - } - - /** - * Return the base level (0 
if left-to-right, 1 if right-to-left). - * - * @return the base level - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * - * @stable ICU 3.8 - */ - public int getBaseLevel() - { - return getParaLevel(); - } - - /** - * Compute the logical to visual run mapping - */ - void getLogicalToVisualRunsMap() - { - if (isGoodLogicalToVisualRunsMap) { - return; - } - int count = countRuns(); - if ((logicalToVisualRunsMap == null) || - (logicalToVisualRunsMap.length < count)) { - logicalToVisualRunsMap = new int[count]; - } - int i; - long[] keys = new long[count]; - for (i = 0; i < count; i++) { - keys[i] = ((long)(runs[i].start)<<32) + i; - } - Arrays.sort(keys); - for (i = 0; i < count; i++) { - logicalToVisualRunsMap[i] = (int)(keys[i] & 0x00000000FFFFFFFF); - } - isGoodLogicalToVisualRunsMap = true; - } - - /** - * Return the level of the nth logical run in this line. - * - * @param run the index of the run, between 0 and countRuns()-1 - * - * @return the level of the run - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @throws IllegalArgumentException if run is not in - * the range 0<=run<countRuns() - * @stable ICU 3.8 - */ - public int getRunLevel(int run) - { - verifyValidParaOrLine(); - BidiLine.getRuns(this); - - // for backward compatibility - if (run < 0 || run >= runCount) { - return getParaLevel(); - } - - getLogicalToVisualRunsMap(); - return runs[logicalToVisualRunsMap[run]].level; - } - - /** - * Return the index of the character at the start of the nth logical run in - * this line, as an offset from the start of the line. 
- * - * @param run the index of the run, between 0 and countRuns() - * - * @return the start of the run - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @throws IllegalArgumentException if run is not in - * the range 0<=run<countRuns() - * @stable ICU 3.8 - */ - public int getRunStart(int run) - { - verifyValidParaOrLine(); - BidiLine.getRuns(this); - - // for backward compatibility - if (runCount == 1) { - return 0; - } else if (run == runCount) { - return length; - } - - getLogicalToVisualRunsMap(); - return runs[logicalToVisualRunsMap[run]].start; - } - - /** - * Return the index of the character past the end of the nth logical run in - * this line, as an offset from the start of the line. For example, this - * will return the length of the line for the last run on the line. - * - * @param run the index of the run, between 0 and countRuns() - * - * @return the limit of the run - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * @throws IllegalArgumentException if run is not in - * the range 0<=run<countRuns() - * @stable ICU 3.8 - */ - public int getRunLimit(int run) - { - verifyValidParaOrLine(); - BidiLine.getRuns(this); - - // for backward compatibility - if (runCount == 1) { - return length; - } - - getLogicalToVisualRunsMap(); - int idx = logicalToVisualRunsMap[run]; - int len = idx == 0 ? runs[idx].limit : - runs[idx].limit - runs[idx-1].limit; - return runs[idx].start + len; - } - - /** - * Return true if the specified text requires bidi analysis. If this returns - * false, the text will display left-to-right. Clients can then avoid - * constructing a Bidi object. Text in the Arabic Presentation Forms area of - * Unicode is presumed to already be shaped and ordered for display, and so - * will not cause this method to return true. 
- * - * @param text the text containing the characters to test - * @param start the start of the range of characters to test - * @param limit the limit of the range of characters to test - * - * @return true if the range of characters requires bidi analysis - * - * @stable ICU 3.8 - */ - public static boolean requiresBidi(char[] text, - int start, - int limit) - { - final int RTLMask = (1 << R | - 1 << AL | - 1 << RLE | - 1 << RLO | - 1 << AN); - - if (0 > start || start > limit || limit > text.length) { - throw new IllegalArgumentException("Value start " + start + - " is out of range 0 to " + limit); - } - - for (int i = start; i < limit; ++i) { - if (Character.isHighSurrogate(text[i]) && i < (limit-1) && - Character.isLowSurrogate(text[i+1])) { - if (((1 << UCharacter.getDirection(Character.codePointAt(text, i))) & RTLMask) != 0) { - return true; - } - } else if (((1 << UCharacter.getDirection(text[i])) & RTLMask) != 0) { - return true; - } - } - - return false; - } - - /** - * Reorder the objects in the array into visual order based on their levels. - * This is a utility method to use when you have a collection of objects - * representing runs of text in logical order, each run containing text at a - * single level. The elements at index from - * objectStart up to objectStart + count in the - * objects array will be reordered into visual order assuming - * each run of text has the level indicated by the corresponding element in - * the levels array (at index - objectStart + levelStart). 
- * - * @param levels an array representing the bidi level of each object - * @param levelStart the start position in the levels array - * @param objects the array of objects to be reordered into visual order - * @param objectStart the start position in the objects array - * @param count the number of objects to reorder - * @stable ICU 3.8 - */ - public static void reorderVisually(byte[] levels, - int levelStart, - Object[] objects, - int objectStart, - int count) - { - // for backward compatibility - if (0 > levelStart || levels.length <= levelStart) { - throw new IllegalArgumentException("Value levelStart " + - levelStart + " is out of range 0 to " + - (levels.length-1)); - } - if (0 > objectStart || objects.length <= objectStart) { - throw new IllegalArgumentException("Value objectStart " + - levelStart + " is out of range 0 to " + - (objects.length-1)); - } - if (0 > count || objects.length < (objectStart+count)) { - throw new IllegalArgumentException("Value count " + - levelStart + " is out of range 0 to " + - (objects.length - objectStart)); - } - - byte[] reorderLevels = new byte[count]; - System.arraycopy(levels, levelStart, reorderLevels, 0, count); - int[] indexMap = reorderVisual(reorderLevels); - Object[] temp = new Object[count]; - System.arraycopy(objects, objectStart, temp, 0, count); - for (int i = 0; i < count; ++i) { - objects[objectStart + i] = temp[indexMap[i]]; - } - } - - /** - * Take a Bidi object containing the reordering - * information for a piece of text (one or more paragraphs) set by - * setPara() or for a line of text set by setLine() - * and return a string containing the reordered text. - * - *

The text may have been aliased (only a reference was stored - * without copying the contents), thus it must not have been modified - * since the setPara() call.

- * - * This method preserves the integrity of characters with multiple - * code units and (optionally) combining characters. - * Characters in RTL runs can be replaced by mirror-image characters - * in the returned string. Note that "real" mirroring has to be done in a - * rendering engine by glyph selection and that for many "mirrored" - * characters there are no Unicode characters as mirror-image equivalents. - * There are also options to insert or remove Bidi control - * characters; see the descriptions of the return value and the - * options parameter, and of the option bit flags. - * - * @param options A bit set of options for the reordering that control - * how the reordered text is written. - * The options include mirroring the characters on a code - * point basis and inserting LRM characters, which is used - * especially for transforming visually stored text - * to logically stored text (although this is still an - * imperfect implementation of an "inverse Bidi" algorithm - * because it uses the "forward Bidi" algorithm at its core). - * The available options are: - * DO_MIRRORING, - * INSERT_LRM_FOR_NUMERIC, - * KEEP_BASE_COMBINING, - * OUTPUT_REVERSE, - * REMOVE_BIDI_CONTROLS, - * STREAMING - * - * @return The reordered text. - * If the INSERT_LRM_FOR_NUMERIC option is set, then - * the length of the returned string could be as large as - * getLength()+2*countRuns().
- * If the REMOVE_BIDI_CONTROLS option is set, then the - * length of the returned string may be less than - * getLength().
- * If none of these options is set, then the length of the returned - * string will be exactly getProcessedLength(). - * - * @throws IllegalStateException if this call is not preceded by a successful - * call to setPara or setLine - * - * @see #DO_MIRRORING - * @see #INSERT_LRM_FOR_NUMERIC - * @see #KEEP_BASE_COMBINING - * @see #OUTPUT_REVERSE - * @see #REMOVE_BIDI_CONTROLS - * @see #OPTION_STREAMING - * @see #getProcessedLength - * @stable ICU 3.8 - */ - public String writeReordered(int options) - { - verifyValidParaOrLine(); - if (length == 0) { - /* nothing to do */ - return ""; - } - return BidiWriter.writeReordered(this, options); - } - - /** - * Display the bidi internal state, used in debugging. - */ - public String toString() { - StringBuilder buf = new StringBuilder(getClass().getName()); - - buf.append("[dir: "); - buf.append(direction); - buf.append(" baselevel: "); - buf.append(paraLevel); - buf.append(" length: "); - buf.append(length); - buf.append(" runs: "); - if (levels == null) { - buf.append("none"); - } else { - buf.append('['); - buf.append(levels[0]); - for (int i = 1; i < levels.length; i++) { - buf.append(' '); - buf.append(levels[i]); - } - buf.append(']'); - } - buf.append(" text: [0x"); - buf.append(Integer.toHexString(text[0])); - for (int i = 1; i < text.length; i++) { - buf.append(" 0x"); - buf.append(Integer.toHexString(text[i])); - } - buf.append("]]"); - - return buf.toString(); - } - - /** - * A class that provides access to constants defined by - * java.awt.font.TextAttribute without creating a static dependency. - */ - private static class TextAttributeConstants { - // Make sure to load the AWT's TextAttribute class before using the constants, if any. 
- static { - try { - Class.forName("java.awt.font.TextAttribute", true, null); - } catch (ClassNotFoundException e) {} - } - static final JavaAWTFontAccess jafa = SharedSecrets.getJavaAWTFontAccess(); - - /** - * TextAttribute instances (or a fake Attribute type if - * java.awt.font.TextAttribute is not present) - */ - static final AttributedCharacterIterator.Attribute RUN_DIRECTION = - getTextAttribute("RUN_DIRECTION"); - static final AttributedCharacterIterator.Attribute NUMERIC_SHAPING = - getTextAttribute("NUMERIC_SHAPING"); - static final AttributedCharacterIterator.Attribute BIDI_EMBEDDING = - getTextAttribute("BIDI_EMBEDDING"); - - /** - * TextAttribute.RUN_DIRECTION_LTR - */ - static final Boolean RUN_DIRECTION_LTR = (jafa == null) ? - Boolean.FALSE : (Boolean)jafa.getTextAttributeConstant("RUN_DIRECTION_LTR"); - - @SuppressWarnings("serial") - private static AttributedCharacterIterator.Attribute - getTextAttribute(String name) - { - if (jafa == null) { - // fake attribute - return new AttributedCharacterIterator.Attribute(name) { }; - } else { - return (AttributedCharacterIterator.Attribute)jafa.getTextAttributeConstant(name); - } - } - } - - /** - * A class that provides access to java.awt.font.NumericShaper without - * creating a static dependency. - */ - private static class NumericShapings { - // Make sure to load the AWT's NumericShaper class before calling shape, if any. - static { - try { - Class.forName("java.awt.font.NumericShaper", true, null); - } catch (ClassNotFoundException e) {} - } - static final JavaAWTFontAccess jafa = SharedSecrets.getJavaAWTFontAccess(); - - /** - * Invokes NumericShaping shape(text,start,count) method. 
- */ - static void shape(Object shaper, char[] text, int start, int count) { - if (jafa != null) { - jafa.shape(shaper, text, start, count); - } - } - } - -} --- /dev/null 2020-01-10 15:57:52.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/BidiBase.java 2020-01-10 15:57:52.000000000 -0800 @@ -0,0 +1,4780 @@ +/* + * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* +******************************************************************************* +* Copyright (C) 2001-2014, International Business Machines +* Corporation and others. All Rights Reserved. 
+******************************************************************************* +*/ + +/* FOOD FOR THOUGHT: currently the reordering modes are a mixture of + * algorithm for direct BiDi, algorithm for inverse Bidi and the bizarre + * concept of RUNS_ONLY which is a double operation. + * It could be advantageous to divide this into 3 concepts: + * a) Operation: direct / inverse / RUNS_ONLY + * b) Direct algorithm: default / NUMBERS_SPECIAL / GROUP_NUMBERS_WITH_L + * c) Inverse algorithm: default / INVERSE_LIKE_DIRECT / NUMBERS_SPECIAL + * This would allow combinations not possible today like RUNS_ONLY with + * NUMBERS_SPECIAL. + * Also allow to set INSERT_MARKS for the direct step of RUNS_ONLY and + * REMOVE_CONTROLS for the inverse step. + * Not all combinations would be supported, and probably not all do make sense. + * This would need to document which ones are supported and what are the + * fallbacks for unsupported combinations. + */ + +package jdk.internal.icu.text; + +import java.lang.reflect.Array; +import java.text.AttributedCharacterIterator; +import java.text.Bidi; +import java.util.Arrays; +import jdk.internal.access.JavaAWTFontAccess; +import jdk.internal.access.SharedSecrets; +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.impl.UBiDiProps; + +/** + * + *

Bidi algorithm for ICU

+ * + * This is an implementation of the Unicode Bidirectional Algorithm. The + * algorithm is defined in the Unicode Standard Annex #9. + *

+ * + * Note: Libraries that perform a bidirectional algorithm and reorder strings + * accordingly are sometimes called "Storage Layout Engines". ICU's Bidi and + * shaping (ArabicShaping) classes can be used at the core of such "Storage + * Layout Engines". + * + *

General remarks about the API:

+ * + * The "limit" of a sequence of characters is the position just after + * their last character, i.e., one more than that position. + *

+ * + * Some of the API methods provide access to "runs". Such a + * "run" is defined as a sequence of characters that are at the same + * embedding level after performing the Bidi algorithm. + * + *

Basic concept: paragraph

+ * A piece of text can be divided into several paragraphs by characters + * with the Bidi class Block Separator. For handling of + * paragraphs, see: + *
    + *
  • {@link #countParagraphs} + *
  • {@link #getParaLevel} + *
  • {@link #getParagraph} + *
  • {@link #getParagraphByIndex} + *
+ * + *

Basic concept: text direction

+ * The direction of a piece of text may be: + *
    + *
  • {@link #LTR} + *
  • {@link #RTL} + *
  • {@link #MIXED} + *
  • {@link #NEUTRAL} + *
+ * + *

Basic concept: levels

+ * + * Levels in this API represent embedding levels according to the Unicode + * Bidirectional Algorithm. + * Their low-order bit (even/odd value) indicates the visual direction.

+ * + * Levels can be abstract values when used for the + * paraLevel and embeddingLevels + * arguments of setPara(); there: + *

    + *
  • the high-order bit of an embeddingLevels[] + * value indicates whether the using application is + * specifying the level of a character to override whatever the + * Bidi implementation would resolve it to.
  • + *
  • paraLevel can be set to the + * pseudo-level values LEVEL_DEFAULT_LTR + * and LEVEL_DEFAULT_RTL.
  • + *
+ * + *

The related constants are not real, valid level values. + * DEFAULT_XXX can be used to specify + * a default for the paragraph level for + * when the setPara() method + * shall determine it but there is no + * strongly typed character in the input.

+ * + * Note that the value for LEVEL_DEFAULT_LTR is even + * and the one for LEVEL_DEFAULT_RTL is odd, + * just like with normal LTR and RTL level values - + * these special values are designed that way. Also, the implementation + * assumes that MAX_EXPLICIT_LEVEL is odd. + * + *

See Also: + *

    + *
  • {@link #LEVEL_DEFAULT_LTR} + *
  • {@link #LEVEL_DEFAULT_RTL} + *
  • {@link #LEVEL_OVERRIDE} + *
  • {@link #MAX_EXPLICIT_LEVEL} + *
  • {@link #setPara} + *
+ * + *

Basic concept: Reordering Mode

+ * Reordering mode values indicate which variant of the Bidi algorithm to + * use. + * + *

See Also: + *

    + *
  • {@link #setReorderingMode} + *
  • {@link #REORDER_DEFAULT} + *
  • {@link #REORDER_NUMBERS_SPECIAL} + *
  • {@link #REORDER_GROUP_NUMBERS_WITH_R} + *
  • {@link #REORDER_RUNS_ONLY} + *
  • {@link #REORDER_INVERSE_NUMBERS_AS_L} + *
  • {@link #REORDER_INVERSE_LIKE_DIRECT} + *
  • {@link #REORDER_INVERSE_FOR_NUMBERS_SPECIAL} + *
+ * + *

Basic concept: Reordering Options

+ * Reordering options can be applied during Bidi text transformations. + * + *

See Also: + *

    + *
  • {@link #setReorderingOptions} + *
  • {@link #OPTION_DEFAULT} + *
  • {@link #OPTION_INSERT_MARKS} + *
  • {@link #OPTION_REMOVE_CONTROLS} + *
  • {@link #OPTION_STREAMING} + *
+ * + * + * @author Simon Montagu, Matitiahu Allouche (ported from C code written by Markus W. Scherer) + * @stable ICU 3.8 + * + * + *

Sample code for the ICU Bidi API

+ * + *
Rendering a paragraph with the ICU Bidi API
+ * + * This is (hypothetical) sample code that illustrates how the ICU Bidi API + * could be used to render a paragraph of text. Rendering code depends highly on + * the graphics system, therefore this sample code must make a lot of + * assumptions, which may or may not match any existing graphics system's + * properties. + * + *

+ * The basic assumptions are: + *

+ *
    + *
  • Rendering is done from left to right on a horizontal line.
  • + *
  • A run of single-style, unidirectional text can be rendered at once. + *
  • + *
  • Such a run of text is passed to the graphics system with characters + * (code units) in logical order.
  • + *
  • The line-breaking algorithm is very complicated and Locale-dependent - + * and therefore its implementation is omitted from this sample code.
  • + *
+ * + *
{@code
+ *
+ *  package com.ibm.icu.dev.test.bidi;
+ *
+ *  import com.ibm.icu.text.Bidi;
+ *  import com.ibm.icu.text.BidiRun;
+ *
+ *  public class Sample {
+ *
+ *      static final int styleNormal = 0;
+ *      static final int styleSelected = 1;
+ *      static final int styleBold = 2;
+ *      static final int styleItalics = 4;
+ *      static final int styleSuper=8;
+ *      static final int styleSub = 16;
+ *
+ *      static class StyleRun {
+ *          int limit;
+ *          int style;
+ *
+ *          public StyleRun(int limit, int style) {
+ *              this.limit = limit;
+ *              this.style = style;
+ *          }
+ *      }
+ *
+ *      static class Bounds {
+ *          int start;
+ *          int limit;
+ *
+ *          public Bounds(int start, int limit) {
+ *              this.start = start;
+ *              this.limit = limit;
+ *          }
+ *      }
+ *
+ *      static int getTextWidth(String text, int start, int limit,
+ *                              StyleRun[] styleRuns, int styleRunCount) {
+ *          // simplistic way to compute the width
+ *          return limit - start;
+ *      }
+ *
+ *      // set limit and StyleRun limit for a line
+ *      // from text[start] and from styleRuns[styleRunStart]
+ *      // using Bidi.getLogicalRun(...)
+ *      // returns line width
+ *      static int getLineBreak(String text, Bounds line, Bidi para,
+ *                              StyleRun styleRuns[], Bounds styleRun) {
+ *          // dummy return
+ *          return 0;
+ *      }
+ *
+ *      // render runs on a line sequentially, always from left to right
+ *
+ *      // prepare rendering a new line
+ *      static void startLine(byte textDirection, int lineWidth) {
+ *          System.out.println();
+ *      }
+ *
+ *      // render a run of text and advance to the right by the run width
+ *      // the text[start..limit-1] is always in logical order
+ *      static void renderRun(String text, int start, int limit,
+ *                            byte textDirection, int style) {
+ *      }
+ *
+ *      // We could compute a cross-product
+ *      // from the style runs with the directional runs
+ *      // and then reorder it.
+ *      // Instead, here we iterate over each run type
+ *      // and render the intersections -
+ *      // with shortcuts in simple (and common) cases.
+ *      // renderParagraph() is the main function.
+ *
+ *      // render a directional run with
+ *      // (possibly) multiple style runs intersecting with it
+ *      static void renderDirectionalRun(String text, int start, int limit,
+ *                                       byte direction, StyleRun styleRuns[],
+ *                                       int styleRunCount) {
+ *          int i;
+ *
+ *          // iterate over style runs
+ *          if (direction == Bidi.LTR) {
+ *              int styleLimit;
+ *              for (i = 0; i < styleRunCount; ++i) {
+ *                  styleLimit = styleRuns[i].limit;
+ *                  if (start < styleLimit) {
+ *                      if (styleLimit > limit) {
+ *                          styleLimit = limit;
+ *                      }
+ *                      renderRun(text, start, styleLimit,
+ *                                direction, styleRuns[i].style);
+ *                      if (styleLimit == limit) {
+ *                          break;
+ *                      }
+ *                      start = styleLimit;
+ *                  }
+ *              }
+ *          } else {
+ *              int styleStart;
+ *
+ *              for (i = styleRunCount-1; i >= 0; --i) {
+ *                  if (i > 0) {
+ *                      styleStart = styleRuns[i-1].limit;
+ *                  } else {
+ *                      styleStart = 0;
+ *                  }
+ *                  if (limit >= styleStart) {
+ *                      if (styleStart < start) {
+ *                          styleStart = start;
+ *                      }
+ *                      renderRun(text, styleStart, limit, direction,
+ *                                styleRuns[i].style);
+ *                      if (styleStart == start) {
+ *                          break;
+ *                      }
+ *                      limit = styleStart;
+ *                  }
+ *              }
+ *          }
+ *      }
+ *
+ *      // the line object represents text[start..limit-1]
+ *      static void renderLine(Bidi line, String text, int start, int limit,
+ *                             StyleRun styleRuns[], int styleRunCount) {
+ *          byte direction = line.getDirection();
+ *          if (direction != Bidi.MIXED) {
+ *              // unidirectional
+ *              if (styleRunCount <= 1) {
+ *                  renderRun(text, start, limit, direction, styleRuns[0].style);
+ *              } else {
+ *                  renderDirectionalRun(text, start, limit, direction,
+ *                                       styleRuns, styleRunCount);
+ *              }
+ *          } else {
+ *              // mixed-directional
+ *              int count, i;
+ *              BidiRun run;
+ *
+ *              try {
+ *                  count = line.countRuns();
+ *              } catch (IllegalStateException e) {
+ *                  e.printStackTrace();
+ *                  return;
+ *              }
+ *              if (styleRunCount <= 1) {
+ *                  int style = styleRuns[0].style;
+ *
+ *                  // iterate over directional runs
+ *                  for (i = 0; i < count; ++i) {
+ *                      run = line.getVisualRun(i);
+ *                      renderRun(text, run.getStart(), run.getLimit(),
+ *                                run.getDirection(), style);
+ *                  }
+ *              } else {
+ *                  // iterate over both directional and style runs
+ *                  for (i = 0; i < count; ++i) {
+ *                      run = line.getVisualRun(i);
+ *                      renderDirectionalRun(text, run.getStart(),
+ *                                           run.getLimit(), run.getDirection(),
+ *                                           styleRuns, styleRunCount);
+ *                  }
+ *              }
+ *          }
+ *      }
+ *
+ *      static void renderParagraph(String text, byte textDirection,
+ *                                  StyleRun styleRuns[], int styleRunCount,
+ *                                  int lineWidth) {
+ *          int length = text.length();
+ *          Bidi para = new Bidi();
+ *          try {
+ *              para.setPara(text,
+ *                           textDirection != 0 ? Bidi.LEVEL_DEFAULT_RTL
+ *                                              : Bidi.LEVEL_DEFAULT_LTR,
+ *                           null);
+ *          } catch (Exception e) {
+ *              e.printStackTrace();
+ *              return;
+ *          }
+ *          byte paraLevel = (byte)(1 & para.getParaLevel());
+ *          StyleRun styleRun = new StyleRun(length, styleNormal);
+ *
+ *          if (styleRuns == null || styleRunCount <= 0) {
+ *              styleRuns = new StyleRun[1];
+ *              styleRunCount = 1;
+ *              styleRuns[0] = styleRun;
+ *          }
+ *          // assume styleRuns[styleRunCount-1].limit>=length
+ *
+ *          int width = getTextWidth(text, 0, length, styleRuns, styleRunCount);
+ *          if (width <= lineWidth) {
+ *              // everything fits onto one line
+ *
+ *              // prepare rendering a new line from either left or right
+ *              startLine(paraLevel, width);
+ *
+ *              renderLine(para, text, 0, length, styleRuns, styleRunCount);
+ *          } else {
+ *              // we need to render several lines
+ *              Bidi line = new Bidi(length, 0);
+ *              int start = 0, limit;
+ *              int styleRunStart = 0, styleRunLimit;
+ *
+ *              for (;;) {
+ *                  limit = length;
+ *                  styleRunLimit = styleRunCount;
+ *                  width = getLineBreak(text, new Bounds(start, limit),
+ *                                       para, styleRuns,
+ *                                       new Bounds(styleRunStart, styleRunLimit));
+ *                  try {
+ *                      line = para.setLine(start, limit);
+ *                  } catch (Exception e) {
+ *                      e.printStackTrace();
+ *                      return;
+ *                  }
+ *                  // prepare rendering a new line
+ *                  // from either left or right
+ *                  startLine(paraLevel, width);
+ *
+ *                  if (styleRunStart > 0) {
+ *                      int newRunCount = styleRuns.length - styleRunStart;
+ *                      StyleRun[] newRuns = new StyleRun[newRunCount];
+ *                      System.arraycopy(styleRuns, styleRunStart, newRuns, 0,
+ *                                       newRunCount);
+ *                      renderLine(line, text, start, limit, newRuns,
+ *                                 styleRunLimit - styleRunStart);
+ *                  } else {
+ *                      renderLine(line, text, start, limit, styleRuns,
+ *                                 styleRunLimit - styleRunStart);
+ *                  }
+ *                  if (limit == length) {
+ *                      break;
+ *                  }
+ *                  start = limit;
+ *                  styleRunStart = styleRunLimit - 1;
+ *                  if (start >= styleRuns[styleRunStart].limit) {
+ *                      ++styleRunStart;
+ *                  }
+ *              }
+ *          }
+ *      }
+ *
+ *      public static void main(String[] args)
+ *      {
+ *          renderParagraph("Some Latin text...", Bidi.LTR, null, 0, 80);
+ *          renderParagraph("Some Hebrew text...", Bidi.RTL, null, 0, 60);
+ *      }
+ *  }
+ *
+ * }
+ */ + +/* + * General implementation notes: + * + * Throughout the implementation, there are comments like (W2) that refer to + * rules of the BiDi algorithm, in this example to the second rule of the + * resolution of weak types. + * + * For handling surrogate pairs, where two UChar's form one "abstract" (or UTF-32) + * character according to UTF-16, the second UChar gets the directional property of + * the entire character assigned, while the first one gets a BN, a boundary + * neutral, type, which is ignored by most of the algorithm according to + * rule (X9) and the implementation suggestions of the BiDi algorithm. + * + * Later, adjustWSLevels() will set the level for each BN to that of the + * following character (UChar), which results in surrogate pairs getting the + * same level on each of their surrogates. + * + * In a UTF-8 implementation, the same thing could be done: the last byte of + * a multi-byte sequence would get the "real" property, while all previous + * bytes of that sequence would get BN. + * + * It is not possible to assign all those parts of a character the same real + * property because this would fail in the resolution of weak types with rules + * that look at immediately surrounding types. + * + * As a related topic, this implementation does not remove Boundary Neutral + * types from the input, but ignores them wherever this is relevant. + * For example, the loop for the resolution of the weak types reads + * types until it finds a non-BN. + * Also, explicit embedding codes are neither changed into BN nor removed. + * They are only treated the same way real BNs are. + * As stated before, adjustWSLevels() takes care of them at the end. + * For the purpose of conformance, the levels of all these codes + * do not matter. + * + * Note that this implementation modifies the dirProps + * after the initial setup, when applying X5c (replace FSI by LRI or RLI), + * X6, N0 (replace paired brackets by L or R). 
+ * + * In this implementation, the resolution of weak types (W1 to W6), + * neutrals (N1 and N2), and the assignment of the resolved level (In) + * are all done in one single loop, in resolveImplicitLevels(). + * Changes of dirProp values are done on the fly, without writing + * them back to the dirProps array. + * + * + * This implementation contains code that allows to bypass steps of the + * algorithm that are not needed on the specific paragraph + * in order to speed up the most common cases considerably, + * like text that is entirely LTR, or RTL text without numbers. + * + * Most of this is done by setting a bit for each directional property + * in a flags variable and later checking for whether there are + * any LTR characters or any RTL characters, or both, whether + * there are any explicit embedding codes, etc. + * + * If the (Xn) steps are performed, then the flags are re-evaluated, + * because they will then not contain the embedding codes any more + * and will be adjusted for override codes, so that subsequently + * more bypassing may be possible than what the initial flags suggested. + * + * If the text is not mixed-directional, then the + * algorithm steps for the weak type resolution are not performed, + * and all levels are set to the paragraph level. + * + * If there are no explicit embedding codes, then the (Xn) steps + * are not performed. + * + * If embedding levels are supplied as a parameter, then all + * explicit embedding codes are ignored, and the (Xn) steps + * are not performed. + * + * White Space types could get the level of the run they belong to, + * and are checked with a test of (flags&MASK_EMBEDDING) to + * consider if the paragraph direction should be considered in + * the flags variable. + * + * If there are no White Space types in the paragraph, then + * (L1) is not necessary in adjustWSLevels(). 
+ */ + +// Original filename in ICU4J: Bidi.java +public class BidiBase { + + static class Point { + int pos; /* position in text */ + int flag; /* flag for LRM/RLM, before/after */ + } + + static class InsertPoints { + int size; + int confirmed; + Point[] points = new Point[0]; + } + + static class Opening { + int position; /* position of opening bracket */ + int match; /* matching char or -position of closing bracket */ + int contextPos; /* position of last strong char found before opening */ + short flags; /* bits for L or R/AL found within the pair */ + byte contextDir; /* L or R according to last strong char before opening */ + } + + static class IsoRun { + int contextPos; /* position of char determining context */ + short start; /* index of first opening entry for this run */ + short limit; /* index after last opening entry for this run */ + byte level; /* level of this run */ + byte lastStrong; /* bidi class of last strong char found in this run */ + byte lastBase; /* bidi class of last base char found in this run */ + byte contextDir; /* L or R to use as context for following openings */ + } + + static class BracketData { + Opening[] openings = new Opening[SIMPLE_PARAS_COUNT]; + int isoRunLast; /* index of last used entry */ + /* array of nested isolated sequence entries; can never exceed UBIDI_MAX_EXPLICIT_LEVEL + + 1 for index 0, + 1 for before the first isolated sequence */ + IsoRun[] isoRuns = new IsoRun[MAX_EXPLICIT_LEVEL+2]; + boolean isNumbersSpecial; /*reordering mode for NUMBERS_SPECIAL */ + } + + static class Isolate { + int startON; + int start1; + short stateImp; + short state; + } + + /** Paragraph level setting

+ * + * Constant indicating that the base direction depends on the first strong + * directional character in the text according to the Unicode Bidirectional + * Algorithm. If no strong directional character is present, + * then set the paragraph level to 0 (left-to-right).

+ * + * If this value is used in conjunction with reordering modes + * REORDER_INVERSE_LIKE_DIRECT or + * REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder + * is assumed to be visual LTR, and the text after reordering is required + * to be the corresponding logical string with appropriate contextual + * direction. The direction of the result string will be RTL if either + * the rightmost or leftmost strong character of the source text is RTL + * or Arabic Letter, the direction will be LTR otherwise.

+ * + * If reordering option OPTION_INSERT_MARKS is set, an RLM may + * be added at the beginning of the result string to ensure round trip + * (that the result string, when reordered back to visual, will produce + * the original source text). + * @see #REORDER_INVERSE_LIKE_DIRECT + * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL + * @stable ICU 3.8 + */ + public static final byte LEVEL_DEFAULT_LTR = (byte)0x7e; + + /** Paragraph level setting

+ * + * Constant indicating that the base direction depends on the first strong + * directional character in the text according to the Unicode Bidirectional + * Algorithm. If no strong directional character is present, + * then set the paragraph level to 1 (right-to-left).

+ * + * If this value is used in conjunction with reordering modes + * REORDER_INVERSE_LIKE_DIRECT or + * REORDER_INVERSE_FOR_NUMBERS_SPECIAL, the text to reorder + * is assumed to be visual LTR, and the text after reordering is required + * to be the corresponding logical string with appropriate contextual + * direction. The direction of the result string will be RTL if either + * the rightmost or leftmost strong character of the source text is RTL + * or Arabic Letter, or if the text contains no strong character; + * the direction will be LTR otherwise.

+ * + * If reordering option OPTION_INSERT_MARKS is set, an RLM may + * be added at the beginning of the result string to ensure round trip + * (that the result string, when reordered back to visual, will produce + * the original source text). + * @see #REORDER_INVERSE_LIKE_DIRECT + * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL + * @stable ICU 3.8 + */ + public static final byte LEVEL_DEFAULT_RTL = (byte)0x7f; + + /** + * Maximum explicit embedding level. + * (The maximum resolved level can be up to MAX_EXPLICIT_LEVEL+1). + * @stable ICU 3.8 + */ + public static final byte MAX_EXPLICIT_LEVEL = 125; + + /** + * Bit flag for level input. + * Overrides directional properties. + * @stable ICU 3.8 + */ + public static final byte LEVEL_OVERRIDE = (byte)0x80; + + /** + * Special value which can be returned by the mapping methods when a + * logical index has no corresponding visual index or vice-versa. This may + * happen for the logical-to-visual mapping of a Bidi control when option + * OPTION_REMOVE_CONTROLS is + * specified. This can also happen for the visual-to-logical mapping of a + * Bidi mark (LRM or RLM) inserted by option + * OPTION_INSERT_MARKS. + * @see #getVisualIndex + * @see #getVisualMap + * @see #getLogicalIndex + * @see #getLogicalMap + * @see #OPTION_INSERT_MARKS + * @see #OPTION_REMOVE_CONTROLS + * @stable ICU 3.8 + */ + public static final int MAP_NOWHERE = -1; + + /** + * Left-to-right text. + *

    + *
  • As return value for getDirection(), it means + * that the source string contains no right-to-left characters, or + * that the source string is empty and the paragraph level is even. + *
  • As return value for getBaseDirection(), it + * means that the first strong character of the source string has + * a left-to-right direction. + *
+ * @stable ICU 3.8 + */ + public static final byte LTR = 0; + + /** + * Right-to-left text. + *
    + *
  • As return value for getDirection(), it means + * that the source string contains no left-to-right characters, or + * that the source string is empty and the paragraph level is odd. + *
  • As return value for getBaseDirection(), it + * means that the first strong character of the source string has + * a right-to-left direction. + *
+ * @stable ICU 3.8 + */ + public static final byte RTL = 1; + + /** + * Mixed-directional text. + *

As return value for getDirection(), it means + * that the source string contains both left-to-right and + * right-to-left characters. + * @stable ICU 3.8 + */ + public static final byte MIXED = 2; + + /** + * option bit for writeReordered(): + * keep combining characters after their base characters in RTL runs + * + * @see #writeReordered + * @stable ICU 3.8 + */ + public static final short KEEP_BASE_COMBINING = 1; + + /** + * option bit for writeReordered(): + * replace characters with the "mirrored" property in RTL runs + * by their mirror-image mappings + * + * @see #writeReordered + * @stable ICU 3.8 + */ + public static final short DO_MIRRORING = 2; + + /** + * option bit for writeReordered(): + * surround the run with LRMs if necessary; + * this is part of the approximate "inverse Bidi" algorithm + * + *

This option does not imply corresponding adjustment of the index + * mappings.

+ * + * @see #setInverse + * @see #writeReordered + * @stable ICU 3.8 + */ + public static final short INSERT_LRM_FOR_NUMERIC = 4; + + /** + * option bit for writeReordered(): + * remove Bidi control characters + * (this does not affect INSERT_LRM_FOR_NUMERIC) + * + *

This option does not imply corresponding adjustment of the index + * mappings.

+ * + * @see #writeReordered + * @see #INSERT_LRM_FOR_NUMERIC + * @stable ICU 3.8 + */ + public static final short REMOVE_BIDI_CONTROLS = 8; + + /** + * option bit for writeReordered(): + * write the output in reverse order + * + *

This has the same effect as calling writeReordered() + * first without this option, and then calling + * writeReverse() without mirroring. + * Doing this in the same step is faster and avoids a temporary buffer. + * An example for using this option is output to a character terminal that + * is designed for RTL scripts and stores text in reverse order.

+ * + * @see #writeReordered + * @stable ICU 3.8 + */ + public static final short OUTPUT_REVERSE = 16; + + /** Reordering mode: Regular Logical to Visual Bidi algorithm according to Unicode. + * @see #setReorderingMode + * @stable ICU 3.8 + */ + private static final short REORDER_DEFAULT = 0; + + /** Reordering mode: Logical to Visual algorithm which handles numbers in + * a way which mimics the behavior of Windows XP. + * @see #setReorderingMode + * @stable ICU 3.8 + */ + private static final short REORDER_NUMBERS_SPECIAL = 1; + + /** Reordering mode: Logical to Visual algorithm grouping numbers with + * adjacent R characters (reversible algorithm). + * @see #setReorderingMode + * @stable ICU 3.8 + */ + private static final short REORDER_GROUP_NUMBERS_WITH_R = 2; + + /** Reordering mode: Reorder runs only to transform a Logical LTR string + * to the logical RTL string with the same display, or vice-versa.
+ * If this mode is set together with option + * OPTION_INSERT_MARKS, some Bidi controls in the source + * text may be removed and other controls may be added to produce the + * minimum combination which has the required display. + * @see #OPTION_INSERT_MARKS + * @see #setReorderingMode + * @stable ICU 3.8 + */ + static final short REORDER_RUNS_ONLY = 3; + + /** Reordering mode: Visual to Logical algorithm which handles numbers + * like L (same algorithm as selected by setInverse(true)). + * @see #setInverse + * @see #setReorderingMode + * @stable ICU 3.8 + */ + static final short REORDER_INVERSE_NUMBERS_AS_L = 4; + + /** Reordering mode: Visual to Logical algorithm equivalent to the regular + * Logical to Visual algorithm. + * @see #setReorderingMode + * @stable ICU 3.8 + */ + static final short REORDER_INVERSE_LIKE_DIRECT = 5; + + /** Reordering mode: Inverse Bidi (Visual to Logical) algorithm for the + * REORDER_NUMBERS_SPECIAL Bidi algorithm. + * @see #setReorderingMode + * @stable ICU 3.8 + */ + static final short REORDER_INVERSE_FOR_NUMBERS_SPECIAL = 6; + + /* Reordering mode values must be ordered so that all the regular logical to + * visual modes come first, and all inverse Bidi modes come last. + */ + private static final short REORDER_LAST_LOGICAL_TO_VISUAL = + REORDER_NUMBERS_SPECIAL; + + /** + * Option bit for setReorderingOptions: + * insert Bidi marks (LRM or RLM) when needed to ensure correct result of + * a reordering to a Logical order + * + *

This option must be set or reset before calling + * setPara.

+ * + *

This option is significant only with reordering modes which generate + * a result with Logical order, specifically:

+ *
    + *
  • REORDER_RUNS_ONLY
  • + *
  • REORDER_INVERSE_NUMBERS_AS_L
  • + *
  • REORDER_INVERSE_LIKE_DIRECT
  • + *
  • REORDER_INVERSE_FOR_NUMBERS_SPECIAL
  • + *
+ * + *

If this option is set in conjunction with reordering mode + * REORDER_INVERSE_NUMBERS_AS_L or with calling + * setInverse(true), it implies option + * INSERT_LRM_FOR_NUMERIC in calls to method + * writeReordered().

+ * + *

For other reordering modes, a minimum number of LRM or RLM characters + * will be added to the source text after reordering it so as to ensure + * round trip, i.e. when applying the inverse reordering mode on the + * resulting logical text with removal of Bidi marks + * (option OPTION_REMOVE_CONTROLS set before calling + * setPara() or option + * REMOVE_BIDI_CONTROLS in + * writeReordered), the result will be identical to the + * source text in the first transformation. + * + *

This option will be ignored if specified together with option + * OPTION_REMOVE_CONTROLS. It inhibits option + * REMOVE_BIDI_CONTROLS in calls to method + * writeReordered() and it implies option + * INSERT_LRM_FOR_NUMERIC in calls to method + * writeReordered() if the reordering mode is + * REORDER_INVERSE_NUMBERS_AS_L.

+ * + * @see #setReorderingMode + * @see #setReorderingOptions + * @see #INSERT_LRM_FOR_NUMERIC + * @see #REMOVE_BIDI_CONTROLS + * @see #OPTION_REMOVE_CONTROLS + * @see #REORDER_RUNS_ONLY + * @see #REORDER_INVERSE_NUMBERS_AS_L + * @see #REORDER_INVERSE_LIKE_DIRECT + * @see #REORDER_INVERSE_FOR_NUMBERS_SPECIAL + * @stable ICU 3.8 + */ + static final int OPTION_INSERT_MARKS = 1; + + /** + * Option bit for setReorderingOptions: + * remove Bidi control characters + * + *

This option must be set or reset before calling + * setPara.

+ * + *

This option nullifies option + * OPTION_INSERT_MARKS. It inhibits option + * INSERT_LRM_FOR_NUMERIC in calls to method + * writeReordered() and it implies option + * REMOVE_BIDI_CONTROLS in calls to that method.

+ * + * @see #setReorderingMode + * @see #setReorderingOptions + * @see #OPTION_INSERT_MARKS + * @see #INSERT_LRM_FOR_NUMERIC + * @see #REMOVE_BIDI_CONTROLS + * @stable ICU 3.8 + */ + static final int OPTION_REMOVE_CONTROLS = 2; + + /** + * Option bit for setReorderingOptions: + * process the output as part of a stream to be continued + * + *

This option must be set or reset before calling + * setPara.

+ * + *

This option specifies that the caller is interested in processing + * a large text object in parts. The results of the successive calls are + * expected to be concatenated by the caller. Only the call for the last + * part will have this option bit off.

+ * + *

When this option bit is on, setPara() may process + * less than the full source text in order to truncate the text at a + * meaningful boundary. The caller should call + * getProcessedLength() immediately after calling + * setPara() in order to determine how much of the source + * text has been processed. Source text beyond that length should be + * resubmitted in following calls to setPara. The + * processed length may be less than the length of the source text if a + * character preceding the last character of the source text constitutes a + * reasonable boundary (like a block separator) for text to be continued.
+ * If the last character of the source text constitutes a reasonable + * boundary, the whole text will be processed at once.
+ * If nowhere in the source text there exists + * such a reasonable boundary, the processed length will be zero.
+ * The caller should check for such an occurrence and do one of the following: + *

  • submit a larger amount of text with a better chance to include + * a reasonable boundary.
  • + *
  • resubmit the same text after turning off option + * OPTION_STREAMING.
+ * In all cases, this option should be turned off before processing the last + * part of the text.

+ * + *

When the OPTION_STREAMING option is used, it is + * recommended to call orderParagraphsLTR(true) before calling + * setPara() so that later paragraphs may be concatenated to + * previous paragraphs on the right. + *

+ * + * @see #setReorderingMode + * @see #setReorderingOptions + * @see #getProcessedLength + * @stable ICU 3.8 + */ + private static final int OPTION_STREAMING = 4; + + /* + * Comparing the description of the Bidi algorithm with this implementation + * is easier with the same names for the Bidi types in the code as there. + * See UCharacterDirection + */ + /* private */ static final byte L = 0; + private static final byte R = 1; + private static final byte EN = 2; + private static final byte ES = 3; + private static final byte ET = 4; + private static final byte AN = 5; + private static final byte CS = 6; + static final byte B = 7; + private static final byte S = 8; + private static final byte WS = 9; + private static final byte ON = 10; + private static final byte LRE = 11; + private static final byte LRO = 12; + private static final byte AL = 13; + private static final byte RLE = 14; + private static final byte RLO = 15; + private static final byte PDF = 16; + private static final byte NSM = 17; + private static final byte BN = 18; + private static final byte FSI = 19; + private static final byte LRI = 20; + private static final byte RLI = 21; + private static final byte PDI = 22; + private static final byte ENL = PDI + 1; /* EN after W7 */ + private static final byte ENR = ENL + 1; /* EN not subject to W7 */ + + // Number of directional types (values L=0 through PDI=22; ENL and ENR above are not counted) + private static final int CHAR_DIRECTION_COUNT = 23; + + /** + * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + * Used in UAX #9: Unicode Bidirectional Algorithm + * (http://www.unicode.org/reports/tr9/) + * Returns UCharacter.BidiPairedBracketType values. + * @stable ICU 52 + */ + public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; + + /** + * Bidi Paired Bracket Type constants. + * + * @see UProperty#BIDI_PAIRED_BRACKET_TYPE + * @stable ICU 52 + */ + public static interface BidiPairedBracketType { + /** + * Not a paired bracket.
+ * @stable ICU 52 + */ + public static final int NONE = 0; + /** + * Open paired bracket. + * @stable ICU 52 + */ + public static final int OPEN = 1; + /** + * Close paired bracket. + * @stable ICU 52 + */ + public static final int CLOSE = 2; + /** + * Number of bracket type values defined above (NONE, OPEN, CLOSE). + * @stable ICU 52 + */ + public static final int COUNT = 3; + } + + /* number of paras entries allocated initially */ + static final int SIMPLE_PARAS_COUNT = 10; + + private static final char CR = '\r'; + private static final char LF = '\n'; + /* distinct single-bit values; NOTE(review): presumably the values stored in Point.flag -- confirm at the use sites */ + static final int LRM_BEFORE = 1; + static final int LRM_AFTER = 2; + static final int RLM_BEFORE = 4; + static final int RLM_AFTER = 8; + + /* flags for Opening.flags */ + static final byte FOUND_L = (byte)DirPropFlag(L); + static final byte FOUND_R = (byte)DirPropFlag(R); + + /* + * The following bit is used for the directional isolate status. + * Stack entries corresponding to isolate sequences are greater than ISOLATE. + */ + static final int ISOLATE = 0x0100; + + /* + * reference to parent paragraph object (reference to self if this object is + * a paragraph object); set to null in a newly opened object; set to a + * real value after a successful execution of setPara or setLine + */ + BidiBase paraBidi; + + final UBiDiProps bdp; + + /* character array representing the current text */ + char[] text; + + /* length of the current text */ + int originalLength; + + /* if the option OPTION_STREAMING is set, this is the length of + * text actually processed by setPara, which may be shorter + * than the original length. Otherwise, it is identical to the original + * length. + */ + public int length; + + /* if option OPTION_REMOVE_CONTROLS is set, and/or Bidi + * marks are allowed to be inserted in one of the reordering modes, the + * length of the result string may be different from the processed length.
+ */ + int resultLength; + + /* indicators for whether memory may be allocated after construction */ + boolean mayAllocateText; + boolean mayAllocateRuns; + + /* arrays with one value per text-character */ + byte[] dirPropsMemory = new byte[1]; + byte[] levelsMemory = new byte[1]; + byte[] dirProps; + byte[] levels; + + /* are we performing an approximation of the "inverse Bidi" algorithm? */ + boolean isInverse; + + /* are we using the basic algorithm or its variation? */ + int reorderingMode; + + /* bitmask for reordering options */ + int reorderingOptions; + + /* must block separators receive level 0? */ + boolean orderParagraphsLTR; + + /* the paragraph level */ + byte paraLevel; + + /* original paraLevel when contextual */ + /* must be one of DEFAULT_xxx or 0 if not contextual */ + byte defaultParaLevel; + + /* the following is set in setPara, used in processPropertySeq */ + + ImpTabPair impTabPair; /* reference to levels state table pair */ + + /* the overall paragraph or line directionality*/ + byte direction; + + /* flags is a bit set for which directional properties are in the text */ + int flags; + + /* lastArabicPos is index to the last AL in the text, -1 if none */ + int lastArabicPos; + + /* characters after trailingWSStart are WS and are */ + /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */ + int trailingWSStart; + + /* fields for paragraph handling, set in getDirProps() */ + int paraCount; + int[] paras_limit = new int[SIMPLE_PARAS_COUNT]; + byte[] paras_level = new byte[SIMPLE_PARAS_COUNT]; + + /* fields for line reordering */ + int runCount; /* ==-1: runs not set up yet */ + BidiRun[] runsMemory = new BidiRun[0]; + BidiRun[] runs; + + /* for non-mixed text, we only need a tiny array of runs (no allocation) */ + BidiRun[] simpleRuns = {new BidiRun()}; + + /* fields for managing isolate sequences */ + Isolate[] isolates; + + /* maximum or current nesting depth of isolate sequences */ + /* Within resolveExplicitLevels() and 
checkExplicitLevels(), this is the maximal + nesting encountered. + Within resolveImplicitLevels(), this is the index of the current isolates + stack entry. */ + int isolateCount; + + /* mapping of runs in logical order to visual order */ + int[] logicalToVisualRunsMap; + /* flag to indicate that the map has been updated */ + boolean isGoodLogicalToVisualRunsMap; + + /* for inverse Bidi with insertion of directional marks */ + InsertPoints insertPoints = new InsertPoints(); + + /* for option OPTION_REMOVE_CONTROLS */ + int controlCount; + + /* + * Sometimes, bit values are more appropriate + * to deal with directionality properties. + * Abbreviations in these method names refer to names + * used in the Bidi algorithm. + * DirPropFlag(dir) returns the int that has only bit number dir set (1 << dir). + */ + static int DirPropFlag(byte dir) { + return (1 << dir); + } + /* true if the bit for the property stored in dirProps[index] is set in flag */ + boolean testDirPropFlagAt(int flag, int index) { + return ((DirPropFlag(dirProps[index]) & flag) != 0); + } + /* bit 31, above every Bidi type value defined in this class; NOTE(review): presumably a marker rather than a real type -- confirm at the use sites */ + static final int DirPropFlagMultiRuns = DirPropFlag((byte)31); + + /* to avoid some conditional statements, use tiny constant arrays */ + static final int DirPropFlagLR[] = { DirPropFlag(L), DirPropFlag(R) }; + static final int DirPropFlagE[] = { DirPropFlag(LRE), DirPropFlag(RLE) }; + static final int DirPropFlagO[] = { DirPropFlag(LRO), DirPropFlag(RLO) }; + /* the three accessors below index those arrays with the low bit of level; DirFromStrong maps L to L and any other value to R; NoOverride clears the LEVEL_OVERRIDE bit */ + static final int DirPropFlagLR(byte level) { return DirPropFlagLR[level & 1]; } + static final int DirPropFlagE(byte level) { return DirPropFlagE[level & 1]; } + static final int DirPropFlagO(byte level) { return DirPropFlagO[level & 1]; } + static final byte DirFromStrong(byte strong) { return strong == L ? L : R; } + static final byte NoOverride(byte level) { return (byte)(level & ~LEVEL_OVERRIDE); } + + /* are there any characters that are LTR or RTL?
*/ + static final int MASK_LTR = + DirPropFlag(L)|DirPropFlag(EN)|DirPropFlag(ENL)|DirPropFlag(ENR)|DirPropFlag(AN)|DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(LRI); + static final int MASK_RTL = DirPropFlag(R)|DirPropFlag(AL)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(RLI); + + static final int MASK_R_AL = DirPropFlag(R)|DirPropFlag(AL); + + /* explicit embedding codes */ + private static final int MASK_EXPLICIT = DirPropFlag(LRE)|DirPropFlag(LRO)|DirPropFlag(RLE)|DirPropFlag(RLO)|DirPropFlag(PDF); + private static final int MASK_BN_EXPLICIT = DirPropFlag(BN)|MASK_EXPLICIT; + + /* explicit isolate codes */ + private static final int MASK_ISO = DirPropFlag(LRI)|DirPropFlag(RLI)|DirPropFlag(FSI)|DirPropFlag(PDI); + + /* paragraph and segment separators */ + private static final int MASK_B_S = DirPropFlag(B)|DirPropFlag(S); + + /* all types that are counted as White Space or Neutral in some steps */ + static final int MASK_WS = MASK_B_S|DirPropFlag(WS)|MASK_BN_EXPLICIT|MASK_ISO; + + /* types that are neutrals or could become neutrals in (Wn) */ + private static final int MASK_POSSIBLE_N = DirPropFlag(ON)|DirPropFlag(CS)|DirPropFlag(ES)|DirPropFlag(ET)|MASK_WS; + + /* + * These types may be changed to "e", + * the embedding type (L or R) of the run, + * in the Bidi algorithm (N2) + */ + private static final int MASK_EMBEDDING = DirPropFlag(NSM)|MASK_POSSIBLE_N; + + /* + * the dirProp's L and R are defined to 0 and 1 values in UCharacterDirection.java, + * so the low bit of a level is returned as the matching L or R value + */ + private static byte GetLRFromLevel(byte level) + { + return (byte)(level & 1); + } + /* true if every bit of LEVEL_DEFAULT_LTR (0x7e) is set in level, which holds for both LEVEL_DEFAULT_LTR and LEVEL_DEFAULT_RTL (0x7f) */ + private static boolean IsDefaultLevel(byte level) + { + return ((level & LEVEL_DEFAULT_LTR) == LEVEL_DEFAULT_LTR); + } + + static boolean IsBidiControlChar(int c) + { + /* check for range 0x200c to 0x200f (ZWNJ, ZWJ, LRM, RLM), + 0x202a to 0x202e (LRE, RLE, PDF, LRO, RLO) or + 0x2066 to 0x2069 (LRI, RLI, FSI, PDI) */ + return (((c & 0xfffffffc) == 0x200c) || ((c >= 0x202a) && (c <= 0x202e)) + || ((c >= 0x2066) && (c <= 0x2069))); + } + /* throws IllegalStateException unless this object is its own paraBidi */ + void verifyValidPara() + {
+ if (!(this == this.paraBidi)) { + throw new IllegalStateException(); + } + } + /* accepts an object that is its own paraBidi, or one whose non-null paraBidi is its own paraBidi; otherwise throws IllegalStateException */ + void verifyValidParaOrLine() + { + BidiBase para = this.paraBidi; + /* verify Para */ + if (this == para) { + return; + } + /* verify Line */ + if ((para == null) || (para != para.paraBidi)) { + throw new IllegalStateException(); + } + } + /* throws IllegalArgumentException unless start <= index < limit */ + void verifyRange(int index, int start, int limit) + { + if (index < start || index >= limit) { + throw new IllegalArgumentException("Value " + index + + " is out of range " + start + " to " + limit); + } + } + + /** + * Allocate a Bidi object with preallocated memory + * for internal structures. + * This method provides a Bidi object like the default constructor + * but it also preallocates memory for internal structures + * according to the sizings supplied by the caller.

+ * The preallocation can be limited to some of the internal memory + * by setting some values to 0 here. That means that if, e.g., + * maxRunCount cannot be reasonably predetermined and should not + * be set to maxLength (the only failproof value) to avoid + * wasting memory, then maxRunCount could be set to 0 here + * and the internal structures that are associated with it will be allocated + * on demand, just like with the default constructor. + * + * @param maxLength is the maximum text or line length that internal memory + * will be preallocated for. An attempt to associate this object with a + * longer text will fail, unless this value is 0, which leaves the allocation + * up to the implementation. + * + * @param maxRunCount is the maximum anticipated number of same-level runs + * that internal memory will be preallocated for. An attempt to access + * visual runs on an object that was not preallocated for as many runs + * as the text was actually resolved to will fail, + * unless this value is 0, which leaves the allocation up to the implementation.

+ * The number of runs depends on the actual text and may be anywhere between + * 1 and maxLength. It is typically small. + * + * @throws IllegalArgumentException if maxLength or maxRunCount is less than 0 + * @stable ICU 3.8 + */ + public BidiBase(int maxLength, int maxRunCount) + { + /* check the argument values */ + if (maxLength < 0 || maxRunCount < 0) { + throw new IllegalArgumentException(); + } + + /* reset the object, all reference variables null, all flags false, + all sizes 0. + In fact, we don't need to do anything, since class members are + initialized as zero when an instance is created. + */ + /* + mayAllocateText = false; + mayAllocateRuns = false; + orderParagraphsLTR = false; + paraCount = 0; + runCount = 0; + trailingWSStart = 0; + flags = 0; + paraLevel = 0; + defaultParaLevel = 0; + direction = 0; + */ + /* get Bidi properties */ + bdp = UBiDiProps.INSTANCE; + + /* allocate memory for arrays as requested; a size of 0 skips the + preallocation and sets the matching mayAllocate flag instead */ + if (maxLength > 0) { + getInitialDirPropsMemory(maxLength); + getInitialLevelsMemory(maxLength); + } else { + mayAllocateText = true; + } + + if (maxRunCount > 0) { + // if maxRunCount == 1, use simpleRuns[] + if (maxRunCount > 1) { + getInitialRunsMemory(maxRunCount); + } + } else { + mayAllocateRuns = true; + } + } + + /* + * We are allowed to allocate memory if object==null or + * mayAllocate==true for each array that we need. + * + * Assume sizeNeeded>0. + * If object != null, then assume size > 0.
+ */ + private Object getMemory(String label, Object array, Class arrayClass, + boolean mayAllocate, int sizeNeeded) + { + int len = Array.getLength(array); + + /* the existing array has exactly the needed size: reuse it */ + if (sizeNeeded == len) { + return array; + } + if (!mayAllocate) { + /* we must not allocate: reuse the array if it is large enough, otherwise fail */ + if (sizeNeeded <= len) { + return array; + } + throw new OutOfMemoryError("Failed to allocate memory for " + + label); + } + /* we may try to grow or shrink; the new array is returned as created, + the contents of the old array are not copied into it */ + /* FOOD FOR THOUGHT: when shrinking it should be possible to avoid + the allocation altogether and rely on this.length */ + /* NOTE(review): OutOfMemoryError is not an Exception, so the catch below only + converts exceptions thrown by Array.newInstance, and their cause is dropped */ + try { + return Array.newInstance(arrayClass, sizeNeeded); + } catch (Exception e) { + throw new OutOfMemoryError("Failed to allocate memory for " + + label); + } + } + + /* helper methods for each allocated array: each passes its backing array + through getMemory and stores the returned array back in the field */ + private void getDirPropsMemory(boolean mayAllocate, int len) + { + Object array = getMemory("DirProps", dirPropsMemory, Byte.TYPE, mayAllocate, len); + dirPropsMemory = (byte[]) array; + } + + void getDirPropsMemory(int len) + { + getDirPropsMemory(mayAllocateText, len); + } + + private void getLevelsMemory(boolean mayAllocate, int len) + { + Object array = getMemory("Levels", levelsMemory, Byte.TYPE, mayAllocate, len); + levelsMemory = (byte[]) array; + } + + void getLevelsMemory(int len) + { + getLevelsMemory(mayAllocateText, len); + } + + private void getRunsMemory(boolean mayAllocate, int len) + { + Object array = getMemory("Runs", runsMemory, BidiRun.class, mayAllocate, len); + runsMemory = (BidiRun[]) array; + } + + void getRunsMemory(int len) + { + getRunsMemory(mayAllocateRuns, len); + } + + /* additional methods used by constructor - always allow allocation */ + private void getInitialDirPropsMemory(int len) + { + getDirPropsMemory(true, len); + } + + private void getInitialLevelsMemory(int len) + { + getLevelsMemory(true, len); + } + + private void getInitialRunsMemory(int len) + { + getRunsMemory(true, len); + } + + /** + * Is this Bidi object
set to perform the inverse Bidi + * algorithm? + *

Note: calling this method after setting the reordering mode with + * setReorderingMode will return true if the + * reordering mode was set to + * REORDER_INVERSE_NUMBERS_AS_L, false + * for all other values.

+ * + * @return true if the Bidi object is set to + * perform the inverse Bidi algorithm by handling numbers as L. + * + * @see #setInverse + * @see #setReorderingMode + * @see #REORDER_INVERSE_NUMBERS_AS_L + * @stable ICU 3.8 + */ + public boolean isInverse() { + return isInverse; + } + + /* perform (P2)..(P3) ------------------------------------------------------- */ + + /* + * Check that there are enough entries in the arrays paras_limit and paras_level + * for paraCount paragraphs. If there are not, both arrays are replaced by new + * ones with twice paraCount entries and the old entries are copied over. + */ + private void checkParaCount() { + int[] saveLimits; + byte[] saveLevels; + int count = paraCount; + if (count <= paras_level.length) + return; + int oldLength = paras_level.length; + saveLimits = paras_limit; + saveLevels = paras_level; + try { + paras_limit = new int[count * 2]; + paras_level = new byte[count * 2]; + } catch (Exception e) { + throw new OutOfMemoryError("Failed to allocate memory for paras"); + } + System.arraycopy(saveLimits, 0, paras_limit, 0, oldLength); + System.arraycopy(saveLevels, 0, paras_level, 0, oldLength); + } + + /* + * Get the directional properties for the text, calculate the flags bit-set, and + * determine the paragraph level if necessary (in paras_level[i]). + * FSI initiators are also resolved and their dirProp replaced with LRI or RLI. + * When encountering an FSI, it is initially replaced with an LRI, which is the + * default. Only if a strong R or AL is found within its scope will the LRI be + * replaced by an RLI.
+ */ + static final int NOT_SEEKING_STRONG = 0; /* 0: not contextual paraLevel, not after FSI */ + static final int SEEKING_STRONG_FOR_PARA = 1; /* 1: looking for first strong char in para */ + static final int SEEKING_STRONG_FOR_FSI = 2; /* 2: looking for first strong after FSI */ + static final int LOOKING_FOR_PDI = 3; /* 3: found strong after FSI, looking for PDI */ + + private void getDirProps() + { + int i = 0, i0, i1; + flags = 0; /* collect all directionalities in the text */ + int uchar; + byte dirProp; + byte defaultParaLevel = 0; /* local that shadows the field of the same name (the field is not modified here); initialized to avoid compiler warnings */ + boolean isDefaultLevel = IsDefaultLevel(paraLevel); + /* for inverse Bidi, the default para level is set to RTL if there is a + strong R or AL character at either end of the text */ + boolean isDefaultLevelInverse=isDefaultLevel && + (reorderingMode == REORDER_INVERSE_LIKE_DIRECT || + reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL); + lastArabicPos = -1; + int controlCount = 0; + boolean removeBidiControls = (reorderingOptions & OPTION_REMOVE_CONTROLS) != 0; + + byte state; + byte lastStrong = ON; /* for default level & inverse Bidi */ + /* The following stacks are used to manage isolate sequences. Those + sequences may be nested, but obviously never more deeply than the + maximum explicit embedding level. + stackLast is the index of the last used entry in the stack. A value of -1 + means that there is no open isolate sequence. + stackLast is reset to -1 on paragraph boundaries.
*/ + /* The following stack contains the position of the initiator of + each open isolate sequence */ + int[] isolateStartStack= new int[MAX_EXPLICIT_LEVEL+1]; + /* The following stack contains the last known state before + encountering the initiator of an isolate sequence */ + byte[] previousStateStack = new byte[MAX_EXPLICIT_LEVEL+1]; + int stackLast=-1; + + if ((reorderingOptions & OPTION_STREAMING) != 0) + length = 0; + defaultParaLevel = (byte)(paraLevel & 1); + + /* NOTE(review): the writes to paras_level[paraCount - 1] below assume paraCount >= 1 on entry -- confirm in the caller */ + if (isDefaultLevel) { + paras_level[0] = defaultParaLevel; + lastStrong = defaultParaLevel; + state = SEEKING_STRONG_FOR_PARA; + } else { + paras_level[0] = paraLevel; + state = NOT_SEEKING_STRONG; + } + /* count paragraphs and determine the paragraph level (P2..P3) */ + /* + * see comment on constant fields: + * the LEVEL_DEFAULT_XXX values are designed so that + * their low-order bit alone yields the intended default + */ + + for (i = 0; i < originalLength; /* i is incremented in the loop */) { + i0 = i; /* index of first code unit */ + uchar = UTF16.charAt(text, 0, originalLength, i); + i += UTF16.getCharCount(uchar); + i1 = i - 1; /* index of last code unit, gets the directional property */ + + dirProp = (byte)getCustomizedClass(uchar); + flags |= DirPropFlag(dirProp); + dirProps[i1] = dirProp; + if (i1 > i0) { /* set previous code units' properties to BN */ + flags |= DirPropFlag(BN); + do { + dirProps[--i1] = BN; + } while (i1 > i0); + } + if (removeBidiControls && IsBidiControlChar(uchar)) { + controlCount++; + } + if (dirProp == L) { + if (state == SEEKING_STRONG_FOR_PARA) { + paras_level[paraCount - 1] = 0; + state = NOT_SEEKING_STRONG; + } + else if (state == SEEKING_STRONG_FOR_FSI) { + if (stackLast <= MAX_EXPLICIT_LEVEL) { + /* no need for next statement, already set by default */ + /* dirProps[isolateStartStack[stackLast]] = LRI; */ + flags |= DirPropFlag(LRI); + } + state = LOOKING_FOR_PDI; + } + lastStrong = L; + continue; + } + if (dirProp == R || dirProp == AL) { + if (state ==
SEEKING_STRONG_FOR_PARA) { + paras_level[paraCount - 1] = 1; + state = NOT_SEEKING_STRONG; + } + else if (state == SEEKING_STRONG_FOR_FSI) { + if (stackLast <= MAX_EXPLICIT_LEVEL) { + dirProps[isolateStartStack[stackLast]] = RLI; + flags |= DirPropFlag(RLI); + } + state = LOOKING_FOR_PDI; + } + lastStrong = R; + if (dirProp == AL) + lastArabicPos = i - 1; + continue; + } + if (dirProp >= FSI && dirProp <= RLI) { /* FSI, LRI or RLI */ + stackLast++; + if (stackLast <= MAX_EXPLICIT_LEVEL) { + isolateStartStack[stackLast] = i - 1; + previousStateStack[stackLast] = state; + } + if (dirProp == FSI) { + dirProps[i-1] = LRI; /* default if no strong char */ + state = SEEKING_STRONG_FOR_FSI; + } + else + state = LOOKING_FOR_PDI; + continue; + } + if (dirProp == PDI) { + if (state == SEEKING_STRONG_FOR_FSI) { + if (stackLast <= MAX_EXPLICIT_LEVEL) { + /* no need for next statement, already set by default */ + /* dirProps[isolateStartStack[stackLast]] = LRI; */ + flags |= DirPropFlag(LRI); + } + } + if (stackLast >= 0) { + if (stackLast <= MAX_EXPLICIT_LEVEL) + state = previousStateStack[stackLast]; + stackLast--; + } + continue; + } + if (dirProp == B) { + if (i < originalLength && uchar == CR && text[i] == LF) /* do nothing on the CR */ + continue; + paras_limit[paraCount - 1] = i; + if (isDefaultLevelInverse && lastStrong == R) + paras_level[paraCount - 1] = 1; + if ((reorderingOptions & OPTION_STREAMING) != 0) { + /* When streaming, we only process whole paragraphs + thus some updates are only done on paragraph boundaries */ + length = i; /* i is index to next character */ + this.controlCount = controlCount; + } + if (i < originalLength) { /* B not last char in text */ + paraCount++; + checkParaCount(); /* check that there is enough memory for a new para entry */ + if (isDefaultLevel) { + paras_level[paraCount - 1] = defaultParaLevel; + state = SEEKING_STRONG_FOR_PARA; + lastStrong = defaultParaLevel; + } else { + paras_level[paraCount - 1] = paraLevel; + state =
NOT_SEEKING_STRONG; + } + stackLast = -1; + } + continue; + } + } + /* Ignore still open isolate sequences with overflow */ + if (stackLast > MAX_EXPLICIT_LEVEL) { + stackLast = MAX_EXPLICIT_LEVEL; + state=SEEKING_STRONG_FOR_FSI; /* to be on the safe side */ + } + /* Resolve direction of still unresolved open FSI sequences */ + while (stackLast >= 0) { + if (state == SEEKING_STRONG_FOR_FSI) { + /* no need for next statement, already set by default */ + /* dirProps[isolateStartStack[stackLast]] = LRI; */ + flags |= DirPropFlag(LRI); + break; + } + state = previousStateStack[stackLast]; + stackLast--; + } + /* When streaming, ignore text after the last paragraph separator */ + if ((reorderingOptions & OPTION_STREAMING) != 0) { + if (length < originalLength) + paraCount--; + } else { + paras_limit[paraCount - 1] = originalLength; + this.controlCount = controlCount; + } + /* For inverse bidi, default para direction is RTL if there is + a strong R or AL at either end of the paragraph */ + if (isDefaultLevelInverse && lastStrong == R) { + paras_level[paraCount - 1] = 1; + } + if (isDefaultLevel) { + paraLevel = paras_level[0]; + } + /* The following is needed to resolve the text direction for default level + paragraphs containing no strong character */ + for (i = 0; i < paraCount; i++) + flags |= DirPropFlagLR(paras_level[i]); + + if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) { + flags |= DirPropFlag(L); + } + } + + /* determine the paragraph level at position pindex: paraLevel when defaultParaLevel + is 0 or pindex lies before paras_limit[0]; otherwise the level of the first + paragraph whose limit is beyond pindex, or of the last paragraph if there is none */ + byte GetParaLevelAt(int pindex) + { + if (defaultParaLevel == 0 || pindex < paras_limit[0]) + return paraLevel; + int i; + for (i = 1; i < paraCount; i++) + if (pindex < paras_limit[i]) + break; + if (i >= paraCount) + i = paraCount - 1; + return paras_level[i]; + } + + /* Functions for handling paired brackets ----------------------------------- */ + + /* In the isoRuns array, the first entry is used for text outside of any + isolate sequence.
Higher entries are used for each more deeply nested
+       isolate sequence. isoRunLast is the index of the last used entry.  The
+       openings array is used to note the data of opening brackets not yet
+       matched by a closing bracket, or matched but still susceptible to change
+       level.
+       Each isoRun entry contains the index of the first and
+       one-after-last openings entries for pending opening brackets it
+       contains.  The next openings entry to use is the one-after-last of the
+       most deeply nested isoRun entry.
+       isoRun entries also contain their current embedding level and the last
+       encountered strong character, since these will be needed to resolve
+       the level of paired brackets.  */
+
+    /* Puts bd in its initial state: a single isolate run (entry 0) with no
+       pending openings, seeded with the level returned by GetParaLevelAt(0),
+       and a fresh openings array of SIMPLE_PARAS_COUNT entries. */
+    private void bracketInit(BracketData bd) {
+        bd.isoRunLast = 0;
+        bd.isoRuns[0] = new IsoRun();
+        bd.isoRuns[0].start = 0;
+        bd.isoRuns[0].limit = 0;
+        bd.isoRuns[0].level = GetParaLevelAt(0);
+        bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(GetParaLevelAt(0) & 1);
+        bd.isoRuns[0].contextPos = 0;
+        bd.openings = new Opening[SIMPLE_PARAS_COUNT];
+        bd.isNumbersSpecial = reorderingMode == REORDER_NUMBERS_SPECIAL ||
+                              reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL;
+    }
+
+    /* paragraph boundary:
+       drops back to isolate run 0, forgets its pending openings (limit = 0)
+       and re-seeds it with the given level; the openings array is kept */
+    private void bracketProcessB(BracketData bd, byte level) {
+        bd.isoRunLast = 0;
+        bd.isoRuns[0].limit = 0;
+        bd.isoRuns[0].level = level;
+        bd.isoRuns[0].lastStrong = bd.isoRuns[0].lastBase = bd.isoRuns[0].contextDir = (byte)(level & 1);
+        bd.isoRuns[0].contextPos = 0;
+    }
+
+    /* LRE, LRO, RLE, RLO, PDF:
+       called when the embedding level changes. Does nothing when the control
+       code at lastCcPos is an isolate (MASK_ISO). Otherwise forgets the
+       pending openings of the current isolate run (limit = start) and seeds
+       its context direction with the parity of embeddingLevel when that level
+       is the higher one (override bit ignored), else of contextLevel. */
+    private void bracketProcessBoundary(BracketData bd, int lastCcPos,
+                                        byte contextLevel, byte embeddingLevel) {
+        IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        if ((DirPropFlag(dirProps[lastCcPos]) & MASK_ISO) != 0) /* after an isolate */
+            return;
+        if (NoOverride(embeddingLevel) > NoOverride(contextLevel))  /* not a PDF */
+            contextLevel = embeddingLevel;
+        pLastIsoRun.limit = pLastIsoRun.start;
+        pLastIsoRun.level = embeddingLevel;
+        pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(contextLevel & 1);
+        pLastIsoRun.contextPos = lastCcPos;
+    }
+
+    /* LRI or RLI:
+       pushes a new isolate run whose openings start where those of the
+       enclosing run currently end; the IsoRun object is created on first use
+       of that slot and reused afterwards */
+    private void bracketProcessLRI_RLI(BracketData bd, byte level) {
+        IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        short lastLimit;
+        pLastIsoRun.lastBase = ON;
+        lastLimit = pLastIsoRun.limit;
+        bd.isoRunLast++;
+        pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        if (pLastIsoRun == null)
+            pLastIsoRun = bd.isoRuns[bd.isoRunLast] = new IsoRun();
+        pLastIsoRun.start = pLastIsoRun.limit = lastLimit;
+        pLastIsoRun.level = level;
+        pLastIsoRun.lastStrong = pLastIsoRun.lastBase = pLastIsoRun.contextDir = (byte)(level & 1);
+        pLastIsoRun.contextPos = 0;
+    }
+
+    /* PDI:
+       pops the innermost isolate run; there is no underflow check here, the
+       visible caller only invokes this for a PDI matching a valid isolate */
+    private void bracketProcessPDI(BracketData bd) {
+        IsoRun pLastIsoRun;
+        bd.isoRunLast--;
+        pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        pLastIsoRun.lastBase = ON;
+    }
+
+    /* newly found opening bracket: create an openings entry
+       match is the closing character to look for, position the text index of
+       the opening bracket. The openings array is doubled when it is full and
+       Opening objects are created lazily and reused. */
+    private void bracketAddOpening(BracketData bd, char match, int position) {
+        IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        Opening pOpening;
+        if (pLastIsoRun.limit >= bd.openings.length) {  /* no available new entry */
+            Opening[] saveOpenings = bd.openings;
+            int count;
+            /* NOTE(review): a failed allocation raises OutOfMemoryError, which
+               is an Error and is not caught by catch (Exception); this handler
+               is only reached for exceptions such as NegativeArraySizeException
+               (count * 2 overflowing). */
+            try {
+                count = bd.openings.length;
+                bd.openings = new Opening[count * 2];
+            } catch (Exception e) {
+                throw new OutOfMemoryError("Failed to allocate memory for openings");
+            }
+            System.arraycopy(saveOpenings, 0, bd.openings, 0, count);
+        }
+        pOpening = bd.openings[pLastIsoRun.limit];
+        if (pOpening == null)
+            pOpening = bd.openings[pLastIsoRun.limit]= new Opening();
+        pOpening.position = position;
+        pOpening.match = match;
+        pOpening.contextDir = pLastIsoRun.contextDir;
+        pOpening.contextPos = pLastIsoRun.contextPos;
+        pOpening.flags = 0;
+        pLastIsoRun.limit++;
+    }
+
+    /* change N0c1 to N0c2 when a preceding bracket is assigned the embedding level
+       Only openings after openingIndex in the current isolate run are looked
+       at, and only those whose match field is negative, i.e. pairs recorded
+       as unstable by bracketProcessClosing() (match = -closing position). */
+    private void fixN0c(BracketData bd, int openingIndex, int newPropPosition, byte newProp) {
+        /* This function calls itself recursively */
+        IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        Opening qOpening;
+        int k, openingPosition, closingPosition;
+        for (k = openingIndex+1; k < pLastIsoRun.limit; k++) {
+            qOpening = bd.openings[k];
+            if (qOpening.match >= 0)    /* not an N0c match */
+                continue;
+            if (newPropPosition < qOpening.contextPos)
+                break;
+            if (newPropPosition >= qOpening.position)
+                continue;
+            if (newProp == qOpening.contextDir)
+                break;
+            openingPosition = qOpening.position;
+            dirProps[openingPosition] = newProp;
+            closingPosition = -(qOpening.match);
+            dirProps[closingPosition] = newProp;
+            qOpening.match = 0;                     /* prevent further changes */
+            fixN0c(bd, k, openingPosition, newProp);
+            fixN0c(bd, k, closingPosition, newProp);
+        }
+    }
+
+    /* process closing bracket; return L or R if N0b or N0c, ON if N0d
+       openIdx is the index in bd.openings of the matched opening bracket,
+       position the text index of the closing bracket. On N0b/N0c the dirProps
+       of both brackets are overwritten with the resolved direction. */
+    private byte bracketProcessClosing(BracketData bd, int openIdx, int position) {
+        IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        Opening pOpening, qOpening;
+        byte direction;
+        boolean stable;
+        byte newProp;
+        pOpening = bd.openings[openIdx];
+        direction = (byte)(pLastIsoRun.level & 1);
+        stable = true;          /* assume stable until proved otherwise */
+
+        /* The stable flag is set when brackets are paired and their
+           level is resolved and cannot be changed by what will be
+           found later in the source string.
+           An unstable match can occur only when applying N0c, where
+           the resolved level depends on the preceding context, and
+           this context may be affected by text occurring later.
+           Example: RTL paragraph containing:  abc[(latin) HEBREW]
+           When the closing parenthesis is encountered, it appears
+           that N0c1 must be applied since 'abc' sets an opposite
+           direction context and both parentheses receive level 2.
+           However, when the closing square bracket is processed,
+           N0b applies because of 'HEBREW' being included within the
+           brackets, thus the square brackets are treated like R and
+           receive level 1. However, this changes the preceding
+           context of the opening parenthesis, and it now appears
+           that N0c2 must be applied to the parentheses rather than
+           N0c1. */
+
+        if ((direction == 0 && (pOpening.flags & FOUND_L) > 0) ||
+            (direction == 1 && (pOpening.flags & FOUND_R) > 0)) { /* N0b */
+            newProp = direction;
+        }
+        else if ((pOpening.flags & (FOUND_L | FOUND_R)) != 0) {    /* N0c */
+            /* it is stable if there is no preceding text or in
+               conditions too complicated and not worth checking */
+            stable = (openIdx == pLastIsoRun.start);
+            if (direction != pOpening.contextDir)
+                newProp = pOpening.contextDir;      /* N0c1 */
+            else
+                newProp = direction;                /* N0c2 */
+        } else {
+            /* forget this and any brackets nested within this pair */
+            /* NOTE(review): openIdx is narrowed to short here and below;
+               this relies on the number of pending openings staying within
+               the short range - confirm against the IsoRun declaration */
+            pLastIsoRun.limit = (short)openIdx;
+            return ON;                              /* N0d */
+        }
+        dirProps[pOpening.position] = newProp;
+        dirProps[position] = newProp;
+        /* Update nested N0c pairs that may be affected */
+        fixN0c(bd, openIdx, pOpening.position, newProp);
+        if (stable) {
+            pLastIsoRun.limit = (short)openIdx; /* forget any brackets nested within this pair */
+            /* remove lower located synonyms if any */
+            while (pLastIsoRun.limit > pLastIsoRun.start &&
+                   bd.openings[pLastIsoRun.limit - 1].position == pOpening.position)
+                pLastIsoRun.limit--;
+        } else {
+            int k;
+            /* a negative match marks the pair as resolved by N0c but still
+               subject to change; fixN0c() looks for such entries */
+            pOpening.match = -position;
+            /* neutralize lower located synonyms if any */
+            k = openIdx - 1;
+            while (k >= pLastIsoRun.start &&
+                   bd.openings[k].position == pOpening.position)
+                bd.openings[k--].match = 0;
+            /* neutralize any unmatched opening between the current pair;
+               this will also neutralize higher located synonyms if any */
+            for (k = openIdx + 1; k < pLastIsoRun.limit; k++) {
+                qOpening =bd.openings[k];
+                if (qOpening.position >= position)
+                    break;
+                if (qOpening.match > 0)
+                    qOpening.match = 0;
+            }
+        }
+        return newProp;
+    }
+
+    /* handle strong characters, digits and candidates for closing brackets
+       position is the text index of the character. For an ON character the
+       pending openings of the current isolate run are searched from the most
+       recent one for a matching closing character; failing that, the character
+       is recorded as an opening bracket if it is one. For every character the
+       run's lastBase / lastStrong / context fields are then updated, and a
+       resolved strong direction is noted in the flags of all pending openings
+       located before position. */
+    private void bracketProcessChar(BracketData bd, int position) {
+        IsoRun pLastIsoRun = bd.isoRuns[bd.isoRunLast];
+        byte dirProp, newProp;
+        byte level;
+        dirProp = dirProps[position];
+        if (dirProp == ON) {
+            char c, match;
+            int idx;
+            /* First see if it is a matching closing bracket. Hopefully, this is
+               more efficient than checking if it is a closing bracket at all */
+            c = text[position];
+            for (idx = pLastIsoRun.limit - 1; idx >= pLastIsoRun.start; idx--) {
+                if (bd.openings[idx].match != c)
+                    continue;
+                /* We have a match */
+                newProp = bracketProcessClosing(bd, idx, position);
+                if(newProp == ON) {         /* N0d */
+                    c = 0;          /* prevent handling as an opening */
+                    break;
+                }
+                pLastIsoRun.lastBase = ON;
+                pLastIsoRun.contextDir = newProp;
+                pLastIsoRun.contextPos = position;
+                level = levels[position];
+                if ((level & LEVEL_OVERRIDE) != 0) {    /* X4, X5 */
+                    short flag;
+                    int i;
+                    newProp = (byte)(level & 1);
+                    pLastIsoRun.lastStrong = newProp;
+                    flag = (short)DirPropFlag(newProp);
+                    for (i = pLastIsoRun.start; i < idx; i++)
+                        bd.openings[i].flags |= flag;
+                    /* matching brackets are not overridden by LRO/RLO */
+                    levels[position] &= ~LEVEL_OVERRIDE;
+                }
+                /* matching brackets are not overridden by LRO/RLO */
+                levels[bd.openings[idx].position] &= ~LEVEL_OVERRIDE;
+                return;
+            }
+            /* We get here only if the ON character is not a matching closing
+               bracket or it is a case of N0d */
+            /* Now see if it is an opening bracket */
+            if (c != 0) {
+                match = (char)UCharacter.getBidiPairedBracket(c); /* get the matching char */
+            } else {
+                match = 0;
+            }
+            if (match != c &&               /* has a matching char */
+                UCharacter.getIntPropertyValue(c, BIDI_PAIRED_BRACKET_TYPE) ==
+                    /* opening bracket */         BidiPairedBracketType.OPEN) {
+                /* special case: process synonyms
+                   create an opening entry for each synonym */
+                if (match == 0x232A) {      /* RIGHT-POINTING ANGLE BRACKET */
+                    bracketAddOpening(bd, (char)0x3009, position);
+                }
+                else if (match == 0x3009) { /* RIGHT ANGLE BRACKET */
+                    bracketAddOpening(bd, (char)0x232A, position);
+                }
+                bracketAddOpening(bd, match, position);
+            }
+        }
+        level = levels[position];
+        if ((level & LEVEL_OVERRIDE) != 0) {    /* X4, X5 */
+            newProp = (byte)(level & 1);
+            if (dirProp != S && dirProp != WS && dirProp != ON)
+                dirProps[position] = newProp;
+            pLastIsoRun.lastBase = newProp;
+            pLastIsoRun.lastStrong = newProp;
+            pLastIsoRun.contextDir = newProp;
+            pLastIsoRun.contextPos = position;
+        }
+        else if (dirProp <= R || dirProp == AL) {
+            newProp = DirFromStrong(dirProp);
+            pLastIsoRun.lastBase = dirProp;
+            pLastIsoRun.lastStrong = dirProp;
+            pLastIsoRun.contextDir = newProp;
+            pLastIsoRun.contextPos = position;
+        }
+        else if(dirProp == EN) {
+            pLastIsoRun.lastBase = EN;
+            if (pLastIsoRun.lastStrong == L) {
+                newProp = L;                    /* W7 */
+                if (!bd.isNumbersSpecial)
+                    dirProps[position] = ENL;
+                pLastIsoRun.contextDir = L;
+                pLastIsoRun.contextPos = position;
+            }
+            else {
+                newProp = R;                    /* N0 */
+                if (pLastIsoRun.lastStrong == AL)
+                    dirProps[position] = AN;    /* W2 */
+                else
+                    dirProps[position] = ENR;
+                pLastIsoRun.contextDir = R;
+                pLastIsoRun.contextPos = position;
+            }
+        }
+        else if (dirProp == AN) {
+            newProp = R;                        /* N0 */
+            pLastIsoRun.lastBase = AN;
+            pLastIsoRun.contextDir = R;
+            pLastIsoRun.contextPos = position;
+        }
+        else if (dirProp == NSM) {
+            /* if the last real char was ON, change NSM to ON so that it
+               will stay ON even if the last real char is a bracket which
+               may be changed to L or R */
+            newProp = pLastIsoRun.lastBase;
+            if (newProp == ON)
+                dirProps[position] = newProp;
+        }
+        else {
+            newProp = dirProp;
+            pLastIsoRun.lastBase = dirProp;
+        }
+        /* a strong direction was resolved: note it in every pending opening
+           of this isolate run that is located before this character */
+        if (newProp <= R || newProp == AL) {
+            int i;
+            short flag = (short)DirPropFlag(DirFromStrong(newProp));
+            for (i = pLastIsoRun.start; i < pLastIsoRun.limit; i++)
+                if (position > bd.openings[i].position)
+                    bd.openings[i].flags |= flag;
+        }
+    }
+
+    /* perform (X1)..(X9) ------------------------------------------------------- */
+
+    /* determine if the text is mixed-directional or single-directional
+       Returns LTR when flags holds no MASK_RTL bit and does not combine AN
+       with a MASK_POSSIBLE_N bit, RTL when it holds no MASK_LTR bit, and
+       MIXED otherwise. */
+    private byte directionFromFlags() {
+
+        /* if the text contains AN and neutrals, then some neutrals may become RTL */
+        if (!((flags & MASK_RTL) != 0 ||
+              ((flags & DirPropFlag(AN)) != 0 &&
+               (flags & MASK_POSSIBLE_N) != 0))) {
+            return LTR;
+        } else if ((flags & MASK_LTR) == 0) {
+            return RTL;
+        } else {
+            return MIXED;
+        }
+    }
+
+    /*
+     * Resolve the explicit levels as specified by explicit embedding codes.
+     * Recalculate the flags to have them reflect the real properties
+     * after taking the explicit embeddings into account.
+     *
+     * The BiDi algorithm is designed to result in the same behavior whether embedding
+     * levels are externally specified (from "styled text", supposedly the preferred
+     * method) or set by explicit embedding codes (LRx, RLx, PDF, FSI, PDI) in the plain text.
+     * That is why (X9) instructs to remove all not-isolate explicit codes (and BN).
+     * However, in a real implementation, the removal of these codes and their index
+     * positions in the plain text is undesirable since it would result in
+     * reallocated, reindexed text.
+     * Instead, this implementation leaves the codes in there and just ignores them
+     * in the subsequent processing.
+     * In order to get the same reordering behavior, positions with a BN or a not-isolate
+     * explicit embedding code just get the same level assigned as the last "real"
+     * character.
+     *
+     * Some implementations, not this one, then overwrite some of these
+     * directionality properties at "real" same-level-run boundaries by
+     * L or R codes so that the resolution of weak types can be performed on the
+     * entire paragraph at once instead of having to parse it once more and
+     * perform that resolution on same-level-runs.
+     * This limits the scope of the implicit rules in effectively
+     * the same way as the run limits.
+     *
+     * Instead, this implementation does not modify these codes, except for
+     * paired brackets whose properties (ON) may be replaced by L or R.
+     * On one hand, the paragraph has to be scanned for same-level-runs, but
+     * on the other hand, this saves another loop to reset these codes,
+     * or saves making and modifying a copy of dirProps[].
+     *
+     *
+     * Note that (Pn) and (Xn) changed significantly from version 4 of the BiDi algorithm.
+     *
+     *
+     * Handling the stack of explicit levels (Xn):
+     *
+     * With the BiDi stack of explicit levels, as pushed with each
+     * LRE, RLE, LRO, RLO, LRI, RLI and FSI and popped with each PDF and PDI,
+     * the explicit level must never exceed MAX_EXPLICIT_LEVEL.
+     *
+     * In order to have a correct push-pop semantics even in the case of overflows,
+     * overflow counters and a valid isolate counter are used as described in UAX#9
+     * section 3.3.2 "Explicit Levels and Directions".
+     *
+     * This implementation assumes that MAX_EXPLICIT_LEVEL is odd.
+     *
+     * Returns the direction
+     *
+     * Three early exits precede the full (Xn) pass:
+     *  - the flags already show a single direction: nothing is written;
+     *  - reorderingMode is greater than REORDER_LAST_LOGICAL_TO_VISUAL
+     *    (inverse BiDi): every position gets its paragraph level, without
+     *    bracket matching;
+     *  - the flags hold no explicit or isolate code: every position gets its
+     *    paragraph level and bracket matching is still performed.
+     */
+    private byte resolveExplicitLevels() {
+        int i = 0;
+        byte dirProp;
+        byte level = GetParaLevelAt(0);
+        byte dirct;
+        isolateCount = 0;
+
+        /* determine if the text is mixed-directional or single-directional */
+        dirct = directionFromFlags();
+
+        /* we may not need to resolve any explicit levels */
+        if (dirct != MIXED) {
+            /* not mixed directionality: levels don't matter - trailingWSStart will be 0 */
+            return dirct;
+        }
+
+        if (reorderingMode > REORDER_LAST_LOGICAL_TO_VISUAL) {
+            /* inverse BiDi: mixed, but all characters are at the same embedding level */
+            /* set all levels to the paragraph level */
+            int paraIndex, start, limit;
+            for (paraIndex = 0; paraIndex < paraCount; paraIndex++) {
+                if (paraIndex == 0)
+                    start = 0;
+                else
+                    start = paras_limit[paraIndex - 1];
+                limit = paras_limit[paraIndex];
+                level = paras_level[paraIndex];
+                for (i = start; i < limit; i++)
+                    levels[i] =level;
+            }
+            return dirct;               /* no bracket matching for inverse BiDi */
+        }
+        if ((flags & (MASK_EXPLICIT | MASK_ISO)) == 0) {
+            /* no embeddings, set all levels to the paragraph level */
+            /* we still have to perform bracket matching */
+            int paraIndex, start, limit;
+            BracketData bracketData = new BracketData();
+            bracketInit(bracketData);
+            for (paraIndex = 0; paraIndex < paraCount; paraIndex++) {
+                if (paraIndex == 0)
+                    start = 0;
+                else
+                    start = paras_limit[paraIndex-1];
+                limit = paras_limit[paraIndex];
+                level = paras_level[paraIndex];
+                for (i = start; i < limit; i++) {
+                    levels[i] = level;
+                    dirProp = dirProps[i];
+                    if (dirProp == BN)
+                        continue;
+                    if (dirProp == B) {
+                        if ((i + 1) < length) {
+                            if (text[i] == CR && text[i + 1] == LF)
+                                continue;   /* skip CR when followed by LF */
+                            bracketProcessB(bracketData, level);
+                        }
+                        continue;
+                    }
+                    bracketProcessChar(bracketData, i);
+                }
+            }
+            return dirct;
+        }
+        /* continue to perform (Xn) */
+
+        /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */
+        /* both variables may carry the LEVEL_OVERRIDE flag to indicate the override status */
+        byte embeddingLevel = level, newLevel;
+        byte previousLevel = level; /* previous level for regular (not CC) characters */
+        int lastCcPos = 0;          /* index of last effective LRx,RLx, PDx */
+
+        /* The following stack remembers the embedding level and the ISOLATE flag of level runs.
+           stackLast points to its current entry.
+           An entry pushed for an isolate has ISOLATE added to its level, so
+           "stack[stackLast] < ISOLATE" tests for an embedding/override entry. */
+        short[] stack = new short[MAX_EXPLICIT_LEVEL + 2];  /* we never push anything >= MAX_EXPLICIT_LEVEL
+                                                               but we need one more entry as base */
+        int stackLast = 0;
+        int overflowIsolateCount = 0;
+        int overflowEmbeddingCount = 0;
+        int validIsolateCount = 0;
+        BracketData bracketData = new BracketData();
+        bracketInit(bracketData);
+        stack[0] = level;   /* initialize base entry to para level, no override, no isolate */
+
+        /* recalculate the flags */
+        flags = 0;
+
+        for (i = 0; i < length; i++) {
+            dirProp = dirProps[i];
+            switch (dirProp) {
+            case LRE:
+            case RLE:
+            case LRO:
+            case RLO:
+                /* (X2, X3, X4, X5) */
+                flags |= DirPropFlag(BN);
+                levels[i] = previousLevel;
+                if (dirProp == LRE || dirProp == LRO) {
+                    /* least greater even level */
+                    newLevel = (byte)((embeddingLevel+2) & ~(LEVEL_OVERRIDE | 1));
+                } else {
+                    /* least greater odd level */
+                    newLevel = (byte)((NoOverride(embeddingLevel) + 1) | 1);
+                }
+                if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0 &&
+                                                      overflowEmbeddingCount == 0) {
+                    lastCcPos = i;
+                    embeddingLevel = newLevel;
+                    if (dirProp == LRO || dirProp == RLO)
+                        embeddingLevel |= LEVEL_OVERRIDE;
+                    stackLast++;
+                    stack[stackLast] = embeddingLevel;
+                    /* we don't need to set LEVEL_OVERRIDE off for LRE and RLE
+                       since this has already been done for newLevel which is
+                       the source for embeddingLevel.
+                     */
+                } else {
+                    if (overflowIsolateCount == 0)
+                        overflowEmbeddingCount++;
+                }
+                break;
+            case PDF:
+                /* (X7) */
+                flags |= DirPropFlag(BN);
+                levels[i] = previousLevel;
+                /* handle all the overflow cases first */
+                if (overflowIsolateCount > 0) {
+                    break;
+                }
+                if (overflowEmbeddingCount > 0) {
+                    overflowEmbeddingCount--;
+                    break;
+                }
+                if (stackLast > 0 && stack[stackLast] < ISOLATE) {   /* not an isolate entry */
+                    lastCcPos = i;
+                    stackLast--;
+                    embeddingLevel = (byte)stack[stackLast];
+                }
+                break;
+            case LRI:
+            case RLI:
+                flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel);
+                levels[i] = NoOverride(embeddingLevel);
+                if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+                    bracketProcessBoundary(bracketData, lastCcPos,
+                                           previousLevel, embeddingLevel);
+                    flags |= DirPropFlagMultiRuns;
+                }
+                previousLevel = embeddingLevel;
+                /* (X5a, X5b) */
+                if (dirProp == LRI)
+                    /* least greater even level */
+                    newLevel=(byte)((embeddingLevel+2)&~(LEVEL_OVERRIDE|1));
+                else
+                    /* least greater odd level */
+                    newLevel=(byte)((NoOverride(embeddingLevel)+1)|1);
+                if (newLevel <= MAX_EXPLICIT_LEVEL && overflowIsolateCount == 0
+                                                   && overflowEmbeddingCount == 0) {
+                    flags |= DirPropFlag(dirProp);
+                    lastCcPos = i;
+                    validIsolateCount++;
+                    if (validIsolateCount > isolateCount)
+                        isolateCount = validIsolateCount;
+                    embeddingLevel = newLevel;
+                    /* we can increment stackLast without checking because newLevel
+                       will exceed MAX_EXPLICIT_LEVEL before stackLast overflows */
+                    stackLast++;
+                    stack[stackLast] = (short)(embeddingLevel + ISOLATE);
+                    bracketProcessLRI_RLI(bracketData, embeddingLevel);
+                } else {
+                    /* make it WS so that it is handled by adjustWSLevels() */
+                    dirProps[i] = WS;
+                    overflowIsolateCount++;
+                }
+                break;
+            case PDI:
+                if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+                    bracketProcessBoundary(bracketData, lastCcPos,
+                                           previousLevel, embeddingLevel);
+                    flags |= DirPropFlagMultiRuns;
+                }
+                /* (X6a) */
+                if (overflowIsolateCount > 0) {
+                    overflowIsolateCount--;
+                    /* make it WS so that it is handled by adjustWSLevels() */
+                    dirProps[i] = WS;
+                }
+                else if (validIsolateCount > 0) {
+                    flags |= DirPropFlag(PDI);
+                    lastCcPos = i;
+                    overflowEmbeddingCount = 0;
+                    while (stack[stackLast] < ISOLATE)  /* pop embedding entries */
+                        stackLast--;                    /* until the last isolate entry */
+                    stackLast--;                        /* pop also the last isolate entry */
+                    validIsolateCount--;
+                    bracketProcessPDI(bracketData);
+                } else
+                    /* make it WS so that it is handled by adjustWSLevels() */
+                    dirProps[i] = WS;
+                embeddingLevel = (byte)(stack[stackLast] & ~ISOLATE);
+                flags |= DirPropFlag(ON) | DirPropFlagLR(embeddingLevel);
+                previousLevel = embeddingLevel;
+                levels[i] = NoOverride(embeddingLevel);
+                break;
+            case B:
+                flags |= DirPropFlag(B);
+                levels[i] = GetParaLevelAt(i);
+                if ((i + 1) < length) {
+                    if (text[i] == CR && text[i + 1] == LF)
+                        break;          /* skip CR when followed by LF */
+                    overflowEmbeddingCount = overflowIsolateCount = 0;
+                    validIsolateCount = 0;
+                    stackLast = 0;
+                    previousLevel = embeddingLevel = GetParaLevelAt(i + 1);
+                    stack[0] = embeddingLevel;   /* initialize base entry to para level, no override, no isolate */
+                    bracketProcessB(bracketData, embeddingLevel);
+                }
+                break;
+            case BN:
+                /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */
+                /* they will get their levels set correctly in adjustWSLevels() */
+                levels[i] = previousLevel;
+                flags |= DirPropFlag(BN);
+                break;
+            default:
+                /* all other types are normal characters and get the "real" level */
+                if (NoOverride(embeddingLevel) != NoOverride(previousLevel)) {
+                    bracketProcessBoundary(bracketData, lastCcPos,
+                                           previousLevel, embeddingLevel);
+                    flags |= DirPropFlagMultiRuns;
+                    if ((embeddingLevel & LEVEL_OVERRIDE) != 0)
+                        flags |= DirPropFlagO(embeddingLevel);
+                    else
+                        flags |= DirPropFlagE(embeddingLevel);
+                }
+                previousLevel = embeddingLevel;
+                levels[i] = embeddingLevel;
+                bracketProcessChar(bracketData, i);
+                /* the dirProp may have been changed in bracketProcessChar() */
+                flags |= DirPropFlag(dirProps[i]);
+                break;
+            }
+        }
+        if ((flags & MASK_EMBEDDING) != 0) {
+            flags |= DirPropFlagLR(paraLevel);
+        }
+        if (orderParagraphsLTR && (flags & DirPropFlag(B)) != 0) {
+            flags |= DirPropFlag(L);
+        }
+        /* again, determine if the text is mixed-directional or single-directional */
+        dirct = directionFromFlags();
+
+        return dirct;
+    }
+
+    /*
+     * Use a pre-specified embedding levels array:
+     *
+     * Adjust the directional properties for overrides (->LEVEL_OVERRIDE),
+     * ignore all explicit codes (X9),
+     * and check all the preset levels.
+     *
+     * Recalculate the flags to have them reflect the real properties
+     * after taking the explicit embeddings into account.
+     *
+     * Before validation, a level of 0 is replaced by paraLevel, and a level
+     * whose low 7 bits exceed MAX_EXPLICIT_LEVEL is replaced by paraLevel
+     * (keeping its override bit).
+     *
+     * Returns the direction computed by directionFromFlags().
+     * Throws IllegalArgumentException when a level is lower than the level
+     * of its paragraph (a level of 0 on a B is tolerated) or greater than
+     * MAX_EXPLICIT_LEVEL.
+     *
+     * Note: the local isolateCount is the current nesting depth; the field
+     * this.isolateCount receives the maximum depth reached.
+     */
+    private byte checkExplicitLevels() {
+        byte dirProp;
+        int i;
+        int isolateCount = 0;
+
+        this.flags = 0;     /* collect all directionalities in the text */
+        byte level;
+        this.isolateCount = 0;
+
+        for (i = 0; i < length; ++i) {
+            if (levels[i] == 0) {
+                levels[i] = paraLevel;
+            }
+
+            // for backward compatibility
+            if (MAX_EXPLICIT_LEVEL < (levels[i]&0x7f)) {
+                if ((levels[i] & LEVEL_OVERRIDE) != 0) {
+                    levels[i] =  (byte)(paraLevel|LEVEL_OVERRIDE);
+                } else {
+                    levels[i] = paraLevel;
+                }
+            }
+
+            level = levels[i];
+            dirProp = dirProps[i];
+            if (dirProp == LRI || dirProp == RLI) {
+                isolateCount++;
+                if (isolateCount > this.isolateCount)
+                    this.isolateCount = isolateCount;
+            }
+            else if (dirProp == PDI) {
+                isolateCount--;
+            } else if (dirProp == B) {
+                isolateCount = 0;
+            }
+            if ((level & LEVEL_OVERRIDE) != 0) {
+                /* keep the override flag in levels[i] but adjust the flags */
+                level &= ~LEVEL_OVERRIDE;     /* make the range check below simpler */
+                flags |= DirPropFlagO(level);
+            } else {
+                /* set the flags */
+                flags |= DirPropFlagE(level) | DirPropFlag(dirProp);
+            }
+            if ((level < GetParaLevelAt(i) &&
+                    !((0 == level) && (dirProp == B))) ||
+                    (MAX_EXPLICIT_LEVEL < level)) {
+                /* level out of bounds */
+                throw new IllegalArgumentException("level " + level +
+                                                   " out of bounds at " + i);
+            }
+        }
+        if ((flags & MASK_EMBEDDING) != 0) {
+            flags |= DirPropFlagLR(paraLevel);
+        }
+        /* determine if the text is mixed-directional or single-directional */
+        return directionFromFlags();
+    }
+
+    /*********************************************************************/
+    /* The Properties state machine table                                */
+    /*********************************************************************/
+    /*                                                                   */
+    /* All table cells are 8 bits:                                       */
+    /*      bits 0..4:  next state                                       */
+    /*      bits 5..7:  action to perform (if > 0)                       */
+    /*                                                                   */
+    /* Cells may be of format "n" where n represents the next state      */
+    /* (except for the rightmost column).                                */
+    /* Cells may also be of format "_(x,y)" where x represents an action */
+    /* to perform and y represents the next state.                       */
+    /*                                                                   */
+    /*********************************************************************/
+    /* Definitions and type for properties state tables                  */
+    /*********************************************************************/
+    private static final int IMPTABPROPS_COLUMNS = 16;
+    private static final int IMPTABPROPS_RES = IMPTABPROPS_COLUMNS - 1;
+    /* next state of a cell: its low 5 bits */
+    private static short GetStateProps(short cell) {
+        return (short)(cell & 0x1f);
+    }
+    /* action of a cell: the bits above the low 5 (0 means no action) */
+    private static short GetActionProps(short cell) {
+        return (short)(cell >> 5);
+    }
+
+    /* maps each dirProp value (in the order listed below) to a group number;
+       presumably the column index into impTabProps - confirm at the use site */
+    private static final short groupProp[] =          /* dirProp regrouped */
+    {
+        /*  L   R   EN  ES  ET  AN  CS  B   S   WS  ON  LRE LRO AL  RLE RLO PDF NSM BN  FSI LRI RLI PDI ENL ENR */
+            0,  1,  2,  7,  8,  3,  9,  6,  5,  4,  4,  10, 10, 12, 10, 10, 10, 11, 10, 4,  4,  4,  4,  13, 14
+    };
+    private static final short _L  = 0;
+    private static final short _R  = 1;
+    private static final short _EN = 2;
+    private static final short _AN = 3;
+    private static final short _ON = 4;
+    private static final short _S  = 5;
+    private static final short _B  = 6; /* reduced dirProp */
+
+    /*********************************************************************/
+    /*                                                                   */
+    /*                      PROPERTIES  STATE  TABLE                     */
+    /*                                                                   */
+    /* In table impTabProps,                                             */
+    /*      - the ON column
regroups ON and WS, FSI, RLI, LRI and PDI    */
+    /*      - the BN column regroups BN, LRE, RLE, LRO, RLO, PDF         */
+    /*      - the Res column is the reduced property assigned to a run   */
+    /*                                                                   */
+    /* Action 1: process current run1, init new run1                     */
+    /*        2: init new run2                                           */
+    /*        3: process run1, process run2, init new run1               */
+    /*        4: process run1, set run1=run2, init new run2              */
+    /*                                                                   */
+    /* Notes:                                                            */
+    /*  1) This table is used in resolveImplicitLevels().                */
+    /*  2) This table triggers actions when there is a change in the Bidi*/
+    /*     property of incoming characters (action 1).                   */
+    /*  3) Most such property sequences are processed immediately (in    */
+    /*     fact, passed to processPropertySeq()).                        */
+    /*  4) However, numbers are assembled as one sequence. This means    */
+    /*     that undefined situations (like CS following digits, until    */
+    /*     it is known if the next char will be a digit) are held until  */
+    /*     following chars define them.                                  */
+    /*     Example: digits followed by CS, then comes another CS or ON;  */
+    /*              the digits will be processed, then the CS assigned   */
+    /*              as the start of an ON sequence (action 3).           */
+    /*  5) There are cases where more than one sequence must be          */
+    /*     processed, for instance digits followed by CS followed by L:  */
+    /*     the digits must be processed as one sequence, and the CS      */
+    /*     must be processed as an ON sequence, all this before starting */
+    /*     assembling chars for the opening L sequence.
*/
+    /*                                                                   */
+    /*                                                                   */
+    /* Cell encoding used below: a plain number n is "next state n, no action";
+       32+n, 64+n, 96+n and 128+n are actions 1, 2, 3 and 4 respectively with
+       next state n (action in bits 5..7, state in bits 0..4, as decoded by
+       GetActionProps() and GetStateProps()).  The last column (Res) is not a
+       cell but the reduced property assigned to a run in that state. */
+    private static final short impTabProps[][] =
+    {
+/* L, R, EN, AN, ON, S, B, ES, ET, CS, BN, NSM, AL, ENL, ENR, Res */
+/* 0 Init */ { 1, 2, 4, 5, 7, 15, 17, 7, 9, 7, 0, 7, 3, 18, 21, _ON },
+/* 1 L */ { 1, 32+2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 1, 1, 32+3, 32+18, 32+21, _L },
+/* 2 R */ { 32+1, 2, 32+4, 32+5, 32+7, 32+15, 32+17, 32+7, 32+9, 32+7, 2, 2, 32+3, 32+18, 32+21, _R },
+/* 3 AL */ { 32+1, 32+2, 32+6, 32+6, 32+8, 32+16, 32+17, 32+8, 32+8, 32+8, 3, 3, 3, 32+18, 32+21, _R },
+/* 4 EN */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 64+10, 11, 64+10, 4, 4, 32+3, 18, 21, _EN },
+/* 5 AN */ { 32+1, 32+2, 32+4, 5, 32+7, 32+15, 32+17, 32+7, 32+9, 64+12, 5, 5, 32+3, 32+18, 32+21, _AN },
+/* 6 AL:EN/AN */ { 32+1, 32+2, 6, 6, 32+8, 32+16, 32+17, 32+8, 32+8, 64+13, 6, 6, 32+3, 18, 21, _AN },
+/* 7 ON */ { 32+1, 32+2, 32+4, 32+5, 7, 32+15, 32+17, 7, 64+14, 7, 7, 7, 32+3, 32+18, 32+21, _ON },
+/* 8 AL:ON */ { 32+1, 32+2, 32+6, 32+6, 8, 32+16, 32+17, 8, 8, 8, 8, 8, 32+3, 32+18, 32+21, _ON },
+/* 9 ET */ { 32+1, 32+2, 4, 32+5, 7, 32+15, 32+17, 7, 9, 7, 9, 9, 32+3, 18, 21, _ON },
+/*10 EN+ES/CS */ { 96+1, 96+2, 4, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 10, 128+7, 96+3, 18, 21, _EN },
+/*11 EN+ET */ { 32+1, 32+2, 4, 32+5, 32+7, 32+15, 32+17, 32+7, 11, 32+7, 11, 11, 32+3, 18, 21, _EN },
+/*12 AN+CS */ { 96+1, 96+2, 96+4, 5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 12, 128+7, 96+3, 96+18, 96+21, _AN },
+/*13 AL:EN/AN+CS */ { 96+1, 96+2, 6, 6, 128+8, 96+16, 96+17, 128+8, 128+8, 128+8, 13, 128+8, 96+3, 18, 21, _AN },
+/*14 ON+ET */ { 32+1, 32+2, 128+4, 32+5, 7, 32+15, 32+17, 7, 14, 7, 14, 14, 32+3,128+18,128+21, _ON },
+/*15 S */ { 32+1, 32+2, 32+4, 32+5, 32+7, 15, 32+17, 32+7, 32+9, 32+7, 15, 32+7, 32+3, 32+18, 32+21, _S },
+/*16 AL:S */ { 32+1, 32+2, 32+6, 32+6, 32+8, 16, 32+17, 32+8, 32+8, 32+8, 16, 32+8, 32+3, 32+18, 32+21, _S },
+/*17 B */ { 32+1, 32+2, 32+4, 32+5, 32+7, 32+15, 17, 32+7, 32+9, 32+7, 17, 32+7, 32+3, 32+18, 32+21, _B },
+/*18 ENL */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 64+19, 20, 64+19, 18, 18, 32+3, 18, 21, _L },
+/*19 ENL+ES/CS */ { 96+1, 96+2, 18, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 19, 128+7, 96+3, 18, 21, _L },
+/*20 ENL+ET */ { 32+1, 32+2, 18, 32+5, 32+7, 32+15, 32+17, 32+7, 20, 32+7, 20, 20, 32+3, 18, 21, _L },
+/*21 ENR */ { 32+1, 32+2, 21, 32+5, 32+7, 32+15, 32+17, 64+22, 23, 64+22, 21, 21, 32+3, 18, 21, _AN },
+/*22 ENR+ES/CS */ { 96+1, 96+2, 21, 96+5, 128+7, 96+15, 96+17, 128+7,128+14, 128+7, 22, 128+7, 96+3, 18, 21, _AN },
+/*23 ENR+ET */ { 32+1, 32+2, 21, 32+5, 32+7, 32+15, 32+17, 32+7, 23, 32+7, 23, 23, 32+3, 18, 21, _AN }
+    };
+
+    /*********************************************************************/
+    /* The levels state machine tables                                   */
+    /*********************************************************************/
+    /*                                                                   */
+    /* All table cells are 8 bits:                                       */
+    /*      bits 0..3:  next state                                       */
+    /*      bits 4..7:  action to perform (if > 0)                       */
+    /*                                                                   */
+    /* Cells may be of format "n" where n represents the next state      */
+    /* (except for the rightmost column).                                */
+    /* Cells may also be of format "_(x,y)" where x represents an action */
+    /* to perform and y represents the next state.
*/ + /* */ + /* This format limits each table to 16 states each and to 15 actions.*/ + /* */ + /*********************************************************************/ + /* Definitions and type for levels state tables */ + /*********************************************************************/ + private static final int IMPTABLEVELS_COLUMNS = _B + 2; + private static final int IMPTABLEVELS_RES = IMPTABLEVELS_COLUMNS - 1; + private static short GetState(byte cell) { return (short)(cell & 0x0f); } + private static short GetAction(byte cell) { return (short)(cell >> 4); } + + private static class ImpTabPair { + byte[][][] imptab; + short[][] impact; + + ImpTabPair(byte[][] table1, byte[][] table2, + short[] act1, short[] act2) { + imptab = new byte[][][] {table1, table2}; + impact = new short[][] {act1, act2}; + } + } + + /*********************************************************************/ + /* */ + /* LEVELS STATE TABLES */ + /* */ + /* In all levels state tables, */ + /* - state 0 is the initial state */ + /* - the Res column is the increment to add to the text level */ + /* for this property sequence. */ + /* */ + /* The impact arrays for each table of a pair map the local action */ + /* numbers of the table to the total list of actions. For instance, */ + /* action 2 in a given table corresponds to the action number which */ + /* appears in entry [2] of the impact array for that table. */ + /* The first entry of all impact arrays must be 0. */ + /* */ + /* Action 1: init conditional sequence */ + /* 2: prepend conditional sequence to current sequence */ + /* 3: set ON sequence to new level - 1 */ + /* 4: init EN/AN/ON sequence */ + /* 5: fix EN/AN/ON sequence followed by R */ + /* 6: set previous level sequence to level 2 */ + /* */ + /* Notes: */ + /* 1) These tables are used in processPropertySeq(). The input */ + /* is property sequences as determined by resolveImplicitLevels. 
*/ + /* 2) Most such property sequences are processed immediately */ + /* (levels are assigned). */ + /* 3) However, some sequences cannot be assigned a final level till */ + /* one or more following sequences are received. For instance, */ + /* ON following an R sequence within an even-level paragraph. */ + /* If the following sequence is R, the ON sequence will be */ + /* assigned basic run level+1, and so will the R sequence. */ + /* 4) S is generally handled like ON, since its level will be fixed */ + /* to paragraph level in adjustWSLevels(). */ + /* */ + + private static final byte impTabL_DEFAULT[][] = /* Even paragraph level */ + /* In this table, conditional sequences receive the lower possible level + until proven otherwise. + */ + { + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 0, 1, 0, 2, 0, 0, 0, 0 }, + /* 1 : R */ { 0, 1, 3, 3, 0x14, 0x14, 0, 1 }, + /* 2 : AN */ { 0, 1, 0, 2, 0x15, 0x15, 0, 2 }, + /* 3 : R+EN/AN */ { 0, 1, 3, 3, 0x14, 0x14, 0, 2 }, + /* 4 : R+ON */ { 0, 0x21, 0x33, 0x33, 4, 4, 0, 0 }, + /* 5 : AN+ON */ { 0, 0x21, 0, 0x32, 5, 5, 0, 0 } + }; + + private static final byte impTabR_DEFAULT[][] = /* Odd paragraph level */ + /* In this table, conditional sequences receive the lower possible level + until proven otherwise. + */ + { + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 1, 0, 2, 2, 0, 0, 0, 0 }, + /* 1 : L */ { 1, 0, 1, 3, 0x14, 0x14, 0, 1 }, + /* 2 : EN/AN */ { 1, 0, 2, 2, 0, 0, 0, 1 }, + /* 3 : L+AN */ { 1, 0, 1, 3, 5, 5, 0, 1 }, + /* 4 : L+ON */ { 0x21, 0, 0x21, 3, 4, 4, 0, 0 }, + /* 5 : L+AN+ON */ { 1, 0, 1, 3, 5, 5, 0, 0 } + }; + + private static final short[] impAct0 = {0,1,2,3,4}; + + private static final ImpTabPair impTab_DEFAULT = new ImpTabPair( + impTabL_DEFAULT, impTabR_DEFAULT, impAct0, impAct0); + + private static final byte impTabL_NUMBERS_SPECIAL[][] = { /* Even paragraph level */ + /* In this table, conditional sequences receive the lower possible + level until proven otherwise. 
+ */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 0, 2, 0x11, 0x11, 0, 0, 0, 0 }, + /* 1 : L+EN/AN */ { 0, 0x42, 1, 1, 0, 0, 0, 0 }, + /* 2 : R */ { 0, 2, 4, 4, 0x13, 0x13, 0, 1 }, + /* 3 : R+ON */ { 0, 0x22, 0x34, 0x34, 3, 3, 0, 0 }, + /* 4 : R+EN/AN */ { 0, 2, 4, 4, 0x13, 0x13, 0, 2 } + }; + private static final ImpTabPair impTab_NUMBERS_SPECIAL = new ImpTabPair( + impTabL_NUMBERS_SPECIAL, impTabR_DEFAULT, impAct0, impAct0); + + private static final byte impTabL_GROUP_NUMBERS_WITH_R[][] = { + /* In this table, EN/AN+ON sequences receive levels as if associated with R + until proven that there is L or sor/eor on both sides. AN is handled like EN. + */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 init */ { 0, 3, 0x11, 0x11, 0, 0, 0, 0 }, + /* 1 EN/AN */ { 0x20, 3, 1, 1, 2, 0x20, 0x20, 2 }, + /* 2 EN/AN+ON */ { 0x20, 3, 1, 1, 2, 0x20, 0x20, 1 }, + /* 3 R */ { 0, 3, 5, 5, 0x14, 0, 0, 1 }, + /* 4 R+ON */ { 0x20, 3, 5, 5, 4, 0x20, 0x20, 1 }, + /* 5 R+EN/AN */ { 0, 3, 5, 5, 0x14, 0, 0, 2 } + }; + private static final byte impTabR_GROUP_NUMBERS_WITH_R[][] = { + /* In this table, EN/AN+ON sequences receive levels as if associated with R + until proven that there is L on both sides. AN is handled like EN. + */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 init */ { 2, 0, 1, 1, 0, 0, 0, 0 }, + /* 1 EN/AN */ { 2, 0, 1, 1, 0, 0, 0, 1 }, + /* 2 L */ { 2, 0, 0x14, 0x14, 0x13, 0, 0, 1 }, + /* 3 L+ON */ { 0x22, 0, 4, 4, 3, 0, 0, 0 }, + /* 4 L+EN/AN */ { 0x22, 0, 4, 4, 3, 0, 0, 1 } + }; + private static final ImpTabPair impTab_GROUP_NUMBERS_WITH_R = new + ImpTabPair(impTabL_GROUP_NUMBERS_WITH_R, + impTabR_GROUP_NUMBERS_WITH_R, impAct0, impAct0); + + private static final byte impTabL_INVERSE_NUMBERS_AS_L[][] = { + /* This table is identical to the Default LTR table except that EN and AN + are handled like L. 
+ */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 0, 1, 0, 0, 0, 0, 0, 0 }, + /* 1 : R */ { 0, 1, 0, 0, 0x14, 0x14, 0, 1 }, + /* 2 : AN */ { 0, 1, 0, 0, 0x15, 0x15, 0, 2 }, + /* 3 : R+EN/AN */ { 0, 1, 0, 0, 0x14, 0x14, 0, 2 }, + /* 4 : R+ON */ { 0x20, 1, 0x20, 0x20, 4, 4, 0x20, 1 }, + /* 5 : AN+ON */ { 0x20, 1, 0x20, 0x20, 5, 5, 0x20, 1 } + }; + private static final byte impTabR_INVERSE_NUMBERS_AS_L[][] = { + /* This table is identical to the Default RTL table except that EN and AN + are handled like L. + */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 1, 0, 1, 1, 0, 0, 0, 0 }, + /* 1 : L */ { 1, 0, 1, 1, 0x14, 0x14, 0, 1 }, + /* 2 : EN/AN */ { 1, 0, 1, 1, 0, 0, 0, 1 }, + /* 3 : L+AN */ { 1, 0, 1, 1, 5, 5, 0, 1 }, + /* 4 : L+ON */ { 0x21, 0, 0x21, 0x21, 4, 4, 0, 0 }, + /* 5 : L+AN+ON */ { 1, 0, 1, 1, 5, 5, 0, 0 } + }; + private static final ImpTabPair impTab_INVERSE_NUMBERS_AS_L = new ImpTabPair + (impTabL_INVERSE_NUMBERS_AS_L, impTabR_INVERSE_NUMBERS_AS_L, + impAct0, impAct0); + + private static final byte impTabR_INVERSE_LIKE_DIRECT[][] = { /* Odd paragraph level */ + /* In this table, conditional sequences receive the lower possible level + until proven otherwise. 
+ */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 1, 0, 2, 2, 0, 0, 0, 0 }, + /* 1 : L */ { 1, 0, 1, 2, 0x13, 0x13, 0, 1 }, + /* 2 : EN/AN */ { 1, 0, 2, 2, 0, 0, 0, 1 }, + /* 3 : L+ON */ { 0x21, 0x30, 6, 4, 3, 3, 0x30, 0 }, + /* 4 : L+ON+AN */ { 0x21, 0x30, 6, 4, 5, 5, 0x30, 3 }, + /* 5 : L+AN+ON */ { 0x21, 0x30, 6, 4, 5, 5, 0x30, 2 }, + /* 6 : L+ON+EN */ { 0x21, 0x30, 6, 4, 3, 3, 0x30, 1 } + }; + private static final short[] impAct1 = {0,1,13,14}; + private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT = new ImpTabPair( + impTabL_DEFAULT, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1); + + private static final byte impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS[][] = { + /* The case handled in this table is (visually): R EN L + */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 0, 0x63, 0, 1, 0, 0, 0, 0 }, + /* 1 : L+AN */ { 0, 0x63, 0, 1, 0x12, 0x30, 0, 4 }, + /* 2 : L+AN+ON */ { 0x20, 0x63, 0x20, 1, 2, 0x30, 0x20, 3 }, + /* 3 : R */ { 0, 0x63, 0x55, 0x56, 0x14, 0x30, 0, 3 }, + /* 4 : R+ON */ { 0x30, 0x43, 0x55, 0x56, 4, 0x30, 0x30, 3 }, + /* 5 : R+EN */ { 0x30, 0x43, 5, 0x56, 0x14, 0x30, 0x30, 4 }, + /* 6 : R+AN */ { 0x30, 0x43, 0x55, 6, 0x14, 0x30, 0x30, 4 } + }; + private static final byte impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS[][] = { + /* The cases handled in this table are (visually): R EN L + R L AN L + */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 0x13, 0, 1, 1, 0, 0, 0, 0 }, + /* 1 : R+EN/AN */ { 0x23, 0, 1, 1, 2, 0x40, 0, 1 }, + /* 2 : R+EN/AN+ON */ { 0x23, 0, 1, 1, 2, 0x40, 0, 0 }, + /* 3 : L */ { 3, 0, 3, 0x36, 0x14, 0x40, 0, 1 }, + /* 4 : L+ON */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 0 }, + /* 5 : L+ON+EN */ { 0x53, 0x40, 5, 0x36, 4, 0x40, 0x40, 1 }, + /* 6 : L+AN */ { 0x53, 0x40, 6, 6, 4, 0x40, 0x40, 3 } + }; + private static final short[] impAct2 = {0,1,2,5,6,7,8}; + private static final short[] impAct3 = {0,1,9,10,11,12}; + private static final ImpTabPair impTab_INVERSE_LIKE_DIRECT_WITH_MARKS = + new 
ImpTabPair(impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS, + impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3); + + private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL = new ImpTabPair( + impTabL_NUMBERS_SPECIAL, impTabR_INVERSE_LIKE_DIRECT, impAct0, impAct1); + + private static final byte impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS[][] = { + /* The case handled in this table is (visually): R EN L + */ + /* L, R, EN, AN, ON, S, B, Res */ + /* 0 : init */ { 0, 0x62, 1, 1, 0, 0, 0, 0 }, + /* 1 : L+EN/AN */ { 0, 0x62, 1, 1, 0, 0x30, 0, 4 }, + /* 2 : R */ { 0, 0x62, 0x54, 0x54, 0x13, 0x30, 0, 3 }, + /* 3 : R+ON */ { 0x30, 0x42, 0x54, 0x54, 3, 0x30, 0x30, 3 }, + /* 4 : R+EN/AN */ { 0x30, 0x42, 4, 4, 0x13, 0x30, 0x30, 4 } + }; + private static final ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS = new + ImpTabPair(impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS, + impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS, impAct2, impAct3); + + private static class LevState { + byte[][] impTab; /* level table pointer */ + short[] impAct; /* action map array */ + int startON; /* start of ON sequence */ + int startL2EN; /* start of level 2 sequence */ + int lastStrongRTL; /* index of last found R or AL */ + int runStart; /* start position of the run */ + short state; /* current state */ + byte runLevel; /* run level before implicit solving */ + } + + /*------------------------------------------------------------------------*/ + + static final int FIRSTALLOC = 10; + /* + * param pos: position where to insert + * param flag: one of LRM_BEFORE, LRM_AFTER, RLM_BEFORE, RLM_AFTER + */ + private void addPoint(int pos, int flag) + { + Point point = new Point(); + + int len = insertPoints.points.length; + if (len == 0) { + insertPoints.points = new Point[FIRSTALLOC]; + len = FIRSTALLOC; + } + if (insertPoints.size >= len) { /* no room for new point */ + Point[] savePoints = insertPoints.points; + insertPoints.points = new Point[len * 2]; + System.arraycopy(savePoints, 0, 
insertPoints.points, 0, len); + } + point.pos = pos; + point.flag = flag; + insertPoints.points[insertPoints.size] = point; + insertPoints.size++; + } + + private void setLevelsOutsideIsolates(int start, int limit, byte level) + { + byte dirProp; + int isolateCount = 0, k; + for (k = start; k < limit; k++) { + dirProp = dirProps[k]; + if (dirProp == PDI) + isolateCount--; + if (isolateCount == 0) { + levels[k] = level; + } + if (dirProp == LRI || dirProp == RLI) + isolateCount++; + } + } + + /* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */ + + /* + * This implementation of the (Wn) rules applies all rules in one pass. + * In order to do so, it needs a look-ahead of typically 1 character + * (except for W5: sequences of ET) and keeps track of changes + * in a rule Wp that affect a later Wq (p= 0) { + addPoint(levState.startL2EN, LRM_BEFORE); + } + levState.startL2EN = -1; /* not within previous if since could also be -2 */ + /* check if we had any relevant EN/AN after R/AL */ + if ((insertPoints.points.length == 0) || + (insertPoints.size <= insertPoints.confirmed)) { + /* nothing, just clean up */ + levState.lastStrongRTL = -1; + /* check if we have a pending conditional segment */ + level = impTab[oldStateSeq][IMPTABLEVELS_RES]; + if ((level & 1) != 0 && levState.startON > 0) { /* after ON */ + start = levState.startON; /* reset to basic run level */ + } + if (_prop == _S) { /* add LRM before S */ + addPoint(start0, LRM_BEFORE); + insertPoints.confirmed = insertPoints.size; + } + break; + } + /* reset previous RTL cont to level for LTR text */ + for (k = levState.lastStrongRTL + 1; k < start0; k++) { + /* reset odd level, leave runLevel+2 as is */ + levels[k] = (byte)((levels[k] - 2) & ~1); + } + /* mark insert points as confirmed */ + insertPoints.confirmed = insertPoints.size; + levState.lastStrongRTL = -1; + if (_prop == _S) { /* add LRM before S */ + addPoint(start0, LRM_BEFORE); + insertPoints.confirmed = insertPoints.size; + } 
+ break; + + case 6: /* R/AL after possible relevant EN/AN */ + /* just clean up */ + if (insertPoints.points.length > 0) + /* remove all non confirmed insert points */ + insertPoints.size = insertPoints.confirmed; + levState.startON = -1; + levState.startL2EN = -1; + levState.lastStrongRTL = limit - 1; + break; + + case 7: /* EN/AN after R/AL + possible cont */ + /* check for real AN */ + + if ((_prop == _AN) && (dirProps[start0] == AN) && + (reorderingMode != REORDER_INVERSE_FOR_NUMBERS_SPECIAL)) + { + /* real AN */ + if (levState.startL2EN == -1) { /* if no relevant EN already found */ + /* just note the rightmost digit as a strong RTL */ + levState.lastStrongRTL = limit - 1; + break; + } + if (levState.startL2EN >= 0) { /* after EN, no AN */ + addPoint(levState.startL2EN, LRM_BEFORE); + levState.startL2EN = -2; + } + /* note AN */ + addPoint(start0, LRM_BEFORE); + break; + } + /* if first EN/AN after R/AL */ + if (levState.startL2EN == -1) { + levState.startL2EN = start0; + } + break; + + case 8: /* note location of latest R/AL */ + levState.lastStrongRTL = limit - 1; + levState.startON = -1; + break; + + case 9: /* L after R+ON/EN/AN */ + /* include possible adjacent number on the left */ + for (k = start0-1; k >= 0 && ((levels[k] & 1) == 0); k--) { + } + if (k >= 0) { + addPoint(k, RLM_BEFORE); /* add RLM before */ + insertPoints.confirmed = insertPoints.size; /* confirm it */ + } + levState.startON = start0; + break; + + case 10: /* AN after L */ + /* AN numbers between L text on both sides may be trouble. 
*/ + /* tentatively bracket with LRMs; will be confirmed if followed by L */ + addPoint(start0, LRM_BEFORE); /* add LRM before */ + addPoint(start0, LRM_AFTER); /* add LRM after */ + break; + + case 11: /* R after L+ON/EN/AN */ + /* false alert, infirm LRMs around previous AN */ + insertPoints.size=insertPoints.confirmed; + if (_prop == _S) { /* add RLM before S */ + addPoint(start0, RLM_BEFORE); + insertPoints.confirmed = insertPoints.size; + } + break; + + case 12: /* L after L+ON/AN */ + level = (byte)(levState.runLevel + addLevel); + for (k=levState.startON; k < start0; k++) { + if (levels[k] < level) { + levels[k] = level; + } + } + insertPoints.confirmed = insertPoints.size; /* confirm inserts */ + levState.startON = start0; + break; + + case 13: /* L after L+ON+EN/AN/ON */ + level = levState.runLevel; + for (k = start0-1; k >= levState.startON; k--) { + if (levels[k] == level+3) { + while (levels[k] == level+3) { + levels[k--] -= 2; + } + while (levels[k] == level) { + k--; + } + } + if (levels[k] == level+2) { + levels[k] = level; + continue; + } + levels[k] = (byte)(level+1); + } + break; + + case 14: /* R after L+ON+EN/AN/ON */ + level = (byte)(levState.runLevel+1); + for (k = start0-1; k >= levState.startON; k--) { + if (levels[k] > level) { + levels[k] -= 2; + } + } + break; + + default: /* we should never get here */ + throw new IllegalStateException("Internal ICU error in processPropertySeq"); + } + } + if ((addLevel) != 0 || (start < start0)) { + level = (byte)(levState.runLevel + addLevel); + if (start >= levState.runStart) { + for (k = start; k < limit; k++) { + levels[k] = level; + } + } else { + setLevelsOutsideIsolates(start, limit, level); + } + } + } + + private void resolveImplicitLevels(int start, int limit, short sor, short eor) + { + byte dirProp; + LevState levState = new LevState(); + int i, start1, start2; + short oldStateImp, stateImp, actionImp; + short gprop, resProp, cell; + boolean inverseRTL; + short nextStrongProp = R; + int 
nextStrongPos = -1; + + /* check for RTL inverse Bidi mode */ + /* FOOD FOR THOUGHT: in case of RTL inverse Bidi, it would make sense to + * loop on the text characters from end to start. + * This would need a different properties state table (at least different + * actions) and different levels state tables (maybe very similar to the + * LTR corresponding ones. + */ + inverseRTL=((start0) && + (reorderingMode == REORDER_INVERSE_LIKE_DIRECT || + reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL)); + /* initialize for property and levels state table */ + levState.startL2EN = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ + levState.lastStrongRTL = -1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */ + levState.runStart = start; + levState.runLevel = levels[start]; + levState.impTab = impTabPair.imptab[levState.runLevel & 1]; + levState.impAct = impTabPair.impact[levState.runLevel & 1]; + + /* The isolates[] entries contain enough information to + resume the bidi algorithm in the same state as it was + when it was interrupted by an isolate sequence. 
*/ + if (dirProps[start] == PDI) { + levState.startON = isolates[isolateCount].startON; + start1 = isolates[isolateCount].start1; + stateImp = isolates[isolateCount].stateImp; + levState.state = isolates[isolateCount].state; + isolateCount--; + } else { + levState.startON = -1; + start1 = start; + if (dirProps[start] == NSM) + stateImp = (short)(1 + sor); + else + stateImp = 0; + levState.state = 0; + processPropertySeq(levState, sor, start, start); + } + start2 = start; /* to make the Java compiler happy */ + + for (i = start; i <= limit; i++) { + if (i >= limit) { + int k; + for (k = limit - 1; + k > start && + (DirPropFlag(dirProps[k]) & MASK_BN_EXPLICIT) != 0; + k--); + dirProp = dirProps[k]; + if (dirProp == LRI || dirProp == RLI) + break; /* no forced closing for sequence ending with LRI/RLI */ + gprop = eor; + } else { + byte prop, prop1; + prop = dirProps[i]; + if (prop == B) + isolateCount = -1; /* current isolates stack entry == none */ + if (inverseRTL) { + if (prop == AL) { + /* AL before EN does not make it AN */ + prop = R; + } else if (prop == EN) { + if (nextStrongPos <= i) { + /* look for next strong char (L/R/AL) */ + int j; + nextStrongProp = R; /* set default */ + nextStrongPos = limit; + for (j = i+1; j < limit; j++) { + prop1 = dirProps[j]; + if (prop1 == L || prop1 == R || prop1 == AL) { + nextStrongProp = prop1; + nextStrongPos = j; + break; + } + } + } + if (nextStrongProp == AL) { + prop = AN; + } + } + } + gprop = groupProp[prop]; + } + oldStateImp = stateImp; + cell = impTabProps[oldStateImp][gprop]; + stateImp = GetStateProps(cell); /* isolate the new state */ + actionImp = GetActionProps(cell); /* isolate the action */ + if ((i == limit) && (actionImp == 0)) { + /* there is an unprocessed sequence if its property == eor */ + actionImp = 1; /* process the last sequence */ + } + if (actionImp != 0) { + resProp = impTabProps[oldStateImp][IMPTABPROPS_RES]; + switch (actionImp) { + case 1: /* process current seq1, init new seq1 */ + 
processPropertySeq(levState, resProp, start1, i); + start1 = i; + break; + case 2: /* init new seq2 */ + start2 = i; + break; + case 3: /* process seq1, process seq2, init new seq1 */ + processPropertySeq(levState, resProp, start1, start2); + processPropertySeq(levState, _ON, start2, i); + start1 = i; + break; + case 4: /* process seq1, set seq1=seq2, init new seq2 */ + processPropertySeq(levState, resProp, start1, start2); + start1 = start2; + start2 = i; + break; + default: /* we should never get here */ + throw new IllegalStateException("Internal ICU error in resolveImplicitLevels"); + } + } + } + + /* look for the last char not a BN or LRE/RLE/LRO/RLO/PDF */ + for (i = limit - 1; + i > start && + (DirPropFlag(dirProps[i]) & MASK_BN_EXPLICIT) != 0; + i--); + dirProp = dirProps[i]; + if ((dirProp == LRI || dirProp == RLI) && limit < length) { + isolateCount++; + if (isolates[isolateCount] == null) + isolates[isolateCount] = new Isolate(); + isolates[isolateCount].stateImp = stateImp; + isolates[isolateCount].state = levState.state; + isolates[isolateCount].start1 = start1; + isolates[isolateCount].startON = levState.startON; + } + else + processPropertySeq(levState, eor, limit, limit); + } + + /* perform (L1) and (X9) ---------------------------------------------------- */ + + /* + * Reset the embedding levels for some non-graphic characters (L1). + * This method also sets appropriate levels for BN, and + * explicit embedding types that are supposed to have been removed + * from the paragraph in (X9). 
+ */ + private void adjustWSLevels() { + int i; + + if ((flags & MASK_WS) != 0) { + int flag; + i = trailingWSStart; + while (i > 0) { + /* reset a sequence of WS/BN before eop and B/S to the paragraph paraLevel */ + while (i > 0 && ((flag = DirPropFlag(dirProps[--i])) & MASK_WS) != 0) { + if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) { + levels[i] = 0; + } else { + levels[i] = GetParaLevelAt(i); + } + } + + /* reset BN to the next character's paraLevel until B/S, which restarts above loop */ + /* here, i+1 is guaranteed to be 0) { + flag = DirPropFlag(dirProps[--i]); + if ((flag & MASK_BN_EXPLICIT) != 0) { + levels[i] = levels[i + 1]; + } else if (orderParagraphsLTR && (flag & DirPropFlag(B)) != 0) { + levels[i] = 0; + break; + } else if ((flag & MASK_B_S) != 0){ + levels[i] = GetParaLevelAt(i); + break; + } + } + } + } + } + + private void setParaSuccess() { + paraBidi = this; /* mark successful setPara */ + } + + private int Bidi_Min(int x, int y) { + return x < y ? x : y; + } + + private int Bidi_Abs(int x) { + return x >= 0 ? 
x : -x; + } + + void setParaRunsOnly(char[] parmText, byte parmParaLevel) { + int[] visualMap; + String visualText; + int saveLength, saveTrailingWSStart; + byte[] saveLevels; + byte saveDirection; + int i, j, visualStart, logicalStart, + oldRunCount, runLength, addedRuns, insertRemove, + start, limit, step, indexOddBit, logicalPos, + index, index1; + int saveOptions; + + reorderingMode = REORDER_DEFAULT; + int parmLength = parmText.length; + if (parmLength == 0) { + setPara(parmText, parmParaLevel, null); + reorderingMode = REORDER_RUNS_ONLY; + return; + } + /* obtain memory for mapping table and visual text */ + saveOptions = reorderingOptions; + if ((saveOptions & OPTION_INSERT_MARKS) > 0) { + reorderingOptions &= ~OPTION_INSERT_MARKS; + reorderingOptions |= OPTION_REMOVE_CONTROLS; + } + parmParaLevel &= 1; /* accept only 0 or 1 */ + setPara(parmText, parmParaLevel, null); + /* we cannot access directly levels since it is not yet set if + * direction is not MIXED + */ + saveLevels = new byte[this.length]; + System.arraycopy(getLevels(), 0, saveLevels, 0, this.length); + saveTrailingWSStart = trailingWSStart; + + /* FOOD FOR THOUGHT: instead of writing the visual text, we could use + * the visual map and the dirProps array to drive the second call + * to setPara (but must make provision for possible removal of + * Bidi controls. Alternatively, only use the dirProps array via + * customized classifier callback. 
+ */ + visualText = writeReordered(DO_MIRRORING); + visualMap = getVisualMap(); + this.reorderingOptions = saveOptions; + saveLength = this.length; + saveDirection=this.direction; + + this.reorderingMode = REORDER_INVERSE_LIKE_DIRECT; + parmParaLevel ^= 1; + setPara(visualText, parmParaLevel, null); + BidiLine.getRuns(this); + /* check if some runs must be split, count how many splits */ + addedRuns = 0; + oldRunCount = this.runCount; + visualStart = 0; + for (i = 0; i < oldRunCount; i++, visualStart += runLength) { + runLength = runs[i].limit - visualStart; + if (runLength < 2) { + continue; + } + logicalStart = runs[i].start; + for (j = logicalStart+1; j < logicalStart+runLength; j++) { + index = visualMap[j]; + index1 = visualMap[j-1]; + if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) { + addedRuns++; + } + } + } + if (addedRuns > 0) { + getRunsMemory(oldRunCount + addedRuns); + if (runCount == 1) { + /* because we switch from UBiDi.simpleRuns to UBiDi.runs */ + runsMemory[0] = runs[0]; + } else { + System.arraycopy(runs, 0, runsMemory, 0, runCount); + } + runs = runsMemory; + runCount += addedRuns; + for (i = oldRunCount; i < runCount; i++) { + if (runs[i] == null) { + runs[i] = new BidiRun(0, 0, (byte)0); + } + } + } + /* split runs which are not consecutive in source text */ + int newI; + for (i = oldRunCount-1; i >= 0; i--) { + newI = i + addedRuns; + runLength = i==0 ? 
runs[0].limit : + runs[i].limit - runs[i-1].limit; + logicalStart = runs[i].start; + indexOddBit = runs[i].level & 1; + if (runLength < 2) { + if (addedRuns > 0) { + runs[newI].copyFrom(runs[i]); + } + logicalPos = visualMap[logicalStart]; + runs[newI].start = logicalPos; + runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); + continue; + } + if (indexOddBit > 0) { + start = logicalStart; + limit = logicalStart + runLength - 1; + step = 1; + } else { + start = logicalStart + runLength - 1; + limit = logicalStart; + step = -1; + } + for (j = start; j != limit; j += step) { + index = visualMap[j]; + index1 = visualMap[j+step]; + if ((Bidi_Abs(index-index1)!=1) || (saveLevels[index]!=saveLevels[index1])) { + logicalPos = Bidi_Min(visualMap[start], index); + runs[newI].start = logicalPos; + runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); + runs[newI].limit = runs[i].limit; + runs[i].limit -= Bidi_Abs(j - start) + 1; + insertRemove = runs[i].insertRemove & (LRM_AFTER|RLM_AFTER); + runs[newI].insertRemove = insertRemove; + runs[i].insertRemove &= ~insertRemove; + start = j + step; + addedRuns--; + newI--; + } + } + if (addedRuns > 0) { + runs[newI].copyFrom(runs[i]); + } + logicalPos = Bidi_Min(visualMap[start], visualMap[limit]); + runs[newI].start = logicalPos; + runs[newI].level = (byte)(saveLevels[logicalPos] ^ indexOddBit); + } + + cleanup1: + /* restore initial paraLevel */ + this.paraLevel ^= 1; + cleanup2: + /* restore real text */ + this.text = parmText; + this.length = saveLength; + this.originalLength = parmLength; + this.direction=saveDirection; + this.levels = saveLevels; + this.trailingWSStart = saveTrailingWSStart; + if (runCount > 1) { + this.direction = MIXED; + } + cleanup3: + this.reorderingMode = REORDER_RUNS_ONLY; + } + + /** + * Perform the Unicode Bidi algorithm. It is defined in the + * Unicode Standard Annex #9, + * version 13, + * also described in The Unicode Standard, Version 4.0 .

+ * + * This method takes a piece of plain text containing one or more paragraphs, + * with or without externally specified embedding levels from styled + * text and computes the left-right-directionality of each character.

+ * + * If the entire text is all of the same directionality, then + * the method may not perform all the steps described by the algorithm, + * i.e., some levels may not be the same as if all steps were performed. + * This is not relevant for unidirectional text.
+ * For example, in pure LTR text with numbers the numbers would get + * a resolved level of 2 higher than the surrounding text according to + * the algorithm. This implementation may set all resolved levels to + * the same value in such a case.

+ * + * The text can be composed of multiple paragraphs. Occurrence of a block + * separator in the text terminates a paragraph, and whatever comes next starts + * a new paragraph. The exception to this rule is when a Carriage Return (CR) + * is followed by a Line Feed (LF). Both CR and LF are block separators, but + * in that case, the pair of characters is considered as terminating the + * preceding paragraph, and a new paragraph will be started by a character + * coming after the LF. + * + * Although the text is passed here as a String, it is + * stored internally as an array of characters. Therefore the + * documentation will refer to indexes of the characters in the text. + * + * @param text contains the text that the Bidi algorithm will be performed + * on. This text can be retrieved with getText() or + * getTextAsString.
+ * + * @param paraLevel specifies the default level for the text; + * it is typically 0 (LTR) or 1 (RTL). + * If the method shall determine the paragraph level from the text, + * then paraLevel can be set to + * either LEVEL_DEFAULT_LTR + * or LEVEL_DEFAULT_RTL; if the text contains multiple + * paragraphs, the paragraph level shall be determined separately for + * each paragraph; if a paragraph does not include any strongly typed + * character, then the desired default is used (0 for LTR or 1 for RTL). + * Any other value between 0 and MAX_EXPLICIT_LEVEL + * is also valid, with odd levels indicating RTL. + * + * @param embeddingLevels (in) may be used to preset the embedding and override levels, + * ignoring characters like LRE and PDF in the text. + * A level overrides the directional property of its corresponding + * (same index) character if the level has the + * LEVEL_OVERRIDE bit set.

+ * Except for that bit, it must be + * paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL, + * with one exception: a level of zero may be specified for a + * paragraph separator even if paraLevel>0 when multiple + * paragraphs are submitted in the same call to setPara().

+ * Caution: A reference to this array, not a copy + * of the levels, will be stored in the Bidi object; + * the embeddingLevels + * should not be modified to avoid unexpected results on subsequent + * Bidi operations. However, the setPara() and + * setLine() methods may modify some or all of the + * levels.

+ * Note: the embeddingLevels array must + * have one entry for each character in text. + * + * @throws IllegalArgumentException if the values in embeddingLevels are + * not within the allowed range + * + * @see #LEVEL_DEFAULT_LTR + * @see #LEVEL_DEFAULT_RTL + * @see #LEVEL_OVERRIDE + * @see #MAX_EXPLICIT_LEVEL + * @stable ICU 3.8 + */ + void setPara(String text, byte paraLevel, byte[] embeddingLevels) + { + if (text == null) { + setPara(new char[0], paraLevel, embeddingLevels); + } else { + setPara(text.toCharArray(), paraLevel, embeddingLevels); + } + } + + /** + * Perform the Unicode Bidi algorithm. It is defined in the + * Unicode Standard Annex #9, + * version 13, + * also described in The Unicode Standard, Version 4.0 .

+ * + * This method takes a piece of plain text containing one or more paragraphs, + * with or without externally specified embedding levels from styled + * text and computes the left-right-directionality of each character.

+ * + * If the entire text is all of the same directionality, then + * the method may not perform all the steps described by the algorithm, + * i.e., some levels may not be the same as if all steps were performed. + * This is not relevant for unidirectional text.
+ * For example, in pure LTR text with numbers the numbers would get + * a resolved level of 2 higher than the surrounding text according to + * the algorithm. This implementation may set all resolved levels to + * the same value in such a case. + * + * The text can be composed of multiple paragraphs. Occurrence of a block + * separator in the text terminates a paragraph, and whatever comes next starts + * a new paragraph. The exception to this rule is when a Carriage Return (CR) + * is followed by a Line Feed (LF). Both CR and LF are block separators, but + * in that case, the pair of characters is considered as terminating the + * preceding paragraph, and a new paragraph will be started by a character + * coming after the LF. + * + * The text is stored internally as an array of characters. Therefore the + * documentation will refer to indexes of the characters in the text. + * + * @param chars contains the text that the Bidi algorithm will be performed + * on. This text can be retrieved with getText() or + * getTextAsString.
+ * + * @param paraLevel specifies the default level for the text; + * it is typically 0 (LTR) or 1 (RTL). + * If the method shall determine the paragraph level from the text, + * then paraLevel can be set to + * either LEVEL_DEFAULT_LTR + * or LEVEL_DEFAULT_RTL; if the text contains multiple + * paragraphs, the paragraph level shall be determined separately for + * each paragraph; if a paragraph does not include any strongly typed + * character, then the desired default is used (0 for LTR or 1 for RTL). + * Any other value between 0 and MAX_EXPLICIT_LEVEL + * is also valid, with odd levels indicating RTL. + * + * @param embeddingLevels (in) may be used to preset the embedding and + * override levels, ignoring characters like LRE and PDF in the text. + * A level overrides the directional property of its corresponding + * (same index) character if the level has the + * LEVEL_OVERRIDE bit set.

+ * Except for that bit, it must be + * paraLevel<=embeddingLevels[]<=MAX_EXPLICIT_LEVEL, + * with one exception: a level of zero may be specified for a + * paragraph separator even if paraLevel>0 when multiple + * paragraphs are submitted in the same call to setPara().

+ * Caution: A reference to this array, not a copy + * of the levels, will be stored in the Bidi object; + * the embeddingLevels + * should not be modified to avoid unexpected results on subsequent + * Bidi operations. However, the setPara() and + * setLine() methods may modify some or all of the + * levels.

+     * Note: the embeddingLevels array must
+     * have one entry for each character in text.
+     *
+     * @throws IllegalArgumentException if the values in embeddingLevels are
+     *         not within the allowed range
+     *
+     * @see #LEVEL_DEFAULT_LTR
+     * @see #LEVEL_DEFAULT_RTL
+     * @see #LEVEL_OVERRIDE
+     * @see #MAX_EXPLICIT_LEVEL
+     * @stable ICU 3.8
+     */
+    void setPara(char[] chars, byte paraLevel, byte[] embeddingLevels)
+    {
+        /* check the argument values */
+        /* only values below LEVEL_DEFAULT_LTR are range-checked; values at or
+         * above it bypass verifyRange()
+         */
+        if (paraLevel < LEVEL_DEFAULT_LTR) {
+            verifyRange(paraLevel, 0, MAX_EXPLICIT_LEVEL + 1);
+        }
+        /* a null text is treated as an empty text */
+        if (chars == null) {
+            chars = new char[0];
+        }
+
+        /* special treatment for RUNS_ONLY mode */
+        /* note: embeddingLevels is not passed on in this mode */
+        if (reorderingMode == REORDER_RUNS_ONLY) {
+            setParaRunsOnly(chars, paraLevel);
+            return;
+        }
+
+        /* initialize the Bidi object */
+        this.paraBidi = null;          /* mark unfinished setPara */
+        this.text = chars;
+        this.length = this.originalLength = this.resultLength = text.length;
+        this.paraLevel = paraLevel;
+        this.direction = (byte)(paraLevel & 1);
+        this.paraCount = 1;
+
+        /* Allocate zero-length arrays instead of setting to null here; then
+         * checks for null in various places can be eliminated.
+         */
+        dirProps = new byte[0];
+        levels = new byte[0];
+        runs = new BidiRun[0];
+        isGoodLogicalToVisualRunsMap = false;
+        insertPoints.size = 0;          /* clean up from last call */
+        insertPoints.confirmed = 0;     /* clean up from last call */
+
+        /*
+         * Save the original paraLevel if contextual; otherwise, set to 0.
+         */
+        defaultParaLevel = IsDefaultLevel(paraLevel) ? paraLevel : 0;
+
+        if (length == 0) {
+            /*
+             * For an empty paragraph, create a Bidi object with the paraLevel and
+             * the flags and the direction set but without allocating zero-length arrays.
+             * There is nothing more to do.
+             *
+             * NOTE(review): the "without allocating zero-length arrays" wording does
+             * not match the code above, which has already assigned zero-length
+             * dirProps, levels and runs arrays before this branch is reached.
+             */
+            if (IsDefaultLevel(paraLevel)) {
+                this.paraLevel &= 1;
+                defaultParaLevel = 0;
+            }
+            flags = DirPropFlagLR(paraLevel);
+            runCount = 0;
+            paraCount = 0;
+            setParaSuccess();
+            return;
+        }
+
+        /* presumably marks the runs as "not computed yet" - TODO confirm against BidiLine.getRuns() */
+        runCount = -1;
+
+        /*
+         * Get the directional properties,
+         * the flags bit-set, and
+         * determine the paragraph level if necessary.
+         */
+        getDirPropsMemory(length);
+        dirProps = dirPropsMemory;
+        getDirProps();
+        /* the processed length may have changed if OPTION_STREAMING is set */
+        trailingWSStart = length;  /* the levels[] will reflect the WS run */
+
+        /* are explicit levels specified? */
+        if (embeddingLevels == null) {
+            /* no: determine explicit levels according to the (Xn) rules */
+            getLevelsMemory(length);
+            levels = levelsMemory;
+            direction = resolveExplicitLevels();
+        } else {
+            /* set BN for all explicit codes, check that all levels are 0 or paraLevel..MAX_EXPLICIT_LEVEL */
+            /* the caller's array is adopted as-is (no copy is made here) */
+            levels = embeddingLevels;
+            direction = checkExplicitLevels();
+        }
+
+        /* allocate isolate memory */
+        if (isolateCount > 0) {
+            if (isolates == null || isolates.length < isolateCount)
+                isolates = new Isolate[isolateCount + 3];   /* keep some reserve */
+        }
+        isolateCount = -1;              /* current isolates stack entry == none */
+
+        /*
+         * The steps after (X9) in the Bidi algorithm are performed only if
+         * the paragraph text has mixed directionality!
+         */
+        switch (direction) {
+        case LTR:
+            /* all levels are implicitly at paraLevel (important for getLevels()) */
+            trailingWSStart = 0;
+            break;
+        case RTL:
+            /* all levels are implicitly at paraLevel (important for getLevels()) */
+            trailingWSStart = 0;
+            break;
+        default:
+            /*
+             *  Choose the right implicit state table
+             */
+            switch(reorderingMode) {
+            case REORDER_DEFAULT:
+                this.impTabPair = impTab_DEFAULT;
+                break;
+            case REORDER_NUMBERS_SPECIAL:
+                this.impTabPair = impTab_NUMBERS_SPECIAL;
+                break;
+            case REORDER_GROUP_NUMBERS_WITH_R:
+                this.impTabPair = impTab_GROUP_NUMBERS_WITH_R;
+                break;
+            case REORDER_RUNS_ONLY:
+                /* we should never get here */
+                /* (REORDER_RUNS_ONLY already returned early at the top of this method) */
+                throw new InternalError("Internal ICU error in setPara");
+                /* break; */
+            case REORDER_INVERSE_NUMBERS_AS_L:
+                this.impTabPair = impTab_INVERSE_NUMBERS_AS_L;
+                break;
+            case REORDER_INVERSE_LIKE_DIRECT:
+                if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) {
+                    this.impTabPair = impTab_INVERSE_LIKE_DIRECT_WITH_MARKS;
+                } else {
+                    this.impTabPair = impTab_INVERSE_LIKE_DIRECT;
+                }
+                break;
+            case REORDER_INVERSE_FOR_NUMBERS_SPECIAL:
+                if ((reorderingOptions & OPTION_INSERT_MARKS) != 0) {
+                    this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS;
+                } else {
+                    this.impTabPair = impTab_INVERSE_FOR_NUMBERS_SPECIAL;
+                }
+                break;
+            }
+            /*
+             * If there are no external levels specified and there
+             * are no significant explicit level codes in the text,
+             * then we can treat the entire paragraph as one run.
+             * Otherwise, we need to perform the following rules on runs of
+             * the text with the same embedding levels. (X10)
+             * "Significant" explicit level codes are ones that actually
+             * affect non-BN characters.
+             * Examples for "insignificant" ones are empty embeddings
+             * LRE-PDF, LRE-RLE-PDF-PDF, etc.
+             */
+            if (embeddingLevels == null && paraCount <= 1 &&
+                (flags & DirPropFlagMultiRuns) == 0) {
+                resolveImplicitLevels(0, length,
+                        GetLRFromLevel(GetParaLevelAt(0)),
+                        GetLRFromLevel(GetParaLevelAt(length - 1)));
+            } else {
+                /* sor, eor: start and end types of same-level-run */
+                int start, limit = 0;
+                byte level, nextLevel;
+                short sor, eor;
+
+                /* determine the first sor and set eor to it because of the loop body (sor=eor there) */
+                level = GetParaLevelAt(0);
+                nextLevel = levels[0];
+                if (level < nextLevel) {
+                    eor = GetLRFromLevel(nextLevel);
+                } else {
+                    eor = GetLRFromLevel(level);
+                }
+
+                do {
+                    /* determine start and limit of the run (end points just behind the run) */
+
+                    /* the values for this run's start are the same as for the previous run's end */
+                    start = limit;
+                    level = nextLevel;
+                    if ((start > 0) && (dirProps[start - 1] == B)) {
+                        /* except if this is a new paragraph, then set sor = para level */
+                        sor = GetLRFromLevel(GetParaLevelAt(start));
+                    } else {
+                        sor = eor;
+                    }
+
+                    /* search for the limit of this run */
+                    /* (empty loop body: all the work happens in the condition) */
+                    while ((++limit < length) &&
+                           ((levels[limit] == level) ||
+                            ((DirPropFlag(dirProps[limit]) & MASK_BN_EXPLICIT) != 0))) {}
+
+                    /* get the correct level of the next run */
+                    if (limit < length) {
+                        nextLevel = levels[limit];
+                    } else {
+                        nextLevel = GetParaLevelAt(length - 1);
+                    }
+
+                    /* determine eor from max(level, nextLevel); sor is last run's eor */
+                    if (NoOverride(level) < NoOverride(nextLevel)) {
+                        eor = GetLRFromLevel(nextLevel);
+                    } else {
+                        eor = GetLRFromLevel(level);
+                    }
+
+                    /* if the run consists of overridden directional types, then there
+                       are no implicit types to be resolved */
+                    if ((level & LEVEL_OVERRIDE) == 0) {
+                        resolveImplicitLevels(start, limit, sor, eor);
+                    } else {
+                        /* remove the LEVEL_OVERRIDE flags */
+                        do {
+                            levels[start++] &= ~LEVEL_OVERRIDE;
+                        } while (start < limit);
+                    }
+                } while (limit < length);
+            }
+
+            /* reset the embedding levels for some non-graphic characters (L1), (X9) */
+            adjustWSLevels();
+
+            break;
+        }
+
+        /* add RLM for inverse Bidi with contextual orientation resolving
+         * to RTL which would not round-trip otherwise
+         */
+        if ((defaultParaLevel > 0) &&
+            ((reorderingOptions & OPTION_INSERT_MARKS) != 0) &&
+            ((reorderingMode == REORDER_INVERSE_LIKE_DIRECT) ||
+             (reorderingMode == REORDER_INVERSE_FOR_NUMBERS_SPECIAL))) {
+            int start, last;
+            byte level;
+            byte dirProp;
+            for (int i = 0; i < paraCount; i++) {
+                last = paras_limit[i] - 1;
+                level = paras_level[i];
+                if (level == 0)
+                    continue;           /* LTR paragraph */
+                start = i == 0 ? 0 : paras_limit[i - 1];
+                /* scan the paragraph backwards for its last strong character */
+                for (int j = last; j >= start; j--) {
+                    dirProp = dirProps[j];
+                    if (dirProp == L) {
+                        if (j < last) {
+                            /* step back over trailing B entries before placing the mark */
+                            while (dirProps[last] == B) {
+                                last--;
+                            }
+                        }
+                        addPoint(last, RLM_BEFORE);
+                        break;
+                    }
+                    if ((DirPropFlag(dirProp) & MASK_R_AL) != 0) {
+                        break;
+                    }
+                }
+            }
+        }
+
+        /* resultLength was initialized to text.length above; adjust it for
+         * removed controls or inserted marks
+         */
+        if ((reorderingOptions & OPTION_REMOVE_CONTROLS) != 0) {
+            resultLength -= controlCount;
+        } else {
+            resultLength += insertPoints.size;
+        }
+        setParaSuccess();
+    }
+
+    /**
+     * Perform the Unicode Bidi algorithm on a given paragraph, as defined in the
+     * Unicode Standard Annex #9,
+     * version 13,
+     * also described in The Unicode Standard, Version 4.0 .

+ * + * This method takes a paragraph of text and computes the + * left-right-directionality of each character. The text should not + * contain any Unicode block separators.

+ * + * The RUN_DIRECTION attribute in the text, if present, determines the base + * direction (left-to-right or right-to-left). If not present, the base + * direction is computed using the Unicode Bidirectional Algorithm, + * defaulting to left-to-right if there are no strong directional characters + * in the text. This attribute, if present, must be applied to all the text + * in the paragraph.

+ * + * The BIDI_EMBEDDING attribute in the text, if present, represents + * embedding level information. Negative values from -1 to -62 indicate + * overrides at the absolute value of the level. Positive values from 1 to + * 62 indicate embeddings. Where values are zero or not defined, the base + * embedding level as determined by the base direction is assumed.

+ * + * The NUMERIC_SHAPING attribute in the text, if present, converts European + * digits to other decimal digits before running the bidi algorithm. This + * attribute, if present, must be applied to all the text in the paragraph. + * + * If the entire text is all of the same directionality, then + * the method may not perform all the steps described by the algorithm, + * i.e., some levels may not be the same as if all steps were performed. + * This is not relevant for unidirectional text.
+ * For example, in pure LTR text with numbers the numbers would get + * a resolved level of 2 higher than the surrounding text according to + * the algorithm. This implementation may set all resolved levels to + * the same value in such a case.

+     *
+     * @param paragraph a paragraph of text with optional character and
+     *        paragraph attribute information
+     * @stable ICU 3.8
+     */
+    public void setPara(AttributedCharacterIterator paragraph)
+    {
+        /* position the iterator on the first character before reading
+         * the paragraph-wide attributes
+         */
+        char ch = paragraph.first();
+        Boolean runDirection =
+            (Boolean) paragraph.getAttribute(TextAttributeConstants.RUN_DIRECTION);
+        Object shaper = paragraph.getAttribute(TextAttributeConstants.NUMERIC_SHAPING);
+
+        /* base level: contextual (default LTR) unless RUN_DIRECTION is given */
+        final byte baseLevel;
+        if (runDirection == null) {
+            baseLevel = LEVEL_DEFAULT_LTR;
+        } else if (runDirection.equals(TextAttributeConstants.RUN_DIRECTION_LTR)) {
+            baseLevel = LTR;
+        } else {
+            baseLevel = RTL;
+        }
+
+        final int len = paragraph.getEndIndex() - paragraph.getBeginIndex();
+        final byte[] explicitLevels = new byte[len];
+        final char[] chars = new char[len];
+        /* stays false until a non-zero BIDI_EMBEDDING value is seen */
+        boolean hasExplicitLevels = false;
+
+        for (int pos = 0; ch != AttributedCharacterIterator.DONE;
+                 ch = paragraph.next(), ++pos) {
+            chars[pos] = ch;
+            Integer embedding =
+                (Integer) paragraph.getAttribute(TextAttributeConstants.BIDI_EMBEDDING);
+            if (embedding == null) {
+                continue;
+            }
+            byte level = embedding.byteValue();
+            if (level < 0) {
+                /* negative value: override at the absolute value of the level */
+                explicitLevels[pos] = (byte)((0 - level) | LEVEL_OVERRIDE);
+                hasExplicitLevels = true;
+            } else if (level > 0) {
+                /* positive value: plain embedding */
+                explicitLevels[pos] = level;
+                hasExplicitLevels = true;
+            }
+            /* zero: nothing to record, the base level applies */
+        }
+
+        if (shaper != null) {
+            NumericShapings.shape(shaper, chars, 0, len);
+        }
+        /* pass null when no explicit level was found, as if none were supplied */
+        setPara(chars, baseLevel, hasExplicitLevels ? explicitLevels : null);
+    }
+
+    /**
+     * Specify whether block separators must be allocated level zero,
+     * so that successive paragraphs will progress from left to right.
+     * This method must be called before setPara().
+     * Paragraph separators (B) may appear in the text.  Setting them to level zero
+     * means that all paragraph separators (including one possibly appearing
+     * in the last text position) are kept in the reordered text after the text
+     * that they follow in the source text.
+     * When this feature is not enabled, a paragraph separator at the last
+     * position of the text before reordering will go to the first position
+     * of the reordered text when the paragraph level is odd.
+     *
+     * @param ordarParaLTR specifies whether paragraph separators (B) must
+     * receive level 0, so that successive paragraphs progress from left to right.
+     *
+     * @see #setPara
+     * @stable ICU 3.8
+     */
+    public void orderParagraphsLTR(boolean ordarParaLTR) {
+        /* only records the flag; nothing is re-analyzed here */
+        orderParagraphsLTR = ordarParaLTR;
+    }
+
+    /**
+     * Get the directionality of the text.
+     *
+     * @return a value of LTR, RTL or MIXED
+     *         that indicates if the entire text
+     *         represented by this object is unidirectional,
+     *         and which direction, or if it is mixed-directional.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     *
+     * @see #LTR
+     * @see #RTL
+     * @see #MIXED
+     * @stable ICU 3.8
+     */
+    public byte getDirection()
+    {
+        /* state check comes first so that a stale value is never returned */
+        verifyValidParaOrLine();
+        return direction;
+    }
+
+    /**
+     * Get the length of the text.
+     *
+     * @return The length of the text that the Bidi object was
+     *         created for.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @stable ICU 3.8
+     */
+    public int getLength()
+    {
+        verifyValidParaOrLine();
+        /* the original text length, not resultLength */
+        return originalLength;
+    }
+
+    /* paragraphs API methods ------------------------------------------------- */
+
+    /**
+     * Get the paragraph level of the text.
+     *
+     * @return The paragraph level. If there are multiple paragraphs, their
+     *         level may vary if the required paraLevel is LEVEL_DEFAULT_LTR or
+     *         LEVEL_DEFAULT_RTL.  In that case, the level of the first paragraph
+     *         is returned.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     *
+     * @see #LEVEL_DEFAULT_LTR
+     * @see #LEVEL_DEFAULT_RTL
+     * @see #getParagraph
+     * @see #getParagraphByIndex
+     * @stable ICU 3.8
+     */
+    public byte getParaLevel()
+    {
+        /* state check comes first, matching the documented IllegalStateException */
+        verifyValidParaOrLine();
+        return paraLevel;
+    }
+
+    /**
+     * Retrieves the Bidi class for a given code point.
+     *

If a BidiClassifier is defined and returns a value + * other than CLASS_DEFAULT, that value is used; otherwise + * the default class determination mechanism is invoked.

+     *
+     * @param c The code point to get a Bidi class for.
+     *
+     * @return The Bidi class for the character c that is in effect
+     *         for this Bidi instance.
+     *
+     * @stable ICU 3.8
+     */
+    public int getCustomizedClass(int c) {
+        final int bidiClass = bdp.getClass(c);
+        /* any value at or beyond CHAR_DIRECTION_COUNT is reported as ON */
+        return (bidiClass >= CHAR_DIRECTION_COUNT) ? ON : bidiClass;
+    }
+
+    /**
+     * setLine() returns a Bidi object to
+     * contain the reordering information, especially the resolved levels,
+     * for all the characters in a line of text. This line of text is
+     * specified by referring to a Bidi object representing
+     * this information for a piece of text containing one or more paragraphs,
+     * and by specifying a range of indexes in this text.

+ * In the new line object, the indexes will range from 0 to limit-start-1.

+ * + * This is used after calling setPara() + * for a piece of text, and after line-breaking on that text. + * It is not necessary if each paragraph is treated as a single line.

+ * + * After line-breaking, rules (L1) and (L2) for the treatment of + * trailing WS and for reordering are performed on + * a Bidi object that represents a line.

+     *
+     * Important: the line Bidi object may
+     * reference data within the global text Bidi object.
+     * You should not alter the content of the global text object until
+     * you are finished using the line object.
+     *
+     * @param bidi not referenced by this method's body.
+     *
+     * @param bidiBase not referenced by this method's body.
+     *
+     * @param newBidi passed through unchanged to BidiLine.setLine().
+     *
+     * @param newBidiBase passed through unchanged to BidiLine.setLine().
+     *
+     * @param start is the line's first index into the text.
+     *
+     * @param limit is just behind the line's last index into the text
+     *        (its last index +1).
+     *
+     * @return a Bidi object that will now represent a line of the text.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara
+     * @throws IllegalArgumentException if start and limit are not in the range
+     *         0<=start<limit<=getProcessedLength(),
+     *         or if the specified line crosses a paragraph boundary
+     *
+     * @see #setPara
+     * @see #getProcessedLength
+     * @stable ICU 3.8
+     */
+    public Bidi setLine(Bidi bidi, BidiBase bidiBase, Bidi newBidi, BidiBase newBidiBase, int start, int limit)
+    {
+        verifyValidPara();
+        /* start is checked against limit, then limit against length+1 */
+        verifyRange(start, 0, limit);
+        verifyRange(limit, 0, length+1);
+
+        return BidiLine.setLine(this, newBidi, newBidiBase, start, limit);
+    }
+
+    /**
+     * Get the level for one character.
+     *
+     * @param charIndex the index of a character.
+     *
+     * @return The level for the character at charIndex.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @throws IllegalArgumentException if charIndex is not in the range
+     *         0<=charIndex<getProcessedLength()
+     *
+     * @see #getProcessedLength
+     * @stable ICU 3.8
+     */
+    public byte getLevelAt(int charIndex)
+    {
+        // for backward compatibility
+        // an out-of-range index yields the base level instead of an exception
+        if (charIndex < 0 || charIndex >= length) {
+            return (byte)getBaseLevel();
+        }
+
+        verifyValidParaOrLine();
+        // NOTE(review): after the early return above this range check looks
+        // unreachable as a failure path - confirm against verifyRange()
+        verifyRange(charIndex, 0, length);
+        return BidiLine.getLevelAt(this, charIndex);
+    }
+
+    /**
+     * Get an array of levels for each character.

+     *
+     * Note that this method may allocate memory under some
+     * circumstances, unlike getLevelAt().
+     *
+     * @return The levels array for the text,
+     *         or null if an error occurs.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @stable ICU 3.8
+     */
+    byte[] getLevels()
+    {
+        verifyValidParaOrLine();
+        /* empty text: return a fresh empty array without consulting BidiLine */
+        if (length <= 0) {
+            return new byte[0];
+        }
+        /* NOTE(review): no path in this method returns null itself; whether
+         * BidiLine.getLevels() can do so is not visible here - confirm the
+         * "or null" wording in @return
+         */
+        return BidiLine.getLevels(this);
+    }
+
+    /**
+     * Get the number of runs.
+     * This method may invoke the actual reordering on the
+     * Bidi object, after setPara()
+     * may have resolved only the levels of the text. Therefore,
+     * countRuns() may have to allocate memory,
+     * and may throw an exception if it fails to do so.
+     *
+     * @return The number of runs.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @stable ICU 3.8
+     */
+    public int countRuns()
+    {
+        verifyValidParaOrLine();
+        /* presumably (re)computes the runs and runCount on demand - TODO confirm in BidiLine */
+        BidiLine.getRuns(this);
+        return runCount;
+    }
+
+    /**
+     *
+     * Get a BidiRun object according to its index. BidiRun methods
+     * may be used to retrieve the run's logical start, length and level,
+     * which can be even for an LTR run or odd for an RTL run.
+     * In an RTL run, the character at the logical start is
+     * visually on the right of the displayed run.
+     * The length is the number of characters in the run.

+ * countRuns() is normally called + * before the runs are retrieved. + * + *

+ * Example: + *

+     *  Bidi bidi = new Bidi();
+     *  String text = "abc 123 DEFG xyz";
+     *  bidi.setPara(text, Bidi.RTL, null);
+     *  int i, count=bidi.countRuns(), logicalStart, visualIndex=0, length;
+     *  BidiRun run;
+     *  for (i = 0; i < count; ++i) {
+     *      run = bidi.getVisualRun(i);
+     *      logicalStart = run.getStart();
+     *      length = run.getLength();
+     *      if (Bidi.LTR == run.getEmbeddingLevel()) {
+     *          do { // LTR
+     *              show_char(text.charAt(logicalStart++), visualIndex++);
+     *          } while (--length > 0);
+     *      } else {
+     *          logicalStart += length;  // logicalLimit
+     *          do { // RTL
+     *              show_char(text.charAt(--logicalStart), visualIndex++);
+     *          } while (--length > 0);
+     *      }
+     *  }
+     * 
+ *

+ * Note that in right-to-left runs, code like this places + * second surrogates before first ones (which is generally a bad idea) + * and combining characters before base characters. + *

+     * Use of {@link #writeReordered}, optionally with the
+     * {@link #KEEP_BASE_COMBINING} option, can be considered in
+     * order to avoid these issues.
+     *
+     * @param runIndex is the number of the run in visual order, in the
+     *        range [0..countRuns()-1].
+     *
+     * @return a BidiRun object containing the details of the run. The
+     *         directionality of the run is
+     *         LTR==0 or RTL==1,
+     *         never MIXED.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @throws IllegalArgumentException if runIndex is not in
+     *         the range 0<=runIndex<countRuns()
+     *
+     * @see #countRuns()
+     * @see com.ibm.icu.text.BidiRun
+     * @see com.ibm.icu.text.BidiRun#getStart()
+     * @see com.ibm.icu.text.BidiRun#getLength()
+     * @see com.ibm.icu.text.BidiRun#getEmbeddingLevel()
+     * @stable ICU 3.8
+     */
+    BidiRun getVisualRun(int runIndex)
+    {
+        verifyValidParaOrLine();
+        /* called before the range check, which reads runCount; presumably
+         * getRuns() is what brings runCount up to date - TODO confirm
+         */
+        BidiLine.getRuns(this);
+        verifyRange(runIndex, 0, runCount);
+        return BidiLine.getVisualRun(this, runIndex);
+    }
+
+    /**
+     * Get a visual-to-logical index map (array) for the characters in the
+     * Bidi (paragraph or line) object.
+     *

+ * Some values in the map may be MAP_NOWHERE if the + * corresponding text characters are Bidi marks inserted in the visual + * output by the option OPTION_INSERT_MARKS. + *

+ * When the visual output is altered by using options of + * writeReordered() such as INSERT_LRM_FOR_NUMERIC, + * KEEP_BASE_COMBINING, OUTPUT_REVERSE, + * REMOVE_BIDI_CONTROLS, the logical positions returned may not + * be correct. It is advised to use, when possible, reordering options + * such as {@link #OPTION_INSERT_MARKS} and {@link #OPTION_REMOVE_CONTROLS}. + * + * @return an array of getResultLength() + * indexes which will reflect the reordering of the characters.

+     * The index map will result in
+     *        indexMap[visualIndex]==logicalIndex, where
+     *        indexMap represents the returned array.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     *
+     * @see #getLogicalMap
+     * @see #getLogicalIndex
+     * @see #getResultLength
+     * @see #MAP_NOWHERE
+     * @see #OPTION_INSERT_MARKS
+     * @see #writeReordered
+     * @stable ICU 3.8
+     */
+    private int[] getVisualMap()
+    {
+        /* called for its state check (setPara/setLine must have succeeded);
+         * the returned count is not needed here
+         */
+        countRuns();
+        /* an empty result gets a fresh empty map without consulting BidiLine */
+        return (resultLength <= 0) ? new int[0] : BidiLine.getVisualMap(this);
+    }
+
+    /**
+     * This is a convenience method that does not use a Bidi object.
+     * It is intended to be used for when an application has determined the levels
+     * of objects (character sequences) and just needs to have them reordered (L2).
+     * This is equivalent to using getVisualMap() on a
+     * Bidi object.
+     *
+     * @param levels is an array of levels that have been determined by
+     *        the application.
+     *
+     * @return an array of levels.length
+     *        indexes which will reflect the reordering of the characters.

+     * The index map will result in
+     *        indexMap[visualIndex]==logicalIndex, where
+     *        indexMap represents the returned array.
+     *
+     * @stable ICU 3.8
+     */
+    private static int[] reorderVisual(byte[] levels)
+    {
+        /* thin delegate: all the work is done by BidiLine.reorderVisual() */
+        return BidiLine.reorderVisual(levels);
+    }
+
+    /**
+     * Constant indicating that the base direction depends on the first strong
+     * directional character in the text according to the Unicode Bidirectional
+     * Algorithm. If no strong directional character is present, the base
+     * direction is right-to-left.
+     * This is the int form of LEVEL_DEFAULT_RTL.
+     * @stable ICU 3.8
+     */
+    public static final int DIRECTION_DEFAULT_RIGHT_TO_LEFT = LEVEL_DEFAULT_RTL;
+
+    /**
+     * Create Bidi from the given text, embedding, and direction information.
+     * The embeddings array may be null. If present, the values represent
+     * embedding level information. Negative values from -1 to -61 indicate
+     * overrides at the absolute value of the level. Positive values from 1 to
+     * 61 indicate embeddings. Where values are zero, the base embedding level
+     * as determined by the base direction is assumed.

+     *
+     * Note: this constructor calls setPara() internally.
+     *
+     * @param text an array containing the paragraph of text to process.
+     * @param textStart the index into the text array of the start of the
+     *        paragraph.
+     * @param embeddings an array containing embedding values for each character
+     *        in the paragraph. This can be null, in which case it is assumed
+     *        that there is no external embedding information.
+     * @param embStart the index into the embedding array of the start of the
+     *        paragraph.
+     * @param paragraphLength the length of the paragraph in the text and
+     *        embeddings arrays.
+     * @param flags a collection of flags that control the algorithm. The
+     *        algorithm understands the flags DIRECTION_LEFT_TO_RIGHT,
+     *        DIRECTION_RIGHT_TO_LEFT, DIRECTION_DEFAULT_LEFT_TO_RIGHT, and
+     *        DIRECTION_DEFAULT_RIGHT_TO_LEFT. Other values are reserved.
+     *
+     * @throws IllegalArgumentException if the values in embeddings are
+     *         not within the allowed range
+     *
+     * @see #DIRECTION_LEFT_TO_RIGHT
+     * @see #DIRECTION_RIGHT_TO_LEFT
+     * @see #DIRECTION_DEFAULT_LEFT_TO_RIGHT
+     * @see #DIRECTION_DEFAULT_RIGHT_TO_LEFT
+     * @stable ICU 3.8
+     */
+    public BidiBase(char[] text,
+             int textStart,
+             byte[] embeddings,
+             int embStart,
+             int paragraphLength,
+             int flags)
+    {
+        this(0, 0);
+
+        /* map the public direction flag onto an internal paragraph level */
+        final byte baseLevel;
+        switch (flags) {
+        case Bidi.DIRECTION_RIGHT_TO_LEFT:
+            baseLevel = RTL;
+            break;
+        case Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT:
+            baseLevel = LEVEL_DEFAULT_LTR;
+            break;
+        case Bidi.DIRECTION_DEFAULT_RIGHT_TO_LEFT:
+            baseLevel = LEVEL_DEFAULT_RTL;
+            break;
+        case Bidi.DIRECTION_LEFT_TO_RIGHT:
+        default:
+            /* unknown flag values fall back to plain left-to-right */
+            baseLevel = LTR;
+            break;
+        }
+
+        /* convert the caller's signed embedding values into internal levels */
+        byte[] internalLevels = null;
+        if (embeddings != null) {
+            internalLevels = new byte[paragraphLength];
+            for (int i = 0; i < paragraphLength; i++) {
+                byte lev = embeddings[i + embStart];
+                if (lev < 0) {
+                    /* negative: override at the absolute value of the level */
+                    lev = (byte)((- lev) | LEVEL_OVERRIDE);
+                } else if (lev == 0) {
+                    /* zero: use the base level; above MAX_EXPLICIT_LEVEL only
+                     * its low bit is kept
+                     */
+                    lev = (baseLevel > MAX_EXPLICIT_LEVEL) ? (byte)(baseLevel & 1)
+                                                           : baseLevel;
+                }
+                internalLevels[i] = lev;
+            }
+        }
+
+        /* work on a private copy of the requested slice of the text */
+        final char[] slice = new char[paragraphLength];
+        System.arraycopy(text, textStart, slice, 0, paragraphLength);
+        setPara(slice, baseLevel, internalLevels);
+    }
+
+    /**
+     * Return true if the line is not left-to-right or right-to-left. This means
+     * it either has mixed runs of left-to-right and right-to-left text, or the
+     * base direction differs from the direction of the only run of text.
+     *
+     * @return true if the line is not left-to-right or right-to-left.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara
+     * @stable ICU 3.8
+     */
+    public boolean isMixed()
+    {
+        return !(isLeftToRight() || isRightToLeft());
+    }
+
+    /**
+    * Return true if the line is all left-to-right text and the base direction
+     * is left-to-right.
+     *
+     * @return true if the line is all left-to-right text and the base direction
+     *         is left-to-right.
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara
+     * @stable ICU 3.8
+     */
+    public boolean isLeftToRight()
+    {
+        if (getDirection() != LTR) {
+            return false;
+        }
+        /* an even paragraph level means a left-to-right base direction */
+        return (paraLevel & 1) == 0;
+    }
+
+    /**
+     * Return true if the line is all right-to-left text, and the base direction
+     * is right-to-left
+     *
+     * @return true if the line is all right-to-left text, and the base
+     *         direction is right-to-left
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara
+     * @stable ICU 3.8
+     */
+    public boolean isRightToLeft()
+    {
+        if (getDirection() != RTL) {
+            return false;
+        }
+        /* an odd paragraph level means a right-to-left base direction */
+        return (paraLevel & 1) == 1;
+    }
+
+    /**
+     * Return true if the base direction is left-to-right
+     *
+     * @return true if the base direction is left-to-right
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     *
+     * @stable ICU 3.8
+     */
+    public boolean baseIsLeftToRight()
+    {
+        final byte baseLevel = getParaLevel();
+        return baseLevel == LTR;
+    }
+
+    /**
+     * Return the base level (0
if left-to-right, 1 if right-to-left).
+     *
+     * @return the base level
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     *
+     * @stable ICU 3.8
+     */
+    public int getBaseLevel()
+    {
+        return getParaLevel();
+    }
+
+    /**
+     * Compute the logical to visual run mapping.
+     * The map is rebuilt only when isGoodLogicalToVisualRunsMap is false;
+     * entry i then holds the index into runs[] of the run with the i-th
+     * smallest start.
+     */
+    void getLogicalToVisualRunsMap()
+    {
+        if (isGoodLogicalToVisualRunsMap) {
+            return;
+        }
+        int count = countRuns();
+        if ((logicalToVisualRunsMap == null) ||
+            (logicalToVisualRunsMap.length < count)) {
+            logicalToVisualRunsMap = new int[count];
+        }
+        int i;
+        /* sort key: run start in the high 32 bits, index into runs[] in the low 32 bits */
+        long[] keys = new long[count];
+        for (i = 0; i < count; i++) {
+            keys[i] = ((long)(runs[i].start)<<32) + i;
+        }
+        Arrays.sort(keys);
+        for (i = 0; i < count; i++) {
+            /* The mask must be a long literal: the former int literal
+             * 0x00000000FFFFFFFF is -1 and widened to an all-ones long, so the
+             * AND was a no-op and only the (int) cast produced the low 32 bits.
+             */
+            logicalToVisualRunsMap[i] = (int)(keys[i] & 0xFFFFFFFFL);
+        }
+        isGoodLogicalToVisualRunsMap = true;
+    }
+
+    /**
+     * Return the level of the nth logical run in this line.
+     *
+     * @param run the index of the run, between 0 and countRuns()-1
+     *
+     * @return the level of the run
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @throws IllegalArgumentException if run is not in
+     *         the range 0<=run<countRuns()
+     * @stable ICU 3.8
+     */
+    public int getRunLevel(int run)
+    {
+        verifyValidParaOrLine();
+        BidiLine.getRuns(this);
+
+        // for backward compatibility
+        // an out-of-range run index yields the paragraph level instead of an exception
+        if (run < 0 || run >= runCount) {
+            return getParaLevel();
+        }
+
+        getLogicalToVisualRunsMap();
+        return runs[logicalToVisualRunsMap[run]].level;
+    }
+
+    /**
+     * Return the index of the character at the start of the nth logical run in
+     * this line, as an offset from the start of the line.
+     *
+     * @param run the index of the run, between 0 and countRuns()
+     *
+     * @return the start of the run
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @throws IllegalArgumentException if run is not in
+     *         the range 0<=run<countRuns()
+     * @stable ICU 3.8
+     */
+    public int getRunStart(int run)
+    {
+        verifyValidParaOrLine();
+        BidiLine.getRuns(this);
+
+        // for backward compatibility
+        // a single run always starts at 0, whatever index was asked for
+        if (runCount == 1) {
+            return 0;
+        }
+        // the index one past the last run maps to the end of the line
+        if (run == runCount) {
+            return length;
+        }
+
+        getLogicalToVisualRunsMap();
+        final int visualIdx = logicalToVisualRunsMap[run];
+        return runs[visualIdx].start;
+    }
+
+    /**
+     * Return the index of the character past the end of the nth logical run in
+     * this line, as an offset from the start of the line. For example, this
+     * will return the length of the line for the last run on the line.
+     *
+     * @param run the index of the run, between 0 and countRuns()
+     *
+     * @return the limit of the run
+     *
+     * @throws IllegalStateException if this call is not preceded by a successful
+     *         call to setPara or setLine
+     * @throws IllegalArgumentException if run is not in
+     *         the range 0<=run<countRuns()
+     * @stable ICU 3.8
+     */
+    public int getRunLimit(int run)
+    {
+        verifyValidParaOrLine();
+        BidiLine.getRuns(this);
+
+        // for backward compatibility
+        // a single run always ends at the end of the line
+        if (runCount == 1) {
+            return length;
+        }
+
+        getLogicalToVisualRunsMap();
+        final int visualIdx = logicalToVisualRunsMap[run];
+        // run length = this entry's limit minus the previous entry's limit
+        // (the first entry's limit is used as-is)
+        int runLength = runs[visualIdx].limit;
+        if (visualIdx != 0) {
+            runLength -= runs[visualIdx-1].limit;
+        }
+        return runs[visualIdx].start + runLength;
+    }
+
+    /**
+     * Return true if the specified text requires bidi analysis. If this returns
+     * false, the text will display left-to-right. Clients can then avoid
+     * constructing a Bidi object. Text in the Arabic Presentation Forms area of
+     * Unicode is presumed to already be shaped and ordered for display, and so
+     * will not cause this method to return true.
+ * + * @param text the text containing the characters to test + * @param start the start of the range of characters to test + * @param limit the limit of the range of characters to test + * + * @return true if the range of characters requires bidi analysis + * + * @stable ICU 3.8 + */ + public static boolean requiresBidi(char[] text, + int start, + int limit) + { + final int RTLMask = (1 << R | + 1 << AL | + 1 << RLE | + 1 << RLO | + 1 << AN); /* one bit per direction class that makes the text need bidi analysis */ + + if (0 > start || start > limit || limit > text.length) { + throw new IllegalArgumentException("Value start " + start + + " is out of range 0 to " + limit); /* note: the message always names start, even when limit > text.length is the failing check */ + } + + for (int i = start; i < limit; ++i) { + if (Character.isHighSurrogate(text[i]) && i < (limit-1) && + Character.isLowSurrogate(text[i+1])) { /* surrogate pair fully inside the range: classify the combined code point */ + if (((1 << UCharacter.getDirection(Character.codePointAt(text, i))) & RTLMask) != 0) { + return true; + } + } else if (((1 << UCharacter.getDirection(text[i])) & RTLMask) != 0) { /* any other char, including an unpaired surrogate, is classified on its own */ + return true; + } + } + + return false; + } + + /** + * Reorder the objects in the array into visual order based on their levels. + * This is a utility method to use when you have a collection of objects + * representing runs of text in logical order, each run containing text at a + * single level. The elements at index from + * objectStart up to objectStart + count in the + * objects array will be reordered into visual order assuming + * each run of text has the level indicated by the corresponding element in + * the levels array (at index - objectStart + levelStart). 
+ * + * @param levels an array representing the bidi level of each object + * @param levelStart the start position in the levels array + * @param objects the array of objects to be reordered into visual order + * @param objectStart the start position in the objects array + * @param count the number of objects to reorder + * @stable ICU 3.8 + */ + public static void reorderVisually(byte[] levels, + int levelStart, + Object[] objects, + int objectStart, + int count) + { + // for backward compatibility: each range check throws IllegalArgumentException naming the offending argument and its value + if (0 > levelStart || levels.length <= levelStart) { + throw new IllegalArgumentException("Value levelStart " + + levelStart + " is out of range 0 to " + + (levels.length-1)); + } + if (0 > objectStart || objects.length <= objectStart) { + throw new IllegalArgumentException("Value objectStart " + + objectStart + " is out of range 0 to " + + (objects.length-1)); + } + if (0 > count || objects.length < (objectStart+count)) { + throw new IllegalArgumentException("Value count " + + count + " is out of range 0 to " + + (objects.length - objectStart)); + } + + byte[] reorderLevels = new byte[count]; + System.arraycopy(levels, levelStart, reorderLevels, 0, count); + int[] indexMap = reorderVisual(reorderLevels); + Object[] temp = new Object[count]; + System.arraycopy(objects, objectStart, temp, 0, count); + for (int i = 0; i < count; ++i) { + objects[objectStart + i] = temp[indexMap[i]]; + } + } + + /** + * Take a Bidi object containing the reordering + * information for a piece of text (one or more paragraphs) set by + * setPara() or for a line of text set by setLine() + * and return a string containing the reordered text. + * + *

The text may have been aliased (only a reference was stored + * without copying the contents), thus it must not have been modified + * since the setPara() call.

+ * + * This method preserves the integrity of characters with multiple + * code units and (optionally) combining characters. + * Characters in RTL runs can be replaced by mirror-image characters + * in the returned string. Note that "real" mirroring has to be done in a + * rendering engine by glyph selection and that for many "mirrored" + * characters there are no Unicode characters as mirror-image equivalents. + * There are also options to insert or remove Bidi control + * characters; see the descriptions of the return value and the + * options parameter, and of the option bit flags. + * + * @param options A bit set of options for the reordering that control + * how the reordered text is written. + * The options include mirroring the characters on a code + * point basis and inserting LRM characters, which is used + * especially for transforming visually stored text + * to logically stored text (although this is still an + * imperfect implementation of an "inverse Bidi" algorithm + * because it uses the "forward Bidi" algorithm at its core). + * The available options are: + * DO_MIRRORING, + * INSERT_LRM_FOR_NUMERIC, + * KEEP_BASE_COMBINING, + * OUTPUT_REVERSE, + * REMOVE_BIDI_CONTROLS, + * STREAMING + * + * @return The reordered text. + * If the INSERT_LRM_FOR_NUMERIC option is set, then + * the length of the returned string could be as large as + * getLength()+2*countRuns().
+ * If the REMOVE_BIDI_CONTROLS option is set, then the + * length of the returned string may be less than + * getLength().
+ * If none of these options is set, then the length of the returned + * string will be exactly getProcessedLength(). + * + * @throws IllegalStateException if this call is not preceded by a successful + * call to setPara or setLine + * + * @see #DO_MIRRORING + * @see #INSERT_LRM_FOR_NUMERIC + * @see #KEEP_BASE_COMBINING + * @see #OUTPUT_REVERSE + * @see #REMOVE_BIDI_CONTROLS + * @see #OPTION_STREAMING + * @see #getProcessedLength + * @stable ICU 3.8 + */ + public String writeReordered(int options) + { + verifyValidParaOrLine(); + if (length == 0) { + /* nothing to do */ + return ""; + } + return BidiWriter.writeReordered(this, options); + } + + /** + * Display the bidi internal state (direction, base level, length, levels and the text as hex code units), used in debugging. + */ + public String toString() { + StringBuilder buf = new StringBuilder(getClass().getName()); + + buf.append("[dir: "); + buf.append(direction); + buf.append(" baselevel: "); + buf.append(paraLevel); + buf.append(" length: "); + buf.append(length); + buf.append(" runs: "); + if (levels == null) { + buf.append("none"); + } else { + buf.append('['); + buf.append(levels[0]); + for (int i = 1; i < levels.length; i++) { + buf.append(' '); + buf.append(levels[i]); + } + buf.append(']'); + } + buf.append(" text: ["); + /* an empty text array must not be indexed: it prints as "[]", non-empty output is unchanged */ + for (int i = 0; i < text.length; i++) { + buf.append(i == 0 ? "0x" : " 0x"); + buf.append(Integer.toHexString(text[i])); + } + buf.append("]]"); + + return buf.toString(); + } + + /** + * A class that provides access to constants defined by + * java.awt.font.TextAttribute without creating a static dependency. + */ + private static class TextAttributeConstants { + // Make sure to load the AWT's TextAttribute class before using the constants, if any. 
+ static { + try { + Class.forName("java.awt.font.TextAttribute", true, null); + } catch (ClassNotFoundException e) {} /* deliberately ignored; presumably AWT is absent and jafa stays null, which the code below handles - TODO confirm */ + } + static final JavaAWTFontAccess jafa = SharedSecrets.getJavaAWTFontAccess(); + + /** + * TextAttribute instances (or a fake Attribute type if + * java.awt.font.TextAttribute is not present) + */ + static final AttributedCharacterIterator.Attribute RUN_DIRECTION = + getTextAttribute("RUN_DIRECTION"); + static final AttributedCharacterIterator.Attribute NUMERIC_SHAPING = + getTextAttribute("NUMERIC_SHAPING"); + static final AttributedCharacterIterator.Attribute BIDI_EMBEDDING = + getTextAttribute("BIDI_EMBEDDING"); + + /** + * TextAttribute.RUN_DIRECTION_LTR, or Boolean.FALSE when jafa is null + */ + static final Boolean RUN_DIRECTION_LTR = (jafa == null) ? + Boolean.FALSE : (Boolean)jafa.getTextAttributeConstant("RUN_DIRECTION_LTR"); + + @SuppressWarnings("serial") + private static AttributedCharacterIterator.Attribute + getTextAttribute(String name) + { + if (jafa == null) { + // fake attribute: anonymous Attribute subclass that only carries the requested name + return new AttributedCharacterIterator.Attribute(name) { }; + } else { + return (AttributedCharacterIterator.Attribute)jafa.getTextAttributeConstant(name); + } + } + } + + /** + * A class that provides access to java.awt.font.NumericShaper without + * creating a static dependency. + */ + private static class NumericShapings { + // Make sure to load the AWT's NumericShaper class before calling shape, if any. + static { + try { + Class.forName("java.awt.font.NumericShaper", true, null); + } catch (ClassNotFoundException e) {} + } + static final JavaAWTFontAccess jafa = SharedSecrets.getJavaAWTFontAccess(); + + /** + * Invokes NumericShaping shape(text,start,count) method. 
+ */ + static void shape(Object shaper, char[] text, int start, int count) { + if (jafa != null) { + jafa.shape(shaper, text, start, count); + } + } + } + +} --- old/src/java.base/share/classes/sun/text/bidi/BidiLine.java 2020-01-10 15:57:54.000000000 -0800 +++ /dev/null 2020-01-10 15:57:54.000000000 -0800 @@ -1,835 +0,0 @@ -/* - * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* -******************************************************************************* -* Copyright (C) 2001-2014, International Business Machines -* Corporation and others. All Rights Reserved. -******************************************************************************* -*/ -/* Written by Simon Montagu, Matitiahu Allouche - * (ported from C code written by Markus W. 
Scherer) - */ - -package sun.text.bidi; - -import java.text.Bidi; -import java.util.Arrays; - -final class BidiLine { - - /* - * General remarks about the functions in this file: - * - * These functions deal with the aspects of potentially mixed-directional - * text in a single paragraph or in a line of a single paragraph - * which has already been processed according to - * the Unicode 3.0 Bidi algorithm as defined in - * http://www.unicode.org/unicode/reports/tr9/ , version 13, - * also described in The Unicode Standard, Version 4.0.1 . - * - * This means that there is a Bidi object with a levels - * and a dirProps array. - * paraLevel and direction are also set. - * Only if the length of the text is zero, then levels==dirProps==NULL. - * - * The overall directionality of the paragraph - * or line is used to bypass the reordering steps if possible. - * Even purely RTL text does not need reordering there because - * the getLogical/VisualIndex() methods can compute the - * index on the fly in such a case. - * - * The implementation of the access to same-level-runs and of the reordering - * do attempt to provide better performance and less memory usage compared to - * a direct implementation of especially rule (L2) with an array of - * one (32-bit) integer per text character. - * - * Here, the levels array is scanned as soon as necessary, and a vector of - * same-level-runs is created. Reordering then is done on this vector. - * For each run of text positions that were resolved to the same level, - * only 8 bytes are stored: the first text position of the run and the visual - * position behind the run after reordering. - * One sign bit is used to hold the directionality of the run. - * This is inefficient if there are many very short runs. If the average run - * length is <2, then this uses more memory. - * - * In a further attempt to save memory, the levels array is never changed - * after all the resolution rules (Xn, Wn, Nn, In). 
- * Many methods have to consider the field trailingWSStart: - * if it is less than length, then there is an implicit trailing run - * at the paraLevel, - * which is not reflected in the levels array. - * This allows a line Bidi object to use the same levels array as - * its paragraph parent object. - * - * When a Bidi object is created for a line of a paragraph, then the - * paragraph's levels and dirProps arrays are reused by way of setting - * a pointer into them, not by copying. This again saves memory and forbids to - * change the now shared levels for (L1). - */ - - /* handle trailing WS (L1) -------------------------------------------------- */ - - /* - * setTrailingWSStart() sets the start index for a trailing - * run of WS in the line. This is necessary because we do not modify - * the paragraph's levels array that we just point into. - * Using trailingWSStart is another form of performing (L1). - * - * To make subsequent operations easier, we also include the run - * before the WS if it is at the paraLevel - we merge the two here. - * - * This method is called only from setLine(), so paraLevel is - * set correctly for the line even when contextual multiple paragraphs. - */ - - static void setTrailingWSStart(BidiBase bidiBase) - { - byte[] dirProps = bidiBase.dirProps; - byte[] levels = bidiBase.levels; - int start = bidiBase.length; - byte paraLevel = bidiBase.paraLevel; - - /* If the line is terminated by a block separator, all preceding WS etc... - are already set to paragraph level. 
- Setting trailingWSStart to pBidi->length will avoid changing the - level of B chars from 0 to paraLevel in getLevels when - orderParagraphsLTR==TRUE - */ - if (dirProps[start - 1] == BidiBase.B) { - bidiBase.trailingWSStart = start; /* currently == bidiBase.length */ - return; - } - /* go backwards across all WS, BN, explicit codes */ - while (start > 0 && - (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) { - --start; - } - - /* if the WS run can be merged with the previous run then do so here */ - while (start > 0 && levels[start - 1] == paraLevel) { - --start; - } - - bidiBase.trailingWSStart=start; - } - - static Bidi setLine(BidiBase paraBidi, - Bidi newBidi, BidiBase lineBidi, - int start, int limit) { - int length; - - /* set the values in lineBidi from its paraBidi parent */ - /* class members are already initialized to 0 */ - // lineBidi.paraBidi = null; /* mark unfinished setLine */ - // lineBidi.flags = 0; - // lineBidi.controlCount = 0; - - length = lineBidi.length = lineBidi.originalLength = - lineBidi.resultLength = limit - start; - - lineBidi.text = new char[length]; - System.arraycopy(paraBidi.text, start, lineBidi.text, 0, length); - lineBidi.paraLevel = paraBidi.GetParaLevelAt(start); - lineBidi.paraCount = paraBidi.paraCount; - lineBidi.runs = new BidiRun[0]; - lineBidi.reorderingMode = paraBidi.reorderingMode; - lineBidi.reorderingOptions = paraBidi.reorderingOptions; - if (paraBidi.controlCount > 0) { - int j; - for (j = start; j < limit; j++) { - if (BidiBase.IsBidiControlChar(paraBidi.text[j])) { - lineBidi.controlCount++; - } - } - lineBidi.resultLength -= lineBidi.controlCount; - } - /* copy proper subset of DirProps */ - lineBidi.getDirPropsMemory(length); - lineBidi.dirProps = lineBidi.dirPropsMemory; - System.arraycopy(paraBidi.dirProps, start, lineBidi.dirProps, 0, - length); - /* copy proper subset of Levels */ - lineBidi.getLevelsMemory(length); - lineBidi.levels = lineBidi.levelsMemory; - 
System.arraycopy(paraBidi.levels, start, lineBidi.levels, 0, - length); - lineBidi.runCount = -1; - - if (paraBidi.direction != BidiBase.MIXED) { - /* the parent is already trivial */ - lineBidi.direction = paraBidi.direction; - - /* - * The parent's levels are all either - * implicitly or explicitly ==paraLevel; - * do the same here. - */ - if (paraBidi.trailingWSStart <= start) { - lineBidi.trailingWSStart = 0; - } else if (paraBidi.trailingWSStart < limit) { - lineBidi.trailingWSStart = paraBidi.trailingWSStart - start; - } else { - lineBidi.trailingWSStart = length; - } - } else { - byte[] levels = lineBidi.levels; - int i, trailingWSStart; - byte level; - - setTrailingWSStart(lineBidi); - trailingWSStart = lineBidi.trailingWSStart; - - /* recalculate lineBidiBase.direction */ - if (trailingWSStart == 0) { - /* all levels are at paraLevel */ - lineBidi.direction = (byte)(lineBidi.paraLevel & 1); - } else { - /* get the level of the first character */ - level = (byte)(levels[0] & 1); - - /* if there is anything of a different level, then the line - is mixed */ - if (trailingWSStart < length && - (lineBidi.paraLevel & 1) != level) { - /* the trailing WS is at paraLevel, which differs from - levels[0] */ - lineBidi.direction = BidiBase.MIXED; - } else { - /* see if levels[1..trailingWSStart-1] have the same - direction as levels[0] and paraLevel */ - for (i = 1; ; i++) { - if (i == trailingWSStart) { - /* the direction values match those in level */ - lineBidi.direction = level; - break; - } else if ((levels[i] & 1) != level) { - lineBidi.direction = BidiBase.MIXED; - break; - } - } - } - } - - switch(lineBidi.direction) { - case Bidi.DIRECTION_LEFT_TO_RIGHT: - /* make sure paraLevel is even */ - lineBidi.paraLevel = (byte) - ((lineBidi.paraLevel + 1) & ~1); - - /* all levels are implicitly at paraLevel (important for - getLevels()) */ - lineBidi.trailingWSStart = 0; - break; - case Bidi.DIRECTION_RIGHT_TO_LEFT: - /* make sure paraLevel is odd */ - 
lineBidi.paraLevel |= 1; - - /* all levels are implicitly at paraLevel (important for - getLevels()) */ - lineBidi.trailingWSStart = 0; - break; - default: - break; - } - } - - lineBidi.paraBidi = paraBidi; /* mark successful setLine */ - - return newBidi; - } - - static byte getLevelAt(BidiBase bidiBase, int charIndex) - { - /* return paraLevel if in the trailing WS run, otherwise the real level */ - if (bidiBase.direction != BidiBase.MIXED || charIndex >= bidiBase.trailingWSStart) { - return bidiBase.GetParaLevelAt(charIndex); - } else { - return bidiBase.levels[charIndex]; - } - } - - static byte[] getLevels(BidiBase bidiBase) - { - int start = bidiBase.trailingWSStart; - int length = bidiBase.length; - - if (start != length) { - /* the current levels array does not reflect the WS run */ - /* - * After the previous if(), we know that the levels array - * has an implicit trailing WS run and therefore does not fully - * reflect itself all the levels. - * This must be a Bidi object for a line, and - * we need to create a new levels array. 
- */ - /* bidiBase.paraLevel is ok even if contextual multiple paragraphs, - since bidiBase is a line object */ - Arrays.fill(bidiBase.levels, start, length, bidiBase.paraLevel); - - /* this new levels array is set for the line and reflects the WS run */ - bidiBase.trailingWSStart = length; - } - if (length < bidiBase.levels.length) { - byte[] levels = new byte[length]; - System.arraycopy(bidiBase.levels, 0, levels, 0, length); - return levels; - } - return bidiBase.levels; - } - - static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) { - int start = bidiBase.runs[runIndex].start; - int limit; - byte level = bidiBase.runs[runIndex].level; - - if (runIndex > 0) { - limit = start + - bidiBase.runs[runIndex].limit - - bidiBase.runs[runIndex - 1].limit; - } else { - limit = start + bidiBase.runs[0].limit; - } - return new BidiRun(start, limit, level); - } - - /* in trivial cases there is only one trivial run; called by getRuns() */ - private static void getSingleRun(BidiBase bidiBase, byte level) { - /* simple, single-run case */ - bidiBase.runs = bidiBase.simpleRuns; - bidiBase.runCount = 1; - - /* fill and reorder the single run */ - bidiBase.runs[0] = new BidiRun(0, bidiBase.length, level); - } - - /* reorder the runs array (L2) ---------------------------------------------- */ - - /* - * Reorder the same-level runs in the runs array. - * Here, runCount>1 and maxLevel>=minLevel>=paraLevel. - * All the visualStart fields=logical start before reordering. - * The "odd" bits are not set yet. - * - * Reordering with this data structure lends itself to some handy shortcuts: - * - * Since each run is moved but not modified, and since at the initial maxLevel - * each sequence of same-level runs consists of only one run each, we - * don't need to do anything there and can predecrement maxLevel. - * In many simple cases, the reordering is thus done entirely in the - * index mapping. 
- * Also, reordering occurs only down to the lowest odd level that occurs, - * which is minLevel|1. However, if the lowest level itself is odd, then - * in the last reordering the sequence of the runs at this level or higher - * will be all runs, and we don't need the elaborate loop to search for them. - * This is covered by ++minLevel instead of minLevel|=1 followed - * by an extra reorder-all after the reorder-some loop. - * About a trailing WS run: - * Such a run would need special treatment because its level is not - * reflected in levels[] if this is not a paragraph object. - * Instead, all characters from trailingWSStart on are implicitly at - * paraLevel. - * However, for all maxLevel>paraLevel, this run will never be reordered - * and does not need to be taken into account. maxLevel==paraLevel is only reordered - * if minLevel==paraLevel is odd, which is done in the extra segment. - * This means that for the main reordering loop we don't need to consider - * this run and can --runCount. If it is later part of the all-runs - * reordering, then runCount is adjusted accordingly. - */ - private static void reorderLine(BidiBase bidiBase, byte minLevel, byte maxLevel) { - - /* nothing to do? */ - if (maxLevel<=(minLevel|1)) { - return; - } - - BidiRun[] runs; - BidiRun tempRun; - byte[] levels; - int firstRun, endRun, limitRun, runCount; - - /* - * Reorder only down to the lowest odd level - * and reorder at an odd minLevel in a separate, simpler loop. - * See comments above for why minLevel is always incremented. 
- */ - ++minLevel; - - runs = bidiBase.runs; - levels = bidiBase.levels; - runCount = bidiBase.runCount; - - /* do not include the WS run at paraLevel<=old minLevel except in the simple loop */ - if (bidiBase.trailingWSStart < bidiBase.length) { - --runCount; - } - - while (--maxLevel >= minLevel) { - firstRun = 0; - - /* loop for all sequences of runs */ - for ( ; ; ) { - /* look for a sequence of runs that are all at >=maxLevel */ - /* look for the first run of such a sequence */ - while (firstRun < runCount && levels[runs[firstRun].start] < maxLevel) { - ++firstRun; - } - if (firstRun >= runCount) { - break; /* no more such runs */ - } - - /* look for the limit run of such a sequence (the run behind it) */ - for (limitRun = firstRun; ++limitRun < runCount && - levels[runs[limitRun].start]>=maxLevel; ) {} - - /* Swap the entire sequence of runs from firstRun to limitRun-1. */ - endRun = limitRun - 1; - while (firstRun < endRun) { - tempRun = runs[firstRun]; - runs[firstRun] = runs[endRun]; - runs[endRun] = tempRun; - ++firstRun; - --endRun; - } - - if (limitRun == runCount) { - break; /* no more such runs */ - } else { - firstRun = limitRun + 1; - } - } - } - - /* now do maxLevel==old minLevel (==odd!), see above */ - if ((minLevel & 1) == 0) { - firstRun = 0; - - /* include the trailing WS run in this complete reordering */ - if (bidiBase.trailingWSStart == bidiBase.length) { - --runCount; - } - - /* Swap the entire sequence of all runs. 
(endRun==runCount) */ - while (firstRun < runCount) { - tempRun = runs[firstRun]; - runs[firstRun] = runs[runCount]; - runs[runCount] = tempRun; - ++firstRun; - --runCount; - } - } - } - - /* compute the runs array --------------------------------------------------- */ - - static int getRunFromLogicalIndex(BidiBase bidiBase, int logicalIndex) { - BidiRun[] runs = bidiBase.runs; - int runCount = bidiBase.runCount, visualStart = 0, i, length, logicalStart; - - for (i = 0; i < runCount; i++) { - length = runs[i].limit - visualStart; - logicalStart = runs[i].start; - if ((logicalIndex >= logicalStart) && (logicalIndex < (logicalStart+length))) { - return i; - } - visualStart += length; - } - /* we should never get here */ - throw new IllegalStateException("Internal ICU error in getRunFromLogicalIndex"); - } - - /* - * Compute the runs array from the levels array. - * After getRuns() returns true, runCount is guaranteed to be >0 - * and the runs are reordered. - * Odd-level runs have visualStart on their visual right edge and - * they progress visually to the left. - * If option OPTION_INSERT_MARKS is set, insertRemove will contain the - * sum of appropriate LRM/RLM_BEFORE/AFTER flags. - * If option OPTION_REMOVE_CONTROLS is set, insertRemove will contain the - * negative number of BiDi control characters within this run. - */ - static void getRuns(BidiBase bidiBase) { - /* - * This method returns immediately if the runs are already set. This - * includes the case of length==0 (handled in setPara).. 
- */ - if (bidiBase.runCount >= 0) { - return; - } - if (bidiBase.direction != BidiBase.MIXED) { - /* simple, single-run case - this covers length==0 */ - /* bidiBase.paraLevel is ok even for contextual multiple paragraphs */ - getSingleRun(bidiBase, bidiBase.paraLevel); - } else /* BidiBase.MIXED, length>0 */ { - /* mixed directionality */ - int length = bidiBase.length, limit; - byte[] levels = bidiBase.levels; - int i, runCount; - byte level = -1; /* initialize with no valid level */ - /* - * If there are WS characters at the end of the line - * and the run preceding them has a level different from - * paraLevel, then they will form their own run at paraLevel (L1). - * Count them separately. - * We need some special treatment for this in order to not - * modify the levels array which a line Bidi object shares - * with its paragraph parent and its other line siblings. - * In other words, for the trailing WS, it may be - * levels[]!=paraLevel but we have to treat it like it were so. - */ - limit = bidiBase.trailingWSStart; - /* count the runs, there is at least one non-WS run, and limit>0 */ - runCount = 0; - for (i = 0; i < limit; ++i) { - /* increment runCount at the start of each run */ - if (levels[i] != level) { - ++runCount; - level = levels[i]; - } - } - - /* - * We don't need to see if the last run can be merged with a trailing - * WS run because setTrailingWSStart() would have done that. - */ - if (runCount == 1 && limit == length) { - /* There is only one non-WS run and no trailing WS-run. */ - getSingleRun(bidiBase, levels[0]); - } else /* runCount>1 || limit 1 */ - bidiBase.getRunsMemory(runCount); - runs = bidiBase.runsMemory; - - /* set the runs */ - /* FOOD FOR THOUGHT: this could be optimized, e.g.: - * 464->444, 484->444, 575->555, 595->555 - * However, that would take longer. Check also how it would - * interact with BiDi control removal and inserting Marks. 
- */ - runIndex = 0; - - /* search for the run limits and initialize visualLimit values with the run lengths */ - i = 0; - do { - /* prepare this run */ - start = i; - level = levels[i]; - if (level < minLevel) { - minLevel = level; - } - if (level > maxLevel) { - maxLevel = level; - } - - /* look for the run limit */ - while (++i < limit && levels[i] == level) {} - - /* i is another run limit */ - runs[runIndex] = new BidiRun(start, i - start, level); - ++runIndex; - } while (i < limit); - - if (limit < length) { - /* there is a separate WS run */ - runs[runIndex] = new BidiRun(limit, length - limit, bidiBase.paraLevel); - /* For the trailing WS run, bidiBase.paraLevel is ok even - if contextual multiple paragraphs. */ - if (bidiBase.paraLevel < minLevel) { - minLevel = bidiBase.paraLevel; - } - } - - /* set the object fields */ - bidiBase.runs = runs; - bidiBase.runCount = runCount; - - reorderLine(bidiBase, minLevel, maxLevel); - - /* now add the direction flags and adjust the visualLimit's to be just that */ - /* this loop will also handle the trailing WS run */ - limit = 0; - for (i = 0; i < runCount; ++i) { - runs[i].level = levels[runs[i].start]; - limit = (runs[i].limit += limit); - } - - /* Set the embedding level for the trailing WS run. */ - /* For a RTL paragraph, it will be the *first* run in visual order. */ - /* For the trailing WS run, bidiBase.paraLevel is ok even if - contextual multiple paragraphs. */ - if (runIndex < runCount) { - int trailingRun = ((bidiBase.paraLevel & 1) != 0)? 
0 : runIndex; - runs[trailingRun].level = bidiBase.paraLevel; - } - } - } - - /* handle insert LRM/RLM BEFORE/AFTER run */ - if (bidiBase.insertPoints.size > 0) { - BidiBase.Point point; - int runIndex, ip; - for (ip = 0; ip < bidiBase.insertPoints.size; ip++) { - point = bidiBase.insertPoints.points[ip]; - runIndex = getRunFromLogicalIndex(bidiBase, point.pos); - bidiBase.runs[runIndex].insertRemove |= point.flag; - } - } - - /* handle remove BiDi control characters */ - if (bidiBase.controlCount > 0) { - int runIndex, ic; - char c; - for (ic = 0; ic < bidiBase.length; ic++) { - c = bidiBase.text[ic]; - if (BidiBase.IsBidiControlChar(c)) { - runIndex = getRunFromLogicalIndex(bidiBase, ic); - bidiBase.runs[runIndex].insertRemove--; - } - } - } - } - - static int[] prepareReorder(byte[] levels, byte[] pMinLevel, byte[] pMaxLevel) - { - int start; - byte level, minLevel, maxLevel; - - if (levels == null || levels.length <= 0) { - return null; - } - - /* determine minLevel and maxLevel */ - minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1; - maxLevel = 0; - for (start = levels.length; start>0; ) { - level = levels[--start]; - if (level < 0 || level > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) { - return null; - } - if (level < minLevel) { - minLevel = level; - } - if (level > maxLevel) { - maxLevel = level; - } - } - pMinLevel[0] = minLevel; - pMaxLevel[0] = maxLevel; - - /* initialize the index map */ - int[] indexMap = new int[levels.length]; - for (start = levels.length; start > 0; ) { - --start; - indexMap[start] = start; - } - - return indexMap; - } - - static int[] reorderVisual(byte[] levels) - { - byte[] aMinLevel = new byte[1]; - byte[] aMaxLevel = new byte[1]; - int start, end, limit, temp; - byte minLevel, maxLevel; - - int[] indexMap = prepareReorder(levels, aMinLevel, aMaxLevel); - if (indexMap == null) { - return null; - } - - minLevel = aMinLevel[0]; - maxLevel = aMaxLevel[0]; - - /* nothing to do? 
*/ - if (minLevel == maxLevel && (minLevel & 1) == 0) { - return indexMap; - } - - /* reorder only down to the lowest odd level */ - minLevel |= 1; - - /* loop maxLevel..minLevel */ - do { - start = 0; - - /* loop for all sequences of levels to reorder at the current maxLevel */ - for ( ; ; ) { - /* look for a sequence of levels that are all at >=maxLevel */ - /* look for the first index of such a sequence */ - while (start < levels.length && levels[start] < maxLevel) { - ++start; - } - if (start >= levels.length) { - break; /* no more such runs */ - } - - /* look for the limit of such a sequence (the index behind it) */ - for (limit = start; ++limit < levels.length && levels[limit] >= maxLevel; ) {} - - /* - * Swap the entire interval of indexes from start to limit-1. - * We don't need to swap the levels for the purpose of this - * algorithm: the sequence of levels that we look at does not - * move anyway. - */ - end = limit - 1; - while (start < end) { - temp = indexMap[start]; - indexMap[start] = indexMap[end]; - indexMap[end] = temp; - - ++start; - --end; - } - - if (limit == levels.length) { - break; /* no more such sequences */ - } else { - start = limit + 1; - } - } - } while (--maxLevel >= minLevel); - - return indexMap; - } - - static int[] getVisualMap(BidiBase bidiBase) - { - /* fill a visual-to-logical index map using the runs[] */ - BidiRun[] runs = bidiBase.runs; - int logicalStart, visualStart, visualLimit; - int allocLength = bidiBase.length > bidiBase.resultLength ? 
bidiBase.length - : bidiBase.resultLength; - int[] indexMap = new int[allocLength]; - - visualStart = 0; - int idx = 0; - for (int j = 0; j < bidiBase.runCount; ++j) { - logicalStart = runs[j].start; - visualLimit = runs[j].limit; - if (runs[j].isEvenRun()) { - do { /* LTR */ - indexMap[idx++] = logicalStart++; - } while (++visualStart < visualLimit); - } else { - logicalStart += visualLimit - visualStart; /* logicalLimit */ - do { /* RTL */ - indexMap[idx++] = --logicalStart; - } while (++visualStart < visualLimit); - } - /* visualStart==visualLimit; */ - } - - if (bidiBase.insertPoints.size > 0) { - int markFound = 0, runCount = bidiBase.runCount; - int insertRemove, i, j, k; - runs = bidiBase.runs; - /* count all inserted marks */ - for (i = 0; i < runCount; i++) { - insertRemove = runs[i].insertRemove; - if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) { - markFound++; - } - if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) { - markFound++; - } - } - /* move back indexes by number of preceding marks */ - k = bidiBase.resultLength; - for (i = runCount - 1; i >= 0 && markFound > 0; i--) { - insertRemove = runs[i].insertRemove; - if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) { - indexMap[--k] = BidiBase.MAP_NOWHERE; - markFound--; - } - visualStart = i > 0 ? 
runs[i-1].limit : 0; - for (j = runs[i].limit - 1; j >= visualStart && markFound > 0; j--) { - indexMap[--k] = indexMap[j]; - } - if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) { - indexMap[--k] = BidiBase.MAP_NOWHERE; - markFound--; - } - } - } - else if (bidiBase.controlCount > 0) { - int runCount = bidiBase.runCount, logicalEnd; - int insertRemove, length, i, j, k, m; - char uchar; - boolean evenRun; - runs = bidiBase.runs; - visualStart = 0; - /* move forward indexes by number of preceding controls */ - k = 0; - for (i = 0; i < runCount; i++, visualStart += length) { - length = runs[i].limit - visualStart; - insertRemove = runs[i].insertRemove; - /* if no control found yet, nothing to do in this run */ - if ((insertRemove == 0) && (k == visualStart)) { - k += length; - continue; - } - /* if no control in this run */ - if (insertRemove == 0) { - visualLimit = runs[i].limit; - for (j = visualStart; j < visualLimit; j++) { - indexMap[k++] = indexMap[j]; - } - continue; - } - logicalStart = runs[i].start; - evenRun = runs[i].isEvenRun(); - logicalEnd = logicalStart + length - 1; - for (j = 0; j < length; j++) { - m = evenRun ? logicalStart + j : logicalEnd - j; - uchar = bidiBase.text[m]; - if (!BidiBase.IsBidiControlChar(uchar)) { - indexMap[k++] = m; - } - } - } - } - if (allocLength == bidiBase.resultLength) { - return indexMap; - } - int[] newMap = new int[bidiBase.resultLength]; - System.arraycopy(indexMap, 0, newMap, 0, bidiBase.resultLength); - return newMap; - } - -} --- /dev/null 2020-01-10 15:57:54.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/BidiLine.java 2020-01-10 15:57:53.000000000 -0800 @@ -0,0 +1,835 @@ +/* + * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* +******************************************************************************* +* Copyright (C) 2001-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +*/ +/* Written by Simon Montagu, Matitiahu Allouche + * (ported from C code written by Markus W. 
Scherer) + */ + +package jdk.internal.icu.text; + +import java.text.Bidi; +import java.util.Arrays; + +final class BidiLine { + + /* + * General remarks about the functions in this file: + * + * These functions deal with the aspects of potentially mixed-directional + * text in a single paragraph or in a line of a single paragraph + * which has already been processed according to + * the Unicode 3.0 Bidi algorithm as defined in + * http://www.unicode.org/unicode/reports/tr9/ , version 13, + * also described in The Unicode Standard, Version 4.0.1 . + * + * This means that there is a Bidi object with a levels + * and a dirProps array. + * paraLevel and direction are also set. + * Only if the length of the text is zero, then levels==dirProps==NULL. + * + * The overall directionality of the paragraph + * or line is used to bypass the reordering steps if possible. + * Even purely RTL text does not need reordering there because + * the getLogical/VisualIndex() methods can compute the + * index on the fly in such a case. + * + * The implementation of the access to same-level-runs and of the reordering + * do attempt to provide better performance and less memory usage compared to + * a direct implementation of especially rule (L2) with an array of + * one (32-bit) integer per text character. + * + * Here, the levels array is scanned as soon as necessary, and a vector of + * same-level-runs is created. Reordering then is done on this vector. + * For each run of text positions that were resolved to the same level, + * only 8 bytes are stored: the first text position of the run and the visual + * position behind the run after reordering. + * One sign bit is used to hold the directionality of the run. + * This is inefficient if there are many very short runs. If the average run + * length is <2, then this uses more memory. + * + * In a further attempt to save memory, the levels array is never changed + * after all the resolution rules (Xn, Wn, Nn, In). 
+ * Many methods have to consider the field trailingWSStart: + * if it is less than length, then there is an implicit trailing run + * at the paraLevel, + * which is not reflected in the levels array. + * This allows a line Bidi object to use the same levels array as + * its paragraph parent object. + * + * When a Bidi object is created for a line of a paragraph, then the + * paragraph's levels and dirProps arrays are reused by way of setting + * a pointer into them, not by copying. This again saves memory and forbids to + * change the now shared levels for (L1). + */ + + /* handle trailing WS (L1) -------------------------------------------------- */ + + /* + * setTrailingWSStart() sets the start index for a trailing + * run of WS in the line. This is necessary because we do not modify + * the paragraph's levels array that we just point into. + * Using trailingWSStart is another form of performing (L1). + * + * To make subsequent operations easier, we also include the run + * before the WS if it is at the paraLevel - we merge the two here. + * + * This method is called only from setLine(), so paraLevel is + * set correctly for the line even when contextual multiple paragraphs. + */ + + static void setTrailingWSStart(BidiBase bidiBase) + { + byte[] dirProps = bidiBase.dirProps; + byte[] levels = bidiBase.levels; + int start = bidiBase.length; + byte paraLevel = bidiBase.paraLevel; + + /* If the line is terminated by a block separator, all preceding WS etc... + are already set to paragraph level. 
+ Setting trailingWSStart to pBidi->length will avoid changing the + level of B chars from 0 to paraLevel in getLevels when + orderParagraphsLTR==TRUE + */ + if (dirProps[start - 1] == BidiBase.B) { + bidiBase.trailingWSStart = start; /* currently == bidiBase.length */ + return; + } + /* go backwards across all WS, BN, explicit codes */ + while (start > 0 && + (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) { + --start; + } + + /* if the WS run can be merged with the previous run then do so here */ + while (start > 0 && levels[start - 1] == paraLevel) { + --start; + } + + bidiBase.trailingWSStart=start; + } + + static Bidi setLine(BidiBase paraBidi, + Bidi newBidi, BidiBase lineBidi, + int start, int limit) { + int length; + + /* set the values in lineBidi from its paraBidi parent */ + /* class members are already initialized to 0 */ + // lineBidi.paraBidi = null; /* mark unfinished setLine */ + // lineBidi.flags = 0; + // lineBidi.controlCount = 0; + + length = lineBidi.length = lineBidi.originalLength = + lineBidi.resultLength = limit - start; + + lineBidi.text = new char[length]; + System.arraycopy(paraBidi.text, start, lineBidi.text, 0, length); + lineBidi.paraLevel = paraBidi.GetParaLevelAt(start); + lineBidi.paraCount = paraBidi.paraCount; + lineBidi.runs = new BidiRun[0]; + lineBidi.reorderingMode = paraBidi.reorderingMode; + lineBidi.reorderingOptions = paraBidi.reorderingOptions; + if (paraBidi.controlCount > 0) { + int j; + for (j = start; j < limit; j++) { + if (BidiBase.IsBidiControlChar(paraBidi.text[j])) { + lineBidi.controlCount++; + } + } + lineBidi.resultLength -= lineBidi.controlCount; + } + /* copy proper subset of DirProps */ + lineBidi.getDirPropsMemory(length); + lineBidi.dirProps = lineBidi.dirPropsMemory; + System.arraycopy(paraBidi.dirProps, start, lineBidi.dirProps, 0, + length); + /* copy proper subset of Levels */ + lineBidi.getLevelsMemory(length); + lineBidi.levels = lineBidi.levelsMemory; + 
System.arraycopy(paraBidi.levels, start, lineBidi.levels, 0, + length); + lineBidi.runCount = -1; + + if (paraBidi.direction != BidiBase.MIXED) { + /* the parent is already trivial */ + lineBidi.direction = paraBidi.direction; + + /* + * The parent's levels are all either + * implicitly or explicitly ==paraLevel; + * do the same here. + */ + if (paraBidi.trailingWSStart <= start) { + lineBidi.trailingWSStart = 0; + } else if (paraBidi.trailingWSStart < limit) { + lineBidi.trailingWSStart = paraBidi.trailingWSStart - start; + } else { + lineBidi.trailingWSStart = length; + } + } else { + byte[] levels = lineBidi.levels; + int i, trailingWSStart; + byte level; + + setTrailingWSStart(lineBidi); + trailingWSStart = lineBidi.trailingWSStart; + + /* recalculate lineBidiBase.direction */ + if (trailingWSStart == 0) { + /* all levels are at paraLevel */ + lineBidi.direction = (byte)(lineBidi.paraLevel & 1); + } else { + /* get the level of the first character */ + level = (byte)(levels[0] & 1); + + /* if there is anything of a different level, then the line + is mixed */ + if (trailingWSStart < length && + (lineBidi.paraLevel & 1) != level) { + /* the trailing WS is at paraLevel, which differs from + levels[0] */ + lineBidi.direction = BidiBase.MIXED; + } else { + /* see if levels[1..trailingWSStart-1] have the same + direction as levels[0] and paraLevel */ + for (i = 1; ; i++) { + if (i == trailingWSStart) { + /* the direction values match those in level */ + lineBidi.direction = level; + break; + } else if ((levels[i] & 1) != level) { + lineBidi.direction = BidiBase.MIXED; + break; + } + } + } + } + + switch(lineBidi.direction) { + case Bidi.DIRECTION_LEFT_TO_RIGHT: + /* make sure paraLevel is even */ + lineBidi.paraLevel = (byte) + ((lineBidi.paraLevel + 1) & ~1); + + /* all levels are implicitly at paraLevel (important for + getLevels()) */ + lineBidi.trailingWSStart = 0; + break; + case Bidi.DIRECTION_RIGHT_TO_LEFT: + /* make sure paraLevel is odd */ + 
lineBidi.paraLevel |= 1; + + /* all levels are implicitly at paraLevel (important for + getLevels()) */ + lineBidi.trailingWSStart = 0; + break; + default: + break; + } + } + + lineBidi.paraBidi = paraBidi; /* mark successful setLine */ + + return newBidi; + } + + static byte getLevelAt(BidiBase bidiBase, int charIndex) + { + /* return paraLevel if in the trailing WS run, otherwise the real level */ + if (bidiBase.direction != BidiBase.MIXED || charIndex >= bidiBase.trailingWSStart) { + return bidiBase.GetParaLevelAt(charIndex); + } else { + return bidiBase.levels[charIndex]; + } + } + + static byte[] getLevels(BidiBase bidiBase) + { + int start = bidiBase.trailingWSStart; + int length = bidiBase.length; + + if (start != length) { + /* the current levels array does not reflect the WS run */ + /* + * After the previous if(), we know that the levels array + * has an implicit trailing WS run and therefore does not fully + * reflect itself all the levels. + * This must be a Bidi object for a line, and + * we need to create a new levels array. 
+ */ + /* bidiBase.paraLevel is ok even if contextual multiple paragraphs, + since bidiBase is a line object */ + Arrays.fill(bidiBase.levels, start, length, bidiBase.paraLevel); + + /* this new levels array is set for the line and reflects the WS run */ + bidiBase.trailingWSStart = length; + } + if (length < bidiBase.levels.length) { + byte[] levels = new byte[length]; + System.arraycopy(bidiBase.levels, 0, levels, 0, length); + return levels; + } + return bidiBase.levels; + } + + static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) { + int start = bidiBase.runs[runIndex].start; + int limit; + byte level = bidiBase.runs[runIndex].level; + + if (runIndex > 0) { + limit = start + + bidiBase.runs[runIndex].limit - + bidiBase.runs[runIndex - 1].limit; + } else { + limit = start + bidiBase.runs[0].limit; + } + return new BidiRun(start, limit, level); + } + + /* in trivial cases there is only one trivial run; called by getRuns() */ + private static void getSingleRun(BidiBase bidiBase, byte level) { + /* simple, single-run case */ + bidiBase.runs = bidiBase.simpleRuns; + bidiBase.runCount = 1; + + /* fill and reorder the single run */ + bidiBase.runs[0] = new BidiRun(0, bidiBase.length, level); + } + + /* reorder the runs array (L2) ---------------------------------------------- */ + + /* + * Reorder the same-level runs in the runs array. + * Here, runCount>1 and maxLevel>=minLevel>=paraLevel. + * All the visualStart fields=logical start before reordering. + * The "odd" bits are not set yet. + * + * Reordering with this data structure lends itself to some handy shortcuts: + * + * Since each run is moved but not modified, and since at the initial maxLevel + * each sequence of same-level runs consists of only one run each, we + * don't need to do anything there and can predecrement maxLevel. + * In many simple cases, the reordering is thus done entirely in the + * index mapping. 
+ * Also, reordering occurs only down to the lowest odd level that occurs, + * which is minLevel|1. However, if the lowest level itself is odd, then + * in the last reordering the sequence of the runs at this level or higher + * will be all runs, and we don't need the elaborate loop to search for them. + * This is covered by ++minLevel instead of minLevel|=1 followed + * by an extra reorder-all after the reorder-some loop. + * About a trailing WS run: + * Such a run would need special treatment because its level is not + * reflected in levels[] if this is not a paragraph object. + * Instead, all characters from trailingWSStart on are implicitly at + * paraLevel. + * However, for all maxLevel>paraLevel, this run will never be reordered + * and does not need to be taken into account. maxLevel==paraLevel is only reordered + * if minLevel==paraLevel is odd, which is done in the extra segment. + * This means that for the main reordering loop we don't need to consider + * this run and can --runCount. If it is later part of the all-runs + * reordering, then runCount is adjusted accordingly. + */ + private static void reorderLine(BidiBase bidiBase, byte minLevel, byte maxLevel) { + + /* nothing to do? */ + if (maxLevel<=(minLevel|1)) { + return; + } + + BidiRun[] runs; + BidiRun tempRun; + byte[] levels; + int firstRun, endRun, limitRun, runCount; + + /* + * Reorder only down to the lowest odd level + * and reorder at an odd minLevel in a separate, simpler loop. + * See comments above for why minLevel is always incremented. 
+ */ + ++minLevel; + + runs = bidiBase.runs; + levels = bidiBase.levels; + runCount = bidiBase.runCount; + + /* do not include the WS run at paraLevel<=old minLevel except in the simple loop */ + if (bidiBase.trailingWSStart < bidiBase.length) { + --runCount; + } + + while (--maxLevel >= minLevel) { + firstRun = 0; + + /* loop for all sequences of runs */ + for ( ; ; ) { + /* look for a sequence of runs that are all at >=maxLevel */ + /* look for the first run of such a sequence */ + while (firstRun < runCount && levels[runs[firstRun].start] < maxLevel) { + ++firstRun; + } + if (firstRun >= runCount) { + break; /* no more such runs */ + } + + /* look for the limit run of such a sequence (the run behind it) */ + for (limitRun = firstRun; ++limitRun < runCount && + levels[runs[limitRun].start]>=maxLevel; ) {} + + /* Swap the entire sequence of runs from firstRun to limitRun-1. */ + endRun = limitRun - 1; + while (firstRun < endRun) { + tempRun = runs[firstRun]; + runs[firstRun] = runs[endRun]; + runs[endRun] = tempRun; + ++firstRun; + --endRun; + } + + if (limitRun == runCount) { + break; /* no more such runs */ + } else { + firstRun = limitRun + 1; + } + } + } + + /* now do maxLevel==old minLevel (==odd!), see above */ + if ((minLevel & 1) == 0) { + firstRun = 0; + + /* include the trailing WS run in this complete reordering */ + if (bidiBase.trailingWSStart == bidiBase.length) { + --runCount; + } + + /* Swap the entire sequence of all runs. 
(endRun==runCount) */ + while (firstRun < runCount) { + tempRun = runs[firstRun]; + runs[firstRun] = runs[runCount]; + runs[runCount] = tempRun; + ++firstRun; + --runCount; + } + } + } + + /* compute the runs array --------------------------------------------------- */ + + static int getRunFromLogicalIndex(BidiBase bidiBase, int logicalIndex) { + BidiRun[] runs = bidiBase.runs; + int runCount = bidiBase.runCount, visualStart = 0, i, length, logicalStart; + + for (i = 0; i < runCount; i++) { + length = runs[i].limit - visualStart; + logicalStart = runs[i].start; + if ((logicalIndex >= logicalStart) && (logicalIndex < (logicalStart+length))) { + return i; + } + visualStart += length; + } + /* we should never get here */ + throw new IllegalStateException("Internal ICU error in getRunFromLogicalIndex"); + } + + /* + * Compute the runs array from the levels array. + * After getRuns() returns true, runCount is guaranteed to be >0 + * and the runs are reordered. + * Odd-level runs have visualStart on their visual right edge and + * they progress visually to the left. + * If option OPTION_INSERT_MARKS is set, insertRemove will contain the + * sum of appropriate LRM/RLM_BEFORE/AFTER flags. + * If option OPTION_REMOVE_CONTROLS is set, insertRemove will contain the + * negative number of BiDi control characters within this run. + */ + static void getRuns(BidiBase bidiBase) { + /* + * This method returns immediately if the runs are already set. This + * includes the case of length==0 (handled in setPara).. 
+ */ + if (bidiBase.runCount >= 0) { + return; + } + if (bidiBase.direction != BidiBase.MIXED) { + /* simple, single-run case - this covers length==0 */ + /* bidiBase.paraLevel is ok even for contextual multiple paragraphs */ + getSingleRun(bidiBase, bidiBase.paraLevel); + } else /* BidiBase.MIXED, length>0 */ { + /* mixed directionality */ + int length = bidiBase.length, limit; + byte[] levels = bidiBase.levels; + int i, runCount; + byte level = -1; /* initialize with no valid level */ + /* + * If there are WS characters at the end of the line + * and the run preceding them has a level different from + * paraLevel, then they will form their own run at paraLevel (L1). + * Count them separately. + * We need some special treatment for this in order to not + * modify the levels array which a line Bidi object shares + * with its paragraph parent and its other line siblings. + * In other words, for the trailing WS, it may be + * levels[]!=paraLevel but we have to treat it like it were so. + */ + limit = bidiBase.trailingWSStart; + /* count the runs, there is at least one non-WS run, and limit>0 */ + runCount = 0; + for (i = 0; i < limit; ++i) { + /* increment runCount at the start of each run */ + if (levels[i] != level) { + ++runCount; + level = levels[i]; + } + } + + /* + * We don't need to see if the last run can be merged with a trailing + * WS run because setTrailingWSStart() would have done that. + */ + if (runCount == 1 && limit == length) { + /* There is only one non-WS run and no trailing WS-run. */ + getSingleRun(bidiBase, levels[0]); + } else /* runCount>1 || limit<length */ { + /* allocate and set the runs */ + BidiRun[] runs; + int runIndex, start; + byte minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1; + byte maxLevel = 0; + + /* now, count a (non-mergeable) WS run */ + if (limit < length) { + ++runCount; + } + + /* runCount > 1 */ + bidiBase.getRunsMemory(runCount); + runs = bidiBase.runsMemory; + + /* set the runs */ + /* FOOD FOR THOUGHT: this could be optimized, e.g.: + * 464->444, 484->444, 575->555, 595->555 + * However, that would take longer. Check also how it would + * interact with BiDi control removal and inserting Marks.
+ */ + runIndex = 0; + + /* search for the run limits and initialize visualLimit values with the run lengths */ + i = 0; + do { + /* prepare this run */ + start = i; + level = levels[i]; + if (level < minLevel) { + minLevel = level; + } + if (level > maxLevel) { + maxLevel = level; + } + + /* look for the run limit */ + while (++i < limit && levels[i] == level) {} + + /* i is another run limit */ + runs[runIndex] = new BidiRun(start, i - start, level); + ++runIndex; + } while (i < limit); + + if (limit < length) { + /* there is a separate WS run */ + runs[runIndex] = new BidiRun(limit, length - limit, bidiBase.paraLevel); + /* For the trailing WS run, bidiBase.paraLevel is ok even + if contextual multiple paragraphs. */ + if (bidiBase.paraLevel < minLevel) { + minLevel = bidiBase.paraLevel; + } + } + + /* set the object fields */ + bidiBase.runs = runs; + bidiBase.runCount = runCount; + + reorderLine(bidiBase, minLevel, maxLevel); + + /* now add the direction flags and adjust the visualLimit's to be just that */ + /* this loop will also handle the trailing WS run */ + limit = 0; + for (i = 0; i < runCount; ++i) { + runs[i].level = levels[runs[i].start]; + limit = (runs[i].limit += limit); + } + + /* Set the embedding level for the trailing WS run. */ + /* For a RTL paragraph, it will be the *first* run in visual order. */ + /* For the trailing WS run, bidiBase.paraLevel is ok even if + contextual multiple paragraphs. */ + if (runIndex < runCount) { + int trailingRun = ((bidiBase.paraLevel & 1) != 0)? 
0 : runIndex; + runs[trailingRun].level = bidiBase.paraLevel; + } + } + } + + /* handle insert LRM/RLM BEFORE/AFTER run */ + if (bidiBase.insertPoints.size > 0) { + BidiBase.Point point; + int runIndex, ip; + for (ip = 0; ip < bidiBase.insertPoints.size; ip++) { + point = bidiBase.insertPoints.points[ip]; + runIndex = getRunFromLogicalIndex(bidiBase, point.pos); + bidiBase.runs[runIndex].insertRemove |= point.flag; + } + } + + /* handle remove BiDi control characters */ + if (bidiBase.controlCount > 0) { + int runIndex, ic; + char c; + for (ic = 0; ic < bidiBase.length; ic++) { + c = bidiBase.text[ic]; + if (BidiBase.IsBidiControlChar(c)) { + runIndex = getRunFromLogicalIndex(bidiBase, ic); + bidiBase.runs[runIndex].insertRemove--; + } + } + } + } + + static int[] prepareReorder(byte[] levels, byte[] pMinLevel, byte[] pMaxLevel) + { + int start; + byte level, minLevel, maxLevel; + + if (levels == null || levels.length <= 0) { + return null; + } + + /* determine minLevel and maxLevel */ + minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1; + maxLevel = 0; + for (start = levels.length; start>0; ) { + level = levels[--start]; + if (level < 0 || level > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) { + return null; + } + if (level < minLevel) { + minLevel = level; + } + if (level > maxLevel) { + maxLevel = level; + } + } + pMinLevel[0] = minLevel; + pMaxLevel[0] = maxLevel; + + /* initialize the index map */ + int[] indexMap = new int[levels.length]; + for (start = levels.length; start > 0; ) { + --start; + indexMap[start] = start; + } + + return indexMap; + } + + static int[] reorderVisual(byte[] levels) + { + byte[] aMinLevel = new byte[1]; + byte[] aMaxLevel = new byte[1]; + int start, end, limit, temp; + byte minLevel, maxLevel; + + int[] indexMap = prepareReorder(levels, aMinLevel, aMaxLevel); + if (indexMap == null) { + return null; + } + + minLevel = aMinLevel[0]; + maxLevel = aMaxLevel[0]; + + /* nothing to do? 
*/ + if (minLevel == maxLevel && (minLevel & 1) == 0) { + return indexMap; + } + + /* reorder only down to the lowest odd level */ + minLevel |= 1; + + /* loop maxLevel..minLevel */ + do { + start = 0; + + /* loop for all sequences of levels to reorder at the current maxLevel */ + for ( ; ; ) { + /* look for a sequence of levels that are all at >=maxLevel */ + /* look for the first index of such a sequence */ + while (start < levels.length && levels[start] < maxLevel) { + ++start; + } + if (start >= levels.length) { + break; /* no more such runs */ + } + + /* look for the limit of such a sequence (the index behind it) */ + for (limit = start; ++limit < levels.length && levels[limit] >= maxLevel; ) {} + + /* + * Swap the entire interval of indexes from start to limit-1. + * We don't need to swap the levels for the purpose of this + * algorithm: the sequence of levels that we look at does not + * move anyway. + */ + end = limit - 1; + while (start < end) { + temp = indexMap[start]; + indexMap[start] = indexMap[end]; + indexMap[end] = temp; + + ++start; + --end; + } + + if (limit == levels.length) { + break; /* no more such sequences */ + } else { + start = limit + 1; + } + } + } while (--maxLevel >= minLevel); + + return indexMap; + } + + static int[] getVisualMap(BidiBase bidiBase) + { + /* fill a visual-to-logical index map using the runs[] */ + BidiRun[] runs = bidiBase.runs; + int logicalStart, visualStart, visualLimit; + int allocLength = bidiBase.length > bidiBase.resultLength ? 
bidiBase.length + : bidiBase.resultLength; + int[] indexMap = new int[allocLength]; + + visualStart = 0; + int idx = 0; + for (int j = 0; j < bidiBase.runCount; ++j) { + logicalStart = runs[j].start; + visualLimit = runs[j].limit; + if (runs[j].isEvenRun()) { + do { /* LTR */ + indexMap[idx++] = logicalStart++; + } while (++visualStart < visualLimit); + } else { + logicalStart += visualLimit - visualStart; /* logicalLimit */ + do { /* RTL */ + indexMap[idx++] = --logicalStart; + } while (++visualStart < visualLimit); + } + /* visualStart==visualLimit; */ + } + + if (bidiBase.insertPoints.size > 0) { + int markFound = 0, runCount = bidiBase.runCount; + int insertRemove, i, j, k; + runs = bidiBase.runs; + /* count all inserted marks */ + for (i = 0; i < runCount; i++) { + insertRemove = runs[i].insertRemove; + if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) { + markFound++; + } + if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) { + markFound++; + } + } + /* move back indexes by number of preceding marks */ + k = bidiBase.resultLength; + for (i = runCount - 1; i >= 0 && markFound > 0; i--) { + insertRemove = runs[i].insertRemove; + if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) { + indexMap[--k] = BidiBase.MAP_NOWHERE; + markFound--; + } + visualStart = i > 0 ? 
runs[i-1].limit : 0; + for (j = runs[i].limit - 1; j >= visualStart && markFound > 0; j--) { + indexMap[--k] = indexMap[j]; + } + if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) { + indexMap[--k] = BidiBase.MAP_NOWHERE; + markFound--; + } + } + } + else if (bidiBase.controlCount > 0) { + int runCount = bidiBase.runCount, logicalEnd; + int insertRemove, length, i, j, k, m; + char uchar; + boolean evenRun; + runs = bidiBase.runs; + visualStart = 0; + /* move forward indexes by number of preceding controls */ + k = 0; + for (i = 0; i < runCount; i++, visualStart += length) { + length = runs[i].limit - visualStart; + insertRemove = runs[i].insertRemove; + /* if no control found yet, nothing to do in this run */ + if ((insertRemove == 0) && (k == visualStart)) { + k += length; + continue; + } + /* if no control in this run */ + if (insertRemove == 0) { + visualLimit = runs[i].limit; + for (j = visualStart; j < visualLimit; j++) { + indexMap[k++] = indexMap[j]; + } + continue; + } + logicalStart = runs[i].start; + evenRun = runs[i].isEvenRun(); + logicalEnd = logicalStart + length - 1; + for (j = 0; j < length; j++) { + m = evenRun ? logicalStart + j : logicalEnd - j; + uchar = bidiBase.text[m]; + if (!BidiBase.IsBidiControlChar(uchar)) { + indexMap[k++] = m; + } + } + } + } + if (allocLength == bidiBase.resultLength) { + return indexMap; + } + int[] newMap = new int[bidiBase.resultLength]; + System.arraycopy(indexMap, 0, newMap, 0, bidiBase.resultLength); + return newMap; + } + +} --- old/src/java.base/share/classes/sun/text/bidi/BidiRun.java 2020-01-10 15:57:55.000000000 -0800 +++ /dev/null 2020-01-10 15:57:55.000000000 -0800 @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ -/* Written by Simon Montagu, Matitiahu Allouche - * (ported from C code written by Markus W. Scherer) - */ - -package sun.text.bidi; - -/** - * A BidiRun represents a sequence of characters at the same embedding level. 
- * The Bidi algorithm decomposes a piece of text into sequences of characters - * at the same embedding level, each such sequence is called a "run". - * - *

<p>A BidiRun represents such a run by storing its essential properties, - * but does not duplicate the characters which form the run. - * - *

<p>The "limit" of the run is the position just after the - * last character, i.e., one more than that position. - * - *

<p>This class has no public constructor, and its members cannot be - * modified by users. - * - * @see com.ibm.icu.text.Bidi - */ -class BidiRun { - - int start; /* first logical position of the run */ - int limit; /* last visual position of the run +1 */ - int insertRemove; /* if >0, flags for inserting LRM/RLM before/after run, - if <0, count of bidi controls within run */ - byte level; - - /* - * Default constructor - * - * Note that members start and limit of a run instance have different - * meanings depending whether the run is part of the runs array of a Bidi - * object, or if it is a reference returned by getVisualRun() or - * getLogicalRun(). - * For a member of the runs array of a Bidi object, - * - start is the first logical position of the run in the source text. - * - limit is one after the last visual position of the run. - * For a reference returned by getLogicalRun() or getVisualRun(), - * - start is the first logical position of the run in the source text. - * - limit is one after the last logical position of the run. - */ - BidiRun() - { - this(0, 0, (byte)0); - } - - /* - * Constructor - */ - BidiRun(int start, int limit, byte embeddingLevel) - { - this.start = start; - this.limit = limit; - this.level = embeddingLevel; - } - - /* - * Copy the content of a BidiRun instance - */ - void copyFrom(BidiRun run) - { - this.start = run.start; - this.limit = run.limit; - this.level = run.level; - this.insertRemove = run.insertRemove; - } - - /** - * Get level of run - */ - byte getEmbeddingLevel() - { - return level; - } - - /** - * Check if run level is even - * @return true if the embedding level of this run is even, i.e. it is a - * left-to-right run. - */ - boolean isEvenRun() - { - return (level & 1) == 0; - } - -} --- /dev/null 2020-01-10 15:57:55.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/BidiRun.java 2020-01-10 15:57:55.000000000 -0800 @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2009, 2020, Oracle and/or its affiliates.
All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* + ******************************************************************************* + * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * + * * + * The original version of this source code and documentation is copyrighted * + * and owned by IBM, These materials are provided under terms of a License * + * Agreement between IBM and Sun. This technology is protected by multiple * + * US and International patents. This notice and attribution to IBM may not * + * to removed. * + ******************************************************************************* + */ +/* Written by Simon Montagu, Matitiahu Allouche + * (ported from C code written by Markus W. Scherer) + */ + +package jdk.internal.icu.text; + +/** + * A BidiRun represents a sequence of characters at the same embedding level. 
+ * The Bidi algorithm decomposes a piece of text into sequences of characters + * at the same embedding level, each such sequence is called a "run". + * + *

A BidiRun represents such a run by storing its essential properties, + * but does not duplicate the characters which form the run. + * + *

The "limit" of the run is the position just after the + * last character, i.e., one more than that position. + * + *

This class has no public constructor, and its members cannot be + * modified by users. + * + * @see com.ibm.icu.text.Bidi + */ +class BidiRun { + + int start; /* first logical position of the run */ + int limit; /* last visual position of the run +1 */ + int insertRemove; /* if >0, flags for inserting LRM/RLM before/after run, + if <0, count of bidi controls within run */ + byte level; + + /* + * Default constructor + * + * Note that members start and limit of a run instance have different + * meanings depending whether the run is part of the runs array of a Bidi + * object, or if it is a reference returned by getVisualRun() or + * getLogicalRun(). + * For a member of the runs array of a Bidi object, + * - start is the first logical position of the run in the source text. + * - limit is one after the last visual position of the run. + * For a reference returned by getLogicalRun() or getVisualRun(), + * - start is the first logical position of the run in the source text. + * - limit is one after the last logical position of the run. + */ + BidiRun() + { + this(0, 0, (byte)0); + } + + /* + * Constructor + */ + BidiRun(int start, int limit, byte embeddingLevel) + { + this.start = start; + this.limit = limit; + this.level = embeddingLevel; + } + + /* + * Copy the content of a BidiRun instance + */ + void copyFrom(BidiRun run) + { + this.start = run.start; + this.limit = run.limit; + this.level = run.level; + this.insertRemove = run.insertRemove; + } + + /** + * Get level of run + */ + byte getEmbeddingLevel() + { + return level; + } + + /** + * Check if run level is even + * @return true if the embedding level of this run is even, i.e. it is a + * left-to-right run. + */ + boolean isEvenRun() + { + return (level & 1) == 0; + } + +} --- old/src/java.base/share/classes/sun/text/bidi/BidiWriter.java 2020-01-10 15:57:56.000000000 -0800 +++ /dev/null 2020-01-10 15:57:56.000000000 -0800 @@ -1,452 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. 
All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* -******************************************************************************* -* Copyright (C) 2001-2010, International Business Machines -* Corporation and others. All Rights Reserved. -******************************************************************************* -*/ -/* Written by Simon Montagu, Matitiahu Allouche - * (ported from C code written by Markus W. 
Scherer) - */ - -package sun.text.bidi; - -import sun.text.normalizer.UCharacter; -import sun.text.normalizer.UTF16; - -final class BidiWriter { - - /** Bidi control code points */ - static final char LRM_CHAR = 0x200e; - static final char RLM_CHAR = 0x200f; - static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT | - 1 << UCharacter.RIGHT_TO_LEFT_ARABIC); - - private static boolean IsCombining(int type) { - return ((1< 0); - break; - - case BidiBase.KEEP_BASE_COMBINING: - /* - * Here, too, the destination - * run will have the same length as the source run, - * and there is no mirroring. - * We do need to keep combining characters with their base - * characters. - */ - srcLength = src.length(); - - /* preserve character integrity */ - do { - /* i is always after the last code unit known to need to be kept - * in this segment */ - int c; - int i = srcLength; - - /* collect code units and modifier letters for one base - * character */ - do { - c = UTF16.charAt(src, srcLength - 1); - srcLength -= UTF16.getCharCount(c); - } while(srcLength > 0 && IsCombining(UCharacter.getType(c))); - - /* copy this "user character" */ - dest.append(src.substring(srcLength, i)); - } while(srcLength > 0); - break; - - default: - /* - * With several "complicated" options set, this is the most - * general and the slowest copying of an RTL run. - * We will do mirroring, remove Bidi controls, and - * keep combining characters with their base characters - * as requested. 
- */ - srcLength = src.length(); - - /* preserve character integrity */ - do { - /* i is always after the last code unit known to need to be kept - * in this segment */ - int i = srcLength; - - /* collect code units for one base character */ - int c = UTF16.charAt(src, srcLength - 1); - srcLength -= UTF16.getCharCount(c); - if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) { - /* collect modifier letters for this base character */ - while(srcLength > 0 && IsCombining(UCharacter.getType(c))) { - c = UTF16.charAt(src, srcLength - 1); - srcLength -= UTF16.getCharCount(c); - } - } - - if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 && - BidiBase.IsBidiControlChar(c)) { - /* do not copy this Bidi control character */ - continue; - } - - /* copy this "user character" */ - int j = srcLength; - if((options & BidiBase.DO_MIRRORING) != 0) { - /* mirror only the base character */ - c = UCharacter.getMirror(c); - UTF16.append(dest, c); - j += UTF16.getCharCount(c); - } - dest.append(src.substring(j, i)); - } while(srcLength > 0); - break; - } /* end of switch */ - - return dest.toString(); - } - - static String doWriteReverse(char[] text, int start, int limit, int options) { - return writeReverse(new String(text, start, limit - start), options); - } - - static String writeReordered(BidiBase bidi, int options) { - int run, runCount; - StringBuilder dest; - char[] text = bidi.text; - runCount = bidi.countRuns(); - - /* - * Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the - * reordering mode (checked below) is appropriate. - */ - if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) { - options |= BidiBase.INSERT_LRM_FOR_NUMERIC; - options &= ~BidiBase.REMOVE_BIDI_CONTROLS; - } - /* - * Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS - * and cancels BidiBase.INSERT_LRM_FOR_NUMERIC. 
- */ - if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) { - options |= BidiBase.REMOVE_BIDI_CONTROLS; - options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC; - } - /* - * If we do not perform the "inverse Bidi" algorithm, then we - * don't need to insert any LRMs, and don't need to test for it. - */ - if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L) && - (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT) && - (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL) && - (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) { - options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC; - } - dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ? - bidi.length * 2 : bidi.length); - /* - * Iterate through all visual runs and copy the run text segments to - * the destination, according to the options. - * - * The tests for where to insert LRMs ignore the fact that there may be - * BN codes or non-BMP code points at the beginning and end of a run; - * they may insert LRMs unnecessarily but the tests are faster this way - * (this would have to be improved for UTF-8). 
- */ - if ((options & BidiBase.OUTPUT_REVERSE) == 0) { - /* forward output */ - if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) { - /* do not insert Bidi controls */ - for (run = 0; run < runCount; ++run) { - BidiRun bidiRun = bidi.getVisualRun(run); - if (bidiRun.isEvenRun()) { - dest.append(doWriteForward(text, bidiRun.start, - bidiRun.limit, - options & ~BidiBase.DO_MIRRORING)); - } else { - dest.append(doWriteReverse(text, bidiRun.start, - bidiRun.limit, options)); - } - } - } else { - /* insert Bidi controls for "inverse Bidi" */ - byte[] dirProps = bidi.dirProps; - char uc; - int markFlag; - - for (run = 0; run < runCount; ++run) { - BidiRun bidiRun = bidi.getVisualRun(run); - markFlag=0; - /* check if something relevant in insertPoints */ - markFlag = bidi.runs[run].insertRemove; - if (markFlag < 0) { /* bidi controls count */ - markFlag = 0; - } - if (bidiRun.isEvenRun()) { - if (bidi.isInverse() && - dirProps[bidiRun.start] != BidiBase.L) { - markFlag |= BidiBase.LRM_BEFORE; - } - if ((markFlag & BidiBase.LRM_BEFORE) != 0) { - uc = LRM_CHAR; - } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) { - uc = RLM_CHAR; - } else { - uc = 0; - } - if (uc != 0) { - dest.append(uc); - } - dest.append(doWriteForward(text, - bidiRun.start, bidiRun.limit, - options & ~BidiBase.DO_MIRRORING)); - - if (bidi.isInverse() && - dirProps[bidiRun.limit - 1] != BidiBase.L) { - markFlag |= BidiBase.LRM_AFTER; - } - if ((markFlag & BidiBase.LRM_AFTER) != 0) { - uc = LRM_CHAR; - } else if ((markFlag & BidiBase.RLM_AFTER) != 0) { - uc = RLM_CHAR; - } else { - uc = 0; - } - if (uc != 0) { - dest.append(uc); - } - } else { /* RTL run */ - if (bidi.isInverse() && - !bidi.testDirPropFlagAt(MASK_R_AL, - bidiRun.limit - 1)) { - markFlag |= BidiBase.RLM_BEFORE; - } - if ((markFlag & BidiBase.LRM_BEFORE) != 0) { - uc = LRM_CHAR; - } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) { - uc = RLM_CHAR; - } else { - uc = 0; - } - if (uc != 0) { - dest.append(uc); - } - 
dest.append(doWriteReverse(text, bidiRun.start, - bidiRun.limit, options)); - - if(bidi.isInverse() && - (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) { - markFlag |= BidiBase.RLM_AFTER; - } - if ((markFlag & BidiBase.LRM_AFTER) != 0) { - uc = LRM_CHAR; - } else if ((markFlag & BidiBase.RLM_AFTER) != 0) { - uc = RLM_CHAR; - } else { - uc = 0; - } - if (uc != 0) { - dest.append(uc); - } - } - } - } - } else { - /* reverse output */ - if((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) { - /* do not insert Bidi controls */ - for(run = runCount; --run >= 0; ) { - BidiRun bidiRun = bidi.getVisualRun(run); - if (bidiRun.isEvenRun()) { - dest.append(doWriteReverse(text, - bidiRun.start, bidiRun.limit, - options & ~BidiBase.DO_MIRRORING)); - } else { - dest.append(doWriteForward(text, bidiRun.start, - bidiRun.limit, options)); - } - } - } else { - /* insert Bidi controls for "inverse Bidi" */ - - byte[] dirProps = bidi.dirProps; - - for (run = runCount; --run >= 0; ) { - /* reverse output */ - BidiRun bidiRun = bidi.getVisualRun(run); - if (bidiRun.isEvenRun()) { - if (dirProps[bidiRun.limit - 1] != BidiBase.L) { - dest.append(LRM_CHAR); - } - - dest.append(doWriteReverse(text, bidiRun.start, - bidiRun.limit, options & ~BidiBase.DO_MIRRORING)); - - if (dirProps[bidiRun.start] != BidiBase.L) { - dest.append(LRM_CHAR); - } - } else { - if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) { - dest.append(RLM_CHAR); - } - - dest.append(doWriteForward(text, bidiRun.start, - bidiRun.limit, options)); - - if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) { - dest.append(RLM_CHAR); - } - } - } - } - } - - return dest.toString(); - } -} --- /dev/null 2020-01-10 15:57:56.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/BidiWriter.java 2020-01-10 15:57:56.000000000 -0800 @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* +******************************************************************************* +* Copyright (C) 2001-2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +*/ +/* Written by Simon Montagu, Matitiahu Allouche + * (ported from C code written by Markus W. 
Scherer) + */ + +package jdk.internal.icu.text; + +import jdk.internal.icu.lang.UCharacter; + +final class BidiWriter { + + /** Bidi control code points */ + static final char LRM_CHAR = 0x200e; + static final char RLM_CHAR = 0x200f; + static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT | + 1 << UCharacter.RIGHT_TO_LEFT_ARABIC); + + private static boolean IsCombining(int type) { + return ((1< 0); + break; + + case BidiBase.KEEP_BASE_COMBINING: + /* + * Here, too, the destination + * run will have the same length as the source run, + * and there is no mirroring. + * We do need to keep combining characters with their base + * characters. + */ + srcLength = src.length(); + + /* preserve character integrity */ + do { + /* i is always after the last code unit known to need to be kept + * in this segment */ + int c; + int i = srcLength; + + /* collect code units and modifier letters for one base + * character */ + do { + c = UTF16.charAt(src, srcLength - 1); + srcLength -= UTF16.getCharCount(c); + } while(srcLength > 0 && IsCombining(UCharacter.getType(c))); + + /* copy this "user character" */ + dest.append(src.substring(srcLength, i)); + } while(srcLength > 0); + break; + + default: + /* + * With several "complicated" options set, this is the most + * general and the slowest copying of an RTL run. + * We will do mirroring, remove Bidi controls, and + * keep combining characters with their base characters + * as requested. 
+ */ + srcLength = src.length(); + + /* preserve character integrity */ + do { + /* i is always after the last code unit known to need to be kept + * in this segment */ + int i = srcLength; + + /* collect code units for one base character */ + int c = UTF16.charAt(src, srcLength - 1); + srcLength -= UTF16.getCharCount(c); + if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) { + /* collect modifier letters for this base character */ + while(srcLength > 0 && IsCombining(UCharacter.getType(c))) { + c = UTF16.charAt(src, srcLength - 1); + srcLength -= UTF16.getCharCount(c); + } + } + + if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 && + BidiBase.IsBidiControlChar(c)) { + /* do not copy this Bidi control character */ + continue; + } + + /* copy this "user character" */ + int j = srcLength; + if((options & BidiBase.DO_MIRRORING) != 0) { + /* mirror only the base character */ + c = UCharacter.getMirror(c); + UTF16.append(dest, c); + j += UTF16.getCharCount(c); + } + dest.append(src.substring(j, i)); + } while(srcLength > 0); + break; + } /* end of switch */ + + return dest.toString(); + } + + static String doWriteReverse(char[] text, int start, int limit, int options) { + return writeReverse(new String(text, start, limit - start), options); + } + + static String writeReordered(BidiBase bidi, int options) { + int run, runCount; + StringBuilder dest; + char[] text = bidi.text; + runCount = bidi.countRuns(); + + /* + * Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the + * reordering mode (checked below) is appropriate. + */ + if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) { + options |= BidiBase.INSERT_LRM_FOR_NUMERIC; + options &= ~BidiBase.REMOVE_BIDI_CONTROLS; + } + /* + * Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS + * and cancels BidiBase.INSERT_LRM_FOR_NUMERIC. 
+ */ + if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) { + options |= BidiBase.REMOVE_BIDI_CONTROLS; + options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC; + } + /* + * If we do not perform the "inverse Bidi" algorithm, then we + * don't need to insert any LRMs, and don't need to test for it. + */ + if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L) && + (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT) && + (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL) && + (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) { + options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC; + } + dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ? + bidi.length * 2 : bidi.length); + /* + * Iterate through all visual runs and copy the run text segments to + * the destination, according to the options. + * + * The tests for where to insert LRMs ignore the fact that there may be + * BN codes or non-BMP code points at the beginning and end of a run; + * they may insert LRMs unnecessarily but the tests are faster this way + * (this would have to be improved for UTF-8). 
+ */ + if ((options & BidiBase.OUTPUT_REVERSE) == 0) { + /* forward output */ + if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) { + /* do not insert Bidi controls */ + for (run = 0; run < runCount; ++run) { + BidiRun bidiRun = bidi.getVisualRun(run); + if (bidiRun.isEvenRun()) { + dest.append(doWriteForward(text, bidiRun.start, + bidiRun.limit, + options & ~BidiBase.DO_MIRRORING)); + } else { + dest.append(doWriteReverse(text, bidiRun.start, + bidiRun.limit, options)); + } + } + } else { + /* insert Bidi controls for "inverse Bidi" */ + byte[] dirProps = bidi.dirProps; + char uc; + int markFlag; + + for (run = 0; run < runCount; ++run) { + BidiRun bidiRun = bidi.getVisualRun(run); + markFlag=0; + /* check if something relevant in insertPoints */ + markFlag = bidi.runs[run].insertRemove; + if (markFlag < 0) { /* bidi controls count */ + markFlag = 0; + } + if (bidiRun.isEvenRun()) { + if (bidi.isInverse() && + dirProps[bidiRun.start] != BidiBase.L) { + markFlag |= BidiBase.LRM_BEFORE; + } + if ((markFlag & BidiBase.LRM_BEFORE) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + dest.append(doWriteForward(text, + bidiRun.start, bidiRun.limit, + options & ~BidiBase.DO_MIRRORING)); + + if (bidi.isInverse() && + dirProps[bidiRun.limit - 1] != BidiBase.L) { + markFlag |= BidiBase.LRM_AFTER; + } + if ((markFlag & BidiBase.LRM_AFTER) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_AFTER) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + } else { /* RTL run */ + if (bidi.isInverse() && + !bidi.testDirPropFlagAt(MASK_R_AL, + bidiRun.limit - 1)) { + markFlag |= BidiBase.RLM_BEFORE; + } + if ((markFlag & BidiBase.LRM_BEFORE) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_BEFORE) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + 
dest.append(doWriteReverse(text, bidiRun.start, + bidiRun.limit, options)); + + if(bidi.isInverse() && + (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) { + markFlag |= BidiBase.RLM_AFTER; + } + if ((markFlag & BidiBase.LRM_AFTER) != 0) { + uc = LRM_CHAR; + } else if ((markFlag & BidiBase.RLM_AFTER) != 0) { + uc = RLM_CHAR; + } else { + uc = 0; + } + if (uc != 0) { + dest.append(uc); + } + } + } + } + } else { + /* reverse output */ + if((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) { + /* do not insert Bidi controls */ + for(run = runCount; --run >= 0; ) { + BidiRun bidiRun = bidi.getVisualRun(run); + if (bidiRun.isEvenRun()) { + dest.append(doWriteReverse(text, + bidiRun.start, bidiRun.limit, + options & ~BidiBase.DO_MIRRORING)); + } else { + dest.append(doWriteForward(text, bidiRun.start, + bidiRun.limit, options)); + } + } + } else { + /* insert Bidi controls for "inverse Bidi" */ + + byte[] dirProps = bidi.dirProps; + + for (run = runCount; --run >= 0; ) { + /* reverse output */ + BidiRun bidiRun = bidi.getVisualRun(run); + if (bidiRun.isEvenRun()) { + if (dirProps[bidiRun.limit - 1] != BidiBase.L) { + dest.append(LRM_CHAR); + } + + dest.append(doWriteReverse(text, bidiRun.start, + bidiRun.limit, options & ~BidiBase.DO_MIRRORING)); + + if (dirProps[bidiRun.start] != BidiBase.L) { + dest.append(LRM_CHAR); + } + } else { + if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) { + dest.append(RLM_CHAR); + } + + dest.append(doWriteForward(text, bidiRun.start, + bidiRun.limit, options)); + + if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) { + dest.append(RLM_CHAR); + } + } + } + } + } + + return dest.toString(); + } +} --- old/src/java.base/share/classes/sun/text/normalizer/FilteredNormalizer2.java 2020-01-10 15:57:58.000000000 -0800 +++ /dev/null 2020-01-10 15:57:58.000000000 -0800 @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* -******************************************************************************* -* Copyright (C) 2009-2014, International Business Machines -* Corporation and others. All Rights Reserved. -******************************************************************************* -*/ -package sun.text.normalizer; - -import java.io.IOException; - -/** - * Normalization filtered by a UnicodeSet. - * Normalizes portions of the text contained in the filter set and leaves - * portions not contained in the filter set unchanged. - * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE). - * Not-in-the-filter text is treated as "is normalized" and "quick check yes". - * This class implements all of (and only) the Normalizer2 API. - * An instance of this class is unmodifiable/immutable. - * @stable ICU 4.4 - * @author Markus W. 
Scherer - */ -class FilteredNormalizer2 extends Normalizer2 { - - /** - * Constructs a filtered normalizer wrapping any Normalizer2 instance - * and a filter set. - * Both are aliased and must not be modified or deleted while this object - * is used. - * The filter set should be frozen; otherwise the performance will suffer greatly. - * @param n2 wrapped Normalizer2 instance - * @param filterSet UnicodeSet which determines the characters to be normalized - * @stable ICU 4.4 - */ - public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) { - norm2=n2; - set=filterSet; - } - - /** - * {@inheritDoc} - * @stable ICU 4.4 - */ - @Override - public StringBuilder normalize(CharSequence src, StringBuilder dest) { - if(dest==src) { - throw new IllegalArgumentException(); - } - dest.setLength(0); - normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); - return dest; - } - - /** - * {@inheritDoc} - * @stable ICU 4.6 - */ - @Override - public Appendable normalize(CharSequence src, Appendable dest) { - if(dest==src) { - throw new IllegalArgumentException(); - } - return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE); - } - - /** - * {@inheritDoc} - * @stable ICU 4.4 - */ - @Override - public StringBuilder normalizeSecondAndAppend( - StringBuilder first, CharSequence second) { - return normalizeSecondAndAppend(first, second, true); - } - - /** - * {@inheritDoc} - * @stable ICU 4.4 - */ - @Override - public StringBuilder append(StringBuilder first, CharSequence second) { - return normalizeSecondAndAppend(first, second, false); - } - - /** - * {@inheritDoc} - * @stable ICU 4.6 - */ - @Override - public String getDecomposition(int c) { - return set.contains(c) ? norm2.getDecomposition(c) : null; - } - - /** - * {@inheritDoc} - * @stable ICU 49 - */ - @Override - public int getCombiningClass(int c) { - return set.contains(c) ? 
norm2.getCombiningClass(c) : 0; - } - - /** - * {@inheritDoc} - * @stable ICU 4.4 - */ - @Override - public boolean isNormalized(CharSequence s) { - UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE; - for(int prevSpanLimit=0; prevSpanLimit - * The primary functions are to produce a normalized string and to detect whether - * a string is already normalized. - * The most commonly used normalization forms are those defined in - * http://www.unicode.org/unicode/reports/tr15/ - * However, this API supports additional normalization forms for specialized purposes. - * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) - * and can be used in implementations of UTS #46. - *

- * Not only are the standard compose and decompose modes supplied, - * but additional modes are provided as documented in the Mode enum. - *

- * Some of the functions in this class identify normalization boundaries. - * At a normalization boundary, the portions of the string - * before it and starting from it do not interact and can be handled independently. - *

- * The spanQuickCheckYes() stops at a normalization boundary. - * When the goal is a normalized string, then the text before the boundary - * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). - *

- * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether - * a character is guaranteed to be at a normalization boundary, - * regardless of context. - * This is used for moving from one normalization boundary to the next - * or preceding boundary, and for performing iterative normalization. - *

- * Iterative normalization is useful when only a small portion of a - * longer string needs to be processed. - * For example, in ICU, iterative normalization is used by the NormalizationTransliterator - * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() - * (to process only the substring for which sort key bytes are computed). - *

- * The set of normalization boundaries returned by these functions may not be - * complete: There may be more boundaries that could be returned. - * Different functions may return different boundaries. - * @stable ICU 4.4 - * @author Markus W. Scherer - */ -abstract class Normalizer2 { - - /** - * Returns a Normalizer2 instance for Unicode NFC normalization. - * Same as getInstance(null, "nfc", Mode.COMPOSE). - * Returns an unmodifiable singleton instance. - * @return the requested Normalizer2, if successful - * @stable ICU 49 - */ - public static Normalizer2 getNFCInstance() { - return Norm2AllModes.getNFCInstance().comp; - } - - /** - * Returns a Normalizer2 instance for Unicode NFD normalization. - * Same as getInstance(null, "nfc", Mode.DECOMPOSE). - * Returns an unmodifiable singleton instance. - * @return the requested Normalizer2, if successful - * @stable ICU 49 - */ - public static Normalizer2 getNFDInstance() { - return Norm2AllModes.getNFCInstance().decomp; - } - - /** - * Returns a Normalizer2 instance for Unicode NFKC normalization. - * Same as getInstance(null, "nfkc", Mode.COMPOSE). - * Returns an unmodifiable singleton instance. - * @return the requested Normalizer2, if successful - * @stable ICU 49 - */ - public static Normalizer2 getNFKCInstance() { - return Norm2AllModes.getNFKCInstance().comp; - } - - /** - * Returns a Normalizer2 instance for Unicode NFKD normalization. - * Same as getInstance(null, "nfkc", Mode.DECOMPOSE). - * Returns an unmodifiable singleton instance. - * @return the requested Normalizer2, if successful - * @stable ICU 49 - */ - public static Normalizer2 getNFKDInstance() { - return Norm2AllModes.getNFKCInstance().decomp; - } - - /** - * Returns the normalized form of the source string. 
- * @param src source string - * @return normalized src - * @stable ICU 4.4 - */ - public String normalize(CharSequence src) { - if(src instanceof String) { - // Fastpath: Do not construct a new String if the src is a String - // and is already normalized. - int spanLength=spanQuickCheckYes(src); - if(spanLength==src.length()) { - return (String)src; - } - if (spanLength != 0) { - StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength); - return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString(); - } - } - return normalize(src, new StringBuilder(src.length())).toString(); - } - - /** - * Writes the normalized form of the source string to the destination string - * (replacing its contents) and returns the destination string. - * The source and destination strings must be different objects. - * @param src source string - * @param dest destination string; its contents is replaced with normalized src - * @return dest - * @stable ICU 4.4 - */ - public abstract StringBuilder normalize(CharSequence src, StringBuilder dest); - - /** - * Writes the normalized form of the source string to the destination Appendable - * and returns the destination Appendable. - * The source and destination strings must be different objects. - * - *

Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}. - * - * @param src source string - * @param dest destination Appendable; gets normalized src appended - * @return dest - * @stable ICU 4.6 - */ - public abstract Appendable normalize(CharSequence src, Appendable dest); - - /** - * Appends the normalized form of the second string to the first string - * (merging them at the boundary) and returns the first string. - * The result is normalized if the first string was normalized. - * The first and second strings must be different objects. - * @param first string, should be normalized - * @param second string, will be normalized - * @return first - * @stable ICU 4.4 - */ - public abstract StringBuilder normalizeSecondAndAppend( - StringBuilder first, CharSequence second); - - /** - * Appends the second string to the first string - * (merging them at the boundary) and returns the first string. - * The result is normalized if both the strings were normalized. - * The first and second strings must be different objects. - * @param first string, should be normalized - * @param second string, should be normalized - * @return first - * @stable ICU 4.4 - */ - public abstract StringBuilder append(StringBuilder first, CharSequence second); - - /** - * Gets the decomposition mapping of c. - * Roughly equivalent to normalizing the String form of c - * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function - * returns null if c does not have a decomposition mapping in this instance's data. - * This function is independent of the mode of the Normalizer2. - * @param c code point - * @return c's decomposition mapping, if any; otherwise null - * @stable ICU 4.6 - */ - public abstract String getDecomposition(int c); - - /** - * Gets the combining class of c. - * The default implementation returns 0 - * but all standard implementations return the Unicode Canonical_Combining_Class value. 
- * @param c code point - * @return c's combining class - * @stable ICU 49 - */ - public int getCombiningClass(int c) { return 0; } - - /** - * Tests if the string is normalized. - * Internally, in cases where the quickCheck() method would return "maybe" - * (which is only possible for the two COMPOSE modes) this method - * resolves to "yes" or "no" to provide a definitive result, - * at the cost of doing more work in those cases. - * @param s input string - * @return true if s is normalized - * @stable ICU 4.4 - */ - public abstract boolean isNormalized(CharSequence s); - - /** - * Returns the end of the normalized substring of the input string. - * In other words, with end=spanQuickCheckYes(s); - * the substring s.subSequence(0, end) - * will pass the quick check with a "yes" result. - *

- * The returned end index is usually one or more characters before the - * "no" or "maybe" character: The end index is at a normalization boundary. - * (See the class documentation for more about normalization boundaries.) - *

- * When the goal is a normalized string and most input strings are expected - * to be normalized already, then call this method, - * and if it returns a prefix shorter than the input string, - * copy that prefix and use normalizeSecondAndAppend() for the remainder. - * @param s input string - * @return "yes" span end index - * @stable ICU 4.4 - */ - public abstract int spanQuickCheckYes(CharSequence s); - - /** - * Tests if the character always has a normalization boundary before it, - * regardless of context. - * If true, then the character does not normalization-interact with - * preceding characters. - * In other words, a string containing this character can be normalized - * by processing portions before this character and starting from this - * character independently. - * This is used for iterative normalization. See the class documentation for details. - * @param c character to test - * @return true if c has a normalization boundary before it - * @stable ICU 4.4 - */ - public abstract boolean hasBoundaryBefore(int c); - - /** - * Sole constructor. (For invocation by subclass constructors, - * typically implicit.) - * @internal - * deprecated This API is ICU internal only. - */ - protected Normalizer2() { - } -} --- /dev/null 2020-01-10 15:57:59.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/Normalizer2.java 2020-01-10 15:57:59.000000000 -0800 @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.text; + +import jdk.internal.icu.impl.Norm2AllModes; + +/** + * Unicode normalization functionality for standard Unicode normalization or + * for using custom mapping tables. + * All instances of this class are unmodifiable/immutable. + * The Normalizer2 class is not intended for public subclassing. + *

+ * The primary functions are to produce a normalized string and to detect whether + * a string is already normalized. + * The most commonly used normalization forms are those defined in + * http://www.unicode.org/unicode/reports/tr15/ + * However, this API supports additional normalization forms for specialized purposes. + * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) + * and can be used in implementations of UTS #46. + *

+ * Not only are the standard compose and decompose modes supplied, + * but additional modes are provided as documented in the Mode enum. + *

+ * Some of the functions in this class identify normalization boundaries. + * At a normalization boundary, the portions of the string + * before it and starting from it do not interact and can be handled independently. + *

+ * The spanQuickCheckYes() stops at a normalization boundary. + * When the goal is a normalized string, then the text before the boundary + * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). + *

+ * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether + * a character is guaranteed to be at a normalization boundary, + * regardless of context. + * This is used for moving from one normalization boundary to the next + * or preceding boundary, and for performing iterative normalization. + *

+ * Iterative normalization is useful when only a small portion of a + * longer string needs to be processed. + * For example, in ICU, iterative normalization is used by the NormalizationTransliterator + * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() + * (to process only the substring for which sort key bytes are computed). + *

+ * The set of normalization boundaries returned by these functions may not be + * complete: There may be more boundaries that could be returned. + * Different functions may return different boundaries. + * @stable ICU 4.4 + * @author Markus W. Scherer + */ +public abstract class Normalizer2 { + + /** + * Returns a Normalizer2 instance for Unicode NFC normalization. + * Same as getInstance(null, "nfc", Mode.COMPOSE). + * Returns an unmodifiable singleton instance. + * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFCInstance() { + return Norm2AllModes.getNFCInstance().comp; + } + + /** + * Returns a Normalizer2 instance for Unicode NFD normalization. + * Same as getInstance(null, "nfc", Mode.DECOMPOSE). + * Returns an unmodifiable singleton instance. + * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFDInstance() { + return Norm2AllModes.getNFCInstance().decomp; + } + + /** + * Returns a Normalizer2 instance for Unicode NFKC normalization. + * Same as getInstance(null, "nfkc", Mode.COMPOSE). + * Returns an unmodifiable singleton instance. + * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFKCInstance() { + return Norm2AllModes.getNFKCInstance().comp; + } + + /** + * Returns a Normalizer2 instance for Unicode NFKD normalization. + * Same as getInstance(null, "nfkc", Mode.DECOMPOSE). + * Returns an unmodifiable singleton instance. + * @return the requested Normalizer2, if successful + * @stable ICU 49 + */ + public static Normalizer2 getNFKDInstance() { + return Norm2AllModes.getNFKCInstance().decomp; + } + + /** + * Returns the normalized form of the source string. 
+ * @param src source string + * @return normalized src + * @stable ICU 4.4 + */ + public String normalize(CharSequence src) { + if(src instanceof String) { + // Fastpath: Do not construct a new String if the src is a String + // and is already normalized. + int spanLength=spanQuickCheckYes(src); + if(spanLength==src.length()) { + return (String)src; + } + if (spanLength != 0) { + StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength); + return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString(); + } + } + return normalize(src, new StringBuilder(src.length())).toString(); + } + + /** + * Writes the normalized form of the source string to the destination string + * (replacing its contents) and returns the destination string. + * The source and destination strings must be different objects. + * @param src source string + * @param dest destination string; its contents is replaced with normalized src + * @return dest + * @stable ICU 4.4 + */ + public abstract StringBuilder normalize(CharSequence src, StringBuilder dest); + + /** + * Writes the normalized form of the source string to the destination Appendable + * and returns the destination Appendable. + * The source and destination strings must be different objects. + * + *

Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}. + * + * @param src source string + * @param dest destination Appendable; gets normalized src appended + * @return dest + * @stable ICU 4.6 + */ + public abstract Appendable normalize(CharSequence src, Appendable dest); + + /** + * Appends the normalized form of the second string to the first string + * (merging them at the boundary) and returns the first string. + * The result is normalized if the first string was normalized. + * The first and second strings must be different objects. + * @param first string, should be normalized + * @param second string, will be normalized + * @return first + * @stable ICU 4.4 + */ + public abstract StringBuilder normalizeSecondAndAppend( + StringBuilder first, CharSequence second); + + /** + * Appends the second string to the first string + * (merging them at the boundary) and returns the first string. + * The result is normalized if both the strings were normalized. + * The first and second strings must be different objects. + * @param first string, should be normalized + * @param second string, should be normalized + * @return first + * @stable ICU 4.4 + */ + public abstract StringBuilder append(StringBuilder first, CharSequence second); + + /** + * Gets the decomposition mapping of c. + * Roughly equivalent to normalizing the String form of c + * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function + * returns null if c does not have a decomposition mapping in this instance's data. + * This function is independent of the mode of the Normalizer2. + * @param c code point + * @return c's decomposition mapping, if any; otherwise null + * @stable ICU 4.6 + */ + public abstract String getDecomposition(int c); + + /** + * Gets the combining class of c. + * The default implementation returns 0 + * but all standard implementations return the Unicode Canonical_Combining_Class value. 
+ * @param c code point + * @return c's combining class + * @stable ICU 49 + */ + public int getCombiningClass(int c) { return 0; } + + /** + * Tests if the string is normalized. + * Internally, in cases where the quickCheck() method would return "maybe" + * (which is only possible for the two COMPOSE modes) this method + * resolves to "yes" or "no" to provide a definitive result, + * at the cost of doing more work in those cases. + * @param s input string + * @return true if s is normalized + * @stable ICU 4.4 + */ + public abstract boolean isNormalized(CharSequence s); + + /** + * Returns the end of the normalized substring of the input string. + * In other words, with end=spanQuickCheckYes(s); + * the substring s.subSequence(0, end) + * will pass the quick check with a "yes" result. + *

+ * The returned end index is usually one or more characters before the + * "no" or "maybe" character: The end index is at a normalization boundary. + * (See the class documentation for more about normalization boundaries.) + *

+ * When the goal is a normalized string and most input strings are expected + * to be normalized already, then call this method, + * and if it returns a prefix shorter than the input string, + * copy that prefix and use normalizeSecondAndAppend() for the remainder. + * @param s input string + * @return "yes" span end index + * @stable ICU 4.4 + */ + public abstract int spanQuickCheckYes(CharSequence s); + + /** + * Tests if the character always has a normalization boundary before it, + * regardless of context. + * If true, then the character does not normalization-interact with + * preceding characters. + * In other words, a string containing this character can be normalized + * by processing portions before this character and starting from this + * character independently. + * This is used for iterative normalization. See the class documentation for details. + * @param c character to test + * @return true if c has a normalization boundary before it + * @stable ICU 4.4 + */ + public abstract boolean hasBoundaryBefore(int c); + + /** + * Sole constructor. (For invocation by subclass constructors, + * typically implicit.) + * @internal + * deprecated This API is ICU internal only. + */ + protected Normalizer2() { + } +} --- old/src/java.base/share/classes/sun/text/normalizer/NormalizerBase.java 2020-01-10 15:58:00.000000000 -0800 +++ /dev/null 2020-01-10 15:58:00.000000000 -0800 @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. 
- * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2000-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ -package sun.text.normalizer; - -import java.text.CharacterIterator; -import java.text.Normalizer; - -/** - * Unicode Normalization - * - *

Unicode normalization API

- * - * normalize transforms Unicode text into an equivalent composed or - * decomposed form, allowing for easier sorting and searching of text. - * normalize supports the standard normalization forms described in - * - * Unicode Standard Annex #15 — Unicode Normalization Forms. - * - * Characters with accents or other adornments can be encoded in - * several different ways in Unicode. For example, take the character A-acute. - * In Unicode, this can be encoded as a single character (the - * "composed" form): - * - *
- *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
- * 
- * - * or as two separate characters (the "decomposed" form): - * - *
- *      0041    LATIN CAPITAL LETTER A
- *      0301    COMBINING ACUTE ACCENT
- * 
- * - * To a user of your program, however, both of these sequences should be - * treated as the same "user-level" character "A with acute accent". When you - * are searching or comparing text, you must ensure that these two sequences are - * treated equivalently. In addition, you must handle characters with more than - * one accent. Sometimes the order of a character's combining accents is - * significant, while in other cases accent sequences in different orders are - * really equivalent. - * - * Similarly, the string "ffi" can be encoded as three separate letters: - * - *
- *      0066    LATIN SMALL LETTER F
- *      0066    LATIN SMALL LETTER F
- *      0069    LATIN SMALL LETTER I
- * 
- * - * or as the single character - * - *
- *      FB03    LATIN SMALL LIGATURE FFI
- * 
- * - * The ffi ligature is not a distinct semantic character, and strictly speaking - * it shouldn't be in Unicode at all, but it was included for compatibility - * with existing character sets that already provided it. The Unicode standard - * identifies such characters by giving them "compatibility" decompositions - * into the corresponding semantic characters. When sorting and searching, you - * will often want to use these mappings. - * - * normalize helps solve these problems by transforming text into - * the canonical composed and decomposed forms as shown in the first example - * above. In addition, you can have it perform compatibility decompositions so - * that you can treat compatibility characters the same as their equivalents. - * Finally, normalize rearranges accents into the proper canonical - * order, so that you do not have to worry about accent rearrangement on your - * own. - * - * Form FCD, "Fast C or D", is also designed for collation. - * It allows to work on strings that are not necessarily normalized - * with an algorithm (like in collation) that works under "canonical closure", - * i.e., it treats precomposed characters and their decomposed equivalents the - * same. - * - * It is not a normalization form because it does not provide for uniqueness of - * representation. Multiple strings may be canonically equivalent (their NFDs - * are identical) and may all conform to FCD without being identical themselves. - * - * The form is defined such that the "raw decomposition", the recursive - * canonical decomposition of each character, results in a string that is - * canonically ordered. This means that precomposed characters are allowed for - * as long as their decompositions do not need canonical reordering. - * - * Its advantage for a process like collation is that all NFD and most NFC texts - * - and many unnormalized texts - already conform to FCD and do not need to be - * normalized (NFD) for such a process. 
The FCD quick check will return YES for - * most strings in practice. - * - * normalize(FCD) may be implemented with NFD. - * - * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): - * http://www.unicode.org/notes/tn5/#FCD - * - * ICU collation performs either NFD or FCD normalization automatically if - * normalization is turned on for the collator object. Beyond collation and - * string search, normalized strings may be useful for string equivalence - * comparisons, transliteration/transcription, unique representations, etc. - * - * The W3C generally recommends to exchange texts in NFC. - * Note also that most legacy character encodings use only precomposed forms and - * often do not encode any combining marks by themselves. For conversion to such - * character encodings the Unicode text needs to be normalized to NFC. - * For more usage examples, see the Unicode Standard Annex. - * - * Note: The Normalizer class also provides API for iterative normalization. - * While the setIndex() and getIndex() refer to indices in the - * underlying Unicode input text, the next() and previous() methods - * iterate through characters in the normalized output. - * This means that there is not necessarily a one-to-one correspondence - * between characters returned by next() and previous() and the indices - * passed to and returned from setIndex() and getIndex(). - * It is for this reason that Normalizer does not implement the CharacterIterator interface. - * - * @stable ICU 2.8 - */ -// Original filename in ICU4J: Normalizer.java -public final class NormalizerBase implements Cloneable { - - // The input text and our position in it - private UCharacterIterator text; - private Normalizer2 norm2; - private Mode mode; - private int options; - - // The normalization buffer is the result of normalization - // of the source in [currentIndex..nextIndex] . 
- private int currentIndex; - private int nextIndex; - - // A buffer for holding intermediate results - private StringBuilder buffer; - private int bufferPos; - - // Helper classes to defer loading of normalization data. - private static final class ModeImpl { - private ModeImpl(Normalizer2 n2) { - normalizer2 = n2; - } - private final Normalizer2 normalizer2; - } - - private static final class NFDModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); - } - - private static final class NFKDModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); - } - - private static final class NFCModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); - } - - private static final class NFKCModeImpl { - private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); - } - - private static final class Unicode32 { - private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); - } - - private static final class NFD32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), - Unicode32.INSTANCE)); - } - - private static final class NFKD32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), - Unicode32.INSTANCE)); - } - - private static final class NFC32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), - Unicode32.INSTANCE)); - } - - private static final class NFKC32ModeImpl { - private static final ModeImpl INSTANCE = - new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), - Unicode32.INSTANCE)); - } - - /** - * Options bit set value to select Unicode 3.2 normalization - * (except NormalizationCorrections). - * At most one Unicode version can be selected at a time. 
- * @stable ICU 2.6 - */ - public static final int UNICODE_3_2=0x20; - - public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; - - /* - * Default option for the latest Unicode normalization. This option is - * provided mainly for testing. - * The value zero means that normalization is done with the fixes for - * - Corrigendum 4 (Five CJK Canonical Mapping Errors) - * - Corrigendum 5 (Normalization Idempotency) - */ - public static final int UNICODE_LATEST = 0x00; - - /** - * Constant indicating that the end of the iteration has been reached. - * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. - * @stable ICU 2.8 - */ - public static final int DONE = UCharacterIterator.DONE; - - /** - * Constants for normalization modes. - *

- * The Mode class is not intended for public subclassing. - * Only the Mode constants provided by the Normalizer class should be used, - * and any fields or methods should not be called or overridden by users. - * @stable ICU 2.8 - */ - public abstract static class Mode { - - /** - * Sole constructor - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected Mode() { - } - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected abstract Normalizer2 getNormalizer2(int options); - } - - private static Mode toMode(Normalizer.Form form) { - switch (form) { - case NFC : - return NFC; - case NFD : - return NFD; - case NFKC : - return NFKC; - case NFKD : - return NFKD; - } - - throw new IllegalArgumentException("Unexpected normalization form: " + - form); - } - - private static final class NONEMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } - } - - private static final class NFDMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFD32ModeImpl.INSTANCE.normalizer2 : - NFDModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFKDMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFKD32ModeImpl.INSTANCE.normalizer2 : - NFKDModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFCMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFC32ModeImpl.INSTANCE.normalizer2 : - NFCModeImpl.INSTANCE.normalizer2; - } - } - - private static final class NFKCMode extends Mode { - protected Normalizer2 getNormalizer2(int options) { - return (options&UNICODE_3_2) != 0 ? - NFKC32ModeImpl.INSTANCE.normalizer2 : - NFKCModeImpl.INSTANCE.normalizer2; - } - } - - /** - * No decomposition/composition. 
- * @stable ICU 2.8 - */ - public static final Mode NONE = new NONEMode(); - - /** - * Canonical decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFD = new NFDMode(); - - /** - * Compatibility decomposition. - * @stable ICU 2.8 - */ - public static final Mode NFKD = new NFKDMode(); - - /** - * Canonical decomposition followed by canonical composition. - * @stable ICU 2.8 - */ - public static final Mode NFC = new NFCMode(); - - public static final Mode NFKC =new NFKCMode(); - - //------------------------------------------------------------------------- - // Iterator constructors - //------------------------------------------------------------------------- - - /** - * Creates a new {@code NormalizerBase} object for iterating over the - * normalized form of a given string. - *

- * The {@code options} parameter specifies which optional - * {@code NormalizerBase} features are to be enabled for this object. - *

- * @param str The string to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. - * - * @param opt Any optional features to be enabled. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the - * standard Unicode Normalization Forms, use 0 for this argument. - * @stable ICU 2.6 - */ - public NormalizerBase(String str, Mode mode, int opt) { - this.text = UCharacterIterator.getInstance(str); - this.mode = mode; - this.options=opt; - norm2 = mode.getNormalizer2(opt); - buffer = new StringBuilder(); - } - - public NormalizerBase(String str, Mode mode) { - this(str, mode, 0); - } - - - /** - * Creates a new {@code NormalizerBase} object for iterating over the - * normalized form of the given text. - *

- * @param iter The input text to be normalized. The normalization - * will start at the beginning of the string. - * - * @param mode The normalization mode. - * - * @param opt Any optional features to be enabled. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the - * standard Unicode Normalization Forms, use 0 for this argument. - * @stable ICU 2.6 - */ - public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { - this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); - this.mode = mode; - this.options = opt; - norm2 = mode.getNormalizer2(opt); - buffer = new StringBuilder(); - } - - public NormalizerBase(CharacterIterator iter, Mode mode) { - this(iter, mode, 0); - } - - /** - * Clones this {@code NormalizerBase} object. All properties of this - * object are duplicated in the new object, including the cloning of any - * {@link CharacterIterator} that was passed in to the constructor - * or to {@link #setText(CharacterIterator) setText}. - * However, the text storage underlying - * the {@code CharacterIterator} is not duplicated unless the - * iterator's {@code clone} method does so. - * @stable ICU 2.8 - */ - public Object clone() { - try { - NormalizerBase copy = (NormalizerBase) super.clone(); - copy.text = (UCharacterIterator) text.clone(); - copy.mode = mode; - copy.options = options; - copy.norm2 = norm2; - copy.buffer = new StringBuilder(buffer); - copy.bufferPos = bufferPos; - copy.currentIndex = currentIndex; - copy.nextIndex = nextIndex; - return copy; - } - catch (CloneNotSupportedException e) { - throw new InternalError(e.toString(), e); - } - } - - /** - * Normalizes a {@code String} using the given normalization operation. - *

- * The {@code options} parameter specifies which optional - * {@code NormalizerBase} features are to be enabled for this operation. - * Currently the only available option is {@link #UNICODE_3_2}. - * If you want the default behavior corresponding to one of the standard - * Unicode Normalization Forms, use 0 for this argument. - *

- * @param str the input string to be normalized. - * @param mode the normalization mode - * @param options the optional features to be enabled. - * @return String the normalized string - * @stable ICU 2.6 - */ - public static String normalize(String str, Mode mode, int options) { - return mode.getNormalizer2(options).normalize(str); - } - - public static String normalize(String str, Normalizer.Form form) { - return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); - } - - public static String normalize(String str, Normalizer.Form form, int options) { - return NormalizerBase.normalize(str, toMode(form), options); - } - - /** - * Test if a string is in a given normalization form. - * This is semantically equivalent to source.equals(normalize(source, mode)). - * - * Unlike quickCheck(), this function returns a definitive result, - * never a "maybe". - * For NFD, NFKD, and FCD, both functions work exactly the same. - * For NFC and NFKC where quickCheck may return "maybe", this function will - * perform further tests to arrive at a true/false result. 
- * @param str the input string to be checked to see if it is - * normalized - * @param mode the normalization mode - * @param options Options for use with exclusion set and tailored Normalization - * The only option that is currently recognized is UNICODE_3_2 - * @see #isNormalized - * @stable ICU 2.6 - */ - public static boolean isNormalized(String str, Mode mode, int options) { - return mode.getNormalizer2(options).isNormalized(str); - } - - public static boolean isNormalized(String str, Normalizer.Form form) { - return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); - } - - public static boolean isNormalized(String str, Normalizer.Form form, int options) { - return NormalizerBase.isNormalized(str, toMode(form), options); - } - - //------------------------------------------------------------------------- - // Iteration API - //------------------------------------------------------------------------- - - /** - * Return the current character in the normalized text. - * @return The codepoint as an int - * @stable ICU 2.8 - */ - public int current() { - if(bufferPos0 || previousNormalize()) { - int c=buffer.codePointBefore(bufferPos); - bufferPos-=Character.charCount(c); - return c; - } else { - return DONE; - } - } - - /** - * Reset the index to the beginning of the text. - * This is equivalent to setIndexOnly(startIndex)). - * @stable ICU 2.8 - */ - public void reset() { - text.setIndex(0); - currentIndex=nextIndex=0; - clearBuffer(); - } - - /** - * Set the iteration position in the input text that is being normalized, - * without any immediate normalization. - * After setIndexOnly(), getIndex() will return the same index that is - * specified here. - * - * @param index the desired index in the input text. 
- * @stable ICU 2.8 - */ - public void setIndexOnly(int index) { - text.setIndex(index); // validates index - currentIndex=nextIndex=index; - clearBuffer(); - } - - /** - * Set the iteration position in the input text that is being normalized - * and return the first normalized character at that position. - *

- * Note: This method sets the position in the input text, - * while {@link #next} and {@link #previous} iterate through characters - * in the normalized output. This means that there is not - * necessarily a one-to-one correspondence between characters returned - * by {@code next} and {@code previous} and the indices passed to and - * returned from {@code setIndex} and {@link #getIndex}. - *

- * @param index the desired index in the input text. - * - * @return the first normalized character that is the result of iterating - * forward starting at the given index. - * - * @throws IllegalArgumentException if the given index is less than - * {@link #getBeginIndex} or greater than {@link #getEndIndex}. - * deprecated ICU 3.2 - * @obsolete ICU 3.2 - */ - public int setIndex(int index) { - setIndexOnly(index); - return current(); - } - - /** - * Retrieve the index of the start of the input text. This is the begin - * index of the {@code CharacterIterator} or the start (i.e. 0) of the - * {@code String} over which this {@code NormalizerBase} is iterating - * @deprecated ICU 2.2. Use startIndex() instead. - * @return The codepoint as an int - * @see #startIndex - */ - @Deprecated - public int getBeginIndex() { - return 0; - } - - /** - * Retrieve the index of the end of the input text. This is the end index - * of the {@code CharacterIterator} or the length of the {@code String} - * over which this {@code NormalizerBase} is iterating - * @deprecated ICU 2.2. Use endIndex() instead. - * @return The codepoint as an int - * @see #endIndex - */ - @Deprecated - public int getEndIndex() { - return endIndex(); - } - - /** - * Retrieve the current iteration position in the input text that is - * being normalized. This method is useful in applications such as - * searching, where you need to be able to determine the position in - * the input text that corresponds to a given normalized output character. - *

- * Note: This method sets the position in the input, while - * {@link #next} and {@link #previous} iterate through characters in the - * output. This means that there is not necessarily a one-to-one - * correspondence between characters returned by {@code next} and - * {@code previous} and the indices passed to and returned from - * {@code setIndex} and {@link #getIndex}. - * @return The current iteration position - * @stable ICU 2.8 - */ - public int getIndex() { - if(bufferPos - * Note:If the normalization mode is changed while iterating - * over a string, calls to {@link #next} and {@link #previous} may - * return previously buffers characters in the old normalization mode - * until the iteration is able to re-sync at the next base character. - * It is safest to call {@link #setText setText()}, {@link #first}, - * {@link #last}, etc. after calling {@code setMode}. - *

- * @param newMode the new mode for this {@code NormalizerBase}. - * The supported modes are: - *

    - *
  • {@link #NFC} - Unicode canonical decompositiion - * followed by canonical composition. - *
  • {@link #NFKC} - Unicode compatibility decompositiion - * follwed by canonical composition. - *
  • {@link #NFD} - Unicode canonical decomposition - *
  • {@link #NFKD} - Unicode compatibility decomposition. - *
  • {@link #NONE} - Do nothing but return characters - * from the underlying input text. - *
- * - * @see #getMode - * @stable ICU 2.8 - */ - public void setMode(Mode newMode) { - mode = newMode; - norm2 = mode.getNormalizer2(options); - } - - /** - * Return the basic operation performed by this {@code NormalizerBase} - * - * @see #setMode - * @stable ICU 2.8 - */ - public Mode getMode() { - return mode; - } - - /** - * Set the input text over which this {@code NormalizerBase} will iterate. - * The iteration position is set to the beginning of the input text. - * @param newText The new string to be normalized. - * @stable ICU 2.8 - */ - public void setText(String newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); - if (newIter == null) { - throw new IllegalStateException("Could not create a new UCharacterIterator"); - } - text = newIter; - reset(); - } - - /** - * Set the input text over which this {@code NormalizerBase} will iterate. - * The iteration position is set to the beginning of the input text. - * @param newText The new string to be normalized. - * @stable ICU 2.8 - */ - public void setText(CharacterIterator newText) { - UCharacterIterator newIter = UCharacterIterator.getInstance(newText); - if (newIter == null) { - throw new IllegalStateException("Could not create a new UCharacterIterator"); - } - text = newIter; - currentIndex=nextIndex=0; - clearBuffer(); - } - - private void clearBuffer() { - buffer.setLength(0); - bufferPos=0; - } - - private boolean nextNormalize() { - clearBuffer(); - currentIndex=nextIndex; - text.setIndex(nextIndex); - // Skip at least one character so we make progress. 
- int c=text.nextCodePoint(); - if(c<0) { - return false; - } - StringBuilder segment=new StringBuilder().appendCodePoint(c); - while((c=text.nextCodePoint())>=0) { - if(norm2.hasBoundaryBefore(c)) { - text.moveCodePointIndex(-1); - break; - } - segment.appendCodePoint(c); - } - nextIndex=text.getIndex(); - norm2.normalize(segment, buffer); - return buffer.length()!=0; - } - - private boolean previousNormalize() { - clearBuffer(); - nextIndex=currentIndex; - text.setIndex(currentIndex); - StringBuilder segment=new StringBuilder(); - int c; - while((c=text.previousCodePoint())>=0) { - if(c<=0xffff) { - segment.insert(0, (char)c); - } else { - segment.insert(0, Character.toChars(c)); - } - if(norm2.hasBoundaryBefore(c)) { - break; - } - } - currentIndex=text.getIndex(); - norm2.normalize(segment, buffer); - bufferPos=buffer.length(); - return buffer.length()!=0; - } - -} --- /dev/null 2020-01-10 15:58:00.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/NormalizerBase.java 2020-01-10 15:58:00.000000000 -0800 @@ -0,0 +1,784 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2000-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package jdk.internal.icu.text; + +import jdk.internal.icu.impl.Norm2AllModes; + +import java.text.CharacterIterator; +import java.text.Normalizer; + +/** + * Unicode Normalization + * + *

Unicode normalization API

+ * + * normalize transforms Unicode text into an equivalent composed or + * decomposed form, allowing for easier sorting and searching of text. + * normalize supports the standard normalization forms described in + * + * Unicode Standard Annex #15 — Unicode Normalization Forms. + * + * Characters with accents or other adornments can be encoded in + * several different ways in Unicode. For example, take the character A-acute. + * In Unicode, this can be encoded as a single character (the + * "composed" form): + * + *
+ *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
+ * 
+ * + * or as two separate characters (the "decomposed" form): + * + *
+ *      0041    LATIN CAPITAL LETTER A
+ *      0301    COMBINING ACUTE ACCENT
+ * 
+ * + * To a user of your program, however, both of these sequences should be + * treated as the same "user-level" character "A with acute accent". When you + * are searching or comparing text, you must ensure that these two sequences are + * treated equivalently. In addition, you must handle characters with more than + * one accent. Sometimes the order of a character's combining accents is + * significant, while in other cases accent sequences in different orders are + * really equivalent. + * + * Similarly, the string "ffi" can be encoded as three separate letters: + * + *
+ *      0066    LATIN SMALL LETTER F
+ *      0066    LATIN SMALL LETTER F
+ *      0069    LATIN SMALL LETTER I
+ * 
+ * + * or as the single character + * + *
+ *      FB03    LATIN SMALL LIGATURE FFI
+ * 
+ * + * The ffi ligature is not a distinct semantic character, and strictly speaking + * it shouldn't be in Unicode at all, but it was included for compatibility + * with existing character sets that already provided it. The Unicode standard + * identifies such characters by giving them "compatibility" decompositions + * into the corresponding semantic characters. When sorting and searching, you + * will often want to use these mappings. + * + * normalize helps solve these problems by transforming text into + * the canonical composed and decomposed forms as shown in the first example + * above. In addition, you can have it perform compatibility decompositions so + * that you can treat compatibility characters the same as their equivalents. + * Finally, normalize rearranges accents into the proper canonical + * order, so that you do not have to worry about accent rearrangement on your + * own. + * + * Form FCD, "Fast C or D", is also designed for collation. + * It allows to work on strings that are not necessarily normalized + * with an algorithm (like in collation) that works under "canonical closure", + * i.e., it treats precomposed characters and their decomposed equivalents the + * same. + * + * It is not a normalization form because it does not provide for uniqueness of + * representation. Multiple strings may be canonically equivalent (their NFDs + * are identical) and may all conform to FCD without being identical themselves. + * + * The form is defined such that the "raw decomposition", the recursive + * canonical decomposition of each character, results in a string that is + * canonically ordered. This means that precomposed characters are allowed for + * as long as their decompositions do not need canonical reordering. + * + * Its advantage for a process like collation is that all NFD and most NFC texts + * - and many unnormalized texts - already conform to FCD and do not need to be + * normalized (NFD) for such a process. 
The FCD quick check will return YES for + * most strings in practice. + * + * normalize(FCD) may be implemented with NFD. + * + * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): + * http://www.unicode.org/notes/tn5/#FCD + * + * ICU collation performs either NFD or FCD normalization automatically if + * normalization is turned on for the collator object. Beyond collation and + * string search, normalized strings may be useful for string equivalence + * comparisons, transliteration/transcription, unique representations, etc. + * + * The W3C generally recommends to exchange texts in NFC. + * Note also that most legacy character encodings use only precomposed forms and + * often do not encode any combining marks by themselves. For conversion to such + * character encodings the Unicode text needs to be normalized to NFC. + * For more usage examples, see the Unicode Standard Annex. + * + * Note: The Normalizer class also provides API for iterative normalization. + * While the setIndex() and getIndex() refer to indices in the + * underlying Unicode input text, the next() and previous() methods + * iterate through characters in the normalized output. + * This means that there is not necessarily a one-to-one correspondence + * between characters returned by next() and previous() and the indices + * passed to and returned from setIndex() and getIndex(). + * It is for this reason that Normalizer does not implement the CharacterIterator interface. + * + * @stable ICU 2.8 + */ +// Original filename in ICU4J: Normalizer.java +public final class NormalizerBase implements Cloneable { + + // The input text and our position in it + private UCharacterIterator text; + private Normalizer2 norm2; + private Mode mode; + private int options; + + // The normalization buffer is the result of normalization + // of the source in [currentIndex..nextIndex] . 
+ private int currentIndex; + private int nextIndex; + + // A buffer for holding intermediate results + private StringBuilder buffer; + private int bufferPos; + + // Helper classes to defer loading of normalization data. + private static final class ModeImpl { + private ModeImpl(Normalizer2 n2) { + normalizer2 = n2; + } + private final Normalizer2 normalizer2; + } + + private static final class NFDModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); + } + + private static final class NFKDModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); + } + + private static final class NFCModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); + } + + private static final class NFKCModeImpl { + private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); + } + + private static final class Unicode32 { + private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); + } + + private static final class NFD32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFKD32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFC32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), + Unicode32.INSTANCE)); + } + + private static final class NFKC32ModeImpl { + private static final ModeImpl INSTANCE = + new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), + Unicode32.INSTANCE)); + } + + /** + * Options bit set value to select Unicode 3.2 normalization + * (except NormalizationCorrections). + * At most one Unicode version can be selected at a time. 
+ * @stable ICU 2.6 + */ + public static final int UNICODE_3_2=0x20; + + public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2; + + /* + * Default option for the latest Unicode normalization. This option is + * provided mainly for testing. + * The value zero means that normalization is done with the fixes for + * - Corrigendum 4 (Five CJK Canonical Mapping Errors) + * - Corrigendum 5 (Normalization Idempotency) + */ + public static final int UNICODE_LATEST = 0x00; + + /** + * Constant indicating that the end of the iteration has been reached. + * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. + * @stable ICU 2.8 + */ + public static final int DONE = UCharacterIterator.DONE; + + /** + * Constants for normalization modes. + *

+ * The Mode class is not intended for public subclassing. + * Only the Mode constants provided by the Normalizer class should be used, + * and any fields or methods should not be called or overridden by users. + * @stable ICU 2.8 + */ + public abstract static class Mode { + + /** + * Sole constructor + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected Mode() { + } + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected abstract Normalizer2 getNormalizer2(int options); + } + + private static Mode toMode(Normalizer.Form form) { + switch (form) { + case NFC : + return NFC; + case NFD : + return NFD; + case NFKC : + return NFKC; + case NFKD : + return NFKD; + } + + throw new IllegalArgumentException("Unexpected normalization form: " + + form); + } + + private static final class NONEMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } + } + + private static final class NFDMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFD32ModeImpl.INSTANCE.normalizer2 : + NFDModeImpl.INSTANCE.normalizer2; + } + } + + private static final class NFKDMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFKD32ModeImpl.INSTANCE.normalizer2 : + NFKDModeImpl.INSTANCE.normalizer2; + } + } + + private static final class NFCMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFC32ModeImpl.INSTANCE.normalizer2 : + NFCModeImpl.INSTANCE.normalizer2; + } + } + + private static final class NFKCMode extends Mode { + protected Normalizer2 getNormalizer2(int options) { + return (options&UNICODE_3_2) != 0 ? + NFKC32ModeImpl.INSTANCE.normalizer2 : + NFKCModeImpl.INSTANCE.normalizer2; + } + } + + /** + * No decomposition/composition. 
+ * @stable ICU 2.8 + */ + public static final Mode NONE = new NONEMode(); + + /** + * Canonical decomposition. + * @stable ICU 2.8 + */ + public static final Mode NFD = new NFDMode(); + + /** + * Compatibility decomposition. + * @stable ICU 2.8 + */ + public static final Mode NFKD = new NFKDMode(); + + /** + * Canonical decomposition followed by canonical composition. + * @stable ICU 2.8 + */ + public static final Mode NFC = new NFCMode(); + + public static final Mode NFKC =new NFKCMode(); + + //------------------------------------------------------------------------- + // Iterator constructors + //------------------------------------------------------------------------- + + /** + * Creates a new {@code NormalizerBase} object for iterating over the + * normalized form of a given string. + *

+ * The {@code options} parameter specifies which optional + * {@code NormalizerBase} features are to be enabled for this object. + *

+ * @param str The string to be normalized. The normalization + * will start at the beginning of the string. + * + * @param mode The normalization mode. + * + * @param opt Any optional features to be enabled. + * Currently the only available option is {@link #UNICODE_3_2}. + * If you want the default behavior corresponding to one of the + * standard Unicode Normalization Forms, use 0 for this argument. + * @stable ICU 2.6 + */ + public NormalizerBase(String str, Mode mode, int opt) { + this.text = UCharacterIterator.getInstance(str); + this.mode = mode; + this.options=opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); + } + + public NormalizerBase(String str, Mode mode) { + this(str, mode, 0); + } + + + /** + * Creates a new {@code NormalizerBase} object for iterating over the + * normalized form of the given text. + *

+ * @param iter The input text to be normalized. The normalization + * will start at the beginning of the string. + * + * @param mode The normalization mode. + * + * @param opt Any optional features to be enabled. + * Currently the only available option is {@link #UNICODE_3_2}. + * If you want the default behavior corresponding to one of the + * standard Unicode Normalization Forms, use 0 for this argument. + * @stable ICU 2.6 + */ + public NormalizerBase(CharacterIterator iter, Mode mode, int opt) { + this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); + this.mode = mode; + this.options = opt; + norm2 = mode.getNormalizer2(opt); + buffer = new StringBuilder(); + } + + public NormalizerBase(CharacterIterator iter, Mode mode) { + this(iter, mode, 0); + } + + /** + * Clones this {@code NormalizerBase} object. All properties of this + * object are duplicated in the new object, including the cloning of any + * {@link CharacterIterator} that was passed in to the constructor + * or to {@link #setText(CharacterIterator) setText}. + * However, the text storage underlying + * the {@code CharacterIterator} is not duplicated unless the + * iterator's {@code clone} method does so. + * @stable ICU 2.8 + */ + public Object clone() { + try { + NormalizerBase copy = (NormalizerBase) super.clone(); + copy.text = (UCharacterIterator) text.clone(); + copy.mode = mode; + copy.options = options; + copy.norm2 = norm2; + copy.buffer = new StringBuilder(buffer); + copy.bufferPos = bufferPos; + copy.currentIndex = currentIndex; + copy.nextIndex = nextIndex; + return copy; + } + catch (CloneNotSupportedException e) { + throw new InternalError(e.toString(), e); + } + } + + /** + * Normalizes a {@code String} using the given normalization operation. + *

+ * The {@code options} parameter specifies which optional + * {@code NormalizerBase} features are to be enabled for this operation. + * Currently the only available option is {@link #UNICODE_3_2}. + * If you want the default behavior corresponding to one of the standard + * Unicode Normalization Forms, use 0 for this argument. + *

+ * @param str the input string to be normalized. + * @param mode the normalization mode + * @param options the optional features to be enabled. + * @return String the normalized string + * @stable ICU 2.6 + */ + public static String normalize(String str, Mode mode, int options) { + return mode.getNormalizer2(options).normalize(str); + } + + public static String normalize(String str, Normalizer.Form form) { + return NormalizerBase.normalize(str, toMode(form), UNICODE_LATEST); + } + + public static String normalize(String str, Normalizer.Form form, int options) { + return NormalizerBase.normalize(str, toMode(form), options); + } + + /** + * Test if a string is in a given normalization form. + * This is semantically equivalent to source.equals(normalize(source, mode)). + * + * Unlike quickCheck(), this function returns a definitive result, + * never a "maybe". + * For NFD, NFKD, and FCD, both functions work exactly the same. + * For NFC and NFKC where quickCheck may return "maybe", this function will + * perform further tests to arrive at a true/false result. 
+ * @param str the input string to be checked to see if it is + * normalized + * @param mode the normalization mode + * @param options Options for use with exclusion set and tailored Normalization + * The only option that is currently recognized is UNICODE_3_2 + * @see #isNormalized + * @stable ICU 2.6 + */ + public static boolean isNormalized(String str, Mode mode, int options) { + return mode.getNormalizer2(options).isNormalized(str); + } + + public static boolean isNormalized(String str, Normalizer.Form form) { + return NormalizerBase.isNormalized(str, toMode(form), UNICODE_LATEST); + } + + public static boolean isNormalized(String str, Normalizer.Form form, int options) { + return NormalizerBase.isNormalized(str, toMode(form), options); + } + + //------------------------------------------------------------------------- + // Iteration API + //------------------------------------------------------------------------- + + /** + * Return the current character in the normalized text. + * @return The codepoint as an int + * @stable ICU 2.8 + */ + public int current() { + if(bufferPos0 || previousNormalize()) { + int c=buffer.codePointBefore(bufferPos); + bufferPos-=Character.charCount(c); + return c; + } else { + return DONE; + } + } + + /** + * Reset the index to the beginning of the text. + * This is equivalent to setIndexOnly(startIndex)). + * @stable ICU 2.8 + */ + public void reset() { + text.setIndex(0); + currentIndex=nextIndex=0; + clearBuffer(); + } + + /** + * Set the iteration position in the input text that is being normalized, + * without any immediate normalization. + * After setIndexOnly(), getIndex() will return the same index that is + * specified here. + * + * @param index the desired index in the input text. 
+ * @stable ICU 2.8 + */ + public void setIndexOnly(int index) { + text.setIndex(index); // validates index + currentIndex=nextIndex=index; + clearBuffer(); + } + + /** + * Set the iteration position in the input text that is being normalized + * and return the first normalized character at that position. + *

+ * Note: This method sets the position in the input text, + * while {@link #next} and {@link #previous} iterate through characters + * in the normalized output. This means that there is not + * necessarily a one-to-one correspondence between characters returned + * by {@code next} and {@code previous} and the indices passed to and + * returned from {@code setIndex} and {@link #getIndex}. + *

+ * @param index the desired index in the input text. + * + * @return the first normalized character that is the result of iterating + * forward starting at the given index. + * + * @throws IllegalArgumentException if the given index is less than + * {@link #getBeginIndex} or greater than {@link #getEndIndex}. + * deprecated ICU 3.2 + * @obsolete ICU 3.2 + */ + public int setIndex(int index) { + setIndexOnly(index); + return current(); + } + + /** + * Retrieve the index of the start of the input text. This is the begin + * index of the {@code CharacterIterator} or the start (i.e. 0) of the + * {@code String} over which this {@code NormalizerBase} is iterating + * @deprecated ICU 2.2. Use startIndex() instead. + * @return The begin index of the input text + * @see #startIndex + */ + @Deprecated + public int getBeginIndex() { + return 0; + } + + /** + * Retrieve the index of the end of the input text. This is the end index + * of the {@code CharacterIterator} or the length of the {@code String} + * over which this {@code NormalizerBase} is iterating + * @deprecated ICU 2.2. Use endIndex() instead. + * @return The end index of the input text + * @see #endIndex + */ + @Deprecated + public int getEndIndex() { + return endIndex(); + } + + /** + * Retrieve the current iteration position in the input text that is + * being normalized. This method is useful in applications such as + * searching, where you need to be able to determine the position in + * the input text that corresponds to a given normalized output character. + *

+ * Note: This method returns the position in the input, while + * {@link #next} and {@link #previous} iterate through characters in the + * output. This means that there is not necessarily a one-to-one + * correspondence between characters returned by {@code next} and + * {@code previous} and the indices passed to and returned from + * {@code setIndex} and {@link #getIndex}. + * @return The current iteration position + * @stable ICU 2.8 + */ + public int getIndex() { + if(bufferPos + * Note: If the normalization mode is changed while iterating + * over a string, calls to {@link #next} and {@link #previous} may + * return previously buffered characters in the old normalization mode + * until the iteration is able to re-sync at the next base character. + * It is safest to call {@link #setText setText()}, {@link #first}, + * {@link #last}, etc. after calling {@code setMode}. + *

+ * @param newMode the new mode for this {@code NormalizerBase}. + * The supported modes are: + *

    + *
  • {@link #NFC} - Unicode canonical decomposition + * followed by canonical composition. + *
  • {@link #NFKC} - Unicode compatibility decomposition + * followed by canonical composition. + *
  • {@link #NFD} - Unicode canonical decomposition + *
  • {@link #NFKD} - Unicode compatibility decomposition. + *
  • {@link #NONE} - Do nothing but return characters + * from the underlying input text. + *
+ * + * @see #getMode + * @stable ICU 2.8 + */ + public void setMode(Mode newMode) { + mode = newMode; + norm2 = mode.getNormalizer2(options); + } + + /** + * Return the basic operation performed by this {@code NormalizerBase} + * + * @see #setMode + * @stable ICU 2.8 + */ + public Mode getMode() { + return mode; + } + + /** + * Set the input text over which this {@code NormalizerBase} will iterate. + * The iteration position is set to the beginning of the input text. + * @param newText The new string to be normalized. + * @stable ICU 2.8 + */ + public void setText(String newText) { + UCharacterIterator newIter = UCharacterIterator.getInstance(newText); + if (newIter == null) { + throw new IllegalStateException("Could not create a new UCharacterIterator"); + } + text = newIter; + reset(); + } + + /** + * Set the input text over which this {@code NormalizerBase} will iterate. + * The iteration position is set to the beginning of the input text. + * @param newText The new string to be normalized. + * @stable ICU 2.8 + */ + public void setText(CharacterIterator newText) { + UCharacterIterator newIter = UCharacterIterator.getInstance(newText); + if (newIter == null) { + throw new IllegalStateException("Could not create a new UCharacterIterator"); + } + text = newIter; + currentIndex=nextIndex=0; + clearBuffer(); + } + + private void clearBuffer() { + buffer.setLength(0); + bufferPos=0; + } + + private boolean nextNormalize() { + clearBuffer(); + currentIndex=nextIndex; + text.setIndex(nextIndex); + // Skip at least one character so we make progress. 
+ int c=text.nextCodePoint(); + if(c<0) { + return false; + } + StringBuilder segment=new StringBuilder().appendCodePoint(c); + while((c=text.nextCodePoint())>=0) { + if(norm2.hasBoundaryBefore(c)) { + text.moveCodePointIndex(-1); + break; + } + segment.appendCodePoint(c); + } + nextIndex=text.getIndex(); + norm2.normalize(segment, buffer); + return buffer.length()!=0; + } + + private boolean previousNormalize() { + clearBuffer(); + nextIndex=currentIndex; + text.setIndex(currentIndex); + StringBuilder segment=new StringBuilder(); + int c; + while((c=text.previousCodePoint())>=0) { + if(c<=0xffff) { + segment.insert(0, (char)c); + } else { + segment.insert(0, Character.toChars(c)); + } + if(norm2.hasBoundaryBefore(c)) { + break; + } + } + currentIndex=text.getIndex(); + norm2.normalize(segment, buffer); + bufferPos=buffer.length(); + return buffer.length()!=0; + } + +} --- old/src/java.base/share/classes/sun/text/normalizer/Replaceable.java 2020-01-10 15:58:02.000000000 -0800 +++ /dev/null 2020-01-10 15:58:02.000000000 -0800 @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). 
- * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -/** - * Replaceable is an interface representing a - * string of characters that supports the replacement of a range of - * itself with a new string of characters. It is used by APIs that - * change a piece of text while retaining metadata. Metadata is data - * other than the Unicode characters returned by char32At(). One - * example of metadata is style attributes; another is an edit - * history, marking each character with an author and revision number. - * - *

An implicit aspect of the Replaceable API is that - * during a replace operation, new characters take on the metadata of - * the old characters. For example, if the string "the bold - * font" has range (4, 8) replaced with "strong", then it becomes "the - * strong font". - * - *

Replaceable specifies ranges using a start - * offset and a limit offset. The range of characters thus specified - * includes the characters at offset start..limit-1. That is, the - * start offset is inclusive, and the limit offset is exclusive. - * - *

Replaceable also includes API to access characters - * in the string: length(), charAt(), - * char32At(), and extractBetween(). - * - *

For a subclass to support metadata, typical behavior of - * replace() is the following: - *

    - *
  • Set the metadata of the new text to the metadata of the first - * character replaced
  • - *
  • If no characters are replaced, use the metadata of the - * previous character
  • - *
  • If there is no previous character (i.e. start == 0), use the - * following character
  • - *
  • If there is no following character (i.e. the replaceable was - * empty), use default metadata
  • - *
  • If the code point U+FFFF is seen, it should be interpreted as - * a special marker having no metadata
  • - *
- * If this is not the behavior, the subclass should document any differences. - * - *

Copyright © IBM Corporation 1999. All rights reserved. - * - * @author Alan Liu - * @stable ICU 2.0 - */ -interface Replaceable { - /** - * Returns the number of 16-bit code units in the text. - * @return number of 16-bit code units in text - * @stable ICU 2.0 - */ - int length(); - - /** - * Returns the 16-bit code unit at the given offset into the text. - * @param offset an integer between 0 and length()-1 - * inclusive - * @return 16-bit code unit of text at given offset - * @stable ICU 2.0 - */ - char charAt(int offset); - - /** - * Copies characters from this object into the destination - * character array. The first character to be copied is at index - * srcStart; the last character to be copied is at - * index srcLimit-1 (thus the total number of - * characters to be copied is srcLimit-srcStart). The - * characters are copied into the subarray of dst - * starting at index dstStart and ending at index - * dstStart + (srcLimit-srcStart) - 1. - * - * @param srcStart the beginning index to copy, inclusive; - * {@code 0 <= start <= limit}. - * @param srcLimit the ending index to copy, exclusive; - * {@code start <= limit <= length()}. - * @param dst the destination array. - * @param dstStart the start offset in the destination array. - * @stable ICU 2.0 - */ - void getChars(int srcStart, int srcLimit, char dst[], int dstStart); -} --- /dev/null 2020-01-10 15:58:02.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/Replaceable.java 2020-01-10 15:58:01.000000000 -0800 @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * + * * + * The original version of this source code and documentation is copyrighted * + * and owned by IBM, These materials are provided under terms of a License * + * Agreement between IBM and Sun. This technology is protected by multiple * + * US and International patents. This notice and attribution to IBM may not * + * to removed. * + ******************************************************************************* + */ + +package jdk.internal.icu.text; + +/** + * Replaceable is an interface representing a + * string of characters that supports the replacement of a range of + * itself with a new string of characters. It is used by APIs that + * change a piece of text while retaining metadata. Metadata is data + * other than the Unicode characters returned by char32At(). One + * example of metadata is style attributes; another is an edit + * history, marking each character with an author and revision number. + * + *

An implicit aspect of the Replaceable API is that + * during a replace operation, new characters take on the metadata of + * the old characters. For example, if the string "the bold + * font" has range (4, 8) replaced with "strong", then it becomes "the + * strong font". + * + *

Replaceable specifies ranges using a start + * offset and a limit offset. The range of characters thus specified + * includes the characters at offset start..limit-1. That is, the + * start offset is inclusive, and the limit offset is exclusive. + * + *

Replaceable also includes API to access characters + * in the string: length(), charAt(), + * char32At(), and extractBetween(). + * + *

For a subclass to support metadata, typical behavior of + * replace() is the following: + *

    + *
  • Set the metadata of the new text to the metadata of the first + * character replaced
  • + *
  • If no characters are replaced, use the metadata of the + * previous character
  • + *
  • If there is no previous character (i.e. start == 0), use the + * following character
  • + *
  • If there is no following character (i.e. the replaceable was + * empty), use default metadata
  • + *
  • If the code point U+FFFF is seen, it should be interpreted as + * a special marker having no metadata
  • + *
+ * If this is not the behavior, the subclass should document any differences. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @author Alan Liu + * @stable ICU 2.0 + */ +public interface Replaceable { + /** + * Returns the number of 16-bit code units in the text. + * @return number of 16-bit code units in text + * @stable ICU 2.0 + */ + int length(); + + /** + * Returns the 16-bit code unit at the given offset into the text. + * @param offset an integer between 0 and length()-1 + * inclusive + * @return 16-bit code unit of text at given offset + * @stable ICU 2.0 + */ + char charAt(int offset); + + /** + * Copies characters from this object into the destination + * character array. The first character to be copied is at index + * srcStart; the last character to be copied is at + * index srcLimit-1 (thus the total number of + * characters to be copied is srcLimit-srcStart). The + * characters are copied into the subarray of dst + * starting at index dstStart and ending at index + * dstStart + (srcLimit-srcStart) - 1. + * + * @param srcStart the beginning index to copy, inclusive; + * {@code 0 <= start <= limit}. + * @param srcLimit the ending index to copy, exclusive; + * {@code start <= limit <= length()}. + * @param dst the destination array. + * @param dstStart the start offset in the destination array. + * @stable ICU 2.0 + */ + void getChars(int srcStart, int srcLimit, char dst[], int dstStart); +} --- old/src/java.base/share/classes/sun/text/normalizer/ReplaceableString.java 2020-01-10 15:58:03.000000000 -0800 +++ /dev/null 2020-01-10 15:58:03.000000000 -0800 @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. 
Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 1996-2009, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -/** - * ReplaceableString is an adapter class that implements the - * Replaceable API around an ordinary StringBuffer. - * - *

Note: This class does not support attributes and is not - * intended for general use. Most clients will need to implement - * {@link Replaceable} in their text representation class. - * - *

Copyright © IBM Corporation 1999. All rights reserved. - * - * @see Replaceable - * @author Alan Liu - * @stable ICU 2.0 - */ -class ReplaceableString implements Replaceable { - - private StringBuffer buf; - - /** - * Construct a new object with the given initial contents. - * @param str initial contents - * @stable ICU 2.0 - */ - public ReplaceableString(String str) { - buf = new StringBuffer(str); - } - - /** - * Construct a new object using buf for internal - * storage. The contents of buf at the time of - * construction are used as the initial contents. Note! - * Modifications to buf will modify this object, and - * vice versa. - * @param buf object to be used as internal storage - * @stable ICU 2.0 - */ - public ReplaceableString(StringBuffer buf) { - this.buf = buf; - } - - /** - * Return the number of characters contained in this object. - * Replaceable API. - * @stable ICU 2.0 - */ - public int length() { - return buf.length(); - } - - /** - * Return the character at the given position in this object. - * Replaceable API. - * @param offset offset into the contents, from 0 to - * length() - 1 - * @stable ICU 2.0 - */ - public char charAt(int offset) { - return buf.charAt(offset); - } - - /** - * Copies characters from this object into the destination - * character array. The first character to be copied is at index - * srcStart; the last character to be copied is at - * index srcLimit-1 (thus the total number of - * characters to be copied is srcLimit-srcStart). The - * characters are copied into the subarray of dst - * starting at index dstStart and ending at index - * dstStart + (srcLimit-srcStart) - 1. - * - * @param srcStart the beginning index to copy, inclusive; - * {@code 0 <= start <= limit}. - * @param srcLimit the ending index to copy, exclusive; - * {@code start <= limit <= length()}. - * @param dst the destination array. - * @param dstStart the start offset in the destination array. 
- * @stable ICU 2.0 - */ - public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) { - if (srcStart != srcLimit) { - buf.getChars(srcStart, srcLimit, dst, dstStart); - } - } -} --- /dev/null 2020-01-10 15:58:03.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/ReplaceableString.java 2020-01-10 15:58:03.000000000 -0800 @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 1996-2009, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ + +package jdk.internal.icu.text; + +/** + * ReplaceableString is an adapter class that implements the + * Replaceable API around an ordinary StringBuffer. + * + *

Note: This class does not support attributes and is not + * intended for general use. Most clients will need to implement + * {@link Replaceable} in their text representation class. + * + *

Copyright © IBM Corporation 1999. All rights reserved. + * + * @see Replaceable + * @author Alan Liu + * @stable ICU 2.0 + */ +public class ReplaceableString implements Replaceable { + + private StringBuffer buf; + + /** + * Construct a new object with the given initial contents. + * @param str initial contents + * @stable ICU 2.0 + */ + public ReplaceableString(String str) { + buf = new StringBuffer(str); + } + + /** + * Construct a new object using buf for internal + * storage. The contents of buf at the time of + * construction are used as the initial contents. Note! + * Modifications to buf will modify this object, and + * vice versa. + * @param buf object to be used as internal storage + * @stable ICU 2.0 + */ + public ReplaceableString(StringBuffer buf) { + this.buf = buf; + } + + /** + * Return the number of characters contained in this object. + * Replaceable API. + * @stable ICU 2.0 + */ + public int length() { + return buf.length(); + } + + /** + * Return the character at the given position in this object. + * Replaceable API. + * @param offset offset into the contents, from 0 to + * length() - 1 + * @stable ICU 2.0 + */ + public char charAt(int offset) { + return buf.charAt(offset); + } + + /** + * Copies characters from this object into the destination + * character array. The first character to be copied is at index + * srcStart; the last character to be copied is at + * index srcLimit-1 (thus the total number of + * characters to be copied is srcLimit-srcStart). The + * characters are copied into the subarray of dst + * starting at index dstStart and ending at index + * dstStart + (srcLimit-srcStart) - 1. + * + * @param srcStart the beginning index to copy, inclusive; + * {@code 0 <= start <= limit}. + * @param srcLimit the ending index to copy, exclusive; + * {@code start <= limit <= length()}. + * @param dst the destination array. + * @param dstStart the start offset in the destination array. 
+ * @stable ICU 2.0 + */ + public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) { + if (srcStart != srcLimit) { + buf.getChars(srcStart, srcLimit, dst, dstStart); + } + } +} --- old/src/java.base/share/classes/sun/net/idn/StringPrep.java 2020-01-10 15:58:04.000000000 -0800 +++ /dev/null 2020-01-10 15:58:04.000000000 -0800 @@ -1,486 +0,0 @@ -/* - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* -/* - ******************************************************************************* - * Copyright (C) 2003-2004, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ******************************************************************************* - */ -// -// CHANGELOG -// 2005-05-19 Edward Wang -// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java -// - move from package com.ibm.icu.text to package sun.net.idn -// - use ParseException instead of StringPrepParseException -// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()' -// - remove all @deprecated tag to make compiler happy -// 2007-08-14 Martin Buchholz -// - remove redundant casts -// -package sun.net.idn; - -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; - -import sun.text.Normalizer; -import sun.text.normalizer.CharTrie; -import sun.text.normalizer.Trie; -import sun.text.normalizer.VersionInfo; -import sun.text.normalizer.UCharacter; -import sun.text.normalizer.UCharacterIterator; -import sun.text.normalizer.UTF16; -import sun.net.idn.UCharacterDirection; -import sun.net.idn.StringPrepDataReader; - -/** - * StringPrep API implements the StingPrep framework as described by - * RFC 3454. - * StringPrep prepares Unicode strings for use in network protocols. - * Profiles of StingPrep are set of rules and data according to which the - * Unicode Strings are prepared. Each profiles contains tables which describe - * how a code point should be treated. The tables are broadly classied into - *

    - *
  • Unassigned Table: Contains code points that are unassigned - * in the Unicode Version supported by StringPrep. Currently - * RFC 3454 supports Unicode 3.2.
  • - *
  • Prohibited Table: Contains code points that are prohibted from - * the output of the StringPrep processing function.
  • - *
  • Mapping Table: Contains code ponts that are deleted from the output or case mapped.
  • - *
- * - * The procedure for preparing Unicode strings: - *
    - *
  1. Map: For each character in the input, check if it has a mapping - * and, if so, replace it with its mapping.
  2. - *
  3. Normalize: Possibly normalize the result of step 1 using Unicode - * normalization.
  4. - *
  5. Prohibit: Check for any characters that are not allowed in the - * output. If any are found, return an error.
  6. - *
  7. Check bidi: Possibly check for right-to-left characters, and if - * any are found, make sure that the whole string satisfies the - * requirements for bidirectional strings. If the string does not - * satisfy the requirements for bidirectional strings, return an - * error.
  8. - *
- * @author Ram Viswanadha - * @draft ICU 2.8 - */ -public final class StringPrep { - /** - * Option to prohibit processing of unassigned code points in the input - * - * @see #prepare - * @draft ICU 2.8 - */ - public static final int DEFAULT = 0x0000; - - /** - * Option to allow processing of unassigned code points in the input - * - * @see #prepare - * @draft ICU 2.8 - */ - public static final int ALLOW_UNASSIGNED = 0x0001; - - private static final int UNASSIGNED = 0x0000; - private static final int MAP = 0x0001; - private static final int PROHIBITED = 0x0002; - private static final int DELETE = 0x0003; - private static final int TYPE_LIMIT = 0x0004; - - private static final int NORMALIZATION_ON = 0x0001; - private static final int CHECK_BIDI_ON = 0x0002; - - private static final int TYPE_THRESHOLD = 0xFFF0; - private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/ - private static final int MAX_INDEX_TOP_LENGTH = 0x0003; - - /* indexes[] value names */ - private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ - private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */ - private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */ - private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */ - private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */ - private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; - private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; - private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */ - private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */ - - - /** - * Default buffer size of datafile - */ - private static final int DATA_BUFFER_SIZE = 
25000; - - /* Wrappers for Trie implementations */ - private static final class StringPrepTrieImpl implements Trie.DataManipulate{ - private CharTrie sprepTrie = null; - /** - * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's - * data the index array offset of the indexes for that lead surrogate. - * @param property data value for a surrogate from the trie, including - * the folding offset - * @return data offset or 0 if there is no data for the lead surrogate - */ - public int getFoldingOffset(int value){ - return value; - } - } - - // CharTrie implementation for reading the trie data - private StringPrepTrieImpl sprepTrieImpl; - // Indexes read from the data file - private int[] indexes; - // mapping data read from the data file - private char[] mappingData; - // format version of the data file - private byte[] formatVersion; - // the version of Unicode supported by the data file - private VersionInfo sprepUniVer; - // the Unicode version of last entry in the - // NormalizationCorrections.txt file if normalization - // is turned on - private VersionInfo normCorrVer; - // Option to turn on Normalization - private boolean doNFKC; - // Option to turn on checking for BiDi rules - private boolean checkBiDi; - - - private char getCodePointValue(int ch){ - return sprepTrieImpl.sprepTrie.getCodePointValue(ch); - } - - private static VersionInfo getVersionInfo(int comp){ - int micro = comp & 0xFF; - int milli =(comp >> 8) & 0xFF; - int minor =(comp >> 16) & 0xFF; - int major =(comp >> 24) & 0xFF; - return VersionInfo.getInstance(major,minor,milli,micro); - } - private static VersionInfo getVersionInfo(byte[] version){ - if(version.length != 4){ - return null; - } - return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]); - } - /** - * Creates an StringPrep object after reading the input stream. - * The object does not hold a reference to the input steam, so the stream can be - * closed after the method returns. 
- * - * @param inputStream The stream for reading the StringPrep profile binarySun - * @throws IOException - * @draft ICU 2.8 - */ - public StringPrep(InputStream inputStream) throws IOException{ - - BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE); - - StringPrepDataReader reader = new StringPrepDataReader(b); - - // read the indexes - indexes = reader.readIndexes(INDEX_TOP); - - byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]]; - - - //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes - mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2]; - // load the rest of the data and initialize the data members - reader.read(sprepBytes,mappingData); - - sprepTrieImpl = new StringPrepTrieImpl(); - sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl ); - - // get the data format version - formatVersion = reader.getDataFormatVersion(); - - // get the options - doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0); - checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); - sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); - normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); - VersionInfo normUniVer = UCharacter.getUnicodeVersion(); - if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ - normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ - ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ - ){ - throw new IOException("Normalization Correction version not supported"); - } - b.close(); - } - - private static final class Values{ - boolean isIndex; - int value; - int type; - public void reset(){ - isIndex = false; - value = 0; - type = -1; - } - } - - private static final void getValues(char trieWord,Values values){ - 
values.reset(); - if(trieWord == 0){ - /* - * Initial value stored in the mapping table - * just return TYPE_LIMIT .. so that - * the source codepoint is copied to the destination - */ - values.type = TYPE_LIMIT; - }else if(trieWord >= TYPE_THRESHOLD){ - values.type = (trieWord - TYPE_THRESHOLD); - }else{ - /* get the type */ - values.type = MAP; - /* ascertain if the value is index or delta */ - if((trieWord & 0x02)>0){ - values.isIndex = true; - values.value = trieWord >> 2; //mask off the lower 2 bits and shift - - }else{ - values.isIndex = false; - values.value = (trieWord<<16)>>16; - values.value = (values.value >> 2); - - } - - if((trieWord>>2) == MAX_INDEX_VALUE){ - values.type = DELETE; - values.isIndex = false; - values.value = 0; - } - } - } - - - - private StringBuffer map( UCharacterIterator iter, int options) - throws ParseException { - - Values val = new Values(); - char result = 0; - int ch = UCharacterIterator.DONE; - StringBuffer dest = new StringBuffer(); - boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0); - - while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ - - result = getCodePointValue(ch); - getValues(result,val); - - // check if the source codepoint is unassigned - if(val.type == UNASSIGNED && allowUnassigned == false){ - throw new ParseException("An unassigned code point was found in the input " + - iter.getText(), iter.getIndex()); - }else if((val.type == MAP)){ - int index, length; - - if(val.isIndex){ - index = val.value; - if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] && - index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){ - length = 1; - }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] && - index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){ - length = 2; - }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] && - index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){ - length = 3; - }else{ - length = mappingData[index++]; - } - /* copy mapping to destination */ - 
dest.append(mappingData,index,length); - continue; - - }else{ - ch -= val.value; - } - }else if(val.type == DELETE){ - // just consume the codepoint and contine - continue; - } - //copy the source into destination - UTF16.append(dest,ch); - } - - return dest; - } - - - private StringBuffer normalize(StringBuffer src){ - /* - * Option UNORM_BEFORE_PRI_29: - * - * IDNA as interpreted by IETF members (see unicode mailing list 2004H1) - * requires strict adherence to Unicode 3.2 normalization, - * including buggy composition from before fixing Public Review Issue #29. - * Note that this results in some valid but nonsensical text to be - * either corrupted or rejected, depending on the text. - * See http://www.unicode.org/review/resolved-pri.html#pri29 - * See unorm.cpp and cnormtst.c - */ - return new StringBuffer( - Normalizer.normalize( - src.toString(), - java.text.Normalizer.Form.NFKC, - Normalizer.UNICODE_3_2)); - } - /* - boolean isLabelSeparator(int ch){ - int result = getCodePointValue(ch); - if( (result & 0x07) == LABEL_SEPARATOR){ - return true; - } - return false; - } - */ - /* - 1) Map -- For each character in the input, check if it has a mapping - and, if so, replace it with its mapping. - - 2) Normalize -- Possibly normalize the result of step 1 using Unicode - normalization. - - 3) Prohibit -- Check for any characters that are not allowed in the - output. If any are found, return an error. - - 4) Check bidi -- Possibly check for right-to-left characters, and if - any are found, make sure that the whole string satisfies the - requirements for bidirectional strings. If the string does not - satisfy the requirements for bidirectional strings, return an - error. - [Unicode3.2] defines several bidirectional categories; each character - has one bidirectional category assigned to it. 
For the purposes of - the requirements below, an "RandALCat character" is a character that - has Unicode bidirectional categories "R" or "AL"; an "LCat character" - is a character that has Unicode bidirectional category "L". Note - - - that there are many characters which fall in neither of the above - definitions; Latin digits ( through ) are examples of - this because they have bidirectional category "EN". - - In any profile that specifies bidirectional character handling, all - three of the following requirements MUST be met: - - 1) The characters in section 5.8 MUST be prohibited. - - 2) If a string contains any RandALCat character, the string MUST NOT - contain any LCat character. - - 3) If a string contains any RandALCat character, a RandALCat - character MUST be the first character of the string, and a - RandALCat character MUST be the last character of the string. - */ - /** - * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), - * checks for prohited and BiDi characters in the order defined by RFC 3454 - * depending on the options specified in the profile. - * - * @param src A UCharacterIterator object containing the source string - * @param options A bit set of options: - * - * - StringPrep.NONE Prohibit processing of unassigned code points in the input - * - * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input - * as normal Unicode code points. 
- * - * @return StringBuffer A StringBuffer containing the output - * @throws ParseException - * @draft ICU 2.8 - */ - public StringBuffer prepare(UCharacterIterator src, int options) - throws ParseException{ - - // map - StringBuffer mapOut = map(src,options); - StringBuffer normOut = mapOut;// initialize - - if(doNFKC){ - // normalize - normOut = normalize(mapOut); - } - - int ch; - char result; - UCharacterIterator iter = UCharacterIterator.getInstance(normOut); - Values val = new Values(); - int direction=UCharacterDirection.CHAR_DIRECTION_COUNT, - firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT; - int rtlPos=-1, ltrPos=-1; - boolean rightToLeft=false, leftToRight=false; - - while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ - result = getCodePointValue(ch); - getValues(result,val); - - if(val.type == PROHIBITED ){ - throw new ParseException("A prohibited code point was found in the input" + - iter.getText(), val.value); - } - - direction = UCharacter.getDirection(ch); - if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){ - firstCharDir = direction; - } - if(direction == UCharacterDirection.LEFT_TO_RIGHT){ - leftToRight = true; - ltrPos = iter.getIndex()-1; - } - if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){ - rightToLeft = true; - rtlPos = iter.getIndex()-1; - } - } - if(checkBiDi == true){ - // satisfy 2 - if( leftToRight == true && rightToLeft == true){ - throw new ParseException("The input does not conform to the rules for BiDi code points." + - iter.getText(), - (rtlPos>ltrPos) ? rtlPos : ltrPos); - } - - //satisfy 3 - if( rightToLeft == true && - !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && - (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)) - ){ - throw new ParseException("The input does not conform to the rules for BiDi code points." 
+ - iter.getText(), - (rtlPos>ltrPos) ? rtlPos : ltrPos); - } - } - return normOut; - - } -} --- /dev/null 2020-01-10 15:58:04.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/StringPrep.java 2020-01-10 15:58:04.000000000 -0800 @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* +/* + ******************************************************************************* + * Copyright (C) 2003-2004, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +// +// CHANGELOG +// 2005-05-19 Edward Wang +// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java +// - move from package com.ibm.icu.text to package sun.net.idn +// - use ParseException instead of StringPrepParseException +// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()' +// - remove all @deprecated tag to make compiler happy +// 2007-08-14 Martin Buchholz +// - remove redundant casts +// +package jdk.internal.icu.text; + +import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; + +import sun.text.Normalizer; +import jdk.internal.icu.impl.CharTrie; +import jdk.internal.icu.impl.StringPrepDataReader; +import jdk.internal.icu.impl.Trie; +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.lang.UCharacterDirection; +import jdk.internal.icu.util.VersionInfo; + +/** + * StringPrep API implements the StingPrep framework as described by + * RFC 3454. + * StringPrep prepares Unicode strings for use in network protocols. + * Profiles of StingPrep are set of rules and data according to which the + * Unicode Strings are prepared. Each profiles contains tables which describe + * how a code point should be treated. The tables are broadly classied into + *
    + *
  • Unassigned Table: Contains code points that are unassigned + * in the Unicode Version supported by StringPrep. Currently + * RFC 3454 supports Unicode 3.2.
  • + *
  • Prohibited Table: Contains code points that are prohibited from + * the output of the StringPrep processing function.
  • + *
  • Mapping Table: Contains code points that are deleted from the output or case mapped.
  • + *
+ * + * The procedure for preparing Unicode strings: + *
    + *
  1. Map: For each character in the input, check if it has a mapping + * and, if so, replace it with its mapping.
  2. + *
  3. Normalize: Possibly normalize the result of step 1 using Unicode + * normalization.
  4. + *
  5. Prohibit: Check for any characters that are not allowed in the + * output. If any are found, return an error.
  6. + *
  7. Check bidi: Possibly check for right-to-left characters, and if + * any are found, make sure that the whole string satisfies the + * requirements for bidirectional strings. If the string does not + * satisfy the requirements for bidirectional strings, return an + * error.
  8. + *
+ * @author Ram Viswanadha + * @draft ICU 2.8 + */ +public final class StringPrep { + /** + * Option to prohibit processing of unassigned code points in the input + * + * @see #prepare + * @draft ICU 2.8 + */ + public static final int DEFAULT = 0x0000; + + /** + * Option to allow processing of unassigned code points in the input + * + * @see #prepare + * @draft ICU 2.8 + */ + public static final int ALLOW_UNASSIGNED = 0x0001; + + private static final int UNASSIGNED = 0x0000; + private static final int MAP = 0x0001; + private static final int PROHIBITED = 0x0002; + private static final int DELETE = 0x0003; + private static final int TYPE_LIMIT = 0x0004; + + private static final int NORMALIZATION_ON = 0x0001; + private static final int CHECK_BIDI_ON = 0x0002; + + private static final int TYPE_THRESHOLD = 0xFFF0; + private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/ + private static final int MAX_INDEX_TOP_LENGTH = 0x0003; + + /* indexes[] value names */ + private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ + private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */ + private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */ + private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */ + private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */ + private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; + private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; + private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */ + private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */ + + + /** + * Default buffer size of datafile + */ + private static final int DATA_BUFFER_SIZE = 
25000; + + /* Wrappers for Trie implementations */ + private static final class StringPrepTrieImpl implements Trie.DataManipulate{ + private CharTrie sprepTrie = null; + /** + * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's + * data the index array offset of the indexes for that lead surrogate. + * @param property data value for a surrogate from the trie, including + * the folding offset + * @return data offset or 0 if there is no data for the lead surrogate + */ + public int getFoldingOffset(int value){ + return value; + } + } + + // CharTrie implementation for reading the trie data + private StringPrepTrieImpl sprepTrieImpl; + // Indexes read from the data file + private int[] indexes; + // mapping data read from the data file + private char[] mappingData; + // format version of the data file + private byte[] formatVersion; + // the version of Unicode supported by the data file + private VersionInfo sprepUniVer; + // the Unicode version of last entry in the + // NormalizationCorrections.txt file if normalization + // is turned on + private VersionInfo normCorrVer; + // Option to turn on Normalization + private boolean doNFKC; + // Option to turn on checking for BiDi rules + private boolean checkBiDi; + + + private char getCodePointValue(int ch){ + return sprepTrieImpl.sprepTrie.getCodePointValue(ch); + } + + private static VersionInfo getVersionInfo(int comp){ + int micro = comp & 0xFF; + int milli =(comp >> 8) & 0xFF; + int minor =(comp >> 16) & 0xFF; + int major =(comp >> 24) & 0xFF; + return VersionInfo.getInstance(major,minor,milli,micro); + } + private static VersionInfo getVersionInfo(byte[] version){ + if(version.length != 4){ + return null; + } + return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]); + } + /** + * Creates an StringPrep object after reading the input stream. + * The object does not hold a reference to the input steam, so the stream can be + * closed after the method returns. 
+ * + * @param inputStream The stream for reading the StringPrep profile binarySun + * @throws IOException + * @draft ICU 2.8 + */ + public StringPrep(InputStream inputStream) throws IOException{ + + BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE); + + StringPrepDataReader reader = new StringPrepDataReader(b); + + // read the indexes + indexes = reader.readIndexes(INDEX_TOP); + + byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]]; + + + //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes + mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2]; + // load the rest of the data and initialize the data members + reader.read(sprepBytes,mappingData); + + sprepTrieImpl = new StringPrepTrieImpl(); + sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl ); + + // get the data format version + formatVersion = reader.getDataFormatVersion(); + + // get the options + doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0); + checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); + sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); + normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); + VersionInfo normUniVer = UCharacter.getUnicodeVersion(); + if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ + normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ + ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ + ){ + throw new IOException("Normalization Correction version not supported"); + } + b.close(); + } + + private static final class Values{ + boolean isIndex; + int value; + int type; + public void reset(){ + isIndex = false; + value = 0; + type = -1; + } + } + + private static final void getValues(char trieWord,Values values){ + 
values.reset(); + if(trieWord == 0){ + /* + * Initial value stored in the mapping table + * just return TYPE_LIMIT .. so that + * the source codepoint is copied to the destination + */ + values.type = TYPE_LIMIT; + }else if(trieWord >= TYPE_THRESHOLD){ + values.type = (trieWord - TYPE_THRESHOLD); + }else{ + /* get the type */ + values.type = MAP; + /* ascertain if the value is index or delta */ + if((trieWord & 0x02)>0){ + values.isIndex = true; + values.value = trieWord >> 2; //mask off the lower 2 bits and shift + + }else{ + values.isIndex = false; + values.value = (trieWord<<16)>>16; + values.value = (values.value >> 2); + + } + + if((trieWord>>2) == MAX_INDEX_VALUE){ + values.type = DELETE; + values.isIndex = false; + values.value = 0; + } + } + } + + + + private StringBuffer map( UCharacterIterator iter, int options) + throws ParseException { + + Values val = new Values(); + char result = 0; + int ch = UCharacterIterator.DONE; + StringBuffer dest = new StringBuffer(); + boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0); + + while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ + + result = getCodePointValue(ch); + getValues(result,val); + + // check if the source codepoint is unassigned + if(val.type == UNASSIGNED && allowUnassigned == false){ + throw new ParseException("An unassigned code point was found in the input " + + iter.getText(), iter.getIndex()); + }else if((val.type == MAP)){ + int index, length; + + if(val.isIndex){ + index = val.value; + if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] && + index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){ + length = 1; + }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] && + index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){ + length = 2; + }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] && + index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){ + length = 3; + }else{ + length = mappingData[index++]; + } + /* copy mapping to destination */ + 
dest.append(mappingData,index,length); + continue; + + }else{ + ch -= val.value; + } + }else if(val.type == DELETE){ + // just consume the codepoint and contine + continue; + } + //copy the source into destination + UTF16.append(dest,ch); + } + + return dest; + } + + + private StringBuffer normalize(StringBuffer src){ + /* + * Option UNORM_BEFORE_PRI_29: + * + * IDNA as interpreted by IETF members (see unicode mailing list 2004H1) + * requires strict adherence to Unicode 3.2 normalization, + * including buggy composition from before fixing Public Review Issue #29. + * Note that this results in some valid but nonsensical text to be + * either corrupted or rejected, depending on the text. + * See http://www.unicode.org/review/resolved-pri.html#pri29 + * See unorm.cpp and cnormtst.c + */ + return new StringBuffer( + Normalizer.normalize( + src.toString(), + java.text.Normalizer.Form.NFKC, + Normalizer.UNICODE_3_2)); + } + /* + boolean isLabelSeparator(int ch){ + int result = getCodePointValue(ch); + if( (result & 0x07) == LABEL_SEPARATOR){ + return true; + } + return false; + } + */ + /* + 1) Map -- For each character in the input, check if it has a mapping + and, if so, replace it with its mapping. + + 2) Normalize -- Possibly normalize the result of step 1 using Unicode + normalization. + + 3) Prohibit -- Check for any characters that are not allowed in the + output. If any are found, return an error. + + 4) Check bidi -- Possibly check for right-to-left characters, and if + any are found, make sure that the whole string satisfies the + requirements for bidirectional strings. If the string does not + satisfy the requirements for bidirectional strings, return an + error. + [Unicode3.2] defines several bidirectional categories; each character + has one bidirectional category assigned to it. 
For the purposes of + the requirements below, an "RandALCat character" is a character that + has Unicode bidirectional categories "R" or "AL"; an "LCat character" + is a character that has Unicode bidirectional category "L". Note + + + that there are many characters which fall in neither of the above + definitions; Latin digits ( through ) are examples of + this because they have bidirectional category "EN". + + In any profile that specifies bidirectional character handling, all + three of the following requirements MUST be met: + + 1) The characters in section 5.8 MUST be prohibited. + + 2) If a string contains any RandALCat character, the string MUST NOT + contain any LCat character. + + 3) If a string contains any RandALCat character, a RandALCat + character MUST be the first character of the string, and a + RandALCat character MUST be the last character of the string. + */ + /** + * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), + * checks for prohited and BiDi characters in the order defined by RFC 3454 + * depending on the options specified in the profile. + * + * @param src A UCharacterIterator object containing the source string + * @param options A bit set of options: + * + * - StringPrep.NONE Prohibit processing of unassigned code points in the input + * + * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points are in the input + * as normal Unicode code points. 
+ * + * @return StringBuffer A StringBuffer containing the output + * @throws ParseException + * @draft ICU 2.8 + */ + public StringBuffer prepare(UCharacterIterator src, int options) + throws ParseException{ + + // map + StringBuffer mapOut = map(src,options); + StringBuffer normOut = mapOut;// initialize + + if(doNFKC){ + // normalize + normOut = normalize(mapOut); + } + + int ch; + char result; + UCharacterIterator iter = UCharacterIterator.getInstance(normOut); + Values val = new Values(); + int direction=UCharacterDirection.CHAR_DIRECTION_COUNT, + firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT; + int rtlPos=-1, ltrPos=-1; + boolean rightToLeft=false, leftToRight=false; + + while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ + result = getCodePointValue(ch); + getValues(result,val); + + if(val.type == PROHIBITED ){ + throw new ParseException("A prohibited code point was found in the input" + + iter.getText(), val.value); + } + + direction = UCharacter.getDirection(ch); + if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){ + firstCharDir = direction; + } + if(direction == UCharacterDirection.LEFT_TO_RIGHT){ + leftToRight = true; + ltrPos = iter.getIndex()-1; + } + if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){ + rightToLeft = true; + rtlPos = iter.getIndex()-1; + } + } + if(checkBiDi == true){ + // satisfy 2 + if( leftToRight == true && rightToLeft == true){ + throw new ParseException("The input does not conform to the rules for BiDi code points." + + iter.getText(), + (rtlPos>ltrPos) ? rtlPos : ltrPos); + } + + //satisfy 3 + if( rightToLeft == true && + !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && + (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)) + ){ + throw new ParseException("The input does not conform to the rules for BiDi code points." 
+ + iter.getText(), + (rtlPos>ltrPos) ? rtlPos : ltrPos); + } + } + return normOut; + + } +} --- old/src/java.base/share/classes/sun/text/normalizer/UCharacterIterator.java 2020-01-10 15:58:06.000000000 -0800 +++ /dev/null 2020-01-10 15:58:06.000000000 -0800 @@ -1,313 +0,0 @@ -/* - * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 1996-2014, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.text.CharacterIterator; - -/** - * Abstract class that defines an API for iteration on text objects.This is an - * interface for forward and backward iteration and random access into a text - * object. Forward iteration is done with post-increment and backward iteration - * is done with pre-decrement semantics, while the - * java.text.CharacterIterator interface methods provided forward - * iteration with "pre-increment" and backward iteration with pre-decrement - * semantics. This API is more efficient for forward iteration over code points. - * The other major difference is that this API can do both code unit and code point - * iteration, java.text.CharacterIterator can only iterate over - * code units and is limited to BMP (0 - 0xFFFF) - * @author Ram - * @stable ICU 2.4 - */ -public abstract class UCharacterIterator - implements Cloneable { - - /** - * Protected default constructor for the subclasses - * @stable ICU 2.4 - */ - protected UCharacterIterator(){ - } - - /** - * Indicator that we have reached the ends of the UTF16 text. - * Moved from UForwardCharacterIterator.java - * @stable ICU 2.4 - */ - public static final int DONE = -1; - - // static final methods ---------------------------------------------------- - - /** - * Returns a UCharacterIterator object given a - * source string. - * @param source a string - * @return UCharacterIterator object - * @exception IllegalArgumentException if the argument is null - * @stable ICU 2.4 - */ - public static final UCharacterIterator getInstance(String source){ - return new ReplaceableUCharacterIterator(source); - } - - /** - * Returns a UCharacterIterator object given a - * source StringBuffer. 
- * @param source an string buffer of UTF-16 code units - * @return UCharacterIterator object - * @exception IllegalArgumentException if the argument is null - * @stable ICU 2.4 - */ - public static final UCharacterIterator getInstance(StringBuffer source){ - return new ReplaceableUCharacterIterator(source); - } - - /** - * Returns a UCharacterIterator object given a - * CharacterIterator. - * @param source a valid CharacterIterator object. - * @return UCharacterIterator object - * @exception IllegalArgumentException if the argument is null - * @stable ICU 2.4 - */ - public static final UCharacterIterator getInstance(CharacterIterator source){ - return new CharacterIteratorWrapper(source); - } - - // public methods ---------------------------------------------------------- - - /** - * Returns the length of the text - * @return length of the text - * @stable ICU 2.4 - */ - public abstract int getLength(); - - /** - * Gets the current index in text. - * @return current index in text. - * @stable ICU 2.4 - */ - public abstract int getIndex(); - - /** - * Returns the UTF16 code unit at index, and increments to the next - * code unit (post-increment semantics). If index is out of - * range, DONE is returned, and the iterator is reset to the limit - * of the text. - * @return the next UTF16 code unit, or DONE if the index is at the limit - * of the text. - * @stable ICU 2.4 - */ - public abstract int next(); - - /** - * Returns the code point at index, and increments to the next code - * point (post-increment semantics). If index does not point to a - * valid surrogate pair, the behavior is the same as - * next(). Otherwise the iterator is incremented past - * the surrogate pair, and the code point represented by the pair - * is returned. - * @return the next codepoint in text, or DONE if the index is at - * the limit of the text. 
- * @stable ICU 2.4 - */ - public int nextCodePoint(){ - int ch1 = next(); - if(UTF16.isLeadSurrogate((char)ch1)){ - int ch2 = next(); - if(UTF16.isTrailSurrogate((char)ch2)){ - return UCharacterProperty.getRawSupplementary((char)ch1, - (char)ch2); - }else if (ch2 != DONE) { - // unmatched surrogate so back out - previous(); - } - } - return ch1; - } - - /** - * Decrement to the position of the previous code unit in the - * text, and return it (pre-decrement semantics). If the - * resulting index is less than 0, the index is reset to 0 and - * DONE is returned. - * @return the previous code unit in the text, or DONE if the new - * index is before the start of the text. - * @stable ICU 2.4 - */ - public abstract int previous(); - - - /** - * Retreat to the start of the previous code point in the text, - * and return it (pre-decrement semantics). If the index is not - * preceeded by a valid surrogate pair, the behavior is the same - * as previous(). Otherwise the iterator is - * decremented to the start of the surrogate pair, and the code - * point represented by the pair is returned. - * @return the previous code point in the text, or DONE if the new - * index is before the start of the text. - * @stable ICU 2.4 - */ - public int previousCodePoint(){ - int ch1 = previous(); - if(UTF16.isTrailSurrogate((char)ch1)){ - int ch2 = previous(); - if(UTF16.isLeadSurrogate((char)ch2)){ - return UCharacterProperty.getRawSupplementary((char)ch2, - (char)ch1); - }else if (ch2 != DONE) { - //unmatched trail surrogate so back out - next(); - } - } - return ch1; - } - - /** - * Sets the index to the specified index in the text. - * @param index the index within the text. - * @exception IndexOutOfBoundsException is thrown if an invalid index is - * supplied - * @stable ICU 2.4 - */ - public abstract void setIndex(int index); - - /** - * Sets the current index to the start. 
- * @stable ICU 2.4 - */ - public void setToStart() { - setIndex(0); - } - - /** - * Fills the buffer with the underlying text storage of the iterator - * If the buffer capacity is not enough a exception is thrown. The capacity - * of the fill in buffer should at least be equal to length of text in the - * iterator obtained by calling getLength(). - * Usage: - * - *
{@code
-     *         UChacterIterator iter = new UCharacterIterator.getInstance(text);
-     *         char[] buf = new char[iter.getLength()];
-     *         iter.getText(buf);
-     *
-     *         OR
-     *         char[] buf= new char[1];
-     *         int len = 0;
-     *         for(;;){
-     *             try{
-     *                 len = iter.getText(buf);
-     *                 break;
-     *             }catch(IndexOutOfBoundsException e){
-     *                 buf = new char[iter.getLength()];
-     *             }
-     *         }
-     * }
- * - * @param fillIn an array of chars to fill with the underlying UTF-16 code - * units. - * @param offset the position within the array to start putting the data. - * @return the number of code units added to fillIn, as a convenience - * @exception IndexOutOfBoundsException exception if there is not enough - * room after offset in the array, or if offset < 0. - * @stable ICU 2.4 - */ - public abstract int getText(char[] fillIn, int offset); - - /** - * Convenience override for getText(char[], int) that provides - * an offset of 0. - * @param fillIn an array of chars to fill with the underlying UTF-16 code - * units. - * @return the number of code units added to fillIn, as a convenience - * @exception IndexOutOfBoundsException exception if there is not enough - * room in the array. - * @stable ICU 2.4 - */ - public final int getText(char[] fillIn) { - return getText(fillIn, 0); - } - - /** - * Convenience method for returning the underlying text storage as a string - * @return the underlying text storage in the iterator as a string - * @stable ICU 2.4 - */ - public String getText() { - char[] text = new char[getLength()]; - getText(text); - return new String(text); - } - - /** - * Moves the current position by the number of code points - * specified, either forward or backward depending on the sign of - * delta (positive or negative respectively). If the current index - * is at a trail surrogate then the first adjustment is by code - * unit, and the remaining adjustments are by code points. If the - * resulting index would be less than zero, the index is set to - * zero, and if the resulting index would be greater than limit, - * the index is set to limit. - * @param delta the number of code units to move the current index. 
- * @return the new index - * @exception IndexOutOfBoundsException is thrown if an invalid delta is - * supplied - * @stable ICU 2.4 - * - */ - public int moveCodePointIndex(int delta){ - if(delta>0){ - while(delta>0 && nextCodePoint() != DONE){delta--;} - }else{ - while(delta<0 && previousCodePoint() != DONE){delta++;} - } - if(delta!=0){ - throw new IndexOutOfBoundsException(); - } - - return getIndex(); - } - - /** - * Creates a copy of this iterator, independent from other iterators. - * If it is not possible to clone the iterator, returns null. - * @return copy of this iterator - * @stable ICU 2.4 - */ - public Object clone() throws CloneNotSupportedException{ - return super.clone(); - } - -} --- /dev/null 2020-01-10 15:58:06.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/UCharacterIterator.java 2020-01-10 15:58:05.000000000 -0800 @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 1996-2014, International Business Machines Corporation and * + * others. All Rights Reserved. * + ******************************************************************************* + */ + +package jdk.internal.icu.text; + +import jdk.internal.icu.impl.CharacterIteratorWrapper; +import jdk.internal.icu.impl.ReplaceableUCharacterIterator; +import jdk.internal.icu.impl.UCharacterProperty; + +import java.text.CharacterIterator; + +/** + * Abstract class that defines an API for iteration on text objects.This is an + * interface for forward and backward iteration and random access into a text + * object. Forward iteration is done with post-increment and backward iteration + * is done with pre-decrement semantics, while the + * java.text.CharacterIterator interface methods provided forward + * iteration with "pre-increment" and backward iteration with pre-decrement + * semantics. This API is more efficient for forward iteration over code points. + * The other major difference is that this API can do both code unit and code point + * iteration, java.text.CharacterIterator can only iterate over + * code units and is limited to BMP (0 - 0xFFFF) + * @author Ram + * @stable ICU 2.4 + */ +public abstract class UCharacterIterator + implements Cloneable { + + /** + * Protected default constructor for the subclasses + * @stable ICU 2.4 + */ + protected UCharacterIterator(){ + } + + /** + * Indicator that we have reached the ends of the UTF16 text. + * Moved from UForwardCharacterIterator.java + * @stable ICU 2.4 + */ + public static final int DONE = -1; + + // static final methods ---------------------------------------------------- + + /** + * Returns a UCharacterIterator object given a + * source string. 
+ * @param source a string + * @return UCharacterIterator object + * @exception IllegalArgumentException if the argument is null + * @stable ICU 2.4 + */ + public static final UCharacterIterator getInstance(String source){ + return new ReplaceableUCharacterIterator(source); + } + + /** + * Returns a UCharacterIterator object given a + * source StringBuffer. + * @param source an string buffer of UTF-16 code units + * @return UCharacterIterator object + * @exception IllegalArgumentException if the argument is null + * @stable ICU 2.4 + */ + public static final UCharacterIterator getInstance(StringBuffer source){ + return new ReplaceableUCharacterIterator(source); + } + + /** + * Returns a UCharacterIterator object given a + * CharacterIterator. + * @param source a valid CharacterIterator object. + * @return UCharacterIterator object + * @exception IllegalArgumentException if the argument is null + * @stable ICU 2.4 + */ + public static final UCharacterIterator getInstance(CharacterIterator source){ + return new CharacterIteratorWrapper(source); + } + + // public methods ---------------------------------------------------------- + + /** + * Returns the length of the text + * @return length of the text + * @stable ICU 2.4 + */ + public abstract int getLength(); + + /** + * Gets the current index in text. + * @return current index in text. + * @stable ICU 2.4 + */ + public abstract int getIndex(); + + /** + * Returns the UTF16 code unit at index, and increments to the next + * code unit (post-increment semantics). If index is out of + * range, DONE is returned, and the iterator is reset to the limit + * of the text. + * @return the next UTF16 code unit, or DONE if the index is at the limit + * of the text. + * @stable ICU 2.4 + */ + public abstract int next(); + + /** + * Returns the code point at index, and increments to the next code + * point (post-increment semantics). If index does not point to a + * valid surrogate pair, the behavior is the same as + * next(). 
Otherwise the iterator is incremented past + * the surrogate pair, and the code point represented by the pair + * is returned. + * @return the next codepoint in text, or DONE if the index is at + * the limit of the text. + * @stable ICU 2.4 + */ + public int nextCodePoint(){ + int ch1 = next(); + if(UTF16.isLeadSurrogate((char)ch1)){ + int ch2 = next(); + if(UTF16.isTrailSurrogate((char)ch2)){ + return UCharacterProperty.getRawSupplementary((char)ch1, + (char)ch2); + }else if (ch2 != DONE) { + // unmatched surrogate so back out + previous(); + } + } + return ch1; + } + + /** + * Decrement to the position of the previous code unit in the + * text, and return it (pre-decrement semantics). If the + * resulting index is less than 0, the index is reset to 0 and + * DONE is returned. + * @return the previous code unit in the text, or DONE if the new + * index is before the start of the text. + * @stable ICU 2.4 + */ + public abstract int previous(); + + + /** + * Retreat to the start of the previous code point in the text, + * and return it (pre-decrement semantics). If the index is not + * preceeded by a valid surrogate pair, the behavior is the same + * as previous(). Otherwise the iterator is + * decremented to the start of the surrogate pair, and the code + * point represented by the pair is returned. + * @return the previous code point in the text, or DONE if the new + * index is before the start of the text. + * @stable ICU 2.4 + */ + public int previousCodePoint(){ + int ch1 = previous(); + if(UTF16.isTrailSurrogate((char)ch1)){ + int ch2 = previous(); + if(UTF16.isLeadSurrogate((char)ch2)){ + return UCharacterProperty.getRawSupplementary((char)ch2, + (char)ch1); + }else if (ch2 != DONE) { + //unmatched trail surrogate so back out + next(); + } + } + return ch1; + } + + /** + * Sets the index to the specified index in the text. + * @param index the index within the text. 
+ * @exception IndexOutOfBoundsException is thrown if an invalid index is + * supplied + * @stable ICU 2.4 + */ + public abstract void setIndex(int index); + + /** + * Sets the current index to the start. + * @stable ICU 2.4 + */ + public void setToStart() { + setIndex(0); + } + + /** + * Fills the buffer with the underlying text storage of the iterator + * If the buffer capacity is not enough a exception is thrown. The capacity + * of the fill in buffer should at least be equal to length of text in the + * iterator obtained by calling getLength(). + * Usage: + * + *
{@code
+     *         UCharacterIterator iter = UCharacterIterator.getInstance(text);
+     *         char[] buf = new char[iter.getLength()];
+     *         iter.getText(buf);
+     *
+     *         OR
+     *         char[] buf= new char[1];
+     *         int len = 0;
+     *         for(;;){
+     *             try{
+     *                 len = iter.getText(buf);
+     *                 break;
+     *             }catch(IndexOutOfBoundsException e){
+     *                 buf = new char[iter.getLength()];
+     *             }
+     *         }
+     * }
+ * + * @param fillIn an array of chars to fill with the underlying UTF-16 code + * units. + * @param offset the position within the array to start putting the data. + * @return the number of code units added to fillIn, as a convenience + * @exception IndexOutOfBoundsException exception if there is not enough + * room after offset in the array, or if offset < 0. + * @stable ICU 2.4 + */ + public abstract int getText(char[] fillIn, int offset); + + /** + * Convenience override for getText(char[], int) that provides + * an offset of 0. + * @param fillIn an array of chars to fill with the underlying UTF-16 code + * units. + * @return the number of code units added to fillIn, as a convenience + * @exception IndexOutOfBoundsException exception if there is not enough + * room in the array. + * @stable ICU 2.4 + */ + public final int getText(char[] fillIn) { + return getText(fillIn, 0); + } + + /** + * Convenience method for returning the underlying text storage as a string + * @return the underlying text storage in the iterator as a string + * @stable ICU 2.4 + */ + public String getText() { + char[] text = new char[getLength()]; + getText(text); + return new String(text); + } + + /** + * Moves the current position by the number of code points + * specified, either forward or backward depending on the sign of + * delta (positive or negative respectively). If the current index + * is at a trail surrogate then the first adjustment is by code + * unit, and the remaining adjustments are by code points. If the + * resulting index would be less than zero, the index is set to + * zero, and if the resulting index would be greater than limit, + * the index is set to limit. + * @param delta the number of code units to move the current index. 
+ * @return the new index + * @exception IndexOutOfBoundsException is thrown if an invalid delta is + * supplied + * @stable ICU 2.4 + * + */ + public int moveCodePointIndex(int delta){ + if(delta>0){ + while(delta>0 && nextCodePoint() != DONE){delta--;} + }else{ + while(delta<0 && previousCodePoint() != DONE){delta++;} + } + if(delta!=0){ + throw new IndexOutOfBoundsException(); + } + + return getIndex(); + } + + /** + * Creates a copy of this iterator, independent from other iterators. + * If it is not possible to clone the iterator, returns null. + * @return copy of this iterator + * @stable ICU 2.4 + */ + public Object clone() throws CloneNotSupportedException{ + return super.clone(); + } + +} --- old/src/java.base/share/classes/sun/text/normalizer/UTF16.java 2020-01-10 15:58:07.000000000 -0800 +++ /dev/null 2020-01-10 15:58:07.000000000 -0800 @@ -1,616 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/** - ******************************************************************************* - * Copyright (C) 1996-2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ - -package sun.text.normalizer; - -/** - *

Standalone utility class providing UTF16 character conversions and - * indexing conversions. - *

Code that uses strings alone rarely need modification. - * By design, UTF-16 does not allow overlap, so searching for strings is a safe - * operation. Similarly, concatenation is always safe. Substringing is safe if - * the start and end are both on UTF-32 boundaries. In normal code, the values - * for start and end are on those boundaries, since they arose from operations - * like searching. If not, the nearest UTF-32 boundaries can be determined - * using bounds(). - * Examples: - *

The following examples illustrate use of some of these methods. - *

{@code
- * // iteration forwards: Original
- * for (int i = 0; i < s.length(); ++i) {
- *     char ch = s.charAt(i);
- *     doSomethingWith(ch);
- * }
- *
- * // iteration forwards: Changes for UTF-32
- * int ch;
- * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
- *     ch = UTF16.charAt(s, i);
- *     doSomethingWith(ch);
- * }
- *
- * // iteration backwards: Original
- * for (int i = s.length() - 1; i >= 0; --i) {
- *     char ch = s.charAt(i);
- *     doSomethingWith(ch);
- * }
- *
- * // iteration backwards: Changes for UTF-32
- * int ch;
- * for (int i = s.length() - 1; i > 0; i -= UTF16.getCharCount(ch)) {
- *     ch = UTF16.charAt(s, i);
- *     doSomethingWith(ch);
- * }
- * }
- * Notes: - *
    - *
  • - * Naming: For clarity, High and Low surrogates are called - * Lead and Trail in the API, which gives a better - * sense of their ordering in a string. offset16 and - * offset32 are used to distinguish offsets to UTF-16 - * boundaries vs offsets to UTF-32 boundaries. int char32 is - * used to contain UTF-32 characters, as opposed to char16, - * which is a UTF-16 code unit. - *
  • - *
  • - * Roundtripping Offsets: You can always roundtrip from a - * UTF-32 offset to a UTF-16 offset and back. Because of the difference in - * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and - * back if and only if bounds(string, offset16) != TRAIL. - *
  • - *
  • - * Exceptions: The error checking will throw an exception - * if indices are out of bounds. Other than that, all methods will - * behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 - * values are present. UCharacter.isLegal() can be used to check - * for validity if desired. - *
  • - *
  • - * Unmatched Surrogates: If the string contains unmatched - * surrogates, then these are counted as one UTF-32 value. This matches - * their iteration behavior, which is vital. It also matches common display - * practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5). - *
  • - *
  • - * Optimization: The method implementations may need - * optimization if the compiler doesn't fold static final methods. Since - * surrogate pairs will form an exceeding small percentage of all the text - * in the world, the singleton case should always be optimized for. - *
  • - *
- * @author Mark Davis, with help from Markus Scherer - * @stable ICU 2.1 - */ - -public final class UTF16 -{ - // public variables --------------------------------------------------- - - /** - * The lowest Unicode code point value. - * @stable ICU 2.1 - */ - public static final int CODEPOINT_MIN_VALUE = 0; - /** - * The highest Unicode code point value (scalar value) according to the - * Unicode Standard. - * @stable ICU 2.1 - */ - public static final int CODEPOINT_MAX_VALUE = 0x10ffff; - /** - * The minimum value for Supplementary code points - * @stable ICU 2.1 - */ - public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; - /** - * Lead surrogate minimum value - * @stable ICU 2.1 - */ - public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; - /** - * Trail surrogate minimum value - * @stable ICU 2.1 - */ - public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; - /** - * Lead surrogate maximum value - * @stable ICU 2.1 - */ - public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; - /** - * Trail surrogate maximum value - * @stable ICU 2.1 - */ - public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; - /** - * Surrogate minimum value - * @stable ICU 2.1 - */ - public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; - /** - * Lead surrogate bitmask - */ - private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; - /** - * Trail surrogate bitmask - */ - private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; - /** - * Surrogate bitmask - */ - private static final int SURROGATE_BITMASK = 0xFFFFF800; - /** - * Lead surrogate bits - */ - private static final int LEAD_SURROGATE_BITS = 0xD800; - /** - * Trail surrogate bits - */ - private static final int TRAIL_SURROGATE_BITS = 0xDC00; - /** - * Surrogate bits - */ - private static final int SURROGATE_BITS = 0xD800; - - // constructor -------------------------------------------------------- - - // /CLOVER:OFF - /** - * Prevent instance from being created. 
- */ - private UTF16() { - } - - // /CLOVER:ON - // public method ------------------------------------------------------ - - /** - * Extract a single UTF-32 value from a string. - * Used when iterating forwards or backwards (with - * UTF16.getCharCount(), as well as random access. If a - * validity check is required, use - * - * UCharacter.isLegal() on the return value. - * If the char retrieved is part of a surrogate pair, its supplementary - * character will be returned. If a complete supplementary character is - * not found the incomplete character will be returned - * @param source array of UTF-16 chars - * @param offset16 UTF-16 offset to the start of the character. - * @return UTF-32 value for the UTF-32 value that contains the char at - * offset16. The boundaries of that codepoint are the same as in - * bounds32(). - * @exception IndexOutOfBoundsException thrown if offset16 is out of - * bounds. - * @stable ICU 2.1 - */ - public static int charAt(String source, int offset16) { - char single = source.charAt(offset16); - if (single < LEAD_SURROGATE_MIN_VALUE) { - return single; - } - return _charAt(source, offset16, single); - } - - private static int _charAt(String source, int offset16, char single) { - if (single > TRAIL_SURROGATE_MAX_VALUE) { - return single; - } - - // Convert the UTF-16 surrogate pair if necessary. - // For simplicity in usage, and because the frequency of pairs is - // low, look both directions. 
- - if (single <= LEAD_SURROGATE_MAX_VALUE) { - ++offset16; - if (source.length() != offset16) { - char trail = source.charAt(offset16); - if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { - return UCharacterProperty.getRawSupplementary(single, trail); - } - } - } else { - --offset16; - if (offset16 >= 0) { - // single is a trail surrogate so - char lead = source.charAt(offset16); - if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { - return UCharacterProperty.getRawSupplementary(lead, single); - } - } - } - return single; // return unmatched surrogate - } - - /** - * Extract a single UTF-32 value from a string. - * Used when iterating forwards or backwards (with - * UTF16.getCharCount(), as well as random access. If a - * validity check is required, use - * UCharacter.isLegal() - * on the return value. - * If the char retrieved is part of a surrogate pair, its supplementary - * character will be returned. If a complete supplementary character is - * not found the incomplete character will be returned - * @param source array of UTF-16 chars - * @param offset16 UTF-16 offset to the start of the character. - * @return UTF-32 value for the UTF-32 value that contains the char at - * offset16. The boundaries of that codepoint are the same as in - * bounds32(). - * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds. - * @stable ICU 2.1 - */ - public static int charAt(CharSequence source, int offset16) { - char single = source.charAt(offset16); - if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { - return single; - } - return _charAt(source, offset16, single); - } - - private static int _charAt(CharSequence source, int offset16, char single) { - if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { - return single; - } - - // Convert the UTF-16 surrogate pair if necessary. - // For simplicity in usage, and because the frequency of pairs is - // low, look both directions. 
- - if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - ++offset16; - if (source.length() != offset16) { - char trail = source.charAt(offset16); - if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE - && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { - return UCharacterProperty.getRawSupplementary(single, trail); - } - } - } else { - --offset16; - if (offset16 >= 0) { - // single is a trail surrogate so - char lead = source.charAt(offset16); - if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE - && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - return UCharacterProperty.getRawSupplementary(lead, single); - } - } - } - return single; // return unmatched surrogate - } - - /** - * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards - * (with UTF16.getCharCount(), as well as random access. If a validity check is - * required, use UCharacter.isLegal() - * - * on the return value. If the char retrieved is part of a surrogate pair, its supplementary - * character will be returned. If a complete supplementary character is not found the incomplete - * character will be returned - * - * @param source Array of UTF-16 chars - * @param start Offset to substring in the source array for analyzing - * @param limit Offset to substring in the source array for analyzing - * @param offset16 UTF-16 offset relative to start - * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries - * of that codepoint are the same as in bounds32(). - * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. - * @stable ICU 2.1 - */ - public static int charAt(char source[], int start, int limit, int offset16) { - offset16 += start; - if (offset16 < start || offset16 >= limit) { - throw new ArrayIndexOutOfBoundsException(offset16); - } - - char single = source[offset16]; - if (!isSurrogate(single)) { - return single; - } - - // Convert the UTF-16 surrogate pair if necessary. 
- // For simplicity in usage, and because the frequency of pairs is - // low, look both directions. - if (single <= LEAD_SURROGATE_MAX_VALUE) { - offset16++; - if (offset16 >= limit) { - return single; - } - char trail = source[offset16]; - if (isTrailSurrogate(trail)) { - return UCharacterProperty.getRawSupplementary(single, trail); - } - } - else { // isTrailSurrogate(single), so - if (offset16 == start) { - return single; - } - offset16--; - char lead = source[offset16]; - if (isLeadSurrogate(lead)) - return UCharacterProperty.getRawSupplementary(lead, single); - } - return single; // return unmatched surrogate - } - - /** - * Determines how many chars this char32 requires. - * If a validity check is required, use - * isLegal() on - * char32 before calling. - * @param char32 the input codepoint. - * @return 2 if is in supplementary space, otherwise 1. - * @stable ICU 2.1 - */ - public static int getCharCount(int char32) - { - if (char32 < SUPPLEMENTARY_MIN_VALUE) { - return 1; - } - return 2; - } - - /** - * Determines whether the code value is a surrogate. - * @param char16 the input character. - * @return true if the input character is a surrogate. - * @stable ICU 2.1 - */ - public static boolean isSurrogate(char char16) - { - return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; - } - - /** - * Determines whether the character is a trail surrogate. - * @param char16 the input character. - * @return true if the input character is a trail surrogate. - * @stable ICU 2.1 - */ - public static boolean isTrailSurrogate(char char16) - { - return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; - } - - /** - * Determines whether the character is a lead surrogate. - * @param char16 the input character. - * @return true if the input character is a lead surrogate - * @stable ICU 2.1 - */ - public static boolean isLeadSurrogate(char char16) - { - return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; - } - - /** - * Returns the lead surrogate. 
- * If a validity check is required, use - * isLegal() - * on char32 before calling. - * @param char32 the input character. - * @return lead surrogate if the getCharCount(ch) is 2;
- * and 0 otherwise (note: 0 is not a valid lead surrogate). - * @stable ICU 2.1 - */ - public static char getLeadSurrogate(int char32) - { - if (char32 >= SUPPLEMENTARY_MIN_VALUE) { - return (char)(LEAD_SURROGATE_OFFSET_ + - (char32 >> LEAD_SURROGATE_SHIFT_)); - } - - return 0; - } - - /** - * Returns the trail surrogate. - * If a validity check is required, use - * isLegal() - * on char32 before calling. - * @param char32 the input character. - * @return the trail surrogate if the getCharCount(ch) is 2;
otherwise - * the character itself - * @stable ICU 2.1 - */ - public static char getTrailSurrogate(int char32) - { - if (char32 >= SUPPLEMENTARY_MIN_VALUE) { - return (char)(TRAIL_SURROGATE_MIN_VALUE + - (char32 & TRAIL_SURROGATE_MASK_)); - } - - return (char) char32; - } - - /** - * Convenience method corresponding to String.valueOf(char). Returns a one - * or two char string containing the UTF-32 value in UTF16 format. If a - * validity check is required, use - * isLegal() - * on char32 before calling. - * @param char32 the input character. - * @return string value of char32 in UTF16 format - * @exception IllegalArgumentException thrown if char32 is a invalid - * codepoint. - * @stable ICU 2.1 - */ - public static String valueOf(int char32) - { - if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { - throw new IllegalArgumentException("Illegal codepoint"); - } - return toString(char32); - } - - /** - * Append a single UTF-32 value to the end of a StringBuffer. - * If a validity check is required, use - * isLegal() - * on char32 before calling. - * @param target the buffer to append to - * @param char32 value to append. - * @return the updated StringBuffer - * @exception IllegalArgumentException thrown when char32 does not lie - * within the range of the Unicode codepoints - * @stable ICU 2.1 - */ - public static StringBuffer append(StringBuffer target, int char32) - { - // Check for irregular values - if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { - throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); - } - - // Write the UTF-16 values - if (char32 >= SUPPLEMENTARY_MIN_VALUE) - { - target.append(getLeadSurrogate(char32)); - target.append(getTrailSurrogate(char32)); - } - else { - target.append((char) char32); - } - return target; - } - - /** - * Shifts offset16 by the argument number of codepoints within a subarray. 
- * @param source char array - * @param start position of the subarray to be performed on - * @param limit position of the subarray to be performed on - * @param offset16 UTF16 position to shift relative to start - * @param shift32 number of codepoints to shift - * @return new shifted offset16 relative to start - * @exception IndexOutOfBoundsException if the new offset16 is out of - * bounds with respect to the subarray or the subarray bounds - * are out of range. - * @stable ICU 2.1 - */ - public static int moveCodePointOffset(char source[], int start, int limit, - int offset16, int shift32) - { - int size = source.length; - int count; - char ch; - int result = offset16 + start; - if (start < 0 || limit < start) { - throw new StringIndexOutOfBoundsException(start); - } - if (limit > size) { - throw new StringIndexOutOfBoundsException(limit); - } - if (offset16 < 0 || result > limit) { - throw new StringIndexOutOfBoundsException(offset16); - } - if (shift32 > 0) { - if (shift32 + result > size) { - throw new StringIndexOutOfBoundsException(result); - } - count = shift32; - while (result < limit && count > 0) - { - ch = source[result]; - if (isLeadSurrogate(ch) && (result + 1 < limit) && - isTrailSurrogate(source[result + 1])) { - result++; - } - count--; - result++; - } - } else { - if (result + shift32 < start) { - throw new StringIndexOutOfBoundsException(result); - } - for (count = -shift32; count > 0; count--) { - result--; - if (result < start) { - break; - } - ch = source[result]; - if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { - result--; - } - } - } - if (count != 0) { - throw new StringIndexOutOfBoundsException(shift32); - } - result -= start; - return result; - } - - // private data members ------------------------------------------------- - - /** - * Shift value for lead surrogate to form a supplementary character. 
- */ - private static final int LEAD_SURROGATE_SHIFT_ = 10; - - /** - * Mask to retrieve the significant value from a trail surrogate. - */ - private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; - - /** - * Value that all lead surrogate starts with - */ - private static final int LEAD_SURROGATE_OFFSET_ = - LEAD_SURROGATE_MIN_VALUE - - (SUPPLEMENTARY_MIN_VALUE - >> LEAD_SURROGATE_SHIFT_); - - // private methods ------------------------------------------------------ - - /** - *

Converts argument code point and returns a String object representing - * the code point's value in UTF16 format. - *

This method does not check for the validity of the codepoint, the - * results are not guaranteed if a invalid codepoint is passed as - * argument. - *

The result is a string whose length is 1 for non-supplementary code - * points, 2 otherwise. - * @param ch code point - * @return string representation of the code point - */ - private static String toString(int ch) - { - if (ch < SUPPLEMENTARY_MIN_VALUE) { - return String.valueOf((char) ch); - } - - StringBuilder result = new StringBuilder(); - result.append(getLeadSurrogate(ch)); - result.append(getTrailSurrogate(ch)); - return result.toString(); - } -} --- /dev/null 2020-01-10 15:58:07.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/UTF16.java 2020-01-10 15:58:07.000000000 -0800 @@ -0,0 +1,618 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +/** + ******************************************************************************* + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.text; + +import jdk.internal.icu.impl.UCharacterProperty; + +/** + *

Standalone utility class providing UTF16 character conversions and + * indexing conversions. + *

Code that uses strings alone rarely need modification. + * By design, UTF-16 does not allow overlap, so searching for strings is a safe + * operation. Similarly, concatenation is always safe. Substringing is safe if + * the start and end are both on UTF-32 boundaries. In normal code, the values + * for start and end are on those boundaries, since they arose from operations + * like searching. If not, the nearest UTF-32 boundaries can be determined + * using bounds(). + * Examples: + *

The following examples illustrate use of some of these methods. + *

{@code
+ * // iteration forwards: Original
+ * for (int i = 0; i < s.length(); ++i) {
+ *     char ch = s.charAt(i);
+ *     doSomethingWith(ch);
+ * }
+ *
+ * // iteration forwards: Changes for UTF-32
+ * int ch;
+ * for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
+ *     ch = UTF16.charAt(s, i);
+ *     doSomethingWith(ch);
+ * }
+ *
+ * // iteration backwards: Original
+ * for (int i = s.length() - 1; i >= 0; --i) {
+ *     char ch = s.charAt(i);
+ *     doSomethingWith(ch);
+ * }
+ *
+ * // iteration backwards: Changes for UTF-32
+ * int ch;
+ * for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
+ *     ch = UTF16.charAt(s, i);
+ *     doSomethingWith(ch);
+ * }
+ * }
+ * Notes: + *
    + *
  • + * Naming: For clarity, High and Low surrogates are called + * Lead and Trail in the API, which gives a better + * sense of their ordering in a string. offset16 and + * offset32 are used to distinguish offsets to UTF-16 + * boundaries vs offsets to UTF-32 boundaries. int char32 is + * used to contain UTF-32 characters, as opposed to char16, + * which is a UTF-16 code unit. + *
  • + *
  • + * Roundtripping Offsets: You can always roundtrip from a + * UTF-32 offset to a UTF-16 offset and back. Because of the difference in + * structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and + * back if and only if bounds(string, offset16) != TRAIL. + *
  • + *
  • + * Exceptions: The error checking will throw an exception + * if indices are out of bounds. Other than that, all methods will + * behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 + * values are present. UCharacter.isLegal() can be used to check + * for validity if desired. + *
  • + *
  • + * Unmatched Surrogates: If the string contains unmatched + * surrogates, then these are counted as one UTF-32 value. This matches + * their iteration behavior, which is vital. It also matches common display + * practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5). + *
  • + *
  • + * Optimization: The method implementations may need + * optimization if the compiler doesn't fold static final methods. Since + * surrogate pairs will form an exceeding small percentage of all the text + * in the world, the singleton case should always be optimized for. + *
  • + *
+ * @author Mark Davis, with help from Markus Scherer + * @stable ICU 2.1 + */ + +public final class UTF16 +{ + // public variables --------------------------------------------------- + + /** + * The lowest Unicode code point value. + * @stable ICU 2.1 + */ + public static final int CODEPOINT_MIN_VALUE = 0; + /** + * The highest Unicode code point value (scalar value) according to the + * Unicode Standard. + * @stable ICU 2.1 + */ + public static final int CODEPOINT_MAX_VALUE = 0x10ffff; + /** + * The minimum value for Supplementary code points + * @stable ICU 2.1 + */ + public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; + /** + * Lead surrogate minimum value + * @stable ICU 2.1 + */ + public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; + /** + * Trail surrogate minimum value + * @stable ICU 2.1 + */ + public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; + /** + * Lead surrogate maximum value + * @stable ICU 2.1 + */ + public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; + /** + * Trail surrogate maximum value + * @stable ICU 2.1 + */ + public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; + /** + * Surrogate minimum value + * @stable ICU 2.1 + */ + public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE; + /** + * Lead surrogate bitmask + */ + private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00; + /** + * Trail surrogate bitmask + */ + private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00; + /** + * Surrogate bitmask + */ + private static final int SURROGATE_BITMASK = 0xFFFFF800; + /** + * Lead surrogate bits + */ + private static final int LEAD_SURROGATE_BITS = 0xD800; + /** + * Trail surrogate bits + */ + private static final int TRAIL_SURROGATE_BITS = 0xDC00; + /** + * Surrogate bits + */ + private static final int SURROGATE_BITS = 0xD800; + + // constructor -------------------------------------------------------- + + // /CLOVER:OFF + /** + * Prevent instance from being created. 
+ */ + private UTF16() { + } + + // /CLOVER:ON + // public method ------------------------------------------------------ + + /** + * Extract a single UTF-32 value from a string. + * Used when iterating forwards or backwards (with + * UTF16.getCharCount(), as well as random access. If a + * validity check is required, use + * + * UCharacter.isLegal() on the return value. + * If the char retrieved is part of a surrogate pair, its supplementary + * character will be returned. If a complete supplementary character is + * not found the incomplete character will be returned + * @param source array of UTF-16 chars + * @param offset16 UTF-16 offset to the start of the character. + * @return UTF-32 value for the UTF-32 value that contains the char at + * offset16. The boundaries of that codepoint are the same as in + * bounds32(). + * @exception IndexOutOfBoundsException thrown if offset16 is out of + * bounds. + * @stable ICU 2.1 + */ + public static int charAt(String source, int offset16) { + char single = source.charAt(offset16); + if (single < LEAD_SURROGATE_MIN_VALUE) { + return single; + } + return _charAt(source, offset16, single); + } + + private static int _charAt(String source, int offset16, char single) { + if (single > TRAIL_SURROGATE_MAX_VALUE) { + return single; + } + + // Convert the UTF-16 surrogate pair if necessary. + // For simplicity in usage, and because the frequency of pairs is + // low, look both directions. 
+ + if (single <= LEAD_SURROGATE_MAX_VALUE) { + ++offset16; + if (source.length() != offset16) { + char trail = source.charAt(offset16); + if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) { + return UCharacterProperty.getRawSupplementary(single, trail); + } + } + } else { + --offset16; + if (offset16 >= 0) { + // single is a trail surrogate so + char lead = source.charAt(offset16); + if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) { + return UCharacterProperty.getRawSupplementary(lead, single); + } + } + } + return single; // return unmatched surrogate + } + + /** + * Extract a single UTF-32 value from a string. + * Used when iterating forwards or backwards (with + * UTF16.getCharCount(), as well as random access. If a + * validity check is required, use + * UCharacter.isLegal() + * on the return value. + * If the char retrieved is part of a surrogate pair, its supplementary + * character will be returned. If a complete supplementary character is + * not found the incomplete character will be returned + * @param source array of UTF-16 chars + * @param offset16 UTF-16 offset to the start of the character. + * @return UTF-32 value for the UTF-32 value that contains the char at + * offset16. The boundaries of that codepoint are the same as in + * bounds32(). + * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds. + * @stable ICU 2.1 + */ + public static int charAt(CharSequence source, int offset16) { + char single = source.charAt(offset16); + if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) { + return single; + } + return _charAt(source, offset16, single); + } + + private static int _charAt(CharSequence source, int offset16, char single) { + if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) { + return single; + } + + // Convert the UTF-16 surrogate pair if necessary. + // For simplicity in usage, and because the frequency of pairs is + // low, look both directions. 
+ + if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) { + ++offset16; + if (source.length() != offset16) { + char trail = source.charAt(offset16); + if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE + && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { + return UCharacterProperty.getRawSupplementary(single, trail); + } + } + } else { + --offset16; + if (offset16 >= 0) { + // single is a trail surrogate so + char lead = source.charAt(offset16); + if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE + && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) { + return UCharacterProperty.getRawSupplementary(lead, single); + } + } + } + return single; // return unmatched surrogate + } + + /** + * Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards + * (with UTF16.getCharCount(), as well as random access. If a validity check is + * required, use UCharacter.isLegal() + * + * on the return value. If the char retrieved is part of a surrogate pair, its supplementary + * character will be returned. If a complete supplementary character is not found the incomplete + * character will be returned + * + * @param source Array of UTF-16 chars + * @param start Offset to substring in the source array for analyzing + * @param limit Offset to substring in the source array for analyzing + * @param offset16 UTF-16 offset relative to start + * @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries + * of that codepoint are the same as in bounds32(). + * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit. + * @stable ICU 2.1 + */ + public static int charAt(char source[], int start, int limit, int offset16) { + offset16 += start; + if (offset16 < start || offset16 >= limit) { + throw new ArrayIndexOutOfBoundsException(offset16); + } + + char single = source[offset16]; + if (!isSurrogate(single)) { + return single; + } + + // Convert the UTF-16 surrogate pair if necessary. 
+ // For simplicity in usage, and because the frequency of pairs is + // low, look both directions. + if (single <= LEAD_SURROGATE_MAX_VALUE) { + offset16++; + if (offset16 >= limit) { + return single; + } + char trail = source[offset16]; + if (isTrailSurrogate(trail)) { + return UCharacterProperty.getRawSupplementary(single, trail); + } + } + else { // isTrailSurrogate(single), so + if (offset16 == start) { + return single; + } + offset16--; + char lead = source[offset16]; + if (isLeadSurrogate(lead)) + return UCharacterProperty.getRawSupplementary(lead, single); + } + return single; // return unmatched surrogate + } + + /** + * Determines how many chars this char32 requires. + * If a validity check is required, use + * isLegal() on + * char32 before calling. + * @param char32 the input codepoint. + * @return 2 if is in supplementary space, otherwise 1. + * @stable ICU 2.1 + */ + public static int getCharCount(int char32) + { + if (char32 < SUPPLEMENTARY_MIN_VALUE) { + return 1; + } + return 2; + } + + /** + * Determines whether the code value is a surrogate. + * @param char16 the input character. + * @return true if the input character is a surrogate. + * @stable ICU 2.1 + */ + public static boolean isSurrogate(char char16) + { + return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS; + } + + /** + * Determines whether the character is a trail surrogate. + * @param char16 the input character. + * @return true if the input character is a trail surrogate. + * @stable ICU 2.1 + */ + public static boolean isTrailSurrogate(char char16) + { + return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS; + } + + /** + * Determines whether the character is a lead surrogate. + * @param char16 the input character. + * @return true if the input character is a lead surrogate + * @stable ICU 2.1 + */ + public static boolean isLeadSurrogate(char char16) + { + return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS; + } + + /** + * Returns the lead surrogate. 
+ * If a validity check is required, use + * isLegal() + * on char32 before calling. + * @param char32 the input character. + * @return lead surrogate if the getCharCount(ch) is 2;
+ * and 0 otherwise (note: 0 is not a valid lead surrogate). + * @stable ICU 2.1 + */ + public static char getLeadSurrogate(int char32) + { + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + return (char)(LEAD_SURROGATE_OFFSET_ + + (char32 >> LEAD_SURROGATE_SHIFT_)); + } + + return 0; + } + + /** + * Returns the trail surrogate. + * If a validity check is required, use + * isLegal() + * on char32 before calling. + * @param char32 the input character. + * @return the trail surrogate if the getCharCount(ch) is 2;
otherwise + * the character itself + * @stable ICU 2.1 + */ + public static char getTrailSurrogate(int char32) + { + if (char32 >= SUPPLEMENTARY_MIN_VALUE) { + return (char)(TRAIL_SURROGATE_MIN_VALUE + + (char32 & TRAIL_SURROGATE_MASK_)); + } + + return (char) char32; + } + + /** + * Convenience method corresponding to String.valueOf(char). Returns a one + * or two char string containing the UTF-32 value in UTF16 format. If a + * validity check is required, use + * isLegal() + * on char32 before calling. + * @param char32 the input character. + * @return string value of char32 in UTF16 format + * @exception IllegalArgumentException thrown if char32 is a invalid + * codepoint. + * @stable ICU 2.1 + */ + public static String valueOf(int char32) + { + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Illegal codepoint"); + } + return toString(char32); + } + + /** + * Append a single UTF-32 value to the end of a StringBuffer. + * If a validity check is required, use + * isLegal() + * on char32 before calling. + * @param target the buffer to append to + * @param char32 value to append. + * @return the updated StringBuffer + * @exception IllegalArgumentException thrown when char32 does not lie + * within the range of the Unicode codepoints + * @stable ICU 2.1 + */ + public static StringBuffer append(StringBuffer target, int char32) + { + // Check for irregular values + if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) { + throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32)); + } + + // Write the UTF-16 values + if (char32 >= SUPPLEMENTARY_MIN_VALUE) + { + target.append(getLeadSurrogate(char32)); + target.append(getTrailSurrogate(char32)); + } + else { + target.append((char) char32); + } + return target; + } + + /** + * Shifts offset16 by the argument number of codepoints within a subarray. 
+ * @param source char array + * @param start position of the subarray to be performed on + * @param limit position of the subarray to be performed on + * @param offset16 UTF16 position to shift relative to start + * @param shift32 number of codepoints to shift + * @return new shifted offset16 relative to start + * @exception IndexOutOfBoundsException if the new offset16 is out of + * bounds with respect to the subarray or the subarray bounds + * are out of range. + * @stable ICU 2.1 + */ + public static int moveCodePointOffset(char source[], int start, int limit, + int offset16, int shift32) + { + int size = source.length; + int count; + char ch; + int result = offset16 + start; + if (start < 0 || limit < start) { + throw new StringIndexOutOfBoundsException(start); + } + if (limit > size) { + throw new StringIndexOutOfBoundsException(limit); + } + if (offset16 < 0 || result > limit) { + throw new StringIndexOutOfBoundsException(offset16); + } + if (shift32 > 0) { + if (shift32 + result > size) { + throw new StringIndexOutOfBoundsException(result); + } + count = shift32; + while (result < limit && count > 0) + { + ch = source[result]; + if (isLeadSurrogate(ch) && (result + 1 < limit) && + isTrailSurrogate(source[result + 1])) { + result++; + } + count--; + result++; + } + } else { + if (result + shift32 < start) { + throw new StringIndexOutOfBoundsException(result); + } + for (count = -shift32; count > 0; count--) { + result--; + if (result < start) { + break; + } + ch = source[result]; + if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) { + result--; + } + } + } + if (count != 0) { + throw new StringIndexOutOfBoundsException(shift32); + } + result -= start; + return result; + } + + // private data members ------------------------------------------------- + + /** + * Shift value for lead surrogate to form a supplementary character. 
+ */ + private static final int LEAD_SURROGATE_SHIFT_ = 10; + + /** + * Mask to retrieve the significant value from a trail surrogate. + */ + private static final int TRAIL_SURROGATE_MASK_ = 0x3FF; + + /** + * Value that all lead surrogate starts with + */ + private static final int LEAD_SURROGATE_OFFSET_ = + LEAD_SURROGATE_MIN_VALUE - + (SUPPLEMENTARY_MIN_VALUE + >> LEAD_SURROGATE_SHIFT_); + + // private methods ------------------------------------------------------ + + /** + *

Converts argument code point and returns a String object representing + * the code point's value in UTF16 format. + *

This method does not check for the validity of the codepoint; the + * results are not guaranteed if an invalid codepoint is passed as + * argument. + *

The result is a string whose length is 1 for non-supplementary code + * points, 2 otherwise. + * @param ch code point + * @return string representation of the code point + */ + private static String toString(int ch) + { + if (ch < SUPPLEMENTARY_MIN_VALUE) { + return String.valueOf((char) ch); + } + + StringBuilder result = new StringBuilder(); + result.append(getLeadSurrogate(ch)); + result.append(getTrailSurrogate(ch)); + return result.toString(); + } +} --- old/src/java.base/share/classes/sun/text/normalizer/UnicodeSet.java 2020-01-10 15:58:09.000000000 -0800 +++ /dev/null 2020-01-10 15:58:09.000000000 -0800 @@ -1,1407 +0,0 @@ -/* - * Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. 
- */ - -/* - ******************************************************************************* - * Copyright (C) 1996-2015, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ -package sun.text.normalizer; - -import java.io.IOException; -import java.text.ParsePosition; -import java.util.ArrayList; -import java.util.TreeSet; - -/** - * A mutable set of Unicode characters and multicharacter strings. - * Objects of this class represent character classes used - * in regular expressions. A character specifies a subset of Unicode - * code points. Legal code points are U+0000 to U+10FFFF, inclusive. - * - * Note: method freeze() will not only make the set immutable, but - * also makes important methods much higher performance: - * contains(c), containsNone(...), span(...), spanBack(...) etc. - * After the object is frozen, any subsequent call that wants to change - * the object will throw UnsupportedOperationException. - * - *

The UnicodeSet class is not designed to be subclassed. - * - *

UnicodeSet supports two APIs. The first is the - * operand API that allows the caller to modify the value of - * a UnicodeSet object. It conforms to Java 2's - * java.util.Set interface, although - * UnicodeSet does not actually implement that - * interface. All methods of Set are supported, with the - * modification that they take a character range or single character - * instead of an Object, and they take a - * UnicodeSet instead of a Collection. The - * operand API may be thought of in terms of boolean logic: a boolean - * OR is implemented by add, a boolean AND is implemented - * by retain, a boolean XOR is implemented by - * complement taking an argument, and a boolean NOT is - * implemented by complement with no argument. In terms - * of traditional set theory function names, add is a - * union, retain is an intersection, remove - * is an asymmetric difference, and complement with no - * argument is a set complement with respect to the superset range - * MIN_VALUE-MAX_VALUE - * - *

The second API is the - * applyPattern()/toPattern() API from the - * java.text.Format-derived classes. Unlike the - * methods that add characters, add categories, and control the logic - * of the set, the method applyPattern() sets all - * attributes of a UnicodeSet at once, based on a - * string pattern. - * - *

Pattern syntax

- * - * Patterns are accepted by the constructors and the - * applyPattern() methods and returned by the - * toPattern() method. These patterns follow a syntax - * similar to that employed by version 8 regular expression character - * classes. Here are some simple examples: - * - *
- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
[]No characters
[a]The character 'a'
[ae]The characters 'a' and 'e'
[a-e]The characters 'a' through 'e' inclusive, in Unicode code - * point order
[\\u4E01]The character U+4E01
[a{ab}{ac}]The character 'a' and the multicharacter strings "ab" and - * "ac"
[\p{Lu}]All characters in the general category Uppercase Letter
- *
- * - * Any character may be preceded by a backslash in order to remove any special - * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are - * ignored, unless they are escaped. - * - *

Property patterns specify a set of characters having a certain - * property as defined by the Unicode standard. Both the POSIX-like - * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a - * complete list of supported property patterns, see the User's Guide - * for UnicodeSet at - * - * http://www.icu-project.org/userguide/unicodeSet.html. - * Actual determination of property data is defined by the underlying - * Unicode database as implemented by UCharacter. - * - *

Patterns specify individual characters, ranges of characters, and - * Unicode property sets. When elements are concatenated, they - * specify their union. To complement a set, place a '^' immediately - * after the opening '['. Property patterns are inverted by modifying - * their delimiters; "[:^foo]" and "\P{foo}". In any other location, - * '^' has no special meaning. - * - *

Ranges are indicated by placing two a '-' between two - * characters, as in "a-z". This specifies the range of all - * characters from the left to the right, in Unicode order. If the - * left character is greater than or equal to the - * right character it is a syntax error. If a '-' occurs as the first - * character after the opening '[' or '[^', or if it occurs as the - * last character before the closing ']', then it is taken as a - * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same - * set of three characters, 'a', 'b', and '-'. - * - *

Sets may be intersected using the {@literal '&'} operator or the asymmetric - * set difference may be taken using the '-' operator, for example, - * "{@code [[:L:]&[\\u0000-\\u0FFF]]}" indicates the set of all Unicode letters - * with values less than 4096. Operators ({@literal '&'} and '|') have equal - * precedence and bind left-to-right. Thus - * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to - * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for - * difference; intersection is commutative. - * - * - *
[a]The set containing 'a' - *
[a-z]The set containing 'a' - * through 'z' and all letters in between, in Unicode order - *
[^a-z]The set containing - * all characters but 'a' through 'z', - * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF - *
[[pat1][pat2]] - * The union of sets specified by pat1 and pat2 - *
[[pat1]&[pat2]] - * The intersection of sets specified by pat1 and pat2 - *
[[pat1]-[pat2]] - * The asymmetric difference of sets specified by pat1 and - * pat2 - *
[:Lu:] or \p{Lu} - * The set of characters having the specified - * Unicode property; in - * this case, Unicode uppercase letters - *
[:^Lu:] or \P{Lu} - * The set of characters not having the given - * Unicode property - *
- * - *

Warning: you cannot add an empty string ("") to a UnicodeSet.

- * - *

Formal syntax

- * - *
- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
pattern :=  ('[' '^'? item* ']') | - * property
item :=  char | (char '-' char) | pattern-expr
- *
pattern-expr :=  pattern | pattern-expr pattern | - * pattern-expr op pattern
- *
op :=  '&' | '-'
- *
special :=  '[' | ']' | '-'
- *
char :=  any character that is not special
- * | ('\\'
any character)
- * | ('\u' hex hex hex hex)
- *
hex :=  any character for which - * Character.digit(c, 16) - * returns a non-negative result
property :=  a Unicode property set pattern
- *
- * - * - * - * - *
Legend: - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
a := b  a may be replaced by b
a?zero or one instance of a
- *
a*one or more instances of a
- *
a | beither a or b
- *
'a'the literal string between the quotes
- *
- *
- *

To iterate over contents of UnicodeSet, the following are available: - *

  • {@link #ranges()} to iterate through the ranges
  • - *
  • {@link #strings()} to iterate through the strings
  • - *
  • {@link #iterator()} to iterate through the entire contents in a single loop. - * That method is, however, not particularly efficient, since it "boxes" each code point into a String. - *
- * All of the above can be used in for loops. - * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in for loops. - *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. - * - * @author Alan Liu - * @stable ICU 2.0 - */ -class UnicodeSet { - - private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints - private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. - // 110000 for codepoints - - /** - * Minimum value that can be stored in a UnicodeSet. - * @stable ICU 2.0 - */ - public static final int MIN_VALUE = LOW; - - /** - * Maximum value that can be stored in a UnicodeSet. - * @stable ICU 2.0 - */ - public static final int MAX_VALUE = HIGH - 1; - - private int len; // length used; list may be longer to minimize reallocs - private int[] list; // MUST be terminated with HIGH - private int[] rangeList; // internal buffer - private int[] buffer; // internal buffer - - // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!! - // is not private so that UnicodeSetIterator can get access - TreeSet strings = new TreeSet(); - - /** - * The pattern representation of this set. This may not be the - * most economical pattern. It is the pattern supplied to - * applyPattern(), with variables substituted and whitespace - * removed. For sets constructed without applyPattern(), or - * modified using the non-pattern API, this string will be null, - * indicating that toPattern() must generate a pattern - * representation from the inversion list. - */ - - private static final int START_EXTRA = 16; // initial storage. Must be >= 0 - private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 - - private static UnicodeSet INCLUSION = null; - - private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. 
- private volatile UnicodeSetStringSpan stringSpan; - - //---------------------------------------------------------------- - // Public API - //---------------------------------------------------------------- - - /** - * Constructs an empty set. - * @stable ICU 2.0 - */ - private UnicodeSet() { - list = new int[1 + START_EXTRA]; - list[len++] = HIGH; - } - - /** - * Constructs a copy of an existing set. - * @stable ICU 2.0 - */ - private UnicodeSet(UnicodeSet other) { - set(other); - } - - /** - * Constructs a set containing the given range. If end > - * start then an empty set is created. - * - * @param start first character, inclusive, of range - * @param end last character, inclusive, of range - * @stable ICU 2.0 - */ - public UnicodeSet(int start, int end) { - this(); - complement(start, end); - } - - /** - * Constructs a set from the given pattern. See the class description - * for the syntax of the pattern language. Whitespace is ignored. - * @param pattern a string specifying what characters are in the set - * @exception java.lang.IllegalArgumentException if the pattern contains - * a syntax error. - * @stable ICU 2.0 - */ - public UnicodeSet(String pattern) { - this(); - applyPattern(pattern, null); - } - - /** - * Make this object represent the same set as other. - * @param other a UnicodeSet whose value will be - * copied to this object - * @stable ICU 2.0 - */ - public UnicodeSet set(UnicodeSet other) { - checkFrozen(); - list = other.list.clone(); - len = other.len; - strings = new TreeSet(other.strings); - return this; - } - - /** - * Returns the number of elements in this set (its cardinality) - * Note than the elements of a set may include both individual - * codepoints and strings. - * - * @return the number of elements in this set (its cardinality). 
- * @stable ICU 2.0 - */ - public int size() { - int n = 0; - int count = getRangeCount(); - for (int i = 0; i < count; ++i) { - n += getRangeEnd(i) - getRangeStart(i) + 1; - } - return n + strings.size(); - } - - // for internal use, after checkFrozen has been called - private UnicodeSet add_unchecked(int start, int end) { - if (start < MIN_VALUE || start > MAX_VALUE) { - throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); - } - if (end < MIN_VALUE || end > MAX_VALUE) { - throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); - } - if (start < end) { - add(range(start, end), 2, 0); - } else if (start == end) { - add(start); - } - return this; - } - - /** - * Adds the specified character to this set if it is not already - * present. If this set already contains the specified character, - * the call leaves this set unchanged. - * @stable ICU 2.0 - */ - public final UnicodeSet add(int c) { - checkFrozen(); - return add_unchecked(c); - } - - // for internal use only, after checkFrozen has been called - private final UnicodeSet add_unchecked(int c) { - if (c < MIN_VALUE || c > MAX_VALUE) { - throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); - } - - // find smallest i such that c < list[i] - // if odd, then it is IN the set - // if even, then it is OUT of the set - int i = findCodePoint(c); - - // already in set? 
- if ((i & 1) != 0) return this; - - // HIGH is 0x110000 - // assert(list[len-1] == HIGH); - - // empty = [HIGH] - // [start_0, limit_0, start_1, limit_1, HIGH] - - // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] - // ^ - // list[i] - - // i == 0 means c is before the first range - - if (c == list[i]-1) { - // c is before start of next range - list[i] = c; - // if we touched the HIGH mark, then add a new one - if (c == MAX_VALUE) { - ensureCapacity(len+1); - list[len++] = HIGH; - } - if (i > 0 && c == list[i-1]) { - // collapse adjacent ranges - - // [..., start_k-1, c, c, limit_k, ..., HIGH] - // ^ - // list[i] - System.arraycopy(list, i+1, list, i-1, len-i-1); - len -= 2; - } - } - - else if (i > 0 && c == list[i-1]) { - // c is after end of prior range - list[i-1]++; - // no need to chcek for collapse here - } - - else { - // At this point we know the new char is not adjacent to - // any existing ranges, and it is not 10FFFF. - - - // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] - // ^ - // list[i] - - // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] - // ^ - // list[i] - - // Don't use ensureCapacity() to save on copying. - // NOTE: This has no measurable impact on performance, - // but it might help in some usage patterns. - if (len+2 > list.length) { - int[] temp = new int[len + 2 + GROW_EXTRA]; - if (i != 0) System.arraycopy(list, 0, temp, 0, i); - System.arraycopy(list, i, temp, i+2, len-i); - list = temp; - } else { - System.arraycopy(list, i, list, i+2, len-i); - } - - list[i] = c; - list[i+1] = c+1; - len += 2; - } - - return this; - } - - /** - * Adds the specified multicharacter to this set if it is not already - * present. If this set already contains the multicharacter, - * the call leaves this set unchanged. - * Thus {@code "ch" => {"ch"}} - *
Warning: you cannot add an empty string ("") to a UnicodeSet. - * @param s the source string - * @return this object, for chaining - * @stable ICU 2.0 - */ - public final UnicodeSet add(CharSequence s) { - checkFrozen(); - int cp = getSingleCP(s); - if (cp < 0) { - strings.add(s.toString()); - } else { - add_unchecked(cp, cp); - } - return this; - } - - /** - * Utility for getting code point from single code point CharSequence. - * See the public UTF16.getSingleCodePoint() - * @return a code point IF the string consists of a single one. - * otherwise returns -1. - * @param s to test - */ - private static int getSingleCP(CharSequence s) { - if (s.length() < 1) { - throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); - } - if (s.length() > 2) return -1; - if (s.length() == 1) return s.charAt(0); - - // at this point, len = 2 - int cp = UTF16.charAt(s, 0); - if (cp > 0xFFFF) { // is surrogate pair - return cp; - } - return -1; - } - - /** - * Complements the specified range in this set. Any character in - * the range will be removed if it is in this set, or will be - * added if it is not in this set. If {@code end > start} - * then an empty range is complemented, leaving the set unchanged. - * - * @param start first character, inclusive, of range to be removed - * from this set. - * @param end last character, inclusive, of range to be removed - * from this set. - * @stable ICU 2.0 - */ - public UnicodeSet complement(int start, int end) { - checkFrozen(); - if (start < MIN_VALUE || start > MAX_VALUE) { - throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); - } - if (end < MIN_VALUE || end > MAX_VALUE) { - throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); - } - if (start <= end) { - xor(range(start, end), 2, 0); - } - return this; - } - - /** - * Returns true if this set contains the given character. 
- * @param c character to be checked for containment - * @return true if the test condition is met - * @stable ICU 2.0 - */ - public boolean contains(int c) { - if (c < MIN_VALUE || c > MAX_VALUE) { - throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); - } - if (bmpSet != null) { - return bmpSet.contains(c); - } - if (stringSpan != null) { - return stringSpan.contains(c); - } - - /* - // Set i to the index of the start item greater than ch - // We know we will terminate without length test! - int i = -1; - while (true) { - if (c < list[++i]) break; - } - */ - - int i = findCodePoint(c); - - return ((i & 1) != 0); // return true if odd - } - - /** - * Returns the smallest value i such that c < list[i]. Caller - * must ensure that c is a legal value or this method will enter - * an infinite loop. This method performs a binary search. - * @param c a character in the range MIN_VALUE..MAX_VALUE - * inclusive - * @return the smallest integer i in the range 0..len-1, - * inclusive, such that c < list[i] - */ - private final int findCodePoint(int c) { - /* Examples: - findCodePoint(c) - set list[] c=0 1 3 4 7 8 - === ============== =========== - [] [110000] 0 0 0 0 0 0 - [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 - [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 - [:all:] [0, 110000] 1 1 1 1 1 1 - */ - - // Return the smallest i such that c < list[i]. Assume - // list[len - 1] == HIGH and that c is legal (0..HIGH-1). - if (c < list[0]) return 0; - // High runner test. c is often after the last range, so an - // initial check for this condition pays off. - if (len >= 2 && c >= list[len-2]) return len-1; - int lo = 0; - int hi = len - 1; - // invariant: c >= list[lo] - // invariant: c < list[hi] - for (;;) { - int i = (lo + hi) >>> 1; - if (i == lo) return hi; - if (c < list[i]) { - hi = i; - } else { - lo = i; - } - } - } - - /** - * Retains only the elements in this set that are contained in the - * specified set. 
In other words, removes from this set all of - * its elements that are not contained in the specified set. This - * operation effectively modifies this set so that its value is - * the intersection of the two sets. - * - * @param c set that defines which elements this set will retain. - * @stable ICU 2.0 - */ - public UnicodeSet retainAll(UnicodeSet c) { - checkFrozen(); - retain(c.list, c.len, 0); - strings.retainAll(c.strings); - return this; - } - - /** - * Removes all of the elements from this set. This set will be - * empty after this call returns. - * @stable ICU 2.0 - */ - public UnicodeSet clear() { - checkFrozen(); - list[0] = HIGH; - len = 1; - strings.clear(); - return this; - } - - /** - * Iteration method that returns the number of ranges contained in - * this set. - * @see #getRangeStart - * @see #getRangeEnd - * @stable ICU 2.0 - */ - public int getRangeCount() { - return len/2; - } - - /** - * Iteration method that returns the first character in the - * specified range of this set. - * @exception ArrayIndexOutOfBoundsException if index is outside - * the range 0..getRangeCount()-1 - * @see #getRangeCount - * @see #getRangeEnd - * @stable ICU 2.0 - */ - public int getRangeStart(int index) { - return list[index*2]; - } - - /** - * Iteration method that returns the last character in the - * specified range of this set. - * @exception ArrayIndexOutOfBoundsException if index is outside - * the range 0..getRangeCount()-1 - * @see #getRangeStart - * @see #getRangeEnd - * @stable ICU 2.0 - */ - public int getRangeEnd(int index) { - return (list[index*2 + 1] - 1); - } - - //---------------------------------------------------------------- - // Implementation: Pattern parsing - //---------------------------------------------------------------- - - /** - * Parses the given pattern, starting at the given position. The character - * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. - * Parsing continues until the corresponding closing ']'. 
If a syntax error - * is encountered between the opening and closing brace, the parse fails. - * Upon return from a successful parse, the ParsePosition is updated to - * point to the character following the closing ']', and an inversion - * list for the parsed pattern is returned. This method - * calls itself recursively to parse embedded subpatterns. - * - * @param pattern the string containing the pattern to be parsed. The - * portion of the string from pos.getIndex(), which must be a '[', to the - * corresponding closing ']', is parsed. - * @param pos upon entry, the position at which to being parsing. The - * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return - * from a successful parse, pos.getIndex() is either the character after the - * closing ']' of the parsed pattern, or pattern.length() if the closing ']' - * is the last character of the pattern string. - * @return an inversion list for the parsed substring - * of pattern - * @exception java.lang.IllegalArgumentException if the parse fails. - */ - private UnicodeSet applyPattern(String pattern, - ParsePosition pos) { - if ("[:age=3.2:]".equals(pattern)) { - checkFrozen(); - VersionInfo version = VersionInfo.getInstance("3.2"); - applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); - } else { - throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern " - + pattern + ")"); - } - - return this; - } - - //---------------------------------------------------------------- - // Implementation: Utility methods - //---------------------------------------------------------------- - - private void ensureCapacity(int newLen) { - if (newLen <= list.length) return; - int[] temp = new int[newLen + GROW_EXTRA]; - System.arraycopy(list, 0, temp, 0, len); - list = temp; - } - - private void ensureBufferCapacity(int newLen) { - if (buffer != null && newLen <= buffer.length) return; - buffer = new int[newLen + GROW_EXTRA]; - } - - /** - * Assumes start <= end. 
- */ - private int[] range(int start, int end) { - if (rangeList == null) { - rangeList = new int[] { start, end+1, HIGH }; - } else { - rangeList[0] = start; - rangeList[1] = end+1; - } - return rangeList; - } - - //---------------------------------------------------------------- - // Implementation: Fundamental operations - //---------------------------------------------------------------- - - // polarity = 0, 3 is normal: x xor y - // polarity = 1, 2: x xor ~y == x === y - - private UnicodeSet xor(int[] other, int otherLen, int polarity) { - ensureBufferCapacity(len + otherLen); - int i = 0, j = 0, k = 0; - int a = list[i++]; - int b; - if (polarity == 1 || polarity == 2) { - b = LOW; - if (other[j] == LOW) { // skip base if already LOW - ++j; - b = other[j]; - } - } else { - b = other[j++]; - } - // simplest of all the routines - // sort the values, discarding identicals! - while (true) { - if (a < b) { - buffer[k++] = a; - a = list[i++]; - } else if (b < a) { - buffer[k++] = b; - b = other[j++]; - } else if (a != HIGH) { // at this point, a == b - // discard both values! - a = list[i++]; - b = other[j++]; - } else { // DONE! - buffer[k++] = HIGH; - len = k; - break; - } - } - // swap list and buffer - int[] temp = list; - list = buffer; - buffer = temp; - return this; - } - - // polarity = 0 is normal: x union y - // polarity = 2: x union ~y - // polarity = 1: ~x union y - // polarity = 3: ~x union ~y - - private UnicodeSet add(int[] other, int otherLen, int polarity) { - ensureBufferCapacity(len + otherLen); - int i = 0, j = 0, k = 0; - int a = list[i++]; - int b = other[j++]; - // change from xor is that we have to check overlapping pairs - // polarity bit 1 means a is second, bit 2 means b is. - main: - while (true) { - switch (polarity) { - case 0: // both first; take lower if unequal - if (a < b) { // take a - // Back up over overlapping ranges in buffer[] - if (k > 0 && a <= buffer[k-1]) { - // Pick latter end value in buffer[] vs. 
list[] - a = max(list[i], buffer[--k]); - } else { - // No overlap - buffer[k++] = a; - a = list[i]; - } - i++; // Common if/else code factored out - polarity ^= 1; - } else if (b < a) { // take b - if (k > 0 && b <= buffer[k-1]) { - b = max(other[j], buffer[--k]); - } else { - buffer[k++] = b; - b = other[j]; - } - j++; - polarity ^= 2; - } else { // a == b, take a, drop b - if (a == HIGH) break main; - // This is symmetrical; it doesn't matter if - // we backtrack with a or b. - liu - if (k > 0 && a <= buffer[k-1]) { - a = max(list[i], buffer[--k]); - } else { - // No overlap - buffer[k++] = a; - a = list[i]; - } - i++; - polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 3: // both second; take higher if unequal, and drop other - if (b <= a) { // take a - if (a == HIGH) break main; - buffer[k++] = a; - } else { // take b - if (b == HIGH) break main; - buffer[k++] = b; - } - a = list[i++]; polarity ^= 1; // factored common code - b = other[j++]; polarity ^= 2; - break; - case 1: // a second, b first; if b < a, overlap - if (a < b) { // no overlap, take a - buffer[k++] = a; a = list[i++]; polarity ^= 1; - } else if (b < a) { // OVERLAP, drop b - b = other[j++]; polarity ^= 2; - } else { // a == b, drop both! - if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 2: // a first, b second; if a < b, overlap - if (b < a) { // no overlap, take b - buffer[k++] = b; b = other[j++]; polarity ^= 2; - } else if (a < b) { // OVERLAP, drop a - a = list[i++]; polarity ^= 1; - } else { // a == b, drop both! 
- if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - } - } - buffer[k++] = HIGH; // terminate - len = k; - // swap list and buffer - int[] temp = list; - list = buffer; - buffer = temp; - return this; - } - - // polarity = 0 is normal: x intersect y - // polarity = 2: x intersect ~y == set-minus - // polarity = 1: ~x intersect y - // polarity = 3: ~x intersect ~y - - private UnicodeSet retain(int[] other, int otherLen, int polarity) { - ensureBufferCapacity(len + otherLen); - int i = 0, j = 0, k = 0; - int a = list[i++]; - int b = other[j++]; - // change from xor is that we have to check overlapping pairs - // polarity bit 1 means a is second, bit 2 means b is. - main: - while (true) { - switch (polarity) { - case 0: // both first; drop the smaller - if (a < b) { // drop a - a = list[i++]; polarity ^= 1; - } else if (b < a) { // drop b - b = other[j++]; polarity ^= 2; - } else { // a == b, take one, drop other - if (a == HIGH) break main; - buffer[k++] = a; a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 3: // both second; take lower if unequal - if (a < b) { // take a - buffer[k++] = a; a = list[i++]; polarity ^= 1; - } else if (b < a) { // take b - buffer[k++] = b; b = other[j++]; polarity ^= 2; - } else { // a == b, take one, drop other - if (a == HIGH) break main; - buffer[k++] = a; a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 1: // a second, b first; - if (a < b) { // NO OVERLAP, drop a - a = list[i++]; polarity ^= 1; - } else if (b < a) { // OVERLAP, take b - buffer[k++] = b; b = other[j++]; polarity ^= 2; - } else { // a == b, drop both! 
- if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - case 2: // a first, b second; if a < b, overlap - if (b < a) { // no overlap, drop b - b = other[j++]; polarity ^= 2; - } else if (a < b) { // OVERLAP, take a - buffer[k++] = a; a = list[i++]; polarity ^= 1; - } else { // a == b, drop both! - if (a == HIGH) break main; - a = list[i++]; polarity ^= 1; - b = other[j++]; polarity ^= 2; - } - break; - } - } - buffer[k++] = HIGH; // terminate - len = k; - // swap list and buffer - int[] temp = list; - list = buffer; - buffer = temp; - return this; - } - - private static final int max(int a, int b) { - return (a > b) ? a : b; - } - - //---------------------------------------------------------------- - // Generic filter-based scanning code - //---------------------------------------------------------------- - - private static interface Filter { - boolean contains(int codePoint); - } - - private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); - - private static class VersionFilter implements Filter { - VersionInfo version; - VersionFilter(VersionInfo version) { this.version = version; } - public boolean contains(int ch) { - VersionInfo v = UCharacter.getAge(ch); - // Reference comparison ok; VersionInfo caches and reuses - // unique objects. - return v != NO_VERSION && - v.compareTo(version) <= 0; - } - } - - private static synchronized UnicodeSet getInclusions(int src) { - if (src != UCharacterProperty.SRC_PROPSVEC) { - throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); - } - - if (INCLUSION == null) { - UnicodeSet incl = new UnicodeSet(); - UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); - INCLUSION = incl; - } - return INCLUSION; - } - - /** - * Generic filter-based scanning code for UCD property UnicodeSets. 
- */ - private UnicodeSet applyFilter(Filter filter, int src) { - // Logically, walk through all Unicode characters, noting the start - // and end of each range for which filter.contain(c) is - // true. Add each range to a set. - // - // To improve performance, use an inclusions set which - // encodes information about character ranges that are known - // to have identical properties. - // getInclusions(src) contains exactly the first characters of - // same-value ranges for the given properties "source". - - clear(); - - int startHasProperty = -1; - UnicodeSet inclusions = getInclusions(src); - int limitRange = inclusions.getRangeCount(); - - for (int j=0; j= 0) { - add_unchecked(startHasProperty, ch-1); - startHasProperty = -1; - } - } - } - if (startHasProperty >= 0) { - add_unchecked(startHasProperty, 0x10FFFF); - } - - return this; - } - - /** - * Is this frozen, according to the Freezable interface? - * - * @return value - * @stable ICU 3.8 - */ - public boolean isFrozen() { - return (bmpSet != null || stringSpan != null); - } - - /** - * Freeze this class, according to the Freezable interface. - * - * @return this - * @stable ICU 4.4 - */ - public UnicodeSet freeze() { - if (!isFrozen()) { - // Do most of what compact() does before freezing because - // compact() will not work when the set is frozen. - // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). - - // Delete buffer first to defragment memory less. - buffer = null; - if (list.length > (len + GROW_EXTRA)) { - // Make the capacity equal to len or 1. - // We don't want to realloc of 0 size. - int capacity = (len == 0) ? 1 : len; - int[] oldList = list; - list = new int[capacity]; - for (int i = capacity; i-- > 0;) { - list[i] = oldList[i]; - } - } - - // Optimize contains() and span() and similar functions. 
- if (!strings.isEmpty()) { - stringSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), UnicodeSetStringSpan.ALL); - } - if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { - // Optimize for code point spans. - // There are no strings, or - // all strings are irrelevant for span() etc. because - // all of each string's code points are contained in this set. - // However, fully contained strings are relevant for spanAndCount(), - // so we create both objects. - bmpSet = new BMPSet(list, len); - } - } - return this; - } - - /** - * Span a string using this UnicodeSet. - *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. - * @param s The string to be spanned - * @param spanCondition The span condition - * @return the length of the span - * @stable ICU 4.4 - */ - public int span(CharSequence s, SpanCondition spanCondition) { - return span(s, 0, spanCondition); - } - - /** - * Span a string using this UnicodeSet. - * If the start index is less than 0, span will start from 0. - * If the start index is greater than the string length, span returns the string length. - *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. - * @param s The string to be spanned - * @param start The start index that the span begins - * @param spanCondition The span condition - * @return the string index which ends the span (i.e. exclusive) - * @stable ICU 4.4 - */ - public int span(CharSequence s, int start, SpanCondition spanCondition) { - int end = s.length(); - if (start < 0) { - start = 0; - } else if (start >= end) { - return end; - } - if (bmpSet != null) { - // Frozen set without strings, or no string is relevant for span(). - return bmpSet.span(s, start, spanCondition, null); - } - if (stringSpan != null) { - return stringSpan.span(s, start, spanCondition); - } else if (!strings.isEmpty()) { - int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED - : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; - UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); - if (strSpan.needsStringSpanUTF16()) { - return strSpan.span(s, start, spanCondition); - } - } - - return spanCodePointsAndCount(s, start, spanCondition, null); - } - - /** - * Same as span() but also counts the smallest number of set elements on any path across the span. - *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. - * @param outCount An output-only object (must not be null) for returning the count. - * @return the limit (exclusive end) of the span - */ - public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { - if (outCount == null) { - throw new IllegalArgumentException("outCount must not be null"); - } - int end = s.length(); - if (start < 0) { - start = 0; - } else if (start >= end) { - return end; - } - if (stringSpan != null) { - // We might also have bmpSet != null, - // but fully-contained strings are relevant for counting elements. - return stringSpan.spanAndCount(s, start, spanCondition, outCount); - } else if (bmpSet != null) { - return bmpSet.span(s, start, spanCondition, outCount); - } else if (!strings.isEmpty()) { - int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED - : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; - which |= UnicodeSetStringSpan.WITH_COUNT; - UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); - return strSpan.spanAndCount(s, start, spanCondition, outCount); - } - - return spanCodePointsAndCount(s, start, spanCondition, outCount); - } - - private int spanCodePointsAndCount(CharSequence s, int start, - SpanCondition spanCondition, OutputInt outCount) { - // Pin to 0/1 values. - boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); - - int c; - int next = start; - int length = s.length(); - int count = 0; - do { - c = Character.codePointAt(s, next); - if (spanContained != contains(c)) { - break; - } - ++count; - next += Character.charCount(c); - } while (next < length); - if (outCount != null) { outCount.value = count; } - return next; - } - - /** - * Span a string backwards (from the fromIndex) using this UnicodeSet. - * If the fromIndex is less than 0, spanBack will return 0. 
- * If fromIndex is greater than the string length, spanBack will start from the string length. - *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. - * @param s The string to be spanned - * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards - * @param spanCondition The span condition - * @return The string index which starts the span (i.e. inclusive). - * @stable ICU 4.4 - */ - public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { - if (fromIndex <= 0) { - return 0; - } - if (fromIndex > s.length()) { - fromIndex = s.length(); - } - if (bmpSet != null) { - // Frozen set without strings, or no string is relevant for spanBack(). - return bmpSet.spanBack(s, fromIndex, spanCondition); - } - if (stringSpan != null) { - return stringSpan.spanBack(s, fromIndex, spanCondition); - } else if (!strings.isEmpty()) { - int which = (spanCondition == SpanCondition.NOT_CONTAINED) - ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED - : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; - UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); - if (strSpan.needsStringSpanUTF16()) { - return strSpan.spanBack(s, fromIndex, spanCondition); - } - } - - // Pin to 0/1 values. - boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); - - int c; - int prev = fromIndex; - do { - c = Character.codePointBefore(s, prev); - if (spanContained != contains(c)) { - break; - } - prev -= Character.charCount(c); - } while (prev > 0); - return prev; - } - - /** - * Clone a thawed version of this class, according to the Freezable interface. 
- * @return the clone, not frozen - * @stable ICU 4.4 - */ - public UnicodeSet cloneAsThawed() { - UnicodeSet result = new UnicodeSet(this); - assert !result.isFrozen(); - return result; - } - - // internal function - private void checkFrozen() { - if (isFrozen()) { - throw new UnsupportedOperationException("Attempt to modify frozen object"); - } - } - - /** - * Argument values for whether span() and similar functions continue while the current character is contained vs. - * not contained in the set. - *

- * The functionality is straightforward for sets with only single code points, without strings (which is the common - * case): - *

    - *
  • CONTAINED and SIMPLE work the same. - *
  • CONTAINED and SIMPLE are inverses of NOT_CONTAINED. - *
  • span() and spanBack() partition any string the - * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). - *
  • Using a - * complemented (inverted) set and the opposite span conditions yields the same results. - *
- * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in - * the set (for example, whether they overlap with each other) and the string that is processed. For a set with - * strings: - *
    - *
  • The complement of the set contains the opposite set of code points, but the same set of strings. - * Therefore, complementing both the set and the span conditions may yield different results. - *
  • When starting spans - * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different - * because a set string may start before the later position. - *
  • span(SIMPLE) may be shorter than - * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which - * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", - * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). - *
  • With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, - * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield - * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. - *
- * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then - * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could - * be used. - *

- * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point - * boundaries, never in the middle of a surrogate pair. - * - * @stable ICU 4.4 - */ - public enum SpanCondition { - /** - * Continues a span() while there is no set element at the current position. - * Increments by one code point at a time. - * Stops before the first set element (character or string). - * (For code points only, this is like while contains(current)==false). - *

- * When span() returns, the substring between where it started and the position it returned consists only of - * characters that are not in the set, and none of its strings overlap with the span. - * - * @stable ICU 4.4 - */ - NOT_CONTAINED, - - /** - * Spans the longest substring that is a concatenation of set elements (characters or strings). - * (For characters only, this is like while contains(current)==true). - *

- * When span() returns, the substring between where it started and the position it returned consists only of set - * elements (characters or strings) that are in the set. - *

- * If a set contains strings, then the span will be the longest substring for which there - * exists at least one non-overlapping concatenation of set elements (characters or strings). - * This is equivalent to a POSIX regular expression for (OR of each set element)*. - * (Java/ICU/Perl regex stops at the first match of an OR.) - * - * @stable ICU 4.4 - */ - CONTAINED, - - /** - * Continues a span() while there is a set element at the current position. - * Increments by the longest matching element at each position. - * (For characters only, this is like while contains(current)==true). - *

- * When span() returns, the substring between where it started and the position it returned consists only of set - * elements (characters or strings) that are in the set. - *

- * If a set only contains single characters, then this is the same as CONTAINED. - *

- * If a set contains strings, then the span will be the longest substring with a match at each position with the - * longest single set element (character or string). - *

- * Use this span condition together with other longest-match algorithms, such as ICU converters - * (ucnv_getUnicodeSet()). - * - * @stable ICU 4.4 - */ - SIMPLE, - } - -} --- /dev/null 2020-01-10 15:58:09.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/text/UnicodeSet.java 2020-01-10 15:58:08.000000000 -0800 @@ -0,0 +1,1414 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 1996-2015, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ******************************************************************************* + */ +package jdk.internal.icu.text; + +import java.text.ParsePosition; +import java.util.ArrayList; +import java.util.TreeSet; + +import jdk.internal.icu.impl.BMPSet; +import jdk.internal.icu.impl.UCharacterProperty; +import jdk.internal.icu.impl.UnicodeSetStringSpan; +import jdk.internal.icu.impl.Utility; +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.util.OutputInt; +import jdk.internal.icu.util.VersionInfo; + +/** + * A mutable set of Unicode characters and multicharacter strings. + * Objects of this class represent character classes used + * in regular expressions. A character specifies a subset of Unicode + * code points. Legal code points are U+0000 to U+10FFFF, inclusive. + * + * Note: method freeze() will not only make the set immutable, but + * also makes important methods much higher performance: + * contains(c), containsNone(...), span(...), spanBack(...) etc. + * After the object is frozen, any subsequent call that wants to change + * the object will throw UnsupportedOperationException. + * + *

The UnicodeSet class is not designed to be subclassed. + * + *

UnicodeSet supports two APIs. The first is the + * operand API that allows the caller to modify the value of + * a UnicodeSet object. It conforms to Java 2's + * java.util.Set interface, although + * UnicodeSet does not actually implement that + * interface. All methods of Set are supported, with the + * modification that they take a character range or single character + * instead of an Object, and they take a + * UnicodeSet instead of a Collection. The + * operand API may be thought of in terms of boolean logic: a boolean + * OR is implemented by add, a boolean AND is implemented + * by retain, a boolean XOR is implemented by + * complement taking an argument, and a boolean NOT is + * implemented by complement with no argument. In terms + * of traditional set theory function names, add is a + * union, retain is an intersection, remove + * is an asymmetric difference, and complement with no + * argument is a set complement with respect to the superset range + * MIN_VALUE-MAX_VALUE. + * + *

The second API is the + * applyPattern()/toPattern() API from the + * java.text.Format-derived classes. Unlike the + * methods that add characters, add categories, and control the logic + * of the set, the method applyPattern() sets all + * attributes of a UnicodeSet at once, based on a + * string pattern. + * + *

Pattern syntax

+ * + * Patterns are accepted by the constructors and the + * applyPattern() methods and returned by the + * toPattern() method. These patterns follow a syntax + * similar to that employed by version 8 regular expression character + * classes. Here are some simple examples: + * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
[]No characters
[a]The character 'a'
[ae]The characters 'a' and 'e'
[a-e]The characters 'a' through 'e' inclusive, in Unicode code + * point order
[\\u4E01]The character U+4E01
[a{ab}{ac}]The character 'a' and the multicharacter strings "ab" and + * "ac"
[\p{Lu}]All characters in the general category Uppercase Letter
+ *
+ * + * Any character may be preceded by a backslash in order to remove any special + * meaning. White space characters, as defined by the Unicode Pattern_White_Space property, are + * ignored, unless they are escaped. + * + *

Property patterns specify a set of characters having a certain + * property as defined by the Unicode standard. Both the POSIX-like + * "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a + * complete list of supported property patterns, see the User's Guide + * for UnicodeSet at + * + * http://www.icu-project.org/userguide/unicodeSet.html. + * Actual determination of property data is defined by the underlying + * Unicode database as implemented by UCharacter. + * + *

Patterns specify individual characters, ranges of characters, and + * Unicode property sets. When elements are concatenated, they + * specify their union. To complement a set, place a '^' immediately + * after the opening '['. Property patterns are inverted by modifying + * their delimiters; "[:^foo:]" and "\P{foo}". In any other location, + * '^' has no special meaning. + * + *

Ranges are indicated by placing a '-' between two + * characters, as in "a-z". This specifies the range of all + * characters from the left to the right, in Unicode order. If the + * left character is greater than or equal to the + * right character it is a syntax error. If a '-' occurs as the first + * character after the opening '[' or '[^', or if it occurs as the + * last character before the closing ']', then it is taken as a + * literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same + * set of three characters, 'a', 'b', and '-'. + * + *

Sets may be intersected using the {@literal '&'} operator or the asymmetric + * set difference may be taken using the '-' operator, for example, + * "{@code [[:L:]&[\\u0000-\\u0FFF]]}" indicates the set of all Unicode letters + * with values less than 4096. Operators ({@literal '&'} and '-') have equal + * precedence and bind left-to-right. Thus + * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to + * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for + * difference; intersection is commutative. + * + * + *
[a]The set containing 'a' + *
[a-z]The set containing 'a' + * through 'z' and all letters in between, in Unicode order + *
[^a-z]The set containing + * all characters but 'a' through 'z', + * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF + *
[[pat1][pat2]] + * The union of sets specified by pat1 and pat2 + *
[[pat1]&[pat2]] + * The intersection of sets specified by pat1 and pat2 + *
[[pat1]-[pat2]] + * The asymmetric difference of sets specified by pat1 and + * pat2 + *
[:Lu:] or \p{Lu} + * The set of characters having the specified + * Unicode property; in + * this case, Unicode uppercase letters + *
[:^Lu:] or \P{Lu} + * The set of characters not having the given + * Unicode property + *
+ * + *

Warning: you cannot add an empty string ("") to a UnicodeSet.

+ * + *

Formal syntax

+ * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
pattern :=  ('[' '^'? item* ']') | + * property
item :=  char | (char '-' char) | pattern-expr
+ *
pattern-expr :=  pattern | pattern-expr pattern | + * pattern-expr op pattern
+ *
op :=  '&' | '-'
+ *
special :=  '[' | ']' | '-'
+ *
char :=  any character that is not special
+ * | ('\\'
any character)
+ * | ('\u' hex hex hex hex)
+ *
hex :=  any character for which + * Character.digit(c, 16) + * returns a non-negative result
property :=  a Unicode property set pattern
+ *
+ * + * + * + * + *
Legend: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
a := b  a may be replaced by b
a?zero or one instance of a
+ *
a*zero or more instances of a
+ *
a | beither a or b
+ *
'a'the literal string between the quotes
+ *
+ *
+ *

To iterate over contents of UnicodeSet, the following are available: + *

  • {@link #ranges()} to iterate through the ranges
  • + *
  • {@link #strings()} to iterate through the strings
  • + *
  • {@link #iterator()} to iterate through the entire contents in a single loop. + * That method is, however, not particularly efficient, since it "boxes" each code point into a String. + *
+ * All of the above can be used in for loops. + * The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in for loops. + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * + * @author Alan Liu + * @stable ICU 2.0 + */ +public class UnicodeSet { + + private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints + private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. + // 110000 for codepoints + + /** + * Minimum value that can be stored in a UnicodeSet. + * @stable ICU 2.0 + */ + public static final int MIN_VALUE = LOW; + + /** + * Maximum value that can be stored in a UnicodeSet. + * @stable ICU 2.0 + */ + public static final int MAX_VALUE = HIGH - 1; + + private int len; // length used; list may be longer to minimize reallocs + private int[] list; // MUST be terminated with HIGH + private int[] rangeList; // internal buffer + private int[] buffer; // internal buffer + + // NOTE: normally the field should be of type SortedSet; but that is missing a public clone!! + // is not private so that UnicodeSetIterator can get access + TreeSet strings = new TreeSet(); + + /** + * The pattern representation of this set. This may not be the + * most economical pattern. It is the pattern supplied to + * applyPattern(), with variables substituted and whitespace + * removed. For sets constructed without applyPattern(), or + * modified using the non-pattern API, this string will be null, + * indicating that toPattern() must generate a pattern + * representation from the inversion list. + */ + + private static final int START_EXTRA = 16; // initial storage. Must be >= 0 + private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 + + private static UnicodeSet INCLUSION = null; + + private volatile BMPSet bmpSet; // The set is frozen if bmpSet or stringSpan is not null. 
+ private volatile UnicodeSetStringSpan stringSpan; + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + + /** + * Constructs an empty set. + * @stable ICU 2.0 + */ + private UnicodeSet() { + list = new int[1 + START_EXTRA]; + list[len++] = HIGH; + } + + /** + * Constructs a copy of an existing set. + * @stable ICU 2.0 + */ + private UnicodeSet(UnicodeSet other) { + set(other); + } + + /** + * Constructs a set containing the given range. If start > + * end then an empty set is created. + * + * @param start first character, inclusive, of range + * @param end last character, inclusive, of range + * @stable ICU 2.0 + */ + public UnicodeSet(int start, int end) { + this(); + complement(start, end); + } + + /** + * Constructs a set from the given pattern. See the class description + * for the syntax of the pattern language. Whitespace is ignored. + * @param pattern a string specifying what characters are in the set + * @exception java.lang.IllegalArgumentException if the pattern contains + * a syntax error. + * @stable ICU 2.0 + */ + public UnicodeSet(String pattern) { + this(); + applyPattern(pattern, null); + } + + /** + * Make this object represent the same set as other. + * @param other a UnicodeSet whose value will be + * copied to this object + * @stable ICU 2.0 + */ + public UnicodeSet set(UnicodeSet other) { + checkFrozen(); + list = other.list.clone(); + len = other.len; + strings = new TreeSet(other.strings); + return this; + } + + /** + * Returns the number of elements in this set (its cardinality) + * Note that the elements of a set may include both individual + * codepoints and strings. + * + * @return the number of elements in this set (its cardinality).
+ * @stable ICU 2.0 + */ + public int size() { + int n = 0; + int count = getRangeCount(); + for (int i = 0; i < count; ++i) { + n += getRangeEnd(i) - getRangeStart(i) + 1; + } + return n + strings.size(); + } + + // for internal use, after checkFrozen has been called + private UnicodeSet add_unchecked(int start, int end) { + if (start < MIN_VALUE || start > MAX_VALUE) { + throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); + } + if (end < MIN_VALUE || end > MAX_VALUE) { + throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); + } + if (start < end) { + add(range(start, end), 2, 0); + } else if (start == end) { + add(start); + } + return this; + } + + /** + * Adds the specified character to this set if it is not already + * present. If this set already contains the specified character, + * the call leaves this set unchanged. + * @stable ICU 2.0 + */ + public final UnicodeSet add(int c) { + checkFrozen(); + return add_unchecked(c); + } + + // for internal use only, after checkFrozen has been called + private final UnicodeSet add_unchecked(int c) { + if (c < MIN_VALUE || c > MAX_VALUE) { + throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); + } + + // find smallest i such that c < list[i] + // if odd, then it is IN the set + // if even, then it is OUT of the set + int i = findCodePoint(c); + + // already in set? 
+ if ((i & 1) != 0) return this; + + // HIGH is 0x110000 + // assert(list[len-1] == HIGH); + + // empty = [HIGH] + // [start_0, limit_0, start_1, limit_1, HIGH] + + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + + // i == 0 means c is before the first range + + if (c == list[i]-1) { + // c is before start of next range + list[i] = c; + // if we touched the HIGH mark, then add a new one + if (c == MAX_VALUE) { + ensureCapacity(len+1); + list[len++] = HIGH; + } + if (i > 0 && c == list[i-1]) { + // collapse adjacent ranges + + // [..., start_k-1, c, c, limit_k, ..., HIGH] + // ^ + // list[i] + System.arraycopy(list, i+1, list, i-1, len-i-1); + len -= 2; + } + } + + else if (i > 0 && c == list[i-1]) { + // c is after end of prior range + list[i-1]++; + // no need to chcek for collapse here + } + + else { + // At this point we know the new char is not adjacent to + // any existing ranges, and it is not 10FFFF. + + + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + + // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] + // ^ + // list[i] + + // Don't use ensureCapacity() to save on copying. + // NOTE: This has no measurable impact on performance, + // but it might help in some usage patterns. + if (len+2 > list.length) { + int[] temp = new int[len + 2 + GROW_EXTRA]; + if (i != 0) System.arraycopy(list, 0, temp, 0, i); + System.arraycopy(list, i, temp, i+2, len-i); + list = temp; + } else { + System.arraycopy(list, i, list, i+2, len-i); + } + + list[i] = c; + list[i+1] = c+1; + len += 2; + } + + return this; + } + + /** + * Adds the specified multicharacter to this set if it is not already + * present. If this set already contains the multicharacter, + * the call leaves this set unchanged. + * Thus {@code "ch" => {"ch"}} + *
Warning: you cannot add an empty string ("") to a UnicodeSet. + * @param s the source string + * @return this object, for chaining + * @stable ICU 2.0 + */ + public final UnicodeSet add(CharSequence s) { + checkFrozen(); + int cp = getSingleCP(s); + if (cp < 0) { + strings.add(s.toString()); + } else { + add_unchecked(cp, cp); + } + return this; + } + + /** + * Utility for getting code point from single code point CharSequence. + * See the public UTF16.getSingleCodePoint() + * @return a code point IF the string consists of a single one. + * otherwise returns -1. + * @param s to test + */ + private static int getSingleCP(CharSequence s) { + if (s.length() < 1) { + throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); + } + if (s.length() > 2) return -1; + if (s.length() == 1) return s.charAt(0); + + // at this point, len = 2 + int cp = UTF16.charAt(s, 0); + if (cp > 0xFFFF) { // is surrogate pair + return cp; + } + return -1; + } + + /** + * Complements the specified range in this set. Any character in + * the range will be removed if it is in this set, or will be + * added if it is not in this set. If {@code start > end} + * then an empty range is complemented, leaving the set unchanged. + * + * @param start first character, inclusive, of range to be complemented + * in this set. + * @param end last character, inclusive, of range to be complemented + * in this set. + * @stable ICU 2.0 + */ + public UnicodeSet complement(int start, int end) { + checkFrozen(); + if (start < MIN_VALUE || start > MAX_VALUE) { + throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6)); + } + if (end < MIN_VALUE || end > MAX_VALUE) { + throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6)); + } + if (start <= end) { + xor(range(start, end), 2, 0); + } + return this; + } + + /** + * Returns true if this set contains the given character.
+ * @param c character to be checked for containment + * @return true if the test condition is met + * @stable ICU 2.0 + */ + public boolean contains(int c) { + if (c < MIN_VALUE || c > MAX_VALUE) { + throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6)); + } + if (bmpSet != null) { + return bmpSet.contains(c); + } + if (stringSpan != null) { + return stringSpan.contains(c); + } + + /* + // Set i to the index of the start item greater than ch + // We know we will terminate without length test! + int i = -1; + while (true) { + if (c < list[++i]) break; + } + */ + + int i = findCodePoint(c); + + return ((i & 1) != 0); // return true if odd + } + + /** + * Returns the smallest value i such that c < list[i]. Caller + * must ensure that c is a legal value or this method will enter + * an infinite loop. This method performs a binary search. + * @param c a character in the range MIN_VALUE..MAX_VALUE + * inclusive + * @return the smallest integer i in the range 0..len-1, + * inclusive, such that c < list[i] + */ + private final int findCodePoint(int c) { + /* Examples: + findCodePoint(c) + set list[] c=0 1 3 4 7 8 + === ============== =========== + [] [110000] 0 0 0 0 0 0 + [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 + [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 + [:all:] [0, 110000] 1 1 1 1 1 1 + */ + + // Return the smallest i such that c < list[i]. Assume + // list[len - 1] == HIGH and that c is legal (0..HIGH-1). + if (c < list[0]) return 0; + // High runner test. c is often after the last range, so an + // initial check for this condition pays off. + if (len >= 2 && c >= list[len-2]) return len-1; + int lo = 0; + int hi = len - 1; + // invariant: c >= list[lo] + // invariant: c < list[hi] + for (;;) { + int i = (lo + hi) >>> 1; + if (i == lo) return hi; + if (c < list[i]) { + hi = i; + } else { + lo = i; + } + } + } + + /** + * Retains only the elements in this set that are contained in the + * specified set. 
In other words, removes from this set all of + * its elements that are not contained in the specified set. This + * operation effectively modifies this set so that its value is + * the intersection of the two sets. + * + * @param c set that defines which elements this set will retain. + * @stable ICU 2.0 + */ + public UnicodeSet retainAll(UnicodeSet c) { + checkFrozen(); + retain(c.list, c.len, 0); + strings.retainAll(c.strings); + return this; + } + + /** + * Removes all of the elements from this set. This set will be + * empty after this call returns. + * @stable ICU 2.0 + */ + public UnicodeSet clear() { + checkFrozen(); + list[0] = HIGH; + len = 1; + strings.clear(); + return this; + } + + /** + * Iteration method that returns the number of ranges contained in + * this set. + * @see #getRangeStart + * @see #getRangeEnd + * @stable ICU 2.0 + */ + public int getRangeCount() { + return len/2; + } + + /** + * Iteration method that returns the first character in the + * specified range of this set. + * @exception ArrayIndexOutOfBoundsException if index is outside + * the range 0..getRangeCount()-1 + * @see #getRangeCount + * @see #getRangeEnd + * @stable ICU 2.0 + */ + public int getRangeStart(int index) { + return list[index*2]; + } + + /** + * Iteration method that returns the last character in the + * specified range of this set. + * @exception ArrayIndexOutOfBoundsException if index is outside + * the range 0..getRangeCount()-1 + * @see #getRangeStart + * @see #getRangeEnd + * @stable ICU 2.0 + */ + public int getRangeEnd(int index) { + return (list[index*2 + 1] - 1); + } + + //---------------------------------------------------------------- + // Implementation: Pattern parsing + //---------------------------------------------------------------- + + /** + * Parses the given pattern, starting at the given position. The character + * at pattern.charAt(pos.getIndex()) must be '[', or the parse fails. + * Parsing continues until the corresponding closing ']'. 
If a syntax error + * is encountered between the opening and closing brace, the parse fails. + * Upon return from a successful parse, the ParsePosition is updated to + * point to the character following the closing ']', and an inversion + * list for the parsed pattern is returned. This method + * calls itself recursively to parse embedded subpatterns. + * + * @param pattern the string containing the pattern to be parsed. The + * portion of the string from pos.getIndex(), which must be a '[', to the + * corresponding closing ']', is parsed. + * @param pos upon entry, the position at which to being parsing. The + * character at pattern.charAt(pos.getIndex()) must be a '['. Upon return + * from a successful parse, pos.getIndex() is either the character after the + * closing ']' of the parsed pattern, or pattern.length() if the closing ']' + * is the last character of the pattern string. + * @return an inversion list for the parsed substring + * of pattern + * @exception java.lang.IllegalArgumentException if the parse fails. + */ + private UnicodeSet applyPattern(String pattern, + ParsePosition pos) { + if ("[:age=3.2:]".equals(pattern)) { + checkFrozen(); + VersionInfo version = VersionInfo.getInstance("3.2"); + applyFilter(new VersionFilter(version), UCharacterProperty.SRC_PROPSVEC); + } else { + throw new IllegalStateException("UnicodeSet.applyPattern(unexpected pattern " + + pattern + ")"); + } + + return this; + } + + //---------------------------------------------------------------- + // Implementation: Utility methods + //---------------------------------------------------------------- + + private void ensureCapacity(int newLen) { + if (newLen <= list.length) return; + int[] temp = new int[newLen + GROW_EXTRA]; + System.arraycopy(list, 0, temp, 0, len); + list = temp; + } + + private void ensureBufferCapacity(int newLen) { + if (buffer != null && newLen <= buffer.length) return; + buffer = new int[newLen + GROW_EXTRA]; + } + + /** + * Assumes start <= end. 
+ */ + private int[] range(int start, int end) { + if (rangeList == null) { + rangeList = new int[] { start, end+1, HIGH }; + } else { + rangeList[0] = start; + rangeList[1] = end+1; + } + return rangeList; + } + + //---------------------------------------------------------------- + // Implementation: Fundamental operations + //---------------------------------------------------------------- + + // polarity = 0, 3 is normal: x xor y + // polarity = 1, 2: x xor ~y == x === y + + private UnicodeSet xor(int[] other, int otherLen, int polarity) { + ensureBufferCapacity(len + otherLen); + int i = 0, j = 0, k = 0; + int a = list[i++]; + int b; + if (polarity == 1 || polarity == 2) { + b = LOW; + if (other[j] == LOW) { // skip base if already LOW + ++j; + b = other[j]; + } + } else { + b = other[j++]; + } + // simplest of all the routines + // sort the values, discarding identicals! + while (true) { + if (a < b) { + buffer[k++] = a; + a = list[i++]; + } else if (b < a) { + buffer[k++] = b; + b = other[j++]; + } else if (a != HIGH) { // at this point, a == b + // discard both values! + a = list[i++]; + b = other[j++]; + } else { // DONE! + buffer[k++] = HIGH; + len = k; + break; + } + } + // swap list and buffer + int[] temp = list; + list = buffer; + buffer = temp; + return this; + } + + // polarity = 0 is normal: x union y + // polarity = 2: x union ~y + // polarity = 1: ~x union y + // polarity = 3: ~x union ~y + + private UnicodeSet add(int[] other, int otherLen, int polarity) { + ensureBufferCapacity(len + otherLen); + int i = 0, j = 0, k = 0; + int a = list[i++]; + int b = other[j++]; + // change from xor is that we have to check overlapping pairs + // polarity bit 1 means a is second, bit 2 means b is. + main: + while (true) { + switch (polarity) { + case 0: // both first; take lower if unequal + if (a < b) { // take a + // Back up over overlapping ranges in buffer[] + if (k > 0 && a <= buffer[k-1]) { + // Pick latter end value in buffer[] vs. 
list[] + a = max(list[i], buffer[--k]); + } else { + // No overlap + buffer[k++] = a; + a = list[i]; + } + i++; // Common if/else code factored out + polarity ^= 1; + } else if (b < a) { // take b + if (k > 0 && b <= buffer[k-1]) { + b = max(other[j], buffer[--k]); + } else { + buffer[k++] = b; + b = other[j]; + } + j++; + polarity ^= 2; + } else { // a == b, take a, drop b + if (a == HIGH) break main; + // This is symmetrical; it doesn't matter if + // we backtrack with a or b. - liu + if (k > 0 && a <= buffer[k-1]) { + a = max(list[i], buffer[--k]); + } else { + // No overlap + buffer[k++] = a; + a = list[i]; + } + i++; + polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 3: // both second; take higher if unequal, and drop other + if (b <= a) { // take a + if (a == HIGH) break main; + buffer[k++] = a; + } else { // take b + if (b == HIGH) break main; + buffer[k++] = b; + } + a = list[i++]; polarity ^= 1; // factored common code + b = other[j++]; polarity ^= 2; + break; + case 1: // a second, b first; if b < a, overlap + if (a < b) { // no overlap, take a + buffer[k++] = a; a = list[i++]; polarity ^= 1; + } else if (b < a) { // OVERLAP, drop b + b = other[j++]; polarity ^= 2; + } else { // a == b, drop both! + if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 2: // a first, b second; if a < b, overlap + if (b < a) { // no overlap, take b + buffer[k++] = b; b = other[j++]; polarity ^= 2; + } else if (a < b) { // OVERLAP, drop a + a = list[i++]; polarity ^= 1; + } else { // a == b, drop both! 
+ if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + } + } + buffer[k++] = HIGH; // terminate + len = k; + // swap list and buffer + int[] temp = list; + list = buffer; + buffer = temp; + return this; + } + + // polarity = 0 is normal: x intersect y + // polarity = 2: x intersect ~y == set-minus + // polarity = 1: ~x intersect y + // polarity = 3: ~x intersect ~y + + private UnicodeSet retain(int[] other, int otherLen, int polarity) { + ensureBufferCapacity(len + otherLen); + int i = 0, j = 0, k = 0; + int a = list[i++]; + int b = other[j++]; + // change from xor is that we have to check overlapping pairs + // polarity bit 1 means a is second, bit 2 means b is. + main: + while (true) { + switch (polarity) { + case 0: // both first; drop the smaller + if (a < b) { // drop a + a = list[i++]; polarity ^= 1; + } else if (b < a) { // drop b + b = other[j++]; polarity ^= 2; + } else { // a == b, take one, drop other + if (a == HIGH) break main; + buffer[k++] = a; a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 3: // both second; take lower if unequal + if (a < b) { // take a + buffer[k++] = a; a = list[i++]; polarity ^= 1; + } else if (b < a) { // take b + buffer[k++] = b; b = other[j++]; polarity ^= 2; + } else { // a == b, take one, drop other + if (a == HIGH) break main; + buffer[k++] = a; a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 1: // a second, b first; + if (a < b) { // NO OVERLAP, drop a + a = list[i++]; polarity ^= 1; + } else if (b < a) { // OVERLAP, take b + buffer[k++] = b; b = other[j++]; polarity ^= 2; + } else { // a == b, drop both! 
+ if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + case 2: // a first, b second; if a < b, overlap + if (b < a) { // no overlap, drop b + b = other[j++]; polarity ^= 2; + } else if (a < b) { // OVERLAP, take a + buffer[k++] = a; a = list[i++]; polarity ^= 1; + } else { // a == b, drop both! + if (a == HIGH) break main; + a = list[i++]; polarity ^= 1; + b = other[j++]; polarity ^= 2; + } + break; + } + } + buffer[k++] = HIGH; // terminate + len = k; + // swap list and buffer + int[] temp = list; + list = buffer; + buffer = temp; + return this; + } + + private static final int max(int a, int b) { + return (a > b) ? a : b; + } + + //---------------------------------------------------------------- + // Generic filter-based scanning code + //---------------------------------------------------------------- + + private static interface Filter { + boolean contains(int codePoint); + } + + private static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0); + + private static class VersionFilter implements Filter { + VersionInfo version; + VersionFilter(VersionInfo version) { this.version = version; } + public boolean contains(int ch) { + VersionInfo v = UCharacter.getAge(ch); + // Reference comparison ok; VersionInfo caches and reuses + // unique objects. + return v != NO_VERSION && + v.compareTo(version) <= 0; + } + } + + private static synchronized UnicodeSet getInclusions(int src) { + if (src != UCharacterProperty.SRC_PROPSVEC) { + throw new IllegalStateException("UnicodeSet.getInclusions(unknown src "+src+")"); + } + + if (INCLUSION == null) { + UnicodeSet incl = new UnicodeSet(); + UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl); + INCLUSION = incl; + } + return INCLUSION; + } + + /** + * Generic filter-based scanning code for UCD property UnicodeSets. 
+ */ + private UnicodeSet applyFilter(Filter filter, int src) { + // Logically, walk through all Unicode characters, noting the start + // and end of each range for which filter.contains(c) is + // true. Add each range to a set. + // + // To improve performance, use an inclusions set which + // encodes information about character ranges that are known + // to have identical properties. + // getInclusions(src) contains exactly the first characters of + // same-value ranges for the given properties "source". + + clear(); + + int startHasProperty = -1; + UnicodeSet inclusions = getInclusions(src); + int limitRange = inclusions.getRangeCount(); + + for (int j=0; j < limitRange; ++j) { + // get current range + int start = inclusions.getRangeStart(j); + int end = inclusions.getRangeEnd(j); + + // for all the code points in the range, process + for (int ch = start; ch <= end; ++ch) { + // only add to the unicodeset on inflection points -- + // where the hasProperty value changes to false + if (filter.contains(ch)) { + if (startHasProperty < 0) { + startHasProperty = ch; + } + } else if (startHasProperty >= 0) { + add_unchecked(startHasProperty, ch-1); + startHasProperty = -1; + } + } + } + if (startHasProperty >= 0) { + add_unchecked(startHasProperty, 0x10FFFF); + } + + return this; + } + + /** + * Is this frozen, according to the Freezable interface? + * + * @return value + * @stable ICU 3.8 + */ + public boolean isFrozen() { + return (bmpSet != null || stringSpan != null); + } + + /** + * Freeze this class, according to the Freezable interface. + * + * @return this + * @stable ICU 4.4 + */ + public UnicodeSet freeze() { + if (!isFrozen()) { + // Do most of what compact() does before freezing because + // compact() will not work when the set is frozen. + // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). + + // Delete buffer first to defragment memory less. + buffer = null; + if (list.length > (len + GROW_EXTRA)) { + // Make the capacity equal to len or 1. + // We don't want to realloc of 0 size. + int capacity = (len == 0) ? 1 : len; + int[] oldList = list; + list = new int[capacity]; + for (int i = capacity; i-- > 0;) { + list[i] = oldList[i]; + } + } + + // Optimize contains() and span() and similar functions.
+ if (!strings.isEmpty()) { + stringSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), UnicodeSetStringSpan.ALL); + } + if (stringSpan == null || !stringSpan.needsStringSpanUTF16()) { + // Optimize for code point spans. + // There are no strings, or + // all strings are irrelevant for span() etc. because + // all of each string's code points are contained in this set. + // However, fully contained strings are relevant for spanAndCount(), + // so we create both objects. + bmpSet = new BMPSet(list, len); + } + } + return this; + } + + /** + * Span a string using this UnicodeSet. + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param s The string to be spanned + * @param spanCondition The span condition + * @return the length of the span + * @stable ICU 4.4 + */ + public int span(CharSequence s, SpanCondition spanCondition) { + return span(s, 0, spanCondition); + } + + /** + * Span a string using this UnicodeSet. + * If the start index is less than 0, span will start from 0. + * If the start index is greater than the string length, span returns the string length. + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @return the string index which ends the span (i.e. exclusive) + * @stable ICU 4.4 + */ + public int span(CharSequence s, int start, SpanCondition spanCondition) { + int end = s.length(); + if (start < 0) { + start = 0; + } else if (start >= end) { + return end; + } + if (bmpSet != null) { + // Frozen set without strings, or no string is relevant for span(). + return bmpSet.span(s, start, spanCondition, null); + } + if (stringSpan != null) { + return stringSpan.span(s, start, spanCondition); + } else if (!strings.isEmpty()) { + int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); + if (strSpan.needsStringSpanUTF16()) { + return strSpan.span(s, start, spanCondition); + } + } + + return spanCodePointsAndCount(s, start, spanCondition, null); + } + + /** + * Same as span() but also counts the smallest number of set elements on any path across the span. + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param outCount An output-only object (must not be null) for returning the count. + * @return the limit (exclusive end) of the span + */ + public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount) { + if (outCount == null) { + throw new IllegalArgumentException("outCount must not be null"); + } + int end = s.length(); + if (start < 0) { + start = 0; + } else if (start >= end) { + return end; + } + if (stringSpan != null) { + // We might also have bmpSet != null, + // but fully-contained strings are relevant for counting elements. + return stringSpan.spanAndCount(s, start, spanCondition, outCount); + } else if (bmpSet != null) { + return bmpSet.span(s, start, spanCondition, outCount); + } else if (!strings.isEmpty()) { + int which = spanCondition == SpanCondition.NOT_CONTAINED ? UnicodeSetStringSpan.FWD_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.FWD_UTF16_CONTAINED; + which |= UnicodeSetStringSpan.WITH_COUNT; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); + return strSpan.spanAndCount(s, start, spanCondition, outCount); + } + + return spanCodePointsAndCount(s, start, spanCondition, outCount); + } + + private int spanCodePointsAndCount(CharSequence s, int start, + SpanCondition spanCondition, OutputInt outCount) { + // Pin to 0/1 values. + boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); + + int c; + int next = start; + int length = s.length(); + int count = 0; + do { + c = Character.codePointAt(s, next); + if (spanContained != contains(c)) { + break; + } + ++count; + next += Character.charCount(c); + } while (next < length); + if (outCount != null) { outCount.value = count; } + return next; + } + + /** + * Span a string backwards (from the fromIndex) using this UnicodeSet. + * If the fromIndex is less than 0, spanBack will return 0. 
+ * If fromIndex is greater than the string length, spanBack will start from the string length. + *

To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}. + * @param s The string to be spanned + * @param fromIndex The index of the char (exclusive) that the string should be spanned backwards + * @param spanCondition The span condition + * @return The string index which starts the span (i.e. inclusive). + * @stable ICU 4.4 + */ + public int spanBack(CharSequence s, int fromIndex, SpanCondition spanCondition) { + if (fromIndex <= 0) { + return 0; + } + if (fromIndex > s.length()) { + fromIndex = s.length(); + } + if (bmpSet != null) { + // Frozen set without strings, or no string is relevant for spanBack(). + return bmpSet.spanBack(s, fromIndex, spanCondition); + } + if (stringSpan != null) { + return stringSpan.spanBack(s, fromIndex, spanCondition); + } else if (!strings.isEmpty()) { + int which = (spanCondition == SpanCondition.NOT_CONTAINED) + ? UnicodeSetStringSpan.BACK_UTF16_NOT_CONTAINED + : UnicodeSetStringSpan.BACK_UTF16_CONTAINED; + UnicodeSetStringSpan strSpan = new UnicodeSetStringSpan(this, new ArrayList(strings), which); + if (strSpan.needsStringSpanUTF16()) { + return strSpan.spanBack(s, fromIndex, spanCondition); + } + } + + // Pin to 0/1 values. + boolean spanContained = (spanCondition != SpanCondition.NOT_CONTAINED); + + int c; + int prev = fromIndex; + do { + c = Character.codePointBefore(s, prev); + if (spanContained != contains(c)) { + break; + } + prev -= Character.charCount(c); + } while (prev > 0); + return prev; + } + + /** + * Clone a thawed version of this class, according to the Freezable interface. 
+ * @return the clone, not frozen + * @stable ICU 4.4 + */ + public UnicodeSet cloneAsThawed() { + UnicodeSet result = new UnicodeSet(this); + assert !result.isFrozen(); + return result; + } + + // internal function + private void checkFrozen() { + if (isFrozen()) { + throw new UnsupportedOperationException("Attempt to modify frozen object"); + } + } + + /** + * Argument values for whether span() and similar functions continue while the current character is contained vs. + * not contained in the set. + *

+ * The functionality is straightforward for sets with only single code points, without strings (which is the common + * case): + *

    + *
  • CONTAINED and SIMPLE work the same. + *
  • CONTAINED and SIMPLE are inverses of NOT_CONTAINED. + *
  • span() and spanBack() partition any string the + * same way when alternating between span(NOT_CONTAINED) and span(either "contained" condition). + *
  • Using a + * complemented (inverted) set and the opposite span conditions yields the same results. + *
+ * When a set contains multi-code point strings, then these statements may not be true, depending on the strings in + * the set (for example, whether they overlap with each other) and the string that is processed. For a set with + * strings: + *
    + *
  • The complement of the set contains the opposite set of code points, but the same set of strings. + * Therefore, complementing both the set and the span conditions may yield different results. + *
  • When starting spans + * at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different + * because a set string may start before the later position. + *
  • span(SIMPLE) may be shorter than + * span(CONTAINED) because it will not recursively try all possible paths. For example, with a set which + * contains the three strings "xy", "xya" and "ax", span("xyax", CONTAINED) will return 4 but span("xyax", + * SIMPLE) will return 3. span(SIMPLE) will never be longer than span(CONTAINED). + *
  • With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, + * with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield + * contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }. + *
+ * Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then + * either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could + * be used. + *

+ * Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point + * boundaries, never in the middle of a surrogate pair. + * + * @stable ICU 4.4 + */ + public enum SpanCondition { + /** + * Continues a span() while there is no set element at the current position. + * Increments by one code point at a time. + * Stops before the first set element (character or string). + * (For code points only, this is like while contains(current)==false). + *

+ * When span() returns, the substring between where it started and the position it returned consists only of + * characters that are not in the set, and none of its strings overlap with the span. + * + * @stable ICU 4.4 + */ + NOT_CONTAINED, + + /** + * Spans the longest substring that is a concatenation of set elements (characters or strings). + * (For characters only, this is like while contains(current)==true). + *

+ * When span() returns, the substring between where it started and the position it returned consists only of set + * elements (characters or strings) that are in the set. + *

+ * If a set contains strings, then the span will be the longest substring for which there + * exists at least one non-overlapping concatenation of set elements (characters or strings). + * This is equivalent to a POSIX regular expression for (OR of each set element)*. + * (Java/ICU/Perl regex stops at the first match of an OR.) + * + * @stable ICU 4.4 + */ + CONTAINED, + + /** + * Continues a span() while there is a set element at the current position. + * Increments by the longest matching element at each position. + * (For characters only, this is like while contains(current)==true). + *

+ * When span() returns, the substring between where it started and the position it returned consists only of set + * elements (characters or strings) that are in the set. + *

+ * If a set only contains single characters, then this is the same as CONTAINED. + *

+ * If a set contains strings, then the span will be the longest substring with a match at each position with the + * longest single set element (character or string). + *

+ * Use this span condition together with other longest-match algorithms, such as ICU converters + * (ucnv_getUnicodeSet()). + * + * @stable ICU 4.4 + */ + SIMPLE, + } + +} --- old/src/java.base/share/classes/sun/text/normalizer/CodePointMap.java 2020-01-10 15:58:10.000000000 -0800 +++ /dev/null 2020-01-10 15:58:10.000000000 -0800 @@ -1,501 +0,0 @@ -/* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -// (c) 2018 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License - -// created: 2018may10 Markus W. Scherer - -package sun.text.normalizer; - -import java.util.Iterator; -import java.util.NoSuchElementException; - -/** - * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values. - * This does not implement java.util.Map. 
- * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ -public abstract class CodePointMap implements Iterable { - /** - * Selectors for how getRange() should report value ranges overlapping with surrogates. - * Most users should use NORMAL. - * - * @see #getRange - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public enum RangeOption { - /** - * getRange() enumerates all same-value ranges as stored in the map. - * Most users should use this option. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - NORMAL, - /** - * getRange() enumerates all same-value ranges as stored in the map, - * except that lead surrogates (U+D800..U+DBFF) are treated as having the - * surrogateValue, which is passed to getRange() as a separate parameter. - * The surrogateValue is not transformed via filter(). - * See {@link Character#isHighSurrogate}. - * - *

Most users should use NORMAL instead. - * - *

This option is useful for maps that map surrogate code *units* to - * special values optimized for UTF-16 string processing - * or for special error behavior for unpaired surrogates, - * but those values are not to be associated with the lead surrogate code *points*. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - FIXED_LEAD_SURROGATES, - /** - * getRange() enumerates all same-value ranges as stored in the map, - * except that all surrogates (U+D800..U+DFFF) are treated as having the - * surrogateValue, which is passed to getRange() as a separate parameter. - * The surrogateValue is not transformed via filter(). - * See {@link Character#isSurrogate}. - * - *

Most users should use NORMAL instead. - * - *

This option is useful for maps that map surrogate code *units* to - * special values optimized for UTF-16 string processing - * or for special error behavior for unpaired surrogates, - * but those values are not to be associated with the lead surrogate code *points*. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - FIXED_ALL_SURROGATES - } - - /** - * Callback function interface: Modifies a map value. - * Optionally called by getRange(). - * The modified value will be returned by the getRange() function. - * - *

Can be used to ignore some of the value bits, - * make a filter for one of several values, - * return a value index computed from the map value, etc. - * - * @see #getRange - * @see #iterator - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public interface ValueFilter { - /** - * Modifies the map value. - * - * @param value map value - * @return modified value - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public int apply(int value); - } - - /** - * Range iteration result data. - * Code points from start to end map to the same value. - * The value may have been modified by {@link ValueFilter#apply(int)}, - * or it may be the surrogateValue if a RangeOption other than "normal" was used. - * - * @see #getRange - * @see #iterator - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static final class Range { - private int start; - private int end; - private int value; - - /** - * Constructor. Sets start and end to -1 and value to 0. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public Range() { - start = end = -1; - value = 0; - } - - /** - * @return the start code point - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public int getStart() { return start; } - /** - * @return the (inclusive) end code point - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public int getEnd() { return end; } - /** - * @return the range value - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public int getValue() { return value; } - /** - * Sets the range. When using {@link #iterator()}, - * iteration will resume after the newly set end. 
- * - * @param start new start code point - * @param end new end code point - * @param value new value - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public void set(int start, int end, int value) { - this.start = start; - this.end = end; - this.value = value; - } - } - - private final class RangeIterator implements Iterator { - private Range range = new Range(); - - @Override - public boolean hasNext() { - return -1 <= range.end && range.end < 0x10ffff; - } - - @Override - public Range next() { - if (getRange(range.end + 1, null, range)) { - return range; - } else { - throw new NoSuchElementException(); - } - } - - @Override - public final void remove() { - throw new UnsupportedOperationException(); - } - } - - /** - * Iterates over code points of a string and fetches map values. - * This does not implement java.util.Iterator. - * - *

-     * void onString(CodePointMap map, CharSequence s, int start) {
-     *     CodePointMap.StringIterator iter = map.stringIterator(s, start);
-     *     while (iter.next()) {
-     *         int end = iter.getIndex();  // code point from between start and end
-     *         useValue(s, start, end, iter.getCodePoint(), iter.getValue());
-     *         start = end;
-     *     }
-     * }
-     * 
- * - *

This class is not intended for public subclassing. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public class StringIterator { - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected CharSequence s; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected int sIndex; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected int c; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected int value; - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected StringIterator(CharSequence s, int sIndex) { - this.s = s; - this.sIndex = sIndex; - c = -1; - value = 0; - } - - /** - * Resets the iterator to a new string and/or a new string index. - * - * @param s string to iterate over - * @param sIndex string index where the iteration will start - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public void reset(CharSequence s, int sIndex) { - this.s = s; - this.sIndex = sIndex; - c = -1; - value = 0; - } - - /** - * Reads the next code point, post-increments the string index, - * and gets a value from the map. - * Sets an implementation-defined error value if the code point is an unpaired surrogate. - * - * @return true if the string index was not yet at the end of the string; - * otherwise the iterator did not advance - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public boolean next() { - if (sIndex >= s.length()) { - return false; - } - c = Character.codePointAt(s, sIndex); - sIndex += Character.charCount(c); - value = get(c); - return true; - } - - /** - * Reads the previous code point, pre-decrements the string index, - * and gets a value from the map. 
- * Sets an implementation-defined error value if the code point is an unpaired surrogate. - * - * @return true if the string index was not yet at the start of the string; - * otherwise the iterator did not advance - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public boolean previous() { - if (sIndex <= 0) { - return false; - } - c = Character.codePointBefore(s, sIndex); - sIndex -= Character.charCount(c); - value = get(c); - return true; - } - /** - * @return the string index - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public final int getIndex() { return sIndex; } - /** - * @return the code point - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public final int getCodePoint() { return c; } - /** - * @return the map value, - * or an implementation-defined error value if - * the code point is an unpaired surrogate - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public final int getValue() { return value; } - } - - /** - * Protected no-args constructor. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - protected CodePointMap() { - } - - /** - * Returns the value for a code point as stored in the map, with range checking. - * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF. - * - * @param c the code point - * @return the map value, - * or an implementation-defined error value if - * the code point is not in the range 0..U+10FFFF - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public abstract int get(int c); - - /** - * Sets the range object to a range of code points beginning with the start parameter. 
- * The range start is the same as the start input parameter - * (even if there are preceding code points that have the same value). - * The range end is the last code point such that - * all those from start to there have the same value. - * Returns false if start is not 0..U+10FFFF. - * Can be used to efficiently iterate over all same-value ranges in a map. - * (This is normally faster than iterating over code points and get()ting each value, - * but may be much slower than a data structure that stores ranges directly.) - * - *

If the {@link ValueFilter} parameter is not null, then - * the value to be delivered is passed through that filter, and the return value is the end - * of the range where all values are modified to the same actual value. - * The value is unchanged if that parameter is null. - * - *

Example: - *

-     * int start = 0;
-     * CodePointMap.Range range = new CodePointMap.Range();
-     * while (map.getRange(start, null, range)) {
-     *     int end = range.getEnd();
-     *     int value = range.getValue();
-     *     // Work with the range start..end and its value.
-     *     start = end + 1;
-     * }
-     * 
- * - * @param start range start - * @param filter an object that may modify the map data value, - * or null if the values from the map are to be used unmodified - * @param range the range object that will be set to the code point range and value - * @return true if start is 0..U+10FFFF; otherwise no new range is fetched - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public abstract boolean getRange(int start, ValueFilter filter, Range range); - - /** - * Sets the range object to a range of code points beginning with the start parameter. - * The range start is the same as the start input parameter - * (even if there are preceding code points that have the same value). - * The range end is the last code point such that - * all those from start to there have the same value. - * Returns false if start is not 0..U+10FFFF. - * - *

Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally - * modifies the range if it overlaps with surrogate code points. - * - * @param start range start - * @param option defines whether surrogates are treated normally, - * or as having the surrogateValue; usually {@link RangeOption#NORMAL} - * @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL} - * @param filter an object that may modify the map data value, - * or null if the values from the map are to be used unmodified - * @param range the range object that will be set to the code point range and value - * @return true if start is 0..U+10FFFF; otherwise no new range is fetched - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public boolean getRange(int start, RangeOption option, int surrogateValue, - ValueFilter filter, Range range) { - assert option != null; - if (!getRange(start, filter, range)) { - return false; - } - if (option == RangeOption.NORMAL) { - return true; - } - int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff; - int end = range.end; - if (end < 0xd7ff || start > surrEnd) { - return true; - } - // The range overlaps with surrogates, or ends just before the first one. - if (range.value == surrogateValue) { - if (end >= surrEnd) { - // Surrogates followed by a non-surrValue range, - // or surrogates are part of a larger surrValue range. - return true; - } - } else { - if (start <= 0xd7ff) { - range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates. - return true; - } - // Start is a surrogate with a non-surrValue code *unit* value. - // Return a surrValue code *point* range. - range.value = surrogateValue; - if (end > surrEnd) { - range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range. - return true; - } - } - // See if the surrValue surrogate range can be merged with - // an immediately following range. 
- if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) { - range.start = start; - return true; - } - range.start = start; - range.end = surrEnd; - range.value = surrogateValue; - return true; - } - - /** - * Convenience iterator over same-map-value code point ranges. - * Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)} - * without filtering. - * Adjacent ranges have different map values. - * - *

The iterator always returns the same Range object. - * - * @return a Range iterator - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public Iterator iterator() { - return new RangeIterator(); - } - - /** - * Returns an iterator (not a java.util.Iterator) over code points of a string - * for fetching map values. - * - * @param s string to iterate over - * @param sIndex string index where the iteration will start - * @return the iterator - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public StringIterator stringIterator(CharSequence s, int sIndex) { - return new StringIterator(s, sIndex); - } -} --- /dev/null 2020-01-10 15:58:10.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/util/CodePointMap.java 2020-01-10 15:58:10.000000000 -0800 @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +// (c) 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// created: 2018may10 Markus W. Scherer + +package jdk.internal.icu.util; + +import java.util.Iterator; +import java.util.NoSuchElementException; + +/** + * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values. + * This does not implement java.util.Map. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ +public abstract class CodePointMap implements Iterable { + /** + * Selectors for how getRange() should report value ranges overlapping with surrogates. + * Most users should use NORMAL. + * + * @see #getRange + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public enum RangeOption { + /** + * getRange() enumerates all same-value ranges as stored in the map. + * Most users should use this option. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + NORMAL, + /** + * getRange() enumerates all same-value ranges as stored in the map, + * except that lead surrogates (U+D800..U+DBFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See {@link Character#isHighSurrogate}. + * + *

Most users should use NORMAL instead. + * + *

This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the lead surrogate code *points*. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + FIXED_LEAD_SURROGATES, + /** + * getRange() enumerates all same-value ranges as stored in the map, + * except that all surrogates (U+D800..U+DFFF) are treated as having the + * surrogateValue, which is passed to getRange() as a separate parameter. + * The surrogateValue is not transformed via filter(). + * See {@link Character#isSurrogate}. + * + *

Most users should use NORMAL instead. + * + *

This option is useful for maps that map surrogate code *units* to + * special values optimized for UTF-16 string processing + * or for special error behavior for unpaired surrogates, + * but those values are not to be associated with the lead surrogate code *points*. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + FIXED_ALL_SURROGATES + } + + /** + * Callback function interface: Modifies a map value. + * Optionally called by getRange(). + * The modified value will be returned by the getRange() function. + * + *

Can be used to ignore some of the value bits, + * make a filter for one of several values, + * return a value index computed from the map value, etc. + * + * @see #getRange + * @see #iterator + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public interface ValueFilter { + /** + * Modifies the map value. + * + * @param value map value + * @return modified value + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public int apply(int value); + } + + /** + * Range iteration result data. + * Code points from start to end map to the same value. + * The value may have been modified by {@link ValueFilter#apply(int)}, + * or it may be the surrogateValue if a RangeOption other than "normal" was used. + * + * @see #getRange + * @see #iterator + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static final class Range { + private int start; + private int end; + private int value; + + /** + * Constructor. Sets start and end to -1 and value to 0. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public Range() { + start = end = -1; + value = 0; + } + + /** + * @return the start code point + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public int getStart() { return start; } + /** + * @return the (inclusive) end code point + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public int getEnd() { return end; } + /** + * @return the range value + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public int getValue() { return value; } + /** + * Sets the range. When using {@link #iterator()}, + * iteration will resume after the newly set end. 
+ * + * @param start new start code point + * @param end new end code point + * @param value new value + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public void set(int start, int end, int value) { + this.start = start; + this.end = end; + this.value = value; + } + } + + private final class RangeIterator implements Iterator { + private Range range = new Range(); + + @Override + public boolean hasNext() { + return -1 <= range.end && range.end < 0x10ffff; + } + + @Override + public Range next() { + if (getRange(range.end + 1, null, range)) { + return range; + } else { + throw new NoSuchElementException(); + } + } + + @Override + public final void remove() { + throw new UnsupportedOperationException(); + } + } + + /** + * Iterates over code points of a string and fetches map values. + * This does not implement java.util.Iterator. + * + *

+     * void onString(CodePointMap map, CharSequence s, int start) {
+     *     CodePointMap.StringIterator iter = map.stringIterator(s, start);
+     *     while (iter.next()) {
+     *         int end = iter.getIndex();  // code point from between start and end
+     *         useValue(s, start, end, iter.getCodePoint(), iter.getValue());
+     *         start = end;
+     *     }
+     * }
+     * 
+ * + *

This class is not intended for public subclassing. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public class StringIterator { + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected CharSequence s; + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected int sIndex; + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected int c; + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected int value; + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected StringIterator(CharSequence s, int sIndex) { + this.s = s; + this.sIndex = sIndex; + c = -1; + value = 0; + } + + /** + * Resets the iterator to a new string and/or a new string index. + * + * @param s string to iterate over + * @param sIndex string index where the iteration will start + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public void reset(CharSequence s, int sIndex) { + this.s = s; + this.sIndex = sIndex; + c = -1; + value = 0; + } + + /** + * Reads the next code point, post-increments the string index, + * and gets a value from the map. + * Sets an implementation-defined error value if the code point is an unpaired surrogate. + * + * @return true if the string index was not yet at the end of the string; + * otherwise the iterator did not advance + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public boolean next() { + if (sIndex >= s.length()) { + return false; + } + c = Character.codePointAt(s, sIndex); + sIndex += Character.charCount(c); + value = get(c); + return true; + } + + /** + * Reads the previous code point, pre-decrements the string index, + * and gets a value from the map. 
+ * Sets an implementation-defined error value if the code point is an unpaired surrogate. + * + * @return true if the string index was not yet at the start of the string; + * otherwise the iterator did not advance + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public boolean previous() { + if (sIndex <= 0) { + return false; + } + c = Character.codePointBefore(s, sIndex); + sIndex -= Character.charCount(c); + value = get(c); + return true; + } + /** + * @return the string index + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public final int getIndex() { return sIndex; } + /** + * @return the code point + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public final int getCodePoint() { return c; } + /** + * @return the map value, + * or an implementation-defined error value if + * the code point is an unpaired surrogate + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public final int getValue() { return value; } + } + + /** + * Protected no-args constructor. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + protected CodePointMap() { + } + + /** + * Returns the value for a code point as stored in the map, with range checking. + * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF. + * + * @param c the code point + * @return the map value, + * or an implementation-defined error value if + * the code point is not in the range 0..U+10FFFF + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public abstract int get(int c); + + /** + * Sets the range object to a range of code points beginning with the start parameter. 
+ * The range start is the same as the start input parameter + * (even if there are preceding code points that have the same value). + * The range end is the last code point such that + * all those from start to there have the same value. + * Returns false if start is not 0..U+10FFFF. + * Can be used to efficiently iterate over all same-value ranges in a map. + * (This is normally faster than iterating over code points and get()ting each value, + * but may be much slower than a data structure that stores ranges directly.) + * + *

If the {@link ValueFilter} parameter is not null, then + * the value to be delivered is passed through that filter, and the return value is the end + * of the range where all values are modified to the same actual value. + * The value is unchanged if that parameter is null. + * + *

Example: + *

+     * int start = 0;
+     * CodePointMap.Range range = new CodePointMap.Range();
+     * while (map.getRange(start, null, range)) {
+     *     int end = range.getEnd();
+     *     int value = range.getValue();
+     *     // Work with the range start..end and its value.
+     *     start = end + 1;
+     * }
+     * 
+ * + * @param start range start + * @param filter an object that may modify the map data value, + * or null if the values from the map are to be used unmodified + * @param range the range object that will be set to the code point range and value + * @return true if start is 0..U+10FFFF; otherwise no new range is fetched + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public abstract boolean getRange(int start, ValueFilter filter, Range range); + + /** + * Sets the range object to a range of code points beginning with the start parameter. + * The range start is the same as the start input parameter + * (even if there are preceding code points that have the same value). + * The range end is the last code point such that + * all those from start to there have the same value. + * Returns false if start is not 0..U+10FFFF. + * + *

Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally + * modifies the range if it overlaps with surrogate code points. + * + * @param start range start + * @param option defines whether surrogates are treated normally, + * or as having the surrogateValue; usually {@link RangeOption#NORMAL} + * @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL} + * @param filter an object that may modify the map data value, + * or null if the values from the map are to be used unmodified + * @param range the range object that will be set to the code point range and value + * @return true if start is 0..U+10FFFF; otherwise no new range is fetched + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public boolean getRange(int start, RangeOption option, int surrogateValue, + ValueFilter filter, Range range) { + assert option != null; + if (!getRange(start, filter, range)) { + return false; + } + if (option == RangeOption.NORMAL) { + return true; + } + int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff; + int end = range.end; + if (end < 0xd7ff || start > surrEnd) { + return true; + } + // The range overlaps with surrogates, or ends just before the first one. + if (range.value == surrogateValue) { + if (end >= surrEnd) { + // Surrogates followed by a non-surrValue range, + // or surrogates are part of a larger surrValue range. + return true; + } + } else { + if (start <= 0xd7ff) { + range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates. + return true; + } + // Start is a surrogate with a non-surrValue code *unit* value. + // Return a surrValue code *point* range. + range.value = surrogateValue; + if (end > surrEnd) { + range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range. + return true; + } + } + // See if the surrValue surrogate range can be merged with + // an immediately following range. 
+ if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) { + range.start = start; + return true; + } + range.start = start; + range.end = surrEnd; + range.value = surrogateValue; + return true; + } + + /** + * Convenience iterator over same-map-value code point ranges. + * Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)} + * without filtering. + * Adjacent ranges have different map values. + * + *

The iterator always returns the same Range object. + * + * @return a Range iterator + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public Iterator iterator() { + return new RangeIterator(); + } + + /** + * Returns an iterator (not a java.util.Iterator) over code points of a string + * for fetching map values. + * + * @param s string to iterate over + * @param sIndex string index where the iteration will start + * @return the iterator + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public StringIterator stringIterator(CharSequence s, int sIndex) { + return new StringIterator(s, sIndex); + } +} --- old/src/java.base/share/classes/sun/text/normalizer/CodePointTrie.java 2020-01-10 15:58:11.000000000 -0800 +++ /dev/null 2020-01-10 15:58:11.000000000 -0800 @@ -1,1310 +0,0 @@ -/* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -// (c) 2018 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License - -// created: 2018may04 Markus W. Scherer - -package sun.text.normalizer; - -import java.io.DataOutputStream; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.io.OutputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -import static sun.text.normalizer.NormalizerImpl.UTF16Plus; - -/** - * Immutable Unicode code point trie. - * Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values. - * For details see http://site.icu-project.org/design/struct/utrie - * - *

This class is not intended for public subclassing. - * - * @see MutableCodePointTrie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ -@SuppressWarnings("deprecation") -public abstract class CodePointTrie extends CodePointMap { - /** - * Selectors for the type of a CodePointTrie. - * Different trade-offs for size vs. speed. - * - *

Use null for {@link #fromBinary} to accept any type; - * {@link #getType} will return the actual type. - * - * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) - * @see #fromBinary - * @see #getType - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public enum Type { - /** - * Fast/simple/larger BMP data structure. - * The {@link Fast} subclasses have additional functions for lookup for BMP and supplementary code points. - * - * @see Fast - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - FAST, - /** - * Small/slower BMP data structure. - * - * @see Small - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - SMALL - } - - /** - * Selectors for the number of bits in a CodePointTrie data value. - * - *

Use null for {@link #fromBinary} to accept any data value width; - * {@link #getValueWidth} will return the actual data value width. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public enum ValueWidth { - /** - * The trie stores 16 bits per data value. - * It returns them as unsigned values 0..0xffff=65535. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - BITS_16, - /** - * The trie stores 32 bits per data value. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - BITS_32, - /** - * The trie stores 8 bits per data value. - * It returns them as unsigned values 0..0xff=255. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - BITS_8 - } - - private CodePointTrie(char[] index, Data data, int highStart, - int index3NullOffset, int dataNullOffset) { - this.ascii = new int[ASCII_LIMIT]; - this.index = index; - this.data = data; - this.dataLength = data.getDataLength(); - this.highStart = highStart; - this.index3NullOffset = index3NullOffset; - this.dataNullOffset = dataNullOffset; - - for (int c = 0; c < ASCII_LIMIT; ++c) { - ascii[c] = data.getFromIndex(c); - } - - int nullValueOffset = dataNullOffset; - if (nullValueOffset >= dataLength) { - nullValueOffset = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; - } - nullValue = data.getFromIndex(nullValueOffset); - } - - /** - * Creates a trie from its binary form, - * stored in the ByteBuffer starting at the current position. - * Advances the buffer position to just after the trie data. - * Inverse of {@link #toBinary(OutputStream)}. - * - *

The data is copied from the buffer; - * later modification of the buffer will not affect the trie. - * - * @param type selects the trie type; this method throws an exception - * if the type does not match the binary data; - * use null to accept any type - * @param valueWidth selects the number of bits in a data value; this method throws an exception - * if the valueWidth does not match the binary data; - * use null to accept any data value width - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @see MutableCodePointTrie#MutableCodePointTrie(int, int) - * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) - * @see #toBinary(OutputStream) - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static CodePointTrie fromBinary(Type type, ValueWidth valueWidth, ByteBuffer bytes) { - ByteOrder outerByteOrder = bytes.order(); - try { - // Enough data for a trie header? - if (bytes.remaining() < 16 /* sizeof(UCPTrieHeader) */) { - throw new InternalError("Buffer too short for a CodePointTrie header"); - } - - // struct UCPTrieHeader - /** "Tri3" in big-endian US-ASCII (0x54726933) */ - int signature = bytes.getInt(); - - // Check the signature. - switch (signature) { - case 0x54726933: - // The buffer is already set to the trie data byte order. - break; - case 0x33697254: - // Temporarily reverse the byte order. - boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; - bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); - signature = 0x54726933; - break; - default: - throw new InternalError("Buffer does not contain a serialized CodePointTrie"); - } - - // struct UCPTrieHeader continued - /** - * Options bit field: - * Bits 15..12: Data length bits 19..16. - * Bits 11..8: Data null block offset bits 19..16. - * Bits 7..6: UCPTrieType - * Bits 5..3: Reserved (0). 
- * Bits 2..0: UCPTrieValueWidth - */ - int options = bytes.getChar(); - - /** Total length of the index tables. */ - int indexLength = bytes.getChar(); - - /** Data length bits 15..0. */ - int dataLength = bytes.getChar(); - - /** Index-3 null block offset, 0x7fff or 0xffff if none. */ - int index3NullOffset = bytes.getChar(); - - /** Data null block offset bits 15..0, 0xfffff if none. */ - int dataNullOffset = bytes.getChar(); - - /** - * First code point of the single-value range ending with U+10ffff, - * rounded up and then shifted right by SHIFT_2. - */ - int shiftedHighStart = bytes.getChar(); - // struct UCPTrieHeader end - - int typeInt = (options >> 6) & 3; - Type actualType; - switch (typeInt) { - case 0: actualType = Type.FAST; break; - case 1: actualType = Type.SMALL; break; - default: - throw new InternalError("CodePointTrie data header has an unsupported type"); - } - - int valueWidthInt = options & OPTIONS_VALUE_BITS_MASK; - ValueWidth actualValueWidth; - switch (valueWidthInt) { - case 0: actualValueWidth = ValueWidth.BITS_16; break; - case 1: actualValueWidth = ValueWidth.BITS_32; break; - case 2: actualValueWidth = ValueWidth.BITS_8; break; - default: - throw new InternalError("CodePointTrie data header has an unsupported value width"); - } - - if ((options & OPTIONS_RESERVED_MASK) != 0) { - throw new InternalError("CodePointTrie data header has unsupported options"); - } - - if (type == null) { - type = actualType; - } - if (valueWidth == null) { - valueWidth = actualValueWidth; - } - if (type != actualType || valueWidth != actualValueWidth) { - throw new InternalError("CodePointTrie data header has a different type or value width than required"); - } - - // Get the length values and offsets. - dataLength |= ((options & OPTIONS_DATA_LENGTH_MASK) << 4); - dataNullOffset |= ((options & OPTIONS_DATA_NULL_OFFSET_MASK) << 8); - - int highStart = shiftedHighStart << SHIFT_2; - - // Calculate the actual length, minus the header. 
- int actualLength = indexLength * 2; - if (valueWidth == ValueWidth.BITS_16) { - actualLength += dataLength * 2; - } else if (valueWidth == ValueWidth.BITS_32) { - actualLength += dataLength * 4; - } else { - actualLength += dataLength; - } - if (bytes.remaining() < actualLength) { - throw new InternalError("Buffer too short for the CodePointTrie data"); - } - - char[] index = ICUBinary.getChars(bytes, indexLength, 0); - switch (valueWidth) { - case BITS_16: { - char[] data16 = ICUBinary.getChars(bytes, dataLength, 0); - return type == Type.FAST ? - new Fast16(index, data16, highStart, index3NullOffset, dataNullOffset) : - new Small16(index, data16, highStart, index3NullOffset, dataNullOffset); - } - case BITS_32: { - int[] data32 = ICUBinary.getInts(bytes, dataLength, 0); - return type == Type.FAST ? - new Fast32(index, data32, highStart, index3NullOffset, dataNullOffset) : - new Small32(index, data32, highStart, index3NullOffset, dataNullOffset); - } - case BITS_8: { - byte[] data8 = ICUBinary.getBytes(bytes, dataLength, 0); - return type == Type.FAST ? - new Fast8(index, data8, highStart, index3NullOffset, dataNullOffset) : - new Small8(index, data8, highStart, index3NullOffset, dataNullOffset); - } - default: - throw new AssertionError("should be unreachable"); - } - } finally { - bytes.order(outerByteOrder); - } - } - - /** - * Returns the trie type. - * - * @return the trie type - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public abstract Type getType(); - /** - * Returns the number of bits in a trie data value. - * - * @return the number of bits in a trie data value - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public final ValueWidth getValueWidth() { return data.getValueWidth(); } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - @Override - public int get(int c) { - return data.getFromIndex(cpIndex(c)); - } - - /** - * Returns a trie value for an ASCII code point, without range checking. - * - * @param c the input code point; must be U+0000..U+007F - * @return The ASCII code point's trie value. - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public final int asciiGet(int c) { - return ascii[c]; - } - - private static final int MAX_UNICODE = 0x10ffff; - - private static final int ASCII_LIMIT = 0x80; - - private static final int maybeFilterValue(int value, int trieNullValue, int nullValue, - ValueFilter filter) { - if (value == trieNullValue) { - value = nullValue; - } else if (filter != null) { - value = filter.apply(value); - } - return value; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final boolean getRange(int start, ValueFilter filter, Range range) { - if (start < 0 || MAX_UNICODE < start) { - return false; - } - if (start >= highStart) { - int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; - int value = data.getFromIndex(di); - if (filter != null) { value = filter.apply(value); } - range.set(start, MAX_UNICODE, value); - return true; - } - - int nullValue = this.nullValue; - if (filter != null) { nullValue = filter.apply(nullValue); } - Type type = getType(); - - int prevI3Block = -1; - int prevBlock = -1; - int c = start; - // Initialize to make compiler happy. Real value when haveValue is true. - int trieValue = 0, value = 0; - boolean haveValue = false; - do { - int i3Block; - int i3; - int i3BlockLength; - int dataBlockLength; - if (c <= 0xffff && (type == Type.FAST || c <= SMALL_MAX)) { - i3Block = 0; - i3 = c >> FAST_SHIFT; - i3BlockLength = type == Type.FAST ? BMP_INDEX_LENGTH : SMALL_INDEX_LENGTH; - dataBlockLength = FAST_DATA_BLOCK_LENGTH; - } else { - // Use the multi-stage index. 
- int i1 = c >> SHIFT_1; - if (type == Type.FAST) { - assert(0xffff < c && c < highStart); - i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; - } else { - assert(c < highStart && highStart > SMALL_LIMIT); - i1 += SMALL_INDEX_LENGTH; - } - i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; - if (i3Block == prevI3Block && (c - start) >= CP_PER_INDEX_2_ENTRY) { - // The index-3 block is the same as the previous one, and filled with value. - assert((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0); - c += CP_PER_INDEX_2_ENTRY; - continue; - } - prevI3Block = i3Block; - if (i3Block == index3NullOffset) { - // This is the index-3 null block. - if (haveValue) { - if (nullValue != value) { - range.set(start, c - 1, value); - return true; - } - } else { - trieValue = this.nullValue; - value = nullValue; - haveValue = true; - } - prevBlock = dataNullOffset; - c = (c + CP_PER_INDEX_2_ENTRY) & ~(CP_PER_INDEX_2_ENTRY - 1); - continue; - } - i3 = (c >> SHIFT_3) & INDEX_3_MASK; - i3BlockLength = INDEX_3_BLOCK_LENGTH; - dataBlockLength = SMALL_DATA_BLOCK_LENGTH; - } - // Enumerate data blocks for one index-3 block. - do { - int block; - if ((i3Block & 0x8000) == 0) { - block = index[i3Block + i3]; - } else { - // 18-bit indexes stored in groups of 9 entries per 8 indexes. - int group = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); - int gi = i3 & 7; - block = (index[group++] << (2 + (2 * gi))) & 0x30000; - block |= index[group + gi]; - } - if (block == prevBlock && (c - start) >= dataBlockLength) { - // The block is the same as the previous one, and filled with value. - assert((c & (dataBlockLength - 1)) == 0); - c += dataBlockLength; - } else { - int dataMask = dataBlockLength - 1; - prevBlock = block; - if (block == dataNullOffset) { - // This is the data null block. 
- if (haveValue) { - if (nullValue != value) { - range.set(start, c - 1, value); - return true; - } - } else { - trieValue = this.nullValue; - value = nullValue; - haveValue = true; - } - c = (c + dataBlockLength) & ~dataMask; - } else { - int di = block + (c & dataMask); - int trieValue2 = data.getFromIndex(di); - if (haveValue) { - if (trieValue2 != trieValue) { - if (filter == null || - maybeFilterValue(trieValue2, this.nullValue, nullValue, - filter) != value) { - range.set(start, c - 1, value); - return true; - } - trieValue = trieValue2; // may or may not help - } - } else { - trieValue = trieValue2; - value = maybeFilterValue(trieValue2, this.nullValue, nullValue, filter); - haveValue = true; - } - while ((++c & dataMask) != 0) { - trieValue2 = data.getFromIndex(++di); - if (trieValue2 != trieValue) { - if (filter == null || - maybeFilterValue(trieValue2, this.nullValue, nullValue, - filter) != value) { - range.set(start, c - 1, value); - return true; - } - trieValue = trieValue2; // may or may not help - } - } - } - } - } while (++i3 < i3BlockLength); - } while (c < highStart); - assert(haveValue); - int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; - int highValue = data.getFromIndex(di); - if (maybeFilterValue(highValue, this.nullValue, nullValue, filter) != value) { - --c; - } else { - c = MAX_UNICODE; - } - range.set(start, c, value); - return true; - } - - /** - * Writes a representation of the trie to the output stream. - * Inverse of {@link #fromBinary}. - * - * @param os the output stream - * @return the number of bytes written - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - public final int toBinary(OutputStream os) { - try { - DataOutputStream dos = new DataOutputStream(os); - - // Write the UCPTrieHeader - dos.writeInt(0x54726933); // signature="Tri3" - dos.writeChar( // options - ((dataLength & 0xf0000) >> 4) | - ((dataNullOffset & 0xf0000) >> 8) | - (getType().ordinal() << 6) | - getValueWidth().ordinal()); - dos.writeChar(index.length); - dos.writeChar(dataLength); - dos.writeChar(index3NullOffset); - dos.writeChar(dataNullOffset); - dos.writeChar(highStart >> SHIFT_2); // shiftedHighStart - int length = 16; // sizeof(UCPTrieHeader) - - for (char i : index) { dos.writeChar(i); } - length += index.length * 2; - length += data.write(dos); - return length; - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - /** @internal */ - static final int FAST_SHIFT = 6; - - /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */ - static final int FAST_DATA_BLOCK_LENGTH = 1 << FAST_SHIFT; - - /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */ - private static final int FAST_DATA_MASK = FAST_DATA_BLOCK_LENGTH - 1; - - /** @internal */ - private static final int SMALL_MAX = 0xfff; - - /** - * Offset from dataLength (to be subtracted) for fetching the - * value returned for out-of-range code points and ill-formed UTF-8/16. - * @internal - */ - private static final int ERROR_VALUE_NEG_DATA_OFFSET = 1; - /** - * Offset from dataLength (to be subtracted) for fetching the - * value returned for code points highStart..U+10FFFF. - * @internal - */ - private static final int HIGH_VALUE_NEG_DATA_OFFSET = 2; - - // ucptrie_impl.h - - /** The length of the BMP index table. 1024=0x400 */ - private static final int BMP_INDEX_LENGTH = 0x10000 >> FAST_SHIFT; - - static final int SMALL_LIMIT = 0x1000; - private static final int SMALL_INDEX_LENGTH = SMALL_LIMIT >> FAST_SHIFT; - - /** Shift size for getting the index-3 table offset. 
*/ - static final int SHIFT_3 = 4; - - /** Shift size for getting the index-2 table offset. */ - private static final int SHIFT_2 = 5 + SHIFT_3; - - /** Shift size for getting the index-1 table offset. */ - private static final int SHIFT_1 = 5 + SHIFT_2; - - /** - * Difference between two shift sizes, - * for getting an index-2 offset from an index-3 offset. 5=9-4 - */ - static final int SHIFT_2_3 = SHIFT_2 - SHIFT_3; - - /** - * Difference between two shift sizes, - * for getting an index-1 offset from an index-2 offset. 5=14-9 - */ - static final int SHIFT_1_2 = SHIFT_1 - SHIFT_2; - - /** - * Number of index-1 entries for the BMP. (4) - * This part of the index-1 table is omitted from the serialized form. - */ - private static final int OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1; - - /** Number of entries in an index-2 block. 32=0x20 */ - static final int INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2; - - /** Mask for getting the lower bits for the in-index-2-block offset. */ - static final int INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1; - - /** Number of code points per index-2 table entry. 512=0x200 */ - static final int CP_PER_INDEX_2_ENTRY = 1 << SHIFT_2; - - /** Number of entries in an index-3 block. 32=0x20 */ - static final int INDEX_3_BLOCK_LENGTH = 1 << SHIFT_2_3; - - /** Mask for getting the lower bits for the in-index-3-block offset. */ - private static final int INDEX_3_MASK = INDEX_3_BLOCK_LENGTH - 1; - - /** Number of entries in a small data block. 16=0x10 */ - static final int SMALL_DATA_BLOCK_LENGTH = 1 << SHIFT_3; - - /** Mask for getting the lower bits for the in-small-data-block offset. */ - static final int SMALL_DATA_MASK = SMALL_DATA_BLOCK_LENGTH - 1; - - // ucptrie_impl.h: Constants for use with UCPTrieHeader.options. 
- private static final int OPTIONS_DATA_LENGTH_MASK = 0xf000; - private static final int OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00; - private static final int OPTIONS_RESERVED_MASK = 0x38; - private static final int OPTIONS_VALUE_BITS_MASK = 7; - /** - * Value for index3NullOffset which indicates that there is no index-3 null block. - * Bit 15 is unused for this value because this bit is used if the index-3 contains - * 18-bit indexes. - */ - static final int NO_INDEX3_NULL_OFFSET = 0x7fff; - static final int NO_DATA_NULL_OFFSET = 0xfffff; - - private static abstract class Data { - abstract ValueWidth getValueWidth(); - abstract int getDataLength(); - abstract int getFromIndex(int index); - abstract int write(DataOutputStream dos) throws IOException; - } - - private static final class Data16 extends Data { - char[] array; - Data16(char[] a) { array = a; } - @Override ValueWidth getValueWidth() { return ValueWidth.BITS_16; } - @Override int getDataLength() { return array.length; } - @Override int getFromIndex(int index) { return array[index]; } - @Override int write(DataOutputStream dos) throws IOException { - for (char v : array) { dos.writeChar(v); } - return array.length * 2; - } - } - - private static final class Data32 extends Data { - int[] array; - Data32(int[] a) { array = a; } - @Override ValueWidth getValueWidth() { return ValueWidth.BITS_32; } - @Override int getDataLength() { return array.length; } - @Override int getFromIndex(int index) { return array[index]; } - @Override int write(DataOutputStream dos) throws IOException { - for (int v : array) { dos.writeInt(v); } - return array.length * 4; - } - } - - private static final class Data8 extends Data { - byte[] array; - Data8(byte[] a) { array = a; } - @Override ValueWidth getValueWidth() { return ValueWidth.BITS_8; } - @Override int getDataLength() { return array.length; } - @Override int getFromIndex(int index) { return array[index] & 0xff; } - @Override int write(DataOutputStream dos) throws IOException 
{ - for (byte v : array) { dos.writeByte(v); } - return array.length; - } - } - - /** @internal */ - private final int[] ascii; - - /** @internal */ - private final char[] index; - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected final Data data; - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected final int dataLength; - /** - * Start of the last range which ends at U+10FFFF. - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected final int highStart; - - /** - * Internal index-3 null block offset. - * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block. - * @internal - */ - private final int index3NullOffset; - /** - * Internal data null block offset, not shifted. - * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block. - * @internal - */ - private final int dataNullOffset; - /** @internal */ - private final int nullValue; - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected final int fastIndex(int c) { - return index[c >> FAST_SHIFT] + (c & FAST_DATA_MASK); - } - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected final int smallIndex(Type type, int c) { - // Split into two methods to make this part inline-friendly. - // In C, this part is a macro. 
- if (c >= highStart) { - return dataLength - HIGH_VALUE_NEG_DATA_OFFSET; - } - return internalSmallIndex(type, c); - } - - private final int internalSmallIndex(Type type, int c) { - int i1 = c >> SHIFT_1; - if (type == Type.FAST) { - assert(0xffff < c && c < highStart); - i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; - } else { - assert(0 <= c && c < highStart && highStart > SMALL_LIMIT); - i1 += SMALL_INDEX_LENGTH; - } - int i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; - int i3 = (c >> SHIFT_3) & INDEX_3_MASK; - int dataBlock; - if ((i3Block & 0x8000) == 0) { - // 16-bit indexes - dataBlock = index[i3Block + i3]; - } else { - // 18-bit indexes stored in groups of 9 entries per 8 indexes. - i3Block = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); - i3 &= 7; - dataBlock = (index[i3Block++] << (2 + (2 * i3))) & 0x30000; - dataBlock |= index[i3Block + i3]; - } - return dataBlock + (c & SMALL_DATA_MASK); - } - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - protected abstract int cpIndex(int c); - - /** - * A CodePointTrie with {@link Type#FAST}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static abstract class Fast extends CodePointTrie { - private Fast(char[] index, Data data, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, data, highStart, index3NullOffset, dataNullOffset); - } - - /** - * Creates a trie from its binary form. - * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#FAST}. - * - * @param valueWidth selects the number of bits in a data value; this method throws an exception - * if the valueWidth does not match the binary data; - * use null to accept any data value width - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - public static Fast fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { - return (Fast) CodePointTrie.fromBinary(Type.FAST, valueWidth, bytes); - } - - /** - * @return {@link Type#FAST} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final Type getType() { return Type.FAST; } - - /** - * Returns a trie value for a BMP code point (U+0000..U+FFFF), without range checking. - * Can be used to look up a value for a UTF-16 code unit if other parts of - * the string processing check for surrogates. - * - * @param c the input code point, must be U+0000..U+FFFF - * @return The BMP code point's trie value. - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public abstract int bmpGet(int c); - - /** - * Returns a trie value for a supplementary code point (U+10000..U+10FFFF), - * without range checking. - * - * @param c the input code point, must be U+10000..U+10FFFF - * @return The supplementary code point's trie value. - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public abstract int suppGet(int c); - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - @Override - protected final int cpIndex(int c) { - if (c >= 0) { - if (c <= 0xffff) { - return fastIndex(c); - } else if (c <= 0x10ffff) { - return smallIndex(Type.FAST, c); - } - } - return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - @Override - public final StringIterator stringIterator(CharSequence s, int sIndex) { - return new FastStringIterator(s, sIndex); - } - - private final class FastStringIterator extends StringIterator { - private FastStringIterator(CharSequence s, int sIndex) { - super(s, sIndex); - } - - @Override - public boolean next() { - if (sIndex >= s.length()) { - return false; - } - char lead = s.charAt(sIndex++); - c = lead; - int dataIndex; - if (!Character.isSurrogate(lead)) { - dataIndex = fastIndex(c); - } else { - char trail; - if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && - Character.isLowSurrogate(trail = s.charAt(sIndex))) { - ++sIndex; - c = Character.toCodePoint(lead, trail); - dataIndex = smallIndex(Type.FAST, c); - } else { - dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; - } - } - value = data.getFromIndex(dataIndex); - return true; - } - - @Override - public boolean previous() { - if (sIndex <= 0) { - return false; - } - char trail = s.charAt(--sIndex); - c = trail; - int dataIndex; - if (!Character.isSurrogate(trail)) { - dataIndex = fastIndex(c); - } else { - char lead; - if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && - Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { - --sIndex; - c = Character.toCodePoint(lead, trail); - dataIndex = smallIndex(Type.FAST, c); - } else { - dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; - } - } - value = data.getFromIndex(dataIndex); - return true; - } - } - } - - /** - * A CodePointTrie with {@link Type#SMALL}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static abstract class Small extends CodePointTrie { - private Small(char[] index, Data data, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, data, highStart, index3NullOffset, dataNullOffset); - } - - /** - * Creates a trie from its binary form. 
- * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#SMALL}. - * - * @param valueWidth selects the number of bits in a data value; this method throws an exception - * if the valueWidth does not match the binary data; - * use null to accept any data value width - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static Small fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { - return (Small) CodePointTrie.fromBinary(Type.SMALL, valueWidth, bytes); - } - - /** - * @return {@link Type#SMALL} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final Type getType() { return Type.SMALL; } - - /** - * @internal - * @deprecated This API is ICU internal only. - */ - @Deprecated - @Override - protected final int cpIndex(int c) { - if (c >= 0) { - if (c <= SMALL_MAX) { - return fastIndex(c); - } else if (c <= 0x10ffff) { - return smallIndex(Type.SMALL, c); - } - } - return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - @Override - public final StringIterator stringIterator(CharSequence s, int sIndex) { - return new SmallStringIterator(s, sIndex); - } - - private final class SmallStringIterator extends StringIterator { - private SmallStringIterator(CharSequence s, int sIndex) { - super(s, sIndex); - } - - @Override - public boolean next() { - if (sIndex >= s.length()) { - return false; - } - char lead = s.charAt(sIndex++); - c = lead; - int dataIndex; - if (!Character.isSurrogate(lead)) { - dataIndex = cpIndex(c); - } else { - char trail; - if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && - Character.isLowSurrogate(trail = s.charAt(sIndex))) { - ++sIndex; - c = Character.toCodePoint(lead, trail); - dataIndex = smallIndex(Type.SMALL, c); - } else { - dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; - } - } - value = data.getFromIndex(dataIndex); - return true; - } - - @Override - public boolean previous() { - if (sIndex <= 0) { - return false; - } - char trail = s.charAt(--sIndex); - c = trail; - int dataIndex; - if (!Character.isSurrogate(trail)) { - dataIndex = cpIndex(c); - } else { - char lead; - if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && - Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { - --sIndex; - c = Character.toCodePoint(lead, trail); - dataIndex = smallIndex(Type.SMALL, c); - } else { - dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; - } - } - value = data.getFromIndex(dataIndex); - return true; - } - } - } - - /** - * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_16}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - public static final class Fast16 extends Fast { - private final char[] dataArray; - - Fast16(char[] index, char[] data16, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); - this.dataArray = data16; - } - - /** - * Creates a trie from its binary form. - * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#FAST} and {@link ValueWidth#BITS_16}. - * - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static Fast16 fromBinary(ByteBuffer bytes) { - return (Fast16) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_16, bytes); - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int get(int c) { - return dataArray[cpIndex(c)]; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int bmpGet(int c) { - assert 0 <= c && c <= 0xffff; - return dataArray[fastIndex(c)]; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int suppGet(int c) { - assert 0x10000 <= c && c <= 0x10ffff; - return dataArray[smallIndex(Type.FAST, c)]; - } - } - - /** - * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_32}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - public static final class Fast32 extends Fast { - private final int[] dataArray; - - Fast32(char[] index, int[] data32, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); - this.dataArray = data32; - } - - /** - * Creates a trie from its binary form. - * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#FAST} and {@link ValueWidth#BITS_32}. - * - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static Fast32 fromBinary(ByteBuffer bytes) { - return (Fast32) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_32, bytes); - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int get(int c) { - return dataArray[cpIndex(c)]; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int bmpGet(int c) { - assert 0 <= c && c <= 0xffff; - return dataArray[fastIndex(c)]; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int suppGet(int c) { - assert 0x10000 <= c && c <= 0x10ffff; - return dataArray[smallIndex(Type.FAST, c)]; - } - } - - /** - * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_8}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - public static final class Fast8 extends Fast { - private final byte[] dataArray; - - Fast8(char[] index, byte[] data8, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); - this.dataArray = data8; - } - - /** - * Creates a trie from its binary form. - * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#FAST} and {@link ValueWidth#BITS_8}. - * - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static Fast8 fromBinary(ByteBuffer bytes) { - return (Fast8) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_8, bytes); - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int get(int c) { - return dataArray[cpIndex(c)] & 0xff; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int bmpGet(int c) { - assert 0 <= c && c <= 0xffff; - return dataArray[fastIndex(c)] & 0xff; - } - - /** - * {@inheritDoc} - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - @Override - public final int suppGet(int c) { - assert 0x10000 <= c && c <= 0x10ffff; - return dataArray[smallIndex(Type.FAST, c)] & 0xff; - } - } - - /** - * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - public static final class Small16 extends Small { - Small16(char[] index, char[] data16, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); - } - - /** - * Creates a trie from its binary form. - * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. - * - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static Small16 fromBinary(ByteBuffer bytes) { - return (Small16) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_16, bytes); - } - } - - /** - * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static final class Small32 extends Small { - Small32(char[] index, int[] data32, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); - } - - /** - * Creates a trie from its binary form. - * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. - * - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static Small32 fromBinary(ByteBuffer bytes) { - return (Small32) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_32, bytes); - } - } - - /** - * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. - * - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. 
- */ - public static final class Small8 extends Small { - Small8(char[] index, byte[] data8, int highStart, - int index3NullOffset, int dataNullOffset) { - super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); - } - - /** - * Creates a trie from its binary form. - * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} - * with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. - * - * @param bytes a buffer containing the binary data of a CodePointTrie - * @return the trie - * @draft ICU 63 - * @provisional This API might change or be removed in a future release. - */ - public static Small8 fromBinary(ByteBuffer bytes) { - return (Small8) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_8, bytes); - } - } -} --- /dev/null 2020-01-10 15:58:11.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/util/CodePointTrie.java 2020-01-10 15:58:11.000000000 -0800 @@ -0,0 +1,1312 @@ +/* + * Copyright (c) 2019, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +// (c) 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License + +// created: 2018may04 Markus W. Scherer + +package jdk.internal.icu.util; + +import jdk.internal.icu.impl.ICUBinary; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import static jdk.internal.icu.impl.NormalizerImpl.UTF16Plus; + +/** + * Immutable Unicode code point trie. + * Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values. + * For details see http://site.icu-project.org/design/struct/utrie + * + *
<p>
This class is not intended for public subclassing. + * + * @see MutableCodePointTrie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ +@SuppressWarnings("deprecation") +public abstract class CodePointTrie extends CodePointMap { + /** + * Selectors for the type of a CodePointTrie. + * Different trade-offs for size vs. speed. + * + *
<p>
Use null for {@link #fromBinary} to accept any type; + * {@link #getType} will return the actual type. + * + * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) + * @see #fromBinary + * @see #getType + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public enum Type { + /** + * Fast/simple/larger BMP data structure. + * The {@link Fast} subclasses have additional functions for lookup for BMP and supplementary code points. + * + * @see Fast + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + FAST, + /** + * Small/slower BMP data structure. + * + * @see Small + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + SMALL + } + + /** + * Selectors for the number of bits in a CodePointTrie data value. + * + *
<p>
Use null for {@link #fromBinary} to accept any data value width; + * {@link #getValueWidth} will return the actual data value width. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public enum ValueWidth { + /** + * The trie stores 16 bits per data value. + * It returns them as unsigned values 0..0xffff=65535. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + BITS_16, + /** + * The trie stores 32 bits per data value. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + BITS_32, + /** + * The trie stores 8 bits per data value. + * It returns them as unsigned values 0..0xff=255. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + BITS_8 + } + + private CodePointTrie(char[] index, Data data, int highStart, + int index3NullOffset, int dataNullOffset) { + this.ascii = new int[ASCII_LIMIT]; + this.index = index; + this.data = data; + this.dataLength = data.getDataLength(); + this.highStart = highStart; + this.index3NullOffset = index3NullOffset; + this.dataNullOffset = dataNullOffset; + + for (int c = 0; c < ASCII_LIMIT; ++c) { + ascii[c] = data.getFromIndex(c); + } + + int nullValueOffset = dataNullOffset; + if (nullValueOffset >= dataLength) { + nullValueOffset = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; + } + nullValue = data.getFromIndex(nullValueOffset); + } + + /** + * Creates a trie from its binary form, + * stored in the ByteBuffer starting at the current position. + * Advances the buffer position to just after the trie data. + * Inverse of {@link #toBinary(OutputStream)}. + * + *
<p>
The data is copied from the buffer; + * later modification of the buffer will not affect the trie. + * + * @param type selects the trie type; this method throws an exception + * if the type does not match the binary data; + * use null to accept any type + * @param valueWidth selects the number of bits in a data value; this method throws an exception + * if the valueWidth does not match the binary data; + * use null to accept any data value width + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @see MutableCodePointTrie#MutableCodePointTrie(int, int) + * @see MutableCodePointTrie#buildImmutable(CodePointTrie.Type, CodePointTrie.ValueWidth) + * @see #toBinary(OutputStream) + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static CodePointTrie fromBinary(Type type, ValueWidth valueWidth, ByteBuffer bytes) { + ByteOrder outerByteOrder = bytes.order(); + try { + // Enough data for a trie header? + if (bytes.remaining() < 16 /* sizeof(UCPTrieHeader) */) { + throw new InternalError("Buffer too short for a CodePointTrie header"); + } + + // struct UCPTrieHeader + /** "Tri3" in big-endian US-ASCII (0x54726933) */ + int signature = bytes.getInt(); + + // Check the signature. + switch (signature) { + case 0x54726933: + // The buffer is already set to the trie data byte order. + break; + case 0x33697254: + // Temporarily reverse the byte order. + boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; + bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); + signature = 0x54726933; + break; + default: + throw new InternalError("Buffer does not contain a serialized CodePointTrie"); + } + + // struct UCPTrieHeader continued + /** + * Options bit field: + * Bits 15..12: Data length bits 19..16. + * Bits 11..8: Data null block offset bits 19..16. + * Bits 7..6: UCPTrieType + * Bits 5..3: Reserved (0). 
+ * Bits 2..0: UCPTrieValueWidth + */ + int options = bytes.getChar(); + + /** Total length of the index tables. */ + int indexLength = bytes.getChar(); + + /** Data length bits 15..0. */ + int dataLength = bytes.getChar(); + + /** Index-3 null block offset, 0x7fff or 0xffff if none. */ + int index3NullOffset = bytes.getChar(); + + /** Data null block offset bits 15..0, 0xfffff if none. */ + int dataNullOffset = bytes.getChar(); + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by SHIFT_2. + */ + int shiftedHighStart = bytes.getChar(); + // struct UCPTrieHeader end + + int typeInt = (options >> 6) & 3; + Type actualType; + switch (typeInt) { + case 0: actualType = Type.FAST; break; + case 1: actualType = Type.SMALL; break; + default: + throw new InternalError("CodePointTrie data header has an unsupported type"); + } + + int valueWidthInt = options & OPTIONS_VALUE_BITS_MASK; + ValueWidth actualValueWidth; + switch (valueWidthInt) { + case 0: actualValueWidth = ValueWidth.BITS_16; break; + case 1: actualValueWidth = ValueWidth.BITS_32; break; + case 2: actualValueWidth = ValueWidth.BITS_8; break; + default: + throw new InternalError("CodePointTrie data header has an unsupported value width"); + } + + if ((options & OPTIONS_RESERVED_MASK) != 0) { + throw new InternalError("CodePointTrie data header has unsupported options"); + } + + if (type == null) { + type = actualType; + } + if (valueWidth == null) { + valueWidth = actualValueWidth; + } + if (type != actualType || valueWidth != actualValueWidth) { + throw new InternalError("CodePointTrie data header has a different type or value width than required"); + } + + // Get the length values and offsets. + dataLength |= ((options & OPTIONS_DATA_LENGTH_MASK) << 4); + dataNullOffset |= ((options & OPTIONS_DATA_NULL_OFFSET_MASK) << 8); + + int highStart = shiftedHighStart << SHIFT_2; + + // Calculate the actual length, minus the header. 
+ int actualLength = indexLength * 2; + if (valueWidth == ValueWidth.BITS_16) { + actualLength += dataLength * 2; + } else if (valueWidth == ValueWidth.BITS_32) { + actualLength += dataLength * 4; + } else { + actualLength += dataLength; + } + if (bytes.remaining() < actualLength) { + throw new InternalError("Buffer too short for the CodePointTrie data"); + } + + char[] index = ICUBinary.getChars(bytes, indexLength, 0); + switch (valueWidth) { + case BITS_16: { + char[] data16 = ICUBinary.getChars(bytes, dataLength, 0); + return type == Type.FAST ? + new Fast16(index, data16, highStart, index3NullOffset, dataNullOffset) : + new Small16(index, data16, highStart, index3NullOffset, dataNullOffset); + } + case BITS_32: { + int[] data32 = ICUBinary.getInts(bytes, dataLength, 0); + return type == Type.FAST ? + new Fast32(index, data32, highStart, index3NullOffset, dataNullOffset) : + new Small32(index, data32, highStart, index3NullOffset, dataNullOffset); + } + case BITS_8: { + byte[] data8 = ICUBinary.getBytes(bytes, dataLength, 0); + return type == Type.FAST ? + new Fast8(index, data8, highStart, index3NullOffset, dataNullOffset) : + new Small8(index, data8, highStart, index3NullOffset, dataNullOffset); + } + default: + throw new AssertionError("should be unreachable"); + } + } finally { + bytes.order(outerByteOrder); + } + } + + /** + * Returns the trie type. + * + * @return the trie type + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public abstract Type getType(); + /** + * Returns the number of bits in a trie data value. + * + * @return the number of bits in a trie data value + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public final ValueWidth getValueWidth() { return data.getValueWidth(); } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + @Override + public int get(int c) { + return data.getFromIndex(cpIndex(c)); + } + + /** + * Returns a trie value for an ASCII code point, without range checking. + * + * @param c the input code point; must be U+0000..U+007F + * @return The ASCII code point's trie value. + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public final int asciiGet(int c) { + return ascii[c]; + } + + private static final int MAX_UNICODE = 0x10ffff; + + private static final int ASCII_LIMIT = 0x80; + + private static final int maybeFilterValue(int value, int trieNullValue, int nullValue, + ValueFilter filter) { + if (value == trieNullValue) { + value = nullValue; + } else if (filter != null) { + value = filter.apply(value); + } + return value; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final boolean getRange(int start, ValueFilter filter, Range range) { + if (start < 0 || MAX_UNICODE < start) { + return false; + } + if (start >= highStart) { + int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; + int value = data.getFromIndex(di); + if (filter != null) { value = filter.apply(value); } + range.set(start, MAX_UNICODE, value); + return true; + } + + int nullValue = this.nullValue; + if (filter != null) { nullValue = filter.apply(nullValue); } + Type type = getType(); + + int prevI3Block = -1; + int prevBlock = -1; + int c = start; + // Initialize to make compiler happy. Real value when haveValue is true. + int trieValue = 0, value = 0; + boolean haveValue = false; + do { + int i3Block; + int i3; + int i3BlockLength; + int dataBlockLength; + if (c <= 0xffff && (type == Type.FAST || c <= SMALL_MAX)) { + i3Block = 0; + i3 = c >> FAST_SHIFT; + i3BlockLength = type == Type.FAST ? BMP_INDEX_LENGTH : SMALL_INDEX_LENGTH; + dataBlockLength = FAST_DATA_BLOCK_LENGTH; + } else { + // Use the multi-stage index. 
+ int i1 = c >> SHIFT_1; + if (type == Type.FAST) { + assert(0xffff < c && c < highStart); + i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; + } else { + assert(c < highStart && highStart > SMALL_LIMIT); + i1 += SMALL_INDEX_LENGTH; + } + i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; + if (i3Block == prevI3Block && (c - start) >= CP_PER_INDEX_2_ENTRY) { + // The index-3 block is the same as the previous one, and filled with value. + assert((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0); + c += CP_PER_INDEX_2_ENTRY; + continue; + } + prevI3Block = i3Block; + if (i3Block == index3NullOffset) { + // This is the index-3 null block. + if (haveValue) { + if (nullValue != value) { + range.set(start, c - 1, value); + return true; + } + } else { + trieValue = this.nullValue; + value = nullValue; + haveValue = true; + } + prevBlock = dataNullOffset; + c = (c + CP_PER_INDEX_2_ENTRY) & ~(CP_PER_INDEX_2_ENTRY - 1); + continue; + } + i3 = (c >> SHIFT_3) & INDEX_3_MASK; + i3BlockLength = INDEX_3_BLOCK_LENGTH; + dataBlockLength = SMALL_DATA_BLOCK_LENGTH; + } + // Enumerate data blocks for one index-3 block. + do { + int block; + if ((i3Block & 0x8000) == 0) { + block = index[i3Block + i3]; + } else { + // 18-bit indexes stored in groups of 9 entries per 8 indexes. + int group = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); + int gi = i3 & 7; + block = (index[group++] << (2 + (2 * gi))) & 0x30000; + block |= index[group + gi]; + } + if (block == prevBlock && (c - start) >= dataBlockLength) { + // The block is the same as the previous one, and filled with value. + assert((c & (dataBlockLength - 1)) == 0); + c += dataBlockLength; + } else { + int dataMask = dataBlockLength - 1; + prevBlock = block; + if (block == dataNullOffset) { + // This is the data null block. 
+ if (haveValue) { + if (nullValue != value) { + range.set(start, c - 1, value); + return true; + } + } else { + trieValue = this.nullValue; + value = nullValue; + haveValue = true; + } + c = (c + dataBlockLength) & ~dataMask; + } else { + int di = block + (c & dataMask); + int trieValue2 = data.getFromIndex(di); + if (haveValue) { + if (trieValue2 != trieValue) { + if (filter == null || + maybeFilterValue(trieValue2, this.nullValue, nullValue, + filter) != value) { + range.set(start, c - 1, value); + return true; + } + trieValue = trieValue2; // may or may not help + } + } else { + trieValue = trieValue2; + value = maybeFilterValue(trieValue2, this.nullValue, nullValue, filter); + haveValue = true; + } + while ((++c & dataMask) != 0) { + trieValue2 = data.getFromIndex(++di); + if (trieValue2 != trieValue) { + if (filter == null || + maybeFilterValue(trieValue2, this.nullValue, nullValue, + filter) != value) { + range.set(start, c - 1, value); + return true; + } + trieValue = trieValue2; // may or may not help + } + } + } + } + } while (++i3 < i3BlockLength); + } while (c < highStart); + assert(haveValue); + int di = dataLength - HIGH_VALUE_NEG_DATA_OFFSET; + int highValue = data.getFromIndex(di); + if (maybeFilterValue(highValue, this.nullValue, nullValue, filter) != value) { + --c; + } else { + c = MAX_UNICODE; + } + range.set(start, c, value); + return true; + } + + /** + * Writes a representation of the trie to the output stream. + * Inverse of {@link #fromBinary}. + * + * @param os the output stream + * @return the number of bytes written + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + public final int toBinary(OutputStream os) { + try { + DataOutputStream dos = new DataOutputStream(os); + + // Write the UCPTrieHeader + dos.writeInt(0x54726933); // signature="Tri3" + dos.writeChar( // options + ((dataLength & 0xf0000) >> 4) | + ((dataNullOffset & 0xf0000) >> 8) | + (getType().ordinal() << 6) | + getValueWidth().ordinal()); + dos.writeChar(index.length); + dos.writeChar(dataLength); + dos.writeChar(index3NullOffset); + dos.writeChar(dataNullOffset); + dos.writeChar(highStart >> SHIFT_2); // shiftedHighStart + int length = 16; // sizeof(UCPTrieHeader) + + for (char i : index) { dos.writeChar(i); } + length += index.length * 2; + length += data.write(dos); + return length; + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** @internal */ + static final int FAST_SHIFT = 6; + + /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */ + static final int FAST_DATA_BLOCK_LENGTH = 1 << FAST_SHIFT; + + /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */ + private static final int FAST_DATA_MASK = FAST_DATA_BLOCK_LENGTH - 1; + + /** @internal */ + private static final int SMALL_MAX = 0xfff; + + /** + * Offset from dataLength (to be subtracted) for fetching the + * value returned for out-of-range code points and ill-formed UTF-8/16. + * @internal + */ + private static final int ERROR_VALUE_NEG_DATA_OFFSET = 1; + /** + * Offset from dataLength (to be subtracted) for fetching the + * value returned for code points highStart..U+10FFFF. + * @internal + */ + private static final int HIGH_VALUE_NEG_DATA_OFFSET = 2; + + // ucptrie_impl.h + + /** The length of the BMP index table. 1024=0x400 */ + private static final int BMP_INDEX_LENGTH = 0x10000 >> FAST_SHIFT; + + static final int SMALL_LIMIT = 0x1000; + private static final int SMALL_INDEX_LENGTH = SMALL_LIMIT >> FAST_SHIFT; + + /** Shift size for getting the index-3 table offset. 
*/ + static final int SHIFT_3 = 4; + + /** Shift size for getting the index-2 table offset. */ + private static final int SHIFT_2 = 5 + SHIFT_3; + + /** Shift size for getting the index-1 table offset. */ + private static final int SHIFT_1 = 5 + SHIFT_2; + + /** + * Difference between two shift sizes, + * for getting an index-2 offset from an index-3 offset. 5=9-4 + */ + static final int SHIFT_2_3 = SHIFT_2 - SHIFT_3; + + /** + * Difference between two shift sizes, + * for getting an index-1 offset from an index-2 offset. 5=14-9 + */ + static final int SHIFT_1_2 = SHIFT_1 - SHIFT_2; + + /** + * Number of index-1 entries for the BMP. (4) + * This part of the index-1 table is omitted from the serialized form. + */ + private static final int OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> SHIFT_1; + + /** Number of entries in an index-2 block. 32=0x20 */ + static final int INDEX_2_BLOCK_LENGTH = 1 << SHIFT_1_2; + + /** Mask for getting the lower bits for the in-index-2-block offset. */ + static final int INDEX_2_MASK = INDEX_2_BLOCK_LENGTH - 1; + + /** Number of code points per index-2 table entry. 512=0x200 */ + static final int CP_PER_INDEX_2_ENTRY = 1 << SHIFT_2; + + /** Number of entries in an index-3 block. 32=0x20 */ + static final int INDEX_3_BLOCK_LENGTH = 1 << SHIFT_2_3; + + /** Mask for getting the lower bits for the in-index-3-block offset. */ + private static final int INDEX_3_MASK = INDEX_3_BLOCK_LENGTH - 1; + + /** Number of entries in a small data block. 16=0x10 */ + static final int SMALL_DATA_BLOCK_LENGTH = 1 << SHIFT_3; + + /** Mask for getting the lower bits for the in-small-data-block offset. */ + static final int SMALL_DATA_MASK = SMALL_DATA_BLOCK_LENGTH - 1; + + // ucptrie_impl.h: Constants for use with UCPTrieHeader.options. 
+ private static final int OPTIONS_DATA_LENGTH_MASK = 0xf000; + private static final int OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00; + private static final int OPTIONS_RESERVED_MASK = 0x38; + private static final int OPTIONS_VALUE_BITS_MASK = 7; + /** + * Value for index3NullOffset which indicates that there is no index-3 null block. + * Bit 15 is unused for this value because this bit is used if the index-3 contains + * 18-bit indexes. + */ + static final int NO_INDEX3_NULL_OFFSET = 0x7fff; + static final int NO_DATA_NULL_OFFSET = 0xfffff; + + private static abstract class Data { + abstract ValueWidth getValueWidth(); + abstract int getDataLength(); + abstract int getFromIndex(int index); + abstract int write(DataOutputStream dos) throws IOException; + } + + private static final class Data16 extends Data { + char[] array; + Data16(char[] a) { array = a; } + @Override ValueWidth getValueWidth() { return ValueWidth.BITS_16; } + @Override int getDataLength() { return array.length; } + @Override int getFromIndex(int index) { return array[index]; } + @Override int write(DataOutputStream dos) throws IOException { + for (char v : array) { dos.writeChar(v); } + return array.length * 2; + } + } + + private static final class Data32 extends Data { + int[] array; + Data32(int[] a) { array = a; } + @Override ValueWidth getValueWidth() { return ValueWidth.BITS_32; } + @Override int getDataLength() { return array.length; } + @Override int getFromIndex(int index) { return array[index]; } + @Override int write(DataOutputStream dos) throws IOException { + for (int v : array) { dos.writeInt(v); } + return array.length * 4; + } + } + + private static final class Data8 extends Data { + byte[] array; + Data8(byte[] a) { array = a; } + @Override ValueWidth getValueWidth() { return ValueWidth.BITS_8; } + @Override int getDataLength() { return array.length; } + @Override int getFromIndex(int index) { return array[index] & 0xff; } + @Override int write(DataOutputStream dos) throws IOException 
{ + for (byte v : array) { dos.writeByte(v); } + return array.length; + } + } + + /** @internal */ + private final int[] ascii; + + /** @internal */ + private final char[] index; + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected final Data data; + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected final int dataLength; + /** + * Start of the last range which ends at U+10FFFF. + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected final int highStart; + + /** + * Internal index-3 null block offset. + * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block. + * @internal + */ + private final int index3NullOffset; + /** + * Internal data null block offset, not shifted. + * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block. + * @internal + */ + private final int dataNullOffset; + /** @internal */ + private final int nullValue; + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected final int fastIndex(int c) { + return index[c >> FAST_SHIFT] + (c & FAST_DATA_MASK); + } + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected final int smallIndex(Type type, int c) { + // Split into two methods to make this part inline-friendly. + // In C, this part is a macro. 
+ if (c >= highStart) { + return dataLength - HIGH_VALUE_NEG_DATA_OFFSET; + } + return internalSmallIndex(type, c); + } + + private final int internalSmallIndex(Type type, int c) { + int i1 = c >> SHIFT_1; + if (type == Type.FAST) { + assert(0xffff < c && c < highStart); + i1 += BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; + } else { + assert(0 <= c && c < highStart && highStart > SMALL_LIMIT); + i1 += SMALL_INDEX_LENGTH; + } + int i3Block = index[index[i1] + ((c >> SHIFT_2) & INDEX_2_MASK)]; + int i3 = (c >> SHIFT_3) & INDEX_3_MASK; + int dataBlock; + if ((i3Block & 0x8000) == 0) { + // 16-bit indexes + dataBlock = index[i3Block + i3]; + } else { + // 18-bit indexes stored in groups of 9 entries per 8 indexes. + i3Block = (i3Block & 0x7fff) + (i3 & ~7) + (i3 >> 3); + i3 &= 7; + dataBlock = (index[i3Block++] << (2 + (2 * i3))) & 0x30000; + dataBlock |= index[i3Block + i3]; + } + return dataBlock + (c & SMALL_DATA_MASK); + } + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + protected abstract int cpIndex(int c); + + /** + * A CodePointTrie with {@link Type#FAST}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static abstract class Fast extends CodePointTrie { + private Fast(char[] index, Data data, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, data, highStart, index3NullOffset, dataNullOffset); + } + + /** + * Creates a trie from its binary form. + * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#FAST}. + * + * @param valueWidth selects the number of bits in a data value; this method throws an exception + * if the valueWidth does not match the binary data; + * use null to accept any data value width + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + public static Fast fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { + return (Fast) CodePointTrie.fromBinary(Type.FAST, valueWidth, bytes); + } + + /** + * @return {@link Type#FAST} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final Type getType() { return Type.FAST; } + + /** + * Returns a trie value for a BMP code point (U+0000..U+FFFF), without range checking. + * Can be used to look up a value for a UTF-16 code unit if other parts of + * the string processing check for surrogates. + * + * @param c the input code point, must be U+0000..U+FFFF + * @return The BMP code point's trie value. + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public abstract int bmpGet(int c); + + /** + * Returns a trie value for a supplementary code point (U+10000..U+10FFFF), + * without range checking. + * + * @param c the input code point, must be U+10000..U+10FFFF + * @return The supplementary code point's trie value. + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public abstract int suppGet(int c); + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + @Override + protected final int cpIndex(int c) { + if (c >= 0) { + if (c <= 0xffff) { + return fastIndex(c); + } else if (c <= 0x10ffff) { + return smallIndex(Type.FAST, c); + } + } + return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + @Override + public final StringIterator stringIterator(CharSequence s, int sIndex) { + return new FastStringIterator(s, sIndex); + } + + private final class FastStringIterator extends StringIterator { + private FastStringIterator(CharSequence s, int sIndex) { + super(s, sIndex); + } + + @Override + public boolean next() { + if (sIndex >= s.length()) { + return false; + } + char lead = s.charAt(sIndex++); + c = lead; + int dataIndex; + if (!Character.isSurrogate(lead)) { + dataIndex = fastIndex(c); + } else { + char trail; + if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && + Character.isLowSurrogate(trail = s.charAt(sIndex))) { + ++sIndex; + c = Character.toCodePoint(lead, trail); + dataIndex = smallIndex(Type.FAST, c); + } else { + dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; + } + } + value = data.getFromIndex(dataIndex); + return true; + } + + @Override + public boolean previous() { + if (sIndex <= 0) { + return false; + } + char trail = s.charAt(--sIndex); + c = trail; + int dataIndex; + if (!Character.isSurrogate(trail)) { + dataIndex = fastIndex(c); + } else { + char lead; + if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && + Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { + --sIndex; + c = Character.toCodePoint(lead, trail); + dataIndex = smallIndex(Type.FAST, c); + } else { + dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; + } + } + value = data.getFromIndex(dataIndex); + return true; + } + } + } + + /** + * A CodePointTrie with {@link Type#SMALL}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static abstract class Small extends CodePointTrie { + private Small(char[] index, Data data, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, data, highStart, index3NullOffset, dataNullOffset); + } + + /** + * Creates a trie from its binary form. 
+ * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#SMALL}. + * + * @param valueWidth selects the number of bits in a data value; this method throws an exception + * if the valueWidth does not match the binary data; + * use null to accept any data value width + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static Small fromBinary(ValueWidth valueWidth, ByteBuffer bytes) { + return (Small) CodePointTrie.fromBinary(Type.SMALL, valueWidth, bytes); + } + + /** + * @return {@link Type#SMALL} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final Type getType() { return Type.SMALL; } + + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + @Override + protected final int cpIndex(int c) { + if (c >= 0) { + if (c <= SMALL_MAX) { + return fastIndex(c); + } else if (c <= 0x10ffff) { + return smallIndex(Type.SMALL, c); + } + } + return dataLength - ERROR_VALUE_NEG_DATA_OFFSET; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + @Override + public final StringIterator stringIterator(CharSequence s, int sIndex) { + return new SmallStringIterator(s, sIndex); + } + + private final class SmallStringIterator extends StringIterator { + private SmallStringIterator(CharSequence s, int sIndex) { + super(s, sIndex); + } + + @Override + public boolean next() { + if (sIndex >= s.length()) { + return false; + } + char lead = s.charAt(sIndex++); + c = lead; + int dataIndex; + if (!Character.isSurrogate(lead)) { + dataIndex = cpIndex(c); + } else { + char trail; + if (UTF16Plus.isSurrogateLead(lead) && sIndex < s.length() && + Character.isLowSurrogate(trail = s.charAt(sIndex))) { + ++sIndex; + c = Character.toCodePoint(lead, trail); + dataIndex = smallIndex(Type.SMALL, c); + } else { + dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; + } + } + value = data.getFromIndex(dataIndex); + return true; + } + + @Override + public boolean previous() { + if (sIndex <= 0) { + return false; + } + char trail = s.charAt(--sIndex); + c = trail; + int dataIndex; + if (!Character.isSurrogate(trail)) { + dataIndex = cpIndex(c); + } else { + char lead; + if (!UTF16Plus.isSurrogateLead(trail) && sIndex > 0 && + Character.isHighSurrogate(lead = s.charAt(sIndex - 1))) { + --sIndex; + c = Character.toCodePoint(lead, trail); + dataIndex = smallIndex(Type.SMALL, c); + } else { + dataIndex = dataLength - ERROR_VALUE_NEG_DATA_OFFSET; + } + } + value = data.getFromIndex(dataIndex); + return true; + } + } + } + + /** + * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_16}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + public static final class Fast16 extends Fast { + private final char[] dataArray; + + Fast16(char[] index, char[] data16, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); + this.dataArray = data16; + } + + /** + * Creates a trie from its binary form. + * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#FAST} and {@link ValueWidth#BITS_16}. + * + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static Fast16 fromBinary(ByteBuffer bytes) { + return (Fast16) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_16, bytes); + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int get(int c) { + return dataArray[cpIndex(c)]; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int bmpGet(int c) { + assert 0 <= c && c <= 0xffff; + return dataArray[fastIndex(c)]; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int suppGet(int c) { + assert 0x10000 <= c && c <= 0x10ffff; + return dataArray[smallIndex(Type.FAST, c)]; + } + } + + /** + * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_32}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + public static final class Fast32 extends Fast { + private final int[] dataArray; + + Fast32(char[] index, int[] data32, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); + this.dataArray = data32; + } + + /** + * Creates a trie from its binary form. + * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#FAST} and {@link ValueWidth#BITS_32}. + * + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static Fast32 fromBinary(ByteBuffer bytes) { + return (Fast32) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_32, bytes); + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int get(int c) { + return dataArray[cpIndex(c)]; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int bmpGet(int c) { + assert 0 <= c && c <= 0xffff; + return dataArray[fastIndex(c)]; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int suppGet(int c) { + assert 0x10000 <= c && c <= 0x10ffff; + return dataArray[smallIndex(Type.FAST, c)]; + } + } + + /** + * A CodePointTrie with {@link Type#FAST} and {@link ValueWidth#BITS_8}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + public static final class Fast8 extends Fast { + private final byte[] dataArray; + + Fast8(char[] index, byte[] data8, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); + this.dataArray = data8; + } + + /** + * Creates a trie from its binary form. + * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#FAST} and {@link ValueWidth#BITS_8}. + * + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static Fast8 fromBinary(ByteBuffer bytes) { + return (Fast8) CodePointTrie.fromBinary(Type.FAST, ValueWidth.BITS_8, bytes); + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int get(int c) { + return dataArray[cpIndex(c)] & 0xff; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int bmpGet(int c) { + assert 0 <= c && c <= 0xffff; + return dataArray[fastIndex(c)] & 0xff; + } + + /** + * {@inheritDoc} + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + @Override + public final int suppGet(int c) { + assert 0x10000 <= c && c <= 0x10ffff; + return dataArray[smallIndex(Type.FAST, c)] & 0xff; + } + } + + /** + * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + public static final class Small16 extends Small { + Small16(char[] index, char[] data16, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, new Data16(data16), highStart, index3NullOffset, dataNullOffset); + } + + /** + * Creates a trie from its binary form. + * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#SMALL} and {@link ValueWidth#BITS_16}. + * + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static Small16 fromBinary(ByteBuffer bytes) { + return (Small16) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_16, bytes); + } + } + + /** + * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static final class Small32 extends Small { + Small32(char[] index, int[] data32, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, new Data32(data32), highStart, index3NullOffset, dataNullOffset); + } + + /** + * Creates a trie from its binary form. + * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#SMALL} and {@link ValueWidth#BITS_32}. + * + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static Small32 fromBinary(ByteBuffer bytes) { + return (Small32) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_32, bytes); + } + } + + /** + * A CodePointTrie with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. + * + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. 
+ */ + public static final class Small8 extends Small { + Small8(char[] index, byte[] data8, int highStart, + int index3NullOffset, int dataNullOffset) { + super(index, new Data8(data8), highStart, index3NullOffset, dataNullOffset); + } + + /** + * Creates a trie from its binary form. + * Same as {@link CodePointTrie#fromBinary(Type, ValueWidth, ByteBuffer)} + * with {@link Type#SMALL} and {@link ValueWidth#BITS_8}. + * + * @param bytes a buffer containing the binary data of a CodePointTrie + * @return the trie + * @draft ICU 63 + * @provisional This API might change or be removed in a future release. + */ + public static Small8 fromBinary(ByteBuffer bytes) { + return (Small8) CodePointTrie.fromBinary(Type.SMALL, ValueWidth.BITS_8, bytes); + } + } +} --- old/src/java.base/share/classes/sun/text/normalizer/OutputInt.java 2020-01-10 15:58:13.000000000 -0800 +++ /dev/null 2020-01-10 15:58:13.000000000 -0800 @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
- * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/* - ******************************************************************************* - * Copyright (C) 2014, International Business Machines Corporation and - * others. All Rights Reserved. - ******************************************************************************* - */ -package sun.text.normalizer; - -/** - * Simple struct-like class for int output parameters. - * Like Output<Integer> but without auto-boxing. - * - * @internal but could become public - * deprecated This API is ICU internal only. - */ -class OutputInt { - - /** - * The value field. - * - * @internal - * deprecated This API is ICU internal only. - */ - public int value; -} --- /dev/null 2020-01-10 15:58:13.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/util/OutputInt.java 2020-01-10 15:58:12.000000000 -0800 @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ +package jdk.internal.icu.util; + +/** + * Simple struct-like class for int output parameters. + * Like Output<Integer> but without auto-boxing. + * + * @internal but could become public + * deprecated This API is ICU internal only. + */ +public class OutputInt { + + /** + * The value field. + * + * @internal + * deprecated This API is ICU internal only. + */ + public int value; +} --- old/src/java.base/share/classes/sun/text/normalizer/VersionInfo.java 2020-01-10 15:58:14.000000000 -0800 +++ /dev/null 2020-01-10 15:58:14.000000000 -0800 @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ -/* - ******************************************************************************* - * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved * - * * - * The original version of this source code and documentation is copyrighted * - * and owned by IBM, These materials are provided under terms of a License * - * Agreement between IBM and Sun. This technology is protected by multiple * - * US and International patents. This notice and attribution to IBM may not * - * to removed. * - ******************************************************************************* - */ - -package sun.text.normalizer; - -import java.util.HashMap; - -/** - * Class to store version numbers of the form major.minor.milli.micro. - * @author synwee - * @stable ICU 2.6 - */ -public final class VersionInfo -{ - - // public methods ------------------------------------------------------ - - /** - * Returns an instance of VersionInfo with the argument version. - * @param version version String in the format of "major.minor.milli.micro" - * or "major.minor.milli" or "major.minor" or "major", - * where major, minor, milli, micro are non-negative numbers - * {@literal <=} 255. If the trailing version numbers are - * not specified they are taken as 0s. E.g. Version "3.1" is - * equivalent to "3.1.0.0". - * @return an instance of VersionInfo with the argument version. 
- * @exception throws an IllegalArgumentException when the argument version - * is not in the right format - * @stable ICU 2.6 - */ - public static VersionInfo getInstance(String version) - { - int length = version.length(); - int array[] = {0, 0, 0, 0}; - int count = 0; - int index = 0; - - while (count < 4 && index < length) { - char c = version.charAt(index); - if (c == '.') { - count ++; - } - else { - c -= '0'; - if (c < 0 || c > 9) { - throw new IllegalArgumentException(INVALID_VERSION_NUMBER_); - } - array[count] *= 10; - array[count] += c; - } - index ++; - } - if (index != length) { - throw new IllegalArgumentException( - "Invalid version number: String '" + version + "' exceeds version format"); - } - for (int i = 0; i < 4; i ++) { - if (array[i] < 0 || array[i] > 255) { - throw new IllegalArgumentException(INVALID_VERSION_NUMBER_); - } - } - - return getInstance(array[0], array[1], array[2], array[3]); - } - - /** - * Returns an instance of VersionInfo with the argument version. - * @param major major version, non-negative number {@literal <=} 255. - * @param minor minor version, non-negative number {@literal <=} 255. - * @param milli milli version, non-negative number {@literal <=} 255. - * @param micro micro version, non-negative number {@literal <=} 255. 
- * @exception throws an IllegalArgumentException when either arguments are - * negative or {@literal >} 255 - * @stable ICU 2.6 - */ - public static VersionInfo getInstance(int major, int minor, int milli, - int micro) - { - // checks if it is in the hashmap - // else - if (major < 0 || major > 255 || minor < 0 || minor > 255 || - milli < 0 || milli > 255 || micro < 0 || micro > 255) { - throw new IllegalArgumentException(INVALID_VERSION_NUMBER_); - } - int version = getInt(major, minor, milli, micro); - Integer key = Integer.valueOf(version); - Object result = MAP_.get(key); - if (result == null) { - result = new VersionInfo(version); - MAP_.put(key, result); - } - return (VersionInfo)result; - } - - /** - * Compares other with this VersionInfo. - * @param other VersionInfo to be compared - * @return 0 if the argument is a VersionInfo object that has version - * information equals to this object. - * Less than 0 if the argument is a VersionInfo object that has - * version information greater than this object. - * Greater than 0 if the argument is a VersionInfo object that - * has version information less than this object. - * @stable ICU 2.6 - */ - public int compareTo(VersionInfo other) - { - return m_version_ - other.m_version_; - } - - // private data members ---------------------------------------------- - - /** - * Version number stored as a byte for each of the major, minor, milli and - * micro numbers in the 32 bit int. - * Most significant for the major and the least significant contains the - * micro numbers. 
- */ - private int m_version_; - /** - * Map of singletons - */ - private static final HashMap MAP_ = new HashMap<>(); - /** - * Error statement string - */ - private static final String INVALID_VERSION_NUMBER_ = - "Invalid version number: Version number may be negative or greater than 255"; - - // private constructor ----------------------------------------------- - - /** - * Constructor with int - * @param compactversion a 32 bit int with each byte representing a number - */ - private VersionInfo(int compactversion) - { - m_version_ = compactversion; - } - - /** - * Gets the int from the version numbers - * @param major non-negative version number - * @param minor non-negativeversion number - * @param milli non-negativeversion number - * @param micro non-negativeversion number - */ - private static int getInt(int major, int minor, int milli, int micro) - { - return (major << 24) | (minor << 16) | (milli << 8) | micro; - } -} --- /dev/null 2020-01-10 15:58:14.000000000 -0800 +++ new/src/java.base/share/classes/jdk/internal/icu/util/VersionInfo.java 2020-01-10 15:58:14.000000000 -0800 @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+/*
+ *******************************************************************************
+ * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
+ *                                                                             *
+ * The original version of this source code and documentation is copyrighted   *
+ * and owned by IBM, These materials are provided under terms of a License     *
+ * Agreement between IBM and Sun. This technology is protected by multiple     *
+ * US and International patents. This notice and attribution to IBM may not    *
+ * to removed.                                                                 *
+ *******************************************************************************
+ */
+
+package jdk.internal.icu.util;
+
+import java.util.HashMap;
+
+/**
+ * Class to store version numbers of the form major.minor.milli.micro.
+ * Each of the four components is a number in the range 0..255 and the whole
+ * version is packed into a single 32 bit int, one byte per component.
+ * Instances are obtained through the {@code getInstance} factory methods,
+ * which hand out one cached instance per distinct version. The cache is a
+ * plain {@code HashMap}; this class performs no synchronization on it.
+ * @author synwee
+ * @stable ICU 2.6
+ */
+public final class VersionInfo
+{
+    // public data members -------------------------------------------------
+
+    /**
+     * Data version string for ICU's internal data.
+     * Used for appending to data path (e.g. icudt43b)
+     * @internal
+     * @deprecated This API is ICU internal only.
+     */
+    @Deprecated
+    public static final String ICU_DATA_VERSION_PATH = "64b";
+
+    // public methods ------------------------------------------------------
+
+    /**
+     * Returns an instance of VersionInfo with the argument version.
+     * @param version version String in the format of "major.minor.milli.micro"
+     *                or "major.minor.milli" or "major.minor" or "major",
+     *                where major, minor, milli, micro are non-negative numbers
+     *                {@literal <=} 255. If the trailing version numbers are
+     *                not specified they are taken as 0s. E.g. Version "3.1" is
+     *                equivalent to "3.1.0.0".
+     * @return an instance of VersionInfo with the argument version.
+     * @throws IllegalArgumentException when the argument version is not in
+     *         the right format: it contains a character other than a decimal
+     *         digit or '.', it has more than four components, or a component
+     *         is greater than 255
+     * @throws NullPointerException when version is null
+     * @stable ICU 2.6
+     */
+    public static VersionInfo getInstance(String version)
+    {
+        int length = version.length();
+        int parts[] = {0, 0, 0, 0};
+        int count = 0;
+        int index = 0;
+
+        while (count < 4 && index < length) {
+            char c = version.charAt(index);
+            if (c == '.') {
+                count ++;
+            }
+            else {
+                int digit = c - '0';
+                if (digit < 0 || digit > 9) {
+                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
+                }
+                // Stop accumulating once the component is already out of
+                // range. Otherwise a long digit run such as "4294967296"
+                // would overflow the int, wrap around to a small value and
+                // be silently accepted. The value is left above 255 so the
+                // range check below still rejects it.
+                if (parts[count] <= 255) {
+                    parts[count] = parts[count] * 10 + digit;
+                }
+            }
+            index ++;
+        }
+        // The loop stops early only after a fourth '.', i.e. when the string
+        // carries more than four components.
+        if (index != length) {
+            throw new IllegalArgumentException(
+               "Invalid version number: String '" + version + "' exceeds version format");
+        }
+
+        // The 0..255 range check of every component is done by the callee.
+        return getInstance(parts[0], parts[1], parts[2], parts[3]);
+    }
+
+    /**
+     * Returns an instance of VersionInfo with the argument version.
+     * The same instance is returned for every call with the same four
+     * numbers.
+     * @param major major version, non-negative number {@literal <=} 255.
+     * @param minor minor version, non-negative number {@literal <=} 255.
+     * @param milli milli version, non-negative number {@literal <=} 255.
+     * @param micro micro version, non-negative number {@literal <=} 255.
+     * @return an instance of VersionInfo with the argument version.
+     * @throws IllegalArgumentException when any of the arguments is
+     *         negative or {@literal >} 255
+     * @stable ICU 2.6
+     */
+    public static VersionInfo getInstance(int major, int minor, int milli,
+                                          int micro)
+    {
+        if (major < 0 || major > 255 || minor < 0 || minor > 255 ||
+            milli < 0 || milli > 255 || micro < 0 || micro > 255) {
+            throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
+        }
+        int version = getInt(major, minor, milli, micro);
+        Integer key = Integer.valueOf(version);
+        // reuse the cached singleton if there is one, else create and cache it
+        VersionInfo result = MAP_.get(key);
+        if (result == null) {
+            result = new VersionInfo(version);
+            MAP_.put(key, result);
+        }
+        return result;
+    }
+
+    /**
+     * Compares other with this VersionInfo.
+     * @param other VersionInfo to be compared
+     * @return 0 if the argument is a VersionInfo object that has version
+     *           information equals to this object.
+     *         Less than 0 if the argument is a VersionInfo object that has
+     *           version information greater than this object.
+     *         Greater than 0 if the argument is a VersionInfo object that
+     *           has version information less than this object.
+     * @stable ICU 2.6
+     */
+    public int compareTo(VersionInfo other)
+    {
+        // The major number lives in the top byte, so a major version >= 128
+        // makes the packed int negative. Compare the packed values as
+        // unsigned; a plain subtraction would order such versions before
+        // smaller ones and could overflow.
+        return Integer.compareUnsigned(m_version_, other.m_version_);
+    }
+
+    // private data members ----------------------------------------------
+
+    /**
+     * Version number stored as a byte for each of the major, minor, milli and
+     * micro numbers in the 32 bit int.
+     * Most significant for the major and the least significant contains the
+     * micro numbers.
+     */
+    private final int m_version_;
+    /**
+     * Map of singletons, keyed by the packed version int
+     */
+    private static final HashMap<Integer, VersionInfo> MAP_ = new HashMap<>();
+    /**
+     * Error statement string
+     */
+    private static final String INVALID_VERSION_NUMBER_ =
+        "Invalid version number: Version number may be negative or greater than 255";
+
+    // private constructor -----------------------------------------------
+
+    /**
+     * Constructor with int
+     * @param compactversion a 32 bit int with each byte representing a number
+     */
+    private VersionInfo(int compactversion)
+    {
+        m_version_ = compactversion;
+    }
+
+    /**
+     * Gets the int from the version numbers
+     * @param major non-negative version number
+     * @param minor non-negative version number
+     * @param milli non-negative version number
+     * @param micro non-negative version number
+     * @return the four numbers packed into one int, major in the most
+     *         significant byte and micro in the least significant byte
+     */
+    private static int getInt(int major, int minor, int milli, int micro)
+    {
+        return (major << 24) | (minor << 16) | (milli << 8) | micro;
+    }
+}