1 /*
   2  * Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /*
  25  * @test
  26  * @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
  27  *      4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
  28  *      4152416 4153072 4158381 4214367 4217703 4638433
  29  * @library /java/text/testlib
  30  * @run main/timeout=2000 BreakIteratorTest
  31  * @summary test BreakIterator
  32  */
  33 
  34 /*
  35  *
  36  *
  37  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  38  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  39  *
  40  * Portions copyright (c) 2007 Sun Microsystems, Inc.
  41  * All Rights Reserved.
  42  *
  43  * The original version of this source code and documentation
  44  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  45  * subsidiary of IBM. These materials are provided under terms
  46  * of a License Agreement between Taligent and Sun. This technology
  47  * is protected by multiple US and International patents.
  48  *
  49  * This notice and attribution to Taligent may not be removed.
  50  * Taligent is a registered trademark of Taligent, Inc.
  51  *
  52  * Permission to use, copy, modify, and distribute this software
  53  * and its documentation for NON-COMMERCIAL purposes and without
  54  * fee is hereby granted provided that this copyright notice
  55  * appears in all copies. Please refer to the file "copyright.html"
  56  * for further important copyright and licensing information.
  57  *
  58  * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  59  * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  60  * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  61  * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  62  * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  63  * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  64  *
  65  */
  66 
  67 import java.text.BreakIterator;
  68 import java.text.CharacterIterator;
  69 import java.text.StringCharacterIterator;
  70 import java.util.Locale;
  71 import java.util.Vector;
  72 import java.util.Enumeration;
  73 import java.io.*;
  74 
  75 public class BreakIteratorTest extends IntlTest
  76 {
  77     private BreakIterator characterBreak;
  78     private BreakIterator wordBreak;
  79     private BreakIterator lineBreak;
  80     private BreakIterator sentenceBreak;
  81 
  82     public static void main(String[] args) throws Exception {
  83         new BreakIteratorTest().run(args);
  84     }
  85 
  86     public BreakIteratorTest()
  87     {
  88         characterBreak = BreakIterator.getCharacterInstance();
  89         wordBreak = BreakIterator.getWordInstance();
  90         lineBreak = BreakIterator.getLineInstance();
  91         sentenceBreak = BreakIterator.getSentenceInstance();
  92     }
  93 
  94     //=========================================================================
  95     // general test subroutines
  96     //=========================================================================
  97 
  98     private void generalIteratorTest(BreakIterator bi, Vector expectedResult) {
  99         StringBuffer buffer = new StringBuffer();
 100         String text;
 101         for (int i = 0; i < expectedResult.size(); i++) {
 102             text = (String)expectedResult.elementAt(i);
 103             buffer.append(text);
 104         }
 105         text = buffer.toString();
 106 
 107         bi.setText(text);
 108 
 109         Vector nextResults = testFirstAndNext(bi, text);
 110         Vector previousResults = testLastAndPrevious(bi, text);
 111 
 112         logln("comparing forward and backward...");
 113         int errs = getErrorCount();
 114         compareFragmentLists("forward iteration", "backward iteration", nextResults,
 115                         previousResults);
 116         if (getErrorCount() == errs) {
 117             logln("comparing expected and actual...");
 118             compareFragmentLists("expected result", "actual result", expectedResult,
 119                             nextResults);
 120         }
 121 
 122         int[] boundaries = new int[expectedResult.size() + 3];
 123         boundaries[0] = BreakIterator.DONE;
 124         boundaries[1] = 0;
 125         for (int i = 0; i < expectedResult.size(); i++)
 126             boundaries[i + 2] = boundaries[i + 1] + ((String)expectedResult.elementAt(i)).
 127                             length();
 128         boundaries[boundaries.length - 1] = BreakIterator.DONE;
 129 
 130         testFollowing(bi, text, boundaries);
 131         testPreceding(bi, text, boundaries);
 132         testIsBoundary(bi, text, boundaries);
 133 
 134         doMultipleSelectionTest(bi, text);
 135     }
 136 
 137     private Vector testFirstAndNext(BreakIterator bi, String text) {
 138         int p = bi.first();
 139         int lastP = p;
 140         Vector<String> result = new Vector<String>();
 141 
 142         if (p != 0)
 143             errln("first() returned " + p + " instead of 0");
 144         while (p != BreakIterator.DONE) {
 145             p = bi.next();
 146             if (p != BreakIterator.DONE) {
 147                 if (p <= lastP)
 148                     errln("next() failed to move forward: next() on position "
 149                                     + lastP + " yielded " + p);
 150 
 151                 result.addElement(text.substring(lastP, p));
 152             }
 153             else {
 154                 if (lastP != text.length())
 155                     errln("next() returned DONE prematurely: offset was "
 156                                     + lastP + " instead of " + text.length());
 157             }
 158             lastP = p;
 159         }
 160         return result;
 161     }
 162 
 163     private Vector testLastAndPrevious(BreakIterator bi, String text) {
 164         int p = bi.last();
 165         int lastP = p;
 166         Vector<String> result = new Vector<String>();
 167 
 168         if (p != text.length())
 169             errln("last() returned " + p + " instead of " + text.length());
 170         while (p != BreakIterator.DONE) {
 171             p = bi.previous();
 172             if (p != BreakIterator.DONE) {
 173                 if (p >= lastP)
 174                     errln("previous() failed to move backward: previous() on position "
 175                                     + lastP + " yielded " + p);
 176 
 177                 result.insertElementAt(text.substring(p, lastP), 0);
 178             }
 179             else {
 180                 if (lastP != 0)
 181                     errln("previous() returned DONE prematurely: offset was "
 182                                     + lastP + " instead of 0");
 183             }
 184             lastP = p;
 185         }
 186         return result;
 187     }
 188 
 189     private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) {
 190         int p1 = 0;
 191         int p2 = 0;
 192         String s1;
 193         String s2;
 194         int t1 = 0;
 195         int t2 = 0;
 196 
 197         while (p1 < f1.size() && p2 < f2.size()) {
 198             s1 = (String)f1.elementAt(p1);
 199             s2 = (String)f2.elementAt(p2);
 200             t1 += s1.length();
 201             t2 += s2.length();
 202 
 203             if (s1.equals(s2)) {
 204                 debugLogln("   >" + s1 + "<");
 205                 ++p1;
 206                 ++p2;
 207             }
 208             else {
 209                 int tempT1 = t1;
 210                 int tempT2 = t2;
 211                 int tempP1 = p1;
 212                 int tempP2 = p2;
 213 
 214                 while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {
 215                     while (tempT1 < tempT2 && tempP1 < f1.size()) {
 216                         tempT1 += ((String)f1.elementAt(tempP1)).length();
 217                         ++tempP1;
 218                     }
 219                     while (tempT2 < tempT1 && tempP2 < f2.size()) {
 220                         tempT2 += ((String)f2.elementAt(tempP2)).length();
 221                         ++tempP2;
 222                     }
 223                 }
 224                 logln("*** " + f1Name + " has:");
 225                 while (p1 <= tempP1 && p1 < f1.size()) {
 226                     s1 = (String)f1.elementAt(p1);
 227                     t1 += s1.length();
 228                     debugLogln(" *** >" + s1 + "<");
 229                     ++p1;
 230                 }
 231                 logln("***** " + f2Name + " has:");
 232                 while (p2 <= tempP2 && p2 < f2.size()) {
 233                     s2 = (String)f2.elementAt(p2);
 234                     t2 += s2.length();
 235                     debugLogln(" ***** >" + s2 + "<");
 236                     ++p2;
 237                 }
 238                 errln("Discrepancy between " + f1Name + " and " + f2Name + "\n---\n" + f1 +"\n---\n" + f2);
 239             }
 240         }
 241     }
 242 
 243     private void testFollowing(BreakIterator bi, String text, int[] boundaries) {
 244         logln("testFollowing():");
 245         int p = 2;
 246         int i = 0;
 247         try {
 248             for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
 249                 if (i == boundaries[p])
 250                     ++p;
 251 
 252                 int b = bi.following(i);
 253                 logln("bi.following(" + i + ") -> " + b);
 254                 if (b != boundaries[p])
 255                     errln("Wrong result from following() for " + i + ": expected " + boundaries[p]
 256                           + ", got " + b);
 257             }
 258         } catch (IllegalArgumentException illargExp) {
 259             errln("IllegalArgumentException caught from following() for offset: " + i);
 260         }
 261     }
 262 
 263     private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
 264         logln("testPreceding():");
 265         int p = 0;
 266         int i = 0;
 267         try {
 268             for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
 269                 int b = bi.preceding(i);
 270                 logln("bi.preceding(" + i + ") -> " + b);
 271                 if (b != boundaries[p])
 272                     errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
 273                           + ", got " + b);
 274 
 275                 if (i == boundaries[p + 1])
 276                     ++p;
 277             }
 278         } catch (IllegalArgumentException illargExp) {
 279             errln("IllegalArgumentException caught from preceding() for offset: " + i);
 280         }
 281     }
 282 
 283     private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
 284         logln("testIsBoundary():");
 285         int p = 1;
 286         boolean isB;
 287         for (int i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
 288             isB = bi.isBoundary(i);
 289             logln("bi.isBoundary(" + i + ") -> " + isB);
 290 
 291             if (i == boundaries[p]) {
 292                 if (!isB)
 293                     errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
 294                 ++p;
 295             }
 296             else {
 297                 if (isB)
 298                     errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
 299             }
 300         }
 301     }
 302 
 303     private void doMultipleSelectionTest(BreakIterator iterator, String testText)
 304     {
 305         logln("Multiple selection test...");
 306         BreakIterator testIterator = (BreakIterator)iterator.clone();
 307         int offset = iterator.first();
 308         int testOffset;
 309         int count = 0;
 310 
 311         do {
 312             testOffset = testIterator.first();
 313             testOffset = testIterator.next(count);
 314             logln("next(" + count + ") -> " + testOffset);
 315             if (offset != testOffset)
 316                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 317 
 318             if (offset != BreakIterator.DONE) {
 319                 count++;
 320                 offset = iterator.next();
 321             }
 322         } while (offset != BreakIterator.DONE);
 323 
 324         // now do it backwards...
 325         offset = iterator.last();
 326         count = 0;
 327 
 328         do {
 329             testOffset = testIterator.last();
 330             testOffset = testIterator.next(count);
 331             logln("next(" + count + ") -> " + testOffset);
 332             if (offset != testOffset)
 333                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 334 
 335             if (offset != BreakIterator.DONE) {
 336                 count--;
 337                 offset = iterator.previous();
 338             }
 339         } while (offset != BreakIterator.DONE);
 340     }
 341 
 342     private void doBreakInvariantTest(BreakIterator tb, String testChars)
 343     {
 344         StringBuffer work = new StringBuffer("aaa");
 345         int errorCount = 0;
 346 
 347         // a break should always occur after CR (unless followed by LF), LF, PS, and LS
 348         String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028";
 349                             // change this back when new BI code is added
 350 
 351         for (int i = 0; i < breaks.length(); i++) {
 352             work.setCharAt(1, breaks.charAt(i));
 353             for (int j = 0; j < testChars.length(); j++) {
 354                 work.setCharAt(0, testChars.charAt(j));
 355                 for (int k = 0; k < testChars.length(); k++) {
 356                     char c = testChars.charAt(k);
 357 
 358                     // if a cr is followed by lf, don't do the check (they stay together)
 359                     if (work.charAt(1) == '\r' && (c == '\n'))
 360                         continue;
 361 
 362                     // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
 363                     // for breaking purposes as per UTR14
 364                     int type1 = Character.getType(work.charAt(1));
 365                     int type2 = Character.getType(c);
 366                     if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
 367                         type2 == Character.CONTROL || type2 == Character.FORMAT) {
 368                         continue;
 369                     }
 370 
 371                     work.setCharAt(2, c);
 372                     tb.setText(work.toString());
 373                     boolean seen2 = false;
 374                     for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) {
 375                         if (l == 2)
 376                             seen2 = true;
 377                     }
 378                     if (!seen2) {
 379                         errln("No break between U+" + Integer.toHexString((int)(work.charAt(1)))
 380                                     + " and U+" + Integer.toHexString((int)(work.charAt(2))));
 381                         errorCount++;
 382                         if (errorCount >= 75)
 383                             return;
 384                     }
 385                 }
 386             }
 387         }
 388     }
 389 
 390     private void doOtherInvariantTest(BreakIterator tb, String testChars)
 391     {
 392         StringBuffer work = new StringBuffer("a\r\na");
 393         int errorCount = 0;
 394 
 395         // a break should never occur between CR and LF
 396         for (int i = 0; i < testChars.length(); i++) {
 397             work.setCharAt(0, testChars.charAt(i));
 398             for (int j = 0; j < testChars.length(); j++) {
 399                 work.setCharAt(3, testChars.charAt(j));
 400                 tb.setText(work.toString());
 401                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
 402                     if (k == 2) {
 403                         errln("Break between CR and LF in string U+" + Integer.toHexString(
 404                                 (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
 405                                 (int)(work.charAt(3))));
 406                         errorCount++;
 407                         if (errorCount >= 75)
 408                             return;
 409                     }
 410             }
 411         }
 412 
 413         // a break should never occur before a non-spacing mark, unless it's preceded
 414         // by a line terminator
 415         work.setLength(0);
 416         work.append("aaaa");
 417         for (int i = 0; i < testChars.length(); i++) {
 418             char c = testChars.charAt(i);
 419             if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
 420                 continue;
 421             work.setCharAt(1, c);
 422             for (int j = 0; j < testChars.length(); j++) {
 423                 c = testChars.charAt(j);
 424                 if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
 425                         != Character.ENCLOSING_MARK)
 426                     continue;
 427                 work.setCharAt(2, c);
 428 
 429                 // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
 430                 // for breaking purposes as per UTR14
 431                 int type1 = Character.getType(work.charAt(1));
 432                 int type2 = Character.getType(work.charAt(2));
 433                 if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
 434                     type2 == Character.CONTROL || type2 == Character.FORMAT) {
 435                     continue;
 436                 }
 437 
 438                 tb.setText(work.toString());
 439                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
 440                     if (k == 2) {
 441                         errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
 442                                 + " and U+" + Integer.toHexString((int)(work.charAt(2))));
 443                         errorCount++;
 444                         if (errorCount >= 75)
 445                             return;
 446                     }
 447             }
 448         }
 449     }
 450 
 451     public void debugLogln(String s) {
 452         final String zeros = "0000";
 453         String temp;
 454         StringBuffer out = new StringBuffer();
 455         for (int i = 0; i < s.length(); i++) {
 456             char c = s.charAt(i);
 457             if (c >= ' ' && c < '\u007f')
 458                 out.append(c);
 459             else {
 460                 out.append("\\u");
 461                 temp = Integer.toHexString((int)c);
 462                 out.append(zeros.substring(0, 4 - temp.length()));
 463                 out.append(temp);
 464             }
 465         }
 466         logln(out.toString());
 467     }
 468 
 469     //=========================================================================
 470     // tests
 471     //=========================================================================
 472 
 473     public void TestWordBreak() {
 474 
 475         Vector<String> wordSelectionData = new Vector<String>();
 476 
 477         wordSelectionData.addElement("12,34");
 478 
 479         wordSelectionData.addElement(" ");
 480         wordSelectionData.addElement("\u00A2"); //cent sign
 481         wordSelectionData.addElement("\u00A3"); //pound sign
 482         wordSelectionData.addElement("\u00A4"); //currency sign
 483         wordSelectionData.addElement("\u00A5"); //yen sign
 484         wordSelectionData.addElement("alpha-beta-gamma");
 485         wordSelectionData.addElement(".");
 486         wordSelectionData.addElement(" ");
 487         wordSelectionData.addElement("Badges");
 488         wordSelectionData.addElement("?");
 489         wordSelectionData.addElement(" ");
 490         wordSelectionData.addElement("BADGES");
 491         wordSelectionData.addElement("!");
 492         wordSelectionData.addElement("?");
 493         wordSelectionData.addElement("!");
 494         wordSelectionData.addElement(" ");
 495         wordSelectionData.addElement("We");
 496         wordSelectionData.addElement(" ");
 497         wordSelectionData.addElement("don't");
 498         wordSelectionData.addElement(" ");
 499         wordSelectionData.addElement("need");
 500         wordSelectionData.addElement(" ");
 501         wordSelectionData.addElement("no");
 502         wordSelectionData.addElement(" ");
 503         wordSelectionData.addElement("STINKING");
 504         wordSelectionData.addElement(" ");
 505         wordSelectionData.addElement("BADGES");
 506         wordSelectionData.addElement("!");
 507         wordSelectionData.addElement("!");
 508         wordSelectionData.addElement("!");
 509 
 510         wordSelectionData.addElement("012.566,5");
 511         wordSelectionData.addElement(" ");
 512         wordSelectionData.addElement("123.3434,900");
 513         wordSelectionData.addElement(" ");
 514         wordSelectionData.addElement("1000,233,456.000");
 515         wordSelectionData.addElement(" ");
 516         wordSelectionData.addElement("1,23.322%");
 517         wordSelectionData.addElement(" ");
 518         wordSelectionData.addElement("123.1222");
 519 
 520         wordSelectionData.addElement(" ");
 521         wordSelectionData.addElement("\u0024123,000.20");
 522 
 523         wordSelectionData.addElement(" ");
 524         wordSelectionData.addElement("179.01\u0025");
 525 
 526         wordSelectionData.addElement("Hello");
 527         wordSelectionData.addElement(",");
 528         wordSelectionData.addElement(" ");
 529         wordSelectionData.addElement("how");
 530         wordSelectionData.addElement(" ");
 531         wordSelectionData.addElement("are");
 532         wordSelectionData.addElement(" ");
 533         wordSelectionData.addElement("you");
 534         wordSelectionData.addElement(" ");
 535         wordSelectionData.addElement("X");
 536         wordSelectionData.addElement(" ");
 537 
 538         wordSelectionData.addElement("Now");
 539         wordSelectionData.addElement("\r");
 540         wordSelectionData.addElement("is");
 541         wordSelectionData.addElement("\n");
 542         wordSelectionData.addElement("the");
 543         wordSelectionData.addElement("\r\n");
 544         wordSelectionData.addElement("time");
 545         wordSelectionData.addElement("\n");
 546         wordSelectionData.addElement("\r");
 547         wordSelectionData.addElement("for");
 548         wordSelectionData.addElement("\r");
 549         wordSelectionData.addElement("\r");
 550         wordSelectionData.addElement("all");
 551         wordSelectionData.addElement(" ");
 552 
 553         generalIteratorTest(wordBreak, wordSelectionData);
 554     }
 555 
 556     public void TestBug4097779() {
 557         Vector<String> wordSelectionData = new Vector<String>();
 558 
 559         wordSelectionData.addElement("aa\u0300a");
 560         wordSelectionData.addElement(" ");
 561 
 562         generalIteratorTest(wordBreak, wordSelectionData);
 563     }
 564 
 565     public void TestBug4098467Words() {
 566         Vector<String> wordSelectionData = new Vector<String>();
 567 
 568         // What follows is a string of Korean characters (I found it in the Yellow Pages
 569         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
 570         // it correctly), first as precomposed syllables, and then as conjoining jamo.
 571         // Both sequences should be semantically identical and break the same way.
 572         // precomposed syllables...
 573         wordSelectionData.addElement("\uc0c1\ud56d");
 574         wordSelectionData.addElement(" ");
 575         wordSelectionData.addElement("\ud55c\uc778");
 576         wordSelectionData.addElement(" ");
 577         wordSelectionData.addElement("\uc5f0\ud569");
 578         wordSelectionData.addElement(" ");
 579         wordSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c");
 580         wordSelectionData.addElement(" ");
 581         // conjoining jamo...
 582         wordSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc");
 583         wordSelectionData.addElement(" ");
 584         wordSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab");
 585         wordSelectionData.addElement(" ");
 586         wordSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8");
 587         wordSelectionData.addElement(" ");
 588         wordSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
 589         wordSelectionData.addElement(" ");
 590 
 591         generalIteratorTest(wordBreak, wordSelectionData);
 592     }
 593 
 594     public void TestBug4117554Words() {
 595         Vector<String> wordSelectionData = new Vector<String>();
 596 
 597         // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
 598         // count as a Kanji character for the purposes of word breaking
 599         wordSelectionData.addElement("abc");
 600         wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03");
 601         wordSelectionData.addElement("abc");
 602 
 603         generalIteratorTest(wordBreak, wordSelectionData);
 604     }
 605 
 606     public void TestSentenceBreak() {
 607         Vector<String> sentenceSelectionData = new Vector<String>();
 608 
 609         sentenceSelectionData.addElement("This is a simple sample sentence. ");
 610         sentenceSelectionData.addElement("(This is it.) ");
 611         sentenceSelectionData.addElement("This is a simple sample sentence. ");
 612         sentenceSelectionData.addElement("\"This isn\'t it.\" ");
 613         sentenceSelectionData.addElement("Hi! ");
 614         sentenceSelectionData.addElement("This is a simple sample sentence. ");
 615         sentenceSelectionData.addElement("It does not have to make any sense as you can see. ");
 616         sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
 617         sentenceSelectionData.addElement("Che la dritta via aveo smarrita. ");
 618         sentenceSelectionData.addElement("He said, that I said, that you said!! ");
 619 
 620         sentenceSelectionData.addElement("Don't rock the boat.\u2029");
 621 
 622         sentenceSelectionData.addElement("Because I am the daddy, that is why. ");
 623         sentenceSelectionData.addElement("Not on my time (el timo.)! ");
 624 
 625         sentenceSelectionData.addElement("So what!!\u2029");
 626 
 627         sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" ");
 628         sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
 629         sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n");
 630         sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
 631         sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? ");
 632         sentenceSelectionData.addElement("He answered, \"You may not!\" ");
 633         sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". ");
 634         sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
 635         sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? ");
 636         sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!");
 637 
 638         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 639     }
 640 
 641     public void TestBug4113835() {
 642         Vector<String> sentenceSelectionData = new Vector<String>();
 643 
 644         // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
 645         sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029");
 646 
 647         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 648     }
 649 
 650     public void TestBug4111338() {
 651         Vector<String> sentenceSelectionData = new Vector<String>();
 652 
 653         // test for bug #4111338: Don't break sentences at the boundary between CJK
 654         // and other letters
 655         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
 656                 + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
 657                 + "\u611d\u57b6\u2510\u5d46\".\u2029");
 658         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
 659                 + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
 660                 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
 661         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
 662                 + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
 663                 + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
 664         sentenceSelectionData.addElement("He said, \"I can go there.\"\u2029");
 665 
 666         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 667     }
 668 
 669     public void TestBug4117554Sentences() {
 670         Vector<String> sentenceSelectionData = new Vector<String>();
 671 
 672         // Treat fullwidth variants of .!? the same as their
 673         // normal counterparts
 674         sentenceSelectionData.addElement("I know I'm right\uff0e ");
 675         sentenceSelectionData.addElement("Right\uff1f ");
 676         sentenceSelectionData.addElement("Right\uff01 ");
 677 
 678         // Don't break sentences at boundary between CJK and digits
 679         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
 680                 + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
 681                 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
 682 
 683         // Break sentence between a sentence terminator and
 684         // opening punctuation
 685         sentenceSelectionData.addElement("no?");
 686         sentenceSelectionData.addElement("(yes)");
 687 
 688         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 689     }
 690 
 691     public void TestBug4158381() {
 692         Vector<String> sentenceSelectionData = new Vector<String>();
 693 
 694         // Don't break sentence after period if it isn't followed by a space
 695         sentenceSelectionData.addElement("Test <code>Flags.Flag</code> class.  ");
 696         sentenceSelectionData.addElement("Another test.\u2029");
 697 
 698         // No breaks when there are no terminators around
 699         sentenceSelectionData.addElement("<P>Provides a set of "
 700                 + "&quot;lightweight&quot; (all-java<FONT SIZE=\"-2\"><SUP>TM"
 701                 + "</SUP></FONT> language) components that, "
 702                 + "to the maximum degree possible, work the same on all platforms.  ");
 703         sentenceSelectionData.addElement("Another test.\u2029");
 704 
 705         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 706     }
 707 
 708     public void TestBug4143071() {
 709         Vector<String> sentenceSelectionData = new Vector<String>();
 710 
 711         // Make sure sentences that end with digits work right
 712         sentenceSelectionData.addElement("Today is the 27th of May, 1998.  ");
 713         sentenceSelectionData.addElement("Tomorrow with be 28 May 1998.  ");
 714         sentenceSelectionData.addElement("The day after will be the 30th.\u2029");
 715 
 716         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 717     }
 718 
 719     public void TestBug4152416() {
 720         Vector<String> sentenceSelectionData = new Vector<String>();
 721 
 722         // Make sure sentences ending with a capital letter are treated correctly
 723         sentenceSelectionData.addElement("The type of all primitive "
 724                 + "<code>boolean</code> values accessed in the target VM.  ");
 725         sentenceSelectionData.addElement("Calls to xxx will return an "
 726                 + "implementor of this interface.\u2029");
 727 
 728         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 729     }
 730 
 731     public void TestBug4152117() {
 732         Vector<String> sentenceSelectionData = new Vector<String>();
 733 
 734         // Make sure sentence breaking is handling punctuation correctly
 735         // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
 736         // IT DOESN'T CROP UP]
 737         sentenceSelectionData.addElement("Constructs a randomly generated "
 738                 + "BigInteger, uniformly distributed over the range <tt>0</tt> "
 739                 + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive.  ");
 740         sentenceSelectionData.addElement("The uniformity of the distribution "
 741                 + "assumes that a fair source of random bits is provided in "
 742                 + "<tt>rnd</tt>.  ");
 743         sentenceSelectionData.addElement("Note that this constructor always "
 744                 + "constructs a non-negative BigInteger.\u2029");
 745 
 746         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 747     }
 748 
 749     public void TestLineBreak() {
 750         Vector<String> lineSelectionData = new Vector<String>();
 751 
 752         lineSelectionData.addElement("Multi-");
 753         lineSelectionData.addElement("Level ");
 754         lineSelectionData.addElement("example ");
 755         lineSelectionData.addElement("of ");
 756         lineSelectionData.addElement("a ");
 757         lineSelectionData.addElement("semi-");
 758         lineSelectionData.addElement("idiotic ");
 759         lineSelectionData.addElement("non-");
 760         lineSelectionData.addElement("sensical ");
 761         lineSelectionData.addElement("(non-");
 762         lineSelectionData.addElement("important) ");
 763         lineSelectionData.addElement("sentence. ");
 764 
 765         lineSelectionData.addElement("Hi  ");
 766         lineSelectionData.addElement("Hello ");
 767         lineSelectionData.addElement("How\n");
 768         lineSelectionData.addElement("are\r");
 769         lineSelectionData.addElement("you\u2028");
 770         lineSelectionData.addElement("fine.\t");
 771         lineSelectionData.addElement("good.  ");
 772 
 773         lineSelectionData.addElement("Now\r");
 774         lineSelectionData.addElement("is\n");
 775         lineSelectionData.addElement("the\r\n");
 776         lineSelectionData.addElement("time\n");
 777         lineSelectionData.addElement("\r");
 778         lineSelectionData.addElement("for\r");
 779         lineSelectionData.addElement("\r");
 780         lineSelectionData.addElement("all");
 781 
 782         generalIteratorTest(lineBreak, lineSelectionData);
 783     }
 784 
 785     public void TestBug4068133() {
 786         Vector<String> lineSelectionData = new Vector<String>();
 787 
 788         lineSelectionData.addElement("\u96f6");
 789         lineSelectionData.addElement("\u4e00\u3002");
 790         lineSelectionData.addElement("\u4e8c\u3001");
 791         lineSelectionData.addElement("\u4e09\u3002\u3001");
 792         lineSelectionData.addElement("\u56db\u3001\u3002\u3001");
 793         lineSelectionData.addElement("\u4e94,");
 794         lineSelectionData.addElement("\u516d.");
 795         lineSelectionData.addElement("\u4e03.\u3001,\u3002");
 796         lineSelectionData.addElement("\u516b");
 797 
 798         generalIteratorTest(lineBreak, lineSelectionData);
 799     }
 800 
 801     public void TestBug4086052() {
 802         Vector<String> lineSelectionData = new Vector<String>();
 803 
 804         lineSelectionData.addElement("foo\u00a0bar ");
 805 //        lineSelectionData.addElement("foo\ufeffbar");
 806 
 807         generalIteratorTest(lineBreak, lineSelectionData);
 808     }
 809 
 810     public void TestBug4097920() {
 811         Vector<String> lineSelectionData = new Vector<String>();
 812 
 813         lineSelectionData.addElement("dog,");
 814         lineSelectionData.addElement("cat,");
 815         lineSelectionData.addElement("mouse ");
 816         lineSelectionData.addElement("(one)");
 817         lineSelectionData.addElement("(two)\n");
 818 
 819         generalIteratorTest(lineBreak, lineSelectionData);
 820     }
 821     /*
 822     public void TestBug4035266() {
 823         Vector<String> lineSelectionData = new Vector<String>();
 824 
 825         lineSelectionData.addElement("The ");
 826         lineSelectionData.addElement("balance ");
 827         lineSelectionData.addElement("is ");
 828         lineSelectionData.addElement("$-23,456.78, ");
 829         lineSelectionData.addElement("not ");
 830         lineSelectionData.addElement("-$32,456.78!\n");
 831 
 832         generalIteratorTest(lineBreak, lineSelectionData);
 833     }
 834     */
 835     public void TestBug4098467Lines() {
 836         Vector<String> lineSelectionData = new Vector<String>();
 837 
 838         // What follows is a string of Korean characters (I found it in the Yellow Pages
 839         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
 840         // it correctly), first as precomposed syllables, and then as conjoining jamo.
 841         // Both sequences should be semantically identical and break the same way.
 842         // precomposed syllables...
 843         lineSelectionData.addElement("\uc0c1");
 844         lineSelectionData.addElement("\ud56d ");
 845         lineSelectionData.addElement("\ud55c");
 846         lineSelectionData.addElement("\uc778 ");
 847         lineSelectionData.addElement("\uc5f0");
 848         lineSelectionData.addElement("\ud569 ");
 849         lineSelectionData.addElement("\uc7a5");
 850         lineSelectionData.addElement("\ub85c");
 851         lineSelectionData.addElement("\uad50");
 852         lineSelectionData.addElement("\ud68c ");
 853         // conjoining jamo...
 854         lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc ");
 855         lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab ");
 856         lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 ");
 857         lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
 858 
 859         if (Locale.getDefault().getLanguage().equals("th")) {
 860             logln("This test is skipped in th locale.");
 861             return;
 862         }
 863 
 864         generalIteratorTest(lineBreak, lineSelectionData);
 865     }
 866 
 867     public void TestBug4117554Lines() {
 868         Vector<String> lineSelectionData = new Vector<String>();
 869 
 870         // Fullwidth .!? should be treated as postJwrd
 871         lineSelectionData.addElement("\u4e01\uff0e");
 872         lineSelectionData.addElement("\u4e02\uff01");
 873         lineSelectionData.addElement("\u4e03\uff1f");
 874 
 875         generalIteratorTest(lineBreak, lineSelectionData);
 876     }
 877 
 878     public void TestBug4217703() {
 879         if (Locale.getDefault().getLanguage().equals("th")) {
 880             logln("This test is skipped in th locale.");
 881             return;
 882         }
 883 
 884         Vector<String> lineSelectionData = new Vector<String>();
 885 
 886         // There shouldn't be a line break between sentence-ending punctuation
 887         // and a closing quote
 888         lineSelectionData.addElement("He ");
 889         lineSelectionData.addElement("said ");
 890         lineSelectionData.addElement("\"Go!\"  ");
 891         lineSelectionData.addElement("I ");
 892         lineSelectionData.addElement("went.  ");
 893 
 894         lineSelectionData.addElement("Hashtable$Enumeration ");
 895         lineSelectionData.addElement("getText().");
 896         lineSelectionData.addElement("getIndex()");
 897 
 898         generalIteratorTest(lineBreak, lineSelectionData);
 899     }
 900 
    // Base letter + combining mark sequences.  TestCharacterBreak adds each of
    // these as a single element of its character-break data.
    private static final String graveS = "S\u0300";       // S + U+0300 (combining grave accent)
    private static final String acuteBelowI = "i\u0317";  // i + U+0317 (combining acute accent below)
    private static final String acuteE = "e\u0301";       // e + U+0301 (combining acute accent)
    private static final String circumflexA = "a\u0302";  // a + U+0302 (combining circumflex accent)
    private static final String tildeE = "e\u0303";       // e + U+0303 (combining tilde)
 906 
 907     public void TestCharacterBreak() {
 908         Vector<String> characterSelectionData = new Vector<String>();
 909 
 910         characterSelectionData.addElement(graveS);
 911         characterSelectionData.addElement(acuteBelowI);
 912         characterSelectionData.addElement("m");
 913         characterSelectionData.addElement("p");
 914         characterSelectionData.addElement("l");
 915         characterSelectionData.addElement(acuteE);
 916         characterSelectionData.addElement(" ");
 917         characterSelectionData.addElement("s");
 918         characterSelectionData.addElement(circumflexA);
 919         characterSelectionData.addElement("m");
 920         characterSelectionData.addElement("p");
 921         characterSelectionData.addElement("l");
 922         characterSelectionData.addElement(tildeE);
 923         characterSelectionData.addElement(".");
 924         characterSelectionData.addElement("w");
 925         characterSelectionData.addElement(circumflexA);
 926         characterSelectionData.addElement("w");
 927         characterSelectionData.addElement("a");
 928         characterSelectionData.addElement("f");
 929         characterSelectionData.addElement("q");
 930         characterSelectionData.addElement("\n");
 931         characterSelectionData.addElement("\r");
 932         characterSelectionData.addElement("\r\n");
 933         characterSelectionData.addElement("\n");
 934 
 935         generalIteratorTest(characterBreak, characterSelectionData);
 936     }
 937 
 938     public void TestBug4098467Characters() {
 939         Vector<String> characterSelectionData = new Vector<String>();
 940 
 941         // What follows is a string of Korean characters (I found it in the Yellow Pages
 942         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
 943         // it correctly), first as precomposed syllables, and then as conjoining jamo.
 944         // Both sequences should be semantically identical and break the same way.
 945         // precomposed syllables...
 946         characterSelectionData.addElement("\uc0c1");
 947         characterSelectionData.addElement("\ud56d");
 948         characterSelectionData.addElement(" ");
 949         characterSelectionData.addElement("\ud55c");
 950         characterSelectionData.addElement("\uc778");
 951         characterSelectionData.addElement(" ");
 952         characterSelectionData.addElement("\uc5f0");
 953         characterSelectionData.addElement("\ud569");
 954         characterSelectionData.addElement(" ");
 955         characterSelectionData.addElement("\uc7a5");
 956         characterSelectionData.addElement("\ub85c");
 957         characterSelectionData.addElement("\uad50");
 958         characterSelectionData.addElement("\ud68c");
 959         characterSelectionData.addElement(" ");
 960         // conjoining jamo...
 961         characterSelectionData.addElement("\u1109\u1161\u11bc");
 962         characterSelectionData.addElement("\u1112\u1161\u11bc");
 963         characterSelectionData.addElement(" ");
 964         characterSelectionData.addElement("\u1112\u1161\u11ab");
 965         characterSelectionData.addElement("\u110b\u1175\u11ab");
 966         characterSelectionData.addElement(" ");
 967         characterSelectionData.addElement("\u110b\u1167\u11ab");
 968         characterSelectionData.addElement("\u1112\u1161\u11b8");
 969         characterSelectionData.addElement(" ");
 970         characterSelectionData.addElement("\u110c\u1161\u11bc");
 971         characterSelectionData.addElement("\u1105\u1169");
 972         characterSelectionData.addElement("\u1100\u116d");
 973         characterSelectionData.addElement("\u1112\u116c");
 974 
 975         generalIteratorTest(characterBreak, characterSelectionData);
 976     }
 977 
 978     public void TestBug4153072() {
 979         BreakIterator iter = BreakIterator.getWordInstance();
 980         String str = "...Hello, World!...";
 981         int begin = 3;
 982         int end = str.length() - 3;
 983         boolean gotException = false;
 984         boolean dummy;
 985 
 986         iter.setText(new StringCharacterIterator(str, begin, end, begin));
 987         for (int index = -1; index < begin + 1; ++index) {
 988             try {
 989                 dummy = iter.isBoundary(index);
 990                 if (index < begin)
 991                     errln("Didn't get exception with offset = " + index +
 992                                     " and begin index = " + begin);
 993             }
 994             catch (IllegalArgumentException e) {
 995                 if (index >= begin)
 996                     errln("Got exception with offset = " + index +
 997                                     " and begin index = " + begin);
 998             }
 999         }
1000     }
1001 
1002     public void TestBug4146175Sentences() {
1003         Vector<String> sentenceSelectionData = new Vector<String>();
1004 
1005         // break between periods and opening punctuation even when there's no
1006         // intervening space
1007         sentenceSelectionData.addElement("end.");
1008         sentenceSelectionData.addElement("(This is\u2029");
1009 
1010         // treat the fullwidth period as an unambiguous sentence terminator
1011         sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e");
1012         sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f");
1013 
1014         generalIteratorTest(sentenceBreak, sentenceSelectionData);
1015     }
1016 
1017     public void TestBug4146175Lines() {
1018         if (Locale.getDefault().getLanguage().equals("th")) {
1019             logln("This test is skipped in th locale.");
1020             return;
1021         }
1022 
1023         Vector<String> lineSelectionData = new Vector<String>();
1024 
1025         // the fullwidth comma should stick to the preceding Japanese character
1026         lineSelectionData.addElement("\u7d42\uff0c");
1027         lineSelectionData.addElement("\u308f");
1028 
1029         generalIteratorTest(lineBreak, lineSelectionData);
1030     }
1031 
1032     public void TestBug4214367() {
1033         if (Locale.getDefault().getLanguage().equals("th")) {
1034             logln("This test is skipped in th locale.");
1035             return;
1036         }
1037 
1038         Vector<String> wordSelectionData = new Vector<String>();
1039 
1040         // the hiragana and katakana iteration marks and the long vowel mark
1041         // are not being treated correctly by the word-break iterator
1042         wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042");
1043         wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2");
1044 
1045         generalIteratorTest(wordBreak, wordSelectionData);
1046     }
1047 
    // Common pool of characters used by the Test*Invariants methods below;
    // each of those tests appends its own iterator-specific extras.
    // The leading run of control characters is commented out of the literal.
    private static final String cannedTestChars // characters of the class Cc are ignorable for breaking
        = /*"\u0000\u0001\u0002\u0003\u0004*/" !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
        + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
        + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
        + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
        + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
        + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
1055 
1056     public void TestSentenceInvariants()
1057     {
1058         BreakIterator e = BreakIterator.getSentenceInstance();
1059         doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
1060     }
1061 
1062     public void TestWordInvariants()
1063     {
1064         if (Locale.getDefault().getLanguage().equals("th")) {
1065             logln("This test is skipped in th locale.");
1066             return;
1067         }
1068 
1069         BreakIterator e = BreakIterator.getWordInstance();
1070         doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1071             + "\u30a3\u4e00\u4e01\u4e02");
1072         doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1073             + "\u30a3\u4e00\u4e01\u4e02");
1074     }
1075 
1076     public void TestLineInvariants()
1077     {
1078         if (Locale.getDefault().getLanguage().equals("th")) {
1079             logln("This test is skipped in th locale.");
1080             return;
1081         }
1082 
1083         BreakIterator e = BreakIterator.getLineInstance();
1084         String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
1085             + "\u30a3\u4e00\u4e01\u4e02";
1086         doBreakInvariantTest(e, testChars);
1087         doOtherInvariantTest(e, testChars);
1088 
1089         int errorCount = 0;
1090 
1091         // in addition to the other invariants, a line-break iterator should make sure that:
1092         // it doesn't break around the non-breaking characters
1093         String noBreak = "\u00a0\u2007\u2011\ufeff";
1094         StringBuffer work = new StringBuffer("aaa");
1095         for (int i = 0; i < testChars.length(); i++) {
1096             char c = testChars.charAt(i);
1097             if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003')
1098                 continue;
1099             work.setCharAt(0, c);
1100             for (int j = 0; j < noBreak.length(); j++) {
1101                 work.setCharAt(1, noBreak.charAt(j));
1102                 for (int k = 0; k < testChars.length(); k++) {
1103                     work.setCharAt(2, testChars.charAt(k));
1104                     // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
1105                     // for breaking purposes as per UTR14
1106                     int type1 = Character.getType(work.charAt(1));
1107                     int type2 = Character.getType(work.charAt(2));
1108                     if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
1109                         type2 == Character.CONTROL || type2 == Character.FORMAT) {
1110                         continue;
1111                     }
1112                     e.setText(work.toString());
1113                     for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) {
1114                         if (l == 1 || l == 2) {
1115                             //errln("Got break between U+" + Integer.toHexString((int)
1116                             //        (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1117                             //        (int)(work.charAt(l))) + "\ntype1 = " + type1 + "\ntype2 = " + type2);
1118                             // as per UTR14 spaces followed by a GLUE character should allow
1119                             // line breaking
1120                             if (work.charAt(l-1) == '\u0020' && (work.charAt(l) == '\u00a0' ||
1121                                                                  work.charAt(l) == '\u0f0c' ||
1122                                                                  work.charAt(l) == '\u2007' ||
1123                                                                  work.charAt(l) == '\u2011' ||
1124                                                                  work.charAt(l) == '\u202f' ||
1125                                                                  work.charAt(l) == '\ufeff')) {
1126                                 continue;
1127                             }
1128                             errln("Got break between U+" + Integer.toHexString((int)
1129                                     (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1130                                     (int)(work.charAt(l))));
1131                             errorCount++;
1132                             if (errorCount >= 75)
1133                                 return;
1134                         }
1135                     }
1136                 }
1137             }
1138         }
1139 
1140         // The following test has so many exceptions that it would be better to write a new set of data
1141         // that tested exactly what should be tested
1142         // Until that point it will be commented out
1143         /*
1144 
1145         // it does break after dashes (unless they're followed by a digit, a non-spacing mark,
1146         // a currency symbol, a space, a format-control character, a regular control character,
1147         // a line or paragraph separator, or another dash)
1148         String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
1149         for (int i = 0; i < testChars.length(); i++) {
1150             work.setCharAt(0, testChars.charAt(i));
1151             for (int j = 0; j < dashes.length(); j++) {
1152                 work.setCharAt(1, dashes.charAt(j));
1153                 for (int k = 0; k < testChars.length(); k++) {
1154                     char c = testChars.charAt(k);
1155                     if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
1156                         Character.getType(c) == Character.OTHER_NUMBER ||
1157                         Character.getType(c) == Character.NON_SPACING_MARK ||
1158                         Character.getType(c) == Character.ENCLOSING_MARK ||
1159                         Character.getType(c) == Character.CURRENCY_SYMBOL ||
1160                         Character.getType(c) == Character.DASH_PUNCTUATION ||
1161                         Character.getType(c) == Character.SPACE_SEPARATOR ||
1162                         Character.getType(c) == Character.FORMAT ||
1163                         Character.getType(c) == Character.CONTROL ||
1164                         Character.getType(c) == Character.END_PUNCTUATION ||
1165                         Character.getType(c) == Character.FINAL_QUOTE_PUNCTUATION ||
1166                         Character.getType(c) == Character.OTHER_PUNCTUATION ||
1167                         c == '\'' || c == '\"' ||
1168                         // category EX as per UTR14
1169                         c == '!' || c == '?' || c == '\ufe56' || c == '\ufe57' || c == '\uff01' || c == '\uff1f' ||
1170                         c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
1171                         c == '\u0003' || c == '\u2007' || c == '\u2011' ||
1172                         c == '\ufeff')
1173                         continue;
1174                     work.setCharAt(2, c);
1175                     e.setText(work.toString());
1176                     boolean saw2 = false;
1177                     for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
1178                         if (l == 2)
1179                             saw2 = true;
1180                     if (!saw2) {
1181                         errln("Didn't get break between U+" + Integer.toHexString((int)
1182                                     (work.charAt(1))) + " and U+" + Integer.toHexString(
1183                                     (int)(work.charAt(2))));
1184                         errorCount++;
1185                         if (errorCount >= 75)
1186                             return;
1187                     }
1188                 }
1189             }
1190         }
1191         */
1192     }
1193 
1194     public void TestCharacterInvariants()
1195     {
1196         BreakIterator e = BreakIterator.getCharacterInstance();
1197         doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1198             + "\u11a9\u11aa");
1199         doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1200             + "\u11a9\u11aa");
1201     }
1202 
1203     public void TestEmptyString()
1204     {
1205         String text = "";
1206         Vector<String> x = new Vector<String>();
1207         x.addElement(text);
1208 
1209         generalIteratorTest(lineBreak, x);
1210     }
1211 
1212     public void TestGetAvailableLocales()
1213     {
1214         Locale[] locList = BreakIterator.getAvailableLocales();
1215 
1216         if (locList.length == 0)
1217             errln("getAvailableLocales() returned an empty list!");
1218         // I have no idea how to test this function...
1219     }
1220 
1221 
1222     /**
1223      * Bug 4095322
1224      */
1225     public void TestJapaneseLineBreak()
1226     {
1227         StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
1228         // Breaking on <Kanji>$<Kanji> is inconsistent
1229 
1230         /* Characters in precedingChars and followingChars have been updated
1231          * from Unicode 2.0.14-based to 3.0.0-based when 4638433 was fixed.
1232          * In concrete terms,
1233          *   0x301F : Its category was changed from Ps to Pe since Unicode 2.1.
1234          *   0x169B & 0x169C : added since Unicode 3.0.0.
1235          */
1236         String precedingChars =
1237             /* Puctuation, Open */
1238           "([{\u201a\u201e\u2045\u207d\u208d\u2329\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff62\u169b"
1239             /* Punctuation, Initial quote */
1240           + "\u00ab\u2018\u201b\u201c\u201f\u2039"
1241             /* Symbol, Currency */
1242           + "\u00a5\u00a3\u00a4\u20a0";
1243 
1244         String followingChars =
1245             /* Puctuation, Close */
1246           ")]}\u2046\u207e\u208e\u232a\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e\u301f\ufd3e\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42\ufe44\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff63\u169c"
1247             /* Punctuation, Final quote */
1248           + "\u00bb\u2019\u201d\u203a"
1249             /* Punctuation, Other */
1250           + "!%,.:;\u3001\u3002\u2030\u2031\u2032\u2033\u2034"
1251             /* Punctuation, Dash */
1252           + "\u2103\u2109"
1253             /* Symbol, Currency */
1254           + "\u00a2"
1255             /* Letter, Modifier */
1256           + "\u3005\u309d\u309e"
1257             /* Letter, Other */
1258           + "\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc\u30fd\u30fe"
1259            /* Mark, Non-Spacing */
1260           + "\u0300\u0301\u0302"
1261             /* Symbol, Modifier */
1262           + "\u309b\u309c"
1263             /* Symbol, Other */
1264           + "\u00b0";
1265 
1266         BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);
1267 
1268         for (int i = 0; i < precedingChars.length(); i++) {
1269             testString.setCharAt(1, precedingChars.charAt(i));
1270             iter.setText(testString.toString());
1271             int j = iter.first();
1272             if (j != 0) {
1273                 errln("ja line break failure: failed to start at 0 and bounced at " + j);
1274             }
1275             j = iter.next();
1276             if (j != 1) {
1277                 errln("ja line break failure: failed to stop before '"
1278                         + precedingChars.charAt(i) + "' (\\u"
1279                         + Integer.toString(precedingChars.charAt(i), 16)
1280                         + ") at 1 and bounded at " + j);
1281             }
1282             j = iter.next();
1283             if (j != 3) {
1284                 errln("ja line break failure: failed to skip position after '"
1285                         + precedingChars.charAt(i) + "' (\\u"
1286                         + Integer.toString(precedingChars.charAt(i), 16)
1287                         + ") at 3 and bounded at " + j);
1288             }
1289         }
1290 
1291         for (int i = 0; i < followingChars.length(); i++) {
1292             testString.setCharAt(1, followingChars.charAt(i));
1293             iter.setText(testString.toString());
1294             int j = iter.first();
1295             if (j != 0) {
1296                 errln("ja line break failure: failed to start at 0 and bounded at " + j);
1297             }
1298             j = iter.next();
1299             if (j != 2) {
1300                 errln("ja line break failure: failed to skip position before '"
1301                         + followingChars.charAt(i) + "' (\\u"
1302                         + Integer.toString(followingChars.charAt(i), 16)
1303                         + ") at 2 and bounded at " + j);
1304             }
1305             j = iter.next();
1306             if (j != 3) {
1307                 errln("ja line break failure: failed to stop after '"
1308                         + followingChars.charAt(i) + "' (\\u"
1309                         + Integer.toString(followingChars.charAt(i), 16)
1310                         + ") at 3 and bounded at " + j);
1311             }
1312         }
1313     }
1314 
1315     /**
1316      * Bug 4638433
1317      */
1318     public void TestLineBreakBasedOnUnicode3_0_0()
1319     {
1320         BreakIterator iter;
1321         int i;
1322 
1323         /* Latin Extend-B characters
1324          * 0x0218-0x0233 which have been added since Unicode 3.0.0.
1325          */
1326         iter = BreakIterator.getWordInstance(Locale.US);
1327         iter.setText("\u0216\u0217\u0218\u0219\u021A");
1328         i = iter.first();
1329         i = iter.next();
1330         if (i != 5) {
1331             errln("Word break failure: failed to stop at 5 and bounded at " + i);
1332         }
1333 
1334 
1335         iter = BreakIterator.getLineInstance(Locale.US);
1336 
1337         /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
1338          * \u301f has changed its category from Ps to Pe since Unicode 2.1.
1339          */
1340         iter.setText("32\u301f1");
1341         i = iter.first();
1342         i = iter.next();
1343         if (i != 3) {
1344             errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
1345         }
1346 
1347         /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
1348          * which have been added since Unicode 3.0.0.
1349          */
1350         iter.setText("\u1820\u1806\u1821");
1351         i = iter.first();
1352         i = iter.next();
1353         if (i != 2) {
1354             errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
1355         }
1356 
1357         /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
1358          * been added since Unicode 3.0.0.
1359          */
1360         iter.setText("\u17E0\u17DB\u17E1");
1361         i = iter.first();
1362         i = iter.next();
1363         if (i != 1) {
1364             errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
1365         }
1366         i = iter.next();
1367         if (i != 3) {
1368             errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
1369         }
1370 
1371         /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
1372          * been added since Unicode 3.0.0.
1373          */
1374         iter.setText("\u1692\u1680\u1696");
1375         i = iter.first();
1376         i = iter.next();
1377         if (i != 2) {
1378             errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
1379         }
1380 
1381 
1382         // Confirm changes in BreakIteratorRules_th.java have been reflected.
1383         iter = BreakIterator.getLineInstance(new Locale("th", ""));
1384 
1385         /* Thai <Seven(Nd)>
1386          *      <Left Double Quotation Mark(Pi)>
1387          *      <Five(Nd)>
1388          *      <Right Double Quotation Mark(Pf)>
1389          *      <Three(Nd)>
1390          */
1391         iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
1392         i = iter.first();
1393         i = iter.next();
1394         if (i != 1) {
1395             errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
1396         }
1397         i = iter.next();
1398         if (i != 4) {
1399             errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
1400         }
1401     }
1402 
1403     /**
1404      * Bug 4068137
1405      */
1406     public void TestEndBehavior()
1407     {
1408         String testString = "boo.";
1409         BreakIterator wb = BreakIterator.getWordInstance();
1410         wb.setText(testString);
1411 
1412         if (wb.first() != 0)
1413             errln("Didn't get break at beginning of string.");
1414         if (wb.next() != 3)
1415             errln("Didn't get break before period in \"boo.\"");
1416         if (wb.current() != 4 && wb.next() != 4)
1417             errln("Didn't get break at end of string.");
1418     }
1419 
1420     // [serialization test has been removed pursuant to bug #4152965]
1421 
1422     /**
1423      * Bug 4450804
1424      */
1425     public void TestLineBreakContractions() {
1426         Vector<String> expected = new Vector<String>();
1427 
1428         expected.add("These ");
1429         expected.add("are ");
1430         expected.add("'foobles'. ");
1431         expected.add("Don't ");
1432         expected.add("you ");
1433         expected.add("like ");
1434         expected.add("them?");
1435         generalIteratorTest(lineBreak, expected);
1436     }
1437 
1438 }