1 /* 2 * Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 /* 25 * @test 26 * @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779 27 * 4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117 28 * 4152416 4153072 4158381 4214367 4217703 4638433 29 * @library /java/text/testlib 30 * @run main/timeout=2000 BreakIteratorTest 31 * @summary test BreakIterator 32 */ 33 34 /* 35 * 36 * 37 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 38 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved 39 * 40 * Portions copyright (c) 2007 Sun Microsystems, Inc. 41 * All Rights Reserved. 42 * 43 * The original version of this source code and documentation 44 * is copyrighted and owned by Taligent, Inc., a wholly-owned 45 * subsidiary of IBM. These materials are provided under terms 46 * of a License Agreement between Taligent and Sun. This technology 47 * is protected by multiple US and International patents. 48 * 49 * This notice and attribution to Taligent may not be removed. 50 * Taligent is a registered trademark of Taligent, Inc. 51 * 52 * Permission to use, copy, modify, and distribute this software 53 * and its documentation for NON-COMMERCIAL purposes and without 54 * fee is hereby granted provided that this copyright notice 55 * appears in all copies. Please refer to the file "copyright.html" 56 * for further important copyright and licensing information. 57 * 58 * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF 59 * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 60 * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 61 * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR 62 * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR 63 * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. 64 * 65 */ 66 67 import java.text.BreakIterator; 68 import java.text.CharacterIterator; 69 import java.text.StringCharacterIterator; 70 import java.util.Locale; 71 import java.util.Vector; 72 import java.util.Enumeration; 73 import java.io.*; 74 75 public class BreakIteratorTest extends IntlTest 76 { 77 private BreakIterator characterBreak; 78 private BreakIterator wordBreak; 79 private BreakIterator lineBreak; 80 private BreakIterator sentenceBreak; 81 82 public static void main(String[] args) throws Exception { 83 new BreakIteratorTest().run(args); 84 } 85 86 public BreakIteratorTest() 87 { 88 characterBreak = BreakIterator.getCharacterInstance(); 89 wordBreak = BreakIterator.getWordInstance(); 90 lineBreak = BreakIterator.getLineInstance(); 91 sentenceBreak = BreakIterator.getSentenceInstance(); 92 } 93 94 //========================================================================= 95 // general test subroutines 96 //========================================================================= 97 98 private void generalIteratorTest(BreakIterator bi, Vector expectedResult) { 99 StringBuffer buffer = new StringBuffer(); 100 String text; 101 for (int i = 0; i < expectedResult.size(); i++) { 102 text = (String)expectedResult.elementAt(i); 103 buffer.append(text); 104 } 105 text = buffer.toString(); 106 107 bi.setText(text); 108 109 Vector nextResults = testFirstAndNext(bi, text); 110 Vector previousResults = testLastAndPrevious(bi, text); 111 112 logln("comparing forward and backward..."); 113 int errs = getErrorCount(); 114 compareFragmentLists("forward iteration", "backward iteration", nextResults, 115 previousResults); 116 if (getErrorCount() == errs) { 117 logln("comparing expected and actual..."); 118 compareFragmentLists("expected result", "actual result", expectedResult, 119 nextResults); 120 } 121 122 int[] boundaries = new int[expectedResult.size() + 3]; 123 boundaries[0] = BreakIterator.DONE; 124 boundaries[1] = 0; 125 for (int i = 0; i < expectedResult.size(); i++) 126 boundaries[i + 2] = boundaries[i + 1] + ((String)expectedResult.elementAt(i)). 127 length(); 128 boundaries[boundaries.length - 1] = BreakIterator.DONE; 129 130 testFollowing(bi, text, boundaries); 131 testPreceding(bi, text, boundaries); 132 testIsBoundary(bi, text, boundaries); 133 134 doMultipleSelectionTest(bi, text); 135 } 136 137 private Vector testFirstAndNext(BreakIterator bi, String text) { 138 int p = bi.first(); 139 int lastP = p; 140 Vector<String> result = new Vector<String>(); 141 142 if (p != 0) 143 errln("first() returned " + p + " instead of 0"); 144 while (p != BreakIterator.DONE) { 145 p = bi.next(); 146 if (p != BreakIterator.DONE) { 147 if (p <= lastP) 148 errln("next() failed to move forward: next() on position " 149 + lastP + " yielded " + p); 150 151 result.addElement(text.substring(lastP, p)); 152 } 153 else { 154 if (lastP != text.length()) 155 errln("next() returned DONE prematurely: offset was " 156 + lastP + " instead of " + text.length()); 157 } 158 lastP = p; 159 } 160 return result; 161 } 162 163 private Vector testLastAndPrevious(BreakIterator bi, String text) { 164 int p = bi.last(); 165 int lastP = p; 166 Vector<String> result = new Vector<String>(); 167 168 if (p != text.length()) 169 errln("last() returned " + p + " instead of " + text.length()); 170 while (p != BreakIterator.DONE) { 171 p = bi.previous(); 172 if (p != BreakIterator.DONE) { 173 if (p >= lastP) 174 errln("previous() failed to move backward: previous() on position " 175 + lastP + " yielded " + p); 176 177 result.insertElementAt(text.substring(p, lastP), 0); 178 } 179 else { 180 if (lastP != 0) 181 errln("previous() returned DONE prematurely: offset was " 182 + lastP + " instead of 0"); 183 } 184 lastP = p; 185 } 186 return result; 187 } 188 189 private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) { 190 int p1 = 0; 191 int p2 = 0; 192 String s1; 193 String s2; 194 int t1 = 0; 195 int t2 = 0; 196 197 while (p1 < f1.size() && p2 < f2.size()) { 198 s1 = (String)f1.elementAt(p1); 199 s2 = (String)f2.elementAt(p2); 200 t1 += s1.length(); 201 t2 += s2.length(); 202 203 if (s1.equals(s2)) { 204 debugLogln(" >" + s1 + "<"); 205 ++p1; 206 ++p2; 207 } 208 else { 209 int tempT1 = t1; 210 int tempT2 = t2; 211 int tempP1 = p1; 212 int tempP2 = p2; 213 214 while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) { 215 while (tempT1 < tempT2 && tempP1 < f1.size()) { 216 tempT1 += ((String)f1.elementAt(tempP1)).length(); 217 ++tempP1; 218 } 219 while (tempT2 < tempT1 && tempP2 < f2.size()) { 220 tempT2 += ((String)f2.elementAt(tempP2)).length(); 221 ++tempP2; 222 } 223 } 224 logln("*** " + f1Name + " has:"); 225 while (p1 <= tempP1 && p1 < f1.size()) { 226 s1 = (String)f1.elementAt(p1); 227 t1 += s1.length(); 228 debugLogln(" *** >" + s1 + "<"); 229 ++p1; 230 } 231 logln("***** " + f2Name + " has:"); 232 while (p2 <= tempP2 && p2 < f2.size()) { 233 s2 = (String)f2.elementAt(p2); 234 t2 += s2.length(); 235 debugLogln(" ***** >" + s2 + "<"); 236 ++p2; 237 } 238 errln("Discrepancy between " + f1Name + " and " + f2Name + "\n---\n" + f1 +"\n---\n" + f2); 239 } 240 } 241 } 242 243 private void testFollowing(BreakIterator bi, String text, int[] boundaries) { 244 logln("testFollowing():"); 245 int p = 2; 246 int i = 0; 247 try { 248 for (i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in 249 if (i == boundaries[p]) 250 ++p; 251 252 int b = bi.following(i); 253 logln("bi.following(" + i + ") -> " + b); 254 if (b != boundaries[p]) 255 errln("Wrong result from following() for " + i + ": expected " + boundaries[p] 256 + ", got " + b); 257 } 258 } catch (IllegalArgumentException illargExp) { 259 errln("IllegalArgumentException caught from following() for offset: " + i); 260 } 261 } 262 263 private void testPreceding(BreakIterator bi, String text, int[] boundaries) { 264 logln("testPreceding():"); 265 int p = 0; 266 int i = 0; 267 try { 268 for (i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in 269 int b = bi.preceding(i); 270 logln("bi.preceding(" + i + ") -> " + b); 271 if (b != boundaries[p]) 272 errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p] 273 + ", got " + b); 274 275 if (i == boundaries[p + 1]) 276 ++p; 277 } 278 } catch (IllegalArgumentException illargExp) { 279 errln("IllegalArgumentException caught from preceding() for offset: " + i); 280 } 281 } 282 283 private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) { 284 logln("testIsBoundary():"); 285 int p = 1; 286 boolean isB; 287 for (int i = 0; i <= text.length(); i++) { // change to <= when new BI code goes in 288 isB = bi.isBoundary(i); 289 logln("bi.isBoundary(" + i + ") -> " + isB); 290 291 if (i == boundaries[p]) { 292 if (!isB) 293 errln("Wrong result from isBoundary() for " + i + ": expected true, got false"); 294 ++p; 295 } 296 else { 297 if (isB) 298 errln("Wrong result from isBoundary() for " + i + ": expected false, got true"); 299 } 300 } 301 } 302 303 private void doMultipleSelectionTest(BreakIterator iterator, String testText) 304 { 305 logln("Multiple selection test..."); 306 BreakIterator testIterator = (BreakIterator)iterator.clone(); 307 int offset = iterator.first(); 308 int testOffset; 309 int count = 0; 310 311 do { 312 testOffset = testIterator.first(); 313 testOffset = testIterator.next(count); 314 logln("next(" + count + ") -> " + testOffset); 315 if (offset != testOffset) 316 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset); 317 318 if (offset != BreakIterator.DONE) { 319 count++; 320 offset = iterator.next(); 321 } 322 } while (offset != BreakIterator.DONE); 323 324 // now do it backwards... 325 offset = iterator.last(); 326 count = 0; 327 328 do { 329 testOffset = testIterator.last(); 330 testOffset = testIterator.next(count); 331 logln("next(" + count + ") -> " + testOffset); 332 if (offset != testOffset) 333 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset); 334 335 if (offset != BreakIterator.DONE) { 336 count--; 337 offset = iterator.previous(); 338 } 339 } while (offset != BreakIterator.DONE); 340 } 341 342 private void doBreakInvariantTest(BreakIterator tb, String testChars) 343 { 344 StringBuffer work = new StringBuffer("aaa"); 345 int errorCount = 0; 346 347 // a break should always occur after CR (unless followed by LF), LF, PS, and LS 348 String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028"; 349 // change this back when new BI code is added 350 351 for (int i = 0; i < breaks.length(); i++) { 352 work.setCharAt(1, breaks.charAt(i)); 353 for (int j = 0; j < testChars.length(); j++) { 354 work.setCharAt(0, testChars.charAt(j)); 355 for (int k = 0; k < testChars.length(); k++) { 356 char c = testChars.charAt(k); 357 358 // if a cr is followed by lf, don't do the check (they stay together) 359 if (work.charAt(1) == '\r' && (c == '\n')) 360 continue; 361 362 // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored 363 // for breaking purposes as per UTR14 364 int type1 = Character.getType(work.charAt(1)); 365 int type2 = Character.getType(c); 366 if (type1 == Character.CONTROL || type1 == Character.FORMAT || 367 type2 == Character.CONTROL || type2 == Character.FORMAT) { 368 continue; 369 } 370 371 work.setCharAt(2, c); 372 tb.setText(work.toString()); 373 boolean seen2 = false; 374 for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) { 375 if (l == 2) 376 seen2 = true; 377 } 378 if (!seen2) { 379 errln("No break between U+" + Integer.toHexString((int)(work.charAt(1))) 380 + " and U+" + Integer.toHexString((int)(work.charAt(2)))); 381 errorCount++; 382 if (errorCount >= 75) 383 return; 384 } 385 } 386 } 387 } 388 } 389 390 private void doOtherInvariantTest(BreakIterator tb, String testChars) 391 { 392 StringBuffer work = new StringBuffer("a\r\na"); 393 int errorCount = 0; 394 395 // a break should never occur between CR and LF 396 for (int i = 0; i < testChars.length(); i++) { 397 work.setCharAt(0, testChars.charAt(i)); 398 for (int j = 0; j < testChars.length(); j++) { 399 work.setCharAt(3, testChars.charAt(j)); 400 tb.setText(work.toString()); 401 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) 402 if (k == 2) { 403 errln("Break between CR and LF in string U+" + Integer.toHexString( 404 (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString( 405 (int)(work.charAt(3)))); 406 errorCount++; 407 if (errorCount >= 75) 408 return; 409 } 410 } 411 } 412 413 // a break should never occur before a non-spacing mark, unless it's preceded 414 // by a line terminator 415 work.setLength(0); 416 work.append("aaaa"); 417 for (int i = 0; i < testChars.length(); i++) { 418 char c = testChars.charAt(i); 419 if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003') 420 continue; 421 work.setCharAt(1, c); 422 for (int j = 0; j < testChars.length(); j++) { 423 c = testChars.charAt(j); 424 if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c) 425 != Character.ENCLOSING_MARK) 426 continue; 427 work.setCharAt(2, c); 428 429 // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored 430 // for breaking purposes as per UTR14 431 int type1 = Character.getType(work.charAt(1)); 432 int type2 = Character.getType(work.charAt(2)); 433 if (type1 == Character.CONTROL || type1 == Character.FORMAT || 434 type2 == Character.CONTROL || type2 == Character.FORMAT) { 435 continue; 436 } 437 438 tb.setText(work.toString()); 439 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next()) 440 if (k == 2) { 441 errln("Break between U+" + Integer.toHexString((int)(work.charAt(1))) 442 + " and U+" + Integer.toHexString((int)(work.charAt(2)))); 443 errorCount++; 444 if (errorCount >= 75) 445 return; 446 } 447 } 448 } 449 } 450 451 public void debugLogln(String s) { 452 final String zeros = "0000"; 453 String temp; 454 StringBuffer out = new StringBuffer(); 455 for (int i = 0; i < s.length(); i++) { 456 char c = s.charAt(i); 457 if (c >= ' ' && c < '\u007f') 458 out.append(c); 459 else { 460 out.append("\\u"); 461 temp = Integer.toHexString((int)c); 462 out.append(zeros.substring(0, 4 - temp.length())); 463 out.append(temp); 464 } 465 } 466 logln(out.toString()); 467 } 468 469 //========================================================================= 470 // tests 471 //========================================================================= 472 473 public void TestWordBreak() { 474 475 Vector<String> wordSelectionData = new Vector<String>(); 476 477 wordSelectionData.addElement("12,34"); 478 479 wordSelectionData.addElement(" "); 480 wordSelectionData.addElement("\u00A2"); //cent sign 481 wordSelectionData.addElement("\u00A3"); //pound sign 482 wordSelectionData.addElement("\u00A4"); //currency sign 483 wordSelectionData.addElement("\u00A5"); //yen sign 484 wordSelectionData.addElement("alpha-beta-gamma"); 485 wordSelectionData.addElement("."); 486 wordSelectionData.addElement(" "); 487 wordSelectionData.addElement("Badges"); 488 wordSelectionData.addElement("?"); 489 wordSelectionData.addElement(" "); 490 wordSelectionData.addElement("BADGES"); 491 wordSelectionData.addElement("!"); 492 wordSelectionData.addElement("?"); 493 wordSelectionData.addElement("!"); 494 wordSelectionData.addElement(" "); 495 wordSelectionData.addElement("We"); 496 wordSelectionData.addElement(" "); 497 wordSelectionData.addElement("don't"); 498 wordSelectionData.addElement(" "); 499 wordSelectionData.addElement("need"); 500 wordSelectionData.addElement(" "); 501 wordSelectionData.addElement("no"); 502 wordSelectionData.addElement(" "); 503 wordSelectionData.addElement("STINKING"); 504 wordSelectionData.addElement(" "); 505 wordSelectionData.addElement("BADGES"); 506 wordSelectionData.addElement("!"); 507 wordSelectionData.addElement("!"); 508 wordSelectionData.addElement("!"); 509 510 wordSelectionData.addElement("012.566,5"); 511 wordSelectionData.addElement(" "); 512 wordSelectionData.addElement("123.3434,900"); 513 wordSelectionData.addElement(" "); 514 wordSelectionData.addElement("1000,233,456.000"); 515 wordSelectionData.addElement(" "); 516 wordSelectionData.addElement("1,23.322%"); 517 wordSelectionData.addElement(" "); 518 wordSelectionData.addElement("123.1222"); 519 520 wordSelectionData.addElement(" "); 521 wordSelectionData.addElement("\u0024123,000.20"); 522 523 wordSelectionData.addElement(" "); 524 wordSelectionData.addElement("179.01\u0025"); 525 526 wordSelectionData.addElement("Hello"); 527 wordSelectionData.addElement(","); 528 wordSelectionData.addElement(" "); 529 wordSelectionData.addElement("how"); 530 wordSelectionData.addElement(" "); 531 wordSelectionData.addElement("are"); 532 wordSelectionData.addElement(" "); 533 wordSelectionData.addElement("you"); 534 wordSelectionData.addElement(" "); 535 wordSelectionData.addElement("X"); 536 wordSelectionData.addElement(" "); 537 538 wordSelectionData.addElement("Now"); 539 wordSelectionData.addElement("\r"); 540 wordSelectionData.addElement("is"); 541 wordSelectionData.addElement("\n"); 542 wordSelectionData.addElement("the"); 543 wordSelectionData.addElement("\r\n"); 544 wordSelectionData.addElement("time"); 545 wordSelectionData.addElement("\n"); 546 wordSelectionData.addElement("\r"); 547 wordSelectionData.addElement("for"); 548 wordSelectionData.addElement("\r"); 549 wordSelectionData.addElement("\r"); 550 wordSelectionData.addElement("all"); 551 wordSelectionData.addElement(" "); 552 553 generalIteratorTest(wordBreak, wordSelectionData); 554 } 555 556 public void TestBug4097779() { 557 Vector<String> wordSelectionData = new Vector<String>(); 558 559 wordSelectionData.addElement("aa\u0300a"); 560 wordSelectionData.addElement(" "); 561 562 generalIteratorTest(wordBreak, wordSelectionData); 563 } 564 565 public void TestBug4098467Words() { 566 Vector<String> wordSelectionData = new Vector<String>(); 567 568 // What follows is a string of Korean characters (I found it in the Yellow Pages 569 // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 570 // it correctly), first as precomposed syllables, and then as conjoining jamo. 571 // Both sequences should be semantically identical and break the same way. 572 // precomposed syllables... 573 wordSelectionData.addElement("\uc0c1\ud56d"); 574 wordSelectionData.addElement(" "); 575 wordSelectionData.addElement("\ud55c\uc778"); 576 wordSelectionData.addElement(" "); 577 wordSelectionData.addElement("\uc5f0\ud569"); 578 wordSelectionData.addElement(" "); 579 wordSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c"); 580 wordSelectionData.addElement(" "); 581 // conjoining jamo... 582 wordSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc"); 583 wordSelectionData.addElement(" "); 584 wordSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab"); 585 wordSelectionData.addElement(" "); 586 wordSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8"); 587 wordSelectionData.addElement(" "); 588 wordSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c"); 589 wordSelectionData.addElement(" "); 590 591 generalIteratorTest(wordBreak, wordSelectionData); 592 } 593 594 public void TestBug4117554Words() { 595 Vector<String> wordSelectionData = new Vector<String>(); 596 597 // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should 598 // count as a Kanji character for the purposes of word breaking 599 wordSelectionData.addElement("abc"); 600 wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03"); 601 wordSelectionData.addElement("abc"); 602 603 generalIteratorTest(wordBreak, wordSelectionData); 604 } 605 606 public void TestSentenceBreak() { 607 Vector<String> sentenceSelectionData = new Vector<String>(); 608 609 sentenceSelectionData.addElement("This is a simple sample sentence. "); 610 sentenceSelectionData.addElement("(This is it.) "); 611 sentenceSelectionData.addElement("This is a simple sample sentence. "); 612 sentenceSelectionData.addElement("\"This isn\'t it.\" "); 613 sentenceSelectionData.addElement("Hi! "); 614 sentenceSelectionData.addElement("This is a simple sample sentence. "); 615 sentenceSelectionData.addElement("It does not have to make any sense as you can see. "); 616 sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. "); 617 sentenceSelectionData.addElement("Che la dritta via aveo smarrita. "); 618 sentenceSelectionData.addElement("He said, that I said, that you said!! "); 619 620 sentenceSelectionData.addElement("Don't rock the boat.\u2029"); 621 622 sentenceSelectionData.addElement("Because I am the daddy, that is why. "); 623 sentenceSelectionData.addElement("Not on my time (el timo.)! "); 624 625 sentenceSelectionData.addElement("So what!!\u2029"); 626 627 sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" "); 628 sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). "); 629 sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n"); 630 sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" "); 631 sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? "); 632 sentenceSelectionData.addElement("He answered, \"You may not!\" "); 633 sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". "); 634 sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' "); 635 sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? "); 636 sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!"); 637 638 generalIteratorTest(sentenceBreak, sentenceSelectionData); 639 } 640 641 public void TestBug4113835() { 642 Vector<String> sentenceSelectionData = new Vector<String>(); 643 644 // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks 645 sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029"); 646 647 generalIteratorTest(sentenceBreak, sentenceSelectionData); 648 } 649 650 public void TestBug4111338() { 651 Vector<String> sentenceSelectionData = new Vector<String>(); 652 653 // test for bug #4111338: Don't break sentences at the boundary between CJK 654 // and other letters 655 sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c" 656 + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba" 657 + "\u611d\u57b6\u2510\u5d46\".\u2029"); 658 sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8" 659 + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0" 660 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029"); 661 sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4" 662 + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8" 663 + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029"); 664 sentenceSelectionData.addElement("He said, \"I can go there.\"\u2029"); 665 666 generalIteratorTest(sentenceBreak, sentenceSelectionData); 667 } 668 669 public void TestBug4117554Sentences() { 670 Vector<String> sentenceSelectionData = new Vector<String>(); 671 672 // Treat fullwidth variants of .!? the same as their 673 // normal counterparts 674 sentenceSelectionData.addElement("I know I'm right\uff0e "); 675 sentenceSelectionData.addElement("Right\uff1f "); 676 sentenceSelectionData.addElement("Right\uff01 "); 677 678 // Don't break sentences at boundary between CJK and digits 679 sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8" 680 + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0" 681 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029"); 682 683 // Break sentence between a sentence terminator and 684 // opening punctuation 685 sentenceSelectionData.addElement("no?"); 686 sentenceSelectionData.addElement("(yes)"); 687 688 generalIteratorTest(sentenceBreak, sentenceSelectionData); 689 } 690 691 public void TestBug4158381() { 692 Vector<String> sentenceSelectionData = new Vector<String>(); 693 694 // Don't break sentence after period if it isn't followed by a space 695 sentenceSelectionData.addElement("Test <code>Flags.Flag</code> class. "); 696 sentenceSelectionData.addElement("Another test.\u2029"); 697 698 // No breaks when there are no terminators around 699 sentenceSelectionData.addElement("<P>Provides a set of " 700 + ""lightweight" (all-java<FONT SIZE=\"-2\"><SUP>TM" 701 + "</SUP></FONT> language) components that, " 702 + "to the maximum degree possible, work the same on all platforms. "); 703 sentenceSelectionData.addElement("Another test.\u2029"); 704 705 generalIteratorTest(sentenceBreak, sentenceSelectionData); 706 } 707 708 public void TestBug4143071() { 709 Vector<String> sentenceSelectionData = new Vector<String>(); 710 711 // Make sure sentences that end with digits work right 712 sentenceSelectionData.addElement("Today is the 27th of May, 1998. "); 713 sentenceSelectionData.addElement("Tomorrow with be 28 May 1998. "); 714 sentenceSelectionData.addElement("The day after will be the 30th.\u2029"); 715 716 generalIteratorTest(sentenceBreak, sentenceSelectionData); 717 } 718 719 public void TestBug4152416() { 720 Vector<String> sentenceSelectionData = new Vector<String>(); 721 722 // Make sure sentences ending with a capital letter are treated correctly 723 sentenceSelectionData.addElement("The type of all primitive " 724 + "<code>boolean</code> values accessed in the target VM. "); 725 sentenceSelectionData.addElement("Calls to xxx will return an " 726 + "implementor of this interface.\u2029"); 727 728 generalIteratorTest(sentenceBreak, sentenceSelectionData); 729 } 730 731 public void TestBug4152117() { 732 Vector<String> sentenceSelectionData = new Vector<String>(); 733 734 // Make sure sentence breaking is handling punctuation correctly 735 // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE 736 // IT DOESN'T CROP UP] 737 sentenceSelectionData.addElement("Constructs a randomly generated " 738 + "BigInteger, uniformly distributed over the range <tt>0</tt> " 739 + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. "); 740 sentenceSelectionData.addElement("The uniformity of the distribution " 741 + "assumes that a fair source of random bits is provided in " 742 + "<tt>rnd</tt>. "); 743 sentenceSelectionData.addElement("Note that this constructor always " 744 + "constructs a non-negative BigInteger.\u2029"); 745 746 generalIteratorTest(sentenceBreak, sentenceSelectionData); 747 } 748 749 public void TestLineBreak() { 750 Vector<String> lineSelectionData = new Vector<String>(); 751 752 lineSelectionData.addElement("Multi-"); 753 lineSelectionData.addElement("Level "); 754 lineSelectionData.addElement("example "); 755 lineSelectionData.addElement("of "); 756 lineSelectionData.addElement("a "); 757 lineSelectionData.addElement("semi-"); 758 lineSelectionData.addElement("idiotic "); 759 lineSelectionData.addElement("non-"); 760 lineSelectionData.addElement("sensical "); 761 lineSelectionData.addElement("(non-"); 762 lineSelectionData.addElement("important) "); 763 lineSelectionData.addElement("sentence. "); 764 765 lineSelectionData.addElement("Hi "); 766 lineSelectionData.addElement("Hello "); 767 lineSelectionData.addElement("How\n"); 768 lineSelectionData.addElement("are\r"); 769 lineSelectionData.addElement("you\u2028"); 770 lineSelectionData.addElement("fine.\t"); 771 lineSelectionData.addElement("good. "); 772 773 lineSelectionData.addElement("Now\r"); 774 lineSelectionData.addElement("is\n"); 775 lineSelectionData.addElement("the\r\n"); 776 lineSelectionData.addElement("time\n"); 777 lineSelectionData.addElement("\r"); 778 lineSelectionData.addElement("for\r"); 779 lineSelectionData.addElement("\r"); 780 lineSelectionData.addElement("all"); 781 782 generalIteratorTest(lineBreak, lineSelectionData); 783 } 784 785 public void TestBug4068133() { 786 Vector<String> lineSelectionData = new Vector<String>(); 787 788 lineSelectionData.addElement("\u96f6"); 789 lineSelectionData.addElement("\u4e00\u3002"); 790 lineSelectionData.addElement("\u4e8c\u3001"); 791 lineSelectionData.addElement("\u4e09\u3002\u3001"); 792 lineSelectionData.addElement("\u56db\u3001\u3002\u3001"); 793 lineSelectionData.addElement("\u4e94,"); 794 lineSelectionData.addElement("\u516d."); 795 lineSelectionData.addElement("\u4e03.\u3001,\u3002"); 796 lineSelectionData.addElement("\u516b"); 797 798 generalIteratorTest(lineBreak, lineSelectionData); 799 } 800 801 public void TestBug4086052() { 802 Vector<String> lineSelectionData = new Vector<String>(); 803 804 lineSelectionData.addElement("foo\u00a0bar "); 805 // lineSelectionData.addElement("foo\ufeffbar"); 806 807 generalIteratorTest(lineBreak, lineSelectionData); 808 } 809 810 public void TestBug4097920() { 811 Vector<String> lineSelectionData = new Vector<String>(); 812 813 lineSelectionData.addElement("dog,"); 814 lineSelectionData.addElement("cat,"); 815 lineSelectionData.addElement("mouse "); 816 lineSelectionData.addElement("(one)"); 817 lineSelectionData.addElement("(two)\n"); 818 819 generalIteratorTest(lineBreak, lineSelectionData); 820 } 821 /* 822 public void TestBug4035266() { 823 Vector<String> lineSelectionData = new Vector<String>(); 824 825 lineSelectionData.addElement("The "); 826 lineSelectionData.addElement("balance "); 827 lineSelectionData.addElement("is "); 828 lineSelectionData.addElement("$-23,456.78, "); 829 lineSelectionData.addElement("not "); 830 lineSelectionData.addElement("-$32,456.78!\n"); 831 832 generalIteratorTest(lineBreak, lineSelectionData); 833 } 834 */ 835 public void TestBug4098467Lines() { 836 Vector<String> lineSelectionData = new Vector<String>(); 837 838 // What follows is a string of Korean characters (I found it in the Yellow Pages 839 // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 840 // it correctly), first as precomposed syllables, and then as conjoining jamo. 841 // Both sequences should be semantically identical and break the same way. 842 // precomposed syllables... 843 lineSelectionData.addElement("\uc0c1"); 844 lineSelectionData.addElement("\ud56d "); 845 lineSelectionData.addElement("\ud55c"); 846 lineSelectionData.addElement("\uc778 "); 847 lineSelectionData.addElement("\uc5f0"); 848 lineSelectionData.addElement("\ud569 "); 849 lineSelectionData.addElement("\uc7a5"); 850 lineSelectionData.addElement("\ub85c"); 851 lineSelectionData.addElement("\uad50"); 852 lineSelectionData.addElement("\ud68c "); 853 // conjoining jamo... 854 lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc "); 855 lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab "); 856 lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 "); 857 lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c"); 858 859 if (Locale.getDefault().getLanguage().equals("th")) { 860 logln("This test is skipped in th locale."); 861 return; 862 } 863 864 generalIteratorTest(lineBreak, lineSelectionData); 865 } 866 867 public void TestBug4117554Lines() { 868 Vector<String> lineSelectionData = new Vector<String>(); 869 870 // Fullwidth .!? should be treated as postJwrd 871 lineSelectionData.addElement("\u4e01\uff0e"); 872 lineSelectionData.addElement("\u4e02\uff01"); 873 lineSelectionData.addElement("\u4e03\uff1f"); 874 875 generalIteratorTest(lineBreak, lineSelectionData); 876 } 877 878 public void TestBug4217703() { 879 if (Locale.getDefault().getLanguage().equals("th")) { 880 logln("This test is skipped in th locale."); 881 return; 882 } 883 884 Vector<String> lineSelectionData = new Vector<String>(); 885 886 // There shouldn't be a line break between sentence-ending punctuation 887 // and a closing quote 888 lineSelectionData.addElement("He "); 889 lineSelectionData.addElement("said "); 890 lineSelectionData.addElement("\"Go!\" "); 891 lineSelectionData.addElement("I "); 892 lineSelectionData.addElement("went. "); 893 894 lineSelectionData.addElement("Hashtable$Enumeration "); 895 lineSelectionData.addElement("getText()."); 896 lineSelectionData.addElement("getIndex()"); 897 898 generalIteratorTest(lineBreak, lineSelectionData); 899 } 900 901 private static final String graveS = "S\u0300"; 902 private static final String acuteBelowI = "i\u0317"; 903 private static final String acuteE = "e\u0301"; 904 private static final String circumflexA = "a\u0302"; 905 private static final String tildeE = "e\u0303"; 906 907 public void TestCharacterBreak() { 908 Vector<String> characterSelectionData = new Vector<String>(); 909 910 characterSelectionData.addElement(graveS); 911 characterSelectionData.addElement(acuteBelowI); 912 characterSelectionData.addElement("m"); 913 characterSelectionData.addElement("p"); 914 characterSelectionData.addElement("l"); 915 characterSelectionData.addElement(acuteE); 916 characterSelectionData.addElement(" "); 917 characterSelectionData.addElement("s"); 918 characterSelectionData.addElement(circumflexA); 919 characterSelectionData.addElement("m"); 920 characterSelectionData.addElement("p"); 921 characterSelectionData.addElement("l"); 922 characterSelectionData.addElement(tildeE); 923 characterSelectionData.addElement("."); 924 characterSelectionData.addElement("w"); 925 characterSelectionData.addElement(circumflexA); 926 characterSelectionData.addElement("w"); 927 characterSelectionData.addElement("a"); 928 characterSelectionData.addElement("f"); 929 characterSelectionData.addElement("q"); 930 characterSelectionData.addElement("\n"); 931 characterSelectionData.addElement("\r"); 932 characterSelectionData.addElement("\r\n"); 933 characterSelectionData.addElement("\n"); 934 935 generalIteratorTest(characterBreak, characterSelectionData); 936 } 937 938 public void TestBug4098467Characters() { 939 Vector<String> characterSelectionData = new Vector<String>(); 940 941 // What follows is a string of Korean characters (I found it in the Yellow Pages 942 // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 943 // it correctly), first as precomposed syllables, and then as conjoining jamo. 944 // Both sequences should be semantically identical and break the same way. 945 // precomposed syllables... 946 characterSelectionData.addElement("\uc0c1"); 947 characterSelectionData.addElement("\ud56d"); 948 characterSelectionData.addElement(" "); 949 characterSelectionData.addElement("\ud55c"); 950 characterSelectionData.addElement("\uc778"); 951 characterSelectionData.addElement(" "); 952 characterSelectionData.addElement("\uc5f0"); 953 characterSelectionData.addElement("\ud569"); 954 characterSelectionData.addElement(" "); 955 characterSelectionData.addElement("\uc7a5"); 956 characterSelectionData.addElement("\ub85c"); 957 characterSelectionData.addElement("\uad50"); 958 characterSelectionData.addElement("\ud68c"); 959 characterSelectionData.addElement(" "); 960 // conjoining jamo... 961 characterSelectionData.addElement("\u1109\u1161\u11bc"); 962 characterSelectionData.addElement("\u1112\u1161\u11bc"); 963 characterSelectionData.addElement(" "); 964 characterSelectionData.addElement("\u1112\u1161\u11ab"); 965 characterSelectionData.addElement("\u110b\u1175\u11ab"); 966 characterSelectionData.addElement(" "); 967 characterSelectionData.addElement("\u110b\u1167\u11ab"); 968 characterSelectionData.addElement("\u1112\u1161\u11b8"); 969 characterSelectionData.addElement(" "); 970 characterSelectionData.addElement("\u110c\u1161\u11bc"); 971 characterSelectionData.addElement("\u1105\u1169"); 972 characterSelectionData.addElement("\u1100\u116d"); 973 characterSelectionData.addElement("\u1112\u116c"); 974 975 generalIteratorTest(characterBreak, characterSelectionData); 976 } 977 978 public void TestBug4153072() { 979 BreakIterator iter = BreakIterator.getWordInstance(); 980 String str = "...Hello, World!..."; 981 int begin = 3; 982 int end = str.length() - 3; 983 boolean gotException = false; 984 boolean dummy; 985 986 iter.setText(new StringCharacterIterator(str, begin, end, begin)); 987 for (int index = -1; index < begin + 1; ++index) { 988 try { 989 dummy = iter.isBoundary(index); 990 if (index < begin) 991 errln("Didn't get exception with offset = " + index + 992 " and begin index = " + begin); 993 } 994 catch (IllegalArgumentException e) { 995 if (index >= begin) 996 errln("Got exception with offset = " + index + 997 " and begin index = " + begin); 998 } 999 } 1000 } 1001 1002 public void TestBug4146175Sentences() { 1003 Vector<String> sentenceSelectionData = new Vector<String>(); 1004 1005 // break between periods and opening punctuation even when there's no 1006 // intervening space 1007 sentenceSelectionData.addElement("end."); 1008 sentenceSelectionData.addElement("(This is\u2029"); 1009 1010 // treat the fullwidth period as an unambiguous sentence terminator 1011 sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e"); 1012 sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f"); 1013 1014 generalIteratorTest(sentenceBreak, sentenceSelectionData); 1015 } 1016 1017 public void TestBug4146175Lines() { 1018 if (Locale.getDefault().getLanguage().equals("th")) { 1019 logln("This test is skipped in th locale."); 1020 return; 1021 } 1022 1023 Vector<String> lineSelectionData = new Vector<String>(); 1024 1025 // the fullwidth comma should stick to the preceding Japanese character 1026 lineSelectionData.addElement("\u7d42\uff0c"); 1027 lineSelectionData.addElement("\u308f"); 1028 1029 generalIteratorTest(lineBreak, lineSelectionData); 1030 } 1031 1032 public void TestBug4214367() { 1033 if (Locale.getDefault().getLanguage().equals("th")) { 1034 logln("This test is skipped in th locale."); 1035 return; 1036 } 1037 1038 Vector<String> wordSelectionData = new Vector<String>(); 1039 1040 // the hiragana and katakana iteration marks and the long vowel mark 1041 // are not being treated correctly by the word-break iterator 1042 wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042"); 1043 wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2"); 1044 1045 generalIteratorTest(wordBreak, wordSelectionData); 1046 } 1047 1048 private static final String cannedTestChars // characters fo the class Cc are ignorable for breaking 1049 = /*"\u0000\u0001\u0002\u0003\u0004*/" !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2" 1050 + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3" 1051 + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303" 1052 + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000" 1053 + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f" 1054 + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164"; 1055 1056 public void TestSentenceInvariants() 1057 { 1058 BreakIterator e = BreakIterator.getSentenceInstance(); 1059 doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff"); 1060 } 1061 1062 public void TestWordInvariants() 1063 { 1064 if (Locale.getDefault().getLanguage().equals("th")) { 1065 logln("This test is skipped in th locale."); 1066 return; 1067 } 1068 1069 BreakIterator e = BreakIterator.getWordInstance(); 1070 doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2" 1071 + "\u30a3\u4e00\u4e01\u4e02"); 1072 doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2" 1073 + "\u30a3\u4e00\u4e01\u4e02"); 1074 } 1075 1076 public void TestLineInvariants() 1077 { 1078 if (Locale.getDefault().getLanguage().equals("th")) { 1079 logln("This test is skipped in th locale."); 1080 return; 1081 } 1082 1083 BreakIterator e = BreakIterator.getLineInstance(); 1084 String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045" 1085 + "\u30a3\u4e00\u4e01\u4e02"; 1086 doBreakInvariantTest(e, testChars); 1087 doOtherInvariantTest(e, testChars); 1088 1089 int errorCount = 0; 1090 1091 // in addition to the other invariants, a line-break iterator should make sure that: 1092 // it doesn't break around the non-breaking characters 1093 String noBreak = "\u00a0\u2007\u2011\ufeff"; 1094 StringBuffer work = new StringBuffer("aaa"); 1095 for (int i = 0; i < testChars.length(); i++) { 1096 char c = testChars.charAt(i); 1097 if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003') 1098 continue; 1099 work.setCharAt(0, c); 1100 for (int j = 0; j < noBreak.length(); j++) { 1101 work.setCharAt(1, noBreak.charAt(j)); 1102 for (int k = 0; k < testChars.length(); k++) { 1103 work.setCharAt(2, testChars.charAt(k)); 1104 // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored 1105 // for breaking purposes as per UTR14 1106 int type1 = Character.getType(work.charAt(1)); 1107 int type2 = Character.getType(work.charAt(2)); 1108 if (type1 == Character.CONTROL || type1 == Character.FORMAT || 1109 type2 == Character.CONTROL || type2 == Character.FORMAT) { 1110 continue; 1111 } 1112 e.setText(work.toString()); 1113 for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) { 1114 if (l == 1 || l == 2) { 1115 //errln("Got break between U+" + Integer.toHexString((int) 1116 // (work.charAt(l - 1))) + " and U+" + Integer.toHexString( 1117 // (int)(work.charAt(l))) + "\ntype1 = " + type1 + "\ntype2 = " + type2); 1118 // as per UTR14 spaces followed by a GLUE character should allow 1119 // line breaking 1120 if (work.charAt(l-1) == '\u0020' && (work.charAt(l) == '\u00a0' || 1121 work.charAt(l) == '\u0f0c' || 1122 work.charAt(l) == '\u2007' || 1123 work.charAt(l) == '\u2011' || 1124 work.charAt(l) == '\u202f' || 1125 work.charAt(l) == '\ufeff')) { 1126 continue; 1127 } 1128 errln("Got break between U+" + Integer.toHexString((int) 1129 (work.charAt(l - 1))) + " and U+" + Integer.toHexString( 1130 (int)(work.charAt(l)))); 1131 errorCount++; 1132 if (errorCount >= 75) 1133 return; 1134 } 1135 } 1136 } 1137 } 1138 } 1139 1140 // The following test has so many exceptions that it would be better to write a new set of data 1141 // that tested exactly what should be tested 1142 // Until that point it will be commented out 1143 /* 1144 1145 // it does break after dashes (unless they're followed by a digit, a non-spacing mark, 1146 // a currency symbol, a space, a format-control character, a regular control character, 1147 // a line or paragraph separator, or another dash) 1148 String dashes = "-\u00ad\u2010\u2012\u2013\u2014"; 1149 for (int i = 0; i < testChars.length(); i++) { 1150 work.setCharAt(0, testChars.charAt(i)); 1151 for (int j = 0; j < dashes.length(); j++) { 1152 work.setCharAt(1, dashes.charAt(j)); 1153 for (int k = 0; k < testChars.length(); k++) { 1154 char c = testChars.charAt(k); 1155 if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER || 1156 Character.getType(c) == Character.OTHER_NUMBER || 1157 Character.getType(c) == Character.NON_SPACING_MARK || 1158 Character.getType(c) == Character.ENCLOSING_MARK || 1159 Character.getType(c) == Character.CURRENCY_SYMBOL || 1160 Character.getType(c) == Character.DASH_PUNCTUATION || 1161 Character.getType(c) == Character.SPACE_SEPARATOR || 1162 Character.getType(c) == Character.FORMAT || 1163 Character.getType(c) == Character.CONTROL || 1164 Character.getType(c) == Character.END_PUNCTUATION || 1165 Character.getType(c) == Character.FINAL_QUOTE_PUNCTUATION || 1166 Character.getType(c) == Character.OTHER_PUNCTUATION || 1167 c == '\'' || c == '\"' || 1168 // category EX as per UTR14 1169 c == '!' || c == '?' || c == '\ufe56' || c == '\ufe57' || c == '\uff01' || c == '\uff1f' || 1170 c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' || 1171 c == '\u0003' || c == '\u2007' || c == '\u2011' || 1172 c == '\ufeff') 1173 continue; 1174 work.setCharAt(2, c); 1175 e.setText(work.toString()); 1176 boolean saw2 = false; 1177 for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) 1178 if (l == 2) 1179 saw2 = true; 1180 if (!saw2) { 1181 errln("Didn't get break between U+" + Integer.toHexString((int) 1182 (work.charAt(1))) + " and U+" + Integer.toHexString( 1183 (int)(work.charAt(2)))); 1184 errorCount++; 1185 if (errorCount >= 75) 1186 return; 1187 } 1188 } 1189 } 1190 } 1191 */ 1192 } 1193 1194 public void TestCharacterInvariants() 1195 { 1196 BreakIterator e = BreakIterator.getCharacterInstance(); 1197 doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8" 1198 + "\u11a9\u11aa"); 1199 doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8" 1200 + "\u11a9\u11aa"); 1201 } 1202 1203 public void TestEmptyString() 1204 { 1205 String text = ""; 1206 Vector<String> x = new Vector<String>(); 1207 x.addElement(text); 1208 1209 generalIteratorTest(lineBreak, x); 1210 } 1211 1212 public void TestGetAvailableLocales() 1213 { 1214 Locale[] locList = BreakIterator.getAvailableLocales(); 1215 1216 if (locList.length == 0) 1217 errln("getAvailableLocales() returned an empty list!"); 1218 // I have no idea how to test this function... 1219 } 1220 1221 1222 /** 1223 * Bug 4095322 1224 */ 1225 public void TestJapaneseLineBreak() 1226 { 1227 StringBuffer testString = new StringBuffer("\u4e00x\u4e8c"); 1228 // Breaking on <Kanji>$<Kanji> is inconsistent 1229 1230 /* Characters in precedingChars and followingChars have been updated 1231 * from Unicode 2.0.14-based to 3.0.0-based when 4638433 was fixed. 1232 * In concrete terms, 1233 * 0x301F : Its category was changed from Ps to Pe since Unicode 2.1. 1234 * 0x169B & 0x169C : added since Unicode 3.0.0. 1235 */ 1236 String precedingChars = 1237 /* Puctuation, Open */ 1238 "([{\u201a\u201e\u2045\u207d\u208d\u2329\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff62\u169b" 1239 /* Punctuation, Initial quote */ 1240 + "\u00ab\u2018\u201b\u201c\u201f\u2039" 1241 /* Symbol, Currency */ 1242 + "\u00a5\u00a3\u00a4\u20a0"; 1243 1244 String followingChars = 1245 /* Puctuation, Close */ 1246 ")]}\u2046\u207e\u208e\u232a\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e\u301f\ufd3e\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42\ufe44\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff63\u169c" 1247 /* Punctuation, Final quote */ 1248 + "\u00bb\u2019\u201d\u203a" 1249 /* Punctuation, Other */ 1250 + "!%,.:;\u3001\u3002\u2030\u2031\u2032\u2033\u2034" 1251 /* Punctuation, Dash */ 1252 + "\u2103\u2109" 1253 /* Symbol, Currency */ 1254 + "\u00a2" 1255 /* Letter, Modifier */ 1256 + "\u3005\u309d\u309e" 1257 /* Letter, Other */ 1258 + "\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc\u30fd\u30fe" 1259 /* Mark, Non-Spacing */ 1260 + "\u0300\u0301\u0302" 1261 /* Symbol, Modifier */ 1262 + "\u309b\u309c" 1263 /* Symbol, Other */ 1264 + "\u00b0"; 1265 1266 BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN); 1267 1268 for (int i = 0; i < precedingChars.length(); i++) { 1269 testString.setCharAt(1, precedingChars.charAt(i)); 1270 iter.setText(testString.toString()); 1271 int j = iter.first(); 1272 if (j != 0) { 1273 errln("ja line break failure: failed to start at 0 and bounced at " + j); 1274 } 1275 j = iter.next(); 1276 if (j != 1) { 1277 errln("ja line break failure: failed to stop before '" 1278 + precedingChars.charAt(i) + "' (\\u" 1279 + Integer.toString(precedingChars.charAt(i), 16) 1280 + ") at 1 and bounded at " + j); 1281 } 1282 j = iter.next(); 1283 if (j != 3) { 1284 errln("ja line break failure: failed to skip position after '" 1285 + precedingChars.charAt(i) + "' (\\u" 1286 + Integer.toString(precedingChars.charAt(i), 16) 1287 + ") at 3 and bounded at " + j); 1288 } 1289 } 1290 1291 for (int i = 0; i < followingChars.length(); i++) { 1292 testString.setCharAt(1, followingChars.charAt(i)); 1293 iter.setText(testString.toString()); 1294 int j = iter.first(); 1295 if (j != 0) { 1296 errln("ja line break failure: failed to start at 0 and bounded at " + j); 1297 } 1298 j = iter.next(); 1299 if (j != 2) { 1300 errln("ja line break failure: failed to skip position before '" 1301 + followingChars.charAt(i) + "' (\\u" 1302 + Integer.toString(followingChars.charAt(i), 16) 1303 + ") at 2 and bounded at " + j); 1304 } 1305 j = iter.next(); 1306 if (j != 3) { 1307 errln("ja line break failure: failed to stop after '" 1308 + followingChars.charAt(i) + "' (\\u" 1309 + Integer.toString(followingChars.charAt(i), 16) 1310 + ") at 3 and bounded at " + j); 1311 } 1312 } 1313 } 1314 1315 /** 1316 * Bug 4638433 1317 */ 1318 public void TestLineBreakBasedOnUnicode3_0_0() 1319 { 1320 BreakIterator iter; 1321 int i; 1322 1323 /* Latin Extend-B characters 1324 * 0x0218-0x0233 which have been added since Unicode 3.0.0. 1325 */ 1326 iter = BreakIterator.getWordInstance(Locale.US); 1327 iter.setText("\u0216\u0217\u0218\u0219\u021A"); 1328 i = iter.first(); 1329 i = iter.next(); 1330 if (i != 5) { 1331 errln("Word break failure: failed to stop at 5 and bounded at " + i); 1332 } 1333 1334 1335 iter = BreakIterator.getLineInstance(Locale.US); 1336 1337 /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)> 1338 * \u301f has changed its category from Ps to Pe since Unicode 2.1. 1339 */ 1340 iter.setText("32\u301f1"); 1341 i = iter.first(); 1342 i = iter.next(); 1343 if (i != 3) { 1344 errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i); 1345 } 1346 1347 /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)> 1348 * which have been added since Unicode 3.0.0. 1349 */ 1350 iter.setText("\u1820\u1806\u1821"); 1351 i = iter.first(); 1352 i = iter.next(); 1353 if (i != 2) { 1354 errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i); 1355 } 1356 1357 /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have 1358 * been added since Unicode 3.0.0. 1359 */ 1360 iter.setText("\u17E0\u17DB\u17E1"); 1361 i = iter.first(); 1362 i = iter.next(); 1363 if (i != 1) { 1364 errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i); 1365 } 1366 i = iter.next(); 1367 if (i != 3) { 1368 errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i); 1369 } 1370 1371 /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have 1372 * been added since Unicode 3.0.0. 1373 */ 1374 iter.setText("\u1692\u1680\u1696"); 1375 i = iter.first(); 1376 i = iter.next(); 1377 if (i != 2) { 1378 errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i); 1379 } 1380 1381 1382 // Confirm changes in BreakIteratorRules_th.java have been reflected. 1383 iter = BreakIterator.getLineInstance(new Locale("th", "")); 1384 1385 /* Thai <Seven(Nd)> 1386 * <Left Double Quotation Mark(Pi)> 1387 * <Five(Nd)> 1388 * <Right Double Quotation Mark(Pf)> 1389 * <Three(Nd)> 1390 */ 1391 iter.setText("\u0E57\u201C\u0E55\u201D\u0E53"); 1392 i = iter.first(); 1393 i = iter.next(); 1394 if (i != 1) { 1395 errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i); 1396 } 1397 i = iter.next(); 1398 if (i != 4) { 1399 errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i); 1400 } 1401 } 1402 1403 /** 1404 * Bug 4068137 1405 */ 1406 public void TestEndBehavior() 1407 { 1408 String testString = "boo."; 1409 BreakIterator wb = BreakIterator.getWordInstance(); 1410 wb.setText(testString); 1411 1412 if (wb.first() != 0) 1413 errln("Didn't get break at beginning of string."); 1414 if (wb.next() != 3) 1415 errln("Didn't get break before period in \"boo.\""); 1416 if (wb.current() != 4 && wb.next() != 4) 1417 errln("Didn't get break at end of string."); 1418 } 1419 1420 // [serialization test has been removed pursuant to bug #4152965] 1421 1422 /** 1423 * Bug 4450804 1424 */ 1425 public void TestLineBreakContractions() { 1426 Vector<String> expected = new Vector<String>(); 1427 1428 expected.add("These "); 1429 expected.add("are "); 1430 expected.add("'foobles'. "); 1431 expected.add("Don't "); 1432 expected.add("you "); 1433 expected.add("like "); 1434 expected.add("them?"); 1435 generalIteratorTest(lineBreak, expected); 1436 } 1437 1438 }