1 /* 2 * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 28 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved 29 * 30 * The original version of this source code and documentation 31 * is copyrighted and owned by Taligent, Inc., a wholly-owned 32 * subsidiary of IBM. These materials are provided under terms 33 * of a License Agreement between Taligent and Sun. This technology 34 * is protected by multiple US and International patents. 35 * 36 * This notice and attribution to Taligent may not be removed. 37 * Taligent is a registered trademark of Taligent, Inc. 38 */ 39 40 package org.openjdk.buildtools.generatebreakiteratordata; 41 42 import java.util.Arrays; 43 import java.util.Hashtable; 44 45 /** 46 * An object representing a set of characters. (This is a "set" in the 47 * mathematical sense: an unduplicated list of characters on which set 48 * operations such as union and intersection can be performed.) The 49 * set information is stored in compressed, optimized form: The object 50 * contains an integer array with an even number of characters. Each 51 * pair of characters represents a range of characters contained in the set 52 * (a pair of the same character represents a single character). The 53 * characters are sorted in increasing order. 54 */ 55 class CharSet { 56 /** 57 * The structure containing the set information. The characters 58 * in this array are organized into pairs, each pair representing 59 * a range of characters contained in the set 60 */ 61 private int[] chars; 62 63 //========================================================================== 64 // parseString() and associated routines 65 //========================================================================== 66 /** 67 * A cache which is used to speed up parseString() whenever it is 68 * used to parse a description that has been parsed before 69 */ 70 private static Hashtable<String, CharSet> expressionCache = null; 71 72 /** 73 * Builds a CharSet based on a textual description. For the syntax of 74 * the description, see the documentation of RuleBasedBreakIterator. 75 * @see java.text.RuleBasedBreakIterator 76 */ 77 public static CharSet parseString(String s) { 78 CharSet result = null; 79 80 // if "s" is in the expression cache, pull the result out 81 // of the expresison cache 82 if (expressionCache != null) { 83 result = expressionCache.get(s); 84 } 85 86 // otherwise, use doParseString() to actually parse the string, 87 // and then add a corresponding entry to the expression cache 88 if (result == null) { 89 result = doParseString(s); 90 if (expressionCache == null) { 91 expressionCache = new Hashtable<>(); 92 } 93 expressionCache.put(s, result); 94 } 95 result = (CharSet)(result.clone()); 96 return result; 97 } 98 99 /** 100 * This function is used by parseString() to actually parse the string 101 */ 102 private static CharSet doParseString(String s) { 103 CharSet result = new CharSet(); 104 int p = 0; 105 106 boolean haveDash = false; 107 boolean haveTilde = false; 108 boolean wIsReal = false; 109 int w = 0x0000; 110 111 // for each character in the description... 112 while (p < s.length()) { 113 int c = s.codePointAt(p); 114 115 // if it's an opening bracket... 116 if (c == '[') { 117 // flush the single-character cache 118 if (wIsReal) { 119 result.internalUnion(new CharSet(w)); 120 } 121 122 // locate the matching closing bracket 123 int bracketLevel = 1; 124 int q = p + 1; 125 while (bracketLevel != 0) { 126 // if no matching bracket by end of string then... 127 if (q >= s.length()) { 128 throw new IllegalArgumentException("Parse error at position " + p + " in " + s); 129 } 130 int ch = s.codePointAt(q); 131 switch (ch) { 132 case '\\': // need to step over next character 133 ch = s.codePointAt(++q); 134 break; 135 case '[': 136 ++bracketLevel; 137 break; 138 case ']': 139 --bracketLevel; 140 break; 141 } 142 q += Character.charCount(ch); 143 } 144 --q; 145 146 // call parseString() recursively to parse the text inside 147 // the brackets, then either add or subtract the result from 148 // our running result depending on whether or not the [] 149 // expresison was preceded by a ^ 150 if (!haveTilde) { 151 result.internalUnion(CharSet.parseString(s.substring(p + 1, q))); 152 } 153 else { 154 result.internalDifference(CharSet.parseString(s.substring(p + 1, q))); 155 } 156 haveTilde = false; 157 haveDash = false; 158 wIsReal = false; 159 p = q + 1; 160 } 161 162 // if the character is a colon... 163 else if (c == ':') { 164 // flush the single-character cache 165 if (wIsReal) { 166 result.internalUnion(new CharSet(w)); 167 } 168 169 // locate the matching colon (and throw an error if there 170 // isn't one) 171 int q = s.indexOf(':', p + 1); 172 if (q == -1) { 173 throw new IllegalArgumentException("Parse error at position " + p + " in " + s); 174 } 175 176 // use charSetForCategory() to parse the text in the colons, 177 // and either add or substract the result from our running 178 // result depending on whether the :: expression was 179 // preceded by a ^ 180 if (!haveTilde) { 181 result.internalUnion(charSetForCategory(s.substring(p + 1, q))); 182 } 183 else { 184 result.internalDifference(charSetForCategory(s.substring(p + 1, q))); 185 } 186 187 // reset everything and advance to the next character 188 haveTilde = false; 189 haveDash = false; 190 wIsReal = false; 191 p = q + 1; 192 } 193 194 // if the character is a dash, set an appropriate flag 195 else if (c == '-') { 196 if (wIsReal) { 197 haveDash = true; 198 } 199 ++p; 200 } 201 202 // if the character is a caret, flush the single-character 203 // cache and set an appropriate flag. If the set is empty 204 // (i.e., if the expression begins with ^), invert the set 205 // (i.e., set it to include everything). The idea here is 206 // that a set that includes nothing but ^ expressions 207 // means "everything but these things". 208 else if (c == '^') { 209 if (wIsReal) { 210 result.internalUnion(new CharSet(w)); 211 wIsReal = false; 212 } 213 haveTilde = true; 214 ++p; 215 if (result.empty()) { 216 result.internalComplement(); 217 } 218 } 219 220 // throw an exception on an illegal character 221 else if (c >= ' ' && c < '\u007f' && !Character.isLetter((char)c) 222 && !Character.isDigit((char)c) && c != '\\') { 223 throw new IllegalArgumentException("Parse error at position " + p + " in " + s); 224 } 225 226 // otherwise, we end up here... 227 else { 228 // on a backslash, advance to the next character 229 if (c == '\\') { 230 ++p; 231 } 232 233 // if the preceding character was a dash, this character 234 // defines the end of a range. Add or subtract that range 235 // from the running result depending on whether or not it 236 // was preceded by a ^ 237 if (haveDash) { 238 if (s.codePointAt(p) < w) { 239 throw new IllegalArgumentException("U+" + 240 Integer.toHexString(s.codePointAt(p)) 241 + " is less than U+" + Integer.toHexString(w) + ". Dash expressions " 242 + "can't have their endpoints in reverse order."); 243 } 244 245 int ch = s.codePointAt(p); 246 if (!haveTilde) { 247 result.internalUnion(new CharSet(w, ch)); 248 } 249 else { 250 result.internalDifference(new CharSet(w, ch)); 251 } 252 p += Character.charCount(ch); 253 haveDash = false; 254 haveTilde = false; 255 wIsReal = false; 256 } 257 258 // if the preceding character was a caret, remove this character 259 // from the running result 260 else if (haveTilde) { 261 w = s.codePointAt(p); 262 result.internalDifference(new CharSet(w)); 263 p += Character.charCount(w); 264 haveTilde = false; 265 wIsReal = false; 266 } 267 268 // otherwise, flush the single-character cache and then 269 // put this character into the cache 270 else if (wIsReal) { 271 result.internalUnion(new CharSet(w)); 272 w = s.codePointAt(p); 273 p += Character.charCount(w); 274 wIsReal = true; 275 } else { 276 w = s.codePointAt(p); 277 p += Character.charCount(w); 278 wIsReal = true; 279 } 280 } 281 } 282 283 // finally, flush the single-character cache one last time 284 if (wIsReal) { 285 result.internalUnion(new CharSet(w)); 286 } 287 288 return result; 289 } 290 291 /** 292 * Creates a CharSet containing all the characters in a particular 293 * Unicode category. The text is either a two-character code from 294 * the Unicode database or a single character that begins one or more 295 * two-character codes. 296 */ 297 private static CharSet charSetForCategory(String category) { 298 // throw an exception if we have anything other than one or two 299 // characters inside the colons 300 if (category.length() == 0 || category.length() >= 3) { 301 throw new IllegalArgumentException("Invalid character category: " + category); 302 } 303 304 // if we have two characters, search the category map for that code 305 // and either construct and return a CharSet from the data in the 306 // category map or throw an exception 307 if (category.length() == 2) { 308 for (int i = 0; i < CharacterCategory.categoryNames.length; i++) { 309 if (CharacterCategory.categoryNames[i].equals(category)) { 310 return new CharSet(CharacterCategory.getCategoryMap(i)); 311 } 312 } 313 throw new IllegalArgumentException("Invalid character category: " + category); 314 } 315 316 // if we have one character, search the category map for codes beginning 317 // with that letter, and union together all of the matching sets that 318 // we find (or throw an exception if there are no matches) 319 else if (category.length() == 1) { 320 CharSet result = new CharSet(); 321 for (int i = 0; i < CharacterCategory.categoryNames.length; i++) { 322 if (CharacterCategory.categoryNames[i].startsWith(category)) { 323 result = result.union(new CharSet(CharacterCategory.getCategoryMap(i))); 324 } 325 } 326 if (result.empty()) { 327 throw new IllegalArgumentException("Invalid character category: " + category); 328 } 329 else { 330 return result; 331 } 332 } 333 return new CharSet(); // should never get here, but to make the compiler happy... 334 } 335 336 /** 337 * Returns a copy of CharSet's expression cache and sets CharSet's 338 * expression cache to empty. 339 */ 340 public static Hashtable<String, CharSet> releaseExpressionCache() { 341 Hashtable<String, CharSet> result = expressionCache; 342 expressionCache = null; 343 return result; 344 } 345 346 //========================================================================== 347 // CharSet manipulation 348 //========================================================================== 349 /** 350 * Creates an empty CharSet. 351 */ 352 public CharSet() { 353 chars = new int[0]; 354 } 355 356 /** 357 * Creates a CharSet containing a single character. 358 * @param c The character to put into the CharSet 359 */ 360 public CharSet(int c) { 361 chars = new int[2]; 362 chars[0] = c; 363 chars[1] = c; 364 } 365 366 /** 367 * Creates a CharSet containing a range of characters. 368 * @param lo The lowest-numbered character to include in the range 369 * @param hi The highest-numbered character to include in the range 370 */ 371 public CharSet(int lo, int hi) { 372 chars = new int[2]; 373 if (lo <= hi) { 374 chars[0] = lo; 375 chars[1] = hi; 376 } 377 else { 378 chars[0] = hi; 379 chars[1] = lo; 380 } 381 } 382 383 /** 384 * Creates a CharSet, initializing it from the internal storage 385 * of another CharSet (this function performs no error checking 386 * on "chars", so if it's malformed, undefined behavior will result) 387 */ 388 private CharSet(int[] chars) { 389 this.chars = chars; 390 } 391 392 /** 393 * Returns a CharSet representing the union of two CharSets. 394 */ 395 public CharSet union(CharSet that) { 396 return new CharSet(doUnion(that.chars)); 397 } 398 399 /** 400 * Adds the characters in "that" to this CharSet 401 */ 402 private void internalUnion(CharSet that) { 403 chars = doUnion(that.chars); 404 } 405 406 /** 407 * The actual implementation of the union functions 408 */ 409 private int[] doUnion(int[] c2) { 410 int[] result = new int[chars.length+c2.length]; 411 412 int i = 0; 413 int j = 0; 414 int index = 0; 415 416 // consider all the characters in both strings 417 while (i < chars.length && j < c2.length) { 418 int ub; 419 420 // the first character in the result is the lower of the 421 // starting characters of the two strings, and "ub" gets 422 // set to the upper bound of that range 423 if (chars[i] < c2[j]) { 424 result[index++] = chars[i]; 425 ub = chars[++i]; 426 } 427 else { 428 result[index++] = c2[j]; 429 ub = c2[++j]; 430 } 431 432 // for as long as one of our two pointers is pointing to a range's 433 // end point, or i is pointing to a character that is less than 434 // "ub" plus one (the "plus one" stitches touching ranges together)... 435 while (i % 2 == 1 || 436 j % 2 == 1 || 437 (i < chars.length && chars[i] <= ub + 1)) { 438 439 // advance i to the first character that is greater than 440 // "ub" plus one 441 while (i < chars.length && chars[i] <= ub + 1) { 442 ++i; 443 } 444 445 // if i points to the endpoint of a range, update "ub" 446 // to that character, or if i points to the start of 447 // a range and the endpoint of the preceding range is 448 // greater than "ub", update "up" to _that_ character 449 if (i % 2 == 1) { 450 ub = chars[i]; 451 } 452 else if (i > 0 && chars[i - 1] > ub) { 453 ub = chars[i - 1]; 454 } 455 456 // now advance j to the first character that is greater 457 // that "ub" plus one 458 while (j < c2.length && c2[j] <= ub + 1) { 459 ++j; 460 } 461 462 // if j points to the endpoint of a range, update "ub" 463 // to that character, or if j points to the start of 464 // a range and the endpoint of the preceding range is 465 // greater than "ub", update "up" to _that_ character 466 if (j % 2 == 1) { 467 ub = c2[j]; 468 } 469 else if (j > 0 && c2[j - 1] > ub) { 470 ub = c2[j - 1]; 471 } 472 } 473 // when we finally fall out of this loop, we will have stitched 474 // together a series of ranges that overlap or touch, i and j 475 // will both point to starting points of ranges, and "ub" will 476 // be the endpoint of the range we're working on. Write "ub" 477 // to the result 478 result[index++] = ub; 479 480 // loop back around to create the next range in the result 481 } 482 483 // we fall out to here when we've exhausted all the characters in 484 // one of the operands. We can append all of the remaining characters 485 // in the other operand without doing any extra work. 486 if (i < chars.length) { 487 for (int k = i; k < chars.length; k++) { 488 result[index++] = chars[k]; 489 } 490 } 491 if (j < c2.length) { 492 for (int k = j; k < c2.length; k++) { 493 result[index++] = c2[k]; 494 } 495 } 496 497 if (result.length > index) { 498 int[] tmpbuf = new int[index]; 499 System.arraycopy(result, 0, tmpbuf, 0, index); 500 return tmpbuf; 501 } 502 503 return result; 504 } 505 506 /** 507 * Returns the intersection of two CharSets. 508 */ 509 public CharSet intersection(CharSet that) { 510 return new CharSet(doIntersection(that.chars)); 511 } 512 513 /** 514 * Removes from this CharSet any characters that aren't also in "that" 515 */ 516 private void internalIntersection(CharSet that) { 517 chars = doIntersection(that.chars); 518 } 519 520 /** 521 * The internal implementation of the two intersection functions 522 */ 523 private int[] doIntersection(int[] c2) { 524 int[] result = new int[chars.length+c2.length]; 525 526 int i = 0; 527 int j = 0; 528 int oldI; 529 int oldJ; 530 int index = 0; 531 532 // iterate until we've exhausted one of the operands 533 while (i < chars.length && j < c2.length) { 534 535 // advance j until it points to a character that is larger than 536 // the one i points to. If this is the beginning of a one- 537 // character range, advance j to point to the end 538 if (i < chars.length && i % 2 == 0) { 539 while (j < c2.length && c2[j] < chars[i]) { 540 ++j; 541 } 542 if (j < c2.length && j % 2 == 0 && c2[j] == chars[i]) { 543 ++j; 544 } 545 } 546 547 // if j points to the endpoint of a range, save the current 548 // value of i, then advance i until it reaches a character 549 // which is larger than the character pointed at 550 // by j. All of the characters we've advanced over (except 551 // the one currently pointed to by i) are added to the result 552 oldI = i; 553 while (j % 2 == 1 && i < chars.length && chars[i] <= c2[j]) { 554 ++i; 555 } 556 for (int k = oldI; k < i; k++) { 557 result[index++] = chars[k]; 558 } 559 560 // if i points to the endpoint of a range, save the current 561 // value of j, then advance j until it reaches a character 562 // which is larger than the character pointed at 563 // by i. All of the characters we've advanced over (except 564 // the one currently pointed to by i) are added to the result 565 oldJ = j; 566 while (i % 2 == 1 && j < c2.length && c2[j] <= chars[i]) { 567 ++j; 568 } 569 for (int k = oldJ; k < j; k++) { 570 result[index++] = c2[k]; 571 } 572 573 // advance i until it points to a character larger than j 574 // If it points at the beginning of a one-character range, 575 // advance it to the end of that range 576 if (j < c2.length && j % 2 == 0) { 577 while (i < chars.length && chars[i] < c2[j]) { 578 ++i; 579 } 580 if (i < chars.length && i % 2 == 0 && c2[j] == chars[i]) { 581 ++i; 582 } 583 } 584 } 585 586 if (result.length > index) { 587 int[] tmpbuf = new int[index]; 588 System.arraycopy(result, 0, tmpbuf, 0, index); 589 return tmpbuf; 590 } 591 592 return result; 593 } 594 595 /** 596 * Returns a CharSet containing all the characters in "this" that 597 * aren't also in "that" 598 */ 599 public CharSet difference(CharSet that) { 600 return new CharSet(doIntersection(that.doComplement())); 601 } 602 603 /** 604 * Removes from "this" all the characters that are also in "that" 605 */ 606 private void internalDifference(CharSet that) { 607 chars = doIntersection(that.doComplement()); 608 } 609 610 /** 611 * Returns a CharSet containing all the characters which are not 612 * in "this" 613 */ 614 public CharSet complement() { 615 return new CharSet(doComplement()); 616 } 617 618 /** 619 * Complements "this". All the characters it contains are removed, 620 * and all the characters it doesn't contain are added. 621 */ 622 private void internalComplement() { 623 chars = doComplement(); 624 } 625 626 /** 627 * The internal implementation function for the complement routines 628 */ 629 private int[] doComplement() { 630 // the complement of an empty CharSet is one containing everything 631 if (empty()) { 632 int[] result = new int[2]; 633 result[0] = 0x0000; 634 result[1] = 0x10FFFF; 635 return result; 636 } 637 638 int[] result = new int[chars.length+2]; 639 640 int i = 0; 641 int index = 0; 642 643 // the result begins with \u0000 unless the original CharSet does 644 if (chars[0] != 0x0000) { 645 result[index++] = 0x0000; 646 } 647 648 // walk through the characters in this CharSet. Append a pair of 649 // characters the first of which is one less than the first 650 // character we see and the second of which is one plus the second 651 // character we see (don't write the first character if it's \u0000, 652 // and don't write the second character if it's \uffff. 653 while (i < chars.length) { 654 if (chars[i] != 0x0000) { 655 result[index++] = chars[i] - 1; 656 } 657 if (chars[i + 1] != 0x10FFFF) { 658 result[index++] = chars[i + 1] + 1; 659 } 660 i += 2; 661 } 662 663 // add 0x10ffff to the end of the result, unless it was in 664 // the original set 665 if (chars[i-1] != 0x10FFFF) { 666 result[index++] = 0x10FFFF; 667 } 668 669 if (result.length > index) { 670 int[] tmpbuf = new int[index]; 671 System.arraycopy(result, 0, tmpbuf, 0, index); 672 return tmpbuf; 673 } 674 675 return result; 676 } 677 678 /** 679 * Returns true if this CharSet contains the specified character 680 * @param c The character we're testing for set membership 681 */ 682 public boolean contains(int c) { 683 // search for the first range endpoint that is greater than or 684 // equal to c 685 int i = 1; 686 while (i < chars.length && chars[i] < c) { 687 i += 2; 688 } 689 690 // if we've walked off the end, we don't contain c 691 if (i == chars.length) { 692 return false; 693 } 694 695 // otherwise, we contain c if the beginning of the range is less 696 // than or equal to c 697 return chars[i - 1] <= c; 698 } 699 700 /** 701 * Returns true if "that" is another instance of CharSet containing 702 * the exact same characters as this one 703 */ 704 public boolean equals(Object that) { 705 return (that instanceof CharSet) && Arrays.equals(chars, ((CharSet)that).chars); 706 } 707 708 /** 709 * Returns the hash code for this set of characters 710 */ 711 public int hashCode() { 712 return Arrays.hashCode(chars); 713 } 714 715 /** 716 * Creates a new CharSet that is equal to this one 717 */ 718 public Object clone() { 719 return new CharSet(chars); 720 } 721 722 /** 723 * Returns true if this CharSet contains no characters 724 */ 725 public boolean empty() { 726 return chars.length == 0; 727 } 728 729 /** 730 * Returns a textual representation of this CharSet. If the result 731 * of calling this function is passed to CharSet.parseString(), it 732 * will produce another CharSet that is equal to this one. 733 */ 734 public String toString() { 735 StringBuffer result = new StringBuffer(); 736 737 // the result begins with an opening bracket 738 result.append('['); 739 740 // iterate through the ranges in the CharSet 741 for (int i = 0; i < chars.length; i += 2) { 742 // for a range with the same beginning and ending point, 743 // output that character 744 if (chars[i] == chars[i + 1]) { 745 result.append("0x"); 746 result.append(Integer.toHexString(chars[i])); 747 } 748 749 // otherwise, output the start and end points of the range 750 // separated by a dash 751 else { 752 result.append("0x"); 753 result.append(Integer.toHexString(chars[i])); 754 result.append("-0x"); 755 result.append(Integer.toHexString(chars[i + 1])); 756 } 757 } 758 759 // the result ends with a closing bracket 760 result.append(']'); 761 return result.toString(); 762 } 763 764 /** 765 * Returns an integer array representing the contents of this CharSet 766 * in the same form in which they're stored internally: as pairs 767 * of characters representing the start and end points of ranges 768 */ 769 public int[] getRanges() { 770 return chars; 771 } 772 773 /** 774 * Returns an Enumeration that will return the ranges of characters 775 * contained in this CharSet one at a time 776 */ 777 public Enumeration getChars() { 778 return new Enumeration(this); 779 } 780 781 //========================================================================== 782 // CharSet.Enumeration 783 //========================================================================== 784 785 /** 786 * An Enumeration that can be used to extract the character ranges 787 * from a CharSet one at a time 788 */ 789 public class Enumeration implements java.util.Enumeration<int[]> { 790 /** 791 * Initializes a CharSet.Enumeration 792 */ 793 Enumeration(CharSet cs) { 794 this.chars = cs.chars; 795 p = 0; 796 } 797 798 /** 799 * Returns true if the enumeration hasn't yet returned 800 * all the ranges in the CharSet 801 */ 802 public boolean hasMoreElements() { 803 return p < chars.length; 804 } 805 806 /** 807 * Returns the next range in the CarSet 808 */ 809 public int[] nextElement() { 810 int[] result = new int[2]; 811 result[0] = chars[p++]; 812 result[1] = chars[p++]; 813 return result; 814 } 815 816 int p; 817 int[] chars; 818 } 819 }