2480 prev = rightNode; 2481 } else { 2482 prev = intersection(prev, node); 2483 } 2484 } else { 2485 // treat as a literal & 2486 unread(); 2487 break; 2488 } 2489 continue; 2490 case 0: 2491 firstInClass = false; 2492 if (cursor >= patternLength) 2493 throw error("Unclosed character class"); 2494 break; 2495 case ']': 2496 firstInClass = false; 2497 if (prev != null) { 2498 if (consume) 2499 next(); 2500 return prev; 2501 } 2502 break; 2503 default: 2504 firstInClass = false; 2505 break; 2506 } 2507 node = range(bits); 2508 if (include) { 2509 if (prev == null) { 2510 prev = node; 2511 } else { 2512 if (prev != node) 2513 prev = union(prev, node); 2514 } 2515 } else { 2516 if (prev == null) { 2517 prev = node.complement(); 2518 } else { 2519 if (prev != node) 2520 prev = setDifference(prev, node); 2521 } 2522 } 2523 ch = peek(); 2524 } 2525 } 2526 2527 private CharProperty bitsOrSingle(BitClass bits, int ch) { 2528 /* Bits can only handle codepoints in [u+0000-u+00ff] range. 2529 Use "single" node instead of bits when dealing with unicode 2530 case folding for codepoints listed below. 2531 (1)Uppercase out of range: u+00ff, u+00b5 2532 toUpperCase(u+00ff) -> u+0178 2533 toUpperCase(u+00b5) -> u+039c 2534 (2)LatinSmallLetterLongS u+17f 2535 toUpperCase(u+017f) -> u+0053 2536 (3)LatinSmallLetterDotlessI u+131 2537 toUpperCase(u+0131) -> u+0049 2538 (4)LatinCapitalLetterIWithDotAbove u+0130 2539 toLowerCase(u+0130) -> u+0069 2540 (5)KelvinSign u+212a 2541 toLowerCase(u+212a) ==> u+006B 2542 (6)AngstromSign u+212b 5145 * Returns the set union of two CharProperty nodes. 5146 */ 5147 private static CharProperty union(final CharProperty lhs, 5148 final CharProperty rhs) { 5149 return new CharProperty() { 5150 boolean isSatisfiedBy(int ch) { 5151 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; 5152 } 5153 5154 /** 5155 * Returns the set intersection of two CharProperty nodes. 5156 */ 5157 private static CharProperty intersection(final CharProperty lhs, 5158 final CharProperty rhs) { 5159 return new CharProperty() { 5160 boolean isSatisfiedBy(int ch) { 5161 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; 5162 } 5163 5164 /** 5165 * Returns the set difference of two CharProperty nodes. 5166 */ 5167 private static CharProperty setDifference(final CharProperty lhs, 5168 final CharProperty rhs) { 5169 return new CharProperty() { 5170 boolean isSatisfiedBy(int ch) { 5171 return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}}; 5172 } 5173 5174 /** 5175 * Handles word boundaries. Includes a field to allow this one class to 5176 * deal with the different types of word boundaries we can match. The word 5177 * characters include underscores, letters, and digits. Non spacing marks 5178 * can are also part of a word if they have a base character, otherwise 5179 * they are ignored for purposes of finding word boundaries. 5180 */ 5181 static final class Bound extends Node { 5182 static int LEFT = 0x1; 5183 static int RIGHT= 0x2; 5184 static int BOTH = 0x3; 5185 static int NONE = 0x4; 5186 int type; 5187 boolean useUWORD; 5188 Bound(int n, boolean useUWORD) { 5189 type = n; 5190 this.useUWORD = useUWORD; 5191 } 5192 5193 boolean isWord(int ch) { 5194 return useUWORD ? UnicodeProp.WORD.is(ch) | 2480 prev = rightNode; 2481 } else { 2482 prev = intersection(prev, node); 2483 } 2484 } else { 2485 // treat as a literal & 2486 unread(); 2487 break; 2488 } 2489 continue; 2490 case 0: 2491 firstInClass = false; 2492 if (cursor >= patternLength) 2493 throw error("Unclosed character class"); 2494 break; 2495 case ']': 2496 firstInClass = false; 2497 if (prev != null) { 2498 if (consume) 2499 next(); 2500 if (include) 2501 return prev; 2502 else 2503 return prev.complement(); 2504 } 2505 break; 2506 default: 2507 firstInClass = false; 2508 break; 2509 } 2510 node = range(bits); 2511 if (prev == null) { 2512 prev = node; 2513 } else { 2514 if (prev != node) 2515 prev = union(prev, node); 2516 } 2517 ch = peek(); 2518 } 2519 } 2520 2521 private CharProperty bitsOrSingle(BitClass bits, int ch) { 2522 /* Bits can only handle codepoints in [u+0000-u+00ff] range. 2523 Use "single" node instead of bits when dealing with unicode 2524 case folding for codepoints listed below. 2525 (1)Uppercase out of range: u+00ff, u+00b5 2526 toUpperCase(u+00ff) -> u+0178 2527 toUpperCase(u+00b5) -> u+039c 2528 (2)LatinSmallLetterLongS u+17f 2529 toUpperCase(u+017f) -> u+0053 2530 (3)LatinSmallLetterDotlessI u+131 2531 toUpperCase(u+0131) -> u+0049 2532 (4)LatinCapitalLetterIWithDotAbove u+0130 2533 toLowerCase(u+0130) -> u+0069 2534 (5)KelvinSign u+212a 2535 toLowerCase(u+212a) ==> u+006B 2536 (6)AngstromSign u+212b 5139 * Returns the set union of two CharProperty nodes. 5140 */ 5141 private static CharProperty union(final CharProperty lhs, 5142 final CharProperty rhs) { 5143 return new CharProperty() { 5144 boolean isSatisfiedBy(int ch) { 5145 return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}}; 5146 } 5147 5148 /** 5149 * Returns the set intersection of two CharProperty nodes. 5150 */ 5151 private static CharProperty intersection(final CharProperty lhs, 5152 final CharProperty rhs) { 5153 return new CharProperty() { 5154 boolean isSatisfiedBy(int ch) { 5155 return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}}; 5156 } 5157 5158 /** 5159 * Handles word boundaries. Includes a field to allow this one class to 5160 * deal with the different types of word boundaries we can match. The word 5161 * characters include underscores, letters, and digits. Non spacing marks 5162 * can are also part of a word if they have a base character, otherwise 5163 * they are ignored for purposes of finding word boundaries. 5164 */ 5165 static final class Bound extends Node { 5166 static int LEFT = 0x1; 5167 static int RIGHT= 0x2; 5168 static int BOTH = 0x3; 5169 static int NONE = 0x4; 5170 int type; 5171 boolean useUWORD; 5172 Bound(int n, boolean useUWORD) { 5173 type = n; 5174 this.useUWORD = useUWORD; 5175 } 5176 5177 boolean isWord(int ch) { 5178 return useUWORD ? UnicodeProp.WORD.is(ch) |