1 /* 2 * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.util.regex; 27 28 import java.util.Objects; 29 30 /** 31 * An engine that performs match operations on a {@linkplain java.lang.CharSequence 32 * character sequence} by interpreting a {@link Pattern}. 33 * 34 * <p> A matcher is created from a pattern by invoking the pattern's {@link 35 * Pattern#matcher matcher} method. Once created, a matcher can be used to 36 * perform three different kinds of match operations: 37 * 38 * <ul> 39 * 40 * <li><p> The {@link #matches matches} method attempts to match the entire 41 * input sequence against the pattern. </p></li> 42 * 43 * <li><p> The {@link #lookingAt lookingAt} method attempts to match the 44 * input sequence, starting at the beginning, against the pattern. 
</p></li> 45 * 46 * <li><p> The {@link #find find} method scans the input sequence looking for 47 * the next subsequence that matches the pattern. </p></li> 48 * 49 * </ul> 50 * 51 * <p> Each of these methods returns a boolean indicating success or failure. 52 * More information about a successful match can be obtained by querying the 53 * state of the matcher. 54 * 55 * <p> A matcher finds matches in a subset of its input called the 56 * <i>region</i>. By default, the region contains all of the matcher's input. 57 * The region can be modified via the {@link #region region} method and queried 58 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd} 59 * methods. The way that the region boundaries interact with some pattern 60 * constructs can be changed. See {@link #useAnchoringBounds 61 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds} 62 * for more details. 63 * 64 * <p> This class also defines methods for replacing matched subsequences with 65 * new strings whose contents can, if desired, be computed from the match 66 * result. The {@link #appendReplacement appendReplacement} and {@link 67 * #appendTail appendTail} methods can be used in tandem in order to collect 68 * the result into an existing string buffer, or the more convenient {@link 69 * #replaceAll replaceAll} method can be used to create a string in which every 70 * matching subsequence in the input sequence is replaced. 71 * 72 * <p> The explicit state of a matcher includes the start and end indices of 73 * the most recent successful match. It also includes the start and end 74 * indices of the input subsequence captured by each <a 75 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total 76 * count of such subsequences. As a convenience, methods are also provided for 77 * returning these captured subsequences in string form. 
78 * 79 * <p> The explicit state of a matcher is initially undefined; attempting to 80 * query any part of it before a successful match will cause an {@link 81 * IllegalStateException} to be thrown. The explicit state of a matcher is 82 * recomputed by every match operation. 83 * 84 * <p> The implicit state of a matcher includes the input character sequence as 85 * well as the <i>append position</i>, which is initially zero and is updated 86 * by the {@link #appendReplacement appendReplacement} method. 87 * 88 * <p> A matcher may be reset explicitly by invoking its {@link #reset()} 89 * method or, if a new input sequence is desired, its {@link 90 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a 91 * matcher discards its explicit state information and sets the append position 92 * to zero. 93 * 94 * <p> Instances of this class are not safe for use by multiple concurrent 95 * threads. </p> 96 * 97 * 98 * @author Mike McCloskey 99 * @author Mark Reinhold 100 * @author JSR-51 Expert Group 101 * @since 1.4 102 * @spec JSR-51 103 */ 104 105 public final class Matcher implements MatchResult { 106 107 /** 108 * The Pattern object that created this Matcher. 109 */ 110 Pattern parentPattern; 111 112 /** 113 * The storage used by groups. They may contain invalid values if 114 * a group was skipped during the matching. 115 */ 116 int[] groups; 117 118 /** 119 * The range within the sequence that is to be matched. Anchors 120 * will match at these "hard" boundaries. Changing the region 121 * changes these values. 122 */ 123 int from, to; 124 125 /** 126 * Lookbehind uses this value to ensure that the subexpression 127 * match ends at the point where the lookbehind was encountered. 128 */ 129 int lookbehindTo; 130 131 /** 132 * The original string being matched. 133 */ 134 CharSequence text; 135 136 /** 137 * Matcher state used by the last node. NOANCHOR is used when a 138 * match does not have to consume all of the input. 
ENDANCHOR is 139 * the mode used for matching all the input. 140 */ 141 static final int ENDANCHOR = 1; 142 static final int NOANCHOR = 0; 143 int acceptMode = NOANCHOR; 144 145 /** 146 * The range of string that last matched the pattern. If the last 147 * match failed then first is -1; last initially holds 0 then it 148 * holds the index of the end of the last match (which is where the 149 * next search starts). 150 */ 151 int first = -1, last = 0; 152 153 /** 154 * The end index of what matched in the last match operation. 155 */ 156 int oldLast = -1; 157 158 /** 159 * The index of the last position appended in a substitution. 160 */ 161 int lastAppendPosition = 0; 162 163 /** 164 * Storage used by nodes to tell what repetition they are on in 165 * a pattern, and where groups begin. The nodes themselves are stateless, 166 * so they rely on this field to hold state during a match. 167 */ 168 int[] locals; 169 170 /** 171 * Boolean indicating whether or not more input could change 172 * the results of the last match. 173 * 174 * If hitEnd is true, and a match was found, then more input 175 * might cause a different match to be found. 176 * If hitEnd is true and a match was not found, then more 177 * input could cause a match to be found. 178 * If hitEnd is false and a match was found, then more input 179 * will not change the match. 180 * If hitEnd is false and a match was not found, then more 181 * input will not cause a match to be found. 182 */ 183 boolean hitEnd; 184 185 /** 186 * Boolean indicating whether or not more input could change 187 * a positive match into a negative one. 188 * 189 * If requireEnd is true, and a match was found, then more 190 * input could cause the match to be lost. 191 * If requireEnd is false and a match was found, then more 192 * input might change the match but the match won't be lost. 193 * If a match was not found, then requireEnd has no meaning. 
194 */ 195 boolean requireEnd; 196 197 /** 198 * If transparentBounds is true then the boundaries of this 199 * matcher's region are transparent to lookahead, lookbehind, 200 * and boundary matching constructs that try to see beyond them. 201 */ 202 boolean transparentBounds = false; 203 204 /** 205 * If anchoringBounds is true then the boundaries of this 206 * matcher's region match anchors such as ^ and $. 207 */ 208 boolean anchoringBounds = true; 209 210 /** 211 * No default constructor. 212 */ 213 Matcher() { 214 } 215 216 /** 217 * All matchers have the state used by Pattern during a match. 218 */ 219 Matcher(Pattern parent, CharSequence text) { 220 this.parentPattern = parent; 221 this.text = text; 222 223 // Allocate state storage 224 int parentGroupCount = Math.max(parent.capturingGroupCount, 10); 225 groups = new int[parentGroupCount * 2]; 226 locals = new int[parent.localCount]; 227 228 // Put fields into initial states 229 reset(); 230 } 231 232 /** 233 * Returns the pattern that is interpreted by this matcher. 234 * 235 * @return The pattern for which this matcher was created 236 */ 237 public Pattern pattern() { 238 return parentPattern; 239 } 240 241 /** 242 * Returns the match state of this matcher as a {@link MatchResult}. 243 * The result is unaffected by subsequent operations performed upon this 244 * matcher. 245 * 246 * @return a <code>MatchResult</code> with the state of this matcher 247 * @since 1.5 248 */ 249 public MatchResult toMatchResult() { 250 Matcher result = new Matcher(this.parentPattern, text.toString()); 251 result.first = this.first; 252 result.last = this.last; 253 result.groups = this.groups.clone(); 254 return result; 255 } 256 257 /** 258 * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to 259 * find matches with. 260 * 261 * <p> This method causes this matcher to lose information 262 * about the groups of the last match that occurred. 
The 263 * matcher's position in the input is maintained and its 264 * last append position is unaffected.</p> 265 * 266 * @param newPattern 267 * The new pattern used by this matcher 268 * @return This matcher 269 * @throws IllegalArgumentException 270 * If newPattern is <tt>null</tt> 271 * @since 1.5 272 */ 273 public Matcher usePattern(Pattern newPattern) { 274 if (newPattern == null) 275 throw new IllegalArgumentException("Pattern cannot be null"); 276 parentPattern = newPattern; 277 278 // Reallocate state storage 279 int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10); 280 groups = new int[parentGroupCount * 2]; 281 locals = new int[newPattern.localCount]; 282 for (int i = 0; i < groups.length; i++) 283 groups[i] = -1; 284 for (int i = 0; i < locals.length; i++) 285 locals[i] = -1; 286 return this; 287 } 288 289 /** 290 * Resets this matcher. 291 * 292 * <p> Resetting a matcher discards all of its explicit state information 293 * and sets its append position to zero. The matcher's region is set to the 294 * default region, which is its entire character sequence. The anchoring 295 * and transparency of this matcher's region boundaries are unaffected. 296 * 297 * @return This matcher 298 */ 299 public Matcher reset() { 300 first = -1; 301 last = 0; 302 oldLast = -1; 303 for(int i=0; i<groups.length; i++) 304 groups[i] = -1; 305 for(int i=0; i<locals.length; i++) 306 locals[i] = -1; 307 lastAppendPosition = 0; 308 from = 0; 309 to = getTextLength(); 310 return this; 311 } 312 313 /** 314 * Resets this matcher with a new input sequence. 315 * 316 * <p> Resetting a matcher discards all of its explicit state information 317 * and sets its append position to zero. The matcher's region is set to 318 * the default region, which is its entire character sequence. The 319 * anchoring and transparency of this matcher's region boundaries are 320 * unaffected. 
321 * 322 * @param input 323 * The new input character sequence 324 * 325 * @return This matcher 326 */ 327 public Matcher reset(CharSequence input) { 328 text = input; 329 return reset(); 330 } 331 332 /** 333 * Returns the start index of the previous match. 334 * 335 * @return The index of the first character matched 336 * 337 * @throws IllegalStateException 338 * If no match has yet been attempted, 339 * or if the previous match operation failed 340 */ 341 public int start() { 342 if (first < 0) 343 throw new IllegalStateException("No match available"); 344 return first; 345 } 346 347 /** 348 * Returns the start index of the subsequence captured by the given group 349 * during the previous match operation. 350 * 351 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 352 * to right, starting at one. Group zero denotes the entire pattern, so 353 * the expression <i>m.</i><tt>start(0)</tt> is equivalent to 354 * <i>m.</i><tt>start()</tt>. </p> 355 * 356 * @param group 357 * The index of a capturing group in this matcher's pattern 358 * 359 * @return The index of the first character captured by the group, 360 * or <tt>-1</tt> if the match was successful but the group 361 * itself did not match anything 362 * 363 * @throws IllegalStateException 364 * If no match has yet been attempted, 365 * or if the previous match operation failed 366 * 367 * @throws IndexOutOfBoundsException 368 * If there is no capturing group in the pattern 369 * with the given index 370 */ 371 public int start(int group) { 372 if (first < 0) 373 throw new IllegalStateException("No match available"); 374 if (group < 0 || group > groupCount()) 375 throw new IndexOutOfBoundsException("No group " + group); 376 return groups[group * 2]; 377 } 378 379 /** 380 * Returns the start index of the subsequence captured by the given 381 * <a href="Pattern.html#groupname">named-capturing group</a> during the 382 * previous match operation. 
383 * 384 * @param name 385 * The name of a named-capturing group in this matcher's pattern 386 * 387 * @return The index of the first character captured by the group, 388 * or {@code -1} if the match was successful but the group 389 * itself did not match anything 390 * 391 * @throws IllegalStateException 392 * If no match has yet been attempted, 393 * or if the previous match operation failed 394 * 395 * @throws IllegalArgumentException 396 * If there is no capturing group in the pattern 397 * with the given name 398 * @since 1.8 399 */ 400 public int start(String name) { 401 return groups[getMatchedGroupIndex(name) * 2]; 402 } 403 404 /** 405 * Returns the offset after the last character matched. 406 * 407 * @return The offset after the last character matched 408 * 409 * @throws IllegalStateException 410 * If no match has yet been attempted, 411 * or if the previous match operation failed 412 */ 413 public int end() { 414 if (first < 0) 415 throw new IllegalStateException("No match available"); 416 return last; 417 } 418 419 /** 420 * Returns the offset after the last character of the subsequence 421 * captured by the given group during the previous match operation. 422 * 423 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 424 * to right, starting at one. Group zero denotes the entire pattern, so 425 * the expression <i>m.</i><tt>end(0)</tt> is equivalent to 426 * <i>m.</i><tt>end()</tt>. 
</p> 427 * 428 * @param group 429 * The index of a capturing group in this matcher's pattern 430 * 431 * @return The offset after the last character captured by the group, 432 * or <tt>-1</tt> if the match was successful 433 * but the group itself did not match anything 434 * 435 * @throws IllegalStateException 436 * If no match has yet been attempted, 437 * or if the previous match operation failed 438 * 439 * @throws IndexOutOfBoundsException 440 * If there is no capturing group in the pattern 441 * with the given index 442 */ 443 public int end(int group) { 444 if (first < 0) 445 throw new IllegalStateException("No match available"); 446 if (group < 0 || group > groupCount()) 447 throw new IndexOutOfBoundsException("No group " + group); 448 return groups[group * 2 + 1]; 449 } 450 451 /** 452 * Returns the offset after the last character of the subsequence 453 * captured by the given <a href="Pattern.html#groupname">named-capturing 454 * group</a> during the previous match operation. 455 * 456 * @param name 457 * The name of a named-capturing group in this matcher's pattern 458 * 459 * @return The offset after the last character captured by the group, 460 * or {@code -1} if the match was successful 461 * but the group itself did not match anything 462 * 463 * @throws IllegalStateException 464 * If no match has yet been attempted, 465 * or if the previous match operation failed 466 * 467 * @throws IllegalArgumentException 468 * If there is no capturing group in the pattern 469 * with the given name 470 * @since 1.8 471 */ 472 public int end(String name) { 473 return groups[getMatchedGroupIndex(name) * 2 + 1]; 474 } 475 476 /** 477 * Returns the input subsequence matched by the previous match. 478 * 479 * <p> For a matcher <i>m</i> with input sequence <i>s</i>, 480 * the expressions <i>m.</i><tt>group()</tt> and 481 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt> 482 * are equivalent. 
</p> 483 * 484 * <p> Note that some patterns, for example <tt>a*</tt>, match the empty 485 * string. This method will return the empty string when the pattern 486 * successfully matches the empty string in the input. </p> 487 * 488 * @return The (possibly empty) subsequence matched by the previous match, 489 * in string form 490 * 491 * @throws IllegalStateException 492 * If no match has yet been attempted, 493 * or if the previous match operation failed 494 */ 495 public String group() { 496 return group(0); 497 } 498 499 /** 500 * Returns the input subsequence captured by the given group during the 501 * previous match operation. 502 * 503 * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index 504 * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and 505 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt> 506 * are equivalent. </p> 507 * 508 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 509 * to right, starting at one. Group zero denotes the entire pattern, so 510 * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>. 511 * </p> 512 * 513 * <p> If the match was successful but the group specified failed to match 514 * any part of the input sequence, then <tt>null</tt> is returned. Note 515 * that some groups, for example <tt>(a*)</tt>, match the empty string. 516 * This method will return the empty string when such a group successfully 517 * matches the empty string in the input. 
</p> 518 * 519 * @param group 520 * The index of a capturing group in this matcher's pattern 521 * 522 * @return The (possibly empty) subsequence captured by the group 523 * during the previous match, or <tt>null</tt> if the group 524 * failed to match part of the input 525 * 526 * @throws IllegalStateException 527 * If no match has yet been attempted, 528 * or if the previous match operation failed 529 * 530 * @throws IndexOutOfBoundsException 531 * If there is no capturing group in the pattern 532 * with the given index 533 */ 534 public String group(int group) { 535 if (first < 0) 536 throw new IllegalStateException("No match found"); 537 if (group < 0 || group > groupCount()) 538 throw new IndexOutOfBoundsException("No group " + group); 539 if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) 540 return null; 541 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); 542 } 543 544 /** 545 * Returns the input subsequence captured by the given 546 * <a href="Pattern.html#groupname">named-capturing group</a> during the previous 547 * match operation. 548 * 549 * <p> If the match was successful but the group specified failed to match 550 * any part of the input sequence, then <tt>null</tt> is returned. Note 551 * that some groups, for example <tt>(a*)</tt>, match the empty string. 552 * This method will return the empty string when such a group successfully 553 * matches the empty string in the input. 
</p> 554 * 555 * @param name 556 * The name of a named-capturing group in this matcher's pattern 557 * 558 * @return The (possibly empty) subsequence captured by the named group 559 * during the previous match, or <tt>null</tt> if the group 560 * failed to match part of the input 561 * 562 * @throws IllegalStateException 563 * If no match has yet been attempted, 564 * or if the previous match operation failed 565 * 566 * @throws IllegalArgumentException 567 * If there is no capturing group in the pattern 568 * with the given name 569 * @since 1.7 570 */ 571 public String group(String name) { 572 int group = getMatchedGroupIndex(name); 573 if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) 574 return null; 575 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); 576 } 577 578 /** 579 * Returns the number of capturing groups in this matcher's pattern. 580 * 581 * <p> Group zero denotes the entire pattern by convention. It is not 582 * included in this count. 583 * 584 * <p> Any non-negative integer smaller than or equal to the value 585 * returned by this method is guaranteed to be a valid group index for 586 * this matcher. </p> 587 * 588 * @return The number of capturing groups in this matcher's pattern 589 */ 590 public int groupCount() { 591 return parentPattern.capturingGroupCount - 1; 592 } 593 594 /** 595 * Attempts to match the entire region against the pattern. 596 * 597 * <p> If the match succeeds then more information can be obtained via the 598 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 599 * 600 * @return <tt>true</tt> if, and only if, the entire region sequence 601 * matches this matcher's pattern 602 */ 603 public boolean matches() { 604 return match(from, ENDANCHOR); 605 } 606 607 /** 608 * Attempts to find the next subsequence of the input sequence that matches 609 * the pattern. 
610 * 611 * <p> This method starts at the beginning of this matcher's region, or, if 612 * a previous invocation of the method was successful and the matcher has 613 * not since been reset, at the first character not matched by the previous 614 * match. 615 * 616 * <p> If the match succeeds then more information can be obtained via the 617 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 618 * 619 * @return <tt>true</tt> if, and only if, a subsequence of the input 620 * sequence matches this matcher's pattern 621 */ 622 public boolean find() { 623 int nextSearchIndex = last; 624 if (nextSearchIndex == first) 625 nextSearchIndex++; 626 627 // If next search starts before region, start it at region 628 if (nextSearchIndex < from) 629 nextSearchIndex = from; 630 631 // If next search starts beyond region then it fails 632 if (nextSearchIndex > to) { 633 for (int i = 0; i < groups.length; i++) 634 groups[i] = -1; 635 return false; 636 } 637 return search(nextSearchIndex); 638 } 639 640 /** 641 * Resets this matcher and then attempts to find the next subsequence of 642 * the input sequence that matches the pattern, starting at the specified 643 * index. 644 * 645 * <p> If the match succeeds then more information can be obtained via the 646 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent 647 * invocations of the {@link #find()} method will start at the first 648 * character not matched by this match. </p> 649 * 650 * @param start the index to start searching for a match 651 * @throws IndexOutOfBoundsException 652 * If start is less than zero or if start is greater than the 653 * length of the input sequence. 
654 * 655 * @return <tt>true</tt> if, and only if, a subsequence of the input 656 * sequence starting at the given index matches this matcher's 657 * pattern 658 */ 659 public boolean find(int start) { 660 int limit = getTextLength(); 661 if ((start < 0) || (start > limit)) 662 throw new IndexOutOfBoundsException("Illegal start index"); 663 reset(); 664 return search(start); 665 } 666 667 /** 668 * Attempts to match the input sequence, starting at the beginning of the 669 * region, against the pattern. 670 * 671 * <p> Like the {@link #matches matches} method, this method always starts 672 * at the beginning of the region; unlike that method, it does not 673 * require that the entire region be matched. 674 * 675 * <p> If the match succeeds then more information can be obtained via the 676 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 677 * 678 * @return <tt>true</tt> if, and only if, a prefix of the input 679 * sequence matches this matcher's pattern 680 */ 681 public boolean lookingAt() { 682 return match(from, NOANCHOR); 683 } 684 685 /** 686 * Returns a literal replacement <code>String</code> for the specified 687 * <code>String</code>. 688 * 689 * This method produces a <code>String</code> that will work 690 * as a literal replacement <code>s</code> in the 691 * <code>appendReplacement</code> method of the {@link Matcher} class. 692 * The <code>String</code> produced will match the sequence of characters 693 * in <code>s</code> treated as a literal sequence. Slashes ('\') and 694 * dollar signs ('$') will be given no special meaning. 
695 * 696 * @param s The string to be literalized 697 * @return A literal string replacement 698 * @since 1.5 699 */ 700 public static String quoteReplacement(String s) { 701 if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1)) 702 return s; 703 StringBuilder sb = new StringBuilder(); 704 for (int i=0; i<s.length(); i++) { 705 char c = s.charAt(i); 706 if (c == '\\' || c == '$') { 707 sb.append('\\'); 708 } 709 sb.append(c); 710 } 711 return sb.toString(); 712 } 713 714 /** 715 * Implements a non-terminal append-and-replace step. 716 * 717 * <p> This method performs the following actions: </p> 718 * 719 * <ol> 720 * 721 * <li><p> It reads characters from the input sequence, starting at the 722 * append position, and appends them to the given string buffer. It 723 * stops after reading the last character preceding the previous match, 724 * that is, the character at index {@link 725 * #start()} <tt>-</tt> <tt>1</tt>. </p></li> 726 * 727 * <li><p> It appends the given replacement string to the string buffer. 728 * </p></li> 729 * 730 * <li><p> It sets the append position of this matcher to the index of 731 * the last character matched, plus one, that is, to {@link #end()}. 732 * </p></li> 733 * 734 * </ol> 735 * 736 * <p> The replacement string may contain references to subsequences 737 * captured during the previous match: Each occurrence of 738 * <tt>${</tt><i>name</i><tt>}</tt> or <tt>$</tt><i>g</i> 739 * will be replaced by the result of evaluating the corresponding 740 * {@link #group(String) group(name)} or {@link #group(int) group(g)} 741 * respectively. For <tt>$</tt><i>g</i>, 742 * the first number after the <tt>$</tt> is always treated as part of 743 * the group reference. Subsequent numbers are incorporated into g if 744 * they would form a legal group reference. Only the numerals '0' 745 * through '9' are considered as potential components of the group 746 * reference. 
If the second group matched the string <tt>"foo"</tt>, for 747 * example, then passing the replacement string <tt>"$2bar"</tt> would 748 * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar 749 * sign (<tt>$</tt>) may be included as a literal in the replacement 750 * string by preceding it with a backslash (<tt>\$</tt>). 751 * 752 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 753 * the replacement string may cause the results to be different than if it 754 * were being treated as a literal replacement string. Dollar signs may be 755 * treated as references to captured subsequences as described above, and 756 * backslashes are used to escape literal characters in the replacement 757 * string. 758 * 759 * <p> This method is intended to be used in a loop together with the 760 * {@link #appendTail appendTail} and {@link #find find} methods. The 761 * following code, for example, writes <tt>one dog two dogs in the 762 * yard</tt> to the standard-output stream: </p> 763 * 764 * <blockquote><pre> 765 * Pattern p = Pattern.compile("cat"); 766 * Matcher m = p.matcher("one cat two cats in the yard"); 767 * StringBuffer sb = new StringBuffer(); 768 * while (m.find()) { 769 * m.appendReplacement(sb, "dog"); 770 * } 771 * m.appendTail(sb); 772 * System.out.println(sb.toString());</pre></blockquote> 773 * 774 * @param sb 775 * The target string buffer 776 * 777 * @param replacement 778 * The replacement string 779 * 780 * @return This matcher 781 * 782 * @throws IllegalStateException 783 * If no match has yet been attempted, 784 * or if the previous match operation failed 785 * 786 * @throws IllegalArgumentException 787 * If the replacement string refers to a named-capturing 788 * group that does not exist in the pattern 789 * 790 * @throws IndexOutOfBoundsException 791 * If the replacement string refers to a capturing group 792 * that does not exist in the pattern 793 */ 794 public Matcher appendReplacement(StringBuffer sb, String 
replacement) { 795 796 // If no match, return error 797 if (first < 0) 798 throw new IllegalStateException("No match available"); 799 800 // Process substitution string to replace group references with groups 801 int cursor = 0; 802 StringBuilder result = new StringBuilder(); 803 804 while (cursor < replacement.length()) { 805 char nextChar = replacement.charAt(cursor); 806 if (nextChar == '\\') { 807 cursor++; 808 if (cursor == replacement.length()) 809 throw new IllegalArgumentException( 810 "character to be escaped is missing"); 811 nextChar = replacement.charAt(cursor); 812 result.append(nextChar); 813 cursor++; 814 } else if (nextChar == '$') { 815 // Skip past $ 816 cursor++; 817 // Throw IAE if this "$" is the last character in replacement 818 if (cursor == replacement.length()) 819 throw new IllegalArgumentException( 820 "Illegal group reference: group index is missing"); 821 nextChar = replacement.charAt(cursor); 822 int refNum = -1; 823 if (nextChar == '{') { 824 cursor++; 825 StringBuilder gsb = new StringBuilder(); 826 while (cursor < replacement.length()) { 827 nextChar = replacement.charAt(cursor); 828 if (ASCII.isLower(nextChar) || 829 ASCII.isUpper(nextChar) || 830 ASCII.isDigit(nextChar)) { 831 gsb.append(nextChar); 832 cursor++; 833 } else { 834 break; 835 } 836 } 837 if (gsb.length() == 0) 838 throw new IllegalArgumentException( 839 "named capturing group has 0 length name"); 840 if (nextChar != '}') 841 throw new IllegalArgumentException( 842 "named capturing group is missing trailing '}'"); 843 String gname = gsb.toString(); 844 if (ASCII.isDigit(gname.charAt(0))) 845 throw new IllegalArgumentException( 846 "capturing group name {" + gname + 847 "} starts with digit character"); 848 if (!parentPattern.namedGroups().containsKey(gname)) 849 throw new IllegalArgumentException( 850 "No group with name {" + gname + "}"); 851 refNum = parentPattern.namedGroups().get(gname); 852 cursor++; 853 } else { 854 // The first number is always a group 855 
refNum = (int)nextChar - '0'; 856 if ((refNum < 0)||(refNum > 9)) 857 throw new IllegalArgumentException( 858 "Illegal group reference"); 859 cursor++; 860 // Capture the largest legal group string 861 boolean done = false; 862 while (!done) { 863 if (cursor >= replacement.length()) { 864 break; 865 } 866 int nextDigit = replacement.charAt(cursor) - '0'; 867 if ((nextDigit < 0)||(nextDigit > 9)) { // not a number 868 break; 869 } 870 int newRefNum = (refNum * 10) + nextDigit; 871 if (groupCount() < newRefNum) { 872 done = true; 873 } else { 874 refNum = newRefNum; 875 cursor++; 876 } 877 } 878 } 879 // Append group 880 if (start(refNum) != -1 && end(refNum) != -1) 881 result.append(text, start(refNum), end(refNum)); 882 } else { 883 result.append(nextChar); 884 cursor++; 885 } 886 } 887 // Append the intervening text 888 sb.append(text, lastAppendPosition, first); 889 // Append the match substitution 890 sb.append(result); 891 892 lastAppendPosition = last; 893 return this; 894 } 895 896 /** 897 * Implements a terminal append-and-replace step. 898 * 899 * <p> This method reads characters from the input sequence, starting at 900 * the append position, and appends them to the given string buffer. It is 901 * intended to be invoked after one or more invocations of the {@link 902 * #appendReplacement appendReplacement} method in order to copy the 903 * remainder of the input sequence. </p> 904 * 905 * @param sb 906 * The target string buffer 907 * 908 * @return The target string buffer 909 */ 910 public StringBuffer appendTail(StringBuffer sb) { 911 sb.append(text, lastAppendPosition, getTextLength()); 912 return sb; 913 } 914 915 /** 916 * Replaces every subsequence of the input sequence that matches the 917 * pattern with the given replacement string. 918 * 919 * <p> This method first resets this matcher. It then scans the input 920 * sequence looking for matches of the pattern. 
Characters that are not 921 * part of any match are appended directly to the result string; each match 922 * is replaced in the result by the replacement string. The replacement 923 * string may contain references to captured subsequences as in the {@link 924 * #appendReplacement appendReplacement} method. 925 * 926 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 927 * the replacement string may cause the results to be different than if it 928 * were being treated as a literal replacement string. Dollar signs may be 929 * treated as references to captured subsequences as described above, and 930 * backslashes are used to escape literal characters in the replacement 931 * string. 932 * 933 * <p> Given the regular expression <tt>a*b</tt>, the input 934 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string 935 * <tt>"-"</tt>, an invocation of this method on a matcher for that 936 * expression would yield the string <tt>"-foo-foo-foo-"</tt>. 937 * 938 * <p> Invoking this method changes this matcher's state. If the matcher 939 * is to be used in further matching operations then it should first be 940 * reset. </p> 941 * 942 * @param replacement 943 * The replacement string 944 * 945 * @return The string constructed by replacing each matching subsequence 946 * by the replacement string, substituting captured subsequences 947 * as needed 948 */ 949 public String replaceAll(String replacement) { 950 reset(); 951 boolean result = find(); 952 if (result) { 953 StringBuffer sb = new StringBuffer(); 954 do { 955 appendReplacement(sb, replacement); 956 result = find(); 957 } while (result); 958 appendTail(sb); 959 return sb.toString(); 960 } 961 return text.toString(); 962 } 963 964 /** 965 * Replaces the first subsequence of the input sequence that matches the 966 * pattern with the given replacement string. 967 * 968 * <p> This method first resets this matcher. It then scans the input 969 * sequence looking for a match of the pattern. 
Characters that are not 970 * part of the match are appended directly to the result string; the match 971 * is replaced in the result by the replacement string. The replacement 972 * string may contain references to captured subsequences as in the {@link 973 * #appendReplacement appendReplacement} method. 974 * 975 * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 976 * the replacement string may cause the results to be different than if it 977 * were being treated as a literal replacement string. Dollar signs may be 978 * treated as references to captured subsequences as described above, and 979 * backslashes are used to escape literal characters in the replacement 980 * string. 981 * 982 * <p> Given the regular expression <tt>dog</tt>, the input 983 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string 984 * <tt>"cat"</tt>, an invocation of this method on a matcher for that 985 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p> 986 * 987 * <p> Invoking this method changes this matcher's state. If the matcher 988 * is to be used in further matching operations then it should first be 989 * reset. </p> 990 * 991 * @param replacement 992 * The replacement string 993 * @return The string constructed by replacing the first matching 994 * subsequence by the replacement string, substituting captured 995 * subsequences as needed 996 */ 997 public String replaceFirst(String replacement) { 998 if (replacement == null) 999 throw new NullPointerException("replacement"); 1000 reset(); 1001 if (!find()) 1002 return text.toString(); 1003 StringBuffer sb = new StringBuffer(); 1004 appendReplacement(sb, replacement); 1005 appendTail(sb); 1006 return sb.toString(); 1007 } 1008 1009 /** 1010 * Sets the limits of this matcher's region. The region is the part of the 1011 * input sequence that will be searched to find a match. 
Invoking this 1012 * method resets the matcher, and then sets the region to start at the 1013 * index specified by the <code>start</code> parameter and end at the 1014 * index specified by the <code>end</code> parameter. 1015 * 1016 * <p>Depending on the transparency and anchoring being used (see 1017 * {@link #useTransparentBounds useTransparentBounds} and 1018 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such 1019 * as anchors may behave differently at or around the boundaries of the 1020 * region. 1021 * 1022 * @param start 1023 * The index to start searching at (inclusive) 1024 * @param end 1025 * The index to end searching at (exclusive) 1026 * @throws IndexOutOfBoundsException 1027 * If start or end is less than zero, if 1028 * start is greater than the length of the input sequence, if 1029 * end is greater than the length of the input sequence, or if 1030 * start is greater than end. 1031 * @return this matcher 1032 * @since 1.5 1033 */ 1034 public Matcher region(int start, int end) { 1035 if ((start < 0) || (start > getTextLength())) 1036 throw new IndexOutOfBoundsException("start"); 1037 if ((end < 0) || (end > getTextLength())) 1038 throw new IndexOutOfBoundsException("end"); 1039 if (start > end) 1040 throw new IndexOutOfBoundsException("start > end"); 1041 reset(); 1042 from = start; 1043 to = end; 1044 return this; 1045 } 1046 1047 /** 1048 * Reports the start index of this matcher's region. The 1049 * searches this matcher conducts are limited to finding matches 1050 * within {@link #regionStart regionStart} (inclusive) and 1051 * {@link #regionEnd regionEnd} (exclusive). 1052 * 1053 * @return The starting point of this matcher's region 1054 * @since 1.5 1055 */ 1056 public int regionStart() { 1057 return from; 1058 } 1059 1060 /** 1061 * Reports the end index (exclusive) of this matcher's region. 
1062 * The searches this matcher conducts are limited to finding matches 1063 * within {@link #regionStart regionStart} (inclusive) and 1064 * {@link #regionEnd regionEnd} (exclusive). 1065 * 1066 * @return the ending point of this matcher's region 1067 * @since 1.5 1068 */ 1069 public int regionEnd() { 1070 return to; 1071 } 1072 1073 /** 1074 * Queries the transparency of region bounds for this matcher. 1075 * 1076 * <p> This method returns <tt>true</tt> if this matcher uses 1077 * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i> 1078 * bounds. 1079 * 1080 * <p> See {@link #useTransparentBounds useTransparentBounds} for a 1081 * description of transparent and opaque bounds. 1082 * 1083 * <p> By default, a matcher uses opaque region boundaries. 1084 * 1085 * @return <tt>true</tt> iff this matcher is using transparent bounds, 1086 * <tt>false</tt> otherwise. 1087 * @see java.util.regex.Matcher#useTransparentBounds(boolean) 1088 * @since 1.5 1089 */ 1090 public boolean hasTransparentBounds() { 1091 return transparentBounds; 1092 } 1093 1094 /** 1095 * Sets the transparency of region bounds for this matcher. 1096 * 1097 * <p> Invoking this method with an argument of <tt>true</tt> will set this 1098 * matcher to use <i>transparent</i> bounds. If the boolean 1099 * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used. 1100 * 1101 * <p> Using transparent bounds, the boundaries of this 1102 * matcher's region are transparent to lookahead, lookbehind, 1103 * and boundary matching constructs. Those constructs can see beyond the 1104 * boundaries of the region to see if a match is appropriate. 1105 * 1106 * <p> Using opaque bounds, the boundaries of this matcher's 1107 * region are opaque to lookahead, lookbehind, and boundary matching 1108 * constructs that may try to see beyond them. Those constructs cannot 1109 * look past the boundaries so they will fail to match anything outside 1110 * of the region. 
1111 * 1112 * <p> By default, a matcher uses opaque bounds. 1113 * 1114 * @param b a boolean indicating whether to use opaque or transparent 1115 * regions 1116 * @return this matcher 1117 * @see java.util.regex.Matcher#hasTransparentBounds 1118 * @since 1.5 1119 */ 1120 public Matcher useTransparentBounds(boolean b) { 1121 transparentBounds = b; 1122 return this; 1123 } 1124 1125 /** 1126 * Queries the anchoring of region bounds for this matcher. 1127 * 1128 * <p> This method returns <tt>true</tt> if this matcher uses 1129 * <i>anchoring</i> bounds, <tt>false</tt> otherwise. 1130 * 1131 * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a 1132 * description of anchoring bounds. 1133 * 1134 * <p> By default, a matcher uses anchoring region boundaries. 1135 * 1136 * @return <tt>true</tt> iff this matcher is using anchoring bounds, 1137 * <tt>false</tt> otherwise. 1138 * @see java.util.regex.Matcher#useAnchoringBounds(boolean) 1139 * @since 1.5 1140 */ 1141 public boolean hasAnchoringBounds() { 1142 return anchoringBounds; 1143 } 1144 1145 /** 1146 * Sets the anchoring of region bounds for this matcher. 1147 * 1148 * <p> Invoking this method with an argument of <tt>true</tt> will set this 1149 * matcher to use <i>anchoring</i> bounds. If the boolean 1150 * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be 1151 * used. 1152 * 1153 * <p> Using anchoring bounds, the boundaries of this 1154 * matcher's region match anchors such as ^ and $. 1155 * 1156 * <p> Without anchoring bounds, the boundaries of this 1157 * matcher's region will not match anchors such as ^ and $. 1158 * 1159 * <p> By default, a matcher uses anchoring region boundaries. 1160 * 1161 * @param b a boolean indicating whether or not to use anchoring bounds. 
1162 * @return this matcher 1163 * @see java.util.regex.Matcher#hasAnchoringBounds 1164 * @since 1.5 1165 */ 1166 public Matcher useAnchoringBounds(boolean b) { 1167 anchoringBounds = b; 1168 return this; 1169 } 1170 1171 /** 1172 * <p>Returns the string representation of this matcher. The 1173 * string representation of a <code>Matcher</code> contains information 1174 * that may be useful for debugging. The exact format is unspecified. 1175 * 1176 * @return The string representation of this matcher 1177 * @since 1.5 1178 */ 1179 public String toString() { 1180 StringBuilder sb = new StringBuilder(); 1181 sb.append("java.util.regex.Matcher"); 1182 sb.append("[pattern=" + pattern()); 1183 sb.append(" region="); 1184 sb.append(regionStart() + "," + regionEnd()); 1185 sb.append(" lastmatch="); 1186 if ((first >= 0) && (group() != null)) { 1187 sb.append(group()); 1188 } 1189 sb.append("]"); 1190 return sb.toString(); 1191 } 1192 1193 /** 1194 * <p>Returns true if the end of input was hit by the search engine in 1195 * the last match operation performed by this matcher. 1196 * 1197 * <p>When this method returns true, then it is possible that more input 1198 * would have changed the result of the last search. 1199 * 1200 * @return true iff the end of input was hit in the last match; false 1201 * otherwise 1202 * @since 1.5 1203 */ 1204 public boolean hitEnd() { 1205 return hitEnd; 1206 } 1207 1208 /** 1209 * <p>Returns true if more input could change a positive match into a 1210 * negative one. 1211 * 1212 * <p>If this method returns true, and a match was found, then more 1213 * input could cause the match to be lost. If this method returns false 1214 * and a match was found, then more input might change the match but the 1215 * match won't be lost. If a match was not found, then requireEnd has no 1216 * meaning. 1217 * 1218 * @return true iff more input could change a positive match into a 1219 * negative one. 
* @since 1.5
     */
    public boolean requireEnd() {
        return requireEnd;
    }

    /**
     * Initiates a search to find a Pattern within the given bounds.
     * The groups are filled with default values and the match of the root
     * of the state machine is called. The state machine will hold the state
     * of the match as it proceeds in this matcher.
     *
     * Matcher.from is not set here, because it is the "hard" boundary
     * of the start of the search which anchors will set to. The from param
     * is the "soft" boundary of the start of the search, meaning that the
     * regex tries to match at that index but ^ won't match there. Subsequent
     * calls to the search methods start at a new "soft" boundary which is
     * the end of the previous match.
     *
     * @param  from
     *         The index at which to start searching; negative values are
     *         treated as zero
     * @return true if the pattern's root node reported a match
     */
    boolean search(int from) {
        int start = prepareAttempt(from, NOANCHOR);
        boolean result = parentPattern.root.match(this, start, text);
        if (!result)
            this.first = -1;
        this.oldLast = this.last;
        return result;
    }

    /**
     * Initiates a search for an anchored match to a Pattern within the given
     * bounds. The groups are filled with default values and the match of the
     * root of the state machine is called. The state machine will hold the
     * state of the match as it proceeds in this matcher.
     *
     * @param  from
     *         The index at which the match is attempted; negative values are
     *         treated as zero
     * @param  anchor
     *         The accept mode to store for this attempt
     * @return true if the pattern's match root reported a match
     */
    boolean match(int from, int anchor) {
        int start = prepareAttempt(from, anchor);
        boolean result = parentPattern.matchRoot.match(this, start, text);
        if (!result)
            this.first = -1;
        this.oldLast = this.last;
        return result;
    }

    /**
     * Resets the per-attempt state shared by {@link #search} and
     * {@link #match}: clears the hitEnd/requireEnd flags, records the start
     * index, seeds oldLast if it is still negative, marks every group slot
     * as unset (-1) and stores the accept mode.
     *
     * @param  from
     *         The requested start index
     * @param  mode
     *         The accept mode to store
     * @return the start index actually used, i.e. from clamped to zero
     */
    private int prepareAttempt(int from, int mode) {
        this.hitEnd = false;
        this.requireEnd = false;
        int start = from < 0 ? 0 : from;
        this.first = start;
        if (this.oldLast < 0)
            this.oldLast = start;
        for (int i = 0; i < groups.length; i++)
            groups[i] = -1;
        acceptMode = mode;
        return start;
    }

    /**
     * Returns the end index of the text.
     *
     * @return the index after the last character in the text
     */
    int getTextLength() {
        return text.length();
    }

    /**
     * Generates a String from this Matcher's input in the specified range.
     *
     * @param  beginIndex   the beginning index, inclusive
     * @param  endIndex     the ending index, exclusive
     * @return A String generated from this Matcher's input
     */
    CharSequence getSubSequence(int beginIndex, int endIndex) {
        return text.subSequence(beginIndex, endIndex);
    }

    /**
     * Returns this Matcher's input character at index i.
     *
     * @param  i the index into the input
     * @return A char from the specified index
     */
    char charAt(int i) {
        return text.charAt(i);
    }

    /**
     * Returns the group index registered in the pattern for the given
     * named-capturing group.
     *
     * @param  name
     *         The name of the named-capturing group
     * @return the index of the named-capturing group
     * @throws NullPointerException
     *         If name is null
     * @throws IllegalStateException
     *         If no match is currently recorded
     * @throws IllegalArgumentException
     *         If the pattern has no group with the given name
     */
    int getMatchedGroupIndex(String name) {
        Objects.requireNonNull(name, "Group name");
        if (first < 0)
            throw new IllegalStateException("No match found");
        // One lookup instead of containsKey + get; a null result is treated
        // as an unknown name (assumes the map never stores null indices).
        Integer index = parentPattern.namedGroups().get(name);
        if (index == null)
            throw new IllegalArgumentException("No group with name <" + name + ">");
        return index;
    }
}