/*
 * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package java.util.regex;

import java.util.Objects;

/**
 * An engine that performs match operations on a {@linkplain
 * java.lang.CharSequence character sequence} by interpreting a {@link Pattern}.
 *
 * <p> A matcher is created from a pattern by invoking the pattern's {@link
 * Pattern#matcher matcher} method. Once created, a matcher can be used to
 * perform three different kinds of match operations:
 *
 * <ul>
 *
 *   <li><p> The {@link #matches matches} method attempts to match the entire
 *   input sequence against the pattern. </p></li>
 *
 *   <li><p> The {@link #lookingAt lookingAt} method attempts to match the
 *   input sequence, starting at the beginning, against the pattern.
</p></li>
 *
 *   <li><p> The {@link #find find} method scans the input sequence looking for
 *   the next subsequence that matches the pattern. </p></li>
 *
 * </ul>
 *
 * <p> Each of these methods returns a boolean indicating success or failure.
 * More information about a successful match can be obtained by querying the
 * state of the matcher.
 *
 * <p> A matcher finds matches in a subset of its input called the
 * <i>region</i>. By default, the region contains all of the matcher's input.
 * The region can be modified via the {@link #region region} method and queried
 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
 * methods. The way that the region boundaries interact with some pattern
 * constructs can be changed. See {@link #useAnchoringBounds
 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
 * for more details.
 *
 * <p> This class also defines methods for replacing matched subsequences with
 * new strings whose contents can, if desired, be computed from the match
 * result. The {@link #appendReplacement appendReplacement} and {@link
 * #appendTail appendTail} methods can be used in tandem in order to collect
 * the result into an existing string buffer, or the more convenient {@link
 * #replaceAll replaceAll} method can be used to create a string in which every
 * matching subsequence in the input sequence is replaced.
 *
 * <p> The explicit state of a matcher includes the start and end indices of
 * the most recent successful match. It also includes the start and end
 * indices of the input subsequence captured by each <a
 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
 * count of such subsequences. As a convenience, methods are also provided for
 * returning these captured subsequences in string form.
*
 * <p> The explicit state of a matcher is initially undefined; attempting to
 * query any part of it before a successful match will cause an {@link
 * IllegalStateException} to be thrown. The explicit state of a matcher is
 * recomputed by every match operation.
 *
 * <p> The implicit state of a matcher includes the input character sequence as
 * well as the <i>append position</i>, which is initially zero and is updated
 * by the {@link #appendReplacement appendReplacement} method.
 *
 * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
 * method or, if a new input sequence is desired, its {@link
 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
 * matcher discards its explicit state information and sets the append position
 * to zero.
 *
 * <p> Instances of this class are not safe for use by multiple concurrent
 * threads. </p>
 *
 *
 * @author      Mike McCloskey
 * @author      Mark Reinhold
 * @author      JSR-51 Expert Group
 * @since       1.4
 * @spec        JSR-51
 */

public final class Matcher implements MatchResult {

    /**
     * The Pattern object that created this Matcher. It is replaced by
     * {@link #usePattern usePattern}.
     */
    Pattern parentPattern;

    /**
     * The storage used by groups. For group <i>g</i>, index {@code 2*g}
     * holds the start offset and index {@code 2*g + 1} holds the end offset;
     * {@code -1} marks a slot that has not been set. The entries may contain
     * invalid values if a group was skipped during the matching.
     */
    int[] groups;

    /**
     * The range within the sequence that is to be matched. Anchors
     * will match at these "hard" boundaries. Changing the region
     * changes these values; {@link #reset()} sets them to zero and the
     * length of the text.
     */
    int from, to;

    /**
     * Lookbehind uses this value to ensure that the subexpression
     * match ends at the point where the lookbehind was encountered.
     */
    int lookbehindTo;

    /**
     * The input character sequence being matched. It is replaced by
     * {@link #reset(CharSequence)}.
     */
    CharSequence text;

    /**
     * Matcher state used by the last node. NOANCHOR is used when a
     * match does not have to consume all of the input.
ENDANCHOR is
     * the mode used for matching all the input.
     */
    static final int ENDANCHOR = 1;     // passed to match() by matches()
    static final int NOANCHOR = 0;      // passed to match() by lookingAt()
    int acceptMode = NOANCHOR;

    /**
     * The range of string that last matched the pattern. If the last
     * match failed then first is -1; last initially holds 0 then it
     * holds the index of the end of the last match (which is where the
     * next search starts). These are the values reported by
     * {@link #start()} and {@link #end()}.
     */
    int first = -1, last = 0;

    /**
     * The end index of what matched in the last match operation.
     * Set to -1 by {@link #reset()}.
     */
    int oldLast = -1;

    /**
     * The index of the last position appended in a substitution.
     * {@link #appendReplacement appendReplacement} advances it to the end
     * of the current match and {@link #appendTail appendTail} copies the
     * input from this index onward.
     */
    int lastAppendPosition = 0;

    /**
     * Storage used by nodes to tell what repetition they are on in
     * a pattern, and where groups begin. The nodes themselves are stateless,
     * so they rely on this field to hold state during a match.
     */
    int[] locals;

    /**
     * Boolean indicating whether or not more input could change
     * the results of the last match.
     *
     * <ul>
     * <li>If hitEnd is true, and a match was found, then more input
     * might cause a different match to be found.
     * <li>If hitEnd is true and a match was not found, then more
     * input could cause a match to be found.
     * <li>If hitEnd is false and a match was found, then more input
     * will not change the match.
     * <li>If hitEnd is false and a match was not found, then more
     * input will not cause a match to be found.
     * </ul>
     */
    boolean hitEnd;

    /**
     * Boolean indicating whether or not more input could change
     * a positive match into a negative one.
     *
     * If requireEnd is true, and a match was found, then more
     * input could cause the match to be lost.
     * If requireEnd is false and a match was found, then more
     * input might change the match but the match won't be lost.
     * If a match was not found, then requireEnd has no meaning.
194 */ 195 boolean requireEnd; 196 197 /** 198 * If transparentBounds is true then the boundaries of this 199 * matcher's region are transparent to lookahead, lookbehind, 200 * and boundary matching constructs that try to see beyond them. 201 */ 202 boolean transparentBounds = false; 203 204 /** 205 * If anchoringBounds is true then the boundaries of this 206 * matcher's region match anchors such as ^ and $. 207 */ 208 boolean anchoringBounds = true; 209 210 /** 211 * No default constructor. 212 */ 213 Matcher() { 214 } 215 216 /** 217 * All matchers have the state used by Pattern during a match. 218 */ 219 Matcher(Pattern parent, CharSequence text) { 220 this.parentPattern = parent; 221 this.text = text; 222 223 // Allocate state storage 224 int parentGroupCount = Math.max(parent.capturingGroupCount, 10); 225 groups = new int[parentGroupCount * 2]; 226 locals = new int[parent.localCount]; 227 228 // Put fields into initial states 229 reset(); 230 } 231 232 /** 233 * Returns the pattern that is interpreted by this matcher. 234 * 235 * @return The pattern for which this matcher was created 236 */ 237 public Pattern pattern() { 238 return parentPattern; 239 } 240 241 /** 242 * Returns the match state of this matcher as a {@link MatchResult}. 243 * The result is unaffected by subsequent operations performed upon this 244 * matcher. 245 * 246 * @return a <code>MatchResult</code> with the state of this matcher 247 * @since 1.5 248 */ 249 public MatchResult toMatchResult() { 250 Matcher result = new Matcher(this.parentPattern, text.toString()); 251 result.first = this.first; 252 result.last = this.last; 253 result.groups = this.groups.clone(); 254 return result; 255 } 256 257 /** 258 * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to 259 * find matches with. 260 * 261 * <p> This method causes this matcher to lose information 262 * about the groups of the last match that occurred. 
The 263 * matcher's position in the input is maintained and its 264 * last append position is unaffected.</p> 265 * 266 * @param newPattern 267 * The new pattern used by this matcher 268 * @return This matcher 269 * @throws IllegalArgumentException 270 * If newPattern is <tt>null</tt> 271 * @since 1.5 272 */ 273 public Matcher usePattern(Pattern newPattern) { 274 if (newPattern == null) 275 throw new IllegalArgumentException("Pattern cannot be null"); 276 parentPattern = newPattern; 277 278 // Reallocate state storage 279 int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10); 280 groups = new int[parentGroupCount * 2]; 281 locals = new int[newPattern.localCount]; 282 for (int i = 0; i < groups.length; i++) 283 groups[i] = -1; 284 for (int i = 0; i < locals.length; i++) 285 locals[i] = -1; 286 return this; 287 } 288 289 /** 290 * Resets this matcher. 291 * 292 * <p> Resetting a matcher discards all of its explicit state information 293 * and sets its append position to zero. The matcher's region is set to the 294 * default region, which is its entire character sequence. The anchoring 295 * and transparency of this matcher's region boundaries are unaffected. 296 * 297 * @return This matcher 298 */ 299 public Matcher reset() { 300 first = -1; 301 last = 0; 302 oldLast = -1; 303 for(int i=0; i<groups.length; i++) 304 groups[i] = -1; 305 for(int i=0; i<locals.length; i++) 306 locals[i] = -1; 307 lastAppendPosition = 0; 308 from = 0; 309 to = getTextLength(); 310 return this; 311 } 312 313 /** 314 * Resets this matcher with a new input sequence. 315 * 316 * <p> Resetting a matcher discards all of its explicit state information 317 * and sets its append position to zero. The matcher's region is set to 318 * the default region, which is its entire character sequence. The 319 * anchoring and transparency of this matcher's region boundaries are 320 * unaffected. 
321 * 322 * @param input 323 * The new input character sequence 324 * 325 * @return This matcher 326 */ 327 public Matcher reset(CharSequence input) { 328 text = input; 329 return reset(); 330 } 331 332 /** 333 * Returns the start index of the previous match. </p> 334 * 335 * @return The index of the first character matched 336 * 337 * @throws IllegalStateException 338 * If no match has yet been attempted, 339 * or if the previous match operation failed 340 */ 341 public int start() { 342 if (first < 0) 343 throw new IllegalStateException("No match available"); 344 return first; 345 } 346 347 /** 348 * Returns the start index of the subsequence captured by the given group 349 * during the previous match operation. 350 * 351 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 352 * to right, starting at one. Group zero denotes the entire pattern, so 353 * the expression <i>m.</i><tt>start(0)</tt> is equivalent to 354 * <i>m.</i><tt>start()</tt>. </p> 355 * 356 * @param group 357 * The index of a capturing group in this matcher's pattern 358 * 359 * @return The index of the first character captured by the group, 360 * or <tt>-1</tt> if the match was successful but the group 361 * itself did not match anything 362 * 363 * @throws IllegalStateException 364 * If no match has yet been attempted, 365 * or if the previous match operation failed 366 * 367 * @throws IndexOutOfBoundsException 368 * If there is no capturing group in the pattern 369 * with the given index 370 */ 371 public int start(int group) { 372 if (first < 0) 373 throw new IllegalStateException("No match available"); 374 if (group < 0 || group > groupCount()) 375 throw new IndexOutOfBoundsException("No group " + group); 376 return groups[group * 2]; 377 } 378 379 /** 380 * Returns the start index of the subsequence captured by the given 381 * <a href="Pattern.html#groupname">named-capturing group</a> during the 382 * previous match operation. 
383 * 384 * @param name 385 * The name of a named-capturing group in this matcher's pattern 386 * 387 * @return The index of the first character captured by the group, 388 * or {@code -1} if the match was successful but the group 389 * itself did not match anything 390 * 391 * @throws IllegalStateException 392 * If no match has yet been attempted, 393 * or if the previous match operation failed 394 * 395 * @throws IllegalArgumentException 396 * If there is no capturing group in the pattern 397 * with the given name 398 * @since 1.8 399 */ 400 public int start(String name) { 401 return groups[getMatchedGroupIndex(name) * 2]; 402 } 403 404 /** 405 * Returns the offset after the last character matched. </p> 406 * 407 * @return The offset after the last character matched 408 * 409 * @throws IllegalStateException 410 * If no match has yet been attempted, 411 * or if the previous match operation failed 412 */ 413 public int end() { 414 if (first < 0) 415 throw new IllegalStateException("No match available"); 416 return last; 417 } 418 419 /** 420 * Returns the offset after the last character of the subsequence 421 * captured by the given group during the previous match operation. 422 * 423 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left 424 * to right, starting at one. Group zero denotes the entire pattern, so 425 * the expression <i>m.</i><tt>end(0)</tt> is equivalent to 426 * <i>m.</i><tt>end()</tt>. 
</p> 427 * 428 * @param group 429 * The index of a capturing group in this matcher's pattern 430 * 431 * @return The offset after the last character captured by the group, 432 * or <tt>-1</tt> if the match was successful 433 * but the group itself did not match anything 434 * 435 * @throws IllegalStateException 436 * If no match has yet been attempted, 437 * or if the previous match operation failed 438 * 439 * @throws IndexOutOfBoundsException 440 * If there is no capturing group in the pattern 441 * with the given index 442 */ 443 public int end(int group) { 444 if (first < 0) 445 throw new IllegalStateException("No match available"); 446 if (group < 0 || group > groupCount()) 447 throw new IndexOutOfBoundsException("No group " + group); 448 return groups[group * 2 + 1]; 449 } 450 451 /** 452 * Returns the offset after the last character of the subsequence 453 * captured by the given <a href="Pattern.html#groupname">named-capturing 454 * group</a> during the previous match operation. 455 * 456 * @param name 457 * The name of a named-capturing group in this matcher's pattern 458 * 459 * @return The offset after the last character captured by the group, 460 * or {@code -1} if the match was successful 461 * but the group itself did not match anything 462 * 463 * @throws IllegalStateException 464 * If no match has yet been attempted, 465 * or if the previous match operation failed 466 * 467 * @throws IllegalArgumentException 468 * If there is no capturing group in the pattern 469 * with the given name 470 * @since 1.8 471 */ 472 public int end(String name) { 473 return groups[getMatchedGroupIndex(name) * 2 + 1]; 474 } 475 476 /** 477 * Returns the input subsequence matched by the previous match. 478 * 479 * <p> For a matcher <i>m</i> with input sequence <i>s</i>, 480 * the expressions <i>m.</i><tt>group()</tt> and 481 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt> 482 * are equivalent. 
</p>
     *
     * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
     * string. This method will return the empty string when the pattern
     * successfully matches the empty string in the input. </p>
     *
     * <p> This method is shorthand for {@link #group(int) group(0)}. </p>
     *
     * @return The (possibly empty) subsequence matched by the previous match,
     *         in string form
     *
     * @throws  IllegalStateException
     *          If no match has yet been attempted,
     *          or if the previous match operation failed
     */
    public String group() {
        // Group zero denotes the entire match.
        return group(0);
    }

    /**
     * Returns the input subsequence captured by the given group during the
     * previous match operation.
     *
     * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
     * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
     * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
     * are equivalent. </p>
     *
     * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
     * to right, starting at one. Group zero denotes the entire pattern, so
     * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
     * </p>
     *
     * <p> If the match was successful but the group specified failed to match
     * any part of the input sequence, then <tt>null</tt> is returned. Note
     * that some groups, for example <tt>(a*)</tt>, match the empty string.
     * This method will return the empty string when such a group successfully
     * matches the empty string in the input.
</p> 518 * 519 * @param group 520 * The index of a capturing group in this matcher's pattern 521 * 522 * @return The (possibly empty) subsequence captured by the group 523 * during the previous match, or <tt>null</tt> if the group 524 * failed to match part of the input 525 * 526 * @throws IllegalStateException 527 * If no match has yet been attempted, 528 * or if the previous match operation failed 529 * 530 * @throws IndexOutOfBoundsException 531 * If there is no capturing group in the pattern 532 * with the given index 533 */ 534 public String group(int group) { 535 if (first < 0) 536 throw new IllegalStateException("No match found"); 537 if (group < 0 || group > groupCount()) 538 throw new IndexOutOfBoundsException("No group " + group); 539 if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) 540 return null; 541 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); 542 } 543 544 /** 545 * Returns the input subsequence captured by the given 546 * <a href="Pattern.html#groupname">named-capturing group</a> during the previous 547 * match operation. 548 * 549 * <p> If the match was successful but the group specified failed to match 550 * any part of the input sequence, then <tt>null</tt> is returned. Note 551 * that some groups, for example <tt>(a*)</tt>, match the empty string. 552 * This method will return the empty string when such a group successfully 553 * matches the empty string in the input. 
</p> 554 * 555 * @param name 556 * The name of a named-capturing group in this matcher's pattern 557 * 558 * @return The (possibly empty) subsequence captured by the named group 559 * during the previous match, or <tt>null</tt> if the group 560 * failed to match part of the input 561 * 562 * @throws IllegalStateException 563 * If no match has yet been attempted, 564 * or if the previous match operation failed 565 * 566 * @throws IllegalArgumentException 567 * If there is no capturing group in the pattern 568 * with the given name 569 * @since 1.7 570 */ 571 public String group(String name) { 572 int group = getMatchedGroupIndex(name); 573 if ((groups[group*2] == -1) || (groups[group*2+1] == -1)) 574 return null; 575 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString(); 576 } 577 578 /** 579 * Returns the number of capturing groups in this matcher's pattern. 580 * 581 * <p> Group zero denotes the entire pattern by convention. It is not 582 * included in this count. 583 * 584 * <p> Any non-negative integer smaller than or equal to the value 585 * returned by this method is guaranteed to be a valid group index for 586 * this matcher. </p> 587 * 588 * @return The number of capturing groups in this matcher's pattern 589 */ 590 public int groupCount() { 591 return parentPattern.capturingGroupCount - 1; 592 } 593 594 /** 595 * Attempts to match the entire region against the pattern. 596 * 597 * <p> If the match succeeds then more information can be obtained via the 598 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 599 * 600 * @return <tt>true</tt> if, and only if, the entire region sequence 601 * matches this matcher's pattern 602 */ 603 public boolean matches() { 604 return match(from, ENDANCHOR); 605 } 606 607 /** 608 * Attempts to find the next subsequence of the input sequence that matches 609 * the pattern. 
610 * 611 * <p> This method starts at the beginning of this matcher's region, or, if 612 * a previous invocation of the method was successful and the matcher has 613 * not since been reset, at the first character not matched by the previous 614 * match. 615 * 616 * <p> If the match succeeds then more information can be obtained via the 617 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 618 * 619 * @return <tt>true</tt> if, and only if, a subsequence of the input 620 * sequence matches this matcher's pattern 621 */ 622 public boolean find() { 623 int nextSearchIndex = last; 624 if (nextSearchIndex == first) 625 nextSearchIndex++; 626 627 // If next search starts before region, start it at region 628 if (nextSearchIndex < from) 629 nextSearchIndex = from; 630 631 // If next search starts beyond region then it fails 632 if (nextSearchIndex > to) { 633 for (int i = 0; i < groups.length; i++) 634 groups[i] = -1; 635 return false; 636 } 637 return search(nextSearchIndex); 638 } 639 640 /** 641 * Resets this matcher and then attempts to find the next subsequence of 642 * the input sequence that matches the pattern, starting at the specified 643 * index. 644 * 645 * <p> If the match succeeds then more information can be obtained via the 646 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent 647 * invocations of the {@link #find()} method will start at the first 648 * character not matched by this match. </p> 649 * 650 * @throws IndexOutOfBoundsException 651 * If start is less than zero or if start is greater than the 652 * length of the input sequence. 
653 * 654 * @return <tt>true</tt> if, and only if, a subsequence of the input 655 * sequence starting at the given index matches this matcher's 656 * pattern 657 */ 658 public boolean find(int start) { 659 int limit = getTextLength(); 660 if ((start < 0) || (start > limit)) 661 throw new IndexOutOfBoundsException("Illegal start index"); 662 reset(); 663 return search(start); 664 } 665 666 /** 667 * Attempts to match the input sequence, starting at the beginning of the 668 * region, against the pattern. 669 * 670 * <p> Like the {@link #matches matches} method, this method always starts 671 * at the beginning of the region; unlike that method, it does not 672 * require that the entire region be matched. 673 * 674 * <p> If the match succeeds then more information can be obtained via the 675 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p> 676 * 677 * @return <tt>true</tt> if, and only if, a prefix of the input 678 * sequence matches this matcher's pattern 679 */ 680 public boolean lookingAt() { 681 return match(from, NOANCHOR); 682 } 683 684 /** 685 * Returns a literal replacement <code>String</code> for the specified 686 * <code>String</code>. 687 * 688 * This method produces a <code>String</code> that will work 689 * as a literal replacement <code>s</code> in the 690 * <code>appendReplacement</code> method of the {@link Matcher} class. 691 * The <code>String</code> produced will match the sequence of characters 692 * in <code>s</code> treated as a literal sequence. Slashes ('\') and 693 * dollar signs ('$') will be given no special meaning. 
694 * 695 * @param s The string to be literalized 696 * @return A literal string replacement 697 * @since 1.5 698 */ 699 public static String quoteReplacement(String s) { 700 if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1)) 701 return s; 702 StringBuilder sb = new StringBuilder(); 703 for (int i=0; i<s.length(); i++) { 704 char c = s.charAt(i); 705 if (c == '\\' || c == '$') { 706 sb.append('\\'); 707 } 708 sb.append(c); 709 } 710 return sb.toString(); 711 } 712 713 /** 714 * Implements a non-terminal append-and-replace step. 715 * 716 * <p> This method performs the following actions: </p> 717 * 718 * <ol> 719 * 720 * <li><p> It reads characters from the input sequence, starting at the 721 * append position, and appends them to the given string buffer. It 722 * stops after reading the last character preceding the previous match, 723 * that is, the character at index {@link 724 * #start()} <tt>-</tt> <tt>1</tt>. </p></li> 725 * 726 * <li><p> It appends the given replacement string to the string buffer. 727 * </p></li> 728 * 729 * <li><p> It sets the append position of this matcher to the index of 730 * the last character matched, plus one, that is, to {@link #end()}. 731 * </p></li> 732 * 733 * </ol> 734 * 735 * <p> The replacement string may contain references to subsequences 736 * captured during the previous match: Each occurrence of 737 * <tt>${</tt><i>name</i><tt>}</tt> or <tt>$</tt><i>g</i> 738 * will be replaced by the result of evaluating the corresponding 739 * {@link #group(String) group(name)} or {@link #group(int) group(g)</tt>} 740 * respectively. For <tt>$</tt><i>g</i><tt></tt>, 741 * the first number after the <tt>$</tt> is always treated as part of 742 * the group reference. Subsequent numbers are incorporated into g if 743 * they would form a legal group reference. Only the numerals '0' 744 * through '9' are considered as potential components of the group 745 * reference. 
If the second group matched the string <tt>"foo"</tt>, for
     * example, then passing the replacement string <tt>"$2bar"</tt> would
     * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
     * sign (<tt>$</tt>) may be included as a literal in the replacement
     * string by preceding it with a backslash (<tt>\$</tt>).
     *
     * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
     * the replacement string may cause the results to be different than if it
     * were being treated as a literal replacement string. Dollar signs may be
     * treated as references to captured subsequences as described above, and
     * backslashes are used to escape literal characters in the replacement
     * string.
     *
     * <p> The replacement string is processed completely before anything is
     * appended, so the string buffer is left untouched when this method
     * throws one of the exceptions listed below.
     *
     * <p> This method is intended to be used in a loop together with the
     * {@link #appendTail appendTail} and {@link #find find} methods. The
     * following code, for example, writes <tt>one dog two dogs in the
     * yard</tt> to the standard-output stream: </p>
     *
     * <blockquote><pre>
     * Pattern p = Pattern.compile("cat");
     * Matcher m = p.matcher("one cat two cats in the yard");
     * StringBuffer sb = new StringBuffer();
     * while (m.find()) {
     *     m.appendReplacement(sb, "dog");
     * }
     * m.appendTail(sb);
     * System.out.println(sb.toString());</pre></blockquote>
     *
     * @param  sb
     *         The target string buffer
     *
     * @param  replacement
     *         The replacement string
     *
     * @return  This matcher
     *
     * @throws  IllegalStateException
     *          If no match has yet been attempted,
     *          or if the previous match operation failed
     *
     * @throws  IllegalArgumentException
     *          If the replacement string refers to a named-capturing
     *          group that does not exist in the pattern, if it ends with
     *          an unescaped backslash or dollar sign, if a dollar sign is
     *          followed by neither a digit nor <tt>{</tt>, or if a
     *          <tt>${</tt><i>name</i><tt>}</tt> reference is empty,
     *          starts with a digit, or lacks the closing brace
     *
     * @throws  IndexOutOfBoundsException
     *          If the replacement string refers to a capturing group
     *          that does not exist in the pattern
     */
    public Matcher appendReplacement(StringBuffer sb, String replacement) {

        // Group references can only be resolved after a successful match
        if (first < 0)
            throw new IllegalStateException("No match available");

        // Process substitution string to replace group references with groups.
        // The expansion is collected in a scratch builder and copied to sb
        // only at the end.
        int cursor = 0;
        StringBuilder result = new StringBuilder();

        while (cursor < replacement.length()) {
            char nextChar = replacement.charAt(cursor);
            if (nextChar == '\\') {
                // Backslash: copy the following character literally
                cursor++;
                if (cursor == replacement.length())
                    throw new IllegalArgumentException(
                        "character to be escaped is missing");
                nextChar = replacement.charAt(cursor);
                result.append(nextChar);
                cursor++;
            } else if (nextChar == '$') {
                // Skip past $
                cursor++;
                // Throw IAE if this "$" is the last character in replacement
                if (cursor == replacement.length())
                   throw new IllegalArgumentException(
                        "Illegal group reference: group index is missing");
                nextChar = replacement.charAt(cursor);
                int refNum = -1;
                if (nextChar == '{') {
                    // Named reference ${name}: collect the characters accepted
                    // by ASCII.isLower/isUpper/isDigit
                    cursor++;
                    StringBuilder gsb = new StringBuilder();
                    while (cursor < replacement.length()) {
                        nextChar = replacement.charAt(cursor);
                        if (ASCII.isLower(nextChar) ||
                            ASCII.isUpper(nextChar) ||
                            ASCII.isDigit(nextChar)) {
                            gsb.append(nextChar);
                            cursor++;
                        } else {
                            break;
                        }
                    }
                    if (gsb.length() == 0)
                        throw new IllegalArgumentException(
                            "named capturing group has 0 length name");
                    // nextChar is the character that stopped the scan, or the
                    // last name character if the replacement ran out
                    if (nextChar != '}')
                        throw new IllegalArgumentException(
                            "named capturing group is missing trailing '}'");
                    String gname = gsb.toString();
                    if (ASCII.isDigit(gname.charAt(0)))
                        throw new IllegalArgumentException(
                            "capturing group name {" + gname +
                            "} starts with digit character");
                    if (!parentPattern.namedGroups().containsKey(gname))
                        throw new IllegalArgumentException(
                            "No group with name {" + gname + "}");
                    refNum = parentPattern.namedGroups().get(gname);
                    // Skip the closing '}'
                    cursor++;
                } else {
                    // The first number is always a group
                    refNum = (int)nextChar - '0';
                    if ((refNum < 0)||(refNum > 9))
                        throw new IllegalArgumentException(
                            "Illegal group reference");
                    cursor++;
                    // Capture the largest legal group string: keep taking
                    // digits while the longer number still names a group
                    boolean done = false;
                    while (!done) {
                        if (cursor >= replacement.length()) {
                            break;
                        }
                        int nextDigit = replacement.charAt(cursor) - '0';
                        if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
                            break;
                        }
                        int newRefNum = (refNum * 10) + nextDigit;
                        if (groupCount() < newRefNum) {
                            // This digit belongs to the literal text
                            done = true;
                        } else {
                            refNum = newRefNum;
                            cursor++;
                        }
                    }
                }
                // Append group; a group that did not participate in the match
                // (offset -1) contributes nothing. start(int) rejects a group
                // number above groupCount() with IndexOutOfBoundsException.
                if (start(refNum) != -1 && end(refNum) != -1)
                    result.append(text, start(refNum), end(refNum));
            } else {
                // Ordinary character: copy as is
                result.append(nextChar);
                cursor++;
            }
        }
        // Append the intervening text
        sb.append(text, lastAppendPosition, first);
        // Append the match substitution
        sb.append(result);

        // The next append starts right after this match
        lastAppendPosition = last;
        return this;
    }

    /**
     * Implements a terminal append-and-replace step.
     *
     * <p> This method reads characters from the input sequence, starting at
     * the append position, and appends them to the given string buffer. It is
     * intended to be invoked after one or more invocations of the {@link
     * #appendReplacement appendReplacement} method in order to copy the
     * remainder of the input sequence. </p>
     *
     * <p> The append position itself is not changed by this method. </p>
     *
     * @param  sb
     *         The target string buffer
     *
     * @return  The target string buffer
     */
    public StringBuffer appendTail(StringBuffer sb) {
        sb.append(text, lastAppendPosition, getTextLength());
        return sb;
    }

    /**
     * Replaces every subsequence of the input sequence that matches the
     * pattern with the given replacement string.
     *
     * <p> This method first resets this matcher. It then scans the input
     * sequence looking for matches of the pattern.
Characters that are not 920 * part of any match are appended directly to the result string; each match 921 * is replaced in the result by the replacement string. The replacement 922 * string may contain references to captured subsequences as in the {@link 923 * #appendReplacement appendReplacement} method. 924 * 925 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 926 * the replacement string may cause the results to be different than if it 927 * were being treated as a literal replacement string. Dollar signs may be 928 * treated as references to captured subsequences as described above, and 929 * backslashes are used to escape literal characters in the replacement 930 * string. 931 * 932 * <p> Given the regular expression <tt>a*b</tt>, the input 933 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string 934 * <tt>"-"</tt>, an invocation of this method on a matcher for that 935 * expression would yield the string <tt>"-foo-foo-foo-"</tt>. 936 * 937 * <p> Invoking this method changes this matcher's state. If the matcher 938 * is to be used in further matching operations then it should first be 939 * reset. </p> 940 * 941 * @param replacement 942 * The replacement string 943 * 944 * @return The string constructed by replacing each matching subsequence 945 * by the replacement string, substituting captured subsequences 946 * as needed 947 */ 948 public String replaceAll(String replacement) { 949 reset(); 950 boolean result = find(); 951 if (result) { 952 StringBuffer sb = new StringBuffer(); 953 do { 954 appendReplacement(sb, replacement); 955 result = find(); 956 } while (result); 957 appendTail(sb); 958 return sb.toString(); 959 } 960 return text.toString(); 961 } 962 963 /** 964 * Replaces the first subsequence of the input sequence that matches the 965 * pattern with the given replacement string. 966 * 967 * <p> This method first resets this matcher. It then scans the input 968 * sequence looking for a match of the pattern. 
Characters that are not 969 * part of the match are appended directly to the result string; the match 970 * is replaced in the result by the replacement string. The replacement 971 * string may contain references to captured subsequences as in the {@link 972 * #appendReplacement appendReplacement} method. 973 * 974 * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in 975 * the replacement string may cause the results to be different than if it 976 * were being treated as a literal replacement string. Dollar signs may be 977 * treated as references to captured subsequences as described above, and 978 * backslashes are used to escape literal characters in the replacement 979 * string. 980 * 981 * <p> Given the regular expression <tt>dog</tt>, the input 982 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string 983 * <tt>"cat"</tt>, an invocation of this method on a matcher for that 984 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p> 985 * 986 * <p> Invoking this method changes this matcher's state. If the matcher 987 * is to be used in further matching operations then it should first be 988 * reset. </p> 989 * 990 * @param replacement 991 * The replacement string 992 * @return The string constructed by replacing the first matching 993 * subsequence by the replacement string, substituting captured 994 * subsequences as needed 995 */ 996 public String replaceFirst(String replacement) { 997 if (replacement == null) 998 throw new NullPointerException("replacement"); 999 reset(); 1000 if (!find()) 1001 return text.toString(); 1002 StringBuffer sb = new StringBuffer(); 1003 appendReplacement(sb, replacement); 1004 appendTail(sb); 1005 return sb.toString(); 1006 } 1007 1008 /** 1009 * Sets the limits of this matcher's region. The region is the part of the 1010 * input sequence that will be searched to find a match. 
Invoking this 1011 * method resets the matcher, and then sets the region to start at the 1012 * index specified by the <code>start</code> parameter and end at the 1013 * index specified by the <code>end</code> parameter. 1014 * 1015 * <p>Depending on the transparency and anchoring being used (see 1016 * {@link #useTransparentBounds useTransparentBounds} and 1017 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such 1018 * as anchors may behave differently at or around the boundaries of the 1019 * region. 1020 * 1021 * @param start 1022 * The index to start searching at (inclusive) 1023 * @param end 1024 * The index to end searching at (exclusive) 1025 * @throws IndexOutOfBoundsException 1026 * If start or end is less than zero, if 1027 * start is greater than the length of the input sequence, if 1028 * end is greater than the length of the input sequence, or if 1029 * start is greater than end. 1030 * @return this matcher 1031 * @since 1.5 1032 */ 1033 public Matcher region(int start, int end) { 1034 if ((start < 0) || (start > getTextLength())) 1035 throw new IndexOutOfBoundsException("start"); 1036 if ((end < 0) || (end > getTextLength())) 1037 throw new IndexOutOfBoundsException("end"); 1038 if (start > end) 1039 throw new IndexOutOfBoundsException("start > end"); 1040 reset(); 1041 from = start; 1042 to = end; 1043 return this; 1044 } 1045 1046 /** 1047 * Reports the start index of this matcher's region. The 1048 * searches this matcher conducts are limited to finding matches 1049 * within {@link #regionStart regionStart} (inclusive) and 1050 * {@link #regionEnd regionEnd} (exclusive). 1051 * 1052 * @return The starting point of this matcher's region 1053 * @since 1.5 1054 */ 1055 public int regionStart() { 1056 return from; 1057 } 1058 1059 /** 1060 * Reports the end index (exclusive) of this matcher's region. 
1061 * The searches this matcher conducts are limited to finding matches 1062 * within {@link #regionStart regionStart} (inclusive) and 1063 * {@link #regionEnd regionEnd} (exclusive). 1064 * 1065 * @return the ending point of this matcher's region 1066 * @since 1.5 1067 */ 1068 public int regionEnd() { 1069 return to; 1070 } 1071 1072 /** 1073 * Queries the transparency of region bounds for this matcher. 1074 * 1075 * <p> This method returns <tt>true</tt> if this matcher uses 1076 * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i> 1077 * bounds. 1078 * 1079 * <p> See {@link #useTransparentBounds useTransparentBounds} for a 1080 * description of transparent and opaque bounds. 1081 * 1082 * <p> By default, a matcher uses opaque region boundaries. 1083 * 1084 * @return <tt>true</tt> iff this matcher is using transparent bounds, 1085 * <tt>false</tt> otherwise. 1086 * @see java.util.regex.Matcher#useTransparentBounds(boolean) 1087 * @since 1.5 1088 */ 1089 public boolean hasTransparentBounds() { 1090 return transparentBounds; 1091 } 1092 1093 /** 1094 * Sets the transparency of region bounds for this matcher. 1095 * 1096 * <p> Invoking this method with an argument of <tt>true</tt> will set this 1097 * matcher to use <i>transparent</i> bounds. If the boolean 1098 * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used. 1099 * 1100 * <p> Using transparent bounds, the boundaries of this 1101 * matcher's region are transparent to lookahead, lookbehind, 1102 * and boundary matching constructs. Those constructs can see beyond the 1103 * boundaries of the region to see if a match is appropriate. 1104 * 1105 * <p> Using opaque bounds, the boundaries of this matcher's 1106 * region are opaque to lookahead, lookbehind, and boundary matching 1107 * constructs that may try to see beyond them. Those constructs cannot 1108 * look past the boundaries so they will fail to match anything outside 1109 * of the region. 
1110 * 1111 * <p> By default, a matcher uses opaque bounds. 1112 * 1113 * @param b a boolean indicating whether to use opaque or transparent 1114 * regions 1115 * @return this matcher 1116 * @see java.util.regex.Matcher#hasTransparentBounds 1117 * @since 1.5 1118 */ 1119 public Matcher useTransparentBounds(boolean b) { 1120 transparentBounds = b; 1121 return this; 1122 } 1123 1124 /** 1125 * Queries the anchoring of region bounds for this matcher. 1126 * 1127 * <p> This method returns <tt>true</tt> if this matcher uses 1128 * <i>anchoring</i> bounds, <tt>false</tt> otherwise. 1129 * 1130 * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a 1131 * description of anchoring bounds. 1132 * 1133 * <p> By default, a matcher uses anchoring region boundaries. 1134 * 1135 * @return <tt>true</tt> iff this matcher is using anchoring bounds, 1136 * <tt>false</tt> otherwise. 1137 * @see java.util.regex.Matcher#useAnchoringBounds(boolean) 1138 * @since 1.5 1139 */ 1140 public boolean hasAnchoringBounds() { 1141 return anchoringBounds; 1142 } 1143 1144 /** 1145 * Sets the anchoring of region bounds for this matcher. 1146 * 1147 * <p> Invoking this method with an argument of <tt>true</tt> will set this 1148 * matcher to use <i>anchoring</i> bounds. If the boolean 1149 * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be 1150 * used. 1151 * 1152 * <p> Using anchoring bounds, the boundaries of this 1153 * matcher's region match anchors such as ^ and $. 1154 * 1155 * <p> Without anchoring bounds, the boundaries of this 1156 * matcher's region will not match anchors such as ^ and $. 1157 * 1158 * <p> By default, a matcher uses anchoring region boundaries. 1159 * 1160 * @param b a boolean indicating whether or not to use anchoring bounds. 
1161 * @return this matcher 1162 * @see java.util.regex.Matcher#hasAnchoringBounds 1163 * @since 1.5 1164 */ 1165 public Matcher useAnchoringBounds(boolean b) { 1166 anchoringBounds = b; 1167 return this; 1168 } 1169 1170 /** 1171 * <p>Returns the string representation of this matcher. The 1172 * string representation of a <code>Matcher</code> contains information 1173 * that may be useful for debugging. The exact format is unspecified. 1174 * 1175 * @return The string representation of this matcher 1176 * @since 1.5 1177 */ 1178 public String toString() { 1179 StringBuilder sb = new StringBuilder(); 1180 sb.append("java.util.regex.Matcher"); 1181 sb.append("[pattern=" + pattern()); 1182 sb.append(" region="); 1183 sb.append(regionStart() + "," + regionEnd()); 1184 sb.append(" lastmatch="); 1185 if ((first >= 0) && (group() != null)) { 1186 sb.append(group()); 1187 } 1188 sb.append("]"); 1189 return sb.toString(); 1190 } 1191 1192 /** 1193 * <p>Returns true if the end of input was hit by the search engine in 1194 * the last match operation performed by this matcher. 1195 * 1196 * <p>When this method returns true, then it is possible that more input 1197 * would have changed the result of the last search. 1198 * 1199 * @return true iff the end of input was hit in the last match; false 1200 * otherwise 1201 * @since 1.5 1202 */ 1203 public boolean hitEnd() { 1204 return hitEnd; 1205 } 1206 1207 /** 1208 * <p>Returns true if more input could change a positive match into a 1209 * negative one. 1210 * 1211 * <p>If this method returns true, and a match was found, then more 1212 * input could cause the match to be lost. If this method returns false 1213 * and a match was found, then more input might change the match but the 1214 * match won't be lost. If a match was not found, then requireEnd has no 1215 * meaning. 1216 * 1217 * @return true iff more input could change a positive match into a 1218 * negative one. 
* @since 1.5
     */
    public boolean requireEnd() {
        return this.requireEnd;
    }

    /**
     * Starts a search for the pattern at the given index.
     * All group slots are reset to -1 and matching is delegated to the root
     * node of the pattern's state machine, which records the state of the
     * match in this matcher as it proceeds.
     *
     * Matcher.from is not set here, because it is the "hard" boundary
     * of the start of the search which anchors will set to. The from param
     * is the "soft" boundary of the start of the search, meaning that the
     * regex tries to match at that index but ^ won't match there. Subsequent
     * calls to the search methods start at a new "soft" boundary which is
     * the end of the previous match.
     *
     * @param  from  the index to start at; negative values are treated as 0
     * @return the result reported by the root node; on failure
     *         <tt>first</tt> is set back to -1
     */
    boolean search(int from) {
        hitEnd = false;
        requireEnd = false;
        if (from < 0) {
            from = 0;
        }
        first = from;
        if (oldLast < 0) {
            oldLast = from;
        }
        for (int slot = 0; slot < groups.length; slot++) {
            groups[slot] = -1;
        }
        acceptMode = NOANCHOR;
        boolean matched = parentPattern.root.match(this, from, text);
        if (!matched) {
            first = -1;
        }
        oldLast = last;
        return matched;
    }

    /**
     * Starts an anchored match attempt of the pattern at the given index.
     * All group slots are reset to -1 and matching is delegated to the
     * pattern's match root, which records the state of the match in this
     * matcher as it proceeds.
     *
     * @param  from    the index to start at; negative values are treated as 0
     * @param  anchor  the accept mode to use for this attempt
     * @return the result reported by the match root; on failure
     *         <tt>first</tt> is set back to -1
     */
    boolean match(int from, int anchor) {
        hitEnd = false;
        requireEnd = false;
        if (from < 0) {
            from = 0;
        }
        first = from;
        if (oldLast < 0) {
            oldLast = from;
        }
        for (int slot = 0; slot < groups.length; slot++) {
            groups[slot] = -1;
        }
        acceptMode = anchor;
        boolean matched = parentPattern.matchRoot.match(this, from, text);
        if (!matched) {
            first = -1;
        }
        oldLast = last;
        return matched;
    }

    /**
     * Returns the length of the input text.
     *
     * @return the index after the last character in the text
     */
    int getTextLength() {
        return this.text.length();
    }

    /**
     * Returns the part of this Matcher's input in the specified range.
     *
     * @param  beginIndex   the beginning index, inclusive
     * @param  endIndex     the ending index, exclusive
     * @return the subsequence of this Matcher's input in that range
     */
    CharSequence getSubSequence(int beginIndex, int endIndex) {
        return this.text.subSequence(beginIndex, endIndex);
    }

    /**
     * Returns this Matcher's input character at index i.
     *
     * @param  i  the index of the character to read
     * @return A char from the specified index
     */
    char charAt(int i) {
        return this.text.charAt(i);
    }

    /**
     * Returns the index of the named capturing group with the given name.
     *
     * @param  name  the group name; must not be null
     * @return the index of the named-capturing group
     * @throws NullPointerException      if <tt>name</tt> is null
     * @throws IllegalStateException     if no match is currently available
     * @throws IllegalArgumentException  if the pattern has no group with
     *                                   that name
     */
    int getMatchedGroupIndex(String name) {
        Objects.requireNonNull(name, "Group name");
        if (first < 0) {
            throw new IllegalStateException("No match found");
        }
        if (!parentPattern.namedGroups().containsKey(name)) {
            throw new IllegalArgumentException("No group with name <" + name + ">");
        }
        return parentPattern.namedGroups().get(name);
    }
}