48 *
49 * </ul>
50 *
51 * <p> Each of these methods returns a boolean indicating success or failure.
52 * More information about a successful match can be obtained by querying the
53 * state of the matcher.
54 *
55 * <p> A matcher finds matches in a subset of its input called the
56 * <i>region</i>. By default, the region contains all of the matcher's input.
57 * The region can be modified via the {@link #region region} method and queried
58 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
59 * methods. The way that the region boundaries interact with some pattern
60 * constructs can be changed. See {@link #useAnchoringBounds
61 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
62 * for more details.
63 *
64 * <p> This class also defines methods for replacing matched subsequences with
65 * new strings whose contents can, if desired, be computed from the match
66 * result. The {@link #appendReplacement appendReplacement} and {@link
67 * #appendTail appendTail} methods can be used in tandem in order to collect
68 * the result into an existing string buffer, or the more convenient {@link
69 * #replaceAll replaceAll} method can be used to create a string in which every
70 * matching subsequence in the input sequence is replaced.
71 *
72 * <p> The explicit state of a matcher includes the start and end indices of
73 * the most recent successful match. It also includes the start and end
74 * indices of the input subsequence captured by each <a
75 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
76 * count of such subsequences. As a convenience, methods are also provided for
77 * returning these captured subsequences in string form.
78 *
79 * <p> The explicit state of a matcher is initially undefined; attempting to
80 * query any part of it before a successful match will cause an {@link
81 * IllegalStateException} to be thrown. The explicit state of a matcher is
82 * recomputed by every match operation.
83 *
84 * <p> The implicit state of a matcher includes the input character sequence as
85 * well as the <i>append position</i>, which is initially zero and is updated
86 * by the {@link #appendReplacement appendReplacement} method.
87 *
88 * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
89 * method or, if a new input sequence is desired, its {@link
90 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
775 * The target string buffer
776 *
777 * @param replacement
778 * The replacement string
779 *
780 * @return This matcher
781 *
782 * @throws IllegalStateException
783 * If no match has yet been attempted,
784 * or if the previous match operation failed
785 *
786 * @throws IllegalArgumentException
787 * If the replacement string refers to a named-capturing
788 * group that does not exist in the pattern
789 *
790 * @throws IndexOutOfBoundsException
791 * If the replacement string refers to a capturing group
792 * that does not exist in the pattern
793 */
794 public Matcher appendReplacement(StringBuffer sb, String replacement) {
795
796 // If no match, return error
797 if (first < 0)
798 throw new IllegalStateException("No match available");
799
800 // Process substitution string to replace group references with groups
801 int cursor = 0;
802 StringBuilder result = new StringBuilder();
803
804 while (cursor < replacement.length()) {
805 char nextChar = replacement.charAt(cursor);
806 if (nextChar == '\\') {
807 cursor++;
808 if (cursor == replacement.length())
809 throw new IllegalArgumentException(
810 "character to be escaped is missing");
811 nextChar = replacement.charAt(cursor);
812 result.append(nextChar);
813 cursor++;
814 } else if (nextChar == '$') {
815 // Skip past $
816 cursor++;
817 // Throw IAE if this "$" is the last character in replacement
818 if (cursor == replacement.length())
819 throw new IllegalArgumentException(
820 "Illegal group reference: group index is missing");
821 nextChar = replacement.charAt(cursor);
822 int refNum = -1;
823 if (nextChar == '{') {
835 }
836 }
837 if (gsb.length() == 0)
838 throw new IllegalArgumentException(
839 "named capturing group has 0 length name");
840 if (nextChar != '}')
841 throw new IllegalArgumentException(
842 "named capturing group is missing trailing '}'");
843 String gname = gsb.toString();
844 if (ASCII.isDigit(gname.charAt(0)))
845 throw new IllegalArgumentException(
846 "capturing group name {" + gname +
847 "} starts with digit character");
848 if (!parentPattern.namedGroups().containsKey(gname))
849 throw new IllegalArgumentException(
850 "No group with name {" + gname + "}");
851 refNum = parentPattern.namedGroups().get(gname);
852 cursor++;
853 } else {
854 // The first number is always a group
855 refNum = (int)nextChar - '0';
856 if ((refNum < 0)||(refNum > 9))
857 throw new IllegalArgumentException(
858 "Illegal group reference");
859 cursor++;
860 // Capture the largest legal group string
861 boolean done = false;
862 while (!done) {
863 if (cursor >= replacement.length()) {
864 break;
865 }
866 int nextDigit = replacement.charAt(cursor) - '0';
867 if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
868 break;
869 }
870 int newRefNum = (refNum * 10) + nextDigit;
871 if (groupCount() < newRefNum) {
872 done = true;
873 } else {
874 refNum = newRefNum;
875 cursor++;
876 }
877 }
878 }
879 // Append group
880 if (start(refNum) != -1 && end(refNum) != -1)
881 result.append(text, start(refNum), end(refNum));
882 } else {
883 result.append(nextChar);
884 cursor++;
885 }
886 }
887 // Append the intervening text
888 sb.append(text, lastAppendPosition, first);
889 // Append the match substitution
890 sb.append(result);
891
892 lastAppendPosition = last;
893 return this;
894 }
895
896 /**
897 * Implements a terminal append-and-replace step.
898 *
899 * <p> This method reads characters from the input sequence, starting at
900 * the append position, and appends them to the given string buffer. It is
901 * intended to be invoked after one or more invocations of the {@link
902 * #appendReplacement appendReplacement} method in order to copy the
903 * remainder of the input sequence. </p>
904 *
905 * @param sb
906 * The target string buffer
907 *
908 * @return The target string buffer
909 */
910 public StringBuffer appendTail(StringBuffer sb) {
911 sb.append(text, lastAppendPosition, getTextLength());
912 return sb;
913 }
914
915 /**
916 * Replaces every subsequence of the input sequence that matches the
917 * pattern with the given replacement string.
918 *
919 * <p> This method first resets this matcher. It then scans the input
920 * sequence looking for matches of the pattern. Characters that are not
921 * part of any match are appended directly to the result string; each match
922 * is replaced in the result by the replacement string. The replacement
923 * string may contain references to captured subsequences as in the {@link
924 * #appendReplacement appendReplacement} method.
925 *
926 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
927 * the replacement string may cause the results to be different than if it
928 * were being treated as a literal replacement string. Dollar signs may be
929 * treated as references to captured subsequences as described above, and
930 * backslashes are used to escape literal characters in the replacement
931 * string.
932 *
933 * <p> Given the regular expression <tt>a*b</tt>, the input
934 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
935 * <tt>"-"</tt>, an invocation of this method on a matcher for that
936 * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
937 *
938 * <p> Invoking this method changes this matcher's state. If the matcher
939 * is to be used in further matching operations then it should first be
940 * reset. </p>
941 *
942 * @param replacement
943 * The replacement string
944 *
945 * @return The string constructed by replacing each matching subsequence
946 * by the replacement string, substituting captured subsequences
947 * as needed
948 */
949 public String replaceAll(String replacement) {
950 reset();
951 boolean result = find();
952 if (result) {
953 StringBuffer sb = new StringBuffer();
954 do {
955 appendReplacement(sb, replacement);
956 result = find();
957 } while (result);
958 appendTail(sb);
959 return sb.toString();
960 }
961 return text.toString();
962 }
963
964 /**
965 * Replaces the first subsequence of the input sequence that matches the
966 * pattern with the given replacement string.
967 *
968 * <p> This method first resets this matcher. It then scans the input
969 * sequence looking for a match of the pattern. Characters that are not
970 * part of the match are appended directly to the result string; the match
971 * is replaced in the result by the replacement string. The replacement
972 * string may contain references to captured subsequences as in the {@link
973 * #appendReplacement appendReplacement} method.
983 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
984 * <tt>"cat"</tt>, an invocation of this method on a matcher for that
985 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p>
986 *
987 * <p> Invoking this method changes this matcher's state. If the matcher
988 * is to be used in further matching operations then it should first be
989 * reset. </p>
990 *
991 * @param replacement
992 * The replacement string
993 * @return The string constructed by replacing the first matching
994 * subsequence by the replacement string, substituting captured
995 * subsequences as needed
996 */
997 public String replaceFirst(String replacement) {
998 if (replacement == null)
999 throw new NullPointerException("replacement");
1000 reset();
1001 if (!find())
1002 return text.toString();
1003 StringBuffer sb = new StringBuffer();
1004 appendReplacement(sb, replacement);
1005 appendTail(sb);
1006 return sb.toString();
1007 }
1008
1009 /**
1010 * Sets the limits of this matcher's region. The region is the part of the
1011 * input sequence that will be searched to find a match. Invoking this
1012 * method resets the matcher, and then sets the region to start at the
1013 * index specified by the <code>start</code> parameter and end at the
1014 * index specified by the <code>end</code> parameter.
1015 *
1016 * <p>Depending on the transparency and anchoring being used (see
1017 * {@link #useTransparentBounds useTransparentBounds} and
1018 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
1019 * as anchors may behave differently at or around the boundaries of the
1020 * region.
1021 *
1022 * @param start
1023 * The index to start searching at (inclusive)
|
48 *
49 * </ul>
50 *
51 * <p> Each of these methods returns a boolean indicating success or failure.
52 * More information about a successful match can be obtained by querying the
53 * state of the matcher.
54 *
55 * <p> A matcher finds matches in a subset of its input called the
56 * <i>region</i>. By default, the region contains all of the matcher's input.
57 * The region can be modified via the {@link #region region} method and queried
58 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
59 * methods. The way that the region boundaries interact with some pattern
60 * constructs can be changed. See {@link #useAnchoringBounds
61 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
62 * for more details.
63 *
64 * <p> This class also defines methods for replacing matched subsequences with
65 * new strings whose contents can, if desired, be computed from the match
66 * result. The {@link #appendReplacement appendReplacement} and {@link
67 * #appendTail appendTail} methods can be used in tandem in order to collect
68 * the result into an existing string buffer or string builder. Alternatively,
69 * the more convenient {@link #replaceAll replaceAll} method can be used to
70 * create a string in which every matching subsequence in the input sequence
71 * is replaced.
72 *
73 * <p> The explicit state of a matcher includes the start and end indices of
74 * the most recent successful match. It also includes the start and end
75 * indices of the input subsequence captured by each <a
76 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
77 * count of such subsequences. As a convenience, methods are also provided for
78 * returning these captured subsequences in string form.
79 *
80 * <p> The explicit state of a matcher is initially undefined; attempting to
81 * query any part of it before a successful match will cause an {@link
82 * IllegalStateException} to be thrown. The explicit state of a matcher is
83 * recomputed by every match operation.
84 *
85 * <p> The implicit state of a matcher includes the input character sequence as
86 * well as the <i>append position</i>, which is initially zero and is updated
87 * by the {@link #appendReplacement appendReplacement} method.
88 *
89 * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
90 * method or, if a new input sequence is desired, its {@link
91 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
776 * The target string buffer
777 *
778 * @param replacement
779 * The replacement string
780 *
781 * @return This matcher
782 *
783 * @throws IllegalStateException
784 * If no match has yet been attempted,
785 * or if the previous match operation failed
786 *
787 * @throws IllegalArgumentException
788 * If the replacement string refers to a named-capturing
789 * group that does not exist in the pattern
790 *
791 * @throws IndexOutOfBoundsException
792 * If the replacement string refers to a capturing group
793 * that does not exist in the pattern
794 */
795 public Matcher appendReplacement(StringBuffer sb, String replacement) {
796 // If no match, return error
797 if (first < 0)
798 throw new IllegalStateException("No match available");
799 StringBuilder result = new StringBuilder();
800 appendExpandedReplacement(replacement, result);
801 // Append the intervening text
802 sb.append(text, lastAppendPosition, first);
803 // Append the match substitution
804 sb.append(result);
805 lastAppendPosition = last;
806 return this;
807 }
808
809 /**
810 * Implements a non-terminal append-and-replace step.
811 *
812 * <p> This method performs the following actions: </p>
813 *
814 * <ol>
815 *
816 * <li><p> It reads characters from the input sequence, starting at the
817 * append position, and appends them to the given string builder. It
818 * stops after reading the last character preceding the previous match,
819 * that is, the character at index {@link
820 * #start()} <tt>-</tt> <tt>1</tt>. </p></li>
821 *
822 * <li><p> It appends the given replacement string to the string builder.
823 * </p></li>
824 *
825 * <li><p> It sets the append position of this matcher to the index of
826 * the last character matched, plus one, that is, to {@link #end()}.
827 * </p></li>
828 *
829 * </ol>
830 *
831 * <p> The replacement string may contain references to subsequences
832 * captured during the previous match: Each occurrence of
833 * <tt>$</tt><i>g</i> will be replaced by the result of
834 * evaluating {@link #group(int) group}<tt>(</tt><i>g</i><tt>)</tt>.
835 * The first number after the <tt>$</tt> is always treated as part of
836 * the group reference. Subsequent numbers are incorporated into g if
837 * they would form a legal group reference. Only the numerals '0'
838 * through '9' are considered as potential components of the group
839 * reference. If the second group matched the string <tt>"foo"</tt>, for
840 * example, then passing the replacement string <tt>"$2bar"</tt> would
841 * cause <tt>"foobar"</tt> to be appended to the string builder. A dollar
842 * sign (<tt>$</tt>) may be included as a literal in the replacement
843 * string by preceding it with a backslash (<tt>\$</tt>).
844 *
845 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
846 * the replacement string may cause the results to be different than if it
847 * were being treated as a literal replacement string. Dollar signs may be
848 * treated as references to captured subsequences as described above, and
849 * backslashes are used to escape literal characters in the replacement
850 * string.
851 *
852 * <p> This method is intended to be used in a loop together with the
853 * {@link #appendTail appendTail} and {@link #find find} methods. The
854 * following code, for example, writes <tt>one dog two dogs in the
855 * yard</tt> to the standard-output stream: </p>
856 *
857 * <blockquote><pre>
858 * Pattern p = Pattern.compile("cat");
859 * Matcher m = p.matcher("one cat two cats in the yard");
860 * StringBuilder sb = new StringBuilder();
861 * while (m.find()) {
862 * m.appendReplacement(sb, "dog");
863 * }
864 * m.appendTail(sb);
865 * System.out.println(sb.toString());</pre></blockquote>
866 *
867 * @param sb
868 * The target string builder
869 * @param replacement
870 * The replacement string
871 * @return This matcher
872 *
873 * @throws IllegalStateException
874 * If no match has yet been attempted,
875 * or if the previous match operation failed
876 * @throws IllegalArgumentException
877 * If the replacement string refers to a named-capturing
878 * group that does not exist in the pattern
879 * @throws IndexOutOfBoundsException
880 * If the replacement string refers to a capturing group
881 * that does not exist in the pattern
882 * @since 1.9
883 */
884 public Matcher appendReplacement(StringBuilder sb, String replacement) {
885 // If no match, return error
886 if (first < 0)
887 throw new IllegalStateException("No match available");
888 StringBuilder result = new StringBuilder();
889 appendExpandedReplacement(replacement, result);
890 // Append the intervening text
891 sb.append(text, lastAppendPosition, first);
892 // Append the match substitution
893 sb.append(result);
894 lastAppendPosition = last;
895 return this;
896 }
897
898 /**
899 * Processes replacement string to replace group references with
900 * groups.
901 */
902 private StringBuilder appendExpandedReplacement(
903 String replacement, StringBuilder result) {
904 int cursor = 0;
905 while (cursor < replacement.length()) {
906 char nextChar = replacement.charAt(cursor);
907 if (nextChar == '\\') {
908 cursor++;
909 if (cursor == replacement.length())
910 throw new IllegalArgumentException(
911 "character to be escaped is missing");
912 nextChar = replacement.charAt(cursor);
913 result.append(nextChar);
914 cursor++;
915 } else if (nextChar == '$') {
916 // Skip past $
917 cursor++;
918 // Throw IAE if this "$" is the last character in replacement
919 if (cursor == replacement.length())
920 throw new IllegalArgumentException(
921 "Illegal group reference: group index is missing");
922 nextChar = replacement.charAt(cursor);
923 int refNum = -1;
924 if (nextChar == '{') {
936 }
937 }
938 if (gsb.length() == 0)
939 throw new IllegalArgumentException(
940 "named capturing group has 0 length name");
941 if (nextChar != '}')
942 throw new IllegalArgumentException(
943 "named capturing group is missing trailing '}'");
944 String gname = gsb.toString();
945 if (ASCII.isDigit(gname.charAt(0)))
946 throw new IllegalArgumentException(
947 "capturing group name {" + gname +
948 "} starts with digit character");
949 if (!parentPattern.namedGroups().containsKey(gname))
950 throw new IllegalArgumentException(
951 "No group with name {" + gname + "}");
952 refNum = parentPattern.namedGroups().get(gname);
953 cursor++;
954 } else {
955 // The first number is always a group
956 refNum = nextChar - '0';
957 if ((refNum < 0) || (refNum > 9))
958 throw new IllegalArgumentException(
959 "Illegal group reference");
960 cursor++;
961 // Capture the largest legal group string
962 boolean done = false;
963 while (!done) {
964 if (cursor >= replacement.length()) {
965 break;
966 }
967 int nextDigit = replacement.charAt(cursor) - '0';
968 if ((nextDigit < 0) || (nextDigit > 9)) { // not a number
969 break;
970 }
971 int newRefNum = (refNum * 10) + nextDigit;
972 if (groupCount() < newRefNum) {
973 done = true;
974 } else {
975 refNum = newRefNum;
976 cursor++;
977 }
978 }
979 }
980 // Append group
981 if (start(refNum) != -1 && end(refNum) != -1)
982 result.append(text, start(refNum), end(refNum));
983 } else {
984 result.append(nextChar);
985 cursor++;
986 }
987 }
988 return result;
989 }
990
991 /**
992 * Implements a terminal append-and-replace step.
993 *
994 * <p> This method reads characters from the input sequence, starting at
995 * the append position, and appends them to the given string buffer. It is
996 * intended to be invoked after one or more invocations of the {@link
997 * #appendReplacement appendReplacement} method in order to copy the
998 * remainder of the input sequence. </p>
999 *
1000 * @param sb
1001 * The target string buffer
1002 *
1003 * @return The target string buffer
1004 */
1005 public StringBuffer appendTail(StringBuffer sb) {
1006 sb.append(text, lastAppendPosition, getTextLength());
1007 return sb;
1008 }
1009
1010 /**
1011 * Implements a terminal append-and-replace step.
1012 *
1013 * <p> This method reads characters from the input sequence, starting at
1014 * the append position, and appends them to the given string builder. It is
1015 * intended to be invoked after one or more invocations of the {@link
1016 * #appendReplacement appendReplacement} method in order to copy the
1017 * remainder of the input sequence. </p>
1018 *
1019 * @param sb
1020 * The target string builder
1021 *
1022 * @return The target string builder
1023 *
1024 * @since 1.9
1025 */
1026 public StringBuilder appendTail(StringBuilder sb) {
1027 sb.append(text, lastAppendPosition, getTextLength());
1028 return sb;
1029 }
1030
1031 /**
1032 * Replaces every subsequence of the input sequence that matches the
1033 * pattern with the given replacement string.
1034 *
1035 * <p> This method first resets this matcher. It then scans the input
1036 * sequence looking for matches of the pattern. Characters that are not
1037 * part of any match are appended directly to the result string; each match
1038 * is replaced in the result by the replacement string. The replacement
1039 * string may contain references to captured subsequences as in the {@link
1040 * #appendReplacement appendReplacement} method.
1041 *
1042 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
1043 * the replacement string may cause the results to be different than if it
1044 * were being treated as a literal replacement string. Dollar signs may be
1045 * treated as references to captured subsequences as described above, and
1046 * backslashes are used to escape literal characters in the replacement
1047 * string.
1048 *
1049 * <p> Given the regular expression <tt>a*b</tt>, the input
1050 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
1051 * <tt>"-"</tt>, an invocation of this method on a matcher for that
1052 * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
1053 *
1054 * <p> Invoking this method changes this matcher's state. If the matcher
1055 * is to be used in further matching operations then it should first be
1056 * reset. </p>
1057 *
1058 * @param replacement
1059 * The replacement string
1060 *
1061 * @return The string constructed by replacing each matching subsequence
1062 * by the replacement string, substituting captured subsequences
1063 * as needed
1064 */
1065 public String replaceAll(String replacement) {
1066 reset();
1067 boolean result = find();
1068 if (result) {
1069 StringBuilder sb = new StringBuilder();
1070 do {
1071 appendReplacement(sb, replacement);
1072 result = find();
1073 } while (result);
1074 appendTail(sb);
1075 return sb.toString();
1076 }
1077 return text.toString();
1078 }
1079
1080 /**
1081 * Replaces the first subsequence of the input sequence that matches the
1082 * pattern with the given replacement string.
1083 *
1084 * <p> This method first resets this matcher. It then scans the input
1085 * sequence looking for a match of the pattern. Characters that are not
1086 * part of the match are appended directly to the result string; the match
1087 * is replaced in the result by the replacement string. The replacement
1088 * string may contain references to captured subsequences as in the {@link
1089 * #appendReplacement appendReplacement} method.
1099 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
1100 * <tt>"cat"</tt>, an invocation of this method on a matcher for that
1101 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p>
1102 *
1103 * <p> Invoking this method changes this matcher's state. If the matcher
1104 * is to be used in further matching operations then it should first be
1105 * reset. </p>
1106 *
1107 * @param replacement
1108 * The replacement string
1109 * @return The string constructed by replacing the first matching
1110 * subsequence by the replacement string, substituting captured
1111 * subsequences as needed
1112 */
1113 public String replaceFirst(String replacement) {
1114 if (replacement == null)
1115 throw new NullPointerException("replacement");
1116 reset();
1117 if (!find())
1118 return text.toString();
1119 StringBuilder sb = new StringBuilder();
1120 appendReplacement(sb, replacement);
1121 appendTail(sb);
1122 return sb.toString();
1123 }
1124
1125 /**
1126 * Sets the limits of this matcher's region. The region is the part of the
1127 * input sequence that will be searched to find a match. Invoking this
1128 * method resets the matcher, and then sets the region to start at the
1129 * index specified by the <code>start</code> parameter and end at the
1130 * index specified by the <code>end</code> parameter.
1131 *
1132 * <p>Depending on the transparency and anchoring being used (see
1133 * {@link #useTransparentBounds useTransparentBounds} and
1134 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
1135 * as anchors may behave differently at or around the boundaries of the
1136 * region.
1137 *
1138 * @param start
1139 * The index to start searching at (inclusive)
|