1 //
   2 // Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 // This file contains test cases for regular expressions.
  25 // A test case consists of three lines:
  26 // The first line is a pattern used in the test
  27 // The second line is the input to search for the pattern in
  28 // The third line is a concatenation of the match, the number of groups,
  29 //     and the contents of the first four subexpressions.
  30 // Empty lines and lines beginning with comment slashes are ignored.
  31 //
  32 // Test unsetting of backed off groups
  33 ^(a)?a
  34 a
  35 true a 1
  36 
  37 ^(aa(bb)?)+$
  38 aabbaa
  39 true aabbaa 2 aa bb
  40 
  41 ((a|b)?b)+
  42 b
  43 true b 2 b
  44 
  45 (aaa)?aaa
  46 aaa
  47 true aaa 1
  48 
  49 ^(a(b)?)+$
  50 aba
  51 true aba 2 a b
  52 
  53 ^(a(b(c)?)?)?abc
  54 abc
  55 true abc 3
  56 
  57 ^(a(b(c))).*
  58 abc
  59 true abc 3 abc bc c
  60 
  61 // use of x modifier
  62 abc(?x)blah
  63 abcblah
  64 true abcblah 0
  65 
  66 abc(?x)  blah
  67 abcblah
  68 true abcblah 0
  69 
  70 abc(?x)  blah  blech
  71 abcblahblech
  72 true abcblahblech 0
  73 
  74 abc(?x)  blah # ignore comment
  75 abcblah
  76 true abcblah 0
  77 
  78 // Simple alternation
  79 a|b
  80 a
  81 true a 0
  82 
  83 a|b
  84 z
  85 false 0
  86 
  87 a|b
  88 b
  89 true b 0
  90 
  91 a|b|cd
  92 cd
  93 true cd 0
  94 
  95 a|ad
  96 ad
  97 true a 0
  98 
  99 z(a|ac)b
 100 zacb
 101 true zacb 1 ac
 102 
 103 // Simple char class
 104 [abc]+
 105 ababab
 106 true ababab 0
 107 
 108 [abc]+
 109 defg
 110 false 0
 111 
 112 [abc]+[def]+[ghi]+
 113 zzzaaddggzzz
 114 true aaddgg 0
 115 
 116 // Range char class
 117 [a-g]+
 118 zzzggg
 119 true ggg 0
 120 
 121 [a-g]+
 122 mmm
 123 false 0
 124 
 125 [a-]+
 126 za-9z
 127 true a- 0
 128 
 129 [a-\\u4444]+
 130 za-9z
 131 true za 0
 132 
 133 // Negated char class
 134 [^abc]+
 135 ababab
 136 false 0
 137 
 138 [^abc]+
 139 aaabbbcccdefg
 140 true defg 0
 141 
 142 // Making sure a ^ not in first position matches literal ^
 143 [abc^b]
 144 b
 145 true b 0
 146 
 147 [abc^b]
 148 ^
 149 true ^ 0
 150 
 151 // Class union and intersection
 152 [abc[def]]
 153 b
 154 true b 0
 155 
 156 [abc[def]]
 157 e
 158 true e 0
 159 
 160 [a-d[0-9][m-p]]
 161 a
 162 true a 0
 163 
 164 [a-d[0-9][m-p]]
 165 o
 166 true o 0
 167 
 168 [a-d[0-9][m-p]]
 169 4
 170 true 4 0
 171 
 172 [a-d[0-9][m-p]]
 173 e
 174 false 0
 175 
 176 [a-d[0-9][m-p]]
 177 u
 178 false 0
 179 
 180 [[a-d][0-9][m-p]]
 181 b
 182 true b 0
 183 
 184 [[a-d][0-9][m-p]]
 185 z
 186 false 0
 187 
 188 [a-c[d-f[g-i]]]
 189 a
 190 true a 0
 191 
 192 [a-c[d-f[g-i]]]
 193 e
 194 true e 0
 195 
 196 [a-c[d-f[g-i]]]
 197 h
 198 true h 0
 199 
 200 [a-c[d-f[g-i]]]
 201 m
 202 false 0
 203 
 204 [a-c[d-f[g-i]]m]
 205 m
 206 true m 0
 207 
 208 [abc[def]ghi]
 209 a
 210 true a 0
 211 
 212 [abc[def]ghi]
 213 d
 214 true d 0
 215 
 216 [abc[def]ghi]
 217 h
 218 true h 0
 219 
 220 [abc[def]ghi]
 221 w
 222 false 0
 223 
 224 [a-c&&[d-f]]
 225 a
 226 false 0
 227 
 228 [a-c&&[d-f]]
 229 e
 230 false 0
 231 
 232 [a-c&&[d-f]]
 233 z
 234 false 0
 235 
 236 [[a-c]&&[d-f]]
 237 a
 238 false 0
 239 
 240 [[a-c]&&[d-f]]
 241 e
 242 false 0
 243 
 244 [[a-c]&&[d-f]]
 245 z
 246 false 0
 247 
 248 [a-c&&d-f]
 249 a
 250 false 0
 251 
 252 [a-m&&m-z]
 253 m
 254 true m 0
 255 
 256 [a-m&&m-z&&a-c]
 257 m
 258 false 0
 259 
 260 [a-m&&m-z&&a-z]
 261 m
 262 true m 0
 263 
 264 [[a-m]&&[m-z]]
 265 a
 266 false 0
 267 
 268 [[a-m]&&[m-z]]
 269 m
 270 true m 0
 271 
 272 [[a-m]&&[m-z]]
 273 z
 274 false 0
 275 
 276 [[a-m]&&[^a-c]]
 277 a
 278 false 0
 279 
 280 [[a-m]&&[^a-c]]
 281 d
 282 true d 0
 283 
 284 [a-m&&[^a-c]]
 285 a
 286 false 0
 287 
 288 [a-m&&[^a-c]]
 289 d
 290 true d 0
 291 
 292 [a-cd-f&&[d-f]]
 293 a
 294 false 0
 295 
 296 [a-cd-f&&[d-f]]
 297 e
 298 true e 0
 299 
 300 [[a-c]&&d-fa-c]
 301 a
 302 true a 0
 303 
 304 [[a-c]&&[d-f][a-c]]
 305 a
 306 true a 0
 307 
 308 [[a-c][d-f]&&abc]
 309 a
 310 true a 0
 311 
 312 [[a-c][d-f]&&abc[def]]
 313 e
 314 true e 0
 315 
 316 [[a-c]&&[b-d]&&[c-e]]
 317 a
 318 false 0
 319 
 320 [[a-c]&&[b-d]&&[c-e]]
 321 c
 322 true c 0
 323 
 324 [[a-c]&&[b-d][c-e]&&[u-z]]
 325 c
 326 false 0
 327 
 328 [abc[^bcd]]
 329 a
 330 true a 0
 331 
 332 [abc[^bcd]]
 333 d
 334 false 0
 335 
 336 [a-c&&a-d&&a-eghi]
 337 b
 338 true b 0
 339 
 340 [a-c&&a-d&&a-eghi]
 341 g
 342 false 0
 343 
 344 [[a[b]]&&[b[a]]]
 345 a
 346 true a 0
 347 
 348 [[a]&&[b][c][a]&&[^d]]
 349 a
 350 true a 0
 351 
 352 [[a]&&[b][c][a]&&[^d]]
 353 d
 354 false 0
 355 
 356 [[[a-d]&&[c-f]]]
 357 a
 358 false 0
 359 
 360 [[[a-d]&&[c-f]]]
 361 c
 362 true c 0
 363 
 364 [[[a-d]&&[c-f]]&&[c]]
 365 c
 366 true c 0
 367 
 368 [[[a-d]&&[c-f]]&&[c]&&c]
 369 c
 370 true c 0
 371 
 372 [[[a-d]&&[c-f]]&&[c]&&c&&c]
 373 c
 374 true c 0
 375 
 376 [[[a-d]&&[c-f]]&&[c]&&c&&[cde]]
 377 c
 378 true c 0
 379 
 380 [z[abc&&bcd]]
 381 c
 382 true c 0
 383 
 384 [z[abc&&bcd]&&[u-z]]
 385 z
 386 true z 0
 387 
 388 [x[abc&&bcd[z]]&&[u-z]]
 389 z
 390 false 0
 391 
 392 [x[[wz]abc&&bcd[z]]&&[u-z]]
 393 z
 394 true z 0
 395 
 396 [[abc]&&[def]abc]
 397 a
 398 true a 0
 399 
 400 [[abc]&&[def]xyz[abc]]
 401 a
 402 true a 0
 403 
 404 \pL
 405 a
 406 true a 0
 407 
 408 \pL
 409 7
 410 false 0
 411 
 412 \p{L}
 413 a
 414 true a 0
 415 
 416 \p{LC}
 417 a
 418 true a 0
 419 
 420 \p{LC}
 421 A
 422 true A 0
 423 
 424 \p{IsL}
 425 a
 426 true a 0
 427 
 428 \p{IsLC}
 429 a
 430 true a 0
 431 
 432 \p{IsLC}
 433 A
 434 true A 0
 435 
 436 \p{IsLC}
 437 9
 438 false 0
 439 
 440 \P{IsLC}
 441 9
 442 true 9 0
 443 
 444 // Guillemet left is initial quote punctuation
 445 \p{Pi}
 446 \u00ab
 447 true \u00ab 0
 448 
 449 \P{Pi}
 450 \u00ac
 451 true \u00ac 0
 452 
 453 // Guillemet right is final quote punctuation
 454 \p{IsPf}
 455 \u00bb
 456 true \u00bb 0
 457 
 458 \p{P}
 459 \u00bb
 460 true \u00bb 0
 461 
 462 \p{P}+
 463 \u00bb
 464 true \u00bb 0
 465 
 466 \P{IsPf}
 467 \u00bc
 468 true \u00bc 0
 469 
 470 \P{IsP}
 471 \u00bc
 472 true \u00bc 0
 473 
 474 \p{L1}
 475 \u00bc
 476 true \u00bc 0
 477 
 478 \p{L1}+
 479 \u00bc
 480 true \u00bc 0
 481 
 482 \p{L1}
 483 \u02bc
 484 false 0
 485 
 486 \p{ASCII}
 487 a
 488 true a 0
 489 
 490 \p{IsASCII}
 491 a
 492 true a 0
 493 
 494 \p{IsASCII}
 495 \u0370
 496 false 0
 497 
 498 \pLbc
 499 abc
 500 true abc 0
 501 
 502 a[r\p{InGreek}]c
 503 a\u0370c
 504 true a\u0370c 0
 505 
 506 a\p{InGreek}
 507 a\u0370
 508 true a\u0370 0
 509 
 510 a\P{InGreek}
 511 a\u0370
 512 false 0
 513 
 514 a\P{InGreek}
 515 ab
 516 true ab 0
 517 
 518 a{^InGreek}
 519 -
 520 error
 521 
 522 a\p{^InGreek}
 523 -
 524 error
 525 
 526 a\P{^InGreek}
 527 -
 528 error
 529 
 530 a\p{InGreek}
 531 a\u0370
 532 true a\u0370 0
 533 
 534 a[\p{InGreek}]c
 535 a\u0370c
 536 true a\u0370c 0
 537 
 538 a[\P{InGreek}]c
 539 a\u0370c
 540 false 0
 541 
 542 a[\P{InGreek}]c
 543 abc
 544 true abc 0
 545 
 546 a[{^InGreek}]c
 547 anc
 548 true anc 0
 549 
 550 a[{^InGreek}]c
 551 azc
 552 false 0
 553 
 554 a[\p{^InGreek}]c
 555 -
 556 error
 557 
 558 a[\P{^InGreek}]c
 559 -
 560 error
 561 
 562 a[\p{InGreek}]
 563 a\u0370
 564 true a\u0370 0
 565 
 566 a[r\p{InGreek}]c
 567 arc
 568 true arc 0
 569 
 570 a[\p{InGreek}r]c
 571 arc
 572 true arc 0
 573 
 574 a[r\p{InGreek}]c
 575 arc
 576 true arc 0
 577 
 578 a[^\p{InGreek}]c
 579 a\u0370c
 580 false 0
 581 
 582 a[^\P{InGreek}]c
 583 a\u0370c
 584 true a\u0370c 0
 585 
 586 a[\p{InGreek}&&[^\u0370]]c
 587 a\u0370c
 588 false 0
 589 
 590 // Test the dot metacharacter
 591 a.c.+
 592 a#c%&
 593 true a#c%& 0
 594 
 595 ab.
 596 ab\n
 597 false 0
 598 
 599 (?s)ab.
 600 ab\n
 601 true ab\n 0
 602 
 603 a[\p{L}&&[\P{InGreek}]]c
 604 a\u6000c
 605 true a\u6000c 0
 606 
 607 a[\p{L}&&[\P{InGreek}]]c
 608 arc
 609 true arc 0
 610 
 611 a[\p{L}&&[\P{InGreek}]]c
 612 a\u0370c
 613 false 0
 614 
 615 a\p{InGreek}c
 616 a\u0370c
 617 true a\u0370c 0
 618 
 619 a\p{Sc}
 620 a$
 621 true a$ 0
 622 
 623 // Test the word char escape sequence
 624 ab\wc
 625 abcc
 626 true abcc 0
 627 
 628 \W\w\W
 629 #r#
 630 true #r# 0
 631 
 632 \W\w\W
 633 rrrr#ggg
 634 false 0
 635 
 636 abc[\w]
 637 abcd
 638 true abcd 0
 639 
 640 abc[\sdef]*
 641 abc  def
 642 true abc  def 0
 643 
 644 abc[\sy-z]*
 645 abc y z
 646 true abc y z 0
 647 
 648 abc[a-d\sm-p]*
 649 abcaa mn  p
 650 true abcaa mn  p 0
 651 
 652 // Test the whitespace escape sequence
 653 ab\sc
 654 ab c
 655 true ab c 0
 656 
 657 \s\s\s
 658 blah  err
 659 false 0
 660 
 661 \S\S\s
 662 blah  err
 663 true ah  0
 664 
 665 // Test the digit escape sequence
 666 ab\dc
 667 ab9c
 668 true ab9c 0
 669 
 670 \d\d\d
 671 blah45
 672 false 0
 673 
 674 // Test the caret metacharacter
 675 ^abc
 676 abcdef
 677 true abc 0
 678 
 679 ^abc
 680 bcdabc
 681 false 0
 682 
 683 // Greedy ? metacharacter
 684 a?b
 685 aaaab
 686 true ab 0
 687 
 688 a?b
 689 b
 690 true b 0
 691 
 692 a?b
 693 aaaccc
 694 false 0
 695 
 696 .?b
 697 aaaab
 698 true ab 0
 699 
 700 // Reluctant ? metacharacter
 701 a??b
 702 aaaab
 703 true ab 0
 704 
 705 a??b
 706 b
 707 true b 0
 708 
 709 a??b
 710 aaaccc
 711 false 0
 712 
 713 .??b
 714 aaaab
 715 true ab 0
 716 
 717 // Possessive ? metacharacter
 718 a?+b
 719 aaaab
 720 true ab 0
 721 
 722 a?+b
 723 b
 724 true b 0
 725 
 726 a?+b
 727 aaaccc
 728 false 0
 729 
 730 .?+b
 731 aaaab
 732 true ab 0
 733 
 734 // Greedy + metacharacter
 735 a+b
 736 aaaab
 737 true aaaab 0
 738 
 739 a+b
 740 b
 741 false 0
 742 
 743 a+b
 744 aaaccc
 745 false 0
 746 
 747 .+b
 748 aaaab
 749 true aaaab 0
 750 
 751 // Reluctant + metacharacter
 752 a+?b
 753 aaaab
 754 true aaaab 0
 755 
 756 a+?b
 757 b
 758 false 0
 759 
 760 a+?b
 761 aaaccc
 762 false 0
 763 
 764 .+?b
 765 aaaab
 766 true aaaab 0
 767 
 768 // Possessive + metacharacter
 769 a++b
 770 aaaab
 771 true aaaab 0
 772 
 773 a++b
 774 b
 775 false 0
 776 
 777 a++b
 778 aaaccc
 779 false 0
 780 
 781 .++b
 782 aaaab
 783 false 0
 784 
 785 // Greedy Repetition
 786 a{2,3}
 787 a
 788 false 0
 789 
 790 a{2,3}
 791 aa
 792 true aa 0
 793 
 794 a{2,3}
 795 aaa
 796 true aaa 0
 797 
 798 a{2,3}
 799 aaaa
 800 true aaa 0
 801 
 802 a{3,}
 803 zzzaaaazzz
 804 true aaaa 0
 805 
 806 a{3,}
 807 zzzaazzz
 808 false 0
 809 
 810 // Reluctant Repetition
 811 a{2,3}?
 812 a
 813 false 0
 814 
 815 a{2,3}?
 816 aa
 817 true aa 0
 818 
 819 a{2,3}?
 820 aaa
 821 true aa 0
 822 
 823 a{2,3}?
 824 aaaa
 825 true aa 0
 826 
 827 // Zero width Positive lookahead
 828 abc(?=d)
 829 zzzabcd
 830 true abc 0
 831 
 832 abc(?=d)
 833 zzzabced
 834 false 0
 835 
 836 // Zero width Negative lookahead
 837 abc(?!d)
 838 zzabcd
 839 false 0
 840 
 841 abc(?!d)
 842 zzabced
 843 true abc 0
 844 
 845 // Zero width Positive lookbehind
 846 \w(?<=a)
 847 ###abc###
 848 true a 0
 849 
 850 \w(?<=a)
 851 ###ert###
 852 false 0
 853 
 854 // Zero width Negative lookbehind
 855 (?<!a)\w
 856 ###abc###
 857 true a 0
 858 
 859 (?<!a)c
 860 bc
 861 true c 0
 862 
 863 (?<!a)c
 864 ac
 865 false 0
 866 
 867 // Nondeterministic group
 868 (a+b)+
 869 ababab
 870 true ababab 1 ab
 871 
 872 (a|b)+
 873 ccccd
 874 false 1
 875 
 876 // Deterministic group
 877 (ab)+
 878 ababab
 879 true ababab 1 ab
 880 
 881 (ab)+
 882 accccd
 883 false 1
 884 
 885 (ab)*
 886 ababab
 887 true ababab 1 ab
 888 
 889 (ab)(cd*)
 890 zzzabczzz
 891 true abc 2 ab c
 892 
 893 abc(d)*abc
 894 abcdddddabc
 895 true abcdddddabc 1 d
 896 
 897 // Escaped metacharacter
 898 \*
 899 *
 900 true * 0
 901 
 902 \\
 903 \
 904 true \ 0
 905 
 906 \\
 907 \\\\
 908 true \ 0
 909 
 910 // Back references
 911 (a*)bc\1
 912 zzzaabcaazzz
 913 true aabcaa 1 aa
 914 
 915 (a*)bc\1
 916 zzzaabcazzz
 917 true abca 1 a
 918 
 919 (gt*)(dde)*(yu)\1\3(vv)
 920 zzzgttddeddeyugttyuvvzzz
 921 true gttddeddeyugttyuvv 4 gtt dde yu vv
 922 
 923 // Greedy * metacharacter
 924 a*b
 925 aaaab
 926 true aaaab 0
 927 
 928 a*b
 929 b
 930 true b 0
 931 
 932 a*b
 933 aaaccc
 934 false 0
 935 
 936 .*b
 937 aaaab
 938 true aaaab 0
 939 
 940 // Reluctant * metacharacter
 941 a*?b
 942 aaaab
 943 true aaaab 0
 944 
 945 a*?b
 946 b
 947 true b 0
 948 
 949 a*?b
 950 aaaccc
 951 false 0
 952 
 953 .*?b
 954 aaaab
 955 true aaaab 0
 956 
 957 // Possessive * metacharacter
 958 a*+b
 959 aaaab
 960 true aaaab 0
 961 
 962 a*+b
 963 b
 964 true b 0
 965 
 966 a*+b
 967 aaaccc
 968 false 0
 969 
 970 .*+b
 971 aaaab
 972 false 0
 973 
 974 // Case insensitivity
 975 (?i)foobar
 976 fOobAr
 977 true fOobAr 0
 978 
 979 f(?i)oobar
 980 fOobAr
 981 true fOobAr 0
 982 
 983 foo(?i)bar
 984 fOobAr
 985 false 0
 986 
 987 (?i)foo[bar]+
 988 foObAr
 989 true foObAr 0
 990 
 991 (?i)foo[a-r]+
 992 foObAr
 993 true foObAr 0
 994 
 995 // Disable metacharacters - test both length <= 3 and length > 3
 996 // so that the BM optimization is part of the test
 997 \Q***\Eabc
 998 ***abc
 999 true ***abc 0
1000 
1001 bl\Q***\Eabc
1002 bl***abc
1003 true bl***abc 0
1004 
1005 \Q***abc
1006 ***abc
1007 true ***abc 0
1008 
1009 blah\Q***\Eabc
1010 blah***abc
1011 true blah***abc 0
1012 
1013 \Q***abc
1014 ***abc
1015 true ***abc 0
1016 
1017 \Q*ab
1018 *ab
1019 true *ab 0
1020 
1021 blah\Q***abc
1022 blah***abc
1023 true blah***abc 0
1024 
1025 bla\Q***abc
1026 bla***abc
1027 true bla***abc 0
1028 
1029 // Escapes in char classes
1030 [ab\Qdef\E]
1031 d
1032 true d 0
1033 
1034 [ab\Q[\E]
1035 [
1036 true [ 0
1037 
1038 [\Q]\E]
1039 ]
1040 true ] 0
1041 
1042 [\Q\\E]
1043 \
1044 true \ 0
1045 
1046 [\Q(\E]
1047 (
1048 true ( 0
1049 
1050 [\n-#]
1051 !
1052 true ! 0
1053 
1054 [\n-#]
1055 -
1056 false 0
1057 
1058 [\w-#]
1059 !
1060 false 0
1061 
1062 [\w-#]
1063 a
1064 true a 0
1065 
1066 [\w-#]
1067 -
1068 true - 0
1069 
1070 [\w-#]
1071 #
1072 true # 0
1073 
1074 [\043]+
1075 blahblah#blech
1076 true # 0
1077 
1078 [\042-\044]+
1079 blahblah#blech
1080 true # 0
1081 
1082 [\u1234-\u1236]
1083 blahblah\u1235blech
1084 true \u1235 0
1085 
1086 [^\043]*
1087 blahblah#blech
1088 true blahblah 0
1089 
1090 (|f)?+
1091 foo
1092 true  1