# Some of these are cribbed from RE2's test suite.

# These test \b. Below are tests for \B.
[[test]]
name = "wb1"
regex = '\b'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb2"
regex = '\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false

[[test]]
name = "wb3"
regex = '\b'
haystack = "ab"
matches = [[0, 0], [2, 2]]
unicode = false

[[test]]
name = "wb4"
regex = '^\b'
haystack = "ab"
matches = [[0, 0]]
unicode = false

[[test]]
name = "wb5"
regex = '\b$'
haystack = "ab"
matches = [[2, 2]]
unicode = false

[[test]]
name = "wb6"
regex = '^\b$'
haystack = "ab"
matches = []
unicode = false

[[test]]
name = "wb7"
regex = '\bbar\b'
haystack = "nobar bar foo bar"
matches = [[6, 9], [14, 17]]
unicode = false

[[test]]
name = "wb8"
regex = 'a\b'
haystack = "faoa x"
matches = [[3, 4]]
unicode = false

[[test]]
name = "wb9"
regex = '\bbar'
haystack = "bar x"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb10"
regex = '\bbar'
haystack = "foo\nbar x"
matches = [[4, 7]]
unicode = false

[[test]]
name = "wb11"
regex = 'bar\b'
haystack = "foobar"
matches = [[3, 6]]
unicode = false

[[test]]
name = "wb12"
regex = 'bar\b'
haystack = "foobar\nxxx"
matches = [[3, 6]]
unicode = false

[[test]]
name = "wb13"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb14"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb15"
regex = '\b(?:foo|bar|[A-Z])'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb16"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "X"
matches = [[0, 1]]
unicode = false

[[test]]
name = "wb17"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "XY"
matches = []
unicode = false

[[test]]
name = "wb18"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "bar"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb19"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb20"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb21"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "ffoo bbar N x"
matches = [[10, 11]]
unicode = false

[[test]]
name = "wb22"
regex = '\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false

[[test]]
name = "wb23"
regex = '\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb24"
regex = '\b\b'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb25"
regex = '\b\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false

[[test]]
name = "wb26"
regex = '\b$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb27"
regex = '\b$'
haystack = "x"
matches = [[1, 1]]
unicode = false

[[test]]
name = "wb28"
regex = '\b$'
haystack = "y x"
matches = [[3, 3]]
unicode = false

[[test]]
name = "wb29"
regex = '(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb30"
regex = '^\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false

[[test]]
name = "wb31"
regex = '^\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb32"
regex = '^\b$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb33"
regex = '^\b$'
haystack = "x"
matches = []
unicode = false

[[test]]
name = "wb34"
regex = '^(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb35"
regex = '^(?-u:\b).(?-u:\b)$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb36"
regex = '^^^^^\b$$$$$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb37"
regex = '^^^^^(?-u:\b).$$$$$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb38"
regex = '^^^^^\b$$$$$'
haystack = "x"
matches = []
unicode = false

[[test]]
name = "wb39"
regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb40"
regex = '(?-u:\b).+(?-u:\b)'
haystack = "$$abc$$"
matches = [[2, 5]]

[[test]]
name = "wb41"
regex = '\b'
haystack = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false

[[test]]
name = "wb42"
regex = '\bfoo\b'
haystack = "zzz foo zzz"
matches = [[4, 7]]
unicode = false

[[test]]
name = "wb43"
regex = '\b^'
haystack = "ab"
matches = [[0, 0]]
unicode = false

[[test]]
name = "wb44"
regex = '$\b'
haystack = "ab"
matches = [[2, 2]]
unicode = false

# Tests for \B. Note that \B in ASCII mode (unicode = false) is not allowed
# when UTF-8 mode is enabled, so these tests disable it with utf8 = false.
# This is because an ASCII \B can match at offsets that are not UTF-8
# boundaries.
[[test]]
name = "nb1"
regex = '\Bfoo\B'
haystack = "n foo xfoox that"
matches = [[7, 10]]
unicode = false
utf8 = false

[[test]]
name = "nb2"
regex = 'a\B'
haystack = "faoa x"
matches = [[1, 2]]
unicode = false
utf8 = false

[[test]]
name = "nb3"
regex = '\Bbar'
haystack = "bar x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb4"
regex = '\Bbar'
haystack = "foo\nbar x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb5"
regex = 'bar\B'
haystack = "foobar"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb6"
regex = 'bar\B'
haystack = "foobar\nxxx"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb7"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foox"
matches = [[0, 3]]
unicode = false
utf8 = false

[[test]]
name = "nb8"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foo\n"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb9"
regex = '\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb10"
regex = '\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb11"
regex = '\B(?:foo|bar|[A-Z])'
haystack = "foo"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb12"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xXy"
matches = [[1, 2]]
unicode = false
utf8 = false

[[test]]
name = "nb13"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XY"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb14"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XYZ"
matches = [[1, 2]]
unicode = false
utf8 = false

[[test]]
name = "nb15"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "abara"
matches = [[1, 4]]
unicode = false
utf8 = false

[[test]]
name = "nb16"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo_"
matches = [[1, 4]]
unicode = false
utf8 = false

[[test]]
name = "nb17"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo\n"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb18"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "foo bar vNX"
matches = [[9, 10]]
unicode = false
utf8 = false

[[test]]
name = "nb19"
regex = '\B(?:fo|foo)\B'
haystack = "xfoo"
matches = [[1, 3]]
unicode = false
utf8 = false

[[test]]
name = "nb20"
regex = '\B(?:foo|fo)\B'
haystack = "xfooo"
matches = [[1, 4]]
unicode = false
utf8 = false

[[test]]
name = "nb21"
regex = '\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb22"
regex = '\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb23"
regex = '\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb24"
regex = '\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb25"
regex = '\B$'
haystack = "y x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb26"
regex = '\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb27"
regex = '^\B(?:fo|foo)\B'
haystack = "fo"
matches = []
unicode = false
utf8 = false

# Counterpart of nb27 that exercises the longer alternative (mirrors the
# wb30/wb31 pair). Still no match: offset 0 sits before a word byte, so it is
# a word boundary and \B cannot match there.
[[test]]
name = "nb28"
regex = '^\B(?:fo|foo)\B'
haystack = "foo"
matches = []
unicode = false
utf8 = false

# \B combined with ^ and $ anchors, in ASCII mode with UTF-8 mode disabled.
[[test]]
name = "nb29"
regex = '^\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb30"
regex = '^\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb31"
regex = '^\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb32"
regex = '^\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb33"
regex = '^\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb34"
regex = '^\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb35"
regex = '^\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb36"
regex = '^\B.\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb37"
regex = '^^^^^\B$$$$$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb38"
regex = '^^^^^\B.$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb39"
regex = '^^^^^\B$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false

# unicode1* and unicode2* work for both Unicode and ASCII because all matches
# are reported as byte offsets, and « and » do not correspond to word
# boundaries at either the character or byte level.
[[test]]
name = "unicode1"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]

[[test]]
name = "unicode1-only-ascii"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]
unicode = false

[[test]]
name = "unicode2"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]

[[test]]
name = "unicode2-only-ascii"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]
unicode = false

# ASCII word boundaries are completely oblivious to Unicode characters, so
# even though β is a word character, an ASCII \b treats it as a word boundary
# when it is adjacent to another ASCII character. (The ASCII \b only looks
# at the leading byte of β.) For Unicode \b, the tests are precisely inverted.
[[test]]
name = "unicode3"
regex = '\bx\b'
haystack = 'áxβ'
matches = []

[[test]]
name = "unicode3-only-ascii"
regex = '\bx\b'
haystack = 'áxβ'
matches = [[2, 3]]
unicode = false

[[test]]
name = "unicode4"
regex = '\Bx\B'
haystack = 'áxβ'
matches = [[2, 3]]

[[test]]
name = "unicode4-only-ascii"
regex = '\Bx\B'
haystack = 'áxβ'
matches = []
unicode = false
utf8 = false

# Sanity check: the same haystacks as the unicode5-not* tests (which use \B),
# but with \b instead. The only word byte is the leading "0", so \b matches
# immediately before and after it and nowhere else.
[[test]]
name = "unicode5"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]

[[test]]
name = "unicode5-only-ascii"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]
unicode = false
utf8 = false

[[test]]
name = "unicode5-noutf8"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
utf8 = false

[[test]]
name = "unicode5-noutf8-only-ascii"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
unicode = false
utf8 = false

# Weird special case to ensure that ASCII \B treats each individual code unit
# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary
# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the
# \w character class.)
[[test]]
name = "unicode5-not"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[5, 5]]

[[test]]
name = "unicode5-not-only-ascii"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false

# This gets no matches since \B only matches in the presence of valid UTF-8
# when Unicode is enabled, even when UTF-8 mode is disabled.
[[test]]
name = "unicode5-not-noutf8"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = []
unescape = true
utf8 = false

# But this DOES get matches since \B in ASCII mode only looks at individual
# bytes.
[[test]]
name = "unicode5-not-noutf8-only-ascii"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unescape = true
unicode = false
utf8 = false

# Some tests of no particular significance.
[[test]]
name = "unicode6"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456 quux 789"
matches = [[4, 7], [12, 15], [21, 24]]

[[test]]
name = "unicode7"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar a456 quux 789"
matches = [[4, 7], [22, 25]]

[[test]]
name = "unicode8"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456a quux 789"
matches = [[4, 7], [22, 25]]

# A variant of the problem described here:
# https://github.com/google/re2/blob/89567f5de5b23bb5ad0c26cbafc10bdc7389d1fa/re2/dfa.cc#L658-L667
[[test]]
name = "alt-with-assertion-repetition"
regex = '(?:\b|%)+'
haystack = "z%"
bounds = [1, 2]
anchored = true
matches = [[1, 1]]