# typst/vendor/regex/testdata/word-boundary.toml

# Some of these are cribbed from RE2's test suite.

# These test \b. Below are tests for \B.
# wb1-wb6: a bare ASCII \b, alone and combined with the ^ and $ anchors.

# An empty haystack has no word characters, so there is no boundary anywhere.
[[test]]
name = "wb1"
regex = '\b'
haystack = ""
matches = []
unicode = false

# A single word character has a boundary on each side of it.
[[test]]
name = "wb2"
regex = '\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false

# Offset 1 lies between two word characters, so it is not a boundary.
[[test]]
name = "wb3"
regex = '\b'
haystack = "ab"
matches = [[0, 0], [2, 2]]
unicode = false

# ^ restricts the match to the boundary at the start of the haystack.
[[test]]
name = "wb4"
regex = '^\b'
haystack = "ab"
matches = [[0, 0]]
unicode = false

# $ restricts the match to the boundary at the end of the haystack.
[[test]]
name = "wb5"
regex = '\b$'
haystack = "ab"
matches = [[2, 2]]
unicode = false

# No single offset of a non-empty haystack is both its start and its end.
[[test]]
name = "wb6"
regex = '^\b$'
haystack = "ab"
matches = []
unicode = false

# wb7-wb12: \b adjacent to a literal, with spaces, newlines and the haystack
# edges acting as the non-word side of the boundary.

# The "bar" inside "nobar" is skipped because it is preceded by a word
# character.
[[test]]
name = "wb7"
regex = '\bbar\b'
haystack = "nobar bar foo bar"
matches = [[6, 9], [14, 17]]
unicode = false

# The first "a" is followed by "o" (no boundary); the second is followed by a
# space.
[[test]]
name = "wb8"
regex = 'a\b'
haystack = "faoa x"
matches = [[3, 4]]
unicode = false

# The start of the haystack counts as a boundary before a word character.
[[test]]
name = "wb9"
regex = '\bbar'
haystack = "bar x"
matches = [[0, 3]]
unicode = false

# A newline is a non-word character, so a boundary follows it.
[[test]]
name = "wb10"
regex = '\bbar'
haystack = "foo\nbar x"
matches = [[4, 7]]
unicode = false

# The end of the haystack counts as a boundary after a word character.
[[test]]
name = "wb11"
regex = 'bar\b'
haystack = "foobar"
matches = [[3, 6]]
unicode = false

# Same as wb11, but the boundary is provided by a trailing newline.
[[test]]
name = "wb12"
regex = 'bar\b'
haystack = "foobar\nxxx"
matches = [[3, 6]]
unicode = false

# wb13-wb23: \b combined with an alternation, on one or both sides.

[[test]]
name = "wb13"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb14"
regex = '(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb15"
regex = '\b(?:foo|bar|[A-Z])'
haystack = "foo"
matches = [[0, 3]]
unicode = false

# A single uppercase letter is bounded by the haystack edges on both sides.
[[test]]
name = "wb16"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "X"
matches = [[0, 1]]
unicode = false

# "X" has no boundary after it and "Y" has none before it, so neither letter
# can match on its own.
[[test]]
name = "wb17"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "XY"
matches = []
unicode = false

[[test]]
name = "wb18"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "bar"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb19"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

[[test]]
name = "wb20"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "foo\n"
matches = [[0, 3]]
unicode = false

# "foo" and "bar" are each preceded by an extra word character, so only the
# standalone "N" matches.
[[test]]
name = "wb21"
regex = '\b(?:foo|bar|[A-Z])\b'
haystack = "ffoo bbar N x"
matches = [[10, 11]]
unicode = false

[[test]]
name = "wb22"
regex = '\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false

# The first branch "fo" is not followed by a boundary here, so the match has
# to come from the second branch "foo".
[[test]]
name = "wb23"
regex = '\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

# wb24-wb28: repeated \b and \b anchored to the end of the haystack.

# Repeating \b changes nothing: an empty haystack still has no boundary.
[[test]]
name = "wb24"
regex = '\b\b'
haystack = ""
matches = []
unicode = false

# Two consecutive \b assertions match exactly where a single one does.
[[test]]
name = "wb25"
regex = '\b\b'
haystack = "a"
matches = [[0, 0], [1, 1]]
unicode = false

[[test]]
name = "wb26"
regex = '\b$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb27"
regex = '\b$'
haystack = "x"
matches = [[1, 1]]
unicode = false

# Only the boundary at the very end of the haystack satisfies $.
[[test]]
name = "wb28"
regex = '\b$'
haystack = "y x"
matches = [[3, 3]]
unicode = false

# wb29-wb40: \b mixed with ^, $ and `.`. The tests whose pattern contains `.`
# do not set `unicode = false`; they make only the word boundary ASCII via
# the inline (?-u:...) flag group.
# NOTE(review): presumably this is because `.` with Unicode disabled is not
# permitted while UTF-8 mode is enabled - confirm against the regex docs.

[[test]]
name = "wb29"
regex = '(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb30"
regex = '^\b(?:fo|foo)\b'
haystack = "fo"
matches = [[0, 2]]
unicode = false

[[test]]
name = "wb31"
regex = '^\b(?:fo|foo)\b'
haystack = "foo"
matches = [[0, 3]]
unicode = false

# Offset 0 is both start and end here, but it is not a word boundary.
[[test]]
name = "wb32"
regex = '^\b$'
haystack = ""
matches = []
unicode = false

# Both offsets are boundaries, but neither is both the start and the end.
[[test]]
name = "wb33"
regex = '^\b$'
haystack = "x"
matches = []
unicode = false

[[test]]
name = "wb34"
regex = '^(?-u:\b).$'
haystack = "x"
matches = [[0, 1]]

[[test]]
name = "wb35"
regex = '^(?-u:\b).(?-u:\b)$'
haystack = "x"
matches = [[0, 1]]

# Repeated anchors give the same result as single ones (compare wb32).
[[test]]
name = "wb36"
regex = '^^^^^\b$$$$$'
haystack = ""
matches = []
unicode = false

[[test]]
name = "wb37"
regex = '^^^^^(?-u:\b).$$$$$'
haystack = "x"
matches = [[0, 1]]

# Repeated anchors give the same result as single ones (compare wb33).
[[test]]
name = "wb38"
regex = '^^^^^\b$$$$$'
haystack = "x"
matches = []
unicode = false

[[test]]
name = "wb39"
regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$'
haystack = "x"
matches = [[0, 1]]

# The first boundary is at offset 2 and the last one is at offset 5, so the
# greedy `.+` cannot extend into the trailing "$$".
[[test]]
name = "wb40"
regex = '(?-u:\b).+(?-u:\b)'
haystack = "$$abc$$"
matches = [[2, 5]]

# wb41-wb44: boundaries at every offset, and anchors written on the "other"
# side of \b.

# Single letters separated by single spaces: every offset is a boundary.
[[test]]
name = "wb41"
regex = '\b'
haystack = "a b c"
matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false

[[test]]
name = "wb42"
regex = '\bfoo\b'
haystack = "zzz foo zzz"
matches = [[4, 7]]
unicode = false

# Same expectation as wb4; the order of the two assertions does not matter.
[[test]]
name = "wb43"
regex = '\b^'
haystack = "ab"
matches = [[0, 0]]
unicode = false

# Same expectation as wb5; the order of the two assertions does not matter.
[[test]]
name = "wb44"
regex = '$\b'
haystack = "ab"
matches = [[2, 2]]
unicode = false


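# Tests for \B. Note that ASCII \B (that is, \B with Unicode mode disabled) is
# not allowed if UTF-8 mode is enabled, so we have to disable UTF-8 mode for
# most of these tests. This is because ASCII \B can match at non-UTF-8
# boundaries (i.e., between the bytes of a multi-byte codepoint).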
# nb1-nb8: \B adjacent to a literal or an alternation. These are the negated
# counterparts of the wb7-wb14 cases.

# The standalone "foo" is surrounded by boundaries; only the one embedded in
# "xfoox" has non-boundaries on both sides.
[[test]]
name = "nb1"
regex = '\Bfoo\B'
haystack = "n foo xfoox that"
matches = [[7, 10]]
unicode = false
utf8 = false

# The first "a" is followed by "o" (not a boundary); the second "a" is
# followed by a space (a boundary), so only the first one matches.
[[test]]
name = "nb2"
regex = 'a\B'
haystack = "faoa x"
matches = [[1, 2]]
unicode = false
utf8 = false

# "bar" starts at a boundary (start of haystack), so \B rejects it.
[[test]]
name = "nb3"
regex = '\Bbar'
haystack = "bar x"
matches = []
unicode = false
utf8 = false

# "bar" follows a newline, which also forms a boundary.
[[test]]
name = "nb4"
regex = '\Bbar'
haystack = "foo\nbar x"
matches = []
unicode = false
utf8 = false

# "bar" ends at the end of the haystack, which is a boundary.
[[test]]
name = "nb5"
regex = 'bar\B'
haystack = "foobar"
matches = []
unicode = false
utf8 = false

# "bar" is followed by a newline, which is a boundary.
[[test]]
name = "nb6"
regex = 'bar\B'
haystack = "foobar\nxxx"
matches = []
unicode = false
utf8 = false

# "foo" is followed by the word character "x", so \B holds after it.
[[test]]
name = "nb7"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foox"
matches = [[0, 3]]
unicode = false
utf8 = false

[[test]]
name = "nb8"
regex = '(?:foo|bar|[A-Z])\B'
haystack = "foo\n"
matches = []
unicode = false
utf8 = false

# nb9-nb20: bare \B and \B on both sides of an alternation.

# With no word characters at all, offset 0 of the empty haystack is not a
# boundary, so \B matches there.
[[test]]
name = "nb9"
regex = '\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

# Both offsets around a lone word character are boundaries.
[[test]]
name = "nb10"
regex = '\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb11"
regex = '\B(?:foo|bar|[A-Z])'
haystack = "foo"
matches = []
unicode = false
utf8 = false

# "X" has a word character on each side, so \B holds before and after it.
[[test]]
name = "nb12"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xXy"
matches = [[1, 2]]
unicode = false
utf8 = false

# "X" starts at a boundary and "Y" ends at one, so neither letter qualifies.
[[test]]
name = "nb13"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XY"
matches = []
unicode = false
utf8 = false

# Only the middle letter has word characters on both sides.
[[test]]
name = "nb14"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "XYZ"
matches = [[1, 2]]
unicode = false
utf8 = false

[[test]]
name = "nb15"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "abara"
matches = [[1, 4]]
unicode = false
utf8 = false

# "_" is a word character, so there is no boundary after "foo".
[[test]]
name = "nb16"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo_"
matches = [[1, 4]]
unicode = false
utf8 = false

# The trailing newline puts a boundary right after "foo".
[[test]]
name = "nb17"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "xfoo\n"
matches = []
unicode = false
utf8 = false

# Only "N" sits strictly inside a word ("vNX"); "X" ends at the end of the
# haystack, which is a boundary.
[[test]]
name = "nb18"
regex = '\B(?:foo|bar|[A-Z])\B'
haystack = "foo bar vNX"
matches = [[9, 10]]
unicode = false
utf8 = false

# The branch "fo" is followed by "o" (not a boundary) and matches; "foo"
# would end at the end of the haystack, which is a boundary.
[[test]]
name = "nb19"
regex = '\B(?:fo|foo)\B'
haystack = "xfoo"
matches = [[1, 3]]
unicode = false
utf8 = false

# Here the first branch "foo" is followed by another "o", so it matches.
[[test]]
name = "nb20"
regex = '\B(?:foo|fo)\B'
haystack = "xfooo"
matches = [[1, 4]]
unicode = false
utf8 = false

# nb21-nb27: repeated \B and \B combined with the ^ and $ anchors.

# Repeating \B changes nothing: the empty haystack still matches at offset 0.
[[test]]
name = "nb21"
regex = '\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb22"
regex = '\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb23"
regex = '\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

# The end of the haystack directly follows a word character, so it is a
# boundary and \B fails there.
[[test]]
name = "nb24"
regex = '\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb25"
regex = '\B$'
haystack = "y x"
matches = []
unicode = false
utf8 = false

# `.` would have to consume "x", but offset 0 in front of it is a boundary.
[[test]]
name = "nb26"
regex = '\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false

# Offset 0 is followed by a word character, so it is a boundary and ^\B can
# never match.
[[test]]
name = "nb27"
regex = '^\B(?:fo|foo)\B'
haystack = "fo"
matches = []
unicode = false
utf8 = false

# Same pattern as nb27 but with the haystack "foo", mirroring the wb30/wb31
# pair ("fo" then "foo"). Offset 0 is still a word boundary (start of haystack
# followed by a word character), so ^\B fails and nothing matches.
[[test]]
name = "nb28"
regex = '^\B(?:fo|foo)\B'
haystack = "foo"
matches = []
unicode = false
utf8 = false

# nb29-nb39: \B anchored with ^ and/or $. In this group only the empty
# haystack ever matches: the one-character haystack "x" has a word boundary
# at both of its offsets.

[[test]]
name = "nb29"
regex = '^\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb30"
regex = '^\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb31"
regex = '^\B\B'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb32"
regex = '^\B\B'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb33"
regex = '^\B$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

[[test]]
name = "nb34"
regex = '^\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb35"
regex = '^\B.$'
haystack = "x"
matches = []
unicode = false
utf8 = false

[[test]]
name = "nb36"
regex = '^\B.\B$'
haystack = "x"
matches = []
unicode = false
utf8 = false

# Repeated anchors give the same result as single ones (compare nb33).
[[test]]
name = "nb37"
regex = '^^^^^\B$$$$$'
haystack = ""
matches = [[0, 0]]
unicode = false
utf8 = false

# Repeated anchors give the same result as single ones (compare nb35).
[[test]]
name = "nb38"
regex = '^^^^^\B.$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false

# Repeated anchors give the same result as single ones (compare nb34).
[[test]]
name = "nb39"
regex = '^^^^^\B$$$$$'
haystack = "x"
matches = []
unicode = false
utf8 = false


# unicode1* and unicode2* work for both Unicode and ASCII because all matches
# are reported as byte offsets, and « and » are not word characters at either
# the character or byte level.
# The expected offsets are in bytes: "x" starts at offset 2 in "«x" because
# « occupies the first two bytes of the haystack.
[[test]]
name = "unicode1"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]

# Identical expectation with ASCII-only word boundaries.
[[test]]
name = "unicode1-only-ascii"
regex = '\bx\b'
haystack = "«x"
matches = [[2, 3]]
unicode = false

# Mirror image of unicode1: the non-word character follows "x".
[[test]]
name = "unicode2"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]

# Identical expectation with ASCII-only word boundaries.
[[test]]
name = "unicode2-only-ascii"
regex = '\bx\b'
haystack = "x»"
matches = [[0, 1]]
unicode = false

# ASCII word boundaries are completely oblivious to Unicode characters, so
# even though β is a word character, an ASCII \b treats the position next to
# it as a word boundary when β is adjacent to an ASCII word character. (The
# ASCII \b only looks at the leading byte of β.) For Unicode \b, the tests are
# precisely inverted.
# With Unicode \b, "x" has a word character on each side (á and β), so
# neither side of it is a boundary.
[[test]]
name = "unicode3"
regex = '\bx\b'
haystack = 'áxβ'
matches = []

# With ASCII \b, the bytes next to "x" are non-word bytes, so "x" is bounded
# on both sides. It sits at byte offset 2 because á occupies two bytes.
[[test]]
name = "unicode3-only-ascii"
regex = '\bx\b'
haystack = 'áxβ'
matches = [[2, 3]]
unicode = false

# Inverse of unicode3: Unicode \B holds on both sides of "x".
[[test]]
name = "unicode4"
regex = '\Bx\B'
haystack = 'áxβ'
matches = [[2, 3]]

# Inverse of unicode3-only-ascii: ASCII \B fails on both sides of "x".
[[test]]
name = "unicode4-only-ascii"
regex = '\Bx\B'
haystack = 'áxβ'
matches = []
unicode = false
utf8 = false

# The same as the unicode5-not* tests below, but with \b instead of \B as a
# sanity check.
# The only word character is "0" at byte 0, so \b matches at offsets 0 and 1
# and nowhere else, in every mode exercised below.
[[test]]
name = "unicode5"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]

[[test]]
name = "unicode5-only-ascii"
regex = '\b'
haystack = "0\U0007EF5E"
matches = [[0, 0], [1, 1]]
unicode = false
utf8 = false

# NOTE(review): `unescape = true` presumably makes the test harness turn the
# \xFF escapes in the literal string into raw 0xFF bytes, giving a haystack
# that is not valid UTF-8 - confirm against the harness documentation.
[[test]]
name = "unicode5-noutf8"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
utf8 = false

[[test]]
name = "unicode5-noutf8-only-ascii"
regex = '\b'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[0, 0], [1, 1]]
unescape = true
unicode = false
utf8 = false

# Weird special case to ensure that ASCII \B treats each individual code unit
# as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary
# codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the
# \w character class.)
# With Unicode \B only offset 5 (the end of the haystack, after the non-word
# codepoint) is reported: offsets 0 and 1 are word boundaries, and offsets
# 2-4 fall inside the 4-byte codepoint.
[[test]]
name = "unicode5-not"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[5, 5]]

# With ASCII \B every offset after the first byte of the codepoint matches,
# including the offsets between its individual bytes.
[[test]]
name = "unicode5-not-only-ascii"
regex = '\B'
haystack = "0\U0007EF5E"
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unicode = false
utf8 = false

# This gets no matches since \B only matches in the presence of valid UTF-8
# when Unicode is enabled, even when UTF-8 mode is disabled.
# NOTE(review): `unescape = true` presumably makes the harness turn the \xFF
# escapes into raw bytes; the offsets expected in the ASCII variant further
# down imply a 5-byte haystack - confirm.
[[test]]
name = "unicode5-not-noutf8"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = []
unescape = true
utf8 = false

# But this DOES get matches since \B in ASCII mode only looks at individual
# bytes. Offsets 0 and 1 are still excluded because they are the word
# boundaries around "0".
[[test]]
name = "unicode5-not-noutf8-only-ascii"
regex = '\B'
haystack = '0\xFF\xFF\xFF\xFF'
matches = [[2, 2], [3, 3], [4, 4], [5, 5]]
unescape = true
unicode = false
utf8 = false

# Some tests of no particular significance.
# Every run of digits is delimited by spaces or the end of the haystack.
[[test]]
name = "unicode6"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456 quux 789"
matches = [[4, 7], [12, 15], [21, 24]]

# "a456" is skipped: there is no boundary between "a" and "4", and the run of
# digits cannot start anywhere else.
[[test]]
name = "unicode7"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar a456 quux 789"
matches = [[4, 7], [22, 25]]

# "456a" is skipped: there is no boundary between "6" and "a".
[[test]]
name = "unicode8"
regex = '\b[0-9]+\b'
haystack = "foo 123 bar 456a quux 789"
matches = [[4, 7], [22, 25]]

# A variant of the problem described here:
# https://github.com/google/re2/blob/89567f5de5b23bb5ad0c26cbafc10bdc7389d1fa/re2/dfa.cc#L658-L667
# Offset 1 lies between "z" (word) and "%" (non-word), so \b holds there and
# the expected result is the empty match at that offset.
# NOTE(review): `bounds` presumably restricts the search to the given byte
# span of the haystack and `anchored` presumably requires the match to begin
# at the start of that span - confirm against the test harness documentation.
[[test]]
name = "alt-with-assertion-repetition"
regex = '(?:\b|%)+'
haystack = "z%"
bounds = [1, 2]
anchored = true
matches = [[1, 1]]