# typst/vendor/regex/testdata/regex-lite.toml

# These tests are specifically written to test the regex-lite crate. While it
# largely has the same semantics as the regex crate, there are some differences
# around Unicode support and UTF-8.
#
# To be clear, regex-lite supports far fewer patterns because of its lack of
# Unicode support, nested character classes and character class set operations.
# What we're talking about here are the patterns that both crates support but
# where the semantics might differ.

# regex-lite uses ASCII definitions for Perl character classes, so '\d' is
# expected to match only the ASCII digits 0-9. The haystack is a single
# non-ASCII character (presumably a Unicode decimal digit that a Unicode-aware
# '\d' would accept -- TODO confirm the exact codepoint), so no match is
# expected even though 'unicode = true' is set.
[[test]]
name = "perl-class-decimal"
regex = '\d'
haystack = '᠕'
matches = []
unicode = true

# regex-lite uses ASCII definitions for Perl character classes, so '\s' is
# expected to match only ASCII whitespace. The haystack is U+2000 (EN QUAD),
# which Unicode classifies as whitespace but which is not ASCII, so no match
# is expected even though 'unicode = true' is set.
#
# The haystack is a basic (double-quoted) string so that TOML interprets the
# '\u2000' escape; in a literal (single-quoted) string it would be taken
# verbatim.
[[test]]
name = "perl-class-space"
regex = '\s'
haystack = "\u2000"
matches = []
unicode = true

# regex-lite uses ASCII definitions for Perl character classes, so '\w' is
# expected to match only ASCII word characters. The haystack is the Greek
# small letter delta, a non-ASCII letter, so no match is expected even though
# 'unicode = true' is set.
[[test]]
name = "perl-class-word"
regex = '\w'
haystack = 'δ'
matches = []
unicode = true

# regex-lite uses the ASCII definition of word for word boundary assertions.
# The haystack consists solely of a non-ASCII letter, which does not count as
# a word character under that definition. There is therefore no transition
# between word and non-word characters anywhere in the haystack, and '\b' is
# expected to never match.
[[test]]
name = "word-boundary"
regex = '\b'
haystack = 'δ'
matches = []
unicode = true

# regex-lite uses the ASCII definition of word for negated word boundary
# assertions. But note that it should still not split codepoints!
#
# The haystack is one codepoint that occupies two bytes in UTF-8. The expected
# matches are the empty spans at byte offsets 0 and 2 (the start and end of
# the haystack). Offset 1 falls in the middle of the codepoint's encoding and
# must not be reported.
[[test]]
name = "word-boundary-negated"
regex = '\B'
haystack = 'δ'
matches = [[0, 0], [2, 2]]
unicode = true

# While we're here, the empty regex---which matches at every
# position---shouldn't split a codepoint either.
#
# The haystack is one codepoint that occupies four bytes in UTF-8, so the only
# expected matches are the empty spans at byte offsets 0 and 4. Offsets 1, 2
# and 3 lie inside the codepoint's encoding and must not be reported.
[[test]]
name = "empty-no-split-codepoint"
regex = ''
haystack = '💩'
matches = [[0, 0], [4, 4]]
unicode = true

# A dot always matches a full codepoint.
#
# Note that this test sets 'unicode = false' and still expects a single match
# spanning all four bytes of the haystack's one codepoint (byte offsets 0..4),
# rather than a match on an individual byte.
[[test]]
name = "dot-always-matches-codepoint"
regex = '.'
haystack = '💩'
matches = [[0, 4]]
unicode = false

# A negated character class also always matches a full codepoint.
#
# As with the dot test, 'unicode = false' is set and the expected match still
# covers the haystack's entire four-byte codepoint (byte offsets 0..4), not a
# single byte of it.
[[test]]
name = "negated-class-always-matches-codepoint"
regex = '[^a]'
haystack = '💩'
matches = [[0, 4]]
unicode = false

# regex-lite only supports ASCII-aware case insensitive matching.
#
# The haystack is U+017F (LATIN SMALL LETTER LONG S), which Unicode case
# folding treats as equivalent to 's'. Since regex-lite only folds ASCII
# letters, no match is expected even though both 'unicode = true' and
# 'case-insensitive = true' are set.
[[test]]
name = "case-insensitive-is-ascii-only"
regex = 's'
haystack = 'ſ'
matches = []
unicode = true
case-insensitive = true

# Negated word boundaries shouldn't split a codepoint, but they will match
# between the individual bytes of invalid UTF-8.
#
# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
# regex-lite. This can't happen in the main API because &str can't contain
# invalid UTF-8.
# [[test]]
# name = "word-boundary-invalid-utf8"
# regex = '\B'
# haystack = '\xFF\xFF\xFF\xFF'
# unescape = true
# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
# unicode = true
# utf8 = false