# These tests are specifically written to test the regex-lite crate. While it
# largely has the same semantics as the regex crate, there are some differences
# around Unicode support and UTF-8.
#
# To be clear, regex-lite supports far fewer patterns because of its lack of
# Unicode support, nested character classes and character class set operations.
# What we're talking about here are the patterns that both crates support but
# where the semantics might differ.

# regex-lite uses ASCII definitions for Perl character classes, so `\d` is
# expected to match ASCII digits only. The haystack is a single non-ASCII
# character, so no match is expected even though `unicode = true` is set.
[[test]]
name = "perl-class-decimal"
regex = '\d'
haystack = '᠕'
matches = []
unicode = true

# regex-lite uses ASCII definitions for Perl character classes, so `\s` is
# expected to match ASCII whitespace only. The haystack is the single
# codepoint U+2000; it is written as a basic (double-quoted) string because
# TOML literal strings do not process `\u` escapes. No match is expected.
[[test]]
name = "perl-class-space"
regex = '\s'
haystack = "\u2000"
matches = []
unicode = true

# regex-lite uses ASCII definitions for Perl character classes, so `\w` is
# expected to match ASCII word characters only. The haystack is a single
# non-ASCII letter, so no match is expected even with `unicode = true`.
[[test]]
name = "perl-class-word"
regex = '\w'
haystack = 'δ'
matches = []
unicode = true

# regex-lite uses the ASCII definition of word for word boundary assertions.
# The haystack's only character is not an ASCII word character, so `\b` is
# expected to match nowhere.
[[test]]
name = "word-boundary"
regex = '\b'
haystack = 'δ'
matches = []
unicode = true

# regex-lite uses the ASCII definition of word for negated word boundary
# assertions. But note that it should still not split codepoints!
#
# The haystack is one codepoint that is 2 bytes long in UTF-8, so the only
# expected matches are the empty spans at offsets 0 and 2. Offset 1 falls
# inside the codepoint and must not be reported.
[[test]]
name = "word-boundary-negated"
regex = '\B'
haystack = 'δ'
matches = [[0, 0], [2, 2]]
unicode = true

# While we're here, the empty regex---which matches at every
# position---shouldn't split a codepoint either.
#
# The haystack is one codepoint that is 4 bytes long in UTF-8, so the only
# expected matches are the empty spans at offsets 0 and 4; offsets 1 through 3
# fall inside the codepoint.
[[test]]
name = "empty-no-split-codepoint"
regex = ''
haystack = '💩'
matches = [[0, 0], [4, 4]]
unicode = true

# A dot always matches a full codepoint.
#
# Note that this test sets `unicode = false`: the single expected match still
# spans the entire 4-byte haystack (offsets 0 to 4) rather than one byte.
[[test]]
name = "dot-always-matches-codepoint"
regex = '.'
haystack = '💩'
matches = [[0, 4]]
unicode = false

# A negated character class also always matches a full codepoint.
#
# Note that this test sets `unicode = false`: the single expected match still
# spans the entire 4-byte haystack (offsets 0 to 4) rather than one byte.
[[test]]
name = "negated-class-always-matches-codepoint"
regex = '[^a]'
haystack = '💩'
matches = [[0, 4]]
unicode = false

# regex-lite only supports ASCII-aware case insensitive matching.
#
# The haystack is `ſ` (U+017F), a non-ASCII letter that Unicode case folding
# treats as equivalent to `s`. With ASCII-only case insensitivity, the pattern
# `s` is expected not to match it, even with `unicode = true`.
[[test]]
name = "case-insensitive-is-ascii-only"
regex = 's'
haystack = 'ſ'
matches = []
unicode = true
case-insensitive = true

# Negated word boundaries shouldn't split a codepoint, but they will match
# between invalid UTF-8.
#
# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
# regex-lite. This can't happen in the main API because &str can't contain
# invalid UTF-8.
# [[test]]
# name = "word-boundary-invalid-utf8"
# regex = '\B'
# haystack = '\xFF\xFF\xFF\xFF'
# unescape = true
# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
# unicode = true
# utf8 = false